# \donttest{
##------------------------------------------------------------
## survival analysis
##------------------------------------------------------------
## veteran data
## randomized trial of two treatment regimens for lung cancer
data(veteran, package = "randomForestSRC")
v.obj <- rfsrc(Surv(time, status) ~ ., data = veteran, block.size = 1)
## plot tree number 3
plot(get.tree(v.obj, 3))
## print results of trained forest
print(v.obj)
## plot results of trained forest
plot(v.obj)
## plot survival curves for first 10 individuals -- direct way
matplot(v.obj$time.interest, 100 * t(v.obj$survival.oob[1:10, ]),
xlab = "Time", ylab = "Survival", type = "l", lty = 1)
## plot survival curves for first 10 individuals
## using function "plot.survival"
plot.survival(v.obj, subset = 1:10)
## obtain Brier score using KM and RSF censoring distribution estimators
bs.km <- get.brier.survival(v.obj, cens.model = "km")$brier.score
bs.rsf <- get.brier.survival(v.obj, cens.model = "rfsrc")$brier.score
## plot the brier score
plot(bs.km, type = "s", col = 2)
lines(bs.rsf, type ="s", col = 4)
legend("topright", legend = c("cens.model = km", "cens.model = rfsrc"), fill = c(2,4))
## plot CRPS (continuous rank probability score) as function of time
## here's how to calculate the CRPS for every time point
trapz <- randomForestSRC:::trapz
time <- v.obj$time.interest
crps.km <- sapply(1:length(time), function(j) {
trapz(time[1:j], bs.km[1:j, 2] / diff(range(time[1:j])))
})
crps.rsf <- sapply(1:length(time), function(j) {
trapz(time[1:j], bs.rsf[1:j, 2] / diff(range(time[1:j])))
})
plot(time, crps.km, ylab = "CRPS", type = "s", col = 2)
lines(time, crps.rsf, type ="s", col = 4)
legend("bottomright", legend=c("cens.model = km", "cens.model = rfsrc"), fill=c(2,4))
## fast nodesize optimization for veteran data
## optimal nodesize in survival is larger than other families
## see the function "tune" for more examples
tune.nodesize(Surv(time,status) ~ ., veteran)
## Primary biliary cirrhosis (PBC) of the liver
data(pbc, package = "randomForestSRC")
pbc.obj <- rfsrc(Surv(days, status) ~ ., pbc)
print(pbc.obj)
## save.memory example for survival
## growing many deep trees creates memory issue without this option!
data(pbc, package = "randomForestSRC")
print(rfsrc(Surv(days, status) ~ ., pbc, splitrule = "random",
ntree = 25000, nodesize = 1, save.memory = TRUE))
##------------------------------------------------------------
## trees can be plotted for any family
## see get.tree for details and more examples
##------------------------------------------------------------
## survival where factors have many levels
data(veteran, package = "randomForestSRC")
vd <- veteran
vd$celltype=factor(vd$celltype)
vd$diagtime=factor(vd$diagtime)
vd.obj <- rfsrc(Surv(time,status)~., vd, ntree = 100, nodesize = 5)
plot(get.tree(vd.obj, 3))
## classification
iris.obj <- rfsrc(Species ~., data = iris)
plot(get.tree(iris.obj, 25, class.type = "bayes"))
plot(get.tree(iris.obj, 25, target = "setosa"))
plot(get.tree(iris.obj, 25, target = "versicolor"))
plot(get.tree(iris.obj, 25, target = "virginica"))
## ------------------------------------------------------------
## simple example of VIMP using iris classification
## ------------------------------------------------------------
## directly from trained forest
print(rfsrc(Species~.,iris,importance=TRUE)$importance)
## VIMP (and performance) use misclassification error by default
## but brier prediction error can be requested
print(rfsrc(Species~.,iris,importance=TRUE,perf.type="brier")$importance)
## example using vimp function (see vimp help file for details)
iris.obj <- rfsrc(Species ~., data = iris)
print(vimp(iris.obj)$importance)
print(vimp(iris.obj,perf.type="brier")$importance)
## example using hold out vimp (see holdout.vimp help file for details)
print(holdout.vimp(Species~.,iris)$importance)
print(holdout.vimp(Species~.,iris,perf.type="brier")$importance)
## ------------------------------------------------------------
## confidence interval for vimp using subsampling
## compare with holdout vimp
## ------------------------------------------------------------
## new York air quality measurements
o <- rfsrc(Ozone ~ ., data = airquality)
so <- subsample(o)
plot(so)
## compare with holdout vimp
print(holdout.vimp(Ozone ~ ., data = airquality)$importance)
##------------------------------------------------------------
## example of imputation in survival analysis
##------------------------------------------------------------
data(pbc, package = "randomForestSRC")
pbc.obj2 <- rfsrc(Surv(days, status) ~ ., pbc, na.action = "na.impute")
## same as above but iterate the missing data algorithm
pbc.obj3 <- rfsrc(Surv(days, status) ~ ., pbc,
na.action = "na.impute", nimpute = 3)
## fast way to impute data (no inference is done)
## see impute for more details
pbc.imp <- impute(Surv(days, status) ~ ., pbc, splitrule = "random")
##------------------------------------------------------------
## compare RF-SRC to Cox regression
## Illustrates C-error and Brier score measures of performance
## assumes "pec" and "survival" libraries are loaded
##------------------------------------------------------------
if (library("survival", logical.return = TRUE)
& library("pec", logical.return = TRUE)
& library("prodlim", logical.return = TRUE))
{
##prediction function required for pec
predictSurvProb.rfsrc <- function(object, newdata, times, ...){
ptemp <- predict(object,newdata=newdata,...)$survival
pos <- sindex(jump.times = object$time.interest, eval.times = times)
p <- cbind(1,ptemp)[, pos + 1]
if (NROW(p) != NROW(newdata) || NCOL(p) != length(times))
stop("Prediction failed")
p
}
## data, formula specifications
data(pbc, package = "randomForestSRC")
pbc.na <- na.omit(pbc) ##remove NA's
surv.f <- as.formula(Surv(days, status) ~ .)
pec.f <- as.formula(Hist(days,status) ~ 1)
## run cox/rfsrc models
## for illustration we use a small number of trees
cox.obj <- coxph(surv.f, data = pbc.na, x = TRUE)
rfsrc.obj <- rfsrc(surv.f, pbc.na, ntree = 150)
## compute bootstrap cross-validation estimate of expected Brier score
## see Mogensen, Ishwaran and Gerds (2012) Journal of Statistical Software
set.seed(17743)
prederror.pbc <- pec(list(cox.obj,rfsrc.obj), data = pbc.na, formula = pec.f,
splitMethod = "bootcv", B = 50)
print(prederror.pbc)
plot(prederror.pbc)
## compute out-of-bag C-error for cox regression and compare to rfsrc
rfsrc.obj <- rfsrc(surv.f, pbc.na)
cat("out-of-bag Cox Analysis ...", "\n")
cox.err <- sapply(1:100, function(b) {
if (b%%10 == 0) cat("cox bootstrap:", b, "\n")
train <- sample(1:nrow(pbc.na), nrow(pbc.na), replace = TRUE)
cox.obj <- tryCatch({coxph(surv.f, pbc.na[train, ])}, error=function(ex){NULL})
if (!is.null(cox.obj)) {
get.cindex(pbc.na$days[-train], pbc.na$status[-train], predict(cox.obj, pbc.na[-train, ]))
} else NA
})
cat("\n\tOOB error rates\n\n")
cat("\tRSF : ", rfsrc.obj$err.rate[rfsrc.obj$ntree], "\n")
cat("\tCox regression : ", mean(cox.err, na.rm = TRUE), "\n")
}
##------------------------------------------------------------
## competing risks
##------------------------------------------------------------
## WIHS analysis
## cumulative incidence function (CIF) for HAART and AIDS stratified by IDU
data(wihs, package = "randomForestSRC")
wihs.obj <- rfsrc(Surv(time, status) ~ ., wihs, nsplit = 3, ntree = 100)
plot.competing.risk(wihs.obj)
cif <- wihs.obj$cif.oob
Time <- wihs.obj$time.interest
idu <- wihs$idu
cif.haart <- cbind(apply(cif[,,1][idu == 0,], 2, mean),
apply(cif[,,1][idu == 1,], 2, mean))
cif.aids <- cbind(apply(cif[,,2][idu == 0,], 2, mean),
apply(cif[,,2][idu == 1,], 2, mean))
matplot(Time, cbind(cif.haart, cif.aids), type = "l",
lty = c(1,2,1,2), col = c(4, 4, 2, 2), lwd = 3,
ylab = "Cumulative Incidence")
legend("topleft",
legend = c("HAART (Non-IDU)", "HAART (IDU)", "AIDS (Non-IDU)", "AIDS (IDU)"),
lty = c(1,2,1,2), col = c(4, 4, 2, 2), lwd = 3, cex = 1.5)
## illustrates the various splitting rules
## illustrates event specific and non-event specific variable selection
if (library("survival", logical.return = TRUE)) {
## use the pbc data from the survival package
## events are transplant (1) and death (2)
data(pbc, package = "survival")
pbc$id <- NULL
## modified Gray's weighted log-rank splitting
## (equivalent to cause=c(1,1) and splitrule="logrankCR")
pbc.cr <- rfsrc(Surv(time, status) ~ ., pbc)
## log-rank cause-1 specific splitting and targeted VIMP for cause 1
pbc.log1 <- rfsrc(Surv(time, status) ~ ., pbc,
splitrule = "logrankCR", cause = c(1,0), importance = TRUE)
## log-rank cause-2 specific splitting and targeted VIMP for cause 2
pbc.log2 <- rfsrc(Surv(time, status) ~ ., pbc,
splitrule = "logrankCR", cause = c(0,1), importance = TRUE)
## extract VIMP from the log-rank forests: event-specific
## extract minimal depth from the Gray log-rank forest: non-event specific
var.perf <- data.frame(md = max.subtree(pbc.cr)$order[, 1],
vimp1 = 100 * pbc.log1$importance[ ,1],
vimp2 = 100 * pbc.log2$importance[ ,2])
print(var.perf[order(var.perf$md), ], digits = 2)
}
## ------------------------------------------------------------
## regression analysis
## ------------------------------------------------------------
## new York air quality measurements
airq.obj <- rfsrc(Ozone ~ ., data = airquality, na.action = "na.impute")
# partial plot of variables (see plot.variable for more details)
plot.variable(airq.obj, partial = TRUE, smooth.lines = TRUE)
## motor trend cars
mtcars.obj <- rfsrc(mpg ~ ., data = mtcars)
## ------------------------------------------------------------
## regression with custom bootstrap
## ------------------------------------------------------------
ntree <- 25
n <- nrow(mtcars)
s.size <- n / 2
swr <- TRUE
samp <- randomForestSRC:::make.sample(ntree, n, s.size, swr)
o <- rfsrc(mpg ~ ., mtcars, bootstrap = "by.user", samp = samp)
## ------------------------------------------------------------
## classification analysis
## ------------------------------------------------------------
## iris data
iris.obj <- rfsrc(Species ~., data = iris)
## wisconsin prognostic breast cancer data
data(breast, package = "randomForestSRC")
breast.obj <- rfsrc(status ~ ., data = breast, block.size=1)
plot(breast.obj)
## ------------------------------------------------------------
## big data set, reduce number of variables using simple method
## ------------------------------------------------------------
## use Iowa housing data set
data(housing, package = "randomForestSRC")
## original data contains lots of missing data, use fast imputation
## however see impute for other methods
housing2 <- impute(data = housing, fast = TRUE)
## run shallow trees to find variables that split any tree
xvar.used <- rfsrc(SalePrice ~., housing2, ntree = 250, nodedepth = 4,
var.used="all.trees", mtry = Inf, nsplit = 100)$var.used
## now fit forest using filtered variables
xvar.keep <- names(xvar.used)[xvar.used >= 1]
o <- rfsrc(SalePrice~., housing2[, c("SalePrice", xvar.keep)])
print(o)
## ------------------------------------------------------------
## imbalanced classification data
## see the "imbalanced" function for further details
##
## (a) use balanced random forests with undersampling of the majority class
## Specifically let n0, n1 be sample sizes for majority, minority
## cases. We sample 2 x n1 cases with majority, minority cases chosen
## with probabilities n1/n, n0/n where n=n0+n1
##
## (b) balanced random forests using "imbalanced"
##
## (c) q-classifier (RFQ) using "imbalanced"
##
## ------------------------------------------------------------
## Wisconsin breast cancer example
data(breast, package = "randomForestSRC")
breast <- na.omit(breast)
## balanced random forests - brute force
y <- breast$status
obdirect <- rfsrc(status ~ ., data = breast, nsplit = 10,
case.wt = randomForestSRC:::make.wt(y),
sampsize = randomForestSRC:::make.size(y))
print(obdirect)
print(get.imbalanced.performance(obdirect))
## balanced random forests - using "imbalanced"
ob <- imbalanced(status ~ ., data = breast, method = "brf")
print(ob)
print(get.imbalanced.performance(ob))
## q-classifier (RFQ) - using "imbalanced"
oq <- imbalanced(status ~ ., data = breast)
print(oq)
print(get.imbalanced.performance(oq))
## q-classifier (RFQ) - with auc splitting
oqauc <- imbalanced(status ~ ., data = breast, splitrule = "auc")
print(oqauc)
print(get.imbalanced.performance(oqauc))
## ------------------------------------------------------------
## unsupervised analysis
## ------------------------------------------------------------
## two equivalent ways to implement unsupervised forests
mtcars.unspv <- rfsrc(Unsupervised() ~., data = mtcars)
mtcars2.unspv <- rfsrc(data = mtcars)
## illustration of sidClustering for the mtcars data
## see sidClustering for more details
mtcars.sid <- sidClustering(mtcars, k = 1:10)
print(split(mtcars, mtcars.sid$cl[, 3]))
print(split(mtcars, mtcars.sid$cl[, 10]))
## ------------------------------------------------------------
## bivariate regression using Mahalanobis splitting
## also illustrates user specified covariance matrix
## ------------------------------------------------------------
if (library("mlbench", logical.return = TRUE)) {
## load boston housing data, specify the bivariate regression
data(BostonHousing)
f <- formula("Multivar(lstat, nox) ~.")
## Mahalanobis splitting
bh.mreg <- rfsrc(f, BostonHousing, importance = TRUE, splitrule = "mahal")
## performance error and vimp
vmp <- get.mv.vimp(bh.mreg)
pred <- get.mv.predicted(bh.mreg)
## standardized error and vimp
err.std <- get.mv.error(bh.mreg, standardize = TRUE)
vmp.std <- get.mv.vimp(bh.mreg, standardize = TRUE)
## same analysis, but with user specified covariance matrix
sigma <- cov(BostonHousing[, c("lstat","nox")])
bh.mreg2 <- rfsrc(f, BostonHousing, splitrule = "mahal", sigma = sigma)
}
## ------------------------------------------------------------
## multivariate mixed forests (nutrigenomic study)
## study effects of diet, lipids and gene expression for mice
## diet, genotype and lipids used as the multivariate y
## genes used for the x features
## ------------------------------------------------------------
## load the data (data is a list)
data(nutrigenomic, package = "randomForestSRC")
## assemble the multivariate y data
ydta <- data.frame(diet = nutrigenomic$diet,
genotype = nutrigenomic$genotype,
nutrigenomic$lipids)
## multivariate mixed forest call
## uses "get.mv.formula" for conveniently setting formula
mv.obj <- rfsrc(get.mv.formula(colnames(ydta)),
data.frame(ydta, nutrigenomic$genes),
importance=TRUE, nsplit = 10)
## print results for diet and genotype y values
print(mv.obj, outcome.target = "diet")
print(mv.obj, outcome.target = "genotype")
## extract standardized VIMP
svimp <- get.mv.vimp(mv.obj, standardize = TRUE)
## plot standardized VIMP for diet, genotype and lipid for each gene
boxplot(t(svimp), col = "bisque", cex.axis = .7, las = 2,
outline = FALSE,
ylab = "standardized VIMP",
main = "diet/genotype/lipid VIMP for each gene")
## ------------------------------------------------------------
## illustrates yvar.wt which sets the probability of selecting
## the response variables in multivariate regression
## ------------------------------------------------------------
## use mtcars: add fake responses
mult.mtcars <- cbind(mtcars, mtcars$mpg, mtcars$mpg)
names(mult.mtcars) = c(names(mtcars), "mpg2", "mpg3")
## noise up the fake responses
mult.mtcars$mpg2 <- sample(mtcars$mpg)
mult.mtcars$mpg3 <- sample(mtcars$mpg)
formula = as.formula(Multivar(mpg, mpg2, mpg3) ~ .)
## select 2 of the 3 responses randomly at each split with an associated weight vector.
## choose the noisy y responses which should degrade performance
yvar.wt = c(0.000001, 0.5, 0.5)
ytry = 2
mult.grow <- rfsrc(formula = formula, data = mult.mtcars, ytry = ytry, yvar.wt = yvar.wt)
print(mult.grow)
print(get.mv.error(mult.grow))
## Also, compare the following two results, as they should be similar:
yvar.wt = c(1.0, 00000.1, 00000.1)
ytry = 1
result1 = rfsrc(formula = formula, data = mult.mtcars, ytry = ytry, yvar.wt = yvar.wt)
result2 = rfsrc(mpg ~ ., mtcars)
print(get.mv.error(result1))
print(get.mv.error(result2))
## ------------------------------------------------------------
## custom splitting using the pre-coded examples
## ------------------------------------------------------------
## motor trend cars
mtcars.obj <- rfsrc(mpg ~ ., data = mtcars, splitrule = "custom")
## iris analysis
iris.obj <- rfsrc(Species ~., data = iris, splitrule = "custom1")
## WIHS analysis
wihs.obj <- rfsrc(Surv(time, status) ~ ., wihs, nsplit = 3,
ntree = 100, splitrule = "custom1")
# }
Run the code above in your browser using DataLab