# NOT RUN {
## ------------------------------------------------------------
## use the breast data for illustration
## ------------------------------------------------------------
data(breast, package = "randomForestSRC")
breast <- na.omit(breast)
f <- as.formula(status ~ .)
##----------------------------------------------------------------
## default RFQ call
##----------------------------------------------------------------
o.rfq <- imbalanced(f, breast)
print(o.rfq)
## equivalent to:
## rfsrc(f, breast, rfq = TRUE, perf.type = "g.mean")
##----------------------------------------------------------------
## RFQ with AUC splitting
##----------------------------------------------------------------
print(imbalanced(f, breast, splitrule = "auc"))
##----------------------------------------------------------------
## RFQ call with fast rfsrc
##----------------------------------------------------------------
o.rfq <- imbalanced(f, breast, fast = TRUE)
print(o.rfq)
## equivalent to:
## rfsrc.fast(f, breast, rfq = TRUE, perf.type = "g.mean")
##-----------------------------------------------------------------
## standard RF (uses misclassification)
## ------------------------------------------------------------
o.std <- imbalanced(f, breast, method = "stand")
##-----------------------------------------------------------------
## standard RF using G-mean performance
## ------------------------------------------------------------
o.std <- imbalanced(f, breast, method = "stand", perf.type = "g.mean")
## equivalent to:
## rfsrc(f, breast, perf.type = "g.mean")
##----------------------------------------------------------------
## default BRF call
##----------------------------------------------------------------
o.brf <- imbalanced(f, breast, method = "brf")
## equivalent to:
## imbalanced(f, breast, method = "brf", perf.type = "g.mean")
##----------------------------------------------------------------
## BRF call with misclassification performance
##----------------------------------------------------------------
o.brf <- imbalanced(f, breast, method = "brf", perf.type = "misclass")
##----------------------------------------------------------------
## RFQ with optimized threshold
##----------------------------------------------------------------
o.rfq.opt <- imbalanced(f, breast, optimize = TRUE)
plot(o.rfq.opt$gmean, type = "l")
##----------------------------------------------------------------
## train/test example
##----------------------------------------------------------------
trn <- sample(1:nrow(breast), size = nrow(breast) / 2)
o.trn <- imbalanced(f, breast[trn,], importance = TRUE)
o.tst <- predict(o.trn, breast[-trn,], importance = TRUE)
print(o.trn)
print(o.tst)
print(100 * cbind(o.trn$impo[, 1], o.tst$impo[, 1]))
##----------------------------------------------------------------
## simulation example using the caret R-package
## creates imbalanced data by randomly sampling the class 1 data
##
## uses SMOTE from "imbalance" package to oversample the minority
## illustrates RFQ with and without SMOTE
##
##----------------------------------------------------------------
if (library("caret", logical.return = TRUE) &
library("imbalance", logical.return = TRUE)) {
## experimental settings
n <- 5000
q <- 20
ir <- 6
f <- as.formula(Class ~ .)
## simulate the data, create minority class data
d <- twoClassSim(n, linearVars = 15, noiseVars = q)
d$Class <- factor(as.numeric(d$Class) - 1)
idx.0 <- which(d$Class == 0)
idx.1 <- sample(which(d$Class == 1), sum(d$Class == 1) / ir , replace = FALSE)
d <- d[c(idx.0,idx.1),, drop = FALSE]
d <- d[sample(1:nrow(d)), ]
## define train/test split
trn <- sample(1:nrow(d), size = nrow(d) / 2, replace = FALSE)
## now make SMOTE training data
newd.50 <- mwmote(d[trn, ], numInstances = 50, classAttr = "Class")
newd.500 <- mwmote(d[trn, ], numInstances = 500, classAttr = "Class")
## fit RFQ with and without SMOTE
o.with.50 <- imbalanced(f, rbind(d[trn, ], newd.50))
o.with.500 <- imbalanced(f, rbind(d[trn, ], newd.500))
o.without <- imbalanced(f, d[trn, ])
## compare performance on test data
print(predict(o.with.50, d[-trn, ]))
print(predict(o.with.500, d[-trn, ]))
print(predict(o.without, d[-trn, ]))
}
##----------------------------------------------------------------
## simulation example using the caret R-package simar to above
##
## illustrates effectiveness of blocked VIMP
##
##----------------------------------------------------------------
if (library("caret", logical.return = TRUE)) {
## experimental settings
n <- 1000
q <- 20
ir <- 6
f <- as.formula(Class ~ .)
## simulate the data, create minority class data
d <- twoClassSim(n, linearVars = 15, noiseVars = q)
d$Class <- factor(as.numeric(d$Class) - 1)
idx.0 <- which(d$Class == 0)
idx.1 <- sample(which(d$Class == 1), sum(d$Class == 1) / ir , replace = FALSE)
d <- d[c(idx.0,idx.1),, drop = FALSE]
## VIMP for BRF with and without blocking
## blocked VIMP is a hybrid of Breiman-Cutler/Ishwaran-Kogalur VIMP
brf <- imbalanced(f, d, method = "brf", importance = TRUE, block.size = 1)
brfB <- imbalanced(f, d, method = "brf", importance = TRUE, block.size = 10)
## VIMP for RFQ with and without blocking
rfq <- imbalanced(f, d, importance = TRUE, block.size = 1)
rfqB <- imbalanced(f, d, importance = TRUE, block.size = 10)
## compare VIMP values
imp <- 100 * cbind(brf$importance[, 1], brfB$importance[, 1],
rfq$importance[, 1], rfqB$importance[, 1])
legn <- c("BRF", "BRF-block", "RFQ", "RFQ-block")
colr <- rep(4,20+q)
colr[1:20] <- 2
ylim <- range(c(imp))
nms <- 1:(20+q)
par(mfrow=c(2,2))
barplot(imp[,1],col=colr,las=2,main=legn[1],ylim=ylim,names.arg=nms)
barplot(imp[,2],col=colr,las=2,main=legn[2],ylim=ylim,names.arg=nms)
barplot(imp[,3],col=colr,las=2,main=legn[3],ylim=ylim,names.arg=nms)
barplot(imp[,4],col=colr,las=2,main=legn[4],ylim=ylim,names.arg=nms)
}
# }
Run the code above in your browser using DataLab