if (FALSE) {
## simulate data
set.seed(23432)
## training set
n <- 500
p <- 50
X <- matrix(rnorm(n*p), nrow = n, ncol = p)
colnames(X) <- paste("X", 1:p, sep="")
X <- data.frame(X)
Y <- X[, 1] + sqrt(abs(X[, 2] * X[, 3])) + X[, 2] - X[, 3] + rnorm(n)
## test set
m <- 1000
newX <- matrix(rnorm(m*p), nrow = m, ncol = p)
colnames(newX) <- paste("X", 1:p, sep="")
newX <- data.frame(newX)
newY <- newX[, 1] + sqrt(abs(newX[, 2] * newX[, 3])) + newX[, 2] -
  newX[, 3] + rnorm(m)
# specify the library and run SuperLearner
SL.library <- c("SL.glm", "SL.randomForest", "SL.gam",
                "SL.polymars", "SL.mean")
test <- SuperLearner(Y = Y, X = X, newX = newX, SL.library = SL.library,
                     verbose = TRUE, method = "method.NNLS")
test
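# inspect the fit: cvRisk holds each learner's cross-validated risk and
# coef the ensemble weights; since newX was supplied, SL.predict holds
# test-set predictions. A quick sketch of test-set MSE using newY:
test$cvRisk
test$coef
mean((newY - test$SL.predict)^2)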
# library with screening
SL.library <- list(c("SL.glmnet", "All"),
                   c("SL.glm", "screen.randomForest", "All", "screen.SIS"),
                   "SL.randomForest", c("SL.polymars", "All"), "SL.mean")
test <- SuperLearner(Y = Y, X = X, newX = newX, SL.library = SL.library,
                     verbose = TRUE, method = "method.NNLS")
test
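# sketch for checking the screening step: whichScreen records which
# predictors each screening algorithm retained, so screen.SIS and
# screen.randomForest can be compared against "All"
test$whichScreen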
# binary outcome
set.seed(1)
N <- 200
X <- matrix(rnorm(N*10), N, 10)
X <- as.data.frame(X)
Y <- rbinom(N, 1, plogis(.2*X[, 1] + .1*X[, 2] - .2*X[, 3] +
                         .1*X[, 3]*X[, 4] - .2*abs(X[, 4])))
SL.library <- c("SL.glmnet", "SL.glm", "SL.knn", "SL.gam", "SL.mean")
# least squares loss function
test.NNLS <- SuperLearner(Y = Y, X = X, SL.library = SL.library,
                          verbose = TRUE, method = "method.NNLS",
                          family = binomial())
test.NNLS
# negative log binomial likelihood loss function
test.NNloglik <- SuperLearner(Y = Y, X = X, SL.library = SL.library,
                              verbose = TRUE, method = "method.NNloglik",
                              family = binomial())
test.NNloglik
# 1 - AUC loss function
test.AUC <- SuperLearner(Y = Y, X = X, SL.library = SL.library,
                         verbose = TRUE, method = "method.AUC",
                         family = binomial())
test.AUC
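# side-by-side ensemble weights under the three loss functions; the
# weights generally differ because each method minimizes a different
# cross-validated risk
cbind(NNLS = test.NNLS$coef, NNloglik = test.NNloglik$coef,
      AUC = test.AUC$coef)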
# Example 2: p > n binary outcome, adapted from library(SIS)
set.seed(1)
# training
b <- c(2, 2, 2, -3*sqrt(2))
n <- 150
p <- 200
truerho <- 0.5
corrmat <- diag(rep(1-truerho, p)) + matrix(truerho, p, p)
corrmat[, 4] <- sqrt(truerho)
corrmat[4, ] <- sqrt(truerho)
corrmat[4, 4] <- 1
cholmat <- chol(corrmat)
x <- matrix(rnorm(n*p, mean=0, sd=1), n, p)
x <- x %*% cholmat  # impose the correlation structure on the predictors
feta <- x[, 1:4] %*% b  # linear predictor from the first four columns
fprob <- exp(feta) / (1 + exp(feta))
y <- rbinom(n, 1, fprob)
# test
m <- 10000
newx <- matrix(rnorm(m*p, mean=0, sd=1), m, p)
newx <- newx %*% cholmat
newfeta <- newx[, 1:4] %*% b
newfprob <- exp(newfeta) / (1 + exp(newfeta))
newy <- rbinom(m, 1, newfprob)
DATA2 <- data.frame(Y = y, X = x)
newDATA2 <- data.frame(Y = newy, X = newx)
create.SL.knn <- function(k = c(20, 30)) {
  # build an SL.knn.<k> wrapper for each requested neighborhood size,
  # assigning it into the global environment so SuperLearner can find it
  for(mm in seq(length(k))) {
    eval(parse(text = paste('SL.knn.', k[mm], ' <- function(..., k = ', k[mm],
                            ') SL.knn(..., k = k)', sep = '')),
         envir = .GlobalEnv)
  }
  invisible(TRUE)
}
create.SL.knn(c(20, 30, 40, 50, 60, 70))
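# each generated wrapper is a thin function around SL.knn with the default
# k fixed; printing one shows the generated definition
SL.knn.20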
# library with screening
SL.library <- list(c("SL.glmnet", "All"), c("SL.glm", "screen.randomForest"),
                   "SL.randomForest", "SL.knn", "SL.knn.20", "SL.knn.30",
                   "SL.knn.40", "SL.knn.50", "SL.knn.60", "SL.knn.70",
                   c("SL.polymars", "screen.randomForest"))
test <- SuperLearner(Y = DATA2$Y, X = DATA2[, -1], newX = newDATA2[, -1],
                     SL.library = SL.library, verbose = TRUE, family = binomial())
test
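# with newX supplied, SL.predict contains predicted probabilities for the
# 10000 test observations; a sketch of the test-set Brier score
mean((newDATA2$Y - test$SL.predict)^2)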
## examples with multicore
set.seed(23432, "L'Ecuyer-CMRG")  # L'Ecuyer-CMRG gives reproducible parallel RNG streams; see ?set.seed
## training set
n <- 500
p <- 50
X <- matrix(rnorm(n*p), nrow = n, ncol = p)
colnames(X) <- paste("X", 1:p, sep="")
X <- data.frame(X)
Y <- X[, 1] + sqrt(abs(X[, 2] * X[, 3])) + X[, 2] - X[, 3] + rnorm(n)
## test set
m <- 1000
newX <- matrix(rnorm(m*p), nrow = m, ncol = p)
colnames(newX) <- paste("X", 1:p, sep="")
newX <- data.frame(newX)
newY <- newX[, 1] + sqrt(abs(newX[, 2] * newX[, 3])) + newX[, 2] - newX[, 3] + rnorm(m)
# specify the library and run SuperLearner
SL.library <- c("SL.glm", "SL.randomForest", "SL.gam",
                "SL.polymars", "SL.mean")
testMC <- mcSuperLearner(Y = Y, X = X, newX = newX, SL.library = SL.library,
                         method = "method.NNLS")
testMC
## examples with snow
library(parallel)
cl <- makeCluster(2, type = "PSOCK") # can use different types here
clusterSetRNGStream(cl, iseed = 2343)
# make SL functions available on the clusters, use assignment to avoid printing
foo <- clusterEvalQ(cl, library(SuperLearner))
testSNOW <- snowSuperLearner(cluster = cl, Y = Y, X = X, newX = newX,
                             SL.library = SL.library, method = "method.NNLS")
testSNOW
stopCluster(cl)
## snow example with user-generated wrappers
# If you write your own wrappers and are using snowSuperLearner()
# These new wrappers need to be added to the SuperLearner namespace and exported to the clusters
# Using a simple example here, but can define any new SuperLearner wrapper
my.SL.wrapper <- function(...) SL.glm(...)
# assign function into SuperLearner namespace
environment(my.SL.wrapper) <- asNamespace("SuperLearner")
cl <- makeCluster(2, type = "PSOCK") # can use different types here
clusterSetRNGStream(cl, iseed = 2343)
# make SL functions available on the clusters, use assignment to avoid printing
foo <- clusterEvalQ(cl, library(SuperLearner))
clusterExport(cl, c("my.SL.wrapper")) # copy the function to all clusters
testSNOW <- snowSuperLearner(cluster = cl, Y = Y, X = X, newX = newX,
                             SL.library = c("SL.glm", "SL.mean", "my.SL.wrapper"),
                             method = "method.NNLS")
testSNOW
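# the user-defined wrapper appears in the output alongside the built-in
# learners, e.g. in the ensemble weights
testSNOW$coef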
stopCluster(cl)
## timing
replicate(5, system.time(SuperLearner(Y = Y, X = X, newX = newX,
  SL.library = SL.library, method = "method.NNLS")))
replicate(5, system.time(mcSuperLearner(Y = Y, X = X, newX = newX,
  SL.library = SL.library, method = "method.NNLS")))
cl <- makeCluster(2, type = "PSOCK")
# make SL functions available on the clusters, use assignment to avoid printing
foo <- clusterEvalQ(cl, library(SuperLearner))
replicate(5, system.time(snowSuperLearner(cl, Y = Y, X = X, newX = newX,
  SL.library = SL.library, method = "method.NNLS")))
stopCluster(cl)
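# to summarize the timings, capture the replicate() matrices instead of
# printing them (hypothetical names t.seq, t.mc, t.snow) and compare the
# mean elapsed times:
# c(sequential = mean(t.seq["elapsed", ]),
#   multicore = mean(t.mc["elapsed", ]),
#   snow = mean(t.snow["elapsed", ]))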
}