# \donttest{
# Attach packages
library(cvms)
library(groupdata2) # fold()
library(dplyr) # %>% arrange() mutate()
# Note: More examples of custom functions can be found at:
# model_fn: model_functions()
# predict_fn: predict_functions()
# preprocess_fn: preprocess_functions()
# Data is part of cvms
data <- participant.scores
# Set seed for reproducibility
set.seed(7)
# Fold data
data <- fold(
data,
k = 4,
cat_col = "diagnosis",
id_col = "participant"
) %>%
mutate(diagnosis = as.factor(diagnosis)) %>%
arrange(.folds)
# Cross-validate multiple formulas
formulas_gaussian <- c(
"score ~ diagnosis",
"score ~ age"
)
formulas_binomial <- c(
"diagnosis ~ score",
"diagnosis ~ age"
)
#
# Gaussian
#
# Create model function that returns a fitted model object
lm_model_fn <- function(train_data, formula, hyperparameters) {
lm(formula = formula, data = train_data)
}
# Create predict function that returns the predictions
lm_predict_fn <- function(test_data, model, formula,
hyperparameters, train_data) {
stats::predict(
object = model,
newdata = test_data,
type = "response",
allow.new.levels = TRUE
)
}
# Cross-validate the model function
cross_validate_fn(
data,
formulas = formulas_gaussian,
type = "gaussian",
model_fn = lm_model_fn,
predict_fn = lm_predict_fn,
fold_cols = ".folds"
)
#
# Binomial
#
# Create model function that returns a fitted model object
glm_model_fn <- function(train_data, formula, hyperparameters) {
glm(formula = formula, data = train_data, family = "binomial")
}
# Create predict function that returns the predictions
glm_predict_fn <- function(test_data, model, formula,
hyperparameters, train_data) {
stats::predict(
object = model,
newdata = test_data,
type = "response",
allow.new.levels = TRUE
)
}
# Cross-validate the model function
cross_validate_fn(
data,
formulas = formulas_binomial,
type = "binomial",
model_fn = glm_model_fn,
predict_fn = glm_predict_fn,
fold_cols = ".folds"
)
#
# Support Vector Machine (svm)
# with hyperparameter tuning
#
# Only run if the `e1071` package is installed
if (requireNamespace("e1071", quietly = TRUE)){
# Create model function that returns a fitted model object
# We use the hyperparameters arg to pass in the kernel and cost values
svm_model_fn <- function(train_data, formula, hyperparameters) {
# Expected hyperparameters:
# - kernel
# - cost
if (!"kernel" %in% names(hyperparameters))
stop("'hyperparameters' must include 'kernel'")
if (!"cost" %in% names(hyperparameters))
stop("'hyperparameters' must include 'cost'")
e1071::svm(
formula = formula,
data = train_data,
kernel = hyperparameters[["kernel"]],
cost = hyperparameters[["cost"]],
scale = FALSE,
type = "C-classification",
probability = TRUE
)
}
# Create predict function that returns the predictions
svm_predict_fn <- function(test_data, model, formula,
hyperparameters, train_data) {
predictions <- stats::predict(
object = model,
newdata = test_data,
allow.new.levels = TRUE,
probability = TRUE
)
# Extract probabilities
probabilities <- dplyr::as_tibble(
attr(predictions, "probabilities")
)
# Return second column
probabilities[[2]]
}
# Specify hyperparameters to try
# The optional ".n" samples 4 combinations
svm_hparams <- list(
".n" = 4,
"kernel" = c("linear", "radial"),
"cost" = c(1, 5, 10)
)
# Cross-validate the model function
cv <- cross_validate_fn(
data,
formulas = formulas_binomial,
type = "binomial",
model_fn = svm_model_fn,
predict_fn = svm_predict_fn,
hyperparameters = svm_hparams,
fold_cols = ".folds"
)
cv
# The `HParams` column has the nested hyperparameter values
cv %>%
select(Dependent, Fixed, HParams, `Balanced Accuracy`, F1, AUC, MCC) %>%
tidyr::unnest(cols = "HParams") %>%
arrange(desc(`Balanced Accuracy`), desc(F1))
#
# Use parallelization
# The below examples show the speed gains when running in parallel
#
# Attach doParallel and register four cores
# Uncomment:
# library(doParallel)
# registerDoParallel(4)
# Specify hyperparameters such that we will
# cross-validate 20 models
hparams <- list(
"kernel" = c("linear", "radial"),
"cost" = 1:5
)
# Cross-validate a list of 20 models in parallel
# Make sure to uncomment the parallel argument
system.time({
cross_validate_fn(
data,
formulas = formulas_gaussian,
type = "gaussian",
model_fn = svm_model_fn,
predict_fn = svm_predict_fn,
hyperparameters = hparams,
fold_cols = ".folds"
#, parallel = TRUE # Uncomment
)
})
# Cross-validate a list of 20 models sequentially
system.time({
cross_validate_fn(
data,
formulas = formulas_gaussian,
type = "gaussian",
model_fn = svm_model_fn,
predict_fn = svm_predict_fn,
hyperparameters = hparams,
fold_cols = ".folds"
#, parallel = TRUE # Uncomment
)
})
} # closes `e1071` package check
# }
Run the code above in your browser using DataLab