# NOT RUN {
# ---------------------------------------------------------------------------
# Setup
# Split iris by row position: rows 1 through 100 form the training set
# and rows 101 through 150 form the test set.
train <- iris[seq_len(100), ]
test <- iris[seq(101, 150), ]
# ---------------------------------------------------------------------------
# Formula Example
# mold() preprocesses the training data according to the formula,
# using a formula blueprint that requests an intercept
processed <- mold(
  log(Sepal.Width) ~ Sepal.Length + Species,
  data = train,
  blueprint = default_formula_blueprint(intercept = TRUE)
)
# forge() takes the blueprint returned by mold() along with the test
# data, so the test data is preprocessed in the same way
forge(new_data = test, blueprint = processed$blueprint)
# Passing `outcomes = TRUE` also extracts the preprocessed outcome
forge(new_data = test, blueprint = processed$blueprint, outcomes = TRUE)
# ---------------------------------------------------------------------------
# Factors without an intercept
# By default, mold() does not add an intercept
processed <- mold(Sepal.Width ~ Species, data = train)
# As a result, each factor column is completely expanded into
# `K` columns, where `K` is its number of levels
processed$predictors
# ---------------------------------------------------------------------------
# Global variables
y <- rep(1, times = nrow(train))
# In base R, global variables are allowed in a model formula
frame <- model.frame(Species ~ y + Sepal.Length, train)
head(frame)
# mold() does not allow them, and throws an error.
# The message is read with conditionMessage(), the documented accessor
# for condition objects, rather than by reaching into the `message` field.
tryCatch(
  expr = mold(Species ~ y + Sepal.Length, train),
  error = function(e) print(conditionMessage(e))
)
# ---------------------------------------------------------------------------
# Dummy variables and interactions
# By default, factor columns are expanded
# and interactions are created, both by
# calling model.matrix(). Some models (like
# tree based models) can take factors directly
# but still might want to use the formula method.
# In those cases, set `indicators = "none"` to not
# run model.matrix() on factor columns. Interactions
# are still allowed and are run on numeric columns.
# NOTE: hardhat documents `indicators` as a string ("traditional", "none",
# or "one_hot"); "none" replaces the older logical spelling `FALSE`.
blueprint_no_indicators <- default_formula_blueprint(indicators = "none")
processed <- mold(
  ~ Species + Sepal.Width:Sepal.Length,
  train,
  blueprint = blueprint_no_indicators
)
processed$predictors
# An informative error is thrown when `indicators = "none"` and
# factors are present in interaction terms or in inline functions
try(
  mold(
    Sepal.Width ~ Sepal.Length:Species,
    train,
    blueprint = blueprint_no_indicators
  )
)
try(
  mold(
    Sepal.Width ~ paste0(Species),
    train,
    blueprint = blueprint_no_indicators
  )
)
# ---------------------------------------------------------------------------
# Multivariate outcomes
# More than one outcome can be given on the LHS of the formula
processed <- mold(Sepal.Width + log(Sepal.Length) ~ Species, data = train)
processed$outcomes
# LHS inline functions are run; any matrix they return is flattened
# (like what happens in `model.matrix()`), which means the outcomes
# tibble does not end up holding columns that are matrices
processed <- mold(poly(Sepal.Length, degree = 2) ~ Species, data = train)
processed$outcomes
# TRUE
ncol(processed$outcomes) == 2L
# A multivariate formula specified in mold()
# carries over into forge()
forge(new_data = test, blueprint = processed$blueprint, outcomes = TRUE)
# ---------------------------------------------------------------------------
# Offsets
# Base R handles offsets specially, so they receive special treatment
# here too. Add one by wrapping a column in the inline function offset()
processed <- mold(Sepal.Width ~ offset(Sepal.Length) + Species, data = train)
processed$extras$offset
# When multiple offsets are included, they are added together
processed <- mold(
  Sepal.Width ~ offset(Sepal.Length) + offset(Petal.Width),
  data = train
)
identical(
  processed$extras$offset$.offset,
  with(train, Sepal.Length + Petal.Width)
)
# Forging the test data also requires the offset
# and includes it in the result
forge(new_data = test, blueprint = processed$blueprint)
# ---------------------------------------------------------------------------
# Intercept only
# `1` and `0` are intercept modifying terms, so they are not allowed
# in the formula; the blueprint's `intercept` argument controls the
# intercept instead. For an intercept only formula, supply `NULL`
# on the RHS of the formula.
mold(
  ~ NULL,
  data = train,
  blueprint = default_formula_blueprint(intercept = TRUE)
)
# }
# Run the code above in your browser using DataLab