# ---------------------------------------------------------------------------
# Setup

# Attach hardhat so that mold(), forge(), and default_formula_blueprint()
# resolve when this script is run on its own (this is a no-op when the
# package is already attached), then load the example data that supplies
# the `example_train` and `example_test` objects used throughout.
library(hardhat)
data("hardhat-example-data")
# ---------------------------------------------------------------------------
# Formula Example

# Preprocess the training data with mold(), using a formula blueprint
# that requests an intercept
processed <- mold(
  log(num_1) ~ num_2 + fac_1,
  data = example_train,
  blueprint = default_formula_blueprint(intercept = TRUE)
)

# Hand the resulting blueprint and the test data to forge() so the test
# data is preprocessed the same way as the training data
forge(new_data = example_test, blueprint = processed$blueprint)

# Pass `outcomes = TRUE` to get the preprocessed outcome back as well
forge(
  new_data = example_test,
  blueprint = processed$blueprint,
  outcomes = TRUE
)
# ---------------------------------------------------------------------------
# Factors without an intercept

# The default blueprint does not add an intercept
processed <- mold(num_1 ~ fac_1 + fac_2, data = example_train)

# As a result, the first factor in the formula is expanded into one column
# per level (all `K` of them), while each later factor is expanded into
# only `K - 1` columns.
processed$predictors

# Above, `fac_1` is expanded into all three columns while `fac_2` is not.
# `model.matrix()` is the source of this behavior. It is somewhat known in
# the R community, but the resulting model can be hard to interpret because
# the corresponding p-values test wildly different hypotheses.

# Request `indicators = "one_hot"` to get every indicator column for every
# factor, irrespective of the intercept
processed <- mold(
  num_1 ~ fac_1 + fac_2,
  data = example_train,
  blueprint = default_formula_blueprint(indicators = "one_hot")
)

processed$predictors

# The formula method cannot build a no-intercept model in which all factors
# are expanded into `K - 1` columns. If that model is required, a recipe
# could be used to construct it.
# ---------------------------------------------------------------------------
# Global variables

# A vector of ones living in the global environment, one per training row
y <- rep(1, nrow(example_train))

# Base R lets a model formula refer to global variables such as `y`
frame <- model.frame(fac_1 ~ y + num_2, data = example_train)
head(frame)

# mold() does not allow this and throws an error instead
try(
  mold(fac_1 ~ y + num_2, data = example_train)
)
# ---------------------------------------------------------------------------
# Dummy variables and interactions

# Factor expansion and interaction creation both happen by default, and
# both go through `model.matrix()`. Some models (tree based models, for
# example) can take factors directly yet may still want the formula
# method. For those, set `indicators = "none"` so that `model.matrix()` is
# not run on factor columns. Interactions remain allowed and are run on
# numeric columns.
bp_no_indicators <- default_formula_blueprint(indicators = "none")

processed <- mold(
  ~ fac_1 + num_1:num_2,
  data = example_train,
  blueprint = bp_no_indicators
)

processed$predictors

# With `indicators = "none"`, a factor that appears in an interaction term
# or inside an inline function triggers an informative error
try(
  mold(
    num_1 ~ num_2:fac_1,
    data = example_train,
    blueprint = bp_no_indicators
  )
)

try(
  mold(
    num_1 ~ paste0(fac_1),
    data = example_train,
    blueprint = bp_no_indicators
  )
)
# ---------------------------------------------------------------------------
# Multivariate outcomes

# Put several terms on the LHS to specify a multivariate formula
processed <- mold(num_1 + log(num_2) ~ fac_1, data = example_train)
processed$outcomes

# Inline functions on the LHS are evaluated, and any matrix they return is
# flattened (as happens in `model.matrix()`), which essentially means the
# tibble never ends up holding matrix columns
processed <- mold(poly(num_2, degree = 2) ~ fac_1, data = example_train)
processed$outcomes

# TRUE
ncol(processed$outcomes) == 2

# A multivariate formula given to mold() carries over into forge()
forge(
  new_data = example_test,
  blueprint = processed$blueprint,
  outcomes = TRUE
)
# ---------------------------------------------------------------------------
# Offsets

# Base R handles offsets specially, so they get special treatment here
# too. Add one with the inline function `offset()`.
processed <- mold(num_1 ~ offset(num_2) + fac_1, data = example_train)
processed$extras$offset

# Several offsets may be supplied; they are added together
processed <- mold(
  num_1 ~ offset(num_2) + offset(num_3),
  data = example_train
)

identical(
  processed$extras$offset$.offset,
  example_train[["num_2"]] + example_train[["num_3"]]
)

# Forging test data also requires the offset columns and includes the
# offset in the result
forge(new_data = example_test, blueprint = processed$blueprint)
# ---------------------------------------------------------------------------
# Intercept only

# `1` and `0` are intercept modifying terms, so they are not allowed in the
# formula; the blueprint's `intercept` argument controls the intercept
# instead. For an intercept only formula, supply `NULL` on the RHS.
mold(
  ~NULL,
  data = example_train,
  blueprint = default_formula_blueprint(intercept = TRUE)
)
# ---------------------------------------------------------------------------
# Matrix output for predictors

# The `composition` blueprint argument changes the type of the predictor
# data set
bp <- default_formula_blueprint(composition = "dgCMatrix")

processed <- mold(
  log(num_1) ~ num_2 + fac_1,
  data = example_train,
  blueprint = bp
)

class(processed$predictors)
# Run the code above in your browser using DataLab