# NOT RUN {
library(recipes)
library(hardhat)
# ---------------------------------------------------------------------------
# Setup
# Fit on the first 100 rows of `iris` and hold out the last 50 for testing
train <- head(iris, n = 100L)
test <- tail(iris, n = 50L)
# ---------------------------------------------------------------------------
# Recipes example
# Build the recipe in two steps: declare the model terms, then add a step
# that takes the log of one predictor
rec <- recipe(Species ~ Sepal.Length + Sepal.Width, data = train)
rec <- step_log(rec, Sepal.Length)
processed <- mold(rec, data = train)
# The predictors come back with Sepal.Length on the log scale
processed$predictors
processed$outcomes
# The blueprint carries the prepped recipe
processed$blueprint$recipe
# Hand the test data and the blueprint to forge() so that the test data
# is preprocessed the same way as the training data
forge(new_data = test, blueprint = processed$blueprint)
# Set `outcomes = TRUE` to get the preprocessed outcome back as well.
# The Sepal.Length predictor of `new_data` is logged, as before.
forge(new_data = test, blueprint = processed$blueprint, outcomes = TRUE)
# ---------------------------------------------------------------------------
# With an intercept
# Request an intercept column through the blueprint with `intercept = TRUE`
processed <- mold(
  rec,
  data = train,
  blueprint = default_recipe_blueprint(intercept = TRUE)
)
processed$predictors
# Alternatively, add the intercept with a recipe step.
# Note that this call molds the full `iris` data rather than `train`.
rec2 <- rec %>%
  step_intercept()
mold(rec2, data = iris)$predictors
# ---------------------------------------------------------------------------
# Non standard roles
# Columns with custom recipes roles are assumed to be needed by `prep()`
# and afterwards for modeling, but not by `bake()` or for prediction.
# They are therefore processed and returned in the `$extras$roles` slot of
# the `mold()` result, while `forge()` neither requires them in `new_data`
# nor returns them.
rec_roles <- recipe(train) %>%
  update_role(Sepal.Width, new_role = "predictor") %>%
  update_role(Species, new_role = "outcome") %>%
  update_role(Sepal.Length, new_role = "id") %>%
  update_role(Petal.Length, new_role = "important")
processed_roles <- mold(rec_roles, data = train)
# The `mold()` result keeps the custom-role columns in case the model
# needs them.
processed_roles$extras
# `test` contains the custom-role columns, yet they were not handed to
# `bake()` and do not show up in the output.
forge(new_data = test, blueprint = processed_roles$blueprint)$extras
# Dropping such a column from the new data altogether still works.
test2 <- test
test2[["Petal.Length"]] <- NULL
forge(new_data = test2, blueprint = processed_roles$blueprint)$extras
# Sometimes a custom role really is needed to `bake()` on `new_data`.
# Declare it with `bake_dependent_roles` in `default_recipe_blueprint()`:
# the column then becomes required by `forge()`, is passed on to `bake()`,
# and is returned in the result.
bp <- default_recipe_blueprint(bake_dependent_roles = "important")
processed_roles <- mold(rec_roles, data = train, blueprint = bp)
# `"important"` is now a required role when calling `forge()`
forge(new_data = test, blueprint = processed_roles$blueprint)$extras$roles
# Consequently the data frame without the `Petal.Length` column can no
# longer be forged; `try()` lets the script continue past the error
try(forge(new_data = test2, blueprint = processed_roles$blueprint))
# ---------------------------------------------------------------------------
# Matrix output for predictors
# The `composition` argument of the blueprint controls the type of the
# returned predictor data set
bp <- default_recipe_blueprint(composition = "dgCMatrix")
processed <- mold(rec, data = train, blueprint = bp)
# Inspect the class of the returned predictors
class(processed$predictors)
# }
# Run the code above in your browser using DataLab