# ---------------------------------------------------------------------------
# Setup
# The calls further down (mold(), forge(), default_xy_blueprint()) are not
# defined in this script; they are expected to already be in scope
# (presumably via library(hardhat) -- confirm for your session).
#
# Split `iris` by position: the first 100 rows are used for training and
# rows 101 through 150 for testing.
train <- iris[seq_len(100), ]
test <- iris[seq(from = 101, to = 150), ]
# Keep the predictor and the outcome as single-column data frames
# (drop = FALSE prevents the column from collapsing to a bare vector).
train_x <- train[, "Sepal.Length", drop = FALSE]
train_y <- train[, "Species", drop = FALSE]
test_x <- test[, "Sepal.Length", drop = FALSE]
test_y <- test[, "Species", drop = FALSE]
# ---------------------------------------------------------------------------
# XY Example
# Step 1: preprocess the training predictors and outcomes with mold()
processed <- mold(x = train_x, y = train_y)
# Step 2: hand the resulting blueprint to forge() together with the test
# predictors so that they are preprocessed the same way as the training data
forge(new_data = test_x, blueprint = processed[["blueprint"]])
# ---------------------------------------------------------------------------
# Intercept
# Same workflow as the XY example, but with a blueprint created with
# `intercept = TRUE`
processed <- mold(
  x = train_x,
  y = train_y,
  blueprint = default_xy_blueprint(intercept = TRUE)
)
forge(new_data = test_x, blueprint = processed[["blueprint"]])
# ---------------------------------------------------------------------------
# XY Method and forge(outcomes = TRUE)
# forge() can preprocess the outcome columns as well, provided those columns
# are actually present in `new_data`.
processed <- mold(x = train_x, y = train_y)
# Not allowed: `test_x` holds only the predictor column, so the call is
# wrapped in try() to let the script continue past the error
try(
  forge(
    new_data = test_x,
    blueprint = processed[["blueprint"]],
    outcomes = TRUE
  )
)
# Pass the full test set instead, which still includes the `y` column
forge(
  new_data = test,
  blueprint = processed[["blueprint"]],
  outcomes = TRUE
)
# Under the XY method, when the Y value given to `mold()` is a vector, a
# column named `.outcome` is generated automatically. That is the name
# forge() then looks for in `new_data`.
# Extract the outcome as a plain vector instead of a data frame
y_vec <- train_y[["Species"]]
processed_vec <- mold(x = train_x, y = y_vec)
# This throws an informative error that tells you
# to include an `".outcome"` column in `new_data`.
try(
  forge(
    new_data = iris,
    blueprint = processed_vec[["blueprint"]],
    outcomes = TRUE
  )
)
# Build a copy of the test set whose outcome column is called `.outcome`
# and which no longer has a `Species` column
test2 <- test
test2[[".outcome"]] <- test2[["Species"]]
test2[["Species"]] <- NULL
# This succeeds and returns a tibble in the $outcomes slot
forge(
  new_data = test2,
  blueprint = processed_vec[["blueprint"]],
  outcomes = TRUE
)
# ---------------------------------------------------------------------------
# Matrix output for predictors
# The `composition` argument of the blueprint controls the type of the
# predictor data set returned by mold()
bp <- default_xy_blueprint(composition = "dgCMatrix")
processed <- mold(x = train_x, y = train_y, blueprint = bp)
# Inspect the class of the molded predictors
class(processed[["predictors"]])
# Run the code above in your browser using DataLab