# ---------------------------------------------------------------------------
# Setup
# NOTE(review): mold(), shrink() and scream() are not defined in this
# script; presumably they come from the hardhat package, which would need
# to be attached before running this -- confirm.

# Rows 1-100 of iris are used for fitting, rows 101-150 for testing
train <- iris[seq_len(100), ]
test <- iris[seq.int(101, 150), ]

# At model fit time mold() is called, and it records a formula
# preprocessing blueprint
x <- mold(log(Sepal.Width) ~ Species, data = train)

# The result of mold() carries the prototype tibbles for both the
# predictors and the outcomes
ptype_pred <- x[["blueprint"]][["ptypes"]][["predictors"]]
ptype_out <- x[["blueprint"]][["ptypes"]][["outcomes"]]
# ---------------------------------------------------------------------------
# shrink() / scream()

# Give shrink() the test data together with a prototype and it
# extracts the prototype's columns
test_shrunk <- shrink(data = test, ptype = ptype_pred)

# Hand the shrunken data to scream() to run the validation checks.
# The checks succeeded if no warnings / errors are thrown.
scream(data = test_shrunk, ptype = ptype_pred)
# ---------------------------------------------------------------------------
# Outcomes

# The outcomes are extracted the same way, this time using the
# outcome prototype
test_outcome <- shrink(data = test, ptype = ptype_out)
scream(data = test_outcome, ptype = ptype_out)
# ---------------------------------------------------------------------------
# Casting

# scream() relies on vctrs::vec_cast() to intelligently convert new
# data to the prototype. As a result, certain conversions happen
# automatically, such as coercing character columns to factors.
test2 <- test
test2[["Species"]] <- as.character(test[["Species"]])
test2_shrunk <- shrink(data = test2, ptype = ptype_pred)
scream(data = test2_shrunk, ptype = ptype_pred)

# Missing factor levels can be recovered as well.
# For example, the test data could plausibly contain only the
# "virginica" level.
test3 <- test
test3[["Species"]] <- factor(test[["Species"]], levels = "virginica")
test3_shrunk <- shrink(data = test3, ptype = ptype_pred)
test3_fixed <- scream(data = test3_shrunk, ptype = ptype_pred)

# The missing levels were recovered by scream()
levels(test3_fixed[["Species"]])
# ---------------------------------------------------------------------------
# Novel levels

# By default, novel levels that have any data in `data` are coerced to
# `NA` values, and a warning is issued.

# Build a copy of the test data whose first Species value is a level
# the prototype does not know about
test4 <- test
test4[["Species"]] <- factor(
  replace(as.character(test[["Species"]]), 1, "new_level"),
  levels = c(levels(test[["Species"]]), "new_level")
)
test4 <- shrink(data = test4, ptype = ptype_pred)

# This call throws a warning
test4_removed <- scream(data = test4, ptype = ptype_pred)

# The novel level has been removed
levels(test4_removed[["Species"]])

# This call throws no warning
test4_kept <- scream(
  data = test4,
  ptype = ptype_pred,
  allow_novel_levels = TRUE
)

# The novel level has been kept
levels(test4_kept[["Species"]])
# Run the code above in your browser using DataLab