# Attach packages
library(groupdata2)
library(dplyr)
# Set seed
# Use `xpectr::set_test_seed()` when `xpectr` is installed; otherwise
# fall back to base `set.seed()` so the example is still reproducible
# (the exact random draws may differ between the two paths)
if (requireNamespace("xpectr", quietly = TRUE)) {
  xpectr::set_test_seed(42)
} else {
  set.seed(42)
}
# Build the example data: 20 participants with 3 rows each (60 rows)
# `age` is drawn once per participant and repeated for all three rows,
# while `answer` and `score` are drawn separately for every row
df <- data.frame(
  participant = factor(rep(seq_len(20), times = 3)),
  age = rep(sample(1:100, 20), times = 3),
  answer = factor(sample(letters[1:4], 60, replace = TRUE)),
  score = sample(1:100, 60)
)
# Order rows by participant so each participant's three rows are adjacent
df <- dplyr::arrange(df, participant)
# Label the three rows of each participant as sessions "1", "2" and "3"
df$session <- rep(as.character(1:3), times = 20)
# Randomly keep 53 of the 60 rows to get
# unequal sizes per participant
df <- dplyr::sample_n(df, size = 53)
# Create the initial groups (to be collapsed)
# These are stored in the `.folds` column used as `group_cols` below
# NOTE: We ungroup the result right away, as `collapse_groups()`
# would otherwise be applied to each fold separately!
df <- df %>%
  fold(
    k = 8,
    method = "n_dist",
    id_col = "participant"
  ) %>%
  dplyr::ungroup()
# NOTE: Remember to look at the `auto_tune` examples
# at the end, as this is where the magic lies

# Collapse the folds to 3 groups while balancing the group sizes
# This adds a new `.coll_groups` column
df_coll <- collapse_groups(
  data = df,
  n = 3,
  group_cols = ".folds",
  balance_size = TRUE # enabled by default
)

# Summarize how balanced the new groups are
coll_summary <- summarize_balances(
  data = df_coll,
  group_cols = ".coll_groups",
  cat_cols = "answer",
  num_cols = c("score", "age"),
  id_cols = "participant"
)
coll_summary

# Get ranked balances
# NOTE: With only a single new group column
# we don't get ranks - but this is good to use
# when comparing multiple group columns!
# The scores are standard deviations across groups
ranked_balances(coll_summary)
# Collapse to 3 groups, balancing size + the *categorical* `answer` column
# This adds 2 new columns: `.coll_groups_1` and `.coll_groups_2`
df_coll <- collapse_groups(
  data = df,
  n = 3,
  group_cols = ".folds",
  cat_cols = "answer",
  balance_size = TRUE,
  num_new_group_cols = 2
)

# Summarize the balances
# To keep the output simple, we only
# check the balance of the `answer` column
coll_summary <- summarize_balances(
  data = df_coll,
  group_cols = c(".coll_groups_1", ".coll_groups_2"),
  cat_cols = "answer"
)
coll_summary

# Get ranked balances
# All scores are standard deviations across groups or (average) ranks
# Rows are ranked from most to least balanced
# (i.e. lowest average SD rank first)
ranked_balances(coll_summary)
# Collapse to 3 groups, balancing size + categorical (`answer`)
# + *numerical* (`score`) columns
# This adds 2 new columns: `.coll_groups_1` and `.coll_groups_2`
df_coll <- collapse_groups(
  data = df,
  n = 3,
  group_cols = ".folds",
  cat_cols = "answer",
  num_cols = "score",
  balance_size = TRUE,
  num_new_group_cols = 2
)

# Summarize the balances of `answer` and `score`
coll_summary <- summarize_balances(
  data = df_coll,
  group_cols = c(".coll_groups_1", ".coll_groups_2"),
  cat_cols = "answer",
  num_cols = "score"
)
coll_summary

# Get ranked balances
# All scores are standard deviations across groups or (average) ranks
ranked_balances(coll_summary)
# Collapse to 3 groups, balancing size and *IDs* (`participant`)
# This adds 2 new columns: `.coll_groups_1` and `.coll_groups_2`
df_coll <- collapse_groups(
  data = df,
  n = 3,
  group_cols = ".folds",
  id_cols = "participant",
  balance_size = TRUE,
  num_new_group_cols = 2
)

# Summarize the balances
# To keep the output simple, we only
# check the balance of the `participant` column
coll_summary <- summarize_balances(
  data = df_coll,
  group_cols = c(".coll_groups_1", ".coll_groups_2"),
  id_cols = "participant"
)
coll_summary

# Get ranked balances
# All scores are standard deviations across groups or (average) ranks
ranked_balances(coll_summary)
###################
#### Auto-tune ####

# As you might have seen, the balancing is not always
# as good as we might want or need
# For a better balance, we can enable `auto_tune`,
# which creates a larger set of collapsings
# and selects the most balanced new group columns
# While it is not required, we recommend
# enabling parallelization

# This example is disabled by default; change `FALSE` to `TRUE` to run it
if (FALSE) {
  # Uncomment for parallelization
  # library(doParallel)
  # doParallel::registerDoParallel(7) # use 7 cores

  # Collapse to 3 groups, balancing size, `answer`, `score` and `participant`
  # `auto_tune` is enabled to get a more balanced set of columns
  # This adds 10 new `.coll_groups_1/2/...` columns
  df_coll <- collapse_groups(
    data = df,
    n = 3,
    group_cols = ".folds",
    cat_cols = "answer",
    num_cols = "score",
    id_cols = "participant",
    balance_size = TRUE,
    num_new_group_cols = 10,
    auto_tune = TRUE,
    parallel = FALSE # Set to TRUE for parallelization!
  )

  # Summarize the balances of the `answer`, `score`
  # and `participant` columns for all 10 new group columns
  (coll_summary <- summarize_balances(
    data = df_coll,
    group_cols = paste0(".coll_groups_", seq_len(10)),
    cat_cols = "answer",
    num_cols = "score",
    id_cols = "participant"
  ))

  # Get ranked balances
  # All scores are standard deviations across groups or (average) ranks
  ranked_balances(coll_summary)

  # Now we can choose the .coll_groups_* column(s)
  # whose balance we prefer
  # and move on with our lives!
}
# Run the code above in your browser using DataLab