# Attach packages
library(groupdata2)
library(dplyr)
# Seed the RNG so the example data is reproducible
# (only done when the optional `xpectr` package is installed)
if (requireNamespace("xpectr", quietly = TRUE)) {
  xpectr::set_test_seed(42)
}

# Build the example data:
#   20 participants x 3 sessions = 60 rows, sorted by participant,
#   then down-sampled to 53 rows so participants have unequal row counts
df <- data.frame(
  participant = factor(rep(1:20, 3)),
  age = rep(sample(1:100, 20), 3),
  answer = factor(sample(c("a", "b", "c", "d"), 60, replace = TRUE)),
  score = sample(1:100, 60)
) %>%
  dplyr::arrange(participant) %>%
  dplyr::mutate(session = rep(c("1", "2", "3"), 20)) %>%
  dplyr::sample_n(size = 53)
# Create the initial groups (the ones that will be collapsed later):
# 8 folds, keeping all rows of a participant in the same fold.
# The result is ungrouped right away - otherwise `collapse_groups*()`
# would be applied to each fold separately!
df <- df %>%
  fold(k = 8, method = "n_dist", id_col = "participant") %>%
  dplyr::ungroup()
# Parallelization is recommended when `auto_tune` is enabled
# on larger datasets. To register a parallel backend:
# library(doParallel)
# doParallel::registerDoParallel(7) # use 7 cores

# The examples below are not executed (guarded by `if (FALSE)`)
if (FALSE) {

  # ---- Size balancing ----

  # Collapse the folds into 3 groups of similar size
  # Adds a new `.coll_groups` column
  df_coll <- df %>%
    collapse_groups_by_size(n = 3, group_cols = ".folds")

  # Summarize the balances of the collapsed groups
  (coll_summary <- df_coll %>%
    summarize_balances(group_cols = ".coll_groups"))

  # Extract the ranked balances
  # Mostly useful when multiple new group columns
  # have been created with `collapse_groups()`
  # The scores are standard deviations across groups
  ranked_balances(coll_summary)

  # ---- Categorical balancing ----

  # Collapse to 3 groups, balancing the levels of `answer`
  df_coll <- df %>%
    collapse_groups_by_levels(
      n = 3,
      group_cols = ".folds",
      cat_cols = "answer"
    )

  # Summarize the balances
  (coll_summary <- df_coll %>%
    summarize_balances(
      group_cols = ".coll_groups",
      cat_cols = "answer"
    ))

  # ---- Numerical balancing ----

  # Collapse to 3 groups, balancing `score`
  # Size is balanced as well, to get similar sums
  # in addition to similar means
  df_coll <- df %>%
    collapse_groups_by_numeric(
      n = 3,
      group_cols = ".folds",
      num_cols = "score",
      balance_size = TRUE
    )

  # Summarize the balances
  (coll_summary <- df_coll %>%
    summarize_balances(
      group_cols = ".coll_groups",
      num_cols = "score"
    ))

  # ---- ID balancing ----

  # Collapse to 3 groups, balancing the participants
  # This should give a similar number of IDs per group
  df_coll <- df %>%
    collapse_groups_by_ids(
      n = 3,
      group_cols = ".folds",
      id_cols = "participant"
    )

  # Summarize the balances
  (coll_summary <- df_coll %>%
    summarize_balances(
      group_cols = ".coll_groups",
      id_cols = "participant"
    ))

  # ---- Balancing ALL attributes ----

  # Collapse to 3 groups while balancing every attribute
  # Five new grouping factors are created and compared
  # Creating several candidates is in general a good strategy,
  # even when only a single collapsed grouping factor is needed,
  # since the summary lets you pick the balances you prefer
  # NOTE: This is slow (up to a few minutes);
  # consider enabling parallelization
  df_coll <- df %>%
    collapse_groups(
      n = 3,
      num_new_group_cols = 5,
      group_cols = ".folds",
      cat_cols = "answer",
      num_cols = "score",
      id_cols = "participant",
      auto_tune = TRUE # Disabled by default in `collapse_groups()`
      # parallel = TRUE # Add comma above and uncomment
    )

  # Summarize the balances of all five new group columns
  (coll_summary <- df_coll %>%
    summarize_balances(
      group_cols = paste0(".coll_groups_", 1:5),
      cat_cols = "answer",
      num_cols = "score",
      id_cols = "participant"
    ))

  # Compare the new grouping columns
  # The column with the lowest across-group
  # standard deviation is the most balanced
  ranked_balances(coll_summary)
}
# Run the code above in your browser using DataLab