# Attach packages
library(groupdata2)
library(dplyr)
# Fix the RNG state so the sampled ages and scores are reproducible
set.seed(1)

# Build the example data: 6 participants with 3 rows each.
# `age` is sampled before `score`, so the random draws happen in that order.
df <- data.frame(
  participant = factor(rep(as.character(1:6), times = 3)),
  age = rep(sample(1:100, size = 6), times = 3),
  diagnosis = factor(rep(c("a", "b", "a", "a", "b", "b"), times = 3)),
  score = sample(1:100, size = 18)
)

# Sort by participant, then label each participant's three rows as sessions
df <- arrange(df, participant)
df$session <- rep(c("1", "2", "3"), times = 6)
# Using fold()

## Without balancing

# Reset the seed, then create 3 folds without any balancing arguments
set.seed(1)
df_folded <- fold(df, k = 3)

# Inspect how the columns are distributed across the folds.
# No balancing was requested in `fold()`, so we should not
# expect the folds to be particularly well balanced.
summarize_balances(
  dplyr::ungroup(df_folded),
  group_cols = ".folds",
  cat_cols = "diagnosis",
  num_cols = c("score", "age"),
  id_cols = "participant"
)
## With balancing

# Reset the seed, then create 3 folds while balancing on `diagnosis`
# and `score` and passing `participant` as the ID column
set.seed(1)
df_folded <- fold(
  df,
  k = 3,
  id_col = "participant",
  cat_col = "diagnosis",
  num_col = "score"
)

# The balance should now be better, although it may be difficult
# to balance the "score" column well when also balancing on
# "diagnosis" and keeping all rows per participant in the same fold.
summarize_balances(
  dplyr::ungroup(df_folded),
  group_cols = ".folds",
  cat_cols = "diagnosis",
  num_cols = c("score", "age"),
  id_cols = "participant"
)
# Comparing multiple grouping columns

# Create 3 fold columns, each balanced only on "score"
set.seed(1)
df_folded <- fold(
  df,
  k = 3,
  num_col = "score",
  num_fold_cols = 3
)

# Summarize all three grouping columns in a single call
summ <- summarize_balances(
  dplyr::ungroup(df_folded),
  group_cols = paste0(".folds_", 1:3),
  num_cols = "score"
)

# Show the summary
summ

# Extract the across-group standard deviations.
# The grouping column with the lowest standard deviation(s)
# is the most balanced one.
ranked_balances(summ)
# Run the code above in your browser using DataLab