# Build two example data.frames used by the demonstrations that follow.
# The seed is fixed first so the randomly generated columns are reproducible.
set.seed(1)

# dat1: 100 rows with an ID, three categorical columns (A, D, E) and two
# numeric columns (B, C). The columns are generated in the order listed,
# which determines how the random-number stream is consumed.
dat1 <- data.frame(
  ID = seq_len(100),
  A = sample(c("AA", "BB", "CC", "DD", "EE"), 100, replace = TRUE),
  B = rnorm(100),
  C = abs(round(rnorm(100), digits = 1)),
  D = sample(c("CA", "NY", "TX"), 100, replace = TRUE),
  E = sample(c("M", "F"), 100, replace = TRUE)
)

# dat2: 20 rows with deliberately unequal group sizes in A
# (5 "AA", 10 "BB", 3 "CC", 2 "DD").
dat2 <- data.frame(
  ID = seq_len(20),
  A = rep(c("AA", "BB", "CC", "DD"), times = c(5, 10, 3, 2))
)

# Quick overview of both data.frames
summary(dat1)
summary(dat2)
# Draw a 10% sample from every -A- group in dat1 (seed = 1)
stratified(dat1, group = "A", size = 0.1, seed = 1)

# Draw a 10% sample from -A- in dat1, keeping only the "AA" and "BB"
# groups (seed = 1)
stratified(
  dat1,
  group = "A",
  size = 0.1,
  select = list(A = c("AA", "BB")),
  seed = 1
)

# Draw 5 rows from every -D- group in dat1 (seed = 1); the grouping
# column is given by position, and -D- is column 5 of dat1
stratified(dat1, group = 5, size = 5, seed = 1)

# Draw from every -A- group in dat1 (seed = 1), supplying the number
# of rows wanted for each group as a vector
stratified(dat1, group = "A", size = c(3, 5, 4, 5, 2), seed = 1)

# Stratify on two columns: -E- and -D-
# -E- varies more slowly, so it is better to put that first
stratified(dat1, group = c("E", "D"), size = 0.15, seed = 1)

# Stratify on two columns (-E- and -D-), but keep only the
# cases where -E- == "M"
stratified(
  dat1,
  group = c("E", "D"),
  size = 0.15,
  select = list(E = "M"),
  seed = 1
)

# As above, but where -E- == "M" and -D- is "CA" or "TX"
stratified(
  dat1,
  group = c("E", "D"),
  size = 0.15,
  select = list(E = "M", D = c("CA", "TX")),
  seed = 1
)

# Stratify on three columns: -E-, -D-, and -A-
s.out <- stratified(dat1, group = c("E", "D", "A"), size = 2, seed = 1)
list(head(s.out), tail(s.out))

# How many rows were sampled from each stratum?
table(interaction(s.out[c("E", "D", "A")]))

# Check the message about group sizes: list the strata with fewer
# than 2 rows, first in the full data and then in the sample
names(
  which(table(interaction(dat1[c("E", "D", "A")])) < 2)
)
names(
  which(table(interaction(s.out[c("E", "D", "A")])) < 2)
)
# Run the code above in your browser using DataLab