set.seed(123)
# sampling from a corpus
summary(corpus_sample(data_corpus_inaugural, size = 5))
summary(corpus_sample(data_corpus_inaugural, size = 10, replace = TRUE))
# sampling with by
corp <- data_corpus_inaugural
corp$century <- paste(floor(corp$Year / 100) + 1)
corp$century <- paste0(corp$century, ifelse(corp$century < 21, "th", "st"))
corpus_sample(corp, size = 2, by = century) |>
summary()
# needs drop = TRUE to avoid empty interactions
corpus_sample(corp, size = 1, by = interaction(Party, century, drop = TRUE), replace = TRUE) |>
summary()
# sampling sentences by document
corp <- corpus(c(one = "Sentence one. Sentence two. Third sentence.",
two = "First sentence, doc2. Second sentence, doc2."),
docvars = data.frame(var1 = c("a", "a"), var2 = c(1, 2)))
corpus_reshape(corp, to = "sentences") %>%
corpus_sample(replace = TRUE, by = docid(.))
# oversampling
corpus_sample(corp, size = 5, replace = TRUE)
Run the code above in your browser using DataLab