# Toy corpus: demonstrate collocation detection at varying collocation sizes
# and minimum-count thresholds (assumes the quanteda package is attached).
sample_texts <- c(
  "quanteda is a package for quantitative text analysis",
  "quantitative text analysis is a rapidly growing field",
  "The population is rapidly growing"
)
sample_toks <- tokens(sample_texts)

# Bigram collocations with the default minimum count
textstat_collocations(sample_toks, method = "lr")
# Keep even collocations that occur only once
textstat_collocations(sample_toks, method = "lr", min_count = 1)
# Allow collocations up to three tokens long
textstat_collocations(sample_toks, method = "lr", max_size = 3, min_count = 1)
# NOTE(review): method = "lr" and max_size = are from an older quanteda API;
# current quanteda.textstats::textstat_collocations() offers only
# method = "lambda" with a size = argument — confirm against the installed
# quanteda version before running.
(collocs <- textstat_collocations(sample_toks, method = "lr", max_size = 3, min_count = 2))
as.tokens(collocs)
# Extracting multi-part proper nouns (sequences of capitalized terms) from
# the US inaugural address corpus shipped with quanteda.
# NOTE(review): corpus_segment(x, what = "sentence") and the features= /
# valuetype= arguments to textstat_collocations() come from an older quanteda
# API — modern quanteda uses corpus_reshape(x, to = "sentences") and a
# separate tokens_select() step. Confirm against the installed version.
inaug_toks <- tokens(corpus_segment(data_corpus_inaugural, what = "sentence"))
# Drop stopwords but keep pads so removal does not create false adjacencies
inaug_toks <- tokens_select(inaug_toks, stopwords("english"), "remove",
                            padding = TRUE)
cap_seqs <- textstat_collocations(
  inaug_toks,
  method = "bj",
  features = "^([A-Z][a-z\\-]{2,})",
  valuetype = "regex",
  case_insensitive = FALSE
)
head(cap_seqs, 10)
# Compounding tokens is more efficient when applied to the same tokens object
inaug_toks_comp <- tokens_compound(inaug_toks, cap_seqs)
# (site boilerplate, not R code): Run the code above in your browser using DataLab