## Document-feature matrix built from a corpus:
## keep only the inaugural addresses with Year > 1980, tokenize them,
## then tabulate the tokens.
toks <- tokens(
  corpus_subset(data_corpus_inaugural, Year > 1980)
)
dfm(toks)
# removal options: take "b" out of two short documents with padding = TRUE
toks <- tokens_remove(
  tokens(c("a b c", "A B C D")),
  "b",
  padding = TRUE
)
toks
dfm(toks)
# remove "pads" (the features matched by the empty-string pattern)
dfm_remove(dfm(toks), pattern = "")
# preserving case: build the matrix without lowercasing the tokens
dfm(toks, tolower = FALSE)
# Run the code above in your browser using DataLab