library("quanteda")

# build a case-preserving dfm from two short example texts
dfmat <- tokens(c("My Christmas was ruined by your opposition tax plan.",
                  "Does the United_States or Sweden have more progressive taxation?")) |>
    dfm(tolower = FALSE)

# dictionary with keys that match some features, plus one key with no matches
dict <- dictionary(list(countries = c("United_States", "Sweden", "France"),
                        wordsEndingInY = c("by", "my"),
                        notintext = "blahblah"))
# keep features matching the dictionary values (case-insensitive by default)
dfm_select(dfmat, pattern = dict)
# the same selection, matched case-sensitively
dfm_select(dfmat, pattern = dict, case_insensitive = FALSE)

# keep or remove features matching regular expressions
dfm_select(dfmat, pattern = c("s$", ".y"), selection = "keep", valuetype = "regex")
dfm_select(dfmat, pattern = c("s$", ".y"), selection = "remove", valuetype = "regex")

# keep or remove English stopwords, matched as fixed (literal) patterns
dfm_select(dfmat, pattern = stopwords("english"), selection = "keep", valuetype = "fixed")
dfm_select(dfmat, pattern = stopwords("english"), selection = "remove", valuetype = "fixed")
# select based on character length
dfm_select(dfmat, min_nchar = 5)
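# an upper bound works analogously; a sketch assuming the complementary max_nchar argument
dfm_select(dfmat, max_nchar = 4)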
# a new dfm containing many stopwords
dfmat <- dfm(tokens(c("This is a document with lots of stopwords.",
                      "No if, and, or but about it: lots of stopwords.")))
dfmat
# dfm_remove() is a shortcut for dfm_select(..., selection = "remove")
dfm_remove(dfmat, stopwords("english"))
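# the keep-side counterpart; a sketch assuming dfm_keep(), equivalent to selection = "keep"
dfm_keep(dfmat, stopwords("english"))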
# removing features also works on a feature co-occurrence matrix (fcm)
toks <- tokens(c("this contains lots of stopwords",
                 "no if, and, or but about it: lots"),
               remove_punct = TRUE)
fcmat <- fcm(toks)
fcmat
fcm_remove(fcmat, stopwords("english"))
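# the general form; a sketch assuming fcm_select(), the fcm counterpart to dfm_select()
fcm_select(fcmat, pattern = stopwords("english"), selection = "keep")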