# Build a dfm from two short example texts, passing tolower = FALSE and
# verbose = FALSE.
myDfm <- dfm(
  c(
    "My Christmas was ruined by your opposition tax plan.",
    "Does the United_States or Sweden have more progressive taxation?"
  ),
  tolower = FALSE,
  verbose = FALSE
)

# Dictionary with three keys; the value "blahblah" occurs in neither text.
mydict <- dictionary(
  list(
    countries = c("United_States", "Sweden", "France"),
    wordsEndingInY = c("by", "my"),
    notintext = "blahblah"
  )
)

# Use the dictionary as the selection pattern: first with default arguments,
# then with case_insensitive = FALSE.
dfm_select(myDfm, mydict)
dfm_select(myDfm, mydict, case_insensitive = FALSE)

# The same two patterns, without and with valuetype = "regex".
# NOTE(review): the first call relies on the default valuetype (presumably
# glob), so "s$" and ".y" are not read as regular expressions there --
# confirm this contrast is intended.
dfm_select(myDfm, c("s$", ".y"), selection = "keep")
dfm_select(myDfm, c("s$", ".y"), selection = "keep", valuetype = "regex")
dfm_select(myDfm, c("s$", ".y"), selection = "remove", valuetype = "regex")

# English stopword list as the pattern with valuetype = "fixed": keep the
# matches, then remove them.
dfm_select(
  myDfm,
  stopwords("english"),
  selection = "keep",
  valuetype = "fixed"
)
dfm_select(
  myDfm,
  stopwords("english"),
  selection = "remove",
  valuetype = "fixed"
)
# Selecting on a dfm: one dfm serves as the selection pattern for another.
textVec1 <- c(
  "This is text one.",
  "This, the second text.",
  "Here: the third text."
)
textVec2 <- c(
  "Here are new words.",
  "New words in this text."
)

# Build each dfm, then print it.
dfm1 <- dfm(textVec1, verbose = FALSE)
dfm1
dfm2a <- dfm(textVec2, verbose = FALSE)
dfm2a

# Select from dfm2a using dfm1 as the pattern, print the result, and compare
# the two sets of feature names.
dfm2b <- dfm_select(dfm2a, dfm1)
dfm2b
setequal(featnames(dfm1), featnames(dfm2b))

# More selection on a dfm: the reverse direction, keeping (the default call)
# and then removing.
dfm_select(dfm1, dfm2a)
dfm_select(dfm1, dfm2a, selection = "remove")
# dfm_remove() example: build a dfm from two stopword-heavy sentences, print
# it, then call dfm_remove() with the English stopword list.
tmpdfm <- dfm(
  c(
    "This is a document with lots of stopwords.",
    "No if, and, or but about it: lots of stopwords."
  ),
  verbose = FALSE
)
tmpdfm
dfm_remove(tmpdfm, stopwords("english"))
# fcm_remove() example: tokenise two texts, build an fcm from the tokens,
# print it, then call fcm_remove() with the English stopword list.
# NOTE(review): removePunct is the camelCase argument spelling; newer quanteda
# releases appear to spell it remove_punct -- confirm against the installed
# version before changing it.
toks <- tokens(
  c(
    "this contains lots of stopwords",
    "no if, and, or but about it: lots"
  ),
  removePunct = TRUE
)
tmpfcm <- fcm(toks)
tmpfcm
fcm_remove(tmpfcm, stopwords("english"))
# Run the code above in your browser using DataLab