mytexts <- c("The new law included a capital gains tax, and an inheritance tax.",
"New York City has raised a taxes: an income tax and a sales tax.")
mydict <- dictionary(list(tax=c("tax", "income tax", "capital gains tax", "inheritance tax")))
(cw <- phrasetotoken(mytexts, mydict))
dfm(cw, verbose=FALSE)
# when used as a dictionary for dfm creation
mydfm2 <- dfm(cw, dictionary = lapply(mydict, function(x) gsub(" ", "_", x)))
mydfm2
# to pick up "taxes" in the second text, set valuetype = "regex"
mydfm3 <- dfm(cw, dictionary = lapply(mydict, phrasetotoken, mydict),
valuetype = "regex")
mydfm3
## one more token counted for "tax" than before
# using a dictionary to pre-process multi-word expressions
myDict <- dictionary(list(negative = c("bad* word*", "negative", "awful text"),
postiive = c("good stuff", "like? th??")))
txt <- c("I liked this, when we can use bad words, in awful text.",
"Some damn good stuff, like the text, she likes that too.")
phrasetotoken(txt, myDict)
# on simple text
phrasetotoken("This is a simpler version of multi word expressions.", "multi word expression*")
Run the code above in your browser using DataLab