## Not run: ------------------------------------
# mytexts <- c("The new law included a capital gains tax, and an inheritance tax.",
# "New York City has raised taxes: an income tax and a sales tax.")
# mydict <- dictionary(list(tax=c("tax", "income tax", "capital gains tax", "inheritance tax")))
# (cw <- phrasetotoken(mytexts, mydict))
# dfm(cw, verbose=FALSE)
#
# # when used as a dictionary for dfm creation
# mydfm2 <- dfm(cw, dictionary = dictionary(lapply(mydict, function(x) gsub(" ", "_", x))))
# mydfm2
#
# # to pick up "taxes" in the second text, set valuetype = "regex"
# mydfm3 <- dfm(cw, dictionary = dictionary(lapply(mydict, phrasetotoken, mydict)),
# valuetype = "regex")
# mydfm3
# ## one more token counted for "tax" than before
## ---------------------------------------------
# Using a dictionary to pre-process multi-word expressions.
# The dictionary values below are phrase patterns, several of them written
# with wildcard characters (`*`, `?`); the dictionary is passed to
# phrasetotoken() together with the raw character vector `txt`.
myDict <- dictionary(list(
  negative = c("bad* word*", "negative", "awful text"),
  positive = c("good stuff", "like? th??")
))
txt <- c(
  "I liked this, when we can use bad words, in awful text.",
  "Some damn good stuff, like the text, she likes that too."
)
phrasetotoken(txt, myDict)

# On simple text: a single character string and a single phrase pattern.
phrasetotoken(
  "This is a simpler version of multi word expressions.",
  "multi word expression*"
)

# On tokenized text: the input is first passed through tokenize(), and the
# phrases are supplied as a plain character vector rather than a dictionary.
toks <- tokenize(
  "Simon sez the multi word expression plural is multi word expressions, Simon sez."
)
phrases <- c("multi word expression*", "Simon sez")
phrasetotoken(toks, phrases)
# Run the code above in your browser using DataLab