# NOT RUN {
# Tokenize a sentence using the default text filter:
text_tokens("The quick ('brown') fox can't jump 32.3 feet, right?")

# Count the non-dropped tokens:
text_ntoken("The quick ('brown') fox can't jump 32.3 feet, right?")

# Skip normalization (case folding, compatibility mapping, quote
# mapping, and removal of ignorable characters):
filt <- text_filter(
  map_case = FALSE,
  map_compat = FALSE,
  map_quote = FALSE,
  remove_ignorable = FALSE
)
text_tokens("The quick ('brown') fox can't jump 32.3 feet, right?", filt)

# Drop common function words ('stop' words):
text_tokens(
  "Able was I ere I saw Elba.",
  text_filter(drop = stopwords("english"))
)

# Drop numbers, with some exceptions:
text_tokens(
  "0, 1, 2, 3, 4, 5",
  text_filter(drop_number = TRUE, drop_except = c("0", "2", "4"))
)

# Apply stemming...
text_tokens("Mary is running", text_filter(stemmer = "english"))

# ...except for certain words:
text_tokens(
  "Mary is running",
  text_filter(stemmer = "english", stem_except = "mary")
)

# Abbreviations are combined by default:
text_tokens("Ms. Jones")

# Disable the default combinations:
text_tokens("Ms. Jones", text_filter(combine = NULL))

# Add new combinations:
text_tokens(
  "Ms. Jones is from New York City, New York.",
  text_filter(combine = c(abbreviations("english"),
                          "new york", "new york city"))
)
# }
# Run the code above in your browser using DataLab