# Build a small dfm from two example sentences, keeping the original case.
exampleTexts <- c(
  "My Christmas was ruined by your opposition tax plan.",
  "Does the United_States or Sweden have more progressive taxation?"
)
myDfm <- dfm(exampleTexts, toLower = FALSE, verbose = FALSE)

# Dictionary with three keys; the "notintext" entry does not occur in the
# example sentences.
mydict <- dictionary(list(
  countries = c("United_States", "Sweden", "France"),
  wordsEndingInY = c("by", "my"),
  notintext = "blahblah"
))

# Select using the dictionary, first without and then with
# case_insensitive = FALSE.
selectFeatures(myDfm, mydict)
selectFeatures(myDfm, mydict, case_insensitive = FALSE)

# The same two patterns, first with valuetype left unspecified and then
# explicitly as regular expressions (keep, then remove).
endingPatterns <- c("s$", ".y")
selectFeatures(myDfm, endingPatterns, selection = "keep")
selectFeatures(myDfm, endingPatterns, selection = "keep",
               valuetype = "regex")
selectFeatures(myDfm, endingPatterns, selection = "remove",
               valuetype = "regex")

# Keep, then remove, the English stopword list using fixed matching.
englishStops <- stopwords("english")
selectFeatures(myDfm, englishStops, selection = "keep",
               valuetype = "fixed")
selectFeatures(myDfm, englishStops, selection = "remove",
               valuetype = "fixed")
# selecting on a dfm
textVec1 <- c(
  "This is text one.",
  "This, the second text.",
  "Here: the third text."
)
textVec2 <- c(
  "Here are new words.",
  "New words in this text."
)

# Build one dfm per text vector and print each.
dfm1 <- dfm(textVec1, verbose = FALSE)
dfm1
dfm2a <- dfm(textVec2, verbose = FALSE)
dfm2a

# Pass dfm1 itself as the features argument when selecting from dfm2a.
dfm2b <- selectFeatures(dfm2a, dfm1)
dfm2b

# Compare the feature sets of dfm1 and the selection result.
setequal(features(dfm1), features(dfm2b))

# more selection on a dfm
selectFeatures(dfm1, dfm2a)
selectFeatures(dfm1, dfm2a, selection = "remove")
## Not run: ## performance comparisons
# data(SOTUCorpus, package = "quantedaData")
# toks <- tokenize(SOTUCorpus, removePunct = TRUE)
# # toks <- tokenize(tokenize(SOTUCorpus, what='sentence', simplify = TRUE), removePunct = TRUE)
# # head to head, old v. new
# system.time(selectFeaturesOLD(toks, stopwords("english"), "remove", verbose = FALSE))
# system.time(selectFeatures(toks, stopwords("english"), "remove", verbose = FALSE))
# system.time(selectFeaturesOLD(toks, c("and", "of"), "remove", verbose = FALSE, valuetype = "regex"))
# system.time(selectFeatures(toks, c("and", "of"), "remove", verbose = FALSE, valuetype = "regex"))
# microbenchmark::microbenchmark(
# old = selectFeaturesOLD(toks, stopwords("english"), "remove", verbose = FALSE),
# new = selectFeatures(toks, stopwords("english"), "remove", verbose = FALSE),
# times = 5, unit = "relative")
# microbenchmark::microbenchmark(
# old = selectFeaturesOLD(toks, c("and", "of"), "remove", verbose = FALSE, valuetype = "regex"),
# new = selectFeatures(toks, c("and", "of"), "remove", verbose = FALSE, valuetype = "regex"),
# times = 2, unit = "relative")
#
# types <- unique(unlist(toks))
# numbers <- types[stringi::stri_detect_regex(types, '[0-9]')]
# microbenchmark::microbenchmark(
# old = selectFeaturesOLD(toks, numbers, "remove", verbose = FALSE, valuetype = "fixed"),
# new = selectFeatures(toks, numbers, "remove", verbose = FALSE, valuetype = "fixed"),
# times = 2, unit = "relative")
#
# # removing tokens before dfm, versus after
# microbenchmark::microbenchmark(
# pre = dfm(selectFeaturesOLD(toks, stopwords("english"), "remove"), verbose = FALSE),
# post = dfm(toks, ignoredFeatures = stopwords("english"), verbose = FALSE),
# times = 5, unit = "relative")
# ## End(Not run)
## with simple examples
toks <- tokenize(
  c("This is a sentence.", "This is a second sentence."),
  removePunct = TRUE
)

# Lower-case features to remove; the tokens contain a capitalised "This".
dropWords <- c("is", "a", "this")

selectFeatures(
  toks, dropWords,
  selection = "remove", valuetype = "fixed",
  padding = TRUE, case_insensitive = TRUE
)

# how case_insensitive works
# fixed matching: case-sensitive, then case-insensitive
selectFeatures(
  toks, dropWords,
  selection = "remove", valuetype = "fixed",
  padding = TRUE, case_insensitive = FALSE
)
selectFeatures(
  toks, dropWords,
  selection = "remove", valuetype = "fixed",
  padding = TRUE, case_insensitive = TRUE
)
# glob matching: case-insensitive, then case-sensitive
selectFeatures(
  toks, dropWords,
  selection = "remove", valuetype = "glob",
  padding = TRUE, case_insensitive = TRUE
)
selectFeatures(
  toks, dropWords,
  selection = "remove", valuetype = "glob",
  padding = TRUE, case_insensitive = FALSE
)
# with longer texts
txts <- c(exampleString, inaugTexts[2])
toks <- tokenize(txts)

# The English stopword list is reused for every call below.
englishStops <- stopwords("english")

# Remove / keep stopwords without padding.
selectFeatures(toks, englishStops, selection = "remove")
selectFeatures(toks, englishStops, selection = "keep")

# Same selections with padding = TRUE.
selectFeatures(toks, englishStops, selection = "remove", padding = TRUE)
selectFeatures(toks, englishStops, selection = "keep", padding = TRUE)

# Stopword removal with padding on the first of the encoded example texts.
selectFeatures(
  tokenize(encodedTexts[1]), englishStops,
  selection = "remove", padding = TRUE
)
## example for collocations
# Compute collocations (n = 20) from the first three inaugural texts and
# print them.
(myCollocs <- collocations(inaugTexts[1:3], n = 20))

# Apply "remove" selection with the English stopword list.
englishStops <- stopwords("english")
selectFeatures(myCollocs, englishStops, "remove")
# Run the code above in your browser using DataLab