# NOT RUN {
## for a corpus
corp <- corpus_subset(data_corpus_inaugural, Year > 1980)
dfm(corp)
dfm(corp, tolower = FALSE)
# grouping documents by docvars in a corpus
dfm(corp, groups = "President", verbose = TRUE)
# with English stopwords and stemming
dfm(corp, remove = stopwords("english"), stem = TRUE, verbose = TRUE)
# works for both words in ngrams too
tokens("Banking industry") %>%
tokens_ngrams(n = 2) %>%
dfm(stem = TRUE)
# with dictionaries
dict <- dictionary(list(christmas = c("Christmas", "Santa", "holiday"),
opposition = c("Opposition", "reject", "notincorpus"),
taxing = "taxing",
taxation = "taxation",
taxregex = "tax*",
country = "states"))
dfm(corpus_subset(data_corpus_inaugural, Year > 1900), dictionary = dict)
# removing stopwords
txt <- "The quick brown fox named Seamus jumps over the lazy dog also named Seamus, with
the newspaper from a boy named Seamus, in his mouth."
corp <- corpus(txt)
# note: "also" is not in the default stopwords("english")
featnames(dfm(corp, select = stopwords("english")))
# for ngrams
featnames(dfm(corp, ngrams = 2, select = stopwords("english"), remove_punct = TRUE))
featnames(dfm(corp, ngrams = 1:2, select = stopwords("english"), remove_punct = TRUE))
# removing stopwords before constructing ngrams
toks1 <- tokens(char_tolower(txt), remove_punct = TRUE)
toks2 <- tokens_remove(toks1, stopwords("english"))
toks3 <- tokens_ngrams(toks2, 2)
featnames(dfm(toks3))
# keep only certain words
dfm(corp, select = "*s") # keep only words ending in "s"
dfm(corp, select = "s$", valuetype = "regex")
# testing Twitter functions
txttweets <- c("My homie @justinbieber #justinbieber shopping in #LA yesterday #beliebers",
"2all the ha8ers including my bro #justinbieber #emabiggestfansjustinbieber",
"Justin Bieber #justinbieber #belieber #fetusjustin #EMABiggestFansJustinBieber")
dfm(txttweets, select = "#*", split_tags = FALSE) # keep only hashtags
dfm(txttweets, select = "^#.*$", valuetype = "regex", split_tags = FALSE)
# for a dfm
dfm(corpus_subset(data_corpus_inaugural, Year > 1980), groups = "Party")
# }
Run the code above in your browser using DataLab