# NOT RUN {
## Building a dfm directly from a corpus
# inaugural addresses from years after 1980
corp_recent <- corpus_subset(data_corpus_inaugural, Year > 1980)
dfm(corp_recent)
dfm(corp_recent, tolower = FALSE)

# group documents by the "President" docvar of the corpus
dfm(corp_recent, groups = "President", verbose = TRUE)

# with English stopwords removed and stemming switched on
dfm(
  corp_recent,
  remove = stopwords("english"),
  stem = TRUE,
  verbose = TRUE
)

# stemming is applied to each word of an n-gram as well
dfm("Banking industry", stem = TRUE, ngrams = 2)
# applying a dictionary while building the dfm
corp_1900s <- corpus_subset(data_corpus_inaugural, Year > 1900)
dict <- dictionary(list(
  christmas = c("Christmas", "Santa", "holiday"),
  opposition = c("Opposition", "reject", "notincorpus"),
  taxing = "taxing",
  taxation = "taxation",
  taxregex = "tax*",
  country = "states"
))
dfm(corp_1900s, dictionary = dict)
# selecting on stopwords: the calls below pass the stopword list to `select`
txt <- "The quick brown fox named Seamus jumps over the lazy dog also named Seamus, with
the newspaper from a boy named Seamus, in his mouth."
corp <- corpus(txt)
sw_en <- stopwords("english")

# note: "also" is not in the default stopwords("english")
featnames(dfm(corp, select = sw_en))

# the same selection with bigrams, then with unigrams and bigrams together
featnames(dfm(corp, ngrams = 2, select = sw_en, remove_punct = TRUE))
featnames(dfm(corp, ngrams = 1:2, select = sw_en, remove_punct = TRUE))

# removing stopwords from the tokens before constructing ngrams
toks1 <- tokens(char_tolower(txt), remove_punct = TRUE)
toks2 <- tokens_remove(toks1, sw_en)
toks3 <- tokens_ngrams(toks2, n = 2)
featnames(dfm(toks3))

# keep only certain words: those ending in "s"
dfm(corp, select = "*s")                       # pattern with a "*" wildcard
dfm(corp, select = "s$", valuetype = "regex")  # regular-expression pattern
# hashtag handling in Twitter-style text
txttweets <- c(
  "My homie @justinbieber #justinbieber shopping in #LA yesterday #beliebers",
  "2all the ha8ers including my bro #justinbieber #emabiggestfansjustinbieber",
  "Justin Bieber #justinbieber #belieber #fetusjustin #EMABiggestFansJustinBieber"
)

# keep only hashtags: first with a "*" wildcard pattern, then with a regex
dfm(txttweets, select = "#*", remove_twitter = FALSE)
dfm(txttweets, select = "^#.*$", valuetype = "regex", remove_twitter = FALSE)
# regrouping the documents of an existing dfm
dfmat3 <- dfm(data_corpus_irishbudget2010)

# label each document "Govt" when its party is FF or Green, else "Opposition"
party <- docvars(data_corpus_irishbudget2010, "party")
bloc <- ifelse(party %in% c("FF", "Green"), "Govt", "Opposition")

dfmat4 <- dfm(dfmat3, groups = bloc, tolower = FALSE, verbose = TRUE)
# }
# Run the code above in your browser using DataLab