# why we phased out dense matrix dfm objects
(size1 <- object.size(dfm(inaugTexts, verbose = FALSE)))
(size2 <- object.size(as.matrix(dfm(inaugTexts, verbose = FALSE))))
cat("Compacted by ", round(as.numeric((1-size1/size2)*100), 1), "%.\n", sep="")
# for a corpus
mydfm <- dfm(subset(inaugCorpus, Year>1980))
mydfm <- dfm(subset(inaugCorpus, Year>1980), toLower=FALSE)
# grouping documents by docvars in a corpus
mydfmGrouped <- dfm(subset(inaugCorpus, Year>1980), groups = "President")
# with English stopwords and stemming
dfmsInaug2 <- dfm(subset(inaugCorpus, Year>1980),
ignoredFeatures=stopwords("english"), stem=TRUE)
# works for both words in ngrams too
dfm("Banking industry", stem = TRUE, ngrams = 2, verbose = FALSE)
# with dictionaries
mycorpus <- subset(inaugCorpus, Year>1900)
mydict <- list(christmas=c("Christmas", "Santa", "holiday"),
opposition=c("Opposition", "reject", "notincorpus"),
taxing="taxing",
taxation="taxation",
taxregex="tax*",
country="united states")
dictDfm <- dfm(mycorpus, dictionary=mydict)
dictDfm
# with the thesaurus feature
mytexts <- c("The new law included a capital gains tax, and an inheritance tax.",
"New York City has raised a taxes: an income tax and a sales tax.")
mydict <- dictionary(list(tax=c("tax", "income tax", "capital gains tax", "inheritance tax")))
dfm(phrasetotoken(mytexts, mydict), thesaurus = lapply(mydict, function(x) gsub("\\s", "_", x)))
# pick up "taxes" with "tax" as a regex
dfm(phrasetotoken(mytexts, mydict), thesaurus = list(anytax = "tax"), valuetype = "regex")
# removing stopwords
testText <- "The quick brown fox named Seamus jumps over the lazy dog also named Seamus, with
the newspaper from a boy named Seamus, in his mouth."
testCorpus <- corpus(testText)
# note: "also" is not in the default stopwords("english")
features(dfm(testCorpus, ignoredFeatures = stopwords("english")))
# for ngrams
features(dfm(testCorpus, ngrams = 2, ignoredFeatures = stopwords("english")))
features(dfm(testCorpus, ngrams = 1:2, ignoredFeatures = stopwords("english")))
## removing stopwords before constructing ngrams
tokensAll <- tokenize(toLower(testText), removePunct = TRUE)
tokensNoStopwords <- removeFeatures(tokensAll, stopwords("english"))
tokensNgramsNoStopwords <- ngrams(tokensNoStopwords, 2)
features(dfm(tokensNgramsNoStopwords, verbose = FALSE))
# keep only certain words
dfm(testCorpus, keptFeatures = "*s", verbose = FALSE) # keep only words ending in "s"
dfm(testCorpus, keptFeatures = "s$", valuetype = "regex", verbose = FALSE)
# testing Twitter functions
testTweets <- c("My homie @justinbieber #justinbieber shopping in #LA yesterday #beliebers",
"2all the ha8ers including my bro #justinbieber #emabiggestfansjustinbieber",
"Justin Bieber #justinbieber #belieber #fetusjustin #EMABiggestFansJustinBieber")
dfm(testTweets, keptFeatures = "#*", removeTwitter = FALSE) # keep only hashtags
dfm(testTweets, keptFeatures = "^#.*$", valuetype = "regex", removeTwitter = FALSE)
Run the code above in your browser using DataLab