dtm <- dfm(data_corpus_inaugural)
x <- apply(dtm, 1, function(tf) tf/max(tf))
topfeatures(dtm)
normDtm <- dfm_weight(dtm, "relFreq")
topfeatures(normDtm)
maxTfDtm <- dfm_weight(dtm, type = "relMaxFreq")
topfeatures(maxTfDtm)
logTfDtm <- dfm_weight(dtm, type = "logFreq")
topfeatures(logTfDtm)
tfidfDtm <- dfm_weight(dtm, type = "tfidf")
topfeatures(tfidfDtm)
# combine these methods for more complex dfm_weightings, e.g. as in Section 6.4
# of Introduction to Information Retrieval
head(logTfDtm <- dfm_weight(dtm, type = "logFreq"))
head(tfidf(logTfDtm, normalize = FALSE))
#' # apply numeric weights
str <- c("apple is better than banana", "banana banana apple much better")
(mydfm <- dfm(str, remove = stopwords("english")))
dfm_weight(mydfm, weights = c(apple = 5, banana = 3, much = 0.5))
# smooth the dfm
dfm_smooth(mydfm, 0.5)
Run the code above in your browser using DataLab