data(brussels_reviews_anno)
x <- subset(brussels_reviews_anno, xpos == "NN")
x <- x[, c("doc_id", "lemma")]
x <- document_term_frequencies(x)
dtm <- document_term_matrix(x)
dtm <- dtm_remove_lowfreq(dtm, minfreq = 10)
dim(dtm)
## Keep only terms with high tfidf
x <- dtm_remove_tfidf(dtm, top=50)
dim(x)
x <- dtm_remove_tfidf(dtm, top=50, remove_emptydocs = FALSE)
dim(x)
## Keep only terms with tfidf above 1.1
x <- dtm_remove_tfidf(dtm, cutoff=1.1)
dim(x)
## Keep only terms with tfidf above the 60 percent quantile
x <- dtm_remove_tfidf(dtm, prob=0.6)
dim(x)
Run the code above in your browser using DataLab