# Build a document-feature matrix (dfm) from the inaugural addresses given
# from Reagan onwards (Year > 1980), ignoring English stopwords and stemming
# the remaining features.
presDfm <- dfm(
  subset(inaugCorpus, Year > 1980),
  ignoredFeatures = stopwords("english"),
  stem = TRUE
)

# Compute some document similarities; the outer parentheses print the value
# as well as assigning it to `tmp`.
(tmp <- similarity(presDfm, margin = "documents"))

# Output as a matrix
as.matrix(tmp)

# Specific comparisons: the second argument selects the document(s) to compare
# against the rest of the dfm.
similarity(presDfm, "1985-Reagan", n = 5, margin = "documents")
similarity(presDfm, c("2009-Obama", "2013-Obama"), n = 5, margin = "documents")
similarity(presDfm, c("2009-Obama", "2013-Obama"), margin = "documents")
similarity(presDfm, c("2009-Obama", "2013-Obama"), margin = "documents",
           method = "cosine")
similarity(presDfm, "2005-Bush", margin = "documents", method = "eJaccard",
           sorted = FALSE)

# Compute some term similarities. The selection uses stemmed forms because the
# dfm was built with stem = TRUE. `n` is passed by name, like the calls above,
# rather than relying on positional matching after named arguments.
similarity(presDfm, c("fair", "health", "terror"), n = 20,
           margin = "features", method = "cosine")
## Not run:
# # compare to tm
# # NOTE: this whole section is deliberately left commented out. It needs the
# # tm package, and uses Matrix and proxy through `::` further down.
# require(tm)
# data("crude")
# # pre-process the crude corpus: lower-case, strip punctuation and numbers, stem
# crude <- tm_map(crude, content_transformer(tolower))
# crude <- tm_map(crude, removePunctuation)
# crude <- tm_map(crude, removeNumbers)
# crude <- tm_map(crude, stemDocument)
# tdm <- TermDocumentMatrix(crude)
# # term associations in tm; "xyz" is presumably a term absent from the corpus
# findAssocs(tdm, c("oil", "opec", "xyz"), c(0.75, 0.82, 0.1))
# # in quanteda
# # transpose the term-document matrix and wrap it as a dfmSparse object
# quantedaDfm <- new("dfmSparse", Matrix::Matrix(t(as.matrix(tdm))))
# similarity(quantedaDfm, c("oil", "opec", "xyz"), margin = "features", n = 14)
# # cross-check with proxy::simil computed over columns (by_rows = FALSE)
# corMat <- as.matrix(proxy::simil(as.matrix(quantedaDfm), by_rows = FALSE))
# # largest 14 / 9 values in the "oil" / "opec" columns, rounded to 2 decimals
# round(head(sort(corMat[, "oil"], decreasing = TRUE), 14), 2)
# round(head(sort(corMat[, "opec"], decreasing = TRUE), 9), 2)
# ## End(Not run)
# Run the code above in your browser using DataLab