# NOT RUN {
## Load the annotated review data set that ships with the udpipe package
data("brussels_reviews_anno", package = "udpipe")
## Keep the "nl" rows that are either the lemma "niet" or tagged as ADJ
x <- subset(
  brussels_reviews_anno,
  language %in% "nl" & (lemma %in% "niet" | upos %in% "ADJ")
)
## Count lemmas per document, turn the counts into a document-term matrix
## and drop the terms occurring fewer than 3 times
dtm <- dtm_remove_lowfreq(
  document_term_matrix(
    document_term_frequencies(x, document = "doc_id", term = "lemma")
  ),
  minfreq = 3
)
## Function performing Singular Value Decomposition on sparse/dense data
##
## Arguments:
##   dtm  - matrix-like object whose columns are the terms; it is handed
##          unchanged to the selected SVD routine.
##   dim  - number of right singular vectors to compute.
##   type - "RSpectra" (default) calls RSpectra::svds, "svd" calls base svd.
##   ...  - further arguments forwarded to the selected SVD routine.
##
## Returns the matrix of right singular vectors (`$v` of the decomposition)
## with its row names set to colnames(dtm).
dtm_svd <- function(dtm, dim = 5, type = c("RSpectra", "svd"), ...) {
  type <- match.arg(type)
  if (type == "svd") {
    SVD <- svd(dtm, nu = 0, nv = dim, ...)
  } else {
    # RSpectra is an optional dependency: fail with an actionable message
    # instead of the obscure "object 'SVD' not found" that an empty branch
    # would trigger further down.
    if (!requireNamespace("RSpectra", quietly = TRUE)) {
      stop("type = \"RSpectra\" requires the RSpectra package; ",
           "install it or use type = \"svd\"", call. = FALSE)
    }
    SVD <- RSpectra::svds(dtm, nu = 0, k = dim, ...)
  }
  rownames(SVD$v) <- colnames(dtm)
  SVD$v
}
## Build the term embedding with the base svd backend; the call relying on
## the default type is kept for reference:
# embedding <- dtm_svd(dtm, dim = 5)
embedding <- dtm_svd(dtm, type = "svd", dim = 5)
## Weight the positive terms +1 and the negative terms -1, then compute the
## similarity to these weighted terms
weights <- c(
  fantastisch = 1, schoon = 1, vriendelijk = 1, net = 1,
  lawaaiig = -1, lastig = -1, niet = -1, slecht = -1
)
scores <- dtm_svd_similarity(dtm, weights = weights, embedding = embedding)
## Print the result object
scores
## Show the structure of the `similarity` element
str(scores$similarity)
## Histogram of the `similarity` column of that element
hist(scores$similarity$similarity)
## Set up an empty plot (type = "n" draws no points) of the terminology's
## similarity_weight against log(freq) ...
plot(scores$terminology$similarity_weight, log(scores$terminology$freq),
type = "n")
## ... and write each term as a text label at its coordinates
text(scores$terminology$similarity_weight, log(scores$terminology$freq),
labels = scores$terminology$term)
# }
# Run the code above in your browser using DataLab