data("brussels_reviews_anno", package = "udpipe")
x <- subset(brussels_reviews_anno, language %in% "nl" & (upos %in% "ADJ" | lemma %in% "niet"))
dtm <- document_term_frequencies(x, document = "doc_id", term = "lemma")
dtm <- document_term_matrix(dtm)
dtm <- dtm_remove_lowfreq(dtm, minfreq = 3)
## Function performing Singular Value Decomposition on sparse/dense data
dtm_svd <- function(dtm, dim = 5, type = c("RSpectra", "svd"), ...){
type <- match.arg(type)
if(type == "svd"){
SVD <- svd(dtm, nu = 0, nv = dim, ...)
}else if(type == "RSpectra"){
#Uncomment this if you want to use the faster sparse SVD by RSpectra
#SVD <- RSpectra::svds(dtm, nu = 0, k = dim, ...)
}
rownames(SVD$v) <- colnames(dtm)
SVD$v
}
#embedding <- dtm_svd(dtm, dim = 5)
embedding <- dtm_svd(dtm, dim = 5, type = "svd")
## Define positive / negative terms and calculate the similarity to these
weights <- setNames(c(1, 1, 1, 1, -1, -1, -1, -1),
c("fantastisch", "schoon", "vriendelijk", "net",
"lawaaiig", "lastig", "niet", "slecht"))
scores <- dtm_svd_similarity(dtm, embedding = embedding, weights = weights)
scores
str(scores$similarity)
hist(scores$similarity$similarity)
plot(scores$terminology$similarity_weight, log(scores$terminology$freq),
type = "n")
text(scores$terminology$similarity_weight, log(scores$terminology$freq),
labels = scores$terminology$term)
if (FALSE) {
## More elaborate example using word2vec
## building word2vec model on all Dutch texts,
## finding similarity of dtm to adjectives only
set.seed(123)
library(word2vec)
text <- subset(brussels_reviews_anno, language == "nl")
text <- paste.data.frame(text, term = "lemma", group = "doc_id")
text <- text$lemma
model <- word2vec(text, dim = 10, iter = 20, type = "cbow", min_count = 1)
predict(model, newdata = names(weights), type = "nearest", top_n = 3)
embedding <- as.matrix(model)
}
data(brussels_reviews_w2v_embeddings_lemma_nl)
embedding <- brussels_reviews_w2v_embeddings_lemma_nl
adjective <- subset(brussels_reviews_anno, language %in% "nl" & upos %in% "ADJ")
adjective <- txt_freq(adjective$lemma)
adjective <- subset(adjective, freq >= 5 & nchar(key) > 1)
adjective <- adjective$key
scores <- dtm_svd_similarity(dtm, embedding, weights = weights, type = "dot",
terminology = adjective)
scores
plot(scores$terminology$similarity_weight, log(scores$terminology$freq),
type = "n")
text(scores$terminology$similarity_weight, log(scores$terminology$freq),
labels = scores$terminology$term, cex = 0.8)
Run the code above in your browser using DataLab