## Example usage of word2vec on the review datasets shipped with udpipe.
## The whole example is skipped when the udpipe package (which provides the
## datasets) is not installed.
## Results are shown with explicit print() calls: expressions inside a braced
## block are not auto-printed, only the value of the last one would be.
if (requireNamespace("udpipe", quietly = TRUE)) {
  library(udpipe)
  # word2vec(), write.word2vec() and read.word2vec() come from this package
  library(word2vec)

  ## Take data and standardise it a bit
  data(brussels_reviews, package = "udpipe")
  x <- subset(brussels_reviews, language == "nl")
  x <- tolower(x$feedback)

  ## Build the model, get word embeddings and nearest neighbours
  model <- word2vec(x = x, dim = 15, iter = 20)
  emb <- as.matrix(model)
  print(head(emb))
  # "unknownword" is deliberately included to show the lookup of a term
  # that is not expected to be part of the vocabulary
  emb <- predict(model, c("bus", "toilet", "unknownword"), type = "embedding")
  print(emb)
  nn <- predict(model, c("bus", "toilet"), type = "nearest", top_n = 5)
  print(nn)

  ## Get vocabulary
  vocab <- summary(model, type = "vocabulary")

  ## Do some calculations with the vectors and find similar terms to these
  emb <- as.matrix(model)
  vector <- emb["buurt", ] - emb["rustige", ] + emb["restaurants", ]
  print(predict(model, vector, type = "nearest", top_n = 10))
  vector <- emb["gastvrouw", ] - emb["gastvrij", ]
  print(predict(model, vector, type = "nearest", top_n = 5))
  vectors <- emb[c("gastheer", "gastvrouw"), ]
  vectors <- rbind(vectors, avg = colMeans(vectors))
  print(predict(model, vectors, type = "nearest", top_n = 10))

  ## Save the model to hard disk and read it back in.
  ## A temporary file is used so the example leaves nothing behind;
  ## use a path such as "mymodel.bin" to keep the model.
  path <- tempfile(pattern = "w2v", fileext = ".bin")
  write.word2vec(model, file = path)
  model <- read.word2vec(path)
  file.remove(path)

  ##
  ## Example of word2vec with a list of tokens
  ##
  toks <- strsplit(x, split = "[[:space:][:punct:]]+")
  model <- word2vec(x = toks, dim = 15, iter = 20)
  emb <- predict(model, c("bus", "toilet", "unknownword"), type = "embedding")
  print(emb)
  nn <- predict(model, c("bus", "toilet"), type = "nearest", top_n = 5)
  print(nn)

  ##
  ## Example getting word embeddings
  ## which are different depending on the parts of speech tag
  ## Look to the help of the udpipe R package
  ## to get parts of speech tags on text
  ##
  data(brussels_reviews_anno, package = "udpipe")
  x <- subset(brussels_reviews_anno, language == "fr")
  # Keep only rows whose xpos tag contains at least one upper-case letter
  x <- subset(x, grepl(xpos, pattern = paste(LETTERS, collapse = "|")))
  # Tokens take the form lemma/xpos
  x$text <- sprintf("%s/%s", x$lemma, x$xpos)
  x <- subset(x, !is.na(lemma))
  # One character vector of tokens per document/sentence combination
  x <- split(x$text, list(x$doc_id, x$sentence_id))
  model <- word2vec(x = x, dim = 15, iter = 20)
  emb <- as.matrix(model)
  nn <- predict(model, c("cuisine/NN", "rencontrer/VB"), type = "nearest")
  print(nn)
  nn <- predict(model, c("accueillir/VBN", "accueillir/VBG"), type = "nearest")
  print(nn)
} # End of main if statement running only if the required packages are installed
# Run the code above in your browser using DataLab