# Load the `bibles` dataset used by all examples below.
data(bibles)

# ----- small example of co-occurrences -----
# Use partially overlapping slices of two bibles as a quick demonstration.
# sim.words uses the element names to get the parallelism right, so slices
# that do not start at the same position still work.
eng <- bibles$eng[seq_len(5000)]
deu <- bibles$deu[seq.int(2000L, 7000L)]
sim <- sim.words(
  eng,
  deu,
  method = res
)

# With this little data the statistics are far from perfect.
# Ten highest-ranked German co-occurrences for the English word "your":
sort(
  sim["your", ],
  decreasing = TRUE
)[seq_len(10)]
# \donttest{
# ----- complete example of co-occurrences -----
# Processing the complete bibles takes somewhat longer, but is still
# manageable; system.time() reports how long the call takes.
system.time({
  sim <- sim.words(bibles$eng, bibles$deu, method = res)
})

# The results are much better with the full data.
# Ten highest-ranked German co-occurrences for the English word "your":
sort(
  sim["your", ],
  decreasing = TRUE
)[seq_len(10)]

# ----- look for 'best' translations -----
# Selecting the 'best' translations takes even more time.
system.time({
  sim2 <- sim.words(
    bibles$eng,
    bibles$deu,
    method = res,
    best = TRUE
  )
})

# Best co-occurrences for the English word "your":
which(sim2$best["your", ])

# The computation can be made faster by removing low values, though a
# suitable boundary (tol = 5 here) depends on the method used.
system.time({
  sim3 <- sim.words(
    bibles$eng,
    bibles$deu,
    best = TRUE,
    method = res,
    tol = 5
  )
})

# The decision on the 'best' translations remains the same here.
all.equal(sim2$best, sim3$best)
# }
# ----- computations also work with other languages -----
# Everything works completely independently of the language.
# Translations for the English word "we" in Tagalog:
sim <- sim.words(
  bibles$eng,
  bibles$tgl,
  best = TRUE,
  weight = idf,
  tol = 0.1
)
which(sim$best["we", ])
# Run the code above in your browser using DataLab