# NOT RUN {
# ----- simple example -----
example <- c("still","till","stable","stale","tale","tall","ill","all")
( sim <- round( sim.strings(example), digits = 3) )
# show similarity in non-metric MDS
library(MASS)
mds <- isoMDS( as.dist(1-sim) )$points
plot(mds, type = "n", ann = FALSE, axes = FALSE)
text(mds, labels = example, cex = .7)
# }
# NOT RUN {
# ----- large example -----
# This similarity is meant to be used for large lists of wordforms.
# for example, all 15526 wordforms from the English Dalby Bible
# takes just a few seconds for the more than 1e8 pairwise comparisons
data(bibles)
words <- splitText(bibles$eng)$wordforms
system.time( sim <- sim.strings(words) )
# see most similar words
rownames(sim) <- colnames(sim) <- words
sort(sim["walk",], decreasing = TRUE)[1:10]
# just compare all words to "walk". This is the same as above, but less comparisons
# note that the overhead for the sparse conversion and matching of matrices is large
# this one is faster than doing all comparisons, but only be a factor 10
system.time( sim <- sim.strings(words, "walk"))
names(sim) <- words
sort(sim, decreasing = TRUE)[1:10]
# ----- comparison with Levinshtein -----
# don't try this with 'adist' from the utils package, it will take long!
# for a comparison, only take 2000 randomly selected strings: about a factor 30 slower
w <- sample(words, 2000)
system.time( sim1 <- sim.strings(w) )
system.time( sim2 <- adist(w) )
# compare the current approach with relative levenshtein similarity
# = number of matches / ( number of edits + number of matches)
# for reasons of speed, just take 1000 random words from the english bible
w <- sample(words, 1000)
sim1 <- sim.strings(w)
tmp <- adist(w, counts = TRUE)
sim2 <- 1- ( tmp / nchar(attr(tmp, "trafos")) )
# plotting relation between the two 'heatmap-style'
# not identical, but usefully similar
image( log(table(
round(as.dist(sim1) / 3, digits = 2) * 3,
round(as.dist(sim2) / 3, digits = 2) * 3 )),
xlab = "bigram similarity", ylab = "relative Levenshtein similarity")
<!-- % benchmarking? -->
<!-- % s <- seq(500,10500,by=1000) -->
<!-- % t <- sapply(s,function(x){system.time(sim.strings(sample(words,x)))[3]}) -->
<!-- % plot(log(s),log(t)) -->
<!-- % lm(log(t[-c(1:5)])~log(s[-c(1:5)])) -->
# }
Run the code above in your browser using DataLab