# NOT RUN {
# ----- load data -----
# example wordlist; help(huber) describes it
data("huber")

# ----- similarity between languages -----
# Nearly all of the runtime is spent splitting the strings;
# the remaining steps barely affect the timing.
system.time(
  sim <- sim.lang(huber, method = "p")
)

# Simple distance-based UPGMA tree: the negated similarities are used as
# distances and clustered with average linkage.
plot(
  hclust(as.dist(-sim), method = "average"),
  cex = 0.7
)
# }
# NOT RUN {
# ----- similarity between concepts -----
# similarity based on bigrams
system.time( simB <- sim.con(huber, method = "b") )
# similarity based on colexification, which is much easier to calculate
system.time( simC <- sim.con(huber, method = "c") )

# As an example, look at all adjectival concepts
adj <- c(1, 5, 13, 14, 28, 35, 40, 48, 67, 89, 105, 106, 120, 131, 137, 146, 148,
         171, 179, 183, 188, 193, 195, 206, 222, 234, 259, 262, 275, 279, 292,
         294, 300, 309, 341, 353, 355, 359)

# Show them as two trees side by side.
# par() returns the previous settings, so keep them and restore exactly what
# the user had instead of forcing the layout back to c(1, 1).
old_par <- par(mfrow = c(1, 2))

# "ward" was renamed to "ward.D" in R 3.1.0; it is the same algorithm, but the
# old name triggers a message on every call.
plot(hclust(as.dist(-simB[adj, adj]), method = "ward.D"),
     cex = 0.5, main = "bigrams")
plot(hclust(as.dist(-simC[adj, adj]), method = "ward.D"),
     cex = 0.5, main = "colexification")

par(old_par)
# ----- similarity between graphemes -----
# This is a very crude approach towards regular sound correspondences.
# When the languages are not too distantly related, it works rather nicely.
# It can be used as a quick first guess of correspondences, as input for more
# advanced methods.
# all 2080 graphemes in the data by all 2080 graphemes, from all languages
# (system.time() reports how long the computation takes; the result is kept in X)
system.time( X <- sim.graph(huber) )
# throw away the low values
# NOTE(review): drop0() presumably comes from the Matrix package and removes
# entries whose absolute value is at most `tol` -- confirm against its docs
X$GG <- drop0(X$GG, tol = 1)
# give the columns the same labels as the rows
colnames(X$GG) <- rownames(X$GG)
# select just one pair of languages for a quick visualisation
# NOTE(review): assumes the "bora" and "muinane" columns of X$GD pick out the
# graphemes belonging to each language -- verify against help(sim.graph)
correspondences <- X$GG[X$GD[,"bora"],X$GD[,"muinane"]]
# convert to an ordinary matrix before handing it to heatmap()
heatmap(as.matrix(correspondences))
# }
# Run the code above in your browser using DataLab