# ----- load data -----
# an example wordlist, see the help(huber) for details
data(huber)
# ----- show output -----
# a selection, to see the result of splitWordlist
# only show the simplified output here,
# the full output is rather long even for just these six words
sel <- c(1:3, 1255:1258)
splitWordlist(huber[sel,], simplify = TRUE)
# ----- split complete data -----
# splitting the complete wordlist is a lot of work !
# it won't get much quicker than this
# most time goes into the string-splitting of the almost 26,000 words
# Default version, included splitStrings:
system.time( H <- splitWordlist(huber) )
# Simplified version without splitStrings is much quicker:
system.time( H <- splitWordlist(huber, splitstrings = FALSE, simplify = TRUE) )
# ----- investigate colexification -----
# The simple version can be used to check how often two concepts
# are expressed identically across all languages ('colexification')
H <- splitWordlist(huber, splitstrings = FALSE, simplify = TRUE)
sim <- tcrossprod(H$CW*1)
# select only the frequent colexifications for a quick visualisation
diag(sim) <- 0
sim <- drop0(sim, tol = 5)
sim <- sim[rowSums(sim) > 0, colSums(sim) > 0]
if (FALSE) {
# this might lead to errors on some platforms because of non-ASCII symbols
plot( hclust(as.dist(-sim), method = "average"), cex = .5)
}
# ----- investigate regular sound correspondences -----
# One central problem with data from many languages is the variation of orthography
# It is preferred to solve that problem separately
# e.g. check the column "TOKENS" in the huber data
# This is a grapheme-separated version of the data.
# can be used to investigate co-occurrence of graphemes (approx. phonemes)
H <- splitWordlist(huber, counterparts = "TOKENS", sep = " ")
# co-occurrence of all pairs of the 2150 different graphemes through all languages
system.time( G <- assocSparse( (H$CW*1) %*% t(H$SW*1) %*% t(H$GS*1), method = poi))
rownames(G) <- colnames(G) <- H$graphemes
G <- drop0(G, tol = 1)
# select only one language pair for a quick visualisation
# check the nice sound changes between bora and muinane!
GD <- H$GS %*% H$SW %*% t(H$DW)
colnames(GD) <- H$doculects
correspondences <- G[GD[,"bora"],GD[,"muinane"]]
if (FALSE) {
# this might lead to errors on some platforms because of non-ASCII symbols
heatmap(as.matrix(correspondences))
}
Run the code above in your browser using DataLab