# NOT RUN {
# first a simple example using the farms-dataset from MASS
library(MASS)
# similarities between farms
s <- sim.obs(farms)
plot(hclust(as.dist(1-s), method = "ward.D"))
# similarities between attributes (`variables`)
s <- sim.att(farms)
plot(hclust(as.dist(1-s), method = "ward.D"))
# use the split option for multi-valued cells
farms2 <- as.matrix(farms)
farms2[1,1] <- "M1,M5"
s <- sim.obs(farms2, split = ",")
plot(hclust(as.dist(1-s), method = "ward.D"))
# }
# NOT RUN {
# a larger example with lots of missing data: the WALS-data as included here
# computations go reasonably quick
# (on 2566 observations and 131 attributes with 630 different values in total)
data(wals)
system.time(s <- sim.att(wals$data))
rownames(s) <- colnames(wals$data)
plot(hclust(as.dist(1-s), method = "ward.D"), cex = 0.5)
# Note that using sparse=T speeds up computations because it
# ignores zero co-occurrences
# This leads to small errors in the computation of Chuprov's T
system.time( # faster
chup.sparse <- sim.att(wals$data, method = "chuprov", sparse = TRUE)
)
system.time( # slower
chup.full <- sim.att(wals$data, method = "chuprov", sparse = FALSE)
)
# The sparse approach is almost identical to the full approach.
# sparse sligtly underestimates the real values for Chuprov's T
plot(as.dist(chup.sparse), as.dist(chup.full))
# some more similarities on the attributes
g <- sim.att(wals$data, method = "g") # Dunning's G
m <- sim.att(wals$data, method = "mutual") # Mutual Information
v <- sim.att(wals$data, method = "variation") # Variation of Information
# Note the strong differences between these approaches
pairs(~ as.dist(chup.sparse) + as.dist(m) + as.dist(g) + as.dist(v),
labels=c("Chuprov's T","Mutual Information","G-statistic","Variation of Information"))
# Relative Hamming similarity on all observations (languages) in WALS
# time is not a problem, but the data is so sparse
# that for many language-pairs there is no shared data
system.time( s <- sim.obs(wals$data))
# select only the 168 language with more than 80 datapoints
sel <- wals$data[apply(wals$data,1,function(x){sum(!is.na(x))})>80,]
# compare different similarities
w <- sim.obs(sel, "weighted")
h <- sim.obs(sel, "hamming")
r <- sim.obs(sel, "res")
p <- sim.obs(sel, "poi")
m <- sim.obs(sel, "wpmi")
i <- sim.obs(sel, "pmi")
pairs(~ as.dist(w) + as.dist(h) + as.dist(r) + as.dist(p) + as.dist(m) + as.dist(i),
labels = c("weighted","hamming","residuals","poisson","weighted PMI","PMI"))
# }
Run the code above in your browser using DataLab