# a trivial examples to see the results of this function:
text <- c("This is a sentence .","A sentence this is !","Is this a sentence ?")
splitText(text)
splitText(text, simplify = TRUE, lowercase = FALSE)
# reasonably quick with complete bibles (about 1-2 second per complete bible)
# texts with only New Testament is even quicker
data(bibles)
system.time(eng <- splitText(bibles$eng, bibles$verses))
system.time(deu <- splitText(bibles$deu, bibles$verses))
# Use example: Number of co-ocurrences between two bibles
# (this is more conveniently performed by the function sim.words)
# How often do words from the one language cooccur with words from the other language?
ENG <- (eng$wW * 1) %*% (eng$WR * 1) %*% (eng$RS * 1)
DEU <- (deu$wW * 1) %*% (deu$WR * 1) %*% (deu$RS * 1)
C <- tcrossprod(ENG,DEU)
rownames(C) <- eng$lowercase
colnames(C) <- deu$lowercase
C[ c("father","father's","son","son's"),
c("vater","vaters","sohn","sohne","sohnes","sohns")
]
# Pure counts are not very interesting. This is better:
R <- assocSparse(t(ENG), t(DEU))
rownames(R) <- eng$lowercase
colnames(R) <- deu$lowercase
R[ c("father","father's","son","son's"),
c("vater","vaters","sohn","sohne","sohnes","sohns")
]
# For example: best co-occurrences for the english word "mine"
sort(R["mine",], decreasing = TRUE)[1:10]
# \donttest{
# To get a quick-and-dirty translation matrix:
# adding maxima from both sides work quite well
# but this takes some time
cm <- colMax(R, which = TRUE, ignore.zero = FALSE)$which
rm <- rowMax(R, which = TRUE, ignore.zero = FALSE)$which
best <- cm + rm
best <- as(best, "nMatrix")
which(best["your",])
which(best["went",])
# A final speed check:
# split all 4 texts, and simplify them into one matrix
# They have all the same columns, so they can be rbind
system.time(all <- sapply(bibles[-1], function(x){splitText(x, bibles$verses, simplify = TRUE)}))
all <- do.call(rbind, all)
# then try a single co-occerrence measure on all pairs from these 72K words
# (so we are doing about 2.6e9 comparisons here!)
system.time( S <- cosSparse(t(all)) )
# this goes extremely fast! As long as everything fits into RAM this works nicely.
# Note that S quickly gets large
print(object.size(S), units = "auto")
# but most of it can be thrown away, because it is too low anyway
# this leads to a factor 10 reduction in size:
S <- drop0(S, tol = 0.2)
print(object.size(S), units = "auto")
# }
Run the code above in your browser using DataLab