# NOT RUN {
# Build a k-gram frequency table from a character vector.
# The second argument (3) is the maximum k-gram order: counts are stored
# for all 1-, 2- and 3-grams of the input text.
f <- kgram_freqs("a b b a a", 3)
f
summary(f)
query(f, c("a", "b")) # c(3, 2): unigram counts of "a" and "b"
# EOS()/BOS() are the end-/begin-of-sentence tokens; %+% concatenates
# them with text (both presumably exported by kgrams — see package docs).
query(f, c("a b", "a" %+% EOS(), BOS() %+% "a b")) # c(1, 1, 1)
query(f, "a b b a") # NA (counts for k-grams of order k > 3 are not known)
# Feed a new sentence into the existing table. By default the table is
# modified by reference (in place), not copied:
process_sentences("b", f)
query(f, c("a", "b")) # c(3, 3): 'f' is updated in place
# With in_place = FALSE the input table is left untouched and the
# updated counts go into the returned copy:
f1 <- process_sentences("b", f, in_place = FALSE)
query(f, c("a", "b")) # c(3, 3): 'f' is copied
query(f1, c("a", "b")) # c(3, 4): the new 'f1' stores the updated counts
# Build a k-gram frequency table from a file connection
# }
# NOT RUN {
# kgram_freqs() also accepts a connection; text is read from it instead
# of from an in-memory character vector.
f <- kgram_freqs(file("myfile.txt"), 3)
# }
# NOT RUN {
# Build a k-gram frequency table from an URL connection
# }
# NOT RUN {
### Shakespeare's "Much Ado About Nothing" (entire play)
con <- url("http://shakespeare.mit.edu/much_ado/full.html")
# Apply some basic preprocessing
.preprocess <- function(x) {
  # Custom preprocessing for the raw HTML of the play: strip speaker
  # names and markup before handing the text to kgrams::preprocess().
  #
  # Remove character names and locations (boldfaced in the original html).
  # Fix: the original pattern used "[A-z]", a class that also matches the
  # ASCII punctuation between 'Z' and 'a' ("[", "\", "]", "^", "_", "`");
  # "[A-Za-z]" matches letters only.
  x <- gsub("<b>[A-Za-z]+</b>", "", x)
  # Remove other html tags, including tags split across line breaks:
  # a complete tag, a tag head cut off at end of line, or a tag tail at
  # the start of a line.
  # Fix: regex alternation is a single "|" (the original "||" inserted
  # empty alternatives), and the third branch was anchored at both ends
  # ("^[^>]+>$"), so a split-tag tail followed by text was never removed.
  x <- gsub("<[^>]+>|<[^>]+$|^[^>]+>", "", x)
  # Apply the package's standard preprocessing (includes lower-casing)
  x <- kgrams::preprocess(x)
  return(x)
}
.tknz_sent <- function(x) {
  # Sentence tokenizer for the example: split the text into sentences
  # while keeping Shakespeare's punctuation, then drop the empty
  # sentences that the split can leave behind.
  sents <- kgrams::tknz_sent(x, keep_first = TRUE)
  sents[sents != ""]
}
# Build the 3-gram table directly from the URL connection, applying the
# custom preprocessing and sentence tokenizer defined above; the text is
# read from 'con' in batches of 1000 lines.
f <- kgram_freqs(con, 3, .preprocess, .tknz_sent, batch_size = 1000)
summary(f)
# Query some unigram counts; "smartphones" never occurs in the play.
query(f, c("leonato", "thy", "smartphones")) # c(145, 52, 0)
# }
# Run the code above in your browser using DataLab