# code is only run when the english language package can be loaded
if(require("koRpus.lang.en", quietly = TRUE)){
  sample_file <- file.path(
    path.package("koRpus"), "examples", "corpus", "Reality_Winner.txt"
  )
  tokenized.obj <- tokenize(
    txt=sample_file,
    lang="en"
  )
  en_corp <- read.corp.custom(
    tokenized.obj,
    caseSens=FALSE
  )
  # look up the frequency entry for the word "winner"
  query(en_corp, var="word", query="winner")
  # show all entries with a frequency of exactly 3 in the corpus
  query(en_corp, "freq", 3)
  # now, which tokens appear more than 40000 times per million?
  query(en_corp, "pmio", 40000, "gt")
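  # a hedged counterpart to the "gt" relation above (assuming "lt" is
  # accepted analogously): tokens appearing fewer than 100 times per million
  query(en_corp, "pmio", 100, "lt")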
  # example for a range request: tokens with a log10 value between 4.2 and 4.7
  # (including these two values)
  query(en_corp, "log10", c(4.2, 4.7))
  # (and excluding them)
  query(en_corp, "log10", c(4.2, 4.7), "gt")
  # example for a list of queries: get words with a frequency between
  # 10000 and 25000 per million and at least four letters
  query(en_corp, query=list(
    list(pmio=c(10000, 25000)),
    list(lttr=4, rel="ge"))
  )
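  # a hedged variation on the list syntax above, reusing only the columns
  # already shown: the same per-million range, but requiring five letters or more
  query(en_corp, query=list(
    list(pmio=c(10000, 25000)),
    list(lttr=5, rel="ge"))
  )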
  # get all instances of "the" in a tokenized text object
  query(tokenized.obj, "token", "the")
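  # a hedged cross-check using base R: assuming the taggedText() accessor
  # returns the token data.frame of the tokenized object, count matches of
  # "the" directly (note that this plain comparison is case sensitive)
  sum(taggedText(tokenized.obj)[["token"]] == "the")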
}