# NOT RUN {
library(RcppCWB)
# In this example, we pursue a "pure R" approach. To rely on the "CWB"
# method, you can use the cwb_install() function, which will download and
# install the CWB command line tools within the package.
# Read the sample token stream shipped with the RcppCWB package.
# mustWork = TRUE makes a missing file raise an error at the lookup, rather
# than system.file() returning "" and that empty path reaching readLines().
tokens <- readLines(
  system.file(
    "extdata", "examples", "reuters.txt",
    package = "RcppCWB",
    mustWork = TRUE
  )
)
# Create new (and empty) directory structure below the session's temporary
# directory: one directory for the registry, one for the corpus data.
# Forward slashes are used throughout (winslash / fsep = "/").
tmpdir <- normalizePath(tempdir(), winslash = "/")
registry_tmp <- file.path(tmpdir, "registry", fsep = "/")
data_dir_tmp <- file.path(tmpdir, "data_dir", "reuters", fsep = "/")

# Remove leftovers from a previous run. A recursive unlink() also deletes any
# files inside the directories (e.g. "word.corpus"), silently ignores paths
# that do not exist, and clears a stale non-directory entry at either path.
unlink(c(registry_tmp, data_dir_tmp), recursive = TRUE)

dir.create(registry_tmp)
# The data directory is nested ("data_dir/reuters"), so parents are created.
dir.create(data_dir_tmp, recursive = TRUE)

# dir.create() only warns on failure; stop here rather than letting the
# encoding step fail later with a less obvious message.
stopifnot(dir.exists(registry_tmp), dir.exists(data_dir_tmp))
# Now encode token stream
# The tokens are stored as positional attribute "word" of corpus "reuters",
# using the "R" method (rather than the CWB command line tools), without
# compression. Data files go to data_dir_tmp, the registry to registry_tmp.
p_attribute_encode(
  token_stream = tokens,
  p_attribute = "word",
  corpus = "reuters",
  registry_dir = registry_tmp,
  data_dir = data_dir_tmp,
  method = "R",
  encoding = "utf8",
  compress = FALSE
)
# Create minimal registry file
# First assemble the registry description for corpus "REUTERS": its name, its
# home (the data directory), its properties and the one positional attribute.
# NOTE(review): the token stream is encoded with encoding = "utf8", whereas
# the property here is spelled "utf-8" -- confirm which form is expected.
regdata <- registry_data(
  id = "REUTERS",
  name = "Reuters Sample Corpus",
  home = data_dir_tmp,
  properties = c(encoding = "utf-8", language = "en"),
  p_attributes = "word"
)

# Write the description into the registry directory. There is deliberately no
# trailing comma after the last argument: a trailing comma would pass an
# additional, empty argument to the function.
regfile <- registry_file_write(
  data = regdata,
  corpus = "REUTERS",
  registry_dir = registry_tmp,
  data_dir = data_dir_tmp
)
# Reload corpus and run query as a test
# Point CQP at the temporary registry: reset the registry if CQP is already
# initialized, otherwise initialize it with this registry.
if (cqp_is_initialized()) {
  cqp_reset_registry(registry_tmp)
} else {
  cqp_initialize(registry_tmp)
}

# Match "oil" together with three tokens of context on either side.
cqp_query(corpus = "REUTERS", query = '[]{3} "oil" []{3};')
regions <- cqp_dump_subcorpus(corpus = "REUTERS")

# Turn every matched region (start and end corpus position in columns 1
# and 2) into one space-separated string. Iterating over seq_len(nrow(.))
# with vapply() yields character(0) when there is no match and guarantees a
# character vector as result.
kwic <- vapply(
  seq_len(nrow(regions)),
  function(i) {
    ids <- cl_cpos2id(
      corpus = "REUTERS", p_attribute = "word", registry = registry_tmp,
      cpos = regions[i, 1]:regions[i, 2]
    )
    words <- cl_id2str(
      corpus = "REUTERS", p_attribute = "word", registry = registry_tmp,
      id = ids
    )
    paste0(words, collapse = " ")
  },
  character(1)
)

# Show at most the first ten lines; unlike kwic[1:10], head() does not pad
# the output with NA when there are fewer than ten matches.
head(kwic, 10)
# }
# Run the code above in your browser using DataLab