# Encode the sample VRT files shipped with RcppCWB as corpus "BTMIN". The
# data files go to a scratch directory below tempdir(); the registry
# directory is taken from the CORPUS_REGISTRY environment variable.
data_dir <- file.path(tempdir(), "bt_data_dir")
# dir.create() warns if the target already exists (e.g. when the example is
# run twice in one session), so only create the directory when it is missing.
if (!dir.exists(data_dir)) dir.create(data_dir)
cwb_encode(
  corpus = "BTMIN",
  registry = Sys.getenv("CORPUS_REGISTRY"),
  vrt_dir = system.file(package = "RcppCWB", "extdata", "vrt"),
  data_dir = data_dir,
  p_attributes = c("word", "pos", "lemma"),
  s_attributes = list(
    plenary_protocol = c(
      "lp", "protocol_no", "date", "year", "birthday", "version",
      "url", "filetype"
    ),
    speaker = c(
      "id", "type", "lp", "protocol_no", "date", "year", "ai_no", "ai_id",
      "ai_type", "who", "name", "parliamentary_group", "party", "role"
    ),
    p = character()
  )
)
# Clean up. unlink() does not delete directories unless recursive = TRUE, so
# the flag is required for the data directory to actually be removed.
unlink(data_dir, recursive = TRUE)
# Remove the "btmin" file from the registry directory.
unlink(file.path(Sys.getenv("CORPUS_REGISTRY"), "btmin"))
# The package includes an 'unfinished' corpus of debates in the UN General
# Assembly ("UNGA"), i.e. it does not yet include the reverse index, and it
# is not compressed.
#
# The first step in the following example is to copy the raw
# corpus to a temporary place.
# Directory inside the installed package that holds the raw UNGA data files.
home_dir <- system.file(
  "extdata", "cwb", "indexed_corpora", "unga",
  package = "RcppCWB"
)

# Working directory <tempdir>/indexed_corpora/unga2; create the parent first.
tmp_data_dir <- file.path(tempdir(), "indexed_corpora")
tmp_unga_dir <- file.path(tmp_data_dir, "unga2")
if (!file.exists(tmp_data_dir)) {
  dir.create(tmp_data_dir)
}
if (file.exists(tmp_unga_dir)) {
  # Left over from an earlier run: remove the files it still contains.
  file.remove(list.files(tmp_unga_dir, full.names = TRUE))
} else {
  dir.create(tmp_unga_dir)
}

# Derive a registry file "unga2" from the packaged "unga" registry file:
# point HOME at the working directory and change the corpus ID.
regfile <- readLines(
  system.file("extdata", "cwb", "registry", "unga", package = "RcppCWB")
)
regfile[grepl("^HOME", regfile)] <- sprintf('HOME "%s"', tmp_unga_dir)
regfile[grepl("^ID", regfile)] <- "ID unga2"
writeLines(regfile, con = file.path(get_tmp_registry(), "unga2"))

# Copy every file of the raw corpus into the working directory. file.copy()
# is vectorized over `from`; invisible() keeps the result from printing.
invisible(
  file.copy(
    from = list.files(home_dir, full.names = TRUE),
    to = tmp_unga_dir
  )
)
# Run cwb_makeall (the counterpart of the cwb-makeall command line utility)
# on the "word" attribute of the copied corpus.
cwb_makeall(
  registry = get_tmp_registry(),
  corpus = "UNGA2",
  p_attribute = "word"
)
# Load UNGA2 from the temporary registry through both the cl_ and the cqp_
# loader.
cl_load_corpus("UNGA2", registry = get_tmp_registry())
cqp_load_corpus("UNGA2", registry = get_tmp_registry())
# Check that the corpus is usable: map corpus positions 0 to 83 to token ids,
# the ids to strings, and paste the strings into one line of text.
ids_sentence_1 <- cl_cpos2id(
  corpus = "UNGA2",
  p_attribute = "word",
  cpos = seq.int(from = 0L, to = 83L),
  registry = get_tmp_registry()
)
tokens_sentence_1 <- cl_id2str(
  corpus = "UNGA2",
  p_attribute = "word",
  id = ids_sentence_1,
  registry = get_tmp_registry()
)
# Tokens are joined with single blanks; whitespace in front of a period or
# comma is then removed.
sentence <- gsub(
  pattern = "\\s+([\\.,])",
  replacement = "\\1",
  x = paste(tokens_sentence_1, collapse = " ")
)
# Run cwb_huffcode (the counterpart of the cwb-huffcode command line utility)
# on the "word" attribute.
cwb_huffcode(
  registry = get_tmp_registry(),
  corpus = "UNGA2",
  p_attribute = "word"
)
# Run cwb_compress_rdx (the counterpart of the cwb-compress-rdx command line
# utility) on the same attribute.
cwb_compress_rdx(
  registry = get_tmp_registry(),
  corpus = "UNGA2",
  p_attribute = "word"
)
# Run the code above in your browser using DataLab