# Make the demo corpora used below available to the session (presumably
# GERMAPARLMINI ships with polmineR and REUTERS with RcppCWB - confirm
# against the package documentation).
use(pkg = "polmineR")
use(pkg = "RcppCWB", corpus = "REUTERS")
# Turn the complete corpus into a data.table
dt <- decode("GERMAPARLMINI", to = "data.table")

# Restrict decoding to one p-attribute and one s-attribute
dt <- decode(
  "GERMAPARLMINI",
  s_attributes = "party",
  p_attributes = "word",
  to = "data.table"
)

# Decode only the part of the corpus attributed to a single speaker
dt <- corpus("GERMAPARLMINI") %>%
  subset(speaker == "Angela Dorothea Merkel") %>%
  decode(to = "data.table", s_attributes = c("speaker", "party", "date"))

# Same subcorpus with an explicit choice of attributes; the result is not
# assigned to a variable
corpus("GERMAPARLMINI") %>%
  subset(speaker == "Angela Dorothea Merkel") %>%
  decode(
    s_attributes = "party",
    p_attributes = "word",
    to = "data.table"
  )
# Decode a partition (built from REUTERS with `places` matched against
# "kuwait" as a regular expression)
P <- partition("REUTERS", places = "kuwait", regex = TRUE)
dt <- decode(P)

# Previous versions of polmineR offered an option to decode a single
# s-attribute. This is how you could proceed to get a table with metadata.
# The argument is spelled out as `s_attributes`, as in the other decode()
# calls of this script, instead of relying on partial argument matching.
dt <- decode(P, s_attributes = "id", decode = FALSE)

# Drop the token column by reference; only positions and ids are needed
dt[, "word" := NULL]

# First and last corpus position for every id
dt[
  ,
  list(cpos_left = min(.SD[["cpos"]]), cpos_right = max(.SD[["cpos"]])),
  by = "id"
]
# Convert a subcorpus to objects of the NLP package (never executed here
# because of the `if (FALSE)` guard)
if (FALSE) {
  if (requireNamespace("NLP")) {
    library(NLP)

    p <- corpus("GERMAPARLMINI") %>%
      subset(date == "2009-11-10" & speaker == "Angela Dorothea Merkel")
    s <- as(p, "String")
    a <- as(p, "Annotation")

    # With an NLP Annotation object at hand, the annotators of the openNLP
    # package can be used. As a brief illustration, extract the tokenized
    # words and the sentences.
    words <- s[a[a$type == "word"]]
    # Sentence extraction does not yet work perfectly for plenary protocols
    sentences <- s[a[a$type == "sentence"]]

    doc <- as(p, "AnnotatedPlainTextDocument")
  }
}
# Decode an integer vector of token ids
y <- decode(0L:20L, corpus = "GERMAPARLMINI", p_attributes = "word")

# Take the first column of the cpos() result for the query and decode it.
# decode() is called without assignment and `dt` is aggregated by "word" and
# "pos" right afterwards, so the columns are presumably added to `dt` by
# reference - confirm against the decode() documentation.
dt <- data.table::data.table(
  cpos = cpos("GERMAPARLMINI", query = "Liebe")[, 1]
)
decode(dt, p_attributes = c("word", "pos"), corpus = "GERMAPARLMINI")
y <- dt[, .N, by = c("word", "pos")]
# Run the code above in your browser using DataLab