# Make the REUTERS corpus from package RcppCWB available
use(pkg = "RcppCWB", corpus = "REUTERS")

# Tokens at corpus positions 0 to 9 of GERMAPARLMINI (the first sentence)
get_token_stream(0:9, corpus = "GERMAPARLMINI", p_attribute = "word")

# Same positions, with the tokens joined into one string
get_token_stream(
  0:9,
  corpus = "GERMAPARLMINI",
  p_attribute = "word",
  collapse = " "
)

# Regions passed as a two-column integer matrix, one row per region
region_matrix <- rbind(c(0L, 9L), c(10L, 25L))
get_token_stream(
  region_matrix,
  corpus = "GERMAPARLMINI",
  p_attribute = "word",
  encoding = "latin1"
)

# Same regions, with 'beautify' switched on to remove surplus whitespace
get_token_stream(
  region_matrix,
  corpus = "GERMAPARLMINI",
  p_attribute = "word",
  encoding = "latin1",
  collapse = " ",
  beautify = TRUE
)

# Whole corpus, addressed by its corpus ID ...
fulltext <- get_token_stream("GERMAPARLMINI", p_attribute = "word")

# ... or starting from a corpus object
corpus("GERMAPARLMINI") %>%
  get_token_stream(p_attribute = "word") %>%
  head()

# Subcorpus: only the part of REUTERS where id == "127"
corpus("REUTERS") %>%
  subset(id == "127") %>%
  get_token_stream(p_attribute = "word") %>%
  head()

# Split REUTERS by s-attribute 'id' and decode the result of the split
pb_tokstr <- corpus("REUTERS") %>%
  split(s_attribute = "id") %>%
  get_token_stream(p_attribute = "word")
# \donttest{
# Token streams for a partition_bundle built from REUTERS by 'id'
pb <- partition_bundle("REUTERS", s_attribute = "id")
ts_list <- get_token_stream(pb)

# Decode two p-attributes (word and pos) in a single call
sp <- corpus("GERMAPARLMINI") %>%
  as.speeches(
    s_attribute_name = "speaker",
    s_attribute_date = "date",
    progress = FALSE
  )
p2 <- get_token_stream(
  sp,
  p_attribute = c("word", "pos"),
  verbose = FALSE
)

# Filter tokens with an expression handed to 'subset'; the expression
# refers to the decoded p-attributes by name (here: pos)
p_sub <- get_token_stream(
  sp,
  p_attribute = c("word", "pos"),
  subset = {!grepl("(\\$.$|ART)", pos)}
)

# Look up two multi-word queries, turn the matches into phrases, and
# combine phrase concatenation with a stopword- and pos-based filter
queries <- c(
  '"freiheitliche" "Grundordnung"',
  '"Bundesrepublik" "Deutschland"'
)
phr <- corpus("GERMAPARLMINI") %>%
  cpos(query = queries) %>%
  as.phrases(corpus = "GERMAPARLMINI")

# German stopword list from package tm, used in the filter below
kill <- tm::stopwords("de")

ts_phr <- get_token_stream(
  sp,
  p_attribute = c("word", "pos"),
  subset = {!word %in% kill & !grepl("(\\$.$|ART)", pos)},
  phrases = phr,
  progress = FALSE,
  verbose = FALSE
)
# }
# Run the code above in your browser using DataLab