# NOT RUN {
# Decode the first words of the GERMAPARLMINI corpus (the first sentence),
# addressed by the corpus positions 0 to 9
get_token_stream(
  0:9,
  corpus = "GERMAPARLMINI",
  p_attribute = "word"
)
# Decode the same first sentence, this time passing collapse = " " so the
# tokens are collapsed into a single string
get_token_stream(
  0:9,
  corpus = "GERMAPARLMINI",
  p_attribute = "word",
  collapse = " "
)
# Decode regions defined by a two-column matrix; the matrix is assembled row
# by row (presumably each row gives the first and last position of a region)
region_matrix <- rbind(c(0, 9), c(10, 25))
get_token_stream(
  region_matrix,
  corpus = "GERMAPARLMINI",
  p_attribute = "word",
  encoding = "latin1"
)
# Same regions, with the argument 'beautify' used to remove surplus whitespace
# from the collapsed string
get_token_stream(
  region_matrix,
  corpus = "GERMAPARLMINI",
  p_attribute = "word",
  encoding = "latin1",
  collapse = " ",
  beautify = TRUE
)
# Decode the entire corpus. Variant 1: the corpus is specified by its ID
fulltext <- get_token_stream("GERMAPARLMINI", p_attribute = "word")
# Variant 2: start from a corpus object and show only the head of the result
corpus("GERMAPARLMINI") %>%
  get_token_stream(p_attribute = "word") %>%
  head()
# Decode a subcorpus: restrict REUTERS to id "127", decode the "word"
# attribute and show the head of the result
corpus("REUTERS") %>%
  subset(id == "127") %>%
  get_token_stream(p_attribute = "word") %>%
  head()
# Decode a bundle: split the REUTERS corpus by the s-attribute "id" and
# decode the "word" attribute of the result
pb_tokstr <- corpus("REUTERS") %>%
  split(s_attribute = "id") %>%
  get_token_stream(p_attribute = "word")
# Get the token stream for a partition_bundle created with partition_bundle(),
# leaving every other argument of get_token_stream() at its default
pb <- partition_bundle("REUTERS", s_attribute = "id")
ts_list <- get_token_stream(pb)
# Workflow to filter decoded subcorpus_bundle
# }
# NOT RUN {
# Turn the corpus into speeches, using the s-attribute "speaker"
sp <- corpus("GERMAPARLMINI") %>%
  as.speeches(s_attribute_name = "speaker", progress = FALSE)
# Two queries, each consisting of two quoted tokens in sequence
queries <- c(
  '"freiheitliche" "Grundordnung"',
  '"Bundesrepublik" "Deutschland"'
)
# Look up the corpus positions of the query matches and convert them to phrases
phr <- corpus("GERMAPARLMINI") %>%
  cpos(query = queries) %>%
  as.phrases(corpus = "GERMAPARLMINI")
# German stopword list from the tm package, used in the filter below
kill <- tm::stopwords("de")
# Decode "word" and "pos"; the subset expression keeps only tokens that are
# not in 'kill' and whose pos value matches neither "$" plus one final
# character nor "ART"
ts_phr <- get_token_stream(
  sp,
  p_attribute = c("word", "pos"),
  subset = {!word %in% kill & !grepl("(\\$.$|ART)", pos)},
  phrases = phr,
  progress = FALSE,
  verbose = FALSE
)
# }
# Run the code above in your browser using DataLab