# Workflow: build a document-term matrix whose columns include phrases

# Token counts on the "word" p-attribute; handed to pmi() as `observed` below.
obs <- corpus("GERMAPARLMINI") %>%
  count(p_attribute = "word")

# Bigrams scored with pmi(), restricted to those with ngram_count > 5 and
# then to the index range 1:100, finally turned into a phrases object.
phrases <- corpus("GERMAPARLMINI") %>%
  ngrams(n = 2L, p_attribute = "word") %>%
  pmi(observed = obs) %>%
  subset(ngram_count > 5L) %>%
  subset(1:100) %>%
  as.phrases()

# Split the corpus into speeches, count with the phrases supplied, and
# convert the "count" column into a DocumentTermMatrix.
dtm <- corpus("GERMAPARLMINI") %>%
  as.speeches(
    s_attribute_name = "speaker",
    s_attribute_date = "date",
    progress = TRUE
  ) %>%
  count(
    phrases = phrases,
    p_attribute = "word",
    progress = TRUE,
    verbose = TRUE
  ) %>%
  as.DocumentTermMatrix(col = "count", verbose = FALSE)

# Look up the column positions of two concatenated phrases in the matrix.
grep("erneuerbaren_Energien", colnames(dtm))
grep("verpasste_Chancen", colnames(dtm))
# Activate the REUTERS corpus from the RcppCWB package.
use(pkg = "RcppCWB", corpus = "REUTERS")

# Derive a phrases object from an ngrams object: bigrams are scored with
# pmi() against the corpus token counts, filtered to ngram_count >= 5 and
# then to the index range 1:25.
reuters_phrases <- ngrams("REUTERS", p_attribute = "word", n = 2L) %>%
  pmi(
    observed = count("REUTERS", p_attribute = "word")
  ) %>%
  subset(ngram_count >= 5L) %>%
  subset(1:25) %>%
  as.phrases()

# Character representation of the phrases, based on the "word" p-attribute.
phr <- as.character(reuters_phrases, p_attribute = "word")
# Derive phrases from explicitly stated CQP queries (one query per element)
cqp_phrase_queries <- c(
  '"oil" "revenue";',
  '"Sheikh" "Aziz";',
  '"Abdul" "Aziz";',
  '"Saudi" "Arabia";',
  '"oil" "markets";'
)

# Resolve the queries to corpus positions, then wrap the result as a phrases
# object tied to the REUTERS corpus and its latin1 encoding.
reuters_phrases <- cpos(
  "REUTERS",
  query = cqp_phrase_queries,
  p_attribute = "word"
) %>%
  as.phrases(corpus = "REUTERS", enc = "latin1")
# Use the concatenate_phrases() function on a data.table

# CQP queries describing the multi-word lexical units to be merged. The "."
# in "f.r" is a regex wildcard standing in for the umlaut.
lexical_units_cqp <- c(
  '"Deutsche.*" "Bundestag.*";',
  '"sozial.*" "Gerechtigkeit";',
  '"Ausschuss" "f.r" "Arbeit" "und" "Soziales";',
  '"soziale.*" "Marktwirtschaft";',
  '"freiheitliche.*" "Grundordnung";'
)

# Resolve the queries to corpus positions and wrap them as a phrases object.
# `enc` is the character encoding of the corpus, not a p-attribute name: the
# previous value "word" was not an encoding.
# NOTE(review): "latin1" assumes GERMAPARLMINI is latin1-encoded -- confirm
# with encoding("GERMAPARLMINI").
phr <- cpos("GERMAPARLMINI", query = lexical_units_cqp, cqp = TRUE) %>%
  as.phrases(corpus = "GERMAPARLMINI", enc = "latin1")

# Decode the token stream into a data.table (no s-attributes requested) and
# concatenate the tokens of each matched phrase in the "word" column.
dt <- corpus("GERMAPARLMINI") %>%
  decode(
    p_attribute = "word",
    s_attribute = character(),
    to = "data.table"
  ) %>%
  concatenate_phrases(phrases = phr, col = "word")

# Inspect rows where concatenation produced the expected phrase tokens.
dt[word == "Deutschen_Bundestag"]
dt[word == "soziale_Marktwirtschaft"]
# Run the code above in your browser using DataLab