# Usage examples for Cooccurrences(): compute all cooccurrences of a corpus or
# partition, filter them, and plot the result as a graph.
# Everything is wrapped in `if (FALSE)`, so nothing is executed when this file
# is sourced; run the statements interactively instead.
# NOTE(review): the function and corpus names (REUTERS, GERMAPARLMINI) suggest
# that polmineR and its sample corpora need to be loaded first -- confirm.
if (FALSE) {
  # Scenario 1 ----
  # In a first scenario, we get all cooccurrences for the REUTERS corpus,
  # excluding stopwords.
  stopwords <- unname(unlist(
    noise(
      terms("REUTERS", p_attribute = "word"),
      stopwordsLanguage = "en"
    )
  ))

  r <- Cooccurrences(
    .Object = "REUTERS", p_attribute = "word",
    left = 5L, right = 5L, stoplist = stopwords
  )
  ll(r) # note that the table in the stat slot is augmented in-place
  decode(r) # in-place modification, again

  # Keep rows with ll > 11.83 and ab_count >= 5, then sort the stat table by
  # `ll` in decreasing order (setorderv() reorders by reference).
  r <- subset(r, ll > 11.83 & ab_count >= 5)
  data.table::setorderv(r@stat, cols = "ll", order = -1L)
  head(r, 25)

  # Plotting is optional: it is skipped when igraph is not installed.
  if (requireNamespace("igraph", quietly = TRUE)) {
    r@partition <- enrich(r@partition, p_attribute = "word")
    g <- as_igraph(r, as.undirected = TRUE)
    plot(g)
  }

  # Scenario 2 ----
  # The next scenario is a cross-check that extracting cooccurrences from a
  # Cooccurrences-class object with all cooccurrences and the result for
  # getting cooccurrences for a single query are identical.
  a <- cooccurrences(r, query = "oil")
  a <- data.table::as.data.table(a)

  b <- cooccurrences(
    "REUTERS",
    query = "oil", left = 5L, right = 5L, p_attribute = "word"
  )
  b <- data.table::as.data.table(b)
  # `r` was computed with a stoplist, so drop stopwords from `b` as well
  # before comparing.
  b <- b[!word %in% stopwords]

  all(b[["word"]][1:5] == a[["word"]][1:5]) # needs to be identical!

  # Scenario 3 ----
  # Stopwords for the German corpus (replaces the English list from above).
  stopwords <- unname(unlist(
    noise(
      terms("GERMAPARLMINI", p_attribute = "word"),
      stopwordsLanguage = "german"
    )
  ))

  # We now filter cooccurrences by keeping only the statistically
  # significant cooccurrences, identified by comparison with cooccurrences
  # derived from a reference corpus.
  plpr_partition <- partition(
    "GERMAPARLMINI", date = "2009-11-10", interjection = "speech",
    p_attribute = "word"
  )
  plpr_cooc <- Cooccurrences(
    plpr_partition, p_attribute = "word",
    left = 3L, right = 3L,
    stoplist = stopwords,
    verbose = TRUE
  )
  decode(plpr_cooc)
  ll(plpr_cooc)

  merkel <- partition(
    "GERMAPARLMINI",
    speaker = "Merkel", date = "2009-11-10", interjection = "speech",
    regex = TRUE,
    p_attribute = "word"
  )
  merkel_cooc <- Cooccurrences(
    merkel, p_attribute = "word",
    left = 3L, right = 3L,
    stoplist = stopwords,
    verbose = TRUE
  )
  decode(merkel_cooc)
  ll(merkel_cooc)

  # Reduce merkel_cooc to the pairs whose rank_ll is at most 50 in the
  # comparison of merkel_cooc with plpr_cooc.
  merkel_min <- subset(
    merkel_cooc,
    by = subset(features(merkel_cooc, plpr_cooc), rank_ll <= 50)
  )

  # Scenario 4 ----
  # Essentially the same procedure as in the previous example, but with
  # two positional attributes, so that part-of-speech annotation is
  # used for additional filtering.
  protocol <- partition(
    "GERMAPARLMINI",
    date = "2009-11-10",
    p_attribute = c("word", "pos"),
    interjection = "speech"
  )
  protocol_cooc <- Cooccurrences(
    protocol,
    p_attribute = c("word", "pos"),
    left = 3L, right = 3L
  )
  ll(protocol_cooc)
  decode(protocol_cooc)

  # Note: `merkel`, `merkel_cooc` and `merkel_min` from the previous scenario
  # are overwritten here.
  merkel <- partition(
    "GERMAPARLMINI",
    speaker = "Merkel",
    date = "2009-11-10",
    interjection = "speech",
    regex = TRUE,
    p_attribute = c("word", "pos")
  )
  merkel_cooc <- Cooccurrences(
    merkel,
    p_attribute = c("word", "pos"),
    left = 3L, right = 3L,
    verbose = TRUE
  )
  ll(merkel_cooc)
  decode(merkel_cooc)

  f <- features(merkel_cooc, protocol_cooc)
  # Keep only pairs where both tokens are tagged "NN" or "ADJA".
  f <- subset(f, a_pos %in% c("NN", "ADJA"))
  f <- subset(f, b_pos %in% c("NN", "ADJA"))
  # Keep the first 50 rows at most. The positional mask also works when fewer
  # than 50 rows are left, whereas rep(FALSE, times = nrow(f) - 50) would fail
  # with a negative `times` argument.
  f <- subset(f, seq_len(nrow(f)) <= 50L)

  merkel_min <- subset(merkel_cooc, by = f)

  if (requireNamespace("igraph", quietly = TRUE)) {
    g <- as_igraph(merkel_min, as.undirected = TRUE)
    plot(g)
  }
}
# Run the code above in your browser using DataLab