# Use the ll-method explicitly: compute cooccurrences with method = NULL
# first, then apply ll() to the result as a separate step.
# NOTE(review): method = NULL presumably skips the built-in statistical
# test -- confirm against the cooccurrences() documentation.
oil <- cooccurrences(
  "REUTERS",
  query = "oil",
  method = NULL
)
oil <- ll(oil)

# Keep only entries with a count_coi of at least 3 and show them in the
# data viewer when running in an interactive session.
oil_min <- subset(oil, count_coi >= 3)
if (interactive()) {
  View(format(oil_min))
}

# Summarise the full (unfiltered) result.
summary(oil)
# Use the ll-method on a 'Cooccurrences'-object.
# The block is wrapped in if (FALSE), so it is never executed when the
# script is run as a whole.
if (FALSE) {
  R <- Cooccurrences(
    "REUTERS",
    left = 5L, right = 5L,
    p_attribute = "word"
  )
  # The return values of ll() and decode() are not assigned.
  # NOTE(review): presumably both update R in place -- confirm against the
  # class documentation.
  ll(R)
  decode(R)
  summary(R)
}
# Use the log-likelihood test for feature extraction: the partition x is
# compared against the corpus "GERMAPARLMINI" it was taken from.
x <- partition(
  "GERMAPARLMINI",
  speaker = "Merkel",
  interjection = "speech",
  regex = TRUE,
  p_attribute = "word"
)

# Variant 1: let features() apply the test itself via method = "ll".
f <- features(
  x,
  y = "GERMAPARLMINI",
  included = TRUE,
  method = "ll"
)

# Variant 2: call features() with method = NULL and apply ll() afterwards.
# This overwrites the result of variant 1.
f <- features(
  x,
  y = "GERMAPARLMINI",
  included = TRUE,
  method = NULL
)
f <- ll(f)
summary(f)
if (FALSE) {
  # A sample do-it-yourself calculation for log-likelihood:
  # compute the ll-value for the query "oil" and the word "prices".
  # The block is wrapped in if (FALSE), so it only runs when executed by hand.
  oil <- context("REUTERS", query = "oil", left = 5L, right = 5L)

  # (a) Prepare the 2x2 matrix with observed values.
  #     Row 1 holds the counts for "prices", row 2 the remaining counts;
  #     column 1 refers to the context of interest (coi), column 2 to the
  #     reference (ref).
  coi_hits <- as(oil, "data.table")[word == "prices"][["count_coi"]]
  ref_hits <- count("REUTERS", query = "prices")[["count"]] - coi_hits
  observed <- matrix(
    c(
      coi_hits, size(oil)[["coi"]] - coi_hits, # column 1 (filled first)
      ref_hits, size(oil)[["ref"]] - ref_hits  # column 2
    ),
    ncol = 2
  )

  # (b) Prepare the matrix with expected values from the margin sums:
  #     expected[i, j] = row_totals[i] * (col_totals[j] / N).
  #     Descriptive names are used so that base::c() is not masked.
  row_totals <- rowSums(observed)
  col_totals <- colSums(observed)
  N <- sum(observed)
  expected <- outer(row_totals, col_totals / N)

  # (c) Compute the log-likelihood value: 2 * sum(O * log(O / E)).
  ll_value <- 2 * sum(observed * log(observed / expected))

  # Look up the value reported by cooccurrences() for the same word, for
  # comparison with ll_value.
  df <- as.data.frame(cooccurrences("REUTERS", query = "oil"))
  subset(df, word == "prices")[["ll"]]
}
# Run the code above in your browser using DataLab