# NOT RUN {
# load the example datasets used throughout this script
data("usnews", "lexicons", "valence", "epu")

# build the sentomeasures object that every model below is fitted on
corpusAll <- sento_corpus(corpusdf = usnews)
corpus <- quanteda::corpus_subset(
  corpusAll,
  date >= "2004-01-01" & date < "2014-10-01"
)
l <- setup_lexicons(
  lexicons[c("LM_eng", "HENRY_eng")],
  valence[["valence_eng"]]
)
ctr <- ctr_agg(
  howWithin = "tf-idf",
  howDocs = "proportional",
  howTime = c("equal_weight", "almon"),
  by = "month",
  lag = 3,
  ordersAlm = 1:2,
  do.inverseAlm = TRUE,
  do.normalizeAlm = TRUE
)
sentomeasures <- sento_measures(corpus, l, ctr)

# response variable: keep the EPU index from the first sentiment date onwards
firstDate <- sentomeasures$measures$date[1]
y <- epu[epu$date >= firstDate, ]$index
nrow(sentomeasures$measures) == length(y) # TRUE

# two additional (random) explanatory variables, named x1 and x2
x <- data.frame(x1 = runif(length(y)), x2 = rnorm(length(y)))

# linear model, selected with the Akaike information criterion
ctrIC <- ctr_model(
  model = "gaussian",
  type = "AIC",
  do.iter = FALSE,
  h = 0
)
out1 <- sento_model(sentomeasures, y, x = x, ctr = ctrIC)

# post-analysis: attributions at the 20th to 40th measure dates
attributions1 <- retrieve_attributions(
  out1,
  sentomeasures,
  refDates = sentomeasures$measures$date[20:40]
)
# }
# NOT RUN {
# a cross-validation based model
# Leave two cores free, but always start at least one worker: with two or
# fewer cores (or an unknown core count, i.e. NA) `detectCores() - 2` is not
# a valid cluster size and makeCluster() would fail.
nWorkers <- max(1L, parallel::detectCores() - 2L, na.rm = TRUE)
# Namespace-qualified calls so this section does not depend on the parallel
# and doParallel packages having been attached beforehand.
cl <- parallel::makeCluster(nWorkers)
doParallel::registerDoParallel(cl)
ctrCV <- ctr_model(model = "gaussian", type = "cv", do.iter = FALSE,
                   h = 0, alphas = c(0.10, 0.50, 0.90), trainWindow = 70,
                   testWindow = 10, oos = 0, do.progress = TRUE)
out2 <- tryCatch(
  sento_model(sentomeasures, y, x = x, ctr = ctrCV),
  finally = {
    # Release the workers even if the fit errors, then fall back to the
    # sequential foreach backend so later code does not try to dispatch work
    # to the stopped cluster (foreach is installed alongside doParallel).
    parallel::stopCluster(cl)
    foreach::registerDoSEQ()
  }
)
summary(out2)
# }
# NOT RUN {
# }
# NOT RUN {
# cross-validation again, this time with a binomial target

# binary response: the `above` column of epu from the first sentiment date on
firstDate <- sentomeasures$measures$date[1]
yb <- epu[epu$date >= firstDate, ]$above

ctrCVb <- ctr_model(
  model = "binomial",
  type = "cv",
  do.iter = FALSE,
  h = 0,
  alphas = c(0.10, 0.50, 0.90),
  trainWindow = 70,
  testWindow = 10,
  oos = 0,
  do.progress = TRUE
)
out3 <- sento_model(sentomeasures, yb, x = x, ctr = ctrCVb)
summary(out3)
# }
# NOT RUN {
# iterative analysis, with model selection based on BIC
ctrIter <- ctr_model(
  model = "gaussian",
  type = "BIC",
  do.iter = TRUE,
  alphas = c(0.25, 0.75),
  h = 0,
  nSample = 100,
  start = 21
)
out4 <- sento_model(sentomeasures, y, x = x, ctr = ctrIter)
summary(out4)
# }
# NOT RUN {
# a similar iterative analysis, parallelized
# Leave two cores free, but always start at least one worker: with two or
# fewer cores (or an unknown core count, i.e. NA) `detectCores() - 2` is not
# a valid cluster size and makeCluster() would fail.
nWorkers <- max(1L, parallel::detectCores() - 2L, na.rm = TRUE)
# Namespace-qualified calls so this section does not depend on the parallel
# and doParallel packages having been attached beforehand.
cl <- parallel::makeCluster(nWorkers)
doParallel::registerDoParallel(cl)
ctrIter <- ctr_model(model = "gaussian", type = "Cp", do.iter = TRUE,
                     h = 0, nSample = 100, do.parallel = TRUE)
out5 <- tryCatch(
  sento_model(sentomeasures, y, x = x, ctr = ctrIter),
  finally = {
    # Release the workers even if the fit errors, then fall back to the
    # sequential foreach backend so later code does not try to dispatch work
    # to the stopped cluster (foreach is installed alongside doParallel).
    parallel::stopCluster(cl)
    foreach::registerDoSEQ()
  }
)
summary(out5)
# }
# NOT RUN {
# further post-analysis: attributions of the iterative model, then a prediction

attributions2 <- retrieve_attributions(out4, sentomeasures)
plot_attributions(attributions2, "features")

# number of regressors: all sentiment measures (the date column is excluded)
# plus the extra x variables
nx <- ncol(sentomeasures$measures) - 1 + ncol(x)

# randomly rescaled copy of rows 30 to 50 of the combined regressors
scaling <- runif(nx)
regressors <- cbind(sentomeasures$measures[, -1], x)
newx <- scaling * regressors[30:50, ]

preds <- predict(
  out1,
  newx = as.matrix(newx),
  type = "link"
)
# }
# Run the code above in your browser using DataLab