## Not run:
set.seed(12345)
# load the package
library(preText)
# load in the data
data("UK_Manifestos")
# preprocess data
preprocessed_documents <- factorial_preprocessing(
    UK_Manifestos,
    use_ngrams = TRUE,
    infrequent_term_threshold = 0.02,
    verbose = TRUE)
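# optional sanity check (not part of the original example): the factorial
# design should yield one dfm per preprocessing specification, with matching
# human-readable labels; both fields are used further below
length(preprocessed_documents$dfm_list)
head(preprocessed_documents$labels)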
cross_validation_splits <- 10
# create 10 test/train splits
train_inds <- vector(mode = "list", length = cross_validation_splits)
test_inds <- vector(mode = "list", length = cross_validation_splits)
# sample train/test indices for each split
for (i in 1:cross_validation_splits) {
    # hold out roughly 20 percent of documents as the test set
    test <- sample(seq_along(UK_Manifestos),
                   size = round(length(UK_Manifestos)/5),
                   replace = FALSE)
    # the remaining documents form the training set
    train <- setdiff(seq_along(UK_Manifestos), test)
    train_inds[[i]] <- train
    test_inds[[i]] <- test
}
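# optional sanity check (not in the original example): for each split, the
# train and test index sets should be disjoint
stopifnot(all(sapply(1:cross_validation_splits, function(i)
    length(intersect(train_inds[[i]], test_inds[[i]])) == 0)))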
# get the optimal number of topics for each preprocessing specification
# (this will take a very long time):
optimal_k <- optimal_k_comparison(
    train_inds,
    test_inds,
    preprocessed_documents$dfm_list,
    topics = c(25, 50, 75, 100, 125, 150, 175, 200),
    names = preprocessed_documents$labels)
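# optional (not in the original example): optimal_k is indexed below with one
# entry per preprocessing specification, so a quick look at the distribution
# of selected k values can be informative
table(optimal_k)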
# run a topic model with the optimal number of topics for each
# preprocessing specification (the factorial design produces 128
# specifications here)
library(topicmodels) # attach so that terms() dispatches on LDA fits
num_specs <- length(preprocessed_documents$dfm_list)
top_terms_list <- vector(mode = "list", length = num_specs)
for (i in 1:num_specs) {
    fit <- topicmodels::LDA(
        quanteda::convert(preprocessed_documents$dfm_list[[i]],
                          to = "topicmodels"),
        k = optimal_k[i])
    # extract the top 20 terms for each topic
    top_terms <- terms(fit, 20)
    top_terms_list[[i]] <- top_terms
}
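# optional (not in the original example): fitting this many LDA models is
# slow, so caching the results avoids recomputation on a re-run; the file
# name here is arbitrary
saveRDS(top_terms_list, file = "top_terms_list.rds")
# top_terms_list <- readRDS("top_terms_list.rds")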
# calculate the novelty score for the first preprocessing specification
topic_novelty_score(top_terms_list[[1]])
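# an analogous score can be computed for every specification; a minimal
# sketch, assuming topic_novelty_score() returns a single numeric value
# per set of top terms:
novelty_scores <- sapply(top_terms_list, topic_novelty_score)
summary(novelty_scores)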
## End(Not run)