# NOT RUN {
# Toy example: feature frequencies for a dfm built from three short texts.
set.seed(20)
dfmat1 <- dfm(x = c("a a b b c d", "a d d d", "a a a"))

# Overall frequencies, then grouped frequencies under two `ties_method` rules.
textstat_frequency(x = dfmat1)
textstat_frequency(
  x = dfmat1,
  groups = c("one", "two", "one"),
  ties_method = "first"
)
textstat_frequency(
  x = dfmat1,
  groups = c("one", "two", "one"),
  ties_method = "dense"
)

# Documents where President == "Obama", with punctuation and English
# stopwords removed while building the dfm.
dfmat2 <- data_corpus_inaugural %>%
  corpus_subset(President == "Obama") %>%
  dfm(remove_punct = TRUE, remove = stopwords("english"))

# Frequency table for that dfm; show its first 10 rows.
tstat1 <- textstat_frequency(x = dfmat2)
head(tstat1, n = 10)
# }
# NOT RUN {
# Plot the 20 most frequent words (uses `tstat1` computed earlier).
library("ggplot2")

# head() returns at most 20 rows, so no all-NA rows are produced when the
# table has fewer than 20 features (which `tstat1[1:20, ]` would create).
ggplot(
  data = head(tstat1, 20),
  mapping = aes(x = reorder(feature, frequency), y = frequency)
) +
  geom_point() +
  coord_flip() +
  labs(x = NULL, y = "Frequency")

# Plot relative frequencies by group.
dfmat3 <- data_corpus_inaugural %>%
  corpus_subset(Year > 2000) %>%
  dfm(remove = stopwords("english"), remove_punct = TRUE) %>%
  dfm_group(groups = "President") %>%
  dfm_weight(scheme = "prop")

# Calculate relative frequency by president (n = 10 requested per group).
tstat2 <- textstat_frequency(dfmat3, n = 10, groups = "President")

# One descending position per row of `tstat2`, computed once and shared by the
# x aesthetic and the axis breaks. rev(seq_len(n)) equals n:1 for n >= 1 but
# is empty (rather than c(0, 1)) when `tstat2` has no rows.
row_pos <- rev(seq_len(nrow(tstat2)))

# Plot frequencies: each row gets its own x level, relabelled with its feature
# so that every facet shows its own features on a free scale.
ggplot(data = tstat2, aes(x = factor(row_pos), y = frequency)) +
  geom_point() +
  facet_wrap(~ group, scales = "free") +
  coord_flip() +
  scale_x_discrete(
    breaks = row_pos,
    labels = tstat2$feature
  ) +
  labs(x = NULL, y = "Relative frequency")
# }
# Run the code above in your browser using DataLab