# NOT RUN {
as.dtm(DATA$state, DATA$person)
as.tdm(DATA$state, DATA$person)
x <- wfm(DATA$state, DATA$person)
as.tdm(x)
as.dtm(x)
library(tm)
plot(as.tdm(x))
pres <- as.tdm(pres_debates2012$dialogue, pres_debates2012$person)
plot(pres, corThreshold = 0.8)
pres
(pres2 <- removeSparseTerms(pres, .3))
plot(pres2, corThreshold = 0.95)
shorts <- all_words(pres_debates2012)[,1][nchar(all_words(
pres_debates2012)[,1]) < 4]
SW <- c(shorts, qdapDictionaries::contractions[, 1],
qdapDictionaries::Top200Words,
"governor", "president", "mister", "obama","romney")
DocTermMat2 <- with(pres_debates2012, as.dtm(dialogue, list(person, time), stopwords = SW))
DocTermMat2 <- removeSparseTerms(DocTermMat2,0.95)
(DocTermMat2 <- DocTermMat2[rowSums(as.matrix(DocTermMat2))> 0,])
plot(DocTermMat2)
## Correspondence Analysis
library(ca)
dat <- pres_debates2012
dat <- dat[dat$person %in% qcv(ROMNEY, OBAMA), ]
speech <- stemmer(dat$dialogue)
mytable1 <- with(dat, as.tdm(speech, list(person, time), stopwords = Top25Words))
fit <- ca(as.matrix(mytable1))
summary(fit)
plot(fit)
plot3d.ca(fit, labels=1)
mytable2 <- with(dat, as.tdm(speech, list(person, time), stopwords = Top200Words))
fit2 <- ca(as.matrix(mytable2))
summary(fit2)
plot(fit2)
plot3d.ca(fit2, labels=1)
## Topic Models
# Example 1 #
library(topicmodels); library(tm)
# Generate stop words based on short words, frequent words and contractions
shorts <- all_words(pres_debates2012)[,1][nchar(all_words(
pres_debates2012)[,1]) < 4]
SW <- c(shorts, qdapDictionaries::contractions[, 1],
qdapDictionaries::Top200Words,
"governor", "president", "mister", "obama","romney")
DocTermMat <- with(pres_debates2012, as.dtm(dialogue, person, stopwords = SW))
DocTermMat <- removeSparseTerms(DocTermMat,0.999)
DocTermMat <- DocTermMat[rowSums(as.matrix(DocTermMat))> 0,]
lda.model <- LDA(DocTermMat, 5)
(topics <- posterior(lda.model, DocTermMat)$topics)
terms(lda.model,20)
# Plot the Topics Per Person
topic.dat <- matrix2df(topics, "Person")
colnames(topic.dat)[-1] <- paste2(t(terms(lda.model,20)), sep=", ")
library(reshape2)
mtopic <- melt(topic.dat, variable="Topic", value.name="Proportion")
ggplot(mtopic, aes(weight=Proportion, x=Topic, fill=Topic)) +
geom_bar() +
coord_flip() +
facet_grid(Person~.) +
guides(fill=FALSE)
# Example 2 #
DocTermMat2 <- with(pres_debates2012, as.dtm(dialogue, list(person, time), stopwords = SW))
DocTermMat2 <- removeSparseTerms(DocTermMat2,0.95)
DocTermMat2 <- DocTermMat2[rowSums(as.matrix(DocTermMat2))> 0,]
lda.model2 <- LDA(DocTermMat2, 6)
(topics2 <- posterior(lda.model2, DocTermMat2)$topics)
terms(lda.model2,20)
qheat(topics2, high="blue", low="yellow", by.col=FALSE)
# Example 3 #
lda.model3 <- LDA(DocTermMat2, 10)
(topics3 <- posterior(lda.model3, DocTermMat2)$topics)
terms(lda.model3, 20)
qheat(topics3, high="blue", low="yellow", by.col=FALSE)
# Plot the Topics Per Person
topic.dat3 <- matrix2df(topics3, "Person&Time")
colnames(topic.dat3)[-1] <- paste2(t(terms(lda.model3, 10)), sep=", ")
topic.dat3 <- colsplit2df(topic.dat3)
library(reshape2)
library(scales)
mtopic3 <- melt(topic.dat3, variable="Topic", value.name="Proportion")
(p1 <- ggplot(mtopic3, aes(weight=Proportion, x=Topic, fill=Topic)) +
geom_bar() +
coord_flip() +
facet_grid(Person~Time) +
guides(fill=FALSE) +
scale_y_continuous(labels = percent) +
theme(plot.margin = unit(c(1, 0, 0.5, .5), "lines")) +
ylab("Proportion"))
mtopic3.b <- mtopic3
mtopic3.b[, "Topic"] <- factor(as.numeric(mtopic3.b[, "Topic"]), levels = 1:10)
mtopic3.b[, "Time"] <- factor(gsub("time ", "", mtopic3.b[, "Time"]))
p2 <- ggplot(mtopic3.b, aes(x=Time, y=Topic, fill=Proportion)) +
geom_tile(color = "white") +
scale_fill_gradient(low = "grey70", high = "red") +
facet_grid(Person~Time, scales = "free") +
theme(axis.title.y = element_blank(),
axis.text.x= element_text(colour="white"),
axis.ticks.x= element_line(colour="white"),
axis.ticks.y = element_blank(),
axis.text.y= element_blank(),
plot.margin = unit(c(1, -.5, .5, -.9), "lines")
)
library(gridExtra)
grid.arrange(p1, p2, nrow=1, widths = grid::unit(c(.85, .15), "native"))
## tm Matrices to wfm
library(tm)
data(crude)
## A Term Document Matrix Conversion
(tm_in <- TermDocumentMatrix(crude, control = list(stopwords = TRUE)))
converted <- as.wfm(tm_in)
head(converted)
summary(converted)
## A Document Term Matrix Conversion
(dtm_in <- DocumentTermMatrix(crude, control = list(stopwords = TRUE)))
summary(as.wfm(dtm_in))
## `apply_as_tm` Examples
## Create a wfm
a <- with(DATA, wfm(state, list(sex, adult)))
summary(a)
## Apply functions meant for a tm TermDocumentMatrix
out <- apply_as_tm(a, tm:::removeSparseTerms, sparse=0.6)
summary(out)
apply_as_tm(a, tm:::findAssocs, "computer", .8)
apply_as_tm(a, tm:::findFreqTerms, 2, 3)
apply_as_tm(a, tm:::Zipf_plot)
apply_as_tm(a, tm:::Heaps_plot)
apply_as_tm(a, tm:::plot.TermDocumentMatrix, corThreshold = 0.4)
library(proxy)
apply_as_tm(a, tm:::weightBin)
apply_as_tm(a, tm:::weightBin, to.qdap = FALSE)
apply_as_tm(a, tm:::weightSMART)
apply_as_tm(a, tm:::weightTfIdf)
## Convert tm Corpus to Dataframe
## A tm Corpus
library(tm)
reut21578 <- system.file("texts", "crude", package = "tm")
reuters <- Corpus(DirSource(reut21578),
readerControl = list(reader = readReut21578XML))
## Convert to dataframe
corp_df <- as.data.frame(reuters)
htruncdf(corp_df)
z <- as.Corpus(DATA$state, DATA$person,
demographic=DATA[, qcv(sex, adult, code)])
as.data.frame(z)
## Apply a qdap function
out <- formality(corp_df$text, corp_df$docs)
plot(out)
## Convert a qdap dataframe to tm package Corpus
(x <- with(DATA2, as.Corpus(state, list(person, class, day))))
library(tm)
inspect(x)
inspect_text(x)
class(x)
(y <- with(pres_debates2012, as.Corpus(dialogue, list(person, time))))
## Add demographic info to DMetaData of Corpus
z <- as.Corpus(DATA$state, DATA$person,
demographic=DATA[, qcv(sex, adult, code)])
lview(z)
lview(as.Corpus(DATA$state, DATA$person,
demographic=DATA$sex))
lview(as.Corpus(DATA$state, DATA$person,
demographic=list(DATA$sex, DATA$adult)))
## Apply qdap functions meant for dataframes from sentSplit to tm Corpus
library(tm)
reut21578 <- system.file("texts", "crude", package = "tm")
reuters <- Corpus(DirSource(reut21578),
readerControl = list(reader = readReut21578XML))
matches <- list(
oil = qcv(oil, crude),
money = c("economic", "money")
)
apply_as_df(reuters, word_stats)
apply_as_df(reuters, formality)
apply_as_df(reuters, word_list)
apply_as_df(reuters, polarity)
apply_as_df(reuters, Dissimilarity)
apply_as_df(reuters, diversity)
apply_as_df(reuters, pos_by)
apply_as_df(reuters, flesch_kincaid)
apply_as_df(reuters, trans_venn)
apply_as_df(reuters, gantt_plot)
apply_as_df(reuters, rank_freq_mplot)
apply_as_df(reuters, character_table)
(termco_out <- apply_as_df(reuters, termco, match.list = matches))
plot(termco_out, values = TRUE, high="red")
(wordcor_out <- apply_as_df(reuters, word_cor, word = unlist(matches)))
plot(wordcor_out)
(f_terms <- apply_as_df(reuters, freq_terms, at.least = 3))
plot(f_terms)
apply_as_df(reuters, trans_cloud)
## To use "all" rather than "docs" as "grouping.var"...
apply_as_df(reuters, trans_cloud, grouping.var=NULL,
target.words=matches, cloud.colors = c("red", "blue", "grey75"))
finds <- apply_as_df(reuters, freq_terms, at.least = 5,
top = 5, stopwords = Top100Words)
apply_as_df(reuters, dispersion_plot, match.terms = finds[, 1],
total.color = NULL)
## Filter for Term Document Matrix/Document Term Matrix
library(tm)
data(crude)
(tdm_in <- TermDocumentMatrix(crude, control = list(stopwords = TRUE)))
Filter(tdm_in, 5)
(dtm_in <- DocumentTermMatrix(crude, control = list(stopwords = TRUE)))
Filter(dtm_in, 5)
## Filter particular words based on max/min values
Filter(dtm_in, 5, 7)
Filter(dtm_in, 4, 4)
Filter(tdm_in, 3, 4)
Filter(tdm_in, 3, 4, stopwords = Top200Words)
## SPECIAL REMOVAL OF TERMS (more flexible consideration of words than wfm)
dat <- data.frame(
person = paste0("person_", 1:5),
tweets = c("test one two", "two apples","hashtag #apple",
"#apple #tree", "http://microsoft.com")
)
## remove specialty items
dat[[2]] <- rm_default(dat[[2]], pattern=pastex("@rm_url", "#apple\\b"))
myCorp <- tm::tm_map(crude, tm::removeWords, Top200Words)
myCorp %>% as.dtm() %>% tm::inspect()
# }
Run the code above in your browser using DataLab