wfm: Word Frequency Matrix

Description

wfm - Generate a word frequency matrix by grouping variable(s).

wfdf - Generate a word frequency data frame by grouping variable.

wfm_expanded - Expand a word frequency matrix to have multiple rows for each word.

wfm_combine - Combines words (rows) of a word frequency matrix (wfm) or word frequency data frame (wfdf) together.

weight - Weight a word frequency matrix for analysis where such weighting is sensible.

weight.wfdf - Weight a word frequency matrix for analysis where such weighting is sensible.

as.wfm - Attempts to coerce a matrix to a wfm.

Usage

wfm(
  text.var = NULL,
  grouping.var = NULL,
  output = "raw",
  stopwords = NULL,
  char2space = "~~",
  ...
)
# S3 method for wfdf
wfm(
  text.var = NULL,
  grouping.var = NULL,
  output = "raw",
  stopwords = NULL,
  char2space = "~~",
  ...
)
# S3 method for character
wfm(
  text.var = NULL,
  grouping.var = NULL,
  output = "raw",
  stopwords = NULL,
  char2space = "~~",
  ...
)
# S3 method for factor
wfm(
  text.var = NULL,
  grouping.var = NULL,
  output = "raw",
  stopwords = NULL,
  char2space = "~~",
  ...
)
wfdf(
  text.var,
  grouping.var = NULL,
  stopwords = NULL,
  margins = FALSE,
  output = "raw",
  digits = 2,
  char2space = "~~",
  ...
)
wfm_expanded(text.var, grouping.var = NULL, ...)
wfm_combine(wf.obj, word.lists, matrix = TRUE)
# S3 method for wfdf
weight(x, type = "prop", ...)
# S3 method for wfm
weight(x, type = "prop", ...)
as.wfm(x, ...)
# S3 method for matrix
as.wfm(x, ...)
# S3 method for default
as.wfm(x, ...)
# S3 method for TermDocumentMatrix
as.wfm(x, ...)
# S3 method for DocumentTermMatrix
as.wfm(x, ...)
# S3 method for data.frame
as.wfm(x, ...)
# S3 method for wfdf
as.wfm(x, ...)
# S3 method for Corpus
as.wfm(x, col = "docs", row = "text", ...)
# S3 method for Corpus
wfm(text.var, ...)

Arguments

text.var

The text variable.

grouping.var

The grouping variables. Default NULL generates one word list for all text. Also takes a single grouping variable or a list of 1 or more grouping variables.

output

Output type: "raw" (the default), "proportion", or "percent".

stopwords

A vector of stop words to remove.

char2space

A vector of characters to be turned into spaces. If char.keep is NULL, char2space will activate this argument.

margins

logical. If TRUE provides grouping.var and word variable totals.

digits

An integer indicating the number of decimal places (round) or significant digits (signif) to be used. Negative values are allowed.

wf.obj

A wfm or wfdf object.

word.lists

A list of character vectors of words to pass to wfm_combine.

matrix

logical. If TRUE returns the output as a wfm rather than a wfdf object.

x

An object with words for row names and integer values.

type

The type of weighting to use: c("prop", "max", "scaled"). All weight by column. "prop" uses a proportion weighting and all columns sum to 1. "max" weights in proportion to the max value; all values are integers and column sums may not be equal. "scaled" uses scale to scale with center = FALSE; output is not integer and column sums may not be equal.

col

The column name (generally not used).

row

The row name (generally not used).

…

Other arguments supplied to Corpus or TermDocumentMatrix. For as.wfm, these are other arguments passed to as.wfm methods (currently ignored).

Value

wfm - returns a word frequency matrix of the class matrix.

wfdf - returns a word frequency data frame of the class data.frame with a words column and optional margin sums.

wfm_expanded - returns a matrix similar to a word frequency matrix (wfm) but the rows are expanded to represent the maximum usages of the word and cells are dummy coded to indicate that number of uses.

wfm_combine - returns a word frequency matrix (wfm) or dataframe (wfdf) with counts for the combined word.lists merged and remaining terms (else).

weight - Returns a weighted matrix for use with other R packages. The output is not of the class "wfm".

as.wfm - Returns a matrix of the class "wfm".

Examples

Run this code

# NOT RUN {
## word frequency matrix (wfm) example:
with(DATA, wfm(state, list(sex, adult)))[1:15, ]
with(DATA, wfm(state, person))[1:15, ]
Filter(with(DATA, wfm(state, list(sex, adult))), 5)
with(DATA, wfm(state, list(sex, adult)))

## Filter particular words based on max/min values in wfm
v <- with(DATA, wfm(state, list(sex, adult)))
Filter(v, 5)
Filter(v, 5, count.apostrophe = FALSE)
Filter(v, 5, 7)
Filter(v, 4, 4)
Filter(v, 3, 4)
Filter(v, 3, 4, stopwords = Top25Words)

## insert double tilde ("~~") to keep phrases together (e.g., first and last name)
alts <- c(" fun", "I ")
state2 <- space_fill(DATA$state, alts, rm.extra = FALSE)
with(DATA, wfm(state2, list(sex, adult)))[1:18, ]

## word frequency dataframe (wfdf) example:
with(DATA, wfdf(state, list(sex, adult)))[1:15, ]
with(DATA, wfdf(state, person))[1:15, ]

## wfm_expanded example:
z <- wfm(DATA$state, DATA$person)
wfm_expanded(z)[30:45, ] #two "you"s

## wfm_combine examples:
#===================
## raw no margins (will work) 
x <- wfm(DATA$state, DATA$person) 
                    
## raw with margin (will work) 
y <- wfdf(DATA$state, DATA$person, margins = TRUE) 

## Proportion matrix
z2 <- wfm(DATA$state, DATA$person, output="proportion")

WL1 <- c(y[, 1])                                                                      
WL2 <- list(c("read", "the", "a"), c("you", "your", "you're"))                       
WL3 <- list(bob = c("read", "the", "a"), yous = c("you", "your", "you're"))          
WL4 <- list(bob = c("read", "the", "a"), yous = c("a", "you", "your", "your're"))     
WL5 <- list(yous = c("you", "your", "your're"))                                       
WL6 <- list(c("you", "your", "your're"))  #no name so will be called words 1          
WL7 <- c("you", "your", "your're")                             
                                                               
wfm_combine(z2, WL2) #Won't work not a raw frequency matrix     
wfm_combine(x, WL2)  #Works (raw and no margins)                     
wfm_combine(y, WL2)  #Works (raw with margins)                           
wfm_combine(y, c("you", "your", "your're"))                        
wfm_combine(y, WL1)                                                  
wfm_combine(y, WL3)                                                   
## wfm_combine(y, WL4) #Error         
wfm_combine(y, WL5)                                         
wfm_combine(y, WL6)                                              
wfm_combine(y, WL7)                                           
                                                                  
worlis <- c("you", "it", "it's", "no", "not", "we")              
y <- wfdf(DATA$state, list(DATA$sex, DATA$adult), margins = TRUE)  
z <- wfm_combine(y, worlis)                      
                                                                 
chisq.test(z)                                                      
chisq.test(wfm(y)) 

## Dendrogram
presdeb <- with(pres_debates2012, wfm(dialogue, list(person, time)))
library(sjPlot)
sjc.dend(t(presdeb), 2:4)

## Words correlated within turns of talk
## EXAMPLE 1
library(qdapTools)
x <- factor(with(rajSPLIT, paste(act, pad(TOT(tot)), sep = "|")))
dat <- wfm(rajSPLIT$dialogue, x)

cor(t(dat)[, c("romeo", "juliet")])
cor(t(dat)[, c("romeo", "banished")])
cor(t(dat)[, c("romeo", "juliet", "hate", "love")])
qheat(cor(t(dat)[, c("romeo", "juliet", "hate", "love")]), 
    diag.na = TRUE, values = TRUE, digits = 3, by.column = NULL)
    
dat2 <- wfm(DATA$state, id(DATA))
qheat(cor(t(dat2)), low = "yellow", high = "red", 
    grid = "grey90", diag.na = TRUE, by.column = NULL)
    
## EXAMPLE 2
x2 <- factor(with(pres_debates2012, paste(time, pad(TOT(tot)), sep = "|")))
dat2 <- wfm(pres_debates2012$dialogue, x2)
wrds <- word_list(pres_debates2012$dialogue, 
    stopwords = c("it's", "that's", Top200Words))
wrds2 <- tolower(sort(wrds$rfswl[[1]][, 1]))
qheat(word_cor(t(dat2), word = wrds2, r = NULL),
    diag.na = TRUE, values = TRUE, digits = 3, by.column = NULL, 
    high="red", low="yellow", grid=NULL)
    
## EXAMPLE 3
library(gridExtra); library(ggplot2); library(grid)
dat3 <- lapply(qcv(OBAMA, ROMNEY), function(x) {
    with(pres_debates2012, wfm(dialogue[person == x], x2[person == x]))
})


# Presidential debates by person
dat5 <- pres_debates2012
dat5 <- dat5[dat5$person %in% qcv(ROMNEY, OBAMA), ]

disp <- with(dat5, dispersion_plot(dialogue, wrds2, grouping.var = person, 
    total.color = NULL, rm.vars=time))


cors <- lapply(dat3, function(m) {
    word_cor(t(m), word = wrds2, r = NULL)
})

plots <- lapply(cors, function(x) {
    qheat(x, diag.na = TRUE, values = TRUE, digits = 3, plot = FALSE,
    by.column = NULL, high="red", low="yellow", grid=NULL)
})

plots <- lapply(1:2, function(i) {
    plots[[i]] + ggtitle(qcv(OBAMA, ROMNEY)[i]) +
    theme(axis.title.x = element_blank(),
        plot.margin = unit(rep(0, 4), "lines"))
})

grid.arrange(disp, arrangeGrob(plots[[1]], plots[[2]], ncol=1), ncol=2)

## With `word_cor`
worlis <- list(
    pronouns = c("you", "it", "it's", "we", "i'm", "i"),
    negative = qcv(no, dumb, distrust, not, stinks),
    literacy = qcv(computer, talking, telling)
)
y <- wfdf(DATA$state, qdapTools::id(DATA, prefix = TRUE))
z <- wfm_combine(y, worlis)

word_cor(t(z), word = names(worlis), r = NULL)

## Plotting method
plot(y, TRUE)
plot(z)

## Correspondence Analysis
library(ca)

dat <- pres_debates2012
dat <- dat[dat$person %in% qcv(ROMNEY, OBAMA), ]

speech <- stemmer(dat$dialogue)
mytable1 <- with(dat, wfm(speech, list(person, time), stopwords = Top25Words))

fit <- ca(mytable1)
summary(fit)
plot(fit)
plot3d.ca(fit, labels=1)


mytable2 <- with(dat, wfm(speech, list(person, time), stopwords = Top200Words))

fit2 <- ca(mytable2)
summary(fit2)
plot(fit2)
plot3d.ca(fit2, labels=1)

## Weight a wfm
WFM <- with(DATA, wfm(state, list(sex, adult)))
plot(weight(WFM, "scaled"), TRUE)
weight(WFM, "prop")
weight(WFM, "max")
weight(WFM, "scaled")
# }

Run the code above in your browser using DataLab