
pos
- Apply part of speech tagger to transcript(s).
pos_by
- Apply part of speech tagger to transcript(s) by zero or more
grouping variable(s).
pos_tags
- Useful for interpreting the parts of speech tags created by
pos and pos_by.
pos(
text.var,
parallel = FALSE,
cores = detectCores()/2,
progress.bar = TRUE,
na.omit = FALSE,
digits = 1,
percent = TRUE,
zero.replace = 0,
gc.rate = 10
)
pos_by(
text.var,
grouping.var = NULL,
digits = 1,
percent = TRUE,
zero.replace = 0,
...
)
pos_tags(type = "pretty")
pos
- returns a list of 7:
The original text
The original words replaced with parts of speech in context.
Dataframe of the proportion of parts of speech by row.
Dataframe of the frequency of parts of speech by row.
Dataframe of the frequency and proportions of parts of speech by row.
The value of percent used for plotting purposes.
The value of zero.replace used for plotting purposes.
pos_by
- returns a list of 10:
The original text
The original words replaced with parts of speech in context.
Dataframe of the proportion of parts of speech by row.
Dataframe of the frequency of parts of speech by row.
Dataframe of the frequency and proportions of parts of speech by row.
Dataframe of the proportion of parts of speech by grouping variable.
Dataframe of the frequency of parts of speech by grouping variable.
Dataframe of the frequency and proportions of parts of speech by grouping variable.
The value of percent used for plotting purposes.
The value of zero.replace used for plotting purposes.
The text variable.
logical. If TRUE
attempts to run the function on
multiple cores. Note that this may not mean a speed boost if you have one
core or if the data set is smaller as the cluster takes time to create.
The number of cores to use if parallel = TRUE. Default
is half the number of available cores.
logical. If TRUE, attempts to provide an OS-appropriate progress bar.
If parallel is TRUE, this argument is ignored. Note that setting this
argument to TRUE may slow down the function.
logical. If TRUE
missing values (NA
) will be
omitted.
Integer; number of decimal places to round when printing.
logical. If TRUE
output given as percent. If
FALSE
the output is proportion.
Value to replace 0 values with.
An integer value. This is a necessary argument because of a
problem with the garbage collection in the openNLP function that
pos
wraps. Consider adjusting this argument upward if
the error java.lang.OutOfMemoryError
occurs.
The grouping variables. Default NULL
generates
one word list for all text. Also takes a single grouping variable or a list
of 1 or more grouping variables.
An optional character string giving the output of the pos tags.
This must be one of the strings "pretty" (a left justified version of
the output optimized for viewing but not good for export), "matrix"
(a matrix version of the output), "dataframe" / "df" (a dataframe
version of the output), "all" (a list of all three of the previous
output types).
Other arguments supplied to pos.
http://opennlp.apache.org
Maxent_POS_Tag_Annotator
,
colcomb2class
if (FALSE) {
## ---- pos(): tag parts of speech row by row ----
## Apply the part of speech tagger to the DATA$state text variable.
posdat <- pos(DATA$state)
ltruncdf(posdat, 7, 4)
## str(posdat)
names(posdat)
posdat$text #original text
## Methods
## Accessor methods pull individual pieces out of the pos() result.
preprocessed(posdat) #words replaced with parts of speech
counts(posdat) #frequency of parts of speech by row
proportions(posdat) #proportion of parts of speech by row
## Methods Plotting
## Each extracted piece, and the full result, can be passed to plot().
plot(preprocessed(posdat))
plot(counts(posdat))
plot(proportions(posdat))
plot(posdat)
## parallel = TRUE attempts to use multiple cores; cluster start-up time can
## outweigh the gain on small data sets.
out1 <- pos(DATA$state, parallel = TRUE) # not always useful
ltruncdf(out1, 7, 4)
## ---- pos_tags(): look up the meaning of the tags ----
#use pos_tags to interpret part of speech tags used by pos & pos_by
pos_tags()[1:10, ] # default type = "pretty"
pos_tags("matrix")[1:10, ]
pos_tags("dataframe")[1:10, ]
pos_tags("df")[1:10, ] # "df" is accepted as an alternative to "dataframe"
ltruncdf(pos_tags("all"), 3) # "all" returns a list of the three output types
## ---- pos_by(): tag parts of speech by grouping variable(s) ----
## One grouping variable (sex).
posbydat <- with(DATA, pos_by(state, sex))
names(posbydat)
## Methods
scores(posbydat)
preprocessed(posbydat)
counts(posbydat)
proportions(posbydat)
## Methods Plotting
plot(preprocessed(posbydat))
plot(counts(posbydat))
plot(proportions(posbydat))
plot(posbydat)
ltruncdf(posbydat, 7, 4)
truncdf(posbydat$pos.by.prop, 4)
## Several grouping variables are supplied as a list.
POSby <- with(DATA, pos_by(state, list(adult, sex)))
plot(POSby, values = TRUE, digits = 2)
#or more quickly - reuse the output from before
## An existing pos_by result is passed in place of the raw text.
out2 <- with(DATA, pos_by(posbydat, list(adult, sex)))
## Definite/Indefinite Noun
## 2 approaches compared...
## The latter is more efficient but less accurate
## ------------------------##
## Part of speech tagging  ##
## ------------------------##
## Find, for each row of tagged text, the first token carrying one of the
## tags in `pos` that occurs after each occurrence of one of `words`.
##
## text.var  Object whose [["POStagged"]][["POStagged"]] element holds one
##           string per row of whitespace-separated "word/TAG" tokens.
## words     Character vector of words to look for (matched exactly,
##           case-sensitive).
## pos       Character vector of part of speech tags to accept as the
##           following token (e.g. the noun tags).
##
## Returns a list with one element per row: a character vector of the
## unique following words, or NA_character_ when none of `words` occurs in
## the row. A word that has no qualifying token after it contributes NA.
pos_after <- function(text.var, words, pos) {
  tagged <- as.character(text.var[["POStagged"]][["POStagged"]])
  tokens_by_row <- strsplit(tagged, "\\s+")

  ## Build a character vector of words named by their tags for every row.
  ## Split each token on its LAST "/" only, so a word that itself contains
  ## a slash ("and/or/CC") cannot shift the word/tag pairing of the tokens
  ## that follow it.
  namespos <- lapply(tokens_by_row, function(tokens) {
    tokens <- tokens[nzchar(tokens)]  # leading whitespace yields "" tokens
    tags <- sub("^.*/", "", tokens)
    tags[!grepl("/", tokens, fixed = TRUE)] <- NA_character_  # untagged token
    setNames(sub("/[^/]*$", "", tokens), tags)
  })

  lapply(namespos, function(x, thewords = words, thepos = pos) {
    locs <- which(x %in% thewords)
    if (length(locs) == 0L) {
      return(NA_character_)
    }
    nounlocs <- which(names(x) %in% thepos)
    ## Position of the nearest qualifying tag after each hit. When nothing
    ## follows, use NA instead of calling min() on an empty vector, which
    ## would warn and return Inf.
    next_loc <- vapply(locs, function(loc) {
      later <- nounlocs[nounlocs > loc]
      if (length(later) == 0L) NA_integer_ else min(later)
    }, integer(1))
    unname(x[unique(next_loc)])
  })
}
## For the indefinite ("a"/"an") and definite ("the") articles, tabulate
## the nouns that pos_after() finds after them in rajPOS and keep those
## occurring more than 3 times.
out2 <- setNames(lapply(list(a = c("a", "an"), the = "the"), function(x) {
  o <- pos_after(rajPOS, x, c("NN", "NNS", "NNP", "NNPS"))
  m <- stats::setNames(
    data.frame(sort(table(unlist(o))), stringsAsFactors = FALSE),
    c("word", "freq")
  )
  m[m$freq > 3, ]
}), c("a", "the"))

## Full outer join of the two frequency tables on the noun.
dat2 <- setNames(
  Reduce(function(x, y) merge(x, y, by = "word", all = TRUE), out2),
  c("Word", "A", "THE")
)

## Long format: one row per Word/Article pair. `id.vars` is spelled out
## rather than relying on partial matching of `id`.
dat2 <- reshape2::melt(
  dat2,
  id.vars = "Word",
  variable.name = "Article",
  value.name = "freq"
)
dat2 <- dat2[order(dat2$freq, dat2$Word), ]

## Order the Word factor levels by total frequency across both articles.
ord2 <- aggregate(freq ~ Word, dat2, sum)
dat2$Word <- factor(dat2$Word, levels = ord2[order(ord2[[2]]), 1])
rownames(dat2) <- NULL

## ggplot2 is not attached until the regex section below, so its functions
## are namespace-qualified here.
ggplot2::ggplot(dat2, ggplot2::aes(x = freq, y = Word)) +
  ggplot2::geom_point() +
  ggplot2::facet_grid(~Article) +
  ggplot2::ggtitle("Part Of Speech Parsing Approach")

## Open a new graphics device.
dev.new()
## --------------------##
## Regular Expressions ##
## --------------------##
library(qdapRegex)
library(ggplot2)
library(reshape2)

## Extract matches for each of the two named patterns from the lower-cased
## dialogue and keep the matches that occur more than 3 times.
## NOTE(review): "@after_a" / "@after_the" are presumably qdapRegex
## dictionary pattern names -- confirm against the qdapRegex docs.
out <- setNames(lapply(c("@after_a", "@after_the"), function(x) {
  ## stri_trans_tolower is exported by stringi, so `::` is sufficient.
  o <- rm_default(
    stringi::stri_trans_tolower(raj$dialogue),
    pattern = x,
    extract = TRUE
  )
  m <- stats::setNames(
    data.frame(sort(table(unlist(o))), stringsAsFactors = FALSE),
    c("word", "freq")
  )
  m[m$freq > 3, ]
}), c("a", "the"))

## Full outer join of the two frequency tables on the extracted word.
dat <- setNames(
  Reduce(function(x, y) merge(x, y, by = "word", all = TRUE), out),
  c("Word", "A", "THE")
)

## Long format: one row per Word/Article pair. `id.vars` is spelled out
## rather than relying on partial matching of `id`.
dat <- reshape2::melt(
  dat,
  id.vars = "Word",
  variable.name = "Article",
  value.name = "freq"
)
dat <- dat[order(dat$freq, dat$Word), ]

## Order the Word factor levels by total frequency across both articles.
ord <- aggregate(freq ~ Word, dat, sum)
dat$Word <- factor(dat$Word, levels = ord[order(ord[[2]]), 1])
rownames(dat) <- NULL

ggplot(dat, aes(x = freq, y = Word)) +
  geom_point() +
  facet_grid(~Article) +
  ggtitle("Regex Approach")
}
Run the code above in your browser using DataLab