if (FALSE) {
## Mining Citation
url_dl("http://umlreading.weebly.com/uploads/2/5/2/5/25253346/whole_language_timeline-updated.docx")
(txt <- read_docx("whole_language_timeline-updated.docx"))
library(qdapTools); library(ggplot2); library(qdap)
txt <- rm_non_ascii(txt)
parts <- split_vector(txt, split = "References", include = TRUE, regex=TRUE)
parts[[1]]
rm_citation(unbag(parts[[1]]), extract=TRUE)[[1]]
## By line
rm_citation(parts[[1]], extract=TRUE)
## Frequency
left_just(cites <- list2df(sort(table(rm_citation(unbag(parts[[1]]),
extract=TRUE)), T), "freq", "citation")[2:1])
## Distribution of citations (find locations and then plot)
cite_locs <- do.call(rbind, lapply(cites[[1]], function(x){
m <- gregexpr(x, unbag(parts[[1]]), fixed=TRUE)
data.frame(
citation=x,
start = m[[1]] -5,
end = m[[1]] + 5 + attributes(m[[1]])[["match.length"]]
)
}))
ggplot(cite_locs) +
geom_segment(aes(x=start, xend=end, y=citation, yend=citation), size=3,
color="yellow") +
xlab("Duration") +
scale_x_continuous(expand = c(0,0),
limits = c(0, nchar(unbag(parts[[1]])) + 25)) +
theme_grey() +
theme(
panel.grid.major=element_line(color="grey20"),
panel.grid.minor=element_line(color="grey20"),
plot.background = element_rect(fill="black"),
panel.background = element_rect(fill="black"),
panel.border = element_rect(colour = "grey50", fill=NA, size=1),
axis.text=element_text(color="grey50"),
axis.title=element_text(color="grey50")
)
}
Run the code above in your browser using DataLab