read_docx: Read in .docx Content

Description

Read in the content from a .docx file.

Usage

read_docx(file, skip = 0)

Value

Returns a character vector.

Arguments

file: The path to the .docx file.
skip: The number of lines to skip.

Author

Bryan Goodrich and Tyler Rinker <tyler.rinker@gmail.com>.

Examples

Run this code

if (FALSE) {
## Mining Citation
url_dl("http://umlreading.weebly.com/uploads/2/5/2/5/25253346/whole_language_timeline-updated.docx")

(txt <- read_docx("whole_language_timeline-updated.docx"))

library(qdapTools); library(ggplot2); library(qdap)
txt <- rm_non_ascii(txt)

parts <- split_vector(txt, split = "References", include = TRUE, regex=TRUE)

parts[[1]]

rm_citation(unbag(parts[[1]]), extract=TRUE)[[1]]

## By line
rm_citation(parts[[1]], extract=TRUE)

## Frequency
left_just(cites <- list2df(sort(table(rm_citation(unbag(parts[[1]]),
    extract=TRUE)), T), "freq", "citation")[2:1])

## Distribution of citations (find locations and then plot)
cite_locs <- do.call(rbind, lapply(cites[[1]], function(x){
    m <- gregexpr(x, unbag(parts[[1]]), fixed=TRUE)
    data.frame(
        citation=x,
        start = m[[1]] -5,
        end =  m[[1]] + 5 + attributes(m[[1]])[["match.length"]]
    )
}))

ggplot(cite_locs) +
    geom_segment(aes(x=start, xend=end, y=citation, yend=citation), size=3,
        color="yellow") +
    xlab("Duration") +
    scale_x_continuous(expand = c(0,0),
        limits = c(0, nchar(unbag(parts[[1]])) + 25)) +
    theme_grey() +
    theme(
        panel.grid.major=element_line(color="grey20"),
        panel.grid.minor=element_line(color="grey20"),
        plot.background = element_rect(fill="black"),
        panel.background = element_rect(fill="black"),
        panel.border = element_rect(colour = "grey50", fill=NA, size=1),
        axis.text=element_text(color="grey50"),    
        axis.title=element_text(color="grey50")  
    )
}

Run the code above in your browser using DataLab