# NOT RUN {
## Read the sample PDF shipped with textreadr twice: once with the default
## arguments and once with `skip = 1`, keeping each result for comparison.
pdf_dat <- read_pdf(
  system.file("docs/rl10075oralhistoryst002.pdf", package = "textreadr")
)

pdf_dat_b <- read_pdf(
  system.file("docs/rl10075oralhistoryst002.pdf", package = "textreadr"),
  skip = 1
)
# }
# NOT RUN {
## Turn the sample interview PDF into a transcript, one explicit step at a
## time. No `%>%` chain is used, so the example needs only the packages it
## names (textreadr and textshape) in order to run.
library(textshape)

transcript_path <- system.file(
  "docs/rl10075oralhistoryst002.pdf",
  package = "textreadr"
)

## Read with `skip = 1`, pull out the `text` element and drop its final entry.
page_text <- head(read_pdf(transcript_path, skip = 1)[["text"]], -1)

## Merge the pieces, then replace the space between two capital letters with
## an underscore (presumably so multi-word speaker labels survive the split
## below -- confirm against the PDF).
full_text <- gsub(
  "([A-Z])( )([A-Z])",
  "\\1_\\3",
  textshape::combine(page_text)
)

## Split at every hyphen or space that is directly followed by a run of
## capitals/underscores ending in a colon; `strsplit()` returns a list, so
## take its first element.
turns <- strsplit(full_text, "(-| )(?=[A-Z_]+:)", perl = TRUE)[[1]]

textshape::split_transcript(turns)
# }
# NOT RUN {
# }
# NOT RUN {
## An image based .pdf file returns nothing. Using the tesseract package as
## a backend for OCR overcomes this problem.

## Non-ocr
read_pdf(
  system.file("docs/McCune2002Choi2010.pdf", package = "textreadr"),
  ocr = FALSE
)

## OCR
read_pdf(
  system.file("docs/McCune2002Choi2010.pdf", package = "textreadr"),
  ocr = TRUE
)
# }
# Run the code above in your browser using DataLab