# Example script: OCR an image from a URL, then round-trip a PDF page
# through an image file and OCR it back to text.
library(tesseract)
library(pdftools)
library(tiff)

# Simple example ----
text <- ocr("http://jeroenooms.github.io/images/testocr.png")
cat(text)

# Roundtrip test: render PDF to image and OCR it back to text ----

# A PDF file with some text: the NEWS file from R's documentation directory.
news <- file.path(R.home("doc"), "NEWS.pdf")
if (!file.exists(news)) {
  stop("Cannot find the PDF to render: ", news, call. = FALSE)
}

# Text of the first page as extracted directly from the PDF, kept so it can
# be compared against the OCR result below.
orig <- pdf_text(news)[1]

# Render the pdf page to a tiff image. The image is written to an explicit
# path inside the session temp directory instead of calling setwd(), so the
# caller's working directory is left untouched.
bitmap <- pdf_render_page(news, dpi = 300, numeric = TRUE)
image_path <- file.path(tempdir(), "page.tiff")
tiff::writeTIFF(bitmap, image_path)

# Extract text from the rendered image
out <- ocr(image_path)
cat(out)

# Engine configured with a character whitelist containing only the digits 0-9
engine <- tesseract(options = list(tessedit_char_whitelist = "0123456789"))
# Run the code above in your browser using DataLab