# NOT RUN {
# Basic usage: run the same remote test image through each entry point
img_url <- "https://jeroen.github.io/images/testocr.png"

# Default call; the result is written to the console with cat()
text <- ocr(img_url)
cat(text)

# Same image with HOCR = TRUE; the result is kept in `xml` and cat()-ed
xml <- ocr(img_url, HOCR = TRUE)
cat(xml)

# ocr_data() on the same image; the result is printed as `df`
df <- ocr_data(img_url)
print(df)
# }
# NOT RUN {
# Full roundtrip test: render PDF to image and OCR it back to text
# Download into a temporary file so the example does not leave a PDF behind
# in the user's working directory
pdf_file <- tempfile(fileext = ".pdf")
curl::curl_download(
  "https://cran.r-project.org/doc/manuals/r-release/R-intro.pdf",
  pdf_file
)
# First element of pdf_text(), kept as the reference for the roundtrip
orig <- pdftools::pdf_text(pdf_file)[1]
# Render page 1 of the pdf to a tiff image at 400 dpi
img_file <- pdftools::pdf_convert(
  pdf_file,
  format = "tiff",
  pages = 1,
  dpi = 400
)
# Extract text from the tiff image
text <- ocr(img_file)
# Remove both the rendered image and the downloaded pdf
unlink(c(img_file, pdf_file))
cat(text)
# }
# NOT RUN {
# Create an engine with the tessedit_char_whitelist option set to the digits
engine <- tesseract(
  options = list(tessedit_char_whitelist = "0123456789")
)
# }
# Run the code above in your browser using DataLab