# unzip the files to a temporary directory
FILEDIR <- tempdir()
unzip(system.file("extdata", "data_files_encodedtexts.zip", package = "readtext"),
      exdir = FILEDIR)
# get encoding from filename
filenames <- list.files(FILEDIR, "\\.txt$")
# strip the extension
filenames <- gsub("\\.txt$", "", filenames)
parts <- strsplit(filenames, "_")
fileencodings <- sapply(parts, "[", 3)
fileencodings
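# A minimal sketch of the parsing above, using a hypothetical filename
# (not necessarily one of the files in the zip): the third
# underscore-separated field is taken as the declared encoding.
strsplit(gsub("\\.txt$", "", "sample_English_ISO-8859-1.txt"), "_")[[1]][3]  # "ISO-8859-1"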
# find out which conversions are unavailable (through iconv())
cat("Encoding conversions not available for this platform:")
notAvailableIndex <- which(!(fileencodings %in% iconvlist()))
fileencodings[notAvailableIndex]
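# For context, a small iconv() illustration (hypothetical bytes, not taken
# from the data files): a single ISO-8859-1 byte 0xE9 ("e" with acute
# accent) converted to UTF-8, the same kind of conversion checked above.
latin1_e_acute <- rawToChar(as.raw(0xE9))
iconv(latin1_e_acute, from = "ISO-8859-1", to = "UTF-8")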
# try readtext
library("readtext")
library("quanteda")
txts <- readtext(paste0(FILEDIR, "/", "*.txt"))
substring(texts(txts)[1], 1, 80)  # gibberish: read without the correct encoding
substring(texts(txts)[4], 1, 80)  # hex escapes
substring(texts(txts)[40], 1, 80) # hex escapes
# read them in again, this time declaring each file's encoding
txts <- readtext(paste0(FILEDIR, "/", "*.txt"), encoding = fileencodings)
substring(texts(txts)[1], 1, 80) # English
substring(texts(txts)[4], 1, 80) # Arabic, looking good
substring(texts(txts)[40], 1, 80) # Cyrillic, looking good
substring(texts(txts)[7], 1, 80) # Chinese, looking good
substring(texts(txts)[26], 1, 80) # Hindi, looking good
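# (supplementary check, not part of the original example) after conversion,
# every text in the readtext object's "text" column should be valid UTF-8
all(validUTF8(txts[["text"]]))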
txts <- readtext(paste0(FILEDIR, "/", "*.txt"), encoding = fileencodings,
                 docvarsfrom = "filenames",
                 docvarnames = c("document", "language", "inputEncoding"))
encodingCorpus <- corpus(txts, source = "Created by encoding-tests.R")
summary(encodingCorpus)
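# (optional follow-up, assuming the quanteda accessors used above) inspect
# the document variables parsed from the filenames
head(docvars(encodingCorpus))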