## Not run:
# # Example: reading text files stored in many different encodings.
# # The expected encoding of each file is taken from its file name and then
# # passed to textfile() so that the texts are converted correctly.
# # NOTE(review): textfile() and texts() appear to have been moved out of /
# # retired from recent quanteda releases (see the readtext package) --
# # confirm against the installed version before running.
#
# # unzip the files to a temporary directory
# FILEDIR <- tempdir()
# unzip(system.file("extdata", "encodedTextFiles.zip", package = "quanteda"),
#       exdir = FILEDIR)
#
# # get encoding from filename
# filenames <- list.files(FILEDIR, "\\.txt$")
# # strip the extension (dot escaped so only a literal ".txt" suffix matches)
# filenames <- gsub("\\.txt$", "", filenames)
# parts <- strsplit(filenames, "_")
# # the encoding is the third "_"-separated field of each name; vapply()
# # always yields a character vector, even when no files are found
# fileencodings <- vapply(parts, "[", character(1), 3)
# fileencodings
#
# # find out which conversions are unavailable (through iconv())
# cat("Encoding conversions not available for this platform:\n")
# notAvailableIndex <- which(!(fileencodings %in% iconvlist()))
# fileencodings[notAvailableIndex]
#
# # try textfile without telling it the encodings
# library(quanteda)  # fails loudly if the package is missing
# tfile <- textfile(file.path(FILEDIR, "*.txt"))
# substring(texts(tfile)[1], 1, 80)   # gibberish
# substring(texts(tfile)[4], 1, 80)   # hex
# substring(texts(tfile)[40], 1, 80)  # hex
#
# # read them in again, this time supplying one encoding per file
# tfile <- textfile(file.path(FILEDIR, "*.txt"), encoding = fileencodings)
# substring(texts(tfile)[1], 1, 80)   # English
# substring(texts(tfile)[4], 1, 80)   # Arabic, looking good
# substring(texts(tfile)[40], 1, 80)  # Cyrillic, looking good
# substring(texts(tfile)[7], 1, 80)   # Chinese, looking good
# substring(texts(tfile)[26], 1, 80)  # Hindi, looking good
#
# # read once more, also turning the "_"-separated file name fields into
# # document variables, and build a corpus from the result
# tfile <- textfile(file.path(FILEDIR, "*.txt"),
#                   encoding = fileencodings,
#                   docvarsfrom = "filenames",
#                   docvarnames = c("document", "language", "inputEncoding"))
# encodingCorpus <- corpus(tfile, source = "Created by encoding-tests.R")
# summary(encodingCorpus)
## End(Not run)
# Run the code above in your browser using DataLab