encodedTextFiles: a .zip file containing a variety of differently encoded texts


A set of translations of the Universal Declaration of Human Rights, plus one or two other miscellaneous texts, for testing the text input functions that need to translate different input encodings.



Source: The Universal Declaration of Human Rights resources, http://www.ohchr.org/EN/UDHR/Pages/SearchByLang.aspx


Run this code
## Not run:
# # Example: read the bundled texts twice -- first without declaring any
# # encoding, then with the encoding of each file taken from its filename.
# # NOTE(review): textfile() and texts() come from older quanteda releases;
# # confirm they exist in the installed version before running.
#
# # unzip the files to a temporary directory
# FILEDIR <- tempdir()
# unzip(system.file("extdata", "encodedTextFiles.zip", package = "quanteda"),
#       exdir = FILEDIR)
# # get encoding from filename
# filenames <- list.files(FILEDIR, "\\.txt$")
# # strip the extension (dot escaped so only a literal ".txt" suffix matches)
# filenames <- sub("\\.txt$", "", filenames)
# # split on "_" and keep the third piece, which is used as the encoding
# parts <- strsplit(filenames, "_")
# fileencodings <- vapply(parts, "[", character(1), 3)
# fileencodings
# # find out which conversions are unavailable (through iconv())
# cat("Encoding conversions not available for this platform:\n")
# notAvailableIndex <- which(!(fileencodings %in% iconvlist()))
# fileencodings[notAvailableIndex]
# # try textfile
# # library() errors if quanteda is missing instead of returning FALSE
# library(quanteda)
# tfile <- textfile(file.path(FILEDIR, "*.txt"))
# substring(texts(tfile)[1], 1, 80) # gibberish
# substring(texts(tfile)[4], 1, 80) # hex
# substring(texts(tfile)[40], 1, 80) # hex
# # read them in again, this time declaring the encoding of each file
# tfile <- textfile(file.path(FILEDIR, "*.txt"), encoding = fileencodings)
# substring(texts(tfile)[1], 1, 80)  # English
# substring(texts(tfile)[4], 1, 80)  # Arabic, looking good
# substring(texts(tfile)[40], 1, 80) # Cyrillic, looking good
# substring(texts(tfile)[7], 1, 80)  # Chinese, looking good
# substring(texts(tfile)[26], 1, 80) # Hindi, looking good
# # read once more, also turning the "_"-separated filename pieces into docvars
# tfile <- textfile(file.path(FILEDIR, "*.txt"), encoding = fileencodings,
#                   docvarsfrom = "filenames",
#                   docvarnames = c("document", "language", "inputEncoding"))
# encodingCorpus <- corpus(tfile, source = "Created by encoding-tests.R")
# summary(encodingCorpus)
# ## End(Not run)

