if (FALSE) {
  # Example (not run): build a corpus from a Factiva XML export.
  library(tm)

  # Look up the sample XML file in the "texts" directory of the
  # tm.plugin.factiva package.
  file <- system.file(
    "texts", "reut21578-factiva.xml",
    package = "tm.plugin.factiva"
  )

  # Wrap the file in a Factiva source, then read it into a corpus with the
  # reader's language set to NA.
  source <- FactivaSource(file)
  corpus <- Corpus(source, readerControl = list(language = NA))

  # Display the contents of the documents.
  inspect(corpus)

  # Display the meta-data associated with the first article.
  meta(corpus[[1]])
}
# Example: build a corpus from a Factiva HTML export.
library(tm)

# Look up the sample HTML file in the "texts" directory of the
# tm.plugin.factiva package.
file <- system.file(
  "texts", "factiva_test.html",
  package = "tm.plugin.factiva"
)

# Wrap the file in a Factiva source, then read it into a corpus with the
# reader's language set to NA.
source <- FactivaSource(file)
corpus <- Corpus(source, readerControl = list(language = NA))

# Display the contents of the documents.
inspect(corpus)

# Display the meta-data associated with the first article.
meta(corpus[[1]])

# \dontshow{
# Sanity check: the first content element of the first article must be
# marked as UTF-8. Texts with non-ASCII characters have been mis-marked in
# the past because of bugs in XML.
stopifnot(
  all(Encoding(content(corpus[[1]])[1]) == "UTF-8")
)
# }
# Run the code above in your browser using DataLab.