# Build a corpus directly from a set of texts.
corpus(inaugTexts)

# Build a corpus from texts and attach a document variable ("party") whose
# values are the names of the text vector.
ukimmigCorpus <- corpus(
  ukimmigTexts,
  docvars = data.frame(party = names(ukimmigTexts))
)

# Build a corpus from the texts pulled out of ie2010Corpus.
corpus(texts(ie2010Corpus))
## Not run:
# The fifth column of this CSV file is the text field.
# mytexts <- textfile("http://www.kenbenoit.net/files/text_example.csv", textField = 5)
# mycorp <- corpus(mytexts)
# mycorp2 <- corpus(textfile("http://www.kenbenoit.net/files/text_example.csv", textField = "Title"))
# identical(texts(mycorp), texts(mycorp2))
# identical(docvars(mycorp), docvars(mycorp2))
# ## End(Not run)
# Import tm VCorpus objects.
# This section is optional: it runs only when the tm package can be loaded.
# requireNamespace() is the documented way to test whether a package is
# usable; scanning installed.packages() is slow and its help page advises
# against using it for this purpose.
if (requireNamespace("tm", quietly = TRUE)) {
  # load the "crude" example VCorpus from tm and convert it
  data(crude, package = "tm")
  mytmCorpus <- corpus(crude)
  summary(mytmCorpus, showmeta = TRUE)

  # same for the "acq" dataset from tm
  data(acq, package = "tm")
  summary(corpus(acq), 5, showmeta = TRUE)

  # wrap a slice of inaugTexts in a tm VCorpus, then convert it with corpus()
  tmCorp <- tm::VCorpus(tm::VectorSource(inaugTexts[49:57]))
  quantCorp <- corpus(tmCorp)
  summary(quantCorp)
}
# Build a small data.frame to construct a corpus from: a factor column, an
# integer column and a character column holding the texts. The row names
# are set explicitly to fromDf_1 ... fromDf_6.
mydf <- data.frame(
  letter_factor = factor(rep(c("a", "b", "c"), each = 2L)),
  some_ints = seq_len(6L),
  some_text = paste0("This is text number ", 1:6, "."),
  row.names = paste0("fromDf_", 1:6),
  stringsAsFactors = FALSE
)
mydf
# Turn the data.frame mydf into a corpus, using its some_text column as the
# text field, and summarise the result.
summary(
  corpus(
    mydf,
    textField = "some_text",
    source = "From a data.frame called mydf."
  )
)

# Construct a corpus from a kwic object built on inaugCorpus for the
# term "southern".
mykwic <- kwic(inaugCorpus, "southern")
summary(corpus(mykwic))
# Concatenate corpus objects with c().
# Two corpora built from slices of inaugTexts ...
corpus1 <- corpus(inaugTexts[seq_len(2L)])
corpus2 <- corpus(inaugTexts[c(3L, 4L)])
# ... and a third obtained by keeping the documents of inaugCorpus where
# President == "Obama".
corpus3 <- subset(inaugCorpus, President == "Obama")
summary(c(corpus1, corpus2, corpus3))
# Ways to index corpus elements.
# By document name: the 2nd Washington inaugural speech.
inaugCorpus["1793-Washington"]
# By position: the same document.
inaugCorpus[2]
# Access the "year" docvar of ie2010Corpus with two-index notation ...
ie2010Corpus[, "year"]
# ... or with double brackets (same).
ie2010Corpus[["year"]]

# Create a new document variable: documents whose party is FF or Greens are
# labelled "Government", all others "Opposition".
ie2010Corpus[["govtopp"]] <- ifelse(
  ie2010Corpus[["party"]] %in% c("FF", "Greens"),
  "Government",
  "Opposition"
)
docvars(ie2010Corpus)
# Run the code above in your browser using DataLab