# NOT RUN {
## segmenting a corpus
# Build a corpus from two texts that contain "##"-prefixed tag markers.
# The first text is a multi-line string literal: its continuation lines are
# deliberately left unindented so the text itself is not changed.
testCorpus <-
  corpus(c("##INTRO This is the introduction.
##DOC1 This is the first document. Second sentence in Doc 1.
##DOC3 Third document starts here. End of third document.",
           "##INTRO Document ##NUMBER Two starts before ##NUMBER Three."))
# add a docvar: one serial label per document
# (seq_len() rather than 1:n so an empty corpus yields no labels)
testCorpus[["serialno"]] <- paste0("textSerial", seq_len(ndoc(testCorpus)))
# segment the corpus using the "tags" option, then inspect the result
testCorpusSeg <- corpus_segment(testCorpus, "tags")
summary(testCorpusSeg)
texts(testCorpusSeg)
# segment a corpus into sentences
segmentedCorpus <- corpus_segment(corpus(data_char_ukimmig2010), "sentences")
summary(segmentedCorpus)
## segmenting a character object
# check that segmenting into tokens gives the same result as tokens()
identical(as.character(tokens(data_char_ukimmig2010)),
          as.character(char_segment(data_char_ukimmig2010, what = "tokens")))
# segment into paragraphs
char_segment(data_char_ukimmig2010[3:4], "paragraphs")
# segment a text into sentences
segmentedChar <- char_segment(data_char_ukimmig2010, "sentences")
# show the third element of the sentence-segmented result
segmentedChar[3]
# }
# Run the code above in your browser using DataLab