## Segmenting a corpus ----
# NOTE: corpus(), corpus_segment(), char_segment(), tokens(), texts(), ndoc()
# and data_char_ukimmig2010 are not base R; these examples assume the package
# that provides them is already attached.

# Build a two-document corpus whose texts contain "##"-prefixed markers.
# The first text is a multi-line string literal: the leading spaces on its
# continuation lines are part of the text itself, so they are kept verbatim.
testCorpus <- corpus(c(
  "##INTRO This is the introduction.
          ##DOC1 This is the first document.  Second sentence in Doc 1.
          ##DOC3 Third document starts here.  End of third document.",
  "##INTRO Document ##NUMBER Two starts before ##NUMBER Three."
))

# Add a document variable holding one serial label per document.
# seq_len() is used instead of 1:n so the sequence stays empty (rather than
# becoming c(1, 0)) should the corpus ever contain zero documents.
testCorpus[["serialno"]] <- paste0("textSerial", seq_len(ndoc(testCorpus)))

# Segment the corpus using the "tags" mode, then inspect the result.
# NOTE(review): presumably this splits on the "##" markers above -- confirm
# against the corpus_segment() documentation for the installed version.
testCorpusSeg <- corpus_segment(testCorpus, what = "tags")
summary(testCorpusSeg)
texts(testCorpusSeg)

# Segment a corpus into sentences.
segmentedCorpus <- corpus_segment(
  corpus(data_char_ukimmig2010),
  what = "sentences"
)
summary(segmentedCorpus)

## Segmenting a character object ----

# Check that segmenting into tokens gives the same result as tokens().
identical(
  as.character(tokens(data_char_ukimmig2010)),
  as.character(char_segment(data_char_ukimmig2010, what = "tokens"))
)

# Segment the third and fourth texts into paragraphs.
char_segment(data_char_ukimmig2010[3:4], what = "paragraphs")

# Segment the texts into sentences and show the third element of the result.
segmentedChar <- char_segment(data_char_ukimmig2010, what = "sentences")
segmentedChar[3]
# Run the code above in your browser using DataLab