# load quanteda, which provides segment(), tokenize(), and the ukimmigTexts data
library(quanteda)
# with the default settings, segment() gives the same result as tokenize()
identical(tokenize(ukimmigTexts), segment(ukimmigTexts))
# segment the third and fourth texts into paragraphs
segment(ukimmigTexts[3:4], "paragraphs")
# segment every text into sentences
segmentedChar <- segment(ukimmigTexts, "sentences")
# sentences of the second text
segmentedChar[2]
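# a quick sanity check, assuming segment() returns a list of sentence vectors
# here: count the sentences found in each text with base R lengths()
lengths(segmentedChar)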
# a small corpus whose texts contain ##-prefixed section tags
testCorpus <- corpus(c("##INTRO This is the introduction.
                        ##DOC1 This is the first document.
                        Second sentence in Doc 1.
                        ##DOC3 Third document starts here.
                        End of third document.",
                       "##INTRO Document ##NUMBER Two starts before ##NUMBER Three."))
# add a docvar
testCorpus[["serialno"]] <- paste0("textSerial", 1:ndoc(testCorpus))
# split the corpus on the ## tags, making one document per tagged section
testCorpusSeg <- segment(testCorpus, "tags")
summary(testCorpusSeg)
texts(testCorpusSeg)
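# a quick look at the result, assuming ndoc() and docvars() apply to the
# segmented corpus: how many tag-delimited documents were created, and
# whether the serialno docvar was carried over to each segment
ndoc(testCorpusSeg)
docvars(testCorpusSeg)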
# segment a corpus into sentences
segmentedCorpus <- segment(corpus(ukimmigTexts), "sentences")
# the corpus method should yield one document per sentence found above
identical(ndoc(segmentedCorpus), length(unlist(segmentedChar)))
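# optionally, summarise the first few sentence-level documents (assumes the
# n argument of summary.corpus() to limit the printout)
summary(segmentedCorpus, n = 5)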