txt <- c("PAGE 1. This is a single sentence. Short sentence. Three word sentence.",
"PAGE 2. Very short! Shorter.",
"Very long sentence, with multiple parts, separated by commas. PAGE 3.")
corp <- corpus(txt, docvars = data.frame(serial = 1:3))
corp
# exclude sentences shorter than 3 tokens
corpus_trim(corp, min_ntoken = 3)
# exclude sentences that start with "PAGE "
corpus_trim(corp, exclude_pattern = "^PAGE \\d+")
# trimming character objects
char_trim(txt, "sentences", min_ntoken = 3)
char_trim(txt, "sentences", exclude_pattern = "sentence\\.")
Run the code above in your browser using DataLab