## Not run:
# this code is only run when the English language package can be loaded
if(require("koRpus.lang.en", quietly = TRUE)){
  sample_file <- file.path(
    path.package("koRpus"), "examples", "corpus", "Reality_Winner.txt"
  )
  tokenized.obj <- tokenize(
    txt=sample_file,
    lang="en"
  )
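  # a quick look at the result: taggedText() returns the token
  # data frame of a tokenized object (a sketch; the exact column
  # layout may vary between koRpus versions):
  head(taggedText(tokenized.obj))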
  ## character manipulation
  # this is useful if you know of problematic characters in your
  # raw text files but don't want to touch them directly. you
  # don't have to, as you can substitute them, even with regular
  # expressions. a simple example: replace all single quotes with
  # double quotes throughout the text:
  tokenized.obj <- tokenize(
    txt=sample_file,
    lang="en",
    clean.raw=list("'"='\"')
  )
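  # as a sketch for verifying the substitution, count the single
  # quotes left in the tokens (this assumes the "token" column of
  # taggedText(), as in current koRpus versions):
  sum(grepl("'", taggedText(tokenized.obj)[["token"]]))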
  # now replace all occurrences of the letter A followed
  # by two digits with the letter B, followed by the same
  # two digits:
  tokenized.obj <- tokenize(
    txt=sample_file,
    lang="en",
    clean.raw=list("(A)([[:digit:]]{2})"="B\\2"),
    perl=TRUE
  )
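  # a similar sketch to confirm the replacement; note that the
  # sample text may not contain such patterns in the first place:
  any(grepl("A[[:digit:]]{2}", taggedText(tokenized.obj)[["token"]]))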
  ## enabling stopword detection and stemming
  if(all(
    requireNamespace("tm", quietly=TRUE),
    requireNamespace("SnowballC", quietly=TRUE)
  )){
    # if you also installed the packages tm and SnowballC,
    # you can use some of their features with koRpus:
    tokenized.obj <- tokenize(
      txt=sample_file,
      lang="en",
      stopwords=tm::stopwords("en"),
      stemmer=SnowballC::wordStem
    )
    # removing all stopwords is now simple:
    tokenized.noStopWords <- filterByClass(tokenized.obj, "stopword")
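    # comparing the token counts before and after removal shows
    # the effect (a sketch, assuming taggedText() as above):
    nrow(taggedText(tokenized.obj))
    nrow(taggedText(tokenized.noStopWords))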
  }
}
## End(Not run)