# NOT RUN {
# Build the path to the sample text "Reality_Winner.txt", expected in the
# examples/corpus folder below the koRpus package directory.
sample_file <- file.path(
  path.package("koRpus"),
  "examples", "corpus", "Reality_Winner.txt"
)
# }
# NOT RUN {
# Option 1 for POS tagging: run the tagger in "manual" mode and let the
# built-in "en" preset supply the settings. TT.options points at the
# TreeTagger directory (here ~/bin/treetagger; adjust to your setup).
tagged.results <- treetag(
  sample_file,
  treetagger = "manual",
  lang = "en",
  TT.options = list(
    path = file.path("~", "bin", "treetagger"),
    preset = "en"
  )
)
# Option 2: instead of a preset, hand treetag() the path to one of the
# batch scripts that ship with TreeTagger.
tagged.results <- treetag(
  sample_file,
  treetagger = file.path(
    "~", "bin", "treetagger", "cmd", "tree-tagger-english"
  ),
  lang = "en"
)
# Option 3: register the batch script and language in the koRpus
# environment object first ...
set.kRp.env(
  TT.cmd = file.path(
    "~", "bin", "treetagger", "cmd", "tree-tagger-english"
  ),
  lang = "en"
)
# ... so that treetag() can then be called with the file alone.
tagged.results <- treetag(sample_file)
# The tagged object can be passed on to other functions of this package,
# for example:
readability(tagged.results)
lex.div(tagged.results)
## Stopword detection and stemming
# With the packages tm and SnowballC installed as well, some of their
# features can be used from within koRpus.
# Switch the environment to "manual" mode with the "en" preset:
set.kRp.env(
  TT.cmd = "manual",
  lang = "en",
  TT.options = list(
    path = file.path("~", "bin", "treetagger"),
    preset = "en"
  )
)
# Tag again, this time passing a stopword list and a stemming function:
tagged.results <- treetag(
  sample_file,
  stopwords = tm::stopwords("en"),
  stemmer = SnowballC::wordStem
)
# With stopwords detected, dropping all of them takes a single call:
tagged.noStopWords <- filterByClass(tagged.results, "stopword")
# }
# Run the code above in your browser using DataLab