tokens() tokenizes the texts from a character vector or a corpus, returning a tokens object.

Usage

tokens(x, what = c("word", "sentence", "character", "fastestword",
  "fasterword"), remove_numbers = FALSE, remove_punct = FALSE,
  remove_symbols = FALSE, remove_separators = TRUE,
  remove_twitter = FALSE, remove_hyphens = FALSE, remove_url = FALSE,
  ngrams = 1L, skip = 0L, concatenator = "_", hash = TRUE,
  verbose = quanteda_options("verbose"), include_docvars = TRUE, ...)

Arguments

x: the text(s) or corpus to be tokenized.

what: the unit for splitting the text (see the comparison in the Examples below). Available alternatives are:
  "word": the default and smartest, but slowest, word tokenizer.
  "fasterword": a simpler, faster word tokenizer that splits on white space, using stri_split_charclass(x, "\\p{WHITE_SPACE}").
  "fastestword": the fastest word tokenizer, which splits only on the space character, using stri_split_fixed(x, " ").
  "character": tokenization into individual characters.
  "sentence": tokenization into sentences.

remove_numbers: remove tokens that consist only of numbers, but not words that start with digits, e.g. 2day.
remove_punct: if TRUE, remove all characters in the Unicode "Punctuation" [P] class.

remove_symbols: if TRUE, remove all characters in the Unicode "Symbol" [S] class.

remove_separators: remove separator characters (spaces and variations of spaces, plus tab and newline) when remove_punct = FALSE. Only applicable for what = "character" (when you probably want it to be FALSE) and for what = "word" (when you probably want it to be TRUE). Note that if what = "word" and you set remove_punct = TRUE, then remove_separators has no effect; see the Examples below. Use carefully.

remove_twitter: remove the Twitter characters @ and #; set to TRUE if you wish to eliminate these. Note that this will always be set to FALSE if remove_punct = FALSE; see the Examples below.

remove_hyphens: if TRUE, split words that are connected by hyphenation and hyphenation-like characters in between words, e.g. "self-storage" becomes c("self", "storage"). Default is FALSE to preserve such words as is, with the hyphens. Only applies if what = "word".

remove_url: if TRUE, find and eliminate URLs beginning with http(s) -- see the section "Dealing with URLs".

ngrams: integer vector specifying the n for n-grams; default is 1 (unigrams). For bigrams, for instance, use 2; for bigrams and unigrams, use 1:2. You can even include irregular sequences such as 2:3 for bigrams and trigrams only. See tokens_ngrams.

skip: integer vector specifying the skips for skip-grams; default is 0. Only applies if ngrams is different from the default of 1. See tokens_skipgrams.

concatenator: character used to join the parts of n-grams; default is "_", which is recommended since this is included in the regular expression and Unicode definitions of "word" characters.

hash: if TRUE (default), return a hashed tokens object; otherwise, return a classic tokenizedTexts object (see the Examples below). This option will be phased out in coming versions.

verbose: if TRUE, print timing messages to the console; off by default.

include_docvars: if TRUE, pass docvars and metadoc fields through to the tokens object. Only applies when tokenizing corpus objects; see the Examples below.

Value

A tokens class object, by default a hashed list of integers corresponding to a vector of types.

Dealing with URLs

URLs are difficult to tokenize cleanly because they contain punctuation and symbol characters. If you wish to remove them, set what = "fasterword" and remove_url = TRUE. If you wish to keep the URLs, but do not want them mangled, then your options are more limited, since removing punctuation and symbols will also remove them from URLs. We are working on improving this behaviour. See the examples below.
Details

Most users will construct a higher-level object, such as a document-feature matrix, directly from texts or a corpus, without calling tokens() as an intermediate step. Since tokens() is most likely to be used by more technical users, we have set its options to default to minimal intervention. This means that punctuation is tokenized as well, and that nothing is removed by default from the text being tokenized except inter-word spacing and equivalent characters.

See also

tokens_ngrams, tokens_skipgrams

Examples
txt <- c(doc1 = "This is a sample: of tokens.",
doc2 = "Another sentence, to demonstrate how tokens works.")
tokens(txt)
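# a quick comparison of the word tokenizers (a sketch): "word" splits punctuation
# into separate tokens, while the faster methods split only on white space
tokens("A self-funded start-up, valued at $1,000,000.", what = "word")
tokens("A self-funded start-up, valued at $1,000,000.", what = "fasterword")
tokens("A self-funded start-up, valued at $1,000,000.", what = "fastestword")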
# removing punctuation marks and lowercasing texts
tokens(char_tolower(txt), remove_punct = TRUE)
# keeping versus removing hyphens
tokens("quanteda data objects are auto-loading.", remove_punct = TRUE)
tokens("quanteda data objects are auto-loading.", remove_punct = TRUE, remove_hyphens = TRUE)
# keeping versus removing symbols
tokens("<tags> and other + symbols.", remove_symbols = FALSE)
tokens("<tags> and other + symbols.", remove_symbols = TRUE)
tokens("<tags> and other + symbols.", remove_symbols = FALSE, what = "fasterword")
tokens("<tags> and other + symbols.", remove_symbols = TRUE, what = "fasterword")
## examples with URLs - hardly perfect!
txt <- "Repo https://githib.com/kbenoit/quanteda, and www.stackoverflow.com."
tokens(txt, remove_url = TRUE, remove_punct = TRUE)
tokens(txt, remove_url = FALSE, remove_punct = TRUE)
tokens(txt, remove_url = FALSE, remove_punct = TRUE, what = "fasterword")
tokens(txt, remove_url = FALSE, remove_punct = FALSE, what = "fasterword")
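# the combination recommended in "Dealing with URLs": fasterword plus remove_url
tokens(txt, what = "fasterword", remove_url = TRUE)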
## MORE COMPARISONS
txt <- "#textanalysis is MY <3 4U @myhandle gr8 #stuff :-)"
tokens(txt, remove_punct = TRUE)
tokens(txt, remove_punct = TRUE, remove_twitter = TRUE)
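# remove_twitter is forced back to FALSE when remove_punct = FALSE,
# so the @ and # characters are retained here
tokens(txt, remove_punct = FALSE, remove_twitter = TRUE)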
#tokens("great website http://textasdata.com", remove_url = FALSE)
#tokens("great website http://textasdata.com", remove_url = TRUE)
txt <- c(text1="This is $10 in 999 different ways,\n up and down; left and right!",
text2="@kenbenoit working: on #quanteda 2day\t4ever, http://textasdata.com?page=123.")
tokens(txt, verbose = TRUE)
tokens(txt, remove_numbers = TRUE, remove_punct = TRUE)
tokens(txt, remove_numbers = FALSE, remove_punct = TRUE)
tokens(txt, remove_numbers = TRUE, remove_punct = FALSE)
tokens(txt, remove_numbers = FALSE, remove_punct = FALSE)
tokens(txt, remove_numbers = FALSE, remove_punct = FALSE, remove_separators = FALSE)
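# with what = "word" and remove_punct = TRUE, remove_separators has no effect
tokens(txt, remove_punct = TRUE, remove_separators = FALSE)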
tokens(txt, remove_numbers = TRUE, remove_punct = TRUE, remove_url = TRUE)
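# returning a classic (non-hashed) tokenizedTexts object via hash = FALSE;
# note that the documentation above says this option will be phased out
tokens(txt, hash = FALSE)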
# character level
tokens("Great website: http://textasdata.com?page=123.", what = "character")
tokens("Great website: http://textasdata.com?page=123.", what = "character",
remove_separators = FALSE)
# sentence level
tokens(c("Kurt Vonnegut said; only assholes use semi-colons.",
"Today is Thursday in Canberra: It is yesterday in London.",
"Today is Thursday in Canberra: \nIt is yesterday in London.",
"To be? Or\nnot to be?"),
what = "sentence")
tokens(data_corpus_inaugural[c(2,40)], what = "sentence")
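# docvars of a corpus are passed through to the tokens object when
# include_docvars = TRUE (the default); a sketch, assuming docvars()
# can be called on the resulting tokens object
toks <- tokens(data_corpus_inaugural, include_docvars = TRUE)
head(docvars(toks))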
# removing features (stopwords) from tokenized texts
txt <- char_tolower(c(mytext1 = "This is a short test sentence.",
mytext2 = "Short.",
mytext3 = "Short, shorter, and shortest."))
tokens(txt, remove_punct = TRUE)
# tokens_remove() replaces the older removeFeatures()
tokens_remove(tokens(txt, remove_punct = TRUE), stopwords("english"))
# ngram tokenization
tokens(txt, remove_punct = TRUE, ngrams = 2)
tokens(txt, remove_punct = TRUE, ngrams = 2, skip = 1, concatenator = " ")
tokens(txt, remove_punct = TRUE, ngrams = 1:2)
# removing features from ngram tokens
tokens_remove(tokens(txt, remove_punct = TRUE, ngrams = 1:2), stopwords("english"))