# NOT RUN {
# tokenization gives identical results for character vectors and corpus objects
tokensFromChar <- tokenize(data_corpus_inaugural[1:3])
tokensFromCorp <- tokenize(corpus_subset(data_corpus_inaugural, Year < 1798))
identical(tokensFromChar, tokensFromCorp)
str(tokensFromChar)
# returned as a list
head(tokenize(data_corpus_inaugural[57])[[1]], 10)
# returned as a character vector using simplify=TRUE
head(tokenize(data_corpus_inaugural[57], simplify = TRUE), 10)
# removing punctuation marks and lowercasing texts
head(tokenize(char_tolower(data_corpus_inaugural[57]), simplify = TRUE, remove_punct = TRUE), 30)
# keeping case and punctuation
head(tokenize(data_corpus_inaugural[57], simplify = TRUE), 30)
# keeping versus removing hyphens
tokenize("quanteda data objects are auto-loading.", remove_punct = TRUE)
tokenize("quanteda data objects are auto-loading.", remove_punct = TRUE, remove_hyphens = TRUE)
# keeping versus removing symbols
tokenize("<tags> and other + symbols.", remove_symbols = FALSE)
tokenize("<tags> and other + symbols.", remove_symbols = TRUE)
tokenize("<tags> and other + symbols.", remove_symbols = FALSE, what = "fasterword")
tokenize("<tags> and other + symbols.", remove_symbols = TRUE, what = "fasterword")
## examples with URLs - hardly perfect!
txt <- "Repo https://githib.com/kbenoit/quanteda, and www.stackoverflow.com."
tokenize(txt, remove_url = TRUE, remove_punct = TRUE)
tokenize(txt, remove_url = FALSE, remove_punct = TRUE)
tokenize(txt, remove_url = FALSE, remove_punct = TRUE, what = "fasterword")
tokenize(txt, remove_url = FALSE, remove_punct = FALSE, what = "fasterword")
## MORE COMPARISONS
txt <- "#textanalysis is MY <3 4U @myhandle gr8 #stuff :-)"
tokenize(txt, remove_punct = TRUE)
tokenize(txt, remove_punct = TRUE, remove_twitter = TRUE)
#tokenize("great website http://textasdata.com", remove_url = FALSE)
#tokenize("great website http://textasdata.com", remove_url = TRUE)
txt <- c(text1="This is $10 in 999 different ways,\n up and down; left and right!",
text2="@kenbenoit working: on #quanteda 2day\t4ever, http://textasdata.com?page=123.")
tokenize(txt, verbose = TRUE)
tokenize(txt, remove_numbers = TRUE, remove_punct = TRUE)
tokenize(txt, remove_numbers = FALSE, remove_punct = TRUE)
tokenize(txt, remove_numbers = TRUE, remove_punct = FALSE)
tokenize(txt, remove_numbers = FALSE, remove_punct = FALSE)
tokenize(txt, remove_numbers = FALSE, remove_punct = FALSE, remove_separators = FALSE)
tokenize(txt, remove_numbers = TRUE, remove_punct = TRUE, remove_url = TRUE)
# character level
tokenize("Great website: http://textasdata.com?page=123.", what = "character")
tokenize("Great website: http://textasdata.com?page=123.", what = "character",
remove_separators = FALSE)
# sentence level
tokenize(c("Kurt Vongeut said; only assholes use semi-colons.",
"Today is Thursday in Canberra: It is yesterday in London.",
"Today is Thursday in Canberra: \nIt is yesterday in London.",
"To be? Or\nnot to be?"),
what = "sentence")
tokenize(data_corpus_inaugural[c(2,40)], what = "sentence", simplify = TRUE)
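# hedged addition: how the sentence tokenizer treats common abbreviation
# periods (exact behaviour depends on the underlying stringi boundary rules)
tokenize("Mr. Jones arrived at 10 a.m. He left early.", what = "sentence")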
# removing features (stopwords) from tokenized texts
txt <- char_tolower(c(mytext1 = "This is a short test sentence.",
                      mytext2 = "Short.",
                      mytext3 = "Short, shorter, and shortest."))
tokenize(txt, remove_punct = TRUE)
removeFeatures(tokenize(txt, remove_punct = TRUE), stopwords("english"))
# ngram tokenization
tokenize(txt, remove_punct = TRUE, ngrams = 2)
tokenize(txt, remove_punct = TRUE, ngrams = 2, skip = 1, concatenator = " ")
tokenize(txt, remove_punct = TRUE, ngrams = 1:2)
# removing features from ngram tokens
removeFeatures(tokenize(txt, remove_punct = TRUE, ngrams = 1:2), stopwords("english"))
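# hedged addition: trigrams, and a mix of unigrams through trigrams, using the
# same ngrams argument shown above
tokenize(txt, remove_punct = TRUE, ngrams = 3)
tokenize(txt, remove_punct = TRUE, ngrams = 1:3)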
# }