# Sample documents used throughout the examples: plain punctuation (doc1),
# a social-media handle, hashtag and URL (doc2), a hyphenated word (doc3),
# and currency symbols, digits and an emoji (doc4).
txt <- c(
  doc1 = "A sentence, showing how tokens() works.",
  doc2 = "@quantedainit and #textanalysis https://example.com?p=123.",
  doc3 = "Self-documenting code??",
  doc4 = "£1,000,000 for 50¢ is gr8 4ever \U0001f600"
)

# tokenizing with the default settings, then with what = "word1"
tokens(txt)
tokens(txt, what = "word1")

# removing punctuation marks but keeping tags and URLs
tokens(txt[c("doc1", "doc2")], remove_punct = TRUE)

# splitting hyphenated words
tokens(txt["doc3"])
tokens(txt["doc3"], split_hyphens = TRUE)

# symbols and numbers
tokens(txt["doc4"])
tokens(txt["doc4"], remove_numbers = TRUE)
tokens(txt["doc4"], remove_numbers = TRUE, remove_symbols = TRUE)
# using other tokenizers
#
# These examples call functions from the 'tokenizers' package and are
# deliberately not executed (the guard is `if (FALSE)`). The braces matter:
# without them `if` guards only the first following expression, and the
# remaining pipelines would run unconditionally.
if (FALSE) {
  # word tokens produced by tokenizers, then symbols dropped by tokens()
  tokens(tokenizers::tokenize_words(txt[4]), remove_symbols = TRUE)

  # same idea for all documents, with lowercasing and punctuation stripping
  # switched off in the external tokenizer
  tokenizers::tokenize_words(txt, lowercase = FALSE, strip_punct = FALSE) |>
    tokens(remove_symbols = TRUE)

  # character-level tokens, with punctuation removed afterwards by tokens()
  tokenizers::tokenize_characters(txt[3], strip_non_alphanum = FALSE) |>
    tokens(remove_punct = TRUE)

  # sentence-level segmentation passed through tokens() unchanged
  tokenizers::tokenize_sentences(
    "The quick brown fox. It jumped over the lazy dog.") |>
    tokens()
}
# Run the code above in your browser using DataLab