library(dplyr)
library(janeaustenr)
library(tidytext)
# Build a one-column tibble holding the `prideprejudice` text.
# `tibble()` is used instead of the deprecated `data_frame()` constructor.
d <- tibble(txt = prideprejudice)
d

# Default tokenizer: tokens from `txt` go into a new `word` column
d %>%
  unnest_tokens(word, txt)

# Tokenize by sentence
d %>%
  unnest_tokens(sentence, txt, token = "sentences")

# Tokenize into n-grams with n = 2
d %>%
  unnest_tokens(ngram, txt, token = "ngrams", n = 2)

# Tokenize into skip n-grams with n = 4 and k = 2
d %>%
  unnest_tokens(ngram, txt, token = "skip_ngrams", n = 4, k = 2)

# Tokenize with a regular expression pattern
d %>%
  unnest_tokens(chapter, txt, token = "regex", pattern = "Chapter [\\d]")

# custom function: pass a tokenizing function (here stringr::str_split)
# as `token`; `pattern` is given alongside it
d %>%
  unnest_tokens(word, txt, token = stringr::str_split, pattern = " ")
# tokenize HTML
# Two rows of HTML snippets; `tibble()` replaces the deprecated
# `data_frame()` constructor.
h <- tibble(
  row = 1:2,
  text = c("<h1>Text <b>is<b>", "<a href='example.com'>here</a>")
)

# `format = "html"` tells unnest_tokens() the input column is HTML
h %>%
  unnest_tokens(word, text, format = "html")
# Run the code above in your browser using DataLab