# NOT RUN {
# Basic usage: the same three tokens are targeted in every call below,
# so they are defined once and reused.
targets <- c("No", "what", "it's")

# No replacement supplied, so the function's default replacement applies.
replace_tokens(DATA$state, targets)

# Substitute an explicit placeholder for each matched token.
replace_tokens(DATA$state, targets, "<<TOKEN>>")

# Same placeholder, with `ignore.case = TRUE` passed for the matching.
replace_tokens(DATA$state, targets, "<<TOKEN>>", ignore.case = TRUE)
# }
# NOT RUN {
## Speed comparison of replace_tokens() against mgsub(), followed by a
## name-replacement example.

## Attach the helper packages once, up front (textshape first, then lexicon,
## matching the order in which they were originally attached).
library(textshape)
library(lexicon)

## Set up data: draw 2000 of the unique tokens found in the hamlet dialogue.
## The seed is fixed so the sampled token set is reproducible.
data(hamlet)
set.seed(11)
tokens <- sample(unique(unlist(split_token(hamlet$dialogue))), 2000)

## Time replace_tokens() on the dialogue
tic <- Sys.time()
head(replace_tokens(hamlet$dialogue, tokens))
(toc <- Sys.time() - tic)

## Time mgsub() on the same input and tokens for comparison
tic <- Sys.time()
head(mgsub(hamlet$dialogue, tokens, ""))
(toc <- Sys.time() - tic)

## Amp it up: 20x more data through replace_tokens()
tic <- Sys.time()
head(replace_tokens(rep(hamlet$dialogue, 20), tokens))
(toc <- Sys.time() - tic)

## Replace names example

## Upper-case the first character of each name and lower-case the rest
nms <- gsub("(^.)(.*)", "\\U\\1\\L\\2", common_names, perl = TRUE)

## Shuffle 5000 dictionary words together with 10000 names (names drawn with
## replacement), then cut the stream into 12-word portions.
x <- split_portion(
  sample(
    c(
      sample(grady_augmented, 5000),
      sample(nms, 10000, replace = TRUE)
    )
  ),
  n.words = 12
)

## Append a randomly chosen end mark to every portion
x$text.var <- paste0(
  x$text.var,
  sample(c(".", "!", "?"), size = length(x$text.var), replace = TRUE)
)

## Swap every name token for the literal 'NAME'
replace_tokens(x$text.var, nms, "NAME")
# }
# Run the code above in your browser using DataLab