txt <- "The United Kingdom is leaving the European Union."
toks <- tokens(txt, remove_punct = TRUE)
# character vector - not compounded
tokens_compound(toks, c("United", "Kingdom", "European", "Union"))
# elements separated by spaces - not compounded
tokens_compound(toks, c("United Kingdom", "European Union"))
# list of characters - is compounded
tokens_compound(toks, list(c("United", "Kingdom"), c("European", "Union")))
# elements separated by spaces, wrapped in phrase() - is compounded
tokens_compound(toks, phrase(c("United Kingdom", "European Union")))
# supplied as values in a dictionary (same as list) - is compounded
# (keys do not matter)
tokens_compound(toks, dictionary(list(key1 = "United Kingdom",
key2 = "European Union")))
# patterns as dictionary values with glob matching - is compounded
tokens_compound(toks, dictionary(list(key1 = c("U* K*"))), valuetype = "glob")
# note the differences caused by join = FALSE
compounds <- list(c("the", "European"), c("European", "Union"))
tokens_compound(toks, pattern = compounds, join = TRUE)
tokens_compound(toks, pattern = compounds, join = FALSE)
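# join = TRUE should merge the overlapping matches into a single token such as
# "the_European_Union", while join = FALSE should keep them as separate
# compounds such as "the_European" and "European_Union"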
# use window to form ngrams
tokens_remove(toks, pattern = stopwords("en")) |>
tokens_compound(pattern = "leav*", join = FALSE, window = c(0, 3))