library(dplyr)
library(tidytext)
# Session-scoped directory that will serve as the temporary CWB registry.
# Creation is guarded so that re-running the script within the same R session
# does not emit an "already exists" warning; `recursive = TRUE` additionally
# covers the case of a missing parent directory.
registry_tmp <- fs::path(tempdir(), "cwb_registry")
if (!dir.exists(registry_tmp)) {
  dir.create(registry_tmp, recursive = TRUE)
}
# Two-column tibble with one row per element of the quanteda sample vector:
# `party` carries the element names, `text` the (unnamed) character values.
# The table is built directly from the named vector instead of going through
# `as.data.frame()`: in a magrittr pipe that call names its single column "."
# (the deparsed placeholder), which silently breaks as soon as the pipe
# operator or the call form changes.
ukimmig_texts <- quanteda::data_char_ukimmig2010
tidydata <- tibble(
  party = names(ukimmig_texts),
  text = unname(ukimmig_texts)
)
# Tokenize every text into one row per token (case and punctuation preserved)
# and number the tokens consecutively with a zero-based corpus position.
# `seq_len(n()) - 1L` is used rather than `0L:(nrow(.) - 1L)` because the
# colon form yields c(0, -1) for an empty table and makes `mutate()` fail.
tokenstream <- tidydata %>%
  unnest_tokens(word, text, to_lower = FALSE, strip_punct = FALSE) %>%
  mutate(cpos = seq_len(n()) - 1L)
# Per-party region table: the smallest and largest corpus position of the
# tokens belonging to each party become the left and right region boundary.
metadata <- summarise(
  group_by(tokenstream, party),
  cpos_left = min(cpos),
  cpos_right = max(cpos)
)
# Encode the token stream as corpus "UKIMMIG2010". Only the token column is
# passed on as input: `cpos` and `party` are dropped here, and the per-party
# regions are supplied separately through `s_attributes`.
# NOTE(review): `encode()` is assumed to be the data.frame method from the
# cwbtools package; no visible library() call attaches it, so the call is
# namespace-qualified to keep the script runnable on its own. Confirm.
# NOTE(review): `registry_dir` is assumed to be the argument that receives the
# registry directory prepared in `registry_tmp`; verify against the installed
# cwbtools version.
tokenstream %>%
  select(-cpos, -party) %>%
  cwbtools::encode(
    corpus = "UKIMMIG2010",
    s_attributes = metadata,
    registry_dir = registry_tmp,
    properties = c(lang = "en")
  )
# Run the code above in your browser using DataLab