# NOT RUN {
library(udpipe)
library(textrank)
data(joboffer)
head(joboffer)
## Give each sentence a unique identifier and collect the unique sentences
joboffer$textrank_id <- unique_identifier(joboffer, c("doc_id", "paragraph_id", "sentence_id"))
sentences <- unique(joboffer[, c("textrank_id", "sentence")])
cat(sentences$sentence, sep = "\n")
## Keep only nouns and adjectives as the terminology used to compare sentences
terminology <- subset(joboffer, upos %in% c("NOUN", "ADJ"), select = c("textrank_id", "lemma"))
head(terminology)
## Textrank for finding the most relevant sentences
tr <- textrank_sentences(data = sentences, terminology = terminology)
summary(tr, n = 2)
summary(tr, n = 5, keep.sentence.order = TRUE)
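## A small sketch for inspecting the raw importance scores directly; this
## assumes the returned object stores the per-sentence scores in tr$sentences
## (columns textrank_id, sentence, textrank), as documented for textrank_sentences
scores <- tr$sentences
head(scores[order(scores$textrank, decreasing = TRUE), ])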
# }
# NOT RUN {
## Using minhash to reduce sentence combinations - relevant if you have a lot of sentences
library(textreuse)
minhash <- minhash_generator(n = 1000, seed = 123456789)
candidates <- textrank_candidates_lsh(x = terminology$lemma, sentence_id = terminology$textrank_id,
                                      minhashFUN = minhash, bands = 500)
tr <- textrank_sentences(data = sentences, terminology = terminology,
                         textrank_candidates = candidates)
summary(tr, n = 2)
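## A sketch to illustrate the reduction obtained with LSH: compare the number
## of candidate sentence pairs against all possible pairs (this assumes
## candidates is a data.frame with one row per retained pair)
nrow(candidates)
choose(nrow(sentences), 2)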
# }
# NOT RUN {
## You can also reduce the number of sentence combinations by sampling
tr <- textrank_sentences(data = sentences, terminology = terminology, max = 100)
tr
summary(tr, n = 2)
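## A sketch of the explicit alternative: build all pairwise combinations with
## textrank_candidates_all and pass them via the textrank_candidates argument
## (this should match the default behaviour when neither max nor
## textrank_candidates is supplied)
candidates_all <- textrank_candidates_all(sentences$textrank_id)
head(candidates_all)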
# }