## Score possibly misspelled names, using a vocabulary of known names to
## decide when a close string match should be trusted less

# The query name and the candidate names it is compared against.
x <- "Emberto"
y <- c("Enberto", "Umberto")

# m1: vocabulary-based comparator built from the reference list of known
# (distinct) names.
known_names <- c(
  "Roberto", "Umberto", "Alberto", "Emberto", "Norberto", "Humberto"
)
m1 <- InVocabulary(known_names)

# m2: Levenshtein comparator configured to return a normalized similarity
# rather than a raw edit distance.
m2 <- Levenshtein(similarity = TRUE, normalize = TRUE)

# Multiply the two scores element by element:
#   * "Emberto" vs "Enberto": "Enberto" is not a known name, so this is
#     plausibly a typo for the same person; the Levenshtein similarity of
#     0.87 is left as is (not downweighted).
#   * "Emberto" vs "Umberto": both are known, distinct names, so they
#     likely refer to different people; the Levenshtein similarity is
#     downweighted to 0.61.
similarities <- m2(x, y) * m1(x, y)
# Run the code above in your browser using DataLab