# \dontshow{
threads_dt <- data.table::getDTthreads()
threads_OMP <- Sys.getenv("OMP_THREAD_LIMIT")
data.table::setDTthreads(2)
data.table::setDTthreads(2)
Sys.setenv(`OMP_THREAD_LIMIT` = 2)
# }
if (FALSE) {
# Load PGR passport database
GN <- GN1000
# Specify as a vector the database fields to be used
GNfields <- c("NationalID", "CollNo", "DonorID", "OtherID1", "OtherID2")
# Clean the data
GN[GNfields] <- lapply(GN[GNfields], function(x) DataClean(x))
y1 <- list(c("Gujarat", "Dwarf"), c("Castle", "Cary"), c("Small", "Japan"),
c("Big", "Japan"), c("Mani", "Blanco"), c("Uganda", "Erect"),
c("Mota", "Company"))
y2 <- c("Dark", "Light", "Small", "Improved", "Punjab", "SAM")
y3 <- c("Local", "Bold", "Cary", "Mutant", "Runner", "Giant", "No.",
"Bunch", "Peanut")
GN[GNfields] <- lapply(GN[GNfields], function(x) MergeKW(x, y1, delim = c("space", "dash")))
GN[GNfields] <- lapply(GN[GNfields], function(x) MergePrefix(x, y2, delim = c("space", "dash")))
GN[GNfields] <- lapply(GN[GNfields], function(x) MergeSuffix(x, y3, delim = c("space", "dash")))
# Generate KWIC index
GNKWIC <- KWIC(GN, GNfields)
# Specify the exceptions as a vector
exep <- c("A", "B", "BIG", "BOLD", "BUNCH", "C", "COMPANY", "CULTURE",
"DARK", "E", "EARLY", "EC", "ERECT", "EXOTIC", "FLESH", "GROUNDNUT",
"GUTHUKAI", "IMPROVED", "K", "KUTHUKADAL", "KUTHUKAI", "LARGE",
"LIGHT", "LOCAL", "OF", "OVERO", "P", "PEANUT", "PURPLE", "R",
"RED", "RUNNER", "S1", "SAM", "SMALL", "SPANISH", "TAN", "TYPE",
"U", "VALENCIA", "VIRGINIA", "WHITE")
# Specify the synsets as a list
syn <- list(c("CHANDRA", "AH114"), c("TG1", "VIKRAM"))
# Fetch probable duplicate sets
GNdup <- ProbDup(kwic1 = GNKWIC, method = "a", excep = exep, fuzzy = TRUE,
phonetic = TRUE, encoding = "primary",
semantic = TRUE, syn = syn)
# Get disjoint probable duplicate sets of each kind
disGNdup <- DisProbDup(GNdup, combine = NULL)
# Get the data frame for reviewing the duplicate sets identified
RevGNdup <- ReviewProbDup(pdup = disGNdup, db1 = GN1000,
extra.db1 = c("SourceCountry", "TransferYear"),
max.count = 30, insert.blanks = TRUE)
# Examine and review the duplicate sets using edit function
RevGNdup <- edit(RevGNdup)
# Examine and make changes to a set
subset(RevGNdup, SET_NO==12 & TYPE=="P", select= c(IDKW, DEL, SPLIT))
RevGNdup[c(110, 112, 114, 118, 121, 122, 124), 6] <- "Y"
RevGNdup[c(111, 115, 128), 7] <- 1
RevGNdup[c(113, 117, 120), 7] <- 2
RevGNdup[c(116, 119), 7] <- 3
RevGNdup[c(123, 125), 7] <- 4
RevGNdup[c(126, 127), 7] <- 5
subset(RevGNdup, SET_NO==12 & TYPE=="P", select= c(IDKW, DEL, SPLIT))
# Reconstruct ProDup object
GNdup2 <- ReconstructProbDup(RevGNdup)
lapply(disGNdup, nrow)
lapply(GNdup2, nrow)
}
# \dontshow{
data.table::setDTthreads(threads_dt)
Sys.setenv(`OMP_THREAD_LIMIT` = threads_OMP)
# }
Run the code above in your browser using DataLab