ViewProbDup: Visualize the probable duplicate sets retrieved in a `ProbDup` object

Description

ViewProbDup plots summary visualizations of accessions within the probable duplicate sets retrieved in a ProbDup object according to a grouping factor field(column) in the original database(s).

Usage

ViewProbDup(
  pdup,
  db1,
  db2 = NULL,
  factor.db1,
  factor.db2 = NULL,
  max.count = 30,
  select,
  order = "type",
  main = NULL
)

Value

A list containing the following objects:

`Summary1`	The summary `data.frame` of number of accessions per factor level.
`Summary2`	The summary `data.frame` of number of accessions and sets per each type of sets classified according to factor levels.
`SummaryGrob`	A grid graphical object (Grob) of the summary visualization plot. Can be plotted using the `grid.arrange` function

Arguments

pdup: An object of class ProbDup.
db1: A data frame of the PGR passport database.
db2: A data frame of the PGR passport database. Required when pdup was created using more than one KWIC Index.
factor.db1: The db1 column to be considered for grouping the accessions. Should be of class character or factor.
factor.db2: The db2 column to be considered for grouping the accessions. Should be of class character or factor. retrieved.
max.count: The maximum count of probable duplicate sets whose information is to be plotted (see Note).
select: A character vector of factor names in factor.db1 and/or factor.db2 to be considered for grouping accessions (see Note).
order: The order of the type of sets retrieved in the plot. The default is "type" (see Details).
main: The title of the plot.

Examples

Run this code


# \dontshow{
threads_dt <- data.table::getDTthreads()
threads_OMP <- Sys.getenv("OMP_THREAD_LIMIT")
data.table::setDTthreads(2)

data.table::setDTthreads(2)
Sys.setenv(`OMP_THREAD_LIMIT` = 2)
# }

if (FALSE) {

# Method "b and c"
#=================

# Load PGR passport databases
GN1 <- GN1000[!grepl("^ICG", GN1000$DonorID), ]
GN1$DonorID <- NULL
GN2 <- GN1000[grepl("^ICG", GN1000$DonorID), ]
GN2 <- GN2[!grepl("S", GN2$DonorID), ]
GN2$NationalID <- NULL

GN1$SourceCountry <- toupper(GN1$SourceCountry)
GN2$SourceCountry <- toupper(GN2$SourceCountry)

GN1$SourceCountry <- gsub("UNITED STATES OF AMERICA", "USA", GN1$SourceCountry)
GN2$SourceCountry <- gsub("UNITED STATES OF AMERICA", "USA", GN2$SourceCountry)

# Specify as a vector the database fields to be used
GN1fields <- c("NationalID", "CollNo", "OtherID1", "OtherID2")
GN2fields <- c("DonorID", "CollNo", "OtherID1", "OtherID2")

# Clean the data
GN1[GN1fields] <- lapply(GN1[GN1fields], function(x) DataClean(x))
GN2[GN2fields] <- lapply(GN2[GN2fields], function(x) DataClean(x))
y1 <- list(c("Gujarat", "Dwarf"), c("Castle", "Cary"), c("Small", "Japan"),
           c("Big", "Japan"), c("Mani", "Blanco"), c("Uganda", "Erect"),
           c("Mota", "Company"))
y2 <- c("Dark", "Light", "Small", "Improved", "Punjab", "SAM")
y3 <- c("Local", "Bold", "Cary", "Mutant", "Runner", "Giant", "No.",
        "Bunch", "Peanut")
GN1[GN1fields] <- lapply(GN1[GN1fields], function(x) MergeKW(x, y1, delim = c("space", "dash")))
GN1[GN1fields] <- lapply(GN1[GN1fields], function(x) MergePrefix(x, y2, delim = c("space", "dash")))
GN1[GN1fields] <- lapply(GN1[GN1fields], function(x) MergeSuffix(x, y3, delim = c("space", "dash")))
GN2[GN2fields] <- lapply(GN2[GN2fields], function(x) MergeKW(x, y1, delim = c("space", "dash")))
GN2[GN2fields] <- lapply(GN2[GN2fields], function(x) MergePrefix(x, y2, delim = c("space", "dash")))
GN2[GN2fields] <- lapply(GN2[GN2fields], function(x) MergeSuffix(x, y3, delim = c("space", "dash")))

# Remove duplicated DonorID records in GN2
GN2 <- GN2[!duplicated(GN2$DonorID), ]

# Generate KWIC index
GN1KWIC <- KWIC(GN1, GN1fields)
GN2KWIC <- KWIC(GN2, GN2fields)

# Specify the exceptions as a vector
exep <- c("A", "B", "BIG", "BOLD", "BUNCH", "C", "COMPANY", "CULTURE",
          "DARK", "E", "EARLY", "EC", "ERECT", "EXOTIC", "FLESH", "GROUNDNUT",
          "GUTHUKAI", "IMPROVED", "K", "KUTHUKADAL", "KUTHUKAI", "LARGE",
          "LIGHT", "LOCAL", "OF", "OVERO", "P", "PEANUT", "PURPLE", "R",
          "RED", "RUNNER", "S1", "SAM", "SMALL", "SPANISH", "TAN", "TYPE",
          "U", "VALENCIA", "VIRGINIA", "WHITE")

# Specify the synsets as a list
syn <- list(c("CHANDRA", "AH114"), c("TG1", "VIKRAM"))

GNdupc <- ProbDup(kwic1 = GN1KWIC, kwic2 = GN2KWIC, method = "c",
                  excep = exep, fuzzy = TRUE, phonetic = TRUE,
                  encoding = "primary", semantic = TRUE, syn = syn)

GNdupcView <- ViewProbDup(GNdupc, GN1, GN2, "SourceCountry", "SourceCountry",
                         max.count = 30, select = c("INDIA", "USA"), order = "type",
                         main = "Groundnut Probable Duplicates")

library(gridExtra)                                                    
grid.arrange(GNdupcView$SummaryGrob)                          

}   

# \dontshow{
data.table::setDTthreads(threads_dt)
Sys.setenv(`OMP_THREAD_LIMIT` = threads_OMP)
# }

Run the code above in your browser using DataLab

Description

Usage

Value

Arguments

See Also

Examples