# Example data.frame
db <- data.frame(sequence_id=LETTERS[1:4],
sequence_alignment=c("CCCCTGGG", "CCCCTGGN", "NAACTGGN", "NNNCTGNN"),
c_call=c("IGHM", "IGHG", "IGHG", "IGHA"),
sample_id=c("S1", "S1", "S2", "S2"),
duplicate_count=1:4,
stringsAsFactors=FALSE)
# Annotations are not parsed if neither text_fields nor num_fields is specified
# The retained sequence annotations will be random
collapseDuplicates(db, verbose=TRUE)
# Unique text_fields annotations are combined into a single string with ","
# num_fields annotations are summed
# Ambiguous duplicates are discarded
collapseDuplicates(db, text_fields=c("c_call", "sample_id"), num_fields="duplicate_count",
verbose=TRUE)
# Use alternate delimiter for collapsing textual annotations
collapseDuplicates(db, text_fields=c("c_call", "sample_id"), num_fields="duplicate_count",
sep="/", verbose=TRUE)
# Add count of duplicates
collapseDuplicates(db, text_fields=c("c_call", "sample_id"), num_fields="duplicate_count",
add_count=TRUE, verbose=TRUE)
# Masking ragged ends may impact duplicate removal
db$sequence_alignment <- maskSeqEnds(db$sequence_alignment)
collapseDuplicates(db, text_fields=c("c_call", "sample_id"), num_fields="duplicate_count",
add_count=TRUE, verbose=TRUE)
Run the code above in your browser using DataLab