## ---------------------------------------------------------------------
## A. CIGAR_OPS, explodeCigarOps(), explodeCigarOpLengths(),
## cigarToRleList(), and cigarOpTable()
## ---------------------------------------------------------------------
## Supported CIGAR operations:
CIGAR_OPS
## Transform CIGARs into other useful representations:
cigar1 <- "3H15M55N4M2I6M2D5M6S"
cigar2 <- c("40M2I9M", cigar1, "2S10M2000N15M", "3H33M5H")
explodeCigarOps(cigar2)
explodeCigarOpLengths(cigar2)
explodeCigarOpLengths(cigar2, ops=c("I", "S"))
cigarToRleList(cigar2)
## Summarize CIGARs:
cigarOpTable(cigar2)
## ---------------------------------------------------------------------
## B. From CIGARs to ranges and to sequence lengths
## ---------------------------------------------------------------------
## CIGAR ranges along the "reference" space:
cigarRangesAlongReferenceSpace(cigar1, with.ops=TRUE)[[1]]
cigarRangesAlongReferenceSpace(cigar1,
reduce.ranges=TRUE, with.ops=TRUE)[[1]]
ops <- setdiff(CIGAR_OPS, "N")
cigarRangesAlongReferenceSpace(cigar1, ops=ops, with.ops=TRUE)[[1]]
cigarRangesAlongReferenceSpace(cigar1, ops=ops,
reduce.ranges=TRUE, with.ops=TRUE)[[1]]
ops <- setdiff(CIGAR_OPS, c("D", "N"))
cigarRangesAlongReferenceSpace(cigar1, ops=ops, with.ops=TRUE)[[1]]
cigarWidthAlongReferenceSpace(cigar1)
pos2 <- c(1, 1001, 1, 351)
cigarRangesAlongReferenceSpace(cigar2, pos=pos2, with.ops=TRUE)
res1a <- extractAlignmentRangesOnReference(cigar2, pos=pos2)
res1b <- cigarRangesAlongReferenceSpace(cigar2,
pos=pos2,
ops=setdiff(CIGAR_OPS, "N"),
reduce.ranges=TRUE)
stopifnot(identical(res1a, res1b))
res2a <- extractAlignmentRangesOnReference(cigar2, pos=pos2,
drop.D.ranges=TRUE)
res2b <- cigarRangesAlongReferenceSpace(cigar2,
pos=pos2,
ops=setdiff(CIGAR_OPS, c("D", "N")),
reduce.ranges=TRUE)
stopifnot(identical(res2a, res2b))
seqnames <- factor(c("chr6", "chr6", "chr2", "chr6"),
levels=c("chr2", "chr6"))
extractAlignmentRangesOnReference(cigar2, pos=pos2, f=seqnames)
## CIGAR ranges along the "query" space:
cigarRangesAlongQuerySpace(cigar2, with.ops=TRUE)
cigarWidthAlongQuerySpace(cigar1)
cigarWidthAlongQuerySpace(cigar1, before.hard.clipping=TRUE)
## CIGAR ranges along the "pairwise" space:
cigarRangesAlongPairwiseSpace(cigar2, with.ops=TRUE)
cigarRangesAlongPairwiseSpace(cigar2, dense=TRUE, with.ops=TRUE)
## ---------------------------------------------------------------------
## C. COMPUTE THE COVERAGE OF THE READS STORED IN A BAM FILE
## ---------------------------------------------------------------------
## The information stored in a BAM file can be used to compute the
## "coverage" of the mapped reads i.e. the number of reads that hit any
## given position in the reference genome.
## The following function takes the path to a BAM file and returns an
## object representing the coverage of the mapped reads that are stored
## in the file. The returned object is an RleList object named with the
## names of the reference sequences that actually receive some coverage.
flag0 <- scanBamFlag(isUnmappedQuery=FALSE, isDuplicate=FALSE)
extractCoverageFromBAM <- function(bamfile)
{
stopifnot(is(bamfile, "BamFile"))
## This ScanBamParam object allows us to load only the necessary
## information from the file.
param <- ScanBamParam(flag=flag0, what=c("rname", "pos", "cigar"))
bam <- scanBam(bamfile, param=param)[[1]]
## Note that unmapped reads and reads that are PCR/optical duplicates
## have already been filtered out by using the ScanBamParam object
## above.
f <- factor(bam$rname, levels=seqlevels(bamfile))
irl <- extractAlignmentRangesOnReference(bam$cigar, pos=bam$pos, f=f)
coverage(irl, width=seqlengths(bamfile))
}
library(Rsamtools)
f1 <- system.file("extdata", "ex1.bam", package="Rsamtools")
cvg <- extractCoverageFromBAM(BamFile(f1))
## extractCoverageFromBAM() is equivalent but slightly more efficient
## than loading a GAlignments object and computing its coverage:
cvg2 <- coverage(readGAlignments(f1, param=ScanBamParam(flag=flag0)))
stopifnot(identical(cvg, cvg2))
## ---------------------------------------------------------------------
## D. cigarNarrow() and cigarQNarrow()
## ---------------------------------------------------------------------
## cigarNarrow():
cigarNarrow(cigar1) # only drops the soft/hard clipping
cigarNarrow(cigar1, start=10)
cigarNarrow(cigar1, start=15)
cigarNarrow(cigar1, start=15, width=57)
cigarNarrow(cigar1, start=16)
#cigarNarrow(cigar1, start=16, width=55) # ERROR! (empty cigar)
cigarNarrow(cigar1, start=71)
cigarNarrow(cigar1, start=72)
cigarNarrow(cigar1, start=75)
## cigarQNarrow():
cigarQNarrow(cigar1, start=4, end=-3)
cigarQNarrow(cigar1, start=10)
cigarQNarrow(cigar1, start=19)
cigarQNarrow(cigar1, start=24)
## ---------------------------------------------------------------------
## E. PERFORMANCE
## ---------------------------------------------------------------------
if (interactive()) {
## We simulate 20 millions aligned reads, all 40-mers. 95% of them
## align with no indels. 5% align with a big deletion in the
## reference. In the context of an RNAseq experiment, those 5% would
## be suspected to be "junction reads".
set.seed(123)
nreads <- 20000000L
njunctionreads <- nreads * 5L / 100L
cigar3 <- character(nreads)
cigar3[] <- "40M"
junctioncigars <- paste(
paste(10:30, "M", sep=""),
paste(sample(80:8000, njunctionreads, replace=TRUE), "N", sep=""),
paste(30:10, "M", sep=""), sep="")
cigar3[sample(nreads, njunctionreads)] <- junctioncigars
some_fake_rnames <- paste("chr", c(1:6, "X"), sep="")
rname <- factor(sample(some_fake_rnames, nreads, replace=TRUE),
levels=some_fake_rnames)
pos <- sample(80000000L, nreads, replace=TRUE)
## The following takes < 3 sec. to complete:
system.time(irl1 <- extractAlignmentRangesOnReference(cigar3, pos=pos))
## The following takes < 4 sec. to complete:
system.time(irl2 <- extractAlignmentRangesOnReference(cigar3, pos=pos,
f=rname))
## The sizes of the resulting objects are about 240M and 160M,
## respectively:
object.size(irl1)
object.size(irl2)
}
Run the code above in your browser using DataLab