## ---------------------------------------------------------------------
## A. readGAlignments()
## ---------------------------------------------------------------------
## Simple use:
bamfile <- system.file("extdata", "ex1.bam", package="Rsamtools",
mustWork=TRUE)
gal1 <- readGAlignments(bamfile)
gal1
names(gal1)
## Using the 'use.names' arg:
gal2 <- readGAlignments(bamfile, use.names=TRUE)
gal2
head(names(gal2))
## Using the 'param' arg to drop PCR or optical duplicates as well as
## secondary alignments, and to load additional BAM fields:
param <- ScanBamParam(flag=scanBamFlag(isDuplicate=FALSE,
isSecondaryAlignment=FALSE),
what=c("qual", "flag"))
gal3 <- readGAlignments(bamfile, param=param)
gal3
mcols(gal3)
## Using the 'param' arg to load alignments from particular regions.
which <- RangesList(seq1=IRanges(1000, 1100),
seq2=IRanges(c(1546, 1555, 1567), width=10))
param <- ScanBamParam(which=which)
gal4 <- readGAlignments(bamfile, use.names=TRUE, param=param)
gal4
## IMPORTANT NOTE: A given record is loaded one time for each region
## it overlaps with. We call this "duplicated record selection" (this
## is a scanBam() feature, readGAlignments() is based on scanBam()):
which <- IRangesList(seq2=IRanges(c(1555, 1567), width=10))
param <- ScanBamParam(which=which)
gal5 <- readGAlignments(bamfile, use.names=TRUE, param=param)
gal5 # record EAS114_26:7:37:79:581 was loaded twice
## This becomes clearer if we use 'with.which_label=TRUE' to identify
## the region in 'which' where each element in 'gal5' originates from.
gal5 <- readGAlignments(bamfile, use.names=TRUE, param=param,
with.which_label=TRUE)
gal5
## Not surprisingly, we also get "duplicated record selection" when
## 'which' contains repeated or overlapping regions. Using the same
## regions as we did for 'gal4' above, except that now we're
## repeating the region on seq1:
which <- RangesList(seq1=rep(IRanges(1000, 1100), 2),
seq2=IRanges(c(1546, 1555, 1567), width=10))
param <- ScanBamParam(which=which)
gal4b <- readGAlignments(bamfile, use.names=TRUE, param=param)
length(gal4b) # > length(gal4), because all the records overlapping
# with bases 1000 to 1100 on seq1 are now duplicated
## The "duplicated record selection" will artificially increase the
## coverage or affect other downstream results. It can be mitigated
## (but not completely eliminated) by first "reducing" the set of
## regions:
which <- reduce(which)
which
param <- ScanBamParam(which=which)
gal4c <- readGAlignments(bamfile, use.names=TRUE, param=param)
length(gal4c) # < length(gal4), because the 2 first original regions
# on seq2 were merged into a single one
## Note that reducing the set of regions didn't completely eliminate
## "duplicated record selection". Records that overlap the 2 reduced
## regions on seq2 (which$seq2) are loaded twice (like for 'gal5'
## above). See example D. below for how to completely eliminate
## "duplicated record selection".
## Using the 'param' arg to load tags. Except for MF and Aq, the tags
## specified below are predefined tags (see the SAM Spec for the list
## of predefined tags and their meaning).
param <- ScanBamParam(tag=c("MF", "Aq", "NM", "UQ", "H0", "H1"),
what="isize")
gal6 <- readGAlignments(bamfile, param=param)
mcols(gal6) # "tag" cols always after "what" cols
## With a BamViews object:
fls <- system.file("extdata", "ex1.bam", package="Rsamtools",
mustWork=TRUE)
bv <- BamViews(fls,
bamSamples=DataFrame(info="test", row.names="ex1"),
auto.range=TRUE)
## Note that the "readGAlignments" method for BamViews objects
## requires the ShortRead package to be installed.
aln <- readGAlignments(bv)
aln
aln[[1]]
aln[colnames(bv)]
mcols(aln)
## ---------------------------------------------------------------------
## B. readGAlignmentPairs()
## ---------------------------------------------------------------------
galp1 <- readGAlignmentPairs(bamfile)
head(galp1)
names(galp1)
## Here we use the 'param' arg to filter by proper pair, drop PCR / optical
## duplicates, and drop secondary alignments. Filtering by proper pair and
## dropping secondary alignments can help make the pairing algorithm run
## significantly faster:
param <- ScanBamParam(flag=scanBamFlag(isProperPair=TRUE,
isDuplicate=FALSE,
isSecondaryAlignment=FALSE))
galp2 <- readGAlignmentPairs(bamfile, use.names=TRUE, param=param)
galp2
head(galp2)
head(names(galp2))
## ---------------------------------------------------------------------
## C. readGAlignmentsList()
## ---------------------------------------------------------------------
library(pasillaBamSubset)
## 'file' as character.
bam <- untreated3_chr4()
galist1 <- readGAlignmentsList(bam)
galist1[1:3]
length(galist1)
table(elementNROWS(galist1))
## When 'file' is a BamFile, 'asMates' must be TRUE. If FALSE,
## the data are treated as single-end and each list element of the
## GAlignmentsList will be of length 1. For single-end data
## use readGAlignments().
bamfile <- BamFile(bam, yieldSize=3, asMates=TRUE)
readGAlignmentsList(bamfile)
## Use a 'param' to fine tune the results.
param <- ScanBamParam(flag=scanBamFlag(isProperPair=TRUE))
galist2 <- readGAlignmentsList(bam, param=param)
length(galist2)
## ---------------------------------------------------------------------
## D. COMPARING 4 STRATEGIES FOR LOADING THE ALIGNMENTS THAT OVERLAP
## WITH THE EXONIC REGIONS ON FLY CHROMMOSOME 4
## ---------------------------------------------------------------------
library(pasillaBamSubset)
bam <- untreated1_chr4()
library(TxDb.Dmelanogaster.UCSC.dm3.ensGene)
txdb <- TxDb.Dmelanogaster.UCSC.dm3.ensGene
ex <- exons(txdb)
seqlevels(ex, force=TRUE) <- "chr4"
length(ex)
## Some of the exons overlap with each other:
isDisjoint(ex) # FALSE
exonic_regions <- reduce(ex)
isDisjoint(exonic_regions) # no more overlaps
length(exonic_regions)
## Strategy #1: slow and loads a lot of records more than once (see
## "duplicated record selection" in example A. above).
param1 <- ScanBamParam(which=ex)
gal1 <- readGAlignments(bam, param=param1)
length(gal1) # many "duplicated records"
## Strategy #2: faster and generates less duplicated records but
## doesn't eliminate them.
param2 <- ScanBamParam(which=exonic_regions)
gal2 <- readGAlignments(bam, param=param2)
length(gal2) # less "duplicated records"
## Strategy #3: fast and completely eliminates duplicated records.
gal0 <- readGAlignments(bam)
gal3 <- subsetByOverlaps(gal0, exonic_regions, ignore.strand=TRUE)
length(gal3) # no "duplicated records"
## Note that, in this case using 'exonic_regions' or 'ex' makes no
## difference:
gal3b <- subsetByOverlaps(gal0, ex, ignore.strand=TRUE)
stopifnot(identical(gal3, gal3b))
## Strategy #4: strategy #3 however can require a lot of memory if the
## file is big because we load all the alignments into memory before we
## select those that overlap with the exonic regions. Strategy #4
## addresses this by loading the file by chunks.
bamfile <- BamFile(bam, yieldSize=50000)
open(bamfile)
while (length(chunk0 <- readGAlignments(bamfile))) {
chunk <- subsetByOverlaps(chunk0, ex, ignore.strand=TRUE)
cat("chunk0:", length(chunk0), "- chunk:", length(chunk), "\n")
## ... do something with 'chunk' ...
}
close(bamfile)
## ---------------------------------------------------------------------
## E. readGappedReads()
## ---------------------------------------------------------------------
greads1 <- readGappedReads(bamfile)
greads1
names(greads1)
qseq(greads1)
greads2 <- readGappedReads(bamfile, use.names=TRUE)
head(greads2)
head(names(greads2))
Run the code above in your browser using DataLab