library(RNAseqData.HNRNPC.bam.chr14)
bamfile <- RNAseqData.HNRNPC.bam.chr14_BAMFILES[1]
## ---------------------------------------------------------------------
## A. junctions()
## ---------------------------------------------------------------------
gal <- readGAlignments(bamfile)
table(njunc(gal)) # some alignments have 3 junctions!
juncs <- junctions(gal)
juncs
stopifnot(identical(unname(elementNROWS(juncs)), njunc(gal)))
galp <- readGAlignmentPairs(bamfile)
juncs <- junctions(galp)
juncs
stopifnot(identical(unname(elementNROWS(juncs)), njunc(galp)))
## ---------------------------------------------------------------------
## B. summarizeJunctions()
## ---------------------------------------------------------------------
## By default, only the "score", "plus_score", and "minus_score"
## metadata columns are returned:
junc_summary <- summarizeJunctions(gal)
junc_summary
## The "score" metadata column reports the total number of alignments
## crossing each junction, i.e., that have the junction encoded in their
## CIGAR:
median(mcols(junc_summary)$score)
## The "plus_score" and "minus_score" metadata columns report the
## strand-specific number of alignments crossing each junction:
stopifnot(identical(mcols(junc_summary)$score,
mcols(junc_summary)$plus_score +
mcols(junc_summary)$minus_score))
## If 'with.revmap' is TRUE, the "revmap" metadata column is added to
## the output. This metadata column is an IntegerList object represen-
## ting the mapping from each element in the ouput (i.e. a junction) to
## the corresponding elements in the input 'x'. Here we're going to use
## this to compute a 'score2' for each junction. We obtain this score
## by summing the mapping qualities of the alignments crossing the
## junction:
gal <- readGAlignments(bamfile, param=ScanBamParam(what="mapq"))
junc_summary <- summarizeJunctions(gal, with.revmap=TRUE)
junc_score2 <- sum(extractList(mcols(gal)$mapq,
mcols(junc_summary)$revmap))
mcols(junc_summary)$score2 <- junc_score2
## If the name of the reference genome is specified thru the 'genome'
## argument (in which case the corresponding BSgenome data package needs
## to be installed), then summarizeJunctions() returns the intron motif
## and strand for each junction.
## Since the reads in RNAseqData.HNRNPC.bam.chr14 were aligned to
## the hg19 genome, the following requires that you have
## BSgenome.Hsapiens.UCSC.hg19 installed:
junc_summary <- summarizeJunctions(gal, with.revmap=TRUE, genome="hg19")
mcols(junc_summary)$score2 <- junc_score2 # putting 'score2' back
## The "intron_motif" metadata column is a factor whose levels are the
## 5 natural intron motifs stored in predefined character vector
## 'NATURAL_INTRON_MOTIFS':
table(mcols(junc_summary)$intron_motif)
## ---------------------------------------------------------------------
## C. STRANDED RNA-seq PROTOCOL
## ---------------------------------------------------------------------
## Here is a simple test for checking whether the RNA-seq protocol was
## stranded or not:
strandedTest <- function(plus_score, minus_score)
(sum(plus_score ^ 2) + sum(minus_score ^ 2)) /
sum((plus_score + minus_score) ^ 2)
## The result of this test is guaranteed to be >= 0.5 and <= 1.
## If, for each junction, the strand of the crossing alignments looks
## random (i.e. "plus_score" and "minus_score" are close), then
## strandedTest() will return a value close to 0.5. If it doesn't look
## random (i.e. for each junction, one of "plus_score" and "minus_score"
## is much bigger than the other), then strandedTest() will return a
## value close to 1.
## If the reads are single-end, the test is meaningful when applied
## directly on 'junc_summary'. However, for the test to be meaningful
## on paired-end reads, it needs to be applied on the first and last
## alignments separately:
junc_summary1 <- summarizeJunctions(first(galp))
junc_summary2 <- summarizeJunctions(last(galp))
strandedTest(mcols(junc_summary1)$plus_score,
mcols(junc_summary1)$minus_score)
strandedTest(mcols(junc_summary2)$plus_score,
mcols(junc_summary2)$minus_score)
## Both values are close to 0.5 which suggests that the RNA-seq protocol
## used for this experiment was not stranded.
## ---------------------------------------------------------------------
## D. UTILITIES FOR IMPORTING THE JUNCTION FILE GENERATED BY SOME
## ALIGNERS
## ---------------------------------------------------------------------
## The TopHat aligner generates a junctions.bed file where it reports
## all the junctions satisfying some "quality" criteria (see the TopHat
## manual at http://tophat.cbcb.umd.edu/manual.shtml for more
## information). This file can be loaded with readTopHatJunctions():
runname <- names(RNAseqData.HNRNPC.bam.chr14_BAMFILES)[1]
junctions_file <- system.file("extdata", "tophat2_out", runname,
"junctions.bed",
package="RNAseqData.HNRNPC.bam.chr14")
th_junctions <- readTopHatJunctions(junctions_file)
## Comparing the "TopHat junctions" with the result of
## summarizeJunctions():
th_junctions14 <- th_junctions
seqlevels(th_junctions14, force=TRUE) <- "chr14"
mcols(th_junctions14)$intron_strand <- strand(th_junctions14)
strand(th_junctions14) <- "*"
## All the "TopHat junctions" are in 'junc_summary':
stopifnot(all(th_junctions14 %in% junc_summary))
## But not all the junctions in 'junc_summary' are reported by TopHat
## (that's because TopHat reports only junctions that satisfy some
## "quality" criteria):
is_in_th_junctions14 <- junc_summary %in% th_junctions14
table(is_in_th_junctions14) # 32 junctions are not in TopHat's
# junctions.bed file
junc_summary2 <- junc_summary[is_in_th_junctions14]
## 'junc_summary2' and 'th_junctions14' contain the same junctions in
## the same order:
stopifnot(all(junc_summary2 == th_junctions14))
## Let's merge their metadata columns. We use our own version of
## merge() for this, which is stricter (it checks that the common
## columns are the same in the 2 data frames to merge) and also
## simpler:
merge2 <- function(df1, df2)
{
common_colnames <- intersect(colnames(df1), colnames(df2))
lapply(common_colnames,
function(colname)
stopifnot(all(df1[ , colname] == df2[ , colname])))
extra_mcolnames <- setdiff(colnames(df2), colnames(df1))
cbind(df1, df2[ , extra_mcolnames, drop=FALSE])
}
mcols(th_junctions14) <- merge2(mcols(th_junctions14),
mcols(junc_summary2))
## Here is a peculiar junction reported by TopHat:
idx0 <- which(mcols(th_junctions14)$score2 == 0L)
th_junctions14[idx0]
gal[mcols(th_junctions14)$revmap[[idx0]]]
## The junction is crossed by 5 alignments (score is 5), all of which
## have a mapping quality of 0!
Run the code above in your browser using DataLab