## ---------------------------------------------------------------------
## A. SIMPLE EXAMPLES
## ---------------------------------------------------------------------
## Load the Caenorhabditis elegans genome (UCSC Release ce2):
library(BSgenome.Celegans.UCSC.ce2)
## Look at the index of sequences:
Celegans
## Get chromosome V as a DNAString object:
getSeq(Celegans, "chrV")
## which is in fact the same as doing:
Celegans$chrV
## Not run:
# ## Never try this:
# getSeq(Celegans, "chrV", as.character=TRUE)
# ## or this (even worse):
# getSeq(Celegans, as.character=TRUE)
# ## End(Not run)
## Get the first 20 bases of each chromosome:
getSeq(Celegans, end=20)
## Get the last 20 bases of each chromosome:
getSeq(Celegans, start=-20)
## ---------------------------------------------------------------------
## B. EXTRACTING SMALL SEQUENCES FROM DIFFERENT CHROMOSOMES
## ---------------------------------------------------------------------
myseqs <- data.frame(
chr=c("chrI", "chrX", "chrM", "chrM", "chrX", "chrI", "chrM", "chrI"),
start=c(NA, -40, 8510, 301, 30001, 9220500, -2804, -30),
end=c(50, NA, 8522, 324, 30011, 9220555, -2801, -11),
strand=c("+", "-", "+", "+", "-", "-", "+", "-")
)
getSeq(Celegans, myseqs$chr,
start=myseqs$start, end=myseqs$end)
getSeq(Celegans, myseqs$chr,
start=myseqs$start, end=myseqs$end, strand=myseqs$strand)
## ---------------------------------------------------------------------
## C. USING A GRanges OBJECT
## ---------------------------------------------------------------------
gr1 <- GRanges(seqnames=c("chrI", "chrI", "chrM"),
ranges=IRanges(start=101:103, width=9))
gr1 # all strand values are "*"
getSeq(Celegans, gr1) # treats strand values as if they were "+"
strand(gr1)[] <- "-"
getSeq(Celegans, gr1)
strand(gr1)[1] <- "+"
getSeq(Celegans, gr1)
strand(gr1)[2] <- "*"
if (interactive())
getSeq(Celegans, gr1) # Error: cannot mix "*" with other strand values
gr2 <- GRanges(seqnames=c("chrM", "NM_058280_up_1000"),
ranges=IRanges(start=103:102, width=9))
gr2
if (interactive()) {
## Because the sequence names are supplied via a GRanges object, they
## are not treated as regular expressions:
getSeq(Celegans, gr2) # Error: sequence NM_058280_up_1000 not found
}
## ---------------------------------------------------------------------
## D. USING A GRangesList OBJECT
## ---------------------------------------------------------------------
gr1 <- GRanges(seqnames=c("chrI", "chrII", "chrM", "chrII"),
ranges=IRanges(start=101:104, width=12),
strand="+")
gr2 <- shift(gr1, 5)
gr3 <- gr2
strand(gr3) <- "-"
grl <- GRangesList(gr1, gr2, gr3)
getSeq(Celegans, grl)
## ---------------------------------------------------------------------
## E. EXTRACTING A HIGH NUMBER OF RANDOM 40-MERS FROM A GENOME
## ---------------------------------------------------------------------
extractRandomReads <- function(x, density, readlength)
{
if (!is.integer(readlength))
readlength <- as.integer(readlength)
start <- lapply(seqnames(x),
function(name)
{
seqlength <- seqlengths(x)[name]
sample(seqlength - readlength + 1L,
seqlength * density,
replace=TRUE)
})
names <- rep.int(seqnames(x), elementNROWS(start))
ranges <- IRanges(start=unlist(start), width=readlength)
strand <- strand(sample(c("+", "-"), length(names), replace=TRUE))
gr <- GRanges(seqnames=names, ranges=ranges, strand=strand)
getSeq(x, gr)
}
## With a density of 1 read every 100 genome bases, the total number of
## extracted 40-mers is about 1 million:
rndreads <- extractRandomReads(Celegans, 0.01, 40)
## Notes:
## - The short sequences in 'rndreads' can be seen as the result of a
## simulated high-throughput sequencing experiment. A non-realistic
## one though because:
## (a) It assumes that the underlying technology is perfect (the
## generated reads have no technology induced errors).
## (b) It assumes that the sequenced genome is exactly the same as
## the reference genome.
## (c) The simulated reads can contain IUPAC ambiguity letters only
## because the reference genome contains them. In a real
## high-throughput sequencing experiment, the sequenced genome
## of course doesn't contain those letters, but the sequencer
## can introduce them in the generated reads to indicate
## ambiguous base-calling.
## - Those reads are coming from the plus and minus strands of the
## chromosomes.
## - With a density of 0.01 and the reads being only 40-base long, the
## average coverage of the genome is only 0.4 which is low. The total
## number of reads is about 1 million and it takes less than 10 sec.
## to generate them.
## - A higher coverage can be achieved by using a higher density and/or
## longer reads. For example, with a density of 0.1 and 100-base reads
## the average coverage is 10. The total number of reads is about 10
## millions and it takes less than 1 minute to generate them.
## - Those reads could easily be mapped back to the reference by using
## an efficient matching tool like matchPDict() for performing exact
## matching (see ?matchPDict for more information). Typically, a
## small percentage of the reads (4 to 5% in our case) will hit the
## reference at multiple locations. This is especially true for such
## short reads, and, in a lower proportion, is still true for longer
## reads, even for reads as long as 300 bases.
## ---------------------------------------------------------------------
## F. SEE THE BSgenome CACHE IN ACTION
## ---------------------------------------------------------------------
options(verbose=TRUE)
first20 <- getSeq(Celegans, end=20)
first20
gc()
stopifnot(length(ls(Celegans@.seqs_cache)) == 0L)
## One more gc() call is needed in order to see the amount of memory in
## used after all the chromosomes have been removed from the cache:
gc()
Run the code above in your browser using DataLab