## Loading a BSgenome data package doesn't load its sequences
## into memory:
library(BSgenome.Celegans.UCSC.ce2)
## Number of sequences in this genome:
length(Celegans)
## Display a summary of the sequences:
Celegans
## Index of single sequences:
seqnames(Celegans)
## Lengths (i.e. number of nucleotides) of the single sequences:
seqlengths(Celegans)
## Load chromosome I from disk to memory (hence takes some time)
## and keep a reference to it:
chrI <- Celegans[["chrI"]] # equivalent to Celegans$chrI
chrI
class(chrI) # a DNAString instance
length(chrI) # with 15080483 nucleotides
## Single sequence can be renamed:
seqnames(Celegans) <- sub("^chr", "", seqnames(Celegans))
seqlengths(Celegans)
Celegans$I
seqnames(Celegans) <- paste0("chr", seqnames(Celegans))
## Multiple sequences:
library(BSgenome.Rnorvegicus.UCSC.rn5)
rn5 <- BSgenome.Rnorvegicus.UCSC.rn5
rn5
seqnames(rn5)
rn5_chr1 <- rn5$chr1
mseqnames(rn5)
rn5_random <- Rnorvegicus$random
rn5_random
class(rn5_random) # a DNAStringSet instance
## Character vector containing the description lines of the first
## 4 sequences in the original FASTA file:
names(rn5_random)[1:4]
## ---------------------------------------------------------------------
## PASS-BY-ADDRESS SEMANTIC, CACHING AND MEMORY USAGE
## ---------------------------------------------------------------------
## We want a message to be printed each time a sequence is removed
## from the cache:
options(verbose=TRUE)
gc() # nothing seems to be removed from the cache
rm(rn5_chr1, rn5_random)
gc() # rn5_chr1 and rn5_random are removed from the cache (they are
# not in use anymore)
options(verbose=FALSE)
## Get the current amount of data in memory (in Mb):
mem0 <- gc()["Vcells", "(Mb)"]
system.time(rn5_chr2 <- rn5$chr2) # read from disk
gc()["Vcells", "(Mb)"] - mem0 # 'rn5_chr2' occupies 20Mb in memory
system.time(tmp <- rn5$chr2) # much faster! (sequence
# is in the cache)
gc()["Vcells", "(Mb)"] - mem0 # we're still using 20Mb (sequences
# have a pass-by-address semantic
# i.e. the sequence data are not
# duplicated)
## subseq() doesn't copy the sequence data either, hence it is very
## fast and memory efficient (but the returned object will hold a
## reference to 'rn5_chr2'):
y <- subseq(rn5_chr2, 10, 8000000)
gc()["Vcells", "(Mb)"] - mem0
## We must remove all references to 'rn5_chr2' before it can be
## removed from the cache (so the 20Mb of memory used by this
## sequence are freed):
options(verbose=TRUE)
rm(rn5_chr2, tmp)
gc()
## Remember that 'y' holds a reference to 'rn5_chr2' too:
rm(y)
gc()
options(verbose=FALSE)
gc()["Vcells", "(Mb)"] - mem0
Run the code above in your browser using DataLab