## ---------------------------------------------------------------------
## A. READ/WRITE FASTA FILES
## ---------------------------------------------------------------------
## Read a non-compressed FASTA files:
filepath1 <- system.file("extdata", "someORF.fa", package="Biostrings")
fasta.seqlengths(filepath1, seqtype="DNA")
x1 <- readDNAStringSet(filepath1)
x1
## Read a gzip-compressed FASTA file:
filepath2 <- system.file("extdata", "someORF.fa.gz", package="Biostrings")
fasta.seqlengths(filepath2, seqtype="DNA")
x2 <- readDNAStringSet(filepath2)
x2
## Sanity check:
stopifnot(identical(as.character(x1), as.character(x2)))
## Read 2 FASTA files at once:
filepath3 <- system.file("extdata", "fastaEx.fa", package="Biostrings")
fasta.seqlengths(c(filepath2, filepath3), seqtype="DNA")
x23 <- readDNAStringSet(c(filepath2, filepath3))
x23
## Sanity check:
x3 <- readDNAStringSet(filepath3)
stopifnot(identical(as.character(x23), as.character(c(x2, x3))))
## Use a FASTA index to load only an arbitrary subset of sequences:
filepath4 <- system.file("extdata", "dm3_upstream2000.fa.gz",
package="Biostrings")
fai <- fasta.index(filepath4, seqtype="DNA")
head(fai)
head(fai$desc)
i <- sample(nrow(fai), 10) # randomly pick up 10 sequences
x4 <- readDNAStringSet(fai[i, ])
## Sanity check:
stopifnot(identical(as.character(readDNAStringSet(filepath4)[i]),
as.character(x4)))
## Write FASTA files:
out23a <- tempfile()
writeXStringSet(x23, out23a)
out23b <- tempfile()
writeXStringSet(x23, out23b, compress=TRUE)
file.info(c(out23a, out23b))$size
## Sanity checks:
stopifnot(identical(as.character(readDNAStringSet(out23a)),
as.character(x23)))
stopifnot(identical(readLines(out23a), readLines(out23b)))
## ---------------------------------------------------------------------
## B. READ/WRITE FASTQ FILES
## ---------------------------------------------------------------------
filepath <- system.file("extdata", "s_1_sequence.txt",
package="Biostrings")
fastq.geometry(filepath)
readDNAStringSet(filepath, format="fastq")
library(BSgenome.Celegans.UCSC.ce2)
## Create a "sliding window" on chr I:
sw_start <- seq.int(1, length(Celegans$chrI)-50, by=50)
sw <- Views(Celegans$chrI, start=sw_start, width=10)
my_fake_shortreads <- as(sw, "XStringSet")
my_fake_ids <- sprintf("ID%06d", seq_len(length(my_fake_shortreads)))
names(my_fake_shortreads) <- my_fake_ids
my_fake_shortreads
## Fake quality ';' will be assigned to each base in 'my_fake_shortreads':
out2 <- tempfile()
writeXStringSet(my_fake_shortreads, out2, format="fastq")
## Passing qualities thru the 'qualities' argument:
my_fake_quals <- rep.int(BStringSet("DCBA@?>=<;"),
length(my_fake_shortreads))
my_fake_quals
out3 <- tempfile()
writeXStringSet(my_fake_shortreads, out3, format="fastq",
qualities=my_fake_quals)
## ---------------------------------------------------------------------
## C. SERIALIZATION
## ---------------------------------------------------------------------
saveXStringSet(my_fake_shortreads, "my_fake_shortreads", dirpath=tempdir())
Run the code above in your browser using DataLab