## 1. Simple text files
TEXTfile <- tempfile("exdna", fileext = ".txt")
## 1a. Extract from data(woodmouse) in sequential format:
cat("3 40",
"No305 NTTCGAAAAACACACCCACTACTAAAANTTATCAGTCACT",
"No304 ATTCGAAAAACACACCCACTACTAAAAATTATCAACCACT",
"No306 ATTCGAAAAACACACCCACTACTAAAAATTATCAATCACT",
file = TEXTfile, sep = "\n")
ex.dna <- read.dna(TEXTfile, format = "sequential")
str(ex.dna)
ex.dna
## 1b. The same data in interleaved format, ...
cat("3 40",
"No305 NTTCGAAAAA CACACCCACT",
"No304 ATTCGAAAAA CACACCCACT",
"No306 ATTCGAAAAA CACACCCACT",
" ACTAAAANTT ATCAGTCACT",
" ACTAAAAATT ATCAACCACT",
" ACTAAAAATT ATCAATCACT",
file = TEXTfile, sep = "\n")
ex.dna2 <- read.dna(TEXTfile)
## 1c. ... in clustal format, ...
cat("CLUSTAL (ape) multiple sequence alignment", "",
"No305 NTTCGAAAAACACACCCACTACTAAAANTTATCAGTCACT",
"No304 ATTCGAAAAACACACCCACTACTAAAAATTATCAACCACT",
"No306 ATTCGAAAAACACACCCACTACTAAAAATTATCAATCACT",
" ************************** ****** ****",
file = TEXTfile, sep = "\n")
ex.dna3 <- read.dna(TEXTfile, format = "clustal")
## 1d. ... and in FASTA format
FASTAfile <- tempfile("exdna", fileext = ".fas")
cat(">No305",
"NTTCGAAAAACACACCCACTACTAAAANTTATCAGTCACT",
">No304",
"ATTCGAAAAACACACCCACTACTAAAAATTATCAACCACT",
">No306",
"ATTCGAAAAACACACCCACTACTAAAAATTATCAATCACT",
file = FASTAfile, sep = "\n")
ex.dna4 <- read.dna(FASTAfile, format = "fasta")
## The 4 data objects are the same:
identical(ex.dna, ex.dna2)
identical(ex.dna, ex.dna3)
identical(ex.dna, ex.dna4)
## 2. How to read GZ compressed files
## create a GZ file and open a connection:
GZfile <- tempfile("exdna", fileext = ".fas.gz")
con <- gzfile(GZfile, "wt")
## write the data using the connection:
cat(">No305", "NTTCGAAAAACACACCCACTACTAAAANTTATCAGTCACT",
">No304", "ATTCGAAAAACACACCCACTACTAAAAATTATCAACCACT",
">No306", "ATTCGAAAAACACACCCACTACTAAAAATTATCAATCACT",
file = con, sep = "\n")
close(con) # close the connection
## read the GZ'ed file:
ex.dna5 <- read.dna(gzfile(GZfile), "fasta")
## This example is with a FASTA file but this works as well
## with the other formats described above.
## All 5 data objects are identical:
identical(ex.dna, ex.dna5)
unlink(c(TEXTfile, FASTAfile, GZfile)) # clean-up
if (FALSE) {
## 3. How to read files from a ZIP archive
## NOTE: since ape 5.7-1, all files in these examples are written
## in the temporary directory, thus the following commands work
## best when run in the user's working directory.
## write the woodmouse data in a FASTA file:
data(woodmouse)
write.dna(woodmouse, "woodmouse.fas", "fasta")
## archive a FASTA file in a ZIP file:
zip("myarchive.zip", "woodmouse.fas")
## Note: the file myarchive.zip is created if necessary
## Read the FASTA file from the ZIP archive without extraction:
wood2 <- read.dna(unz("myarchive.zip", "woodmouse.fas"), "fasta")
## Alternatively, unzip the archive:
fl <- unzip("myarchive.zip")
## the previous command eventually creates locally
## the fullpath archived with 'woodmouse.fas'
wood3 <- read.dna(fl, "fasta")
identical(woodmouse, wood2)
identical(woodmouse, wood3)
}
## read a FASTQ file from 1000 Genomes:
if (FALSE) {
a <- "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/HG00096/sequence_read/"
file <- "SRR062641.filt.fastq.gz"
URL <- paste0(a, file)
download.file(URL, file)
## If the above command doesn't work, you may copy/paste URL in
## a Web browser instead.
X <- read.fastq(file)
X # 109,811 sequences
## get the qualities of the first sequence:
(qual1 <- attr(X, "QUAL")[[1]])
## the corresponding probabilities:
10^(-qual1/10)
## get the mean quality for each sequence:
mean.qual <- sapply(attr(X, "Q"), mean)
## can do the same for var, sd, ...
}
Run the code above in your browser using DataLab