fl <- system.file("extdata", "ex2.vcf", package="VariantAnnotation")
vcf <- readVcf(fl, "hg19")
## vcf <- readVcf(fl, c("20"="hg19")) ## 'genome' as named vector
## ---------------------------------------------------------------------
## Header and genome information
## ---------------------------------------------------------------------
vcf
## all header information
hdr <- header(vcf)
## header information for 'info' and 'fixed' fields
info(hdr)
fixed(hdr)
## ---------------------------------------------------------------------
## Accessors
## ---------------------------------------------------------------------
## fixed fields together
head(fixed(vcf), 5)
## fixed fields separately
filt(vcf)
ref(vcf)
## info data
info(hdr)
info(vcf)
info(vcf)$DP
## geno data
geno(hdr)
geno(vcf)
head(geno(vcf)$GT)
## genome
unique(genome(rowRanges(vcf)))
## ---------------------------------------------------------------------
## Data subsets with lightweight read* functions
## ---------------------------------------------------------------------
## Import a single 'info' or 'geno' variable
DP <- readInfo(fl, "DP")
HQ <- readGeno(fl, "HQ")
## Import GT as numeric representation
GT <- readGT(fl)
## Import GT as nucleotides
GT <- readGT(fl, nucleotides=TRUE)
## ---------------------------------------------------------------------
## Data subsets with ScanVcfParam
## ---------------------------------------------------------------------
## Subset on genome coordinates:
## 'file' must have a Tabix index
rngs <- GRanges("20", IRanges(c(14370, 1110000), c(17330, 1234600)))
names(rngs) <- c("geneA", "geneB")
param <- ScanVcfParam(which=rngs)
compressVcf <- bgzip(fl, tempfile())
idx <- indexTabix(compressVcf, "vcf")
tab <- TabixFile(compressVcf, idx)
vcf <- readVcf(tab, "hg19", param)
## When data are subset by range ('which' argument in ScanVcfParam),
## the 'paramRangeID' column provides a map back to the original
## range in 'param'.
rowRanges(vcf)[,"paramRangeID"]
vcfWhich(param)
## Subset on samples:
## Consult the header for the sample names.
samples(hdr)
## Specify one or more names in 'samples' in a ScanVcfParam.
param <- ScanVcfParam(samples="NA00002")
vcf <- readVcf(tab, "hg19", param)
geno(vcf)$GT
## Subset on 'fixed', 'info' or 'geno' fields:
param <- ScanVcfParam(fixed="ALT", geno=c("GT", "HQ"), info=c("NS", "AF"))
vcf_tab <- readVcf(tab, "hg19", param)
info(vcf_tab)
geno(vcf_tab)
## No ranges are specified in the 'param' so tabix file is not
## required. Instead, the uncompressed VCF can be used as 'file'.
vcf_fname <- readVcf(fl, "hg19", param)
## The header will always contain information for all variables
## in the original file reguardless of how the data were subset.
## For example, all 'geno' fields are listed in the header
geno(header(vcf_fname))
## but only 'GT' and 'HQ' are present in the VCF object.
geno(vcf_fname)
## Subset on both genome coordinates and 'info', 'geno' fields:
param <- ScanVcfParam(geno="HQ", info="AF", which=rngs)
vcf <- readVcf(tab, "hg19", param)
## When any of 'fixed', 'info' or 'geno' are omitted (i.e., no
## elements specified) all records are retrieved. Use NA to indicate
## that no records should be retrieved. This param specifies
## all 'fixed fields, the "GT" 'geno' field and none of 'info'.
ScanVcfParam(geno="GT", info=NA)
## ---------------------------------------------------------------------
## Iterate through VCF with 'yieldSize'
## ---------------------------------------------------------------------
fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation")
param <- ScanVcfParam(fixed="ALT", geno=c("GT", "GL"), info=c("LDAF"))
tab <- TabixFile(fl, yieldSize=4000)
open(tab)
while (nrow(vcf_yield <- readVcf(tab, "hg19", param=param)))
cat("vcf dim:", dim(vcf_yield), "\n")
close(tab)
## ---------------------------------------------------------------------
## Debugging with 'verbose'
## ---------------------------------------------------------------------
## readVcf() uses information in the header lines to parse the data to
## the correct number and type. Fields without header lines are skipped.
## If a call to readVcf() results in no info(VCF) or geno(VCF) data the
## file may be missing header lines. Set 'verbose = TRUE' to get
## a listing of fields found in the header.
## readVcf(myfile, "mygenome", verbose=TRUE)
## Header fields can also be discovered with scanVcfHeader().
hdr <- scanVcfHeader(fl)
geno(hdr)
Run the code above in your browser using DataLab