# the file of VCF
vcf.fn <- seqExampleFileName("vcf")
vcf.fn
# or vcf.fn <- "C:/YourFolder/Your_VCF_File.vcf"
# parse the header
seqVCF_Header(vcf.fn)
# get sample id
seqVCF_SampID(vcf.fn)
# convert
seqVCF2GDS(vcf.fn, "tmp.gds")
seqSummary("tmp.gds")
# list the structure of GDS variables
f <- seqOpen("tmp.gds")
f
seqClose(f)
unlink("tmp.gds")
############################################################
# the GDS file
(gds.fn <- seqExampleFileName("gds"))
# display
(f <- seqOpen(gds.fn))
# get 'sample.id
(samp.id <- seqGetData(f, "sample.id"))
# "NA06984" "NA06985" "NA06986" ...
# get 'variant.id'
head(variant.id <- seqGetData(f, "variant.id"))
# get 'chromosome'
table(seqGetData(f, "chromosome"))
# get 'allele'
head(seqGetData(f, "allele"))
# "T,C" "G,A" "G,A" ...
# set sample and variant filters
seqSetFilter(f, sample.id=samp.id[c(2,4,6,8,10)])
set.seed(100)
seqSetFilter(f, variant.id=sample(variant.id, 10))
# get genotypic data
seqGetData(f, "genotype")
# get annotation/info/DP
seqGetData(f, "annotation/info/DP")
# get annotation/info/AA, a variable-length dataset
seqGetData(f, "annotation/info/AA")
# $length <- indicating the length of each variable-length data
# [1] 1 1 1 1 1 1 ...
# $data <- the data according to $length
# [1] "T" "C" "T" "C" "G" "C" ...
# get annotation/format/DP, a variable-length dataset
seqGetData(f, "annotation/format/DP")
# $length <- indicating the length of each variable-length data
# [1] 1 1 1 1 1 1 ...
# $data <- the data according to $length
# variant
# sample [,1] [,2] [,3] [,4] [,5] [,6] ...
# [1,] 25 25 22 3 4 17 ...
# read multiple variables variant by variant
seqApply(f, c(geno="genotype", phase="phase", qual="annotation/id"),
FUN=function(x) print(x), as.is="none")
# get the numbers of alleles per variant
seqApply(f, "allele",
FUN=function(x) length(unlist(strsplit(x,","))), as.is="integer")
################################################################
# remove the sample and variant filters
seqResetFilter(f)
# calculate the frequency of reference allele,
# a faster version could be obtained by C coding
af <- seqApply(f, "genotype", FUN=function(x) mean(x==0, na.rm=TRUE),
as.is="double")
length(af)
summary(af)
# close the GDS file
seqClose(f)
Run the code above in your browser using DataLab