# open an example dataset (HapMap)
genofile <- snpgdsOpen(snpgdsExampleFileName())
# run PCA
RV <- snpgdsPCA(genofile)
# eigenvalues
head(RV$eigenval)
# variance proportion (%)
head(round(RV$varprop*100, 2))
# [1] 12.23 5.84 1.01 0.95 0.84 0.74
#### there is no population information ####
# make a data.frame
tab <- data.frame(sample.id = RV$sample.id,
EV1 = RV$eigenvect[,1], # the first eigenvector
EV2 = RV$eigenvect[,2], # the second eigenvector
stringsAsFactors = FALSE)
head(tab)
# sample.id EV1 EV2
# 1 NA19152 -0.08411287 -0.01226860
# 2 NA19139 -0.08360644 -0.01085849
# 3 NA18912 -0.08110808 -0.01184524
# 4 NA19160 -0.08680864 -0.01447106
# 5 NA07034 0.03109761 0.07709255
# 6 NA07055 0.03228450 0.08155730
# draw
plot(tab$EV2, tab$EV1, xlab="eigenvector 2", ylab="eigenvector 1")
#### there are population information ####
# get population information
# or pop_code <- scan("pop.txt", what=character())
# if it is stored in a text file "pop.txt"
pop_code <- read.gdsn(index.gdsn(genofile, "sample.annot/pop.group"))
# get sample id
samp.id <- read.gdsn(index.gdsn(genofile, "sample.id"))
# assume the order of sample IDs is as the same as population codes
cbind(samp.id, pop_code)
# samp.id pop_code
# [1,] "NA19152" "YRI"
# [2,] "NA19139" "YRI"
# [3,] "NA18912" "YRI"
# [4,] "NA19160" "YRI"
# [5,] "NA07034" "CEU"
# ...
# make a data.frame
tab <- data.frame(sample.id = RV$sample.id,
pop = factor(pop_code)[match(RV$sample.id, samp.id)],
EV1 = RV$eigenvect[,1], # the first eigenvector
EV2 = RV$eigenvect[,2], # the second eigenvector
stringsAsFactors = FALSE)
head(tab)
# sample.id pop EV1 EV2
# 1 NA19152 YRI -0.08411287 -0.01226860
# 2 NA19139 YRI -0.08360644 -0.01085849
# 3 NA18912 YRI -0.08110808 -0.01184524
# 4 NA19160 YRI -0.08680864 -0.01447106
# 5 NA07034 CEU 0.03109761 0.07709255
# 6 NA07055 CEU 0.03228450 0.08155730
# draw
plot(tab$EV2, tab$EV1, col=as.integer(tab$pop),
xlab="eigenvector 2", ylab="eigenvector 1")
legend("bottomright", legend=levels(tab$pop), pch="o", col=1:4)
# close the file
snpgdsClose(genofile)
Run the code above in your browser using DataLab