## simulate a sample of 110 sequences by drawing rows of 'woodmouse'
## at random, with replacement:
data(woodmouse)
x <- woodmouse[
  sample(15, size = 110, replace = TRUE),
]
h <- haplotype(x)
print(h)
## indices of the individuals assigned to the 1st haplotype:
attr(h, "index")[[1]]
plot(sort(h))
## haplotype frequencies as a vector named by the haplotype labels:
structure(lengths(attr(h, "index")), names = labels(h))
## data posted by Hirra Farooq on r-sig-phylo (see link above).
## The alignment is written to a temporary file so that the example
## never creates, overwrites, or deletes a file in the working directory:
fasta_file <- tempfile(fileext = ".fas")
cat(">[A]\nCCCGATTTTATATCAACATTTATTT------",
    ">[D]\nCCCGATTTT----------------------",
    ">[B]\nCCCGATTTTATATCAACATTTATTT------",
    ">[C]\nCCCGATTTTATATCACCATTTATTTTGATTT",
    file = fasta_file, sep = "\n")
## argument names are spelled out in full rather than relying on
## partial matching ("f" -> "fasta", p -> pairwise.deletion):
x <- read.dna(fasta_file, format = "fasta")
unlink(fasta_file)
## show the sequences and the distances:
alview(x)
dist.dna(x, model = "N", pairwise.deletion = TRUE)
## by default there are 3 haplotypes with a warning about ambiguity:
haplotype(x)
## the same 3 haplotypes without warning:
haplotype(x, strict = TRUE)
## if we remove the last sequence there is, by default, a single haplotype:
haplotype(x[-4, ])
## to get two haplotypes separately as with the complete data:
haplotype(x[-4, ], strict = TRUE)
## a smaller case: 3 sequences (rows) by 2 sites (columns), identical
## at the first site and differing only at the second one:
y <- as.DNAbin(rbind(
  c("A", "A"),
  c("A", "R"),
  c("A", "-")
))
haplotype(y) # 1 haplotype
haplotype(y, strict = TRUE) # 3 haplotypes
haplotype(y, trailingGapsAsN = FALSE) # 2 haplotypes
## a trickier case: 4 sequences observed at a single site
z <- as.DNAbin(cbind(c("Y", "A", "R", "N")))
alview(z, showpos = FALSE)
## only one haplotype is found:
haplotype(z)
## Explanation: 'Y' is at zero distance from 'N' (and from no other
## sequence), so these two are pooled first. At a later iteration of
## the pooling step, 'N' is at zero distance from 'R' (and ultimately
## from 'A'), so those are pooled with them as well.
##
## With a different ordering of the sequences, 'Y' and 'A' end up in
## separate haplotypes:
haplotype(z[c(4, 1, 2, 3), ])
## Run the code above in your browser using DataLab