## EXAMPLE 1 (from Levy and Lemeshow p 292):
## We intend to conduct a survey of nurse practitioners to estimate the 
## average number of patients seen by each nurse. There are five health
## centres in the study area, each with three nurses. We intend to sample
## two nurses from each health centre. We would like to be 95% confident
## that our estimate is within 30% of the true population value. We expect 
## that the mean number of patients seen at the health centre level 
## is 84 (var 567) and the mean number of patients seen at the nurse 
## level is 28 (var 160). How many health centres should be sampled?
tn <- c(5, 3); tmean <- c(84, 28); tsigma2.x <- c(567, 160)
epi.cluster2size(nbar = 2, n = tn, mean = tmean, sigma2.x = tsigma2.x, 
   sigma2.y = NA, sigma2.xy = NA, epsilon.r = 0.3, method = "mean", 
   conf.level = 0.95)
## Three health centres need to be sampled to meet the survey 
## specifications.
## EXAMPLE 2 (from Levy and Lemeshow p 294):
## Same scenario as above, but this time we want to estimate the proportion
## of patients referred to a general practitioner from each clinic. As before, 
## we want to be 95% confident that our estimate of the proportion of referred 
## patients is within 30% of the true population value. We expect that 
## approximately 36% of patients are referred.
## On page 295 Levy and Lemeshow state that the parameters sigma2.x, sigma2.y
## and sigma2.xy are rarely known in advance and must be either estimated
## or guessed from experience or intuition. In this example (for 
## demonstration) we use the actual patient data to calculate sigma2.x, 
## sigma2.y and sigma2.xy.
## Nurse-level data. The following code reproduces Table 10.4 of Levy and 
## Lemeshow (page 293).
clinic <- rep(1:5, each = 3)
nurse <- 1:15
Xij <- c(58,44,18,42,53,10,13,18,37,16,32,10,25,23,23)
Yij <- c(5,6,6,3,19,2,12,6,30,5,14,4,17,9,14)
ssudat <- data.frame(clinic, nurse, Xij, Yij)
Xbar <- by(data = ssudat$Xij, INDICES = ssudat$clinic, FUN = mean)
ssudat$Xbar <- rep(Xbar, each = 3)
Ybar <- by(data = ssudat$Yij, INDICES = ssudat$clinic, FUN = mean)
ssudat$Ybar <- rep(Ybar, each = 3)
ssudat$Xij.Xbar <- (ssudat$Xij - ssudat$Xbar)^2
ssudat$Yij.Ybar <- (ssudat$Yij - ssudat$Ybar)^2
ssudat$XY <- (ssudat$Xij - ssudat$Xbar) * (ssudat$Yij - ssudat$Ybar)
## Collapse the nurse-level data (created above) to the clinic level. 
## The following code reproduces Table 10.3 of Levy and Lemeshow (page 292). 
clinic <- as.vector(by(ssudat$clinic, INDICES = ssudat$clinic, FUN = min))
Xi <- as.vector(by(ssudat$Xij, INDICES = ssudat$clinic, FUN = sum))
Yi <- as.vector(by(ssudat$Yij, INDICES = ssudat$clinic, FUN = sum))
psudat <- data.frame(clinic, Xi, Yi)
psudat$Xi.Xbar <- (psudat$Xi - mean(psudat$Xi))^2
psudat$Yi.Ybar <- (psudat$Yi - mean(psudat$Yi))^2
psudat$XY <- (psudat$Xi - mean(psudat$Xi)) * (psudat$Yi - mean(psudat$Yi))
## Number of primary and secondary sampling units:
npsu <- nrow(psudat)
nssu <- mean(by(ssudat$nurse, INDICES = ssudat$clinic, FUN = length))
tn <- c(npsu, nssu)
## Mean of X at primary sampling unit and secondary sampling unit level:
tmean <- c(mean(psudat$Xi), mean(ssudat$Xij))
## Variance of number of patients seen:
tsigma2.x <- c(mean(psudat$Xi.Xbar), mean(ssudat$Xij.Xbar))
## Variance of number of patients referred:
tsigma2.y <- c(mean(psudat$Yi.Ybar), mean(ssudat$Yij.Ybar))
tsigma2.xy <- c(mean(psudat$XY), mean(ssudat$XY))
epi.cluster2size(nbar = 2, R = 0.36, n = tn, mean = tmean, 
   sigma2.x = tsigma2.x, sigma2.y = tsigma2.y, sigma2.xy = tsigma2.xy, 
   epsilon.r = 0.3, method = "proportion", conf.level = 0.95)
## Two health centres need to be sampled to meet the survey 
## specifications.
## EXAMPLE 3:
## We want to determine the prevalence of brucellosis in dairy cattle in a
## country comprised of 20 provinces. The number of dairy herds per province 
## ranges from 50 to 1200. Herd size ranges from 25 to 900. We suspect that
## the prevalence of brucellosis-positive herds across the entire country 
## is around 10%. We suspect that there are a small number of provinces 
## with a relatively high individual cow-level prevalence of disease 
## (thought to be between 40% and 80%). How many herds should be sampled 
## from each province if we want our estimate of prevalence to be within 
## 30% of the true population value?
epi.simplesize(N = 1200, Vsq = NA, Py = 0.10, epsilon.r = 0.30, 
   method = "proportion", conf.level = 0.95)
## A total of 234 herds should be sampled from each province.
## Next we work out the number of provinces that need to be sampled. 
## Again, we would like to be 95% confident that our estimate is within 
## 30% of the true population value. Simulate some data to derive appropriate
## estimates of sigma2.x, sigma2.y and sigma2.xy.
## Number of herds per province:
npsu <- 20
nherds.p <- as.integer(runif(n = npsu, min = 50, max = 1200))
## Mean herd size per province:
hsize.p <- as.integer(runif(n = npsu, min = 25, max = 900))
## Simulate estimates of the cow-level prevalence of brucellosis in each 
## province. Here we generate an equal mix of `low' and `high' brucellosis
## prevalence provinces:
prev.p <- c(runif(n = 15, min = 0, max = 0.05), 
   runif(n = 5, min = 0.40, max = 0.80)) 
## Generate some data:
prov <- c(); herd <- c(); 
Xij <- c(); Yij <- c(); 
Xbar <- c(); Ybar <- c();
Xij.Xbar <- c(); Yij.Ybar <- c()
for(i in 1:npsu){
   ## Province identifiers:
   tprov <- rep(i, times = nherds.p[i])
   prov <- c(prov, tprov)
   
   ## Herd identifiers:
   therd <- 1:nherds.p[i]
   herd <- c(herd, therd)
   
   ## Number of cows in each of the herds in this province:
   tXij <- as.integer(rlnorm(n = nherds.p[i], meanlog = log(hsize.p[i]), 
      sdlog = 0.5))
   tXbar <- mean(tXij)
   tXij.Xbar <- (tXij - tXbar)^2
   Xij <- c(Xij, tXij)
   Xbar <- c(Xbar, rep(tXbar, times = nherds.p[i]))
   Xij.Xbar <- c(Xij.Xbar, tXij.Xbar)   
   
   ## Number of brucellosis-positive cows in each herd:
   tYij <- c()
   for(j in 1:nherds.p[i]){ 
      ttYij <- rbinom(n = 1, size = tXij[j], prob = prev.p[i]) 
      tYij <- c(tYij, ttYij)
      }
    tYbar <- mean(tYij)
    tYij.Ybar <- (tYij - tYbar)^2
    Yij <- c(Yij, tYij)
    Ybar <- c(Ybar, rep(tYbar, times = nherds.p[i]))
    Yij.Ybar <- c(Yij.Ybar, tYij.Ybar)   
}
ssudat <- data.frame(prov, herd, Xij, Yij, Xbar, Ybar, Xij.Xbar, Yij.Ybar)
ssudat$XY <- (ssudat$Xij - ssudat$Xbar) * (ssudat$Yij - ssudat$Ybar)
## Collapse the herd-level data (created above) to the province level: 
prov <- as.vector(by(ssudat$prov, INDICES = ssudat$prov, FUN = min))
Xi <- as.vector(by(ssudat$Xij, INDICES = ssudat$prov, FUN = sum))
Yi <- as.vector(by(ssudat$Yij, INDICES = ssudat$prov, FUN = sum))
psudat <- data.frame(prov, Xi, Yi)
psudat$Xi.Xbar <- (psudat$Xi - mean(psudat$Xi))^2
psudat$Yi.Ybar <- (psudat$Yi - mean(psudat$Yi))^2
psudat$XY <- (psudat$Xi - mean(psudat$Xi)) * (psudat$Yi - mean(psudat$Yi))
## Number of primary and secondary sampling units:
npsu <- nrow(psudat)
nssu <- round(mean(by(ssudat$herd, INDICES = ssudat$prov, FUN = length)),
   digits = 0)
tn <- c(npsu, nssu)
## Mean of X at primary sampling unit and secondary sampling unit level:
tmean <- c(mean(psudat$Xi), mean(ssudat$Xij))
## Variance of herd size:
tsigma2.x <- c(mean(psudat$Xi.Xbar), mean(ssudat$Xij.Xbar))
## Variance of number of brucellosis-positive cows:
tsigma2.y <- c(mean(psudat$Yi.Ybar), mean(ssudat$Yij.Ybar))
tsigma2.xy <- c(mean(psudat$XY), mean(ssudat$XY))
## Finally, calculate the number of provinces to be sampled:
tR <- sum(psudat$Yi) / sum(psudat$Xi)
epi.cluster2size(nbar = 234, R = tR, n = tn, mean = tmean, 
   sigma2.x = tsigma2.x, sigma2.y = tsigma2.y, sigma2.xy = tsigma2.xy, 
   epsilon.r = 0.3, method = "proportion", conf.level = 0.95)
## Four provinces (sampling 234 herds from each) are required to be 95% 
## confident that our estimate of the individual animal prevalence of 
## brucellosis is within 30% of the true population value.
Run the code above in your browser using DataLab