# Attach the packages this script relies on. library() stops with an error
# when a package is missing, whereas require() only warns and returns FALSE,
# which would postpone the failure to the first call that needs the package.
library(SDaA)      # agpop, agsrs and agstrat data sets
library(StatMatch) # RANDwNND.hotdeck() and create.fused(), used further down

# Load the agricultural data sets shipped with SDaA and inspect their structure.
data(agpop, agsrs, agstrat, package = "SDaA")
str(agpop)
str(agsrs)
str(agstrat)

# Add the variable "region" to agsrs.
# agsrs has no "region" column, so a state -> region lookup table is derived
# from agstrat: cross-tabulate state by region, keep only the combinations
# that actually occur (Freq > 0), then left-join the lookup onto agsrs.
state.region <- data.frame(xtabs(weight ~ state + region, data = agstrat))
state.region <- subset(state.region, Freq > 0)
# Select the lookup columns by name rather than by position; all.x = TRUE
# keeps every agsrs record even when its state has no match in the lookup.
agsrs <- merge(agsrs, state.region[, c("state", "region")],
               by = "state", all.x = TRUE)
# Simulate a statistical matching framework with two files that share
# "region", "acres82" and "farms82":
#   A (built from agsrs)   additionally holds the 1987 variables;
#   B (built from agstrat) additionally holds the 1992 variables and "weight".
A <- agsrs[c("region", "acres82", "farms82", "acres87", "farms87")]
B <- agstrat[c("region", "acres82", "farms82", "acres92", "farms92", "weight")]
# Example 1: pick, for every record of A, a donor at random among its closest
# donors in B. cut.don is left at its default (the "rot" rule); distances are
# computed on "acres82" and "farms82".
out.NND.1 <- RANDwNND.hotdeck(
  data.rec = A,
  data.don = B,
  match.vars = c("acres82", "farms82")
)

# Build the synthetic (fused) data.frame: the donors' "acres92" and "farms92"
# are filled in A according to the recipient-donor pairs found above.
fused.1 <- create.fused(
  data.rec = A,
  data.don = B,
  mtc.ids = out.NND.1$mtc.ids,
  z.vars = c("acres92", "farms92")
)
head(fused.1)
# Example 2: same search as before (default cut.don = "rot", distances on
# "acres82" and "farms82"), but the donors' "weight" variable is now used
# when selecting the donor.
out.NND.2 <- RANDwNND.hotdeck(
  data.rec = A,
  data.don = B,
  match.vars = c("acres82", "farms82"),
  weight.don = "weight"
)

# Fused file: add the donors' "acres92" and "farms92" to A.
fused.2 <- create.fused(
  data.rec = A,
  data.don = B,
  mtc.ids = out.NND.2$mtc.ids,
  z.vars = c("acres92", "farms92")
)
head(fused.2)
# Example 3: as before, but with a different criterion to reduce the number
# of donors: the closest half (cut.don = "span", k = 0.5) of the available
# donors is retained, then one donor is chosen with probability proportional
# to its weight.
out.NND.3 <- RANDwNND.hotdeck(
  data.rec = A,
  data.don = B,
  match.vars = c("acres82", "farms82"),
  cut.don = "span",
  k = 0.5,
  weight.don = "weight"
)

# Fused file: add the donors' "acres92" and "farms92" to A.
fused.3 <- create.fused(
  data.rec = A,
  data.don = B,
  mtc.ids = out.NND.3$mtc.ids,
  z.vars = c("acres92", "farms92")
)
head(fused.3)
# Example 4: as before, but the subset of closest donors is made of exactly
# the k = 5 nearest ones (cut.don = "exact"); the donors' weights are still
# used in the selection.
out.NND.4 <- RANDwNND.hotdeck(
  data.rec = A,
  data.don = B,
  match.vars = c("acres82", "farms82"),
  cut.don = "exact",
  k = 5,
  weight.don = "weight"
)

# Fused file: add the donors' "acres92" and "farms92" to A.
fused.4 <- create.fused(
  data.rec = A,
  data.don = B,
  mtc.ids = out.NND.4$mtc.ids,
  z.vars = c("acres92", "farms92")
)
head(fused.4)
# Example 5: default donor subset (cut.don = "rot") with distances on
# "acres82" and "farms82", restricting the search to donors that belong to
# the same "region" as the recipient (don.class = "region").
out.NND.5 <- RANDwNND.hotdeck(
  data.rec = A,
  data.don = B,
  don.class = "region",
  match.vars = c("acres82", "farms82")
)

# Fused file: besides "acres92" and "farms92", dup.x = TRUE together with
# match.vars = "region" asks create.fused() to also carry over the donors'
# values of "region".
fused.5 <- create.fused(
  data.rec = A,
  data.don = B,
  mtc.ids = out.NND.5$mtc.ids,
  z.vars = c("acres92", "farms92"),
  dup.x = TRUE,
  match.vars = "region"
)
head(fused.5)
# Example of imputation of missing values.
# Introduce missing values in a copy of iris: each row is flagged with
# probability 0.3 and its "Sepal.Length" is set to NA.
ir.mat <- iris
miss <- rbinom(n = nrow(iris), size = 1, prob = 0.3)
ir.mat$Sepal.Length[miss == 1] <- NA
# Recipients: the flagged rows, without the (now missing) Sepal.Length column.
iris.rec <- subset(ir.mat, miss == 1, select = -Sepal.Length)
# Donors: the untouched rows, with all the columns.
iris.don <- subset(ir.mat, miss == 0)
# Search for donors: distances are computed on the three fully observed
# measurements, and only donors of the same "Species" are considered.
imp.NND <- RANDwNND.hotdeck(
  data.rec = iris.rec,
  data.don = iris.don,
  match.vars = c("Sepal.Width", "Petal.Length", "Petal.Width"),
  don.class = "Species"
)

# Impute the missing values: take "Sepal.Length" from the selected donors.
iris.rec.imp <- create.fused(
  data.rec = iris.rec,
  data.don = iris.don,
  mtc.ids = imp.NND$mtc.ids,
  z.vars = "Sepal.Length"
)

# Rebuild the complete data.frame: imputed recipients stacked on the donors.
final <- rbind(iris.rec.imp, iris.don)
# Run the code above in your browser using DataLab