data("adult")
## Find complete cases.
adult <- adult[complete.cases(adult), ]
## Map metric attributes.
adult[["Capital.Loss"]] <- ordered(cut(adult[["Capital.Loss"]], 2000))
adult[["Capital.Gain"]] <- ordered(cut(adult[["Capital.Gain"]], 2000))
## Show level attributes for binary and discrete variables.
levels(adult[["Type"]])
levels(adult[["Workclass"]])
levels(adult[["Education"]])
levels(adult[["Marital.Status"]])
levels(adult[["Occupation"]])
levels(adult[["Relationship"]])
levels(adult[["Race"]])
levels(adult[["Sex"]])
levels(adult[["Native.Country"]])
levels(adult[["Income"]])
## Replace levels with numbers.
adult <- as.data.frame(data.matrix(adult))
## Levels should start with 0 for discrete distributions except for the
## Dirac distribution.
f <- c("Type", "Workclass", "Education", "Marital.Status", "Occupation",
"Relationship", "Race", "Sex", "Native.Country", "Income")
adult[, f] <- adult[, f] - 1
## Split adult dataset into two train subsets for the two Incomes
## and remove Type and Income columns.
trainle50k <- subset(adult, subset = (Type == 1) & (Income == 0),
select = c(-Type, -Income))
traingt50k <- subset(adult, subset = (Type == 1) & (Income == 1),
select = c(-Type, -Income))
trainall <- subset(adult, subset = Type == 1, select = c(-Type, -Income))
train <- as.factor(subset(adult, subset = Type == 1, select = c(Income))[, 1])
## Extract test dataset form adult dataset and remove Type
## and Income columns.
testle50k <- subset(adult, subset = (Type == 0) & (Income == 0),
select = c(-Type, -Income))
testgt50k <- subset(adult, subset = (Type == 0) & (Income == 1),
select = c(-Type, -Income))
testall <- subset(adult, subset = Type == 0, select = c(-Type, -Income))
test <- as.factor(subset(adult, subset = Type == 0, select = c(Income))[, 1])
## Calculate prior probabilities.
P <- c(nrow(trainle50k), nrow(traingt50k))
P <- P / sum(P)
## Estimate number of components, component weights and component
## parameters for Naive Bayes.
Variables <- c("continuous", "discrete", "continuous", "discrete",
"discrete", "discrete", "discrete", "discrete", "discrete",
"discrete", "discrete", "discrete", "discrete", "discrete")
pdf <- c("normal", "Dirac", "normal", "Dirac",
"Dirac", "Dirac", "Dirac", "Dirac", "Dirac",
"Dirac", "Dirac", "Dirac", "Dirac", "Dirac")
K <- list(13:44, 1, 13:44, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)
ymin <- as.numeric(apply(trainall, 2, min))
ymax <- as.numeric(apply(trainall, 2, max))
## In case of zero occurrences for discrete and integer variables
## Laplace smoothing is applied.
LS <- list(NA, 0:7, NA, 0:15, ymin[5]:ymax[5], 0:6, 0:13,
0:5, 0:4, 0:1, NA, NA, ymin[13]:ymax[13], 0:40)
adultest <- list()
for (i in c(1:14)) {
adultest[[i]] <- REBMIX(Dataset = list(as.data.frame(trainle50k[, i]),
as.data.frame(traingt50k[, i])),
Preprocessing = "histogram",
D = if (Variables[i] == "continuous") 0.025 else 0.0,
cmax = if (Variables[i] == "continuous") 15 else 100,
Criterion = "D",
Variables = Variables[i],
pdf = pdf[i],
K = K[[i]],
ymin = ymin[i],
ymax = ymax[i])
plot(adultest[[i]], pos = 1, nrow = 1, ncol = 1)
plot(adultest[[i]], pos = 2, nrow = 1, ncol = 1)
}
## Best-first feature subset selection.
c <- NULL; rvs <- 1:14; Error <- 1.0
for (i in 1:14) {
k <- NA
for (j in rvs) {
predictive <- predict(adultest[c(c, j)],
P = P,
Dataset = trainall[, c(c, j)])
CM <- table(train, predictive)
Accuracy <- (CM[1, 1] + CM[2, 2]) / sum(CM)
if (1.0 - Accuracy < Error) {
Error <- 1.0 - Accuracy; k <- j
}
}
if (is.na(k)) {
break
}
else {
c <- c(c, k); rvs <- rvs[-which(rvs == k)]
}
}
## Error on train dataset.
Error
## Selected features.
c
predictive <- predict(adultest[c],
P = P,
Dataset = testall[, c])
CM <- table(test, predictive)
Accuracy <- (CM[1, 1] + CM[2, 2]) / sum(CM)
Error <- 1.0 - Accuracy
## Error on test dataset.
Error
Run the code above in your browser using DataLab