predict.list: Predicts Class Membership Based Upon a Model Trained by REBMIX

Description

Takes a list of objects of class REBMIX, prior probabilities P and data frame Dataset with test classes and returns predictive classes.

Usage

## S3 method for class 'list':
predict(object,
  P = NULL,
  Dataset = NULL, ...)

Arguments

object

a list of objects of class REBMIX containing outputs for a finite set of size $s$ of classes $\Omega_g$.

a vector of length $s$ containing prior probabilites.

Dataset

a data frame containing test classes.

...

further arguments to predict.

Value

predictive classes.

References

R. O. Duda and P. E. Hart. Pattern Classification and Scene Analysis. John Wiley & Sons, New York, 1973.

Examples

Run this code

data("adult")

## Find complete cases.

adult <- adult[complete.cases(adult), ]

## Map metric attributes.

adult[["Capital.Loss"]] <- ordered(cut(adult[["Capital.Loss"]], 2000))
adult[["Capital.Gain"]] <- ordered(cut(adult[["Capital.Gain"]], 2000))

## Show level attributes for binary and discrete variables.

levels(adult[["Type"]])
levels(adult[["Workclass"]])
levels(adult[["Education"]])
levels(adult[["Marital.Status"]])
levels(adult[["Occupation"]])
levels(adult[["Relationship"]])
levels(adult[["Race"]])
levels(adult[["Sex"]])
levels(adult[["Native.Country"]])
levels(adult[["Income"]])

## Replace levels with numbers.

adult <- as.data.frame(data.matrix(adult))

## Levels should start with 0 for discrete distributions except for the 
## Dirac distribution.

f <- c("Type", "Workclass", "Education", "Marital.Status", "Occupation", 
  "Relationship", "Race", "Sex", "Native.Country", "Income")

adult[, f] <- adult[, f] - 1

## Split adult dataset into two train subsets for the two Incomes
## and remove Type and Income columns.

trainle50k <- subset(adult, subset = (Type == 1) & (Income == 0), 
  select = c(-Type, -Income))
traingt50k <- subset(adult, subset = (Type == 1) & (Income == 1), 
  select = c(-Type, -Income))

trainall <- subset(adult, subset = Type == 1, select = c(-Type, -Income))

train <- as.factor(subset(adult, subset = Type == 1, select = c(Income))[, 1])

## Extract test dataset form adult dataset and remove Type 
## and Income columns.

testle50k <- subset(adult, subset = (Type == 0) & (Income == 0), 
  select = c(-Type, -Income))
testgt50k <- subset(adult, subset = (Type == 0) & (Income == 1), 
  select = c(-Type, -Income))

testall <- subset(adult, subset = Type == 0, select = c(-Type, -Income))

test <- as.factor(subset(adult, subset = Type == 0, select = c(Income))[, 1])

## Calculate prior probabilities.

P <- c(nrow(trainle50k), nrow(traingt50k))

P <- P / sum(P)

## Estimate number of components, component weights and component 
## parameters for Naive Bayes.

Variables <- c("continuous", "discrete", "continuous", "discrete",
 "discrete", "discrete", "discrete", "discrete", "discrete",
 "discrete", "discrete", "discrete", "discrete", "discrete")

pdf <- c("normal", "Dirac", "normal", "Dirac",
  "Dirac", "Dirac", "Dirac", "Dirac", "Dirac",
  "Dirac", "Dirac", "Dirac", "Dirac", "Dirac")

K <- list(13:44, 1, 13:44, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)

ymin <- as.numeric(apply(trainall, 2, min))
ymax <- as.numeric(apply(trainall, 2, max))

## In case of zero occurrences for discrete and integer variables 
## Laplace smoothing is applied.

LS <- list(NA, 0:7, NA, 0:15, ymin[5]:ymax[5], 0:6, 0:13, 
  0:5, 0:4, 0:1, NA, NA, ymin[13]:ymax[13], 0:40)

adultest <- list()

for (i in c(1:14)) {
  adultest[[i]] <- REBMIX(Dataset = list(as.data.frame(trainle50k[, i]), 
    as.data.frame(traingt50k[, i])),
    Preprocessing = "histogram",
    D = if (Variables[i] == "continuous") 0.025 else 0.0,
    cmax = if (Variables[i] == "continuous") 15 else 100,
    Criterion = "D",
    Variables = Variables[i],
    pdf = pdf[i],
    K = K[[i]],
    ymin = ymin[i],
    ymax = ymax[i])

    plot(adultest[[i]], pos = 1, nrow = 1, ncol = 1)
    plot(adultest[[i]], pos = 2, nrow = 1, ncol = 1)
}

## Best-first feature subset selection.

c <- NULL; rvs <- 1:14; Error <- 1.0

for (i in 1:14) {
  k <- NA

  for (j in rvs) {
    predictive <- predict(adultest[c(c, j)],
      P = P,
      Dataset = trainall[, c(c, j)]) 

    CM <- table(train, predictive)

    Accuracy <- (CM[1, 1] + CM[2, 2]) / sum(CM)

    if (1.0 - Accuracy < Error) {
      Error <- 1.0 - Accuracy; k <- j
    }
  }

  if (is.na(k)) {
    break
  }
  else {
    c <- c(c, k); rvs <- rvs[-which(rvs == k)]
  }
}

## Error on train dataset.

Error

## Selected features.

c

predictive <- predict(adultest[c],
  P = P,
  Dataset = testall[, c]) 

CM <- table(test, predictive)

Accuracy <- (CM[1, 1] + CM[2, 2]) / sum(CM)

Error <- 1.0 - Accuracy

## Error on test dataset.

Error

Run the code above in your browser using DataLab