## Load the adult dataset.
## NOTE(review): no package is attached in this snippet, so the package
## that provides "adult" presumably has to be loaded beforehand -- confirm.
data("adult")
## Keep only the rows that have no missing values.
adult <- adult[complete.cases(adult), ]
## Bin the two capital attributes into 2000 intervals each. ordered() is
## applied to the factor returned by cut(), which rebuilds the levels from
## the observed values only, so empty bins do not become levels.
adult$Capital.Loss <- ordered(cut(adult$Capital.Loss, breaks = 2000))
adult$Capital.Gain <- ordered(cut(adult$Capital.Gain, breaks = 2000))
## Display the levels of the binary and discrete attributes.
levels(adult$Type)
levels(adult$Workclass)
levels(adult$Education)
levels(adult$Marital.Status)
levels(adult$Occupation)
levels(adult$Relationship)
levels(adult$Race)
levels(adult$Sex)
levels(adult$Native.Country)
levels(adult$Income)
## Convert every column to numbers; factors become their integer codes.
adult <- as.data.frame(data.matrix(adult))
## Discrete distributions other than the Dirac distribution expect levels
## that start at 0, so shift the codes of the attributes listed in f by one.
## The binned Capital.Loss and Capital.Gain columns are not listed in f and
## therefore keep their 1-based codes.
f <- c("Type", "Workclass", "Education", "Marital.Status", "Occupation",
  "Relationship", "Race", "Sex", "Native.Country", "Income")
adult[f] <- adult[f] - 1
## Row selectors (Type == 1 marks train rows, Type == 0 marks test rows)
## and the columns that remain once Type and Income are removed.
is_train <- adult[["Type"]] == 1
is_test <- adult[["Type"]] == 0
le50k <- adult[["Income"]] == 0
gt50k <- adult[["Income"]] == 1
keep_cols <- !(names(adult) %in% c("Type", "Income"))
## Train subsets: one per Income value plus all train rows, each without
## the Type and Income columns. train holds the Income labels as a factor.
trainle50k <- adult[which(is_train & le50k), keep_cols, drop = FALSE]
traingt50k <- adult[which(is_train & gt50k), keep_cols, drop = FALSE]
trainall <- adult[which(is_train), keep_cols, drop = FALSE]
train <- as.factor(adult[["Income"]][which(is_train)])
## Test subsets extracted from the adult dataset in the same way, again
## without the Type and Income columns. test holds the Income labels.
testle50k <- adult[which(is_test & le50k), keep_cols, drop = FALSE]
testgt50k <- adult[which(is_test & gt50k), keep_cols, drop = FALSE]
testall <- adult[which(is_test), keep_cols, drop = FALSE]
test <- as.factor(adult[["Income"]][which(is_test)])
## Only the complete train and test predictor tables are written to disk.
save(list = "trainall", file = "trainall.rda")
save(list = "testall", file = "testall.rda")
## Run the code above in your browser using DataLab