## --------------------------------------------------------------------
##
## 1) For illustration of use, a small data set.
## Subsets of variables of all cardinalities are sought using the
## RM criterion.
##
## Load the swiss data set and ask for the three best solutions of each
## cardinality on its correlation matrix.
data("swiss")
eleaps(cor(swiss), nsol = 3, criterion = "RM")
##$subsets
##, , Card.1
##
## Var.1 Var.2 Var.3 Var.4 Var.5
##Solution 1 3 0 0 0 0
##Solution 2 1 0 0 0 0
##Solution 3 4 0 0 0 0
##
##, , Card.2
##
## Var.1 Var.2 Var.3 Var.4 Var.5
##Solution 1 3 6 0 0 0
##Solution 2 4 5 0 0 0
##Solution 3 1 2 0 0 0
##
##, , Card.3
##
## Var.1 Var.2 Var.3 Var.4 Var.5
##Solution 1 4 5 6 0 0
##Solution 2 1 2 5 0 0
##Solution 3 3 4 6 0 0
##
##, , Card.4
##
## Var.1 Var.2 Var.3 Var.4 Var.5
##Solution 1 2 4 5 6 0
##Solution 2 1 2 5 6 0
##Solution 3 1 4 5 6 0
##
##, , Card.5
##
## Var.1 Var.2 Var.3 Var.4 Var.5
##Solution 1 1 2 3 5 6
##Solution 2 1 2 4 5 6
##Solution 3 2 3 4 5 6
##
##
##$values
## card.1 card.2 card.3 card.4 card.5
##Solution 1 0.6729689 0.8016409 0.9043760 0.9510757 0.9804629
##Solution 2 0.6286185 0.7982296 0.8791856 0.9506434 0.9776338
##Solution 3 0.6286130 0.7945390 0.8777509 0.9395708 0.9752551
##
##$bestvalues
## Card.1 Card.2 Card.3 Card.4 Card.5
##0.6729689 0.8016409 0.9043760 0.9510757 0.9804629
##
##$bestsets
## Var.1 Var.2 Var.3 Var.4 Var.5
##Card.1 3 0 0 0 0
##Card.2 3 6 0 0 0
##Card.3 4 5 6 0 0
##Card.4 2 4 5 6 0
##Card.5 1 2 3 5 6
##
##$call
##eleaps(cor(swiss), nsol = 3, criterion="RM")
## --------------------------------------------------------------------
##
## 2) Asking only for 2- and 3- dimensional subsets and excluding
## variable number 6.
##
## The data are reloaded so that this example can be run on its own.
## The positional 2 and 3 are the smallest and largest subset sizes sought.
data("swiss")
eleaps(cor(swiss), 2, 3, exclude = 6, nsol = 3, criterion = "rm")
##$subsets
##, , Card.2
##
## Var.1 Var.2 Var.3
##Solution 1 4 5 0
##Solution 2 1 2 0
##Solution 3 1 3 0
##
##, , Card.3
##
## Var.1 Var.2 Var.3
##Solution 1 1 2 5
##Solution 2 1 4 5
##Solution 3 2 4 5
##
##
##$values
## card.2 card.3
##Solution 1 0.7982296 0.8791856
##Solution 2 0.7945390 0.8686515
##Solution 3 0.7755232 0.8628693
##
##$bestvalues
## Card.2 Card.3
##0.7982296 0.8791856
##
##$bestsets
## Var.1 Var.2 Var.3
##Card.2 4 5 0
##Card.3 1 2 5
##
##$call
##eleaps(cor(swiss), 2, 3, exclude = 6, nsol = 3, criterion = "rm")
## --------------------------------------------------------------------
##
## 3) Searching for 2- and 3- dimensional subsets that best approximate
## the spaces generated by the first three Principal Components
##
## The data are reloaded so that this example can be run on its own.
## pcindices = 1:3 selects the first three Principal Components as the
## target for the "gcd" criterion.
data("swiss")
eleaps(cor(swiss), 2, 3, criterion = "gcd", pcindices = 1:3, nsol = 3)
##$subsets
##, , Card.2
##
## Var.1 Var.2 Var.3
##Solution 1 4 5 0
##Solution 2 5 6 0
##Solution 3 4 6 0
##
##, , Card.3
##
## Var.1 Var.2 Var.3
##Solution 1 4 5 6
##Solution 2 3 5 6
##Solution 3 2 5 6
##
##
##$values
## card.2 card.3
##Solution 1 0.7831827 0.9253684
##Solution 2 0.7475630 0.8459302
##Solution 3 0.7383665 0.8243032
##
##$bestvalues
## Card.2 Card.3
##0.7831827 0.9253684
##
##$bestsets
## Var.1 Var.2 Var.3
##Card.2 4 5 0
##Card.3 4 5 6
##
##$call
##eleaps(cor(swiss), 2, 3, criterion = "gcd", pcindices = 1:3, nsol = 3)
## --------------------------------------------------------------------
##
## 4) An example of subset selection in the context of Multiple Linear
## Regression. Variable 5 (average car price) in the Cars93 MASS library
## data set is regressed on 13 other variables. A best subset of linear
## predictors is sought, using the default criterion ("TAU_2") which,
## in the case of a Linear Regression, is merely the standard Coefficient
## of Determination, R^2 (as are the other three criteria for the
## multivariate linear hypothesis, "XI_2", "CCR1_2" and "ZETA_2").
##
## Build the matrices (mat and H) that eleaps needs for the regression of
## column 5 of Cars93 on the 13 selected predictor columns.
library(MASS)
data("Cars93")
CarsHmat <- lmHmat(Cars93[, c(7:8, 12:15, 17:22, 25)], Cars93[, 5])
## Name of the response column.
names(Cars93)[5]
## [1] "Price"
## Names of the candidate predictors, in the order eleaps indexes them.
colnames(CarsHmat$mat)
## [1] "MPG.city" "MPG.highway" "EngineSize"
## [4] "Horsepower" "RPM" "Rev.per.mile"
## [7] "Fuel.tank.capacity" "Passengers" "Length"
## [10] "Wheelbase" "Width" "Turn.circle"
## [13] "Weight"
eleaps(CarsHmat$mat, kmin = 4, kmax = 6, H = CarsHmat$H, r = 1)
## $subsets
## , , Card.4
##
## Var.1 Var.2 Var.3 Var.4 Var.5 Var.6
## Solution 1 4 5 10 11 0 0
##
## , , Card.5
##
## Var.1 Var.2 Var.3 Var.4 Var.5 Var.6
## Solution 1 4 5 10 11 12 0
##
## , , Card.6
##
## Var.1 Var.2 Var.3 Var.4 Var.5 Var.6
## Solution 1 4 5 9 10 11 12
##
##
## $values
## card.4 card.5 card.6
## Solution 1 0.7143794 0.7241457 0.731015
##
## $bestvalues
## Card.4 Card.5 Card.6
## 0.7143794 0.7241457 0.7310150
##
## $bestsets
## Var.1 Var.2 Var.3 Var.4 Var.5 Var.6
## Card.4 4 5 10 11 0 0
## Card.5 4 5 10 11 12 0
## Card.6 4 5 9 10 11 12
##
## --------------------------------------------------------------------
## 5) A Linear Discriminant Analysis example with a very small data set.
## We consider the Iris data and three groups, defined by species (setosa,
## versicolor and virginica). The goal is to select the 2- and 3-variable
## subsets that are optimal for the linear discrimination (as measured
## by the "CCR1_2" criterion).
## Build the matrices (mat and H) for the discrimination of the three
## species using the four measurement columns of iris.
data(iris)
irisHmat <- ldaHmat(iris[1:4], iris$Species)
## The 'criterion' argument is written out in full instead of relying on
## partial argument matching of 'crit'.
eleaps(irisHmat$mat, kmin = 2, kmax = 3, H = irisHmat$H, r = 2,
       criterion = "ccr12")
## $subsets
## , , Card.2
##
## Var.1 Var.2 Var.3
## Solution 1 1 3 0
##
## , , Card.3
##
## Var.1 Var.2 Var.3
## Solution 1 2 3 4
##
##
## $values
## card.2 card.3
## Solution 1 0.9589055 0.967897
##
## $bestvalues
## Card.2 Card.3
## 0.9589055 0.9678971
##
## $bestsets
## Var.1 Var.2 Var.3
## Card.2 1 3 0
## Card.3 2 3 4
## --------------------------------------------------------------------
## 6) An example of subset selection in the context of a Canonical
## Correlation Analysis. Two groups of variables within the Cars93
## MASS library data set are compared. The goal is to select 4- to
## 6-variable subsets of the 13-variable 'X' group that are optimal in
## terms of preserving the canonical correlations, according to the
## "ZETA_2" criterion (Warning: the 3-variable 'Y' group is kept
## intact; subset selection is carried out in the 'X'
## group only). The 'tolsym' parameter is used to relax the symmetry
## requirements on the effect matrix H which, for numerical reasons,
## is slightly asymmetric. Since corresponding off-diagonal entries of
## matrix H are different, but by less than tolsym, H is replaced
## by its symmetric part: (H+t(H))/2.
## Build the matrices (mat and H) relating the 13-variable 'X' group to
## the 3-variable 'Y' group (columns 4 to 6 of Cars93).
library(MASS)
data(Cars93)
CarsHmat <- lmHmat(Cars93[, c(7:8, 12:15, 17:22, 25)], Cars93[, 4:6])
## Names of the 'Y' group variables.
names(Cars93[, 4:6])
## [1] "Min.Price" "Price" "Max.Price"
## Names of the 'X' group variables, in the order eleaps indexes them.
## (This call is executed so that the listing below is actually produced.)
colnames(CarsHmat$mat)
## [1] "MPG.city" "MPG.highway" "EngineSize"
## [4] "Horsepower" "RPM" "Rev.per.mile"
## [7] "Fuel.tank.capacity" "Passengers" "Length"
## [10] "Wheelbase" "Width" "Turn.circle"
## [13] "Weight"
## The 'criterion' argument is written out in full instead of relying on
## partial argument matching of 'crit'.
eleaps(CarsHmat$mat, kmin = 4, kmax = 6, H = CarsHmat$H, r = 3,
       criterion = "zeta2", tolsym = 1e-9)
## $subsets
## , , Card.4
##
## Var.1 Var.2 Var.3 Var.4 Var.5 Var.6
## Solution 1 3 4 10 11 0 0
##
## , , Card.5
##
## Var.1 Var.2 Var.3 Var.4 Var.5 Var.6
## Solution 1 4 5 9 10 11 0
##
## , , Card.6
##
## Var.1 Var.2 Var.3 Var.4 Var.5 Var.6
## Solution 1 4 5 9 10 11 12
##
##
## $values
## card.4 card.5 card.6
## Solution 1 0.4827353 0.5018922 0.5168627
##
## $bestvalues
## Card.4 Card.5 Card.6
## 0.4827353 0.5018922 0.5168627
##
## $bestsets
## Var.1 Var.2 Var.3 Var.4 Var.5 Var.6
## Card.4 3 4 10 11 0 0
## Card.5 4 5 9 10 11 0
## Card.6 4 5 9 10 11 12
##
## Warning message:
##
## The effect description matrix (H) supplied was slightly asymmetric:
## symmetric entries differed by up to 3.63797880709171e-12.
## (less than the 'tolsym' parameter).
## The H matrix has been replaced by its symmetric part.
## in: validnovcrit(mat, criterion, H, r, p, tolval, tolsym)
## --------------------------------------------------------------------
## 7) An example of variable selection in the context of a logistic
## regression model. We consider the last 100 observations of
## the iris data set (versicolor and virginica species) and try
## to find the best variable subsets for the model that takes species
## as response variable.
data(iris)
## Drop the setosa rows, leaving the two remaining species.
iris2sp <- iris[iris$Species != "setosa", ]
## Fit the full logistic regression model. The data frame is passed as a
## named 'data' argument rather than positionally, because the second
## formal of glm() is 'family', not 'data'.
logrfit <- glm(
  Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width,
  family = binomial, data = iris2sp
)
## Derive the matrices (mat and H) used by the Wald criterion from the fit.
Hmat <- glmHmat(logrfit)
eleaps(Hmat$mat, H = Hmat$H, r = 1, criterion = "Wald", nsol = 3)
## $subsets
## , , Card.1
## Var.1 Var.2 Var.3
## Solution 1 4 0 0
## Solution 2 1 0 0
## Solution 3 3 0 0
## , , Card.2
## Var.1 Var.2 Var.3
## Solution 1 1 3 0
## Solution 2 3 4 0
## Solution 3 2 4 0
## , , Card.3
## Var.1 Var.2 Var.3
## Solution 1 2 3 4
## Solution 2 1 3 4
## Solution 3 1 2 3
## $values
## card.1 card.2 card.3
## Solution 1 4.894554 3.522885 1.060121
## Solution 2 5.147360 3.952538 2.224335
## Solution 3 5.161553 3.972410 3.522879
## $bestvalues
## Card.1 Card.2 Card.3
## 4.894554 3.522885 1.060121
## $bestsets
## Var.1 Var.2 Var.3
## Card.1 4 0 0
## Card.2 1 3 0
## Card.3 2 3 4
## $call
## eleaps(mat = Hmat$mat, nsol = 3, criterion = "Wald", H = Hmat$H,
## r = 1)
## --------------------------------------------------------------------
## It should be stressed that, unlike other criteria in the
## subselect package, the Wald criterion is not bounded above by
## 1 and is a decreasing function of subset quality, so that the
## 3-variable subsets do, in fact, perform better than their smaller-sized
## counterparts.
## Run the code above in your browser using DataLab