This is actually NOT a good example for regression, since the response is binary rather than numeric. I use it to illustrate the point that less can be better for prediction (less = fewer predictors).
# Load the training (tr14) and test (test14) digit data
load(file="dig14.Rdata")
ntrain=dim(tr14)[1]
ntest = dim(test14)[1]
# Display 12 randomly chosen training images (each is 16 x 16 pixels)
par(mfrow=c(3,4))
for(i in 1:12){
  tmp = matrix(as.numeric(tr14[sample(ntrain,1), 2:257]), 16, 16)
  tmp = tmp[, 16:1]   # flip the columns so the digit is displayed upright
  image(tmp, col=gray((32:0)/32))
}
# ggplot2 alternative (sketch; assumes the matrix has been reshaped into a long
# data frame with a 'rescale' column, which is not done here):
# ggplot(tmp) + geom_tile(aes(fill = rescale), colour = "white") +
#   scale_fill_gradient(low = "white", high = "steelblue")
# Recode the response as a 0/1 indicator (1 if the digit is 1, 0 otherwise)
tr14$Y = ifelse(tr14$Y==1, 1, 0)
test14$Y = ifelse(test14$Y==1, 1, 0)
# Fit a linear model using all 256 pixel values as predictors
g0 = lm(Y ~ ., data=tr14)
summary(g0)
##
## Call:
## lm(formula = Y ~ ., data = tr14)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.70832 -0.02442 0.00000 0.02902 0.24802
##
## Coefficients: (8 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.941e+02 2.862e+02 -0.678 0.498386
## V1 2.471e+03 1.240e+03 1.993 0.047568 *
## V2 -1.934e+02 9.913e+01 -1.951 0.052357 .
## V3 2.155e+00 1.771e+00 1.217 0.225069
## V4 2.769e-01 5.004e-01 0.553 0.580597
## V5 -1.152e-01 1.631e-01 -0.706 0.480924
## V6 7.183e-02 9.576e-02 0.750 0.454006
## V7 2.429e-02 2.747e-02 0.884 0.377601
## ......
## V235 3.741e-02 8.855e-02 0.422 0.673124
## V236 1.857e-01 1.915e-01 0.970 0.333225
## V237 -1.061e+00 6.006e-01 -1.767 0.078612 .
## V238 -2.288e+00 2.361e+00 -0.969 0.333452
## V239 NA NA NA NA
## V240 NA NA NA NA
## V241 NA NA NA NA
## V242 NA NA NA NA
## V243 8.671e+01 4.078e+01 2.127 0.034601 *
## V244 -5.857e+00 3.520e+00 -1.664 0.097580 .
## V245 -2.236e+00 1.009e+00 -2.216 0.027759 *
## V246 1.164e-02 1.986e-01 0.059 0.953306
## V247 8.271e-03 4.320e-02 0.191 0.848339
## V248 -1.827e-02 2.720e-02 -0.672 0.502574
## V249 -4.287e-02 2.359e-02 -1.818 0.070514 .
## V250 2.795e-02 2.622e-02 1.066 0.287729
## V251 9.692e-02 6.094e-02 1.590 0.113226
## V252 -1.629e-02 8.905e-02 -0.183 0.855024
## V253 1.707e+00 6.381e-01 2.675 0.008044 **
## V254 -8.636e+00 3.440e+00 -2.511 0.012791 *
## V255 NA NA NA NA
## V256 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09679 on 215 degrees of freedom
## Multiple R-squared: 0.9823, Adjusted R-squared: 0.9619
## F-statistic: 48.11 on 248 and 215 DF, p-value: < 2.2e-16
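As noted at the top, the response is binary, so lm is being used here as a linear probability model. The more natural alternative is logistic regression; a minimal sketch follows (not used in the rest of this analysis, and with this many correlated pixel predictors glm may warn about fitted probabilities of 0 or 1).
# Logistic-regression alternative (sketch only)
g0.logit = glm(Y ~ ., data=tr14, family=binomial)
tr14.prob = predict(g0.logit, type="response")   # fitted class-1 probabilities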
Compute the training error and test error.
tr14.pred = g0$fitted>0.5
table(tr14.pred, tr14$Y)
##
## tr14.pred 0 1
## FALSE 199 0
## TRUE 1 264
training.err = mean(tr14.pred != tr14$Y)
training.err
## [1] 0.002155172
te14.pred = predict(g0, newdata=test14)>0.5;
table(te14.pred , test14$Y)
##
## te14.pred 0 1
## FALSE 476 3
## TRUE 176 1002
test.err = mean(te14.pred != test14$Y)
test.err
## [1] 0.1080266
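The same threshold-at-0.5 and compare pattern is repeated for every model below. A small helper keeps it in one place (the function name class.err is mine, not part of the original code):
# Hypothetical helper: misclassification rate of an lm fit used as a classifier
class.err = function(fit, dat) {
  pred = predict(fit, newdata=dat) > 0.5
  mean(pred != dat$Y)
}
# Usage: class.err(g0, tr14); class.err(g0, test14)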
g0.coef = summary(g0)$coef
# Keep the predictors whose p-values are below 5%
g1.var = row.names(g0.coef)[g0.coef[,4] < 0.05]
g1.var
## [1] "V1" "V17" "V21" "V28" "V33" "V45" "V46" "V50" "V51" "V71"
## [11] "V87" "V88" "V89" "V97" "V100" "V105" "V114" "V123" "V127" "V141"
## [21] "V152" "V158" "V159" "V160" "V161" "V163" "V168" "V173" "V174" "V180"
## [31] "V186" "V190" "V191" "V197" "V203" "V222" "V243" "V245" "V253" "V254"
m = length(g1.var)
g1.model=paste("Y ~", g1.var[1])
g1.model
## [1] "Y ~ V1"
for(j in 2:m) g1.model=paste(g1.model, " + ", g1.var[j], sep="");
g1.model
## [1] "Y ~ V1 + V17 + V21 + V28 + V33 + V45 + V46 + V50 + V51 + V71 + V87 + V88 + V89 + V97 + V100 + V105 + V114 + V123 + V127 + V141 + V152 + V158 + V159 + V160 + V161 + V163 + V168 + V173 + V174 + V180 + V186 + V190 + V191 + V197 + V203 + V222 + V243 + V245 + V253 + V254"
g1 = lm(g1.model, data=tr14)
summary(g1)
##
## Call:
## lm(formula = g1.model, data = tr14)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.94921 -0.05432 0.01591 0.09475 0.66311
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.547510 2.119045 0.730 0.46562
## V1 -7.179337 8.094699 -0.887 0.37563
## V17 8.439283 9.919466 0.851 0.39537
## V21 -0.089348 0.029827 -2.996 0.00290 **
## V28 -0.018962 0.024691 -0.768 0.44292
## V33 0.602115 0.919377 0.655 0.51288
## V45 0.017541 0.029218 0.600 0.54859
## V46 0.088061 0.041682 2.113 0.03521 *
## V50 -0.033818 0.084968 -0.398 0.69083
## V51 -0.042665 0.042344 -1.008 0.31423
## V71 0.055050 0.030465 1.807 0.07147 .
## V87 -0.154790 0.030390 -5.093 5.31e-07 ***
## V88 0.213171 0.024374 8.746 < 2e-16 ***
## V89 0.070630 0.036625 1.928 0.05447 .
## V97 -0.045498 0.061490 -0.740 0.45976
## V100 -0.136781 0.022114 -6.185 1.46e-09 ***
## V105 -0.004537 0.032680 -0.139 0.88965
## V114 -0.033375 0.038467 -0.868 0.38610
## V123 -0.136923 0.022895 -5.980 4.74e-09 ***
## V127 -0.046560 0.053718 -0.867 0.38657
## V141 -0.018986 0.027985 -0.678 0.49785
## V152 -0.075270 0.025924 -2.904 0.00388 **
## V158 0.006603 0.054221 0.122 0.90313
## V159 -0.012274 0.079631 -0.154 0.87758
## V160 0.082083 0.121024 0.678 0.49799
## V161 -0.125155 0.101727 -1.230 0.21926
## V163 0.080202 0.045433 1.765 0.07824 .
## V168 0.132952 0.025758 5.162 3.78e-07 ***
## V173 -0.042634 0.046374 -0.919 0.35843
## V174 -0.058764 0.063548 -0.925 0.35564
## V180 -0.143194 0.053880 -2.658 0.00817 **
## V186 0.047889 0.017204 2.784 0.00562 **
## V190 -0.073812 0.096432 -0.765 0.44444
## V191 0.063848 0.134599 0.474 0.63549
## V197 -0.068569 0.056400 -1.216 0.22476
## V203 0.013134 0.024263 0.541 0.58856
## V222 -0.014410 0.072731 -0.198 0.84304
## V243 -1.137426 0.674106 -1.687 0.09228 .
## V245 0.072664 0.091163 0.797 0.42585
## V253 0.045880 0.098799 0.464 0.64262
## V254 0.843130 0.504645 1.671 0.09551 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2055 on 423 degrees of freedom
## Multiple R-squared: 0.843, Adjusted R-squared: 0.8282
## F-statistic: 56.78 on 40 and 423 DF, p-value: < 2.2e-16
Compute the training error and test error.
tr14.pred = g1$fitted>0.5
table(tr14.pred, tr14$Y)
##
## tr14.pred 0 1
## FALSE 190 4
## TRUE 10 260
training.err = mean(tr14.pred != tr14$Y)
training.err
## [1] 0.03017241
te14.pred = predict(g1, newdata=test14)>0.5;
table(te14.pred , test14$Y)
##
## te14.pred 0 1
## FALSE 579 2
## TRUE 73 1003
test.err=mean(te14.pred != test14$Y)
test.err
## [1] 0.04526252
g1.coef=summary(g1)$coef
g2.var = row.names(g1.coef)[g1.coef[,4]<0.05]
g2.var
## [1] "V21" "V46" "V87" "V88" "V100" "V123" "V152" "V168" "V180" "V186"
m = length(g2.var)
g2.model=paste("Y ~", g2.var[1])
for(j in 2:m) g2.model=paste(g2.model, " + ", g2.var[j], sep="");
g2.model
## [1] "Y ~ V21 + V46 + V87 + V88 + V100 + V123 + V152 + V168 + V180 + V186"
g2 = lm(g2.model, data=tr14)
summary(g2)
##
## Call:
## lm(formula = g2.model, data = tr14)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.90122 -0.06603 0.01198 0.11272 0.71738
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.08005 0.03929 2.038 0.042174 *
## V21 -0.08565 0.02330 -3.676 0.000266 ***
## V46 0.02471 0.02441 1.012 0.311931
## V87 -0.13147 0.01688 -7.787 4.70e-14 ***
## V88 0.25273 0.02071 12.202 < 2e-16 ***
## V100 -0.16534 0.01815 -9.108 < 2e-16 ***
## V123 -0.14689 0.01973 -7.446 4.91e-13 ***
## V152 -0.06082 0.02236 -2.720 0.006772 **
## V168 0.12940 0.02183 5.927 6.12e-09 ***
## V180 -0.19545 0.03121 -6.263 8.79e-10 ***
## V186 0.04772 0.01606 2.971 0.003131 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2101 on 453 degrees of freedom
## Multiple R-squared: 0.8242, Adjusted R-squared: 0.8203
## F-statistic: 212.4 on 10 and 453 DF, p-value: < 2.2e-16
Compute the training error and test error.
tr14.pred = g2$fitted>0.5
table(tr14.pred, tr14$Y)
##
## tr14.pred 0 1
## FALSE 188 8
## TRUE 12 256
training.err = mean(tr14.pred != tr14$Y)
training.err
## [1] 0.04310345
te14.pred = predict(g2, newdata=test14)>0.5;
table(te14.pred , test14$Y)
##
## te14.pred 0 1
## FALSE 591 4
## TRUE 61 1001
test.err=mean(te14.pred != test14$Y)
test.err
## [1] 0.03922752
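Collecting the error rates reported above (rounded) makes the point of this example explicit: the model with only 10 pixels predicts best on the test set even though its training error is the largest.
# Training/test error summary (values copied from the output above)
data.frame(model = c("g0 (all 256 pixels)", "g1 (40 pixels)", "g2 (10 pixels)"),
           train.err = c(0.0022, 0.0302, 0.0431),
           test.err  = c(0.1080, 0.0453, 0.0392))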
# 2 x 3 layout: the first four training images, followed by the pixel maps for g1 and g2
par(mfrow=c(2,3))
for(i in 1:4){
  tmp = matrix(as.numeric(tr14[i, 2:257]), 16, 16)
  tmp = tmp[, 16:1]
  image(tmp, col=gray((32:0)/32))
}
# Mark the pixels (predictors) retained by g1; unselected pixels are set to NA (blank)
var.id = rep(0, 256)
var.id[names(tr14)[-1] %in% g1.var] = 1
var.id[var.id==0] = NA
tmp = matrix(var.id, 16, 16)
tmp = tmp[, 16:1]
image(tmp, col=gray((32:0)/32))

# Same map for the pixels retained by g2
var.id = rep(0, 256)
var.id[names(tr14)[-1] %in% g2.var] = 1
var.id[var.id==0] = NA
tmp = matrix(var.id, 16, 16)
tmp = tmp[, 16:1]
image(tmp, col=gray((32:0)/32))
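The two pixel maps can also be summarized by a simple count, which just recovers the sizes of the two selected sets (40 and 10 pixels):
# Number of pixels (predictors) retained by each reduced model
c(g1 = length(g1.var), g2 = length(g2.var))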
library(faraway)
# ?state.x77
statedata=data.frame(state.x77,row.names=state.abb,check.names=T)
g = lm(Life.Exp ~., data=statedata)
summary(g)
##
## Call:
## lm(formula = Life.Exp ~ ., data = statedata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.48895 -0.51232 -0.02747 0.57002 1.49447
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.094e+01 1.748e+00 40.586 < 2e-16 ***
## Population 5.180e-05 2.919e-05 1.775 0.0832 .
## Income -2.180e-05 2.444e-04 -0.089 0.9293
## Illiteracy 3.382e-02 3.663e-01 0.092 0.9269
## Murder -3.011e-01 4.662e-02 -6.459 8.68e-08 ***
## HS.Grad 4.893e-02 2.332e-02 2.098 0.0420 *
## Frost -5.735e-03 3.143e-03 -1.825 0.0752 .
## Area -7.383e-08 1.668e-06 -0.044 0.9649
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7448 on 42 degrees of freedom
## Multiple R-squared: 0.7362, Adjusted R-squared: 0.6922
## F-statistic: 16.74 on 7 and 42 DF, p-value: 2.534e-10
n=dim(statedata)[1]
# Largest absolute studentized residuals (possible outliers)
sort(abs(rstudent(g)), decreasing=TRUE)[1:5]
## HI ME SC AK AR
## 2.735242 2.232206 1.615402 1.606163 1.562068
qt(0.05/(n*2), 41)   # Bonferroni-corrected critical value for the outlier test
## [1] -3.544184
cook = cooks.distance(g)
max(cook)
## [1] 1.320804
#sort(cook)
halfnorm(cook)
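To see which state the largest Cook's distance belongs to:
which.max(cook)   # returns the state name and its index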
Let's first keep AK in our model, and then repeat the analysis without AK.
Load the leaps package.
library(leaps)
b=regsubsets(Life.Exp~., data = statedata)
rs = summary(b)
rs$which
## (Intercept) Population Income Illiteracy Murder HS.Grad Frost Area
## 1 TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## 2 TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE
## 3 TRUE FALSE FALSE FALSE TRUE TRUE TRUE FALSE
## 4 TRUE TRUE FALSE FALSE TRUE TRUE TRUE FALSE
## 5 TRUE TRUE TRUE FALSE TRUE TRUE TRUE FALSE
## 6 TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## 7 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
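leaps also provides a plot method for regsubsets objects that displays the same table graphically; an optional quick check (not used in what follows):
# Graphical view of the selected subsets, ordered by BIC
plot(b, scale="bic")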
n=dim(statedata)[1]; msize = 2:8;   # msize = number of coefficients, including the intercept
par(mfrow=c(2,2))
plot(msize, rs$adjr2, xlab="No. of Parameters", ylab = "Adjusted R-square");
plot(msize, rs$cp, xlab="No. of Parameters", ylab = "Mallow's Cp");
Aic = n*log(rs$rss/n) + 2*msize;
Bic = n*log(rs$rss/n) + msize*log(n);
plot(msize, Aic, xlab="No. of Parameters", ylab = "AIC");
plot(msize, Bic, xlab="No. of Parameters", ylab = "BIC");
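The AIC formula used above, n*log(RSS/n) + 2*(number of coefficients), is the same one step() reports below; as a quick sanity check on the full model g:
# Should agree with the "Start: AIC=-22.18" line in the step() output below
extractAIC(g)   # returns (equivalent df, AIC on the n*log(RSS/n) scale)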
rs$which[which.min(Aic),]
## (Intercept) Population Income Illiteracy Murder HS.Grad
## TRUE TRUE FALSE FALSE TRUE TRUE
## Frost Area
## TRUE FALSE
select.var = colnames(rs$which)[rs$which[which.min(Aic),]]
select.var = select.var[-1]
myfit = lm(Life.Exp ~ . , data=statedata[, c(select.var, "Life.Exp")])
summary(myfit)
##
## Call:
## lm(formula = Life.Exp ~ ., data = statedata[, c(select.var, "Life.Exp")])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.47095 -0.53464 -0.03701 0.57621 1.50683
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.103e+01 9.529e-01 74.542 < 2e-16 ***
## Population 5.014e-05 2.512e-05 1.996 0.05201 .
## Murder -3.001e-01 3.661e-02 -8.199 1.77e-10 ***
## HS.Grad 4.658e-02 1.483e-02 3.142 0.00297 **
## Frost -5.943e-03 2.421e-03 -2.455 0.01802 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7197 on 45 degrees of freedom
## Multiple R-squared: 0.736, Adjusted R-squared: 0.7126
## F-statistic: 31.37 on 4 and 45 DF, p-value: 1.696e-12
leaps does not return AIC, but it does return BIC. Its BIC differs from what I computed above, but the difference is a constant, so the two BIC formulae (mine and the one used by leaps) are essentially the same.
names(rs)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
cbind(rs$bic, Bic,rs$bic-Bic )
## Bic
## [1,] -39.22051 -10.78522 -28.43529
## [2,] -42.62472 -14.18943 -28.43529
## [3,] -46.70678 -18.27149 -28.43529
## [4,] -47.03640 -18.60111 -28.43529
## [5,] -43.13738 -14.70209 -28.43529
## [6,] -39.23342 -10.79813 -28.43529
## [7,] -35.32373 -6.88844 -28.43529
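Because the difference is a constant, both versions of BIC are minimized by the same subset (the 4-predictor model in row 4), which can be confirmed directly:
which.min(rs$bic)            # row of the best model under leaps' BIC
which.min(Bic)               # same row under the BIC computed above
rs$which[which.min(Bic), ]   # Population, Murder, HS.Grad, Frost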
step(g, direction="both")
## Start: AIC=-22.18
## Life.Exp ~ Population + Income + Illiteracy + Murder + HS.Grad +
## Frost + Area
##
## Df Sum of Sq RSS AIC
## - Area 1 0.0011 23.298 -24.182
## - Income 1 0.0044 23.302 -24.175
## - Illiteracy 1 0.0047 23.302 -24.174
## <none> 23.297 -22.185
## - Population 1 1.7472 25.044 -20.569
## - Frost 1 1.8466 25.144 -20.371
## - HS.Grad 1 2.4413 25.738 -19.202
## - Murder 1 23.1411 46.438 10.305
##
## Step: AIC=-24.18
## Life.Exp ~ Population + Income + Illiteracy + Murder + HS.Grad +
## Frost
##
## Df Sum of Sq RSS AIC
## - Illiteracy 1 0.0038 23.302 -26.174
## - Income 1 0.0059 23.304 -26.170
## <none> 23.298 -24.182
## - Population 1 1.7599 25.058 -22.541
## + Area 1 0.0011 23.297 -22.185
## - Frost 1 2.0488 25.347 -21.968
## - HS.Grad 1 2.9804 26.279 -20.163
## - Murder 1 26.2721 49.570 11.569
##
## Step: AIC=-26.17
## Life.Exp ~ Population + Income + Murder + HS.Grad + Frost
##
## Df Sum of Sq RSS AIC
## - Income 1 0.006 23.308 -28.161
## <none> 23.302 -26.174
## - Population 1 1.887 25.189 -24.280
## + Illiteracy 1 0.004 23.298 -24.182
## + Area 1 0.000 23.302 -24.174
## - Frost 1 3.037 26.339 -22.048
## - HS.Grad 1 3.495 26.797 -21.187
## - Murder 1 34.739 58.041 17.456
##
## Step: AIC=-28.16
## Life.Exp ~ Population + Murder + HS.Grad + Frost
##
## Df Sum of Sq RSS AIC
## <none> 23.308 -28.161
## + Income 1 0.006 23.302 -26.174
## + Illiteracy 1 0.004 23.304 -26.170
## + Area 1 0.001 23.307 -26.163
## - Population 1 2.064 25.372 -25.920
## - Frost 1 3.122 26.430 -23.877
## - HS.Grad 1 5.112 28.420 -20.246
## - Murder 1 34.816 58.124 15.528
##
## Call:
## lm(formula = Life.Exp ~ Population + Murder + HS.Grad + Frost,
## data = statedata)
##
## Coefficients:
## (Intercept) Population Murder HS.Grad Frost
## 7.103e+01 5.014e-05 -3.001e-01 4.658e-02 -5.943e-03
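If only the final AIC-selected model is needed, the step-by-step trace can be suppressed and the result stored:
# Same stepwise search, but silent; g.aic holds the final lm fit
g.aic = step(g, direction="both", trace=0)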
step(g, direction="both", k=log(n))
## Start: AIC=-6.89
## Life.Exp ~ Population + Income + Illiteracy + Murder + HS.Grad +
## Frost + Area
##
## Df Sum of Sq RSS AIC
## - Area 1 0.0011 23.298 -10.7981
## - Income 1 0.0044 23.302 -10.7910
## - Illiteracy 1 0.0047 23.302 -10.7903
## - Population 1 1.7472 25.044 -7.1846
## - Frost 1 1.8466 25.144 -6.9866
## <none> 23.297 -6.8884
## - HS.Grad 1 2.4413 25.738 -5.8178
## - Murder 1 23.1411 46.438 23.6891
##
## Step: AIC=-10.8
## Life.Exp ~ Population + Income + Illiteracy + Murder + HS.Grad +
## Frost
##
## Df Sum of Sq RSS AIC
## - Illiteracy 1 0.0038 23.302 -14.7021
## - Income 1 0.0059 23.304 -14.6975
## - Population 1 1.7599 25.058 -11.0691
## <none> 23.298 -10.7981
## - Frost 1 2.0488 25.347 -10.4960
## - HS.Grad 1 2.9804 26.279 -8.6912
## + Area 1 0.0011 23.297 -6.8884
## - Murder 1 26.2721 49.570 23.0406
##
## Step: AIC=-14.7
## Life.Exp ~ Population + Income + Murder + HS.Grad + Frost
##
## Df Sum of Sq RSS AIC
## - Income 1 0.006 23.308 -18.601
## - Population 1 1.887 25.189 -14.720
## <none> 23.302 -14.702
## - Frost 1 3.037 26.339 -12.488
## - HS.Grad 1 3.495 26.797 -11.627
## + Illiteracy 1 0.004 23.298 -10.798
## + Area 1 0.000 23.302 -10.790
## - Murder 1 34.739 58.041 27.017
##
## Step: AIC=-18.6
## Life.Exp ~ Population + Murder + HS.Grad + Frost
##
## Df Sum of Sq RSS AIC
## <none> 23.308 -18.601
## - Population 1 2.064 25.372 -18.271
## - Frost 1 3.122 26.430 -16.228
## + Income 1 0.006 23.302 -14.702
## + Illiteracy 1 0.004 23.304 -14.697
## + Area 1 0.001 23.307 -14.691
## - HS.Grad 1 5.112 28.420 -12.598
## - Murder 1 34.816 58.124 23.176
##
## Call:
## lm(formula = Life.Exp ~ Population + Murder + HS.Grad + Frost,
## data = statedata)
##
## Coefficients:
## (Intercept) Population Murder HS.Grad Frost
## 7.103e+01 5.014e-05 -3.001e-01 4.658e-02 -5.943e-03
# Repeat the exhaustive search without AK (row 2 of statedata)
b = regsubsets(Life.Exp ~ ., data=statedata[-2, ])
rs = summary(b)
n = 49; msize = 2:8;
par(mfrow=c(2,2))
plot(msize, rs$adjr2, xlab="No. of Parameters", ylab = "Adjusted R-square");
plot(msize, rs$cp, xlab="No. of Parameters", ylab = "Mallow's Cp");
Aic = n*log(rs$rss/n) + 2*msize;
Bic = n*log(rs$rss/n) + msize*log(n);
plot(msize, Aic, xlab="No. of Parameters", ylab = "AIC");
plot(msize, Bic, xlab="No. of Parameters", ylab = "BIC")
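The same lookup as before identifies the AIC- and BIC-optimal subsets once AK is removed (output not shown here):
# Best subsets without AK, by AIC and by BIC
rs$which[which.min(Aic), ]
rs$which[which.min(Bic), ]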