Digits Data

This is actually NOT a good example for regression, since the response is binary rather than numeric. I use it to illustrate the point that less can be better for prediction (less = fewer predictors).

load(file="dig14.Rata")
ntrain=dim(tr14)[1]
ntest = dim(test14)[1]
par(mfrow=c(3,4))
# Display 12 randomly chosen training images (the pixels are in columns 2:257)
for(i in 1:12){
 tmp = matrix(as.numeric(tr14[sample(ntrain, 1), 2:257]), 16, 16)
 tmp = tmp[, 16:1]   # flip the columns so the digit displays upright
 image(tmp, col = gray((32:0)/32))
}

# Alternative rendering of one image with ggplot2: a sketch that assumes the
# pixel matrix is first reshaped to long format (columns x, y, rescale).
library(ggplot2); library(reshape2)
tmp.long = melt(tmp, varnames = c("x", "y"), value.name = "rescale")
ggplot(tmp.long) + geom_tile(aes(x, y, fill = rescale), colour = "white") +
  scale_fill_gradient(low = "white", high = "steelblue")

Use the 256 features (each corresponding to a pixel) to train a regression model to predict Y, which is recoded as 1 (if the digit is 1) or 0 (if the digit is 4).

tr14$Y = ifelse(tr14$Y==1, 1, 0);
test14$Y = ifelse(test14$Y==1, 1, 0);
g0=lm(Y~., data=tr14);
summary(g0)
## 
## Call:
## lm(formula = Y ~ ., data = tr14)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.70832 -0.02442  0.00000  0.02902  0.24802 
## 
## Coefficients: (8 not defined because of singularities)
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.941e+02  2.862e+02  -0.678 0.498386    
## V1           2.471e+03  1.240e+03   1.993 0.047568 *  
## V2          -1.934e+02  9.913e+01  -1.951 0.052357 .  
## V3           2.155e+00  1.771e+00   1.217 0.225069    
## V4           2.769e-01  5.004e-01   0.553 0.580597    
## V5          -1.152e-01  1.631e-01  -0.706 0.480924    
## V6           7.183e-02  9.576e-02   0.750 0.454006    
## V7           2.429e-02  2.747e-02   0.884 0.377601    
## ......
## V235         3.741e-02  8.855e-02   0.422 0.673124    
## V236         1.857e-01  1.915e-01   0.970 0.333225    
## V237        -1.061e+00  6.006e-01  -1.767 0.078612 .  
## V238        -2.288e+00  2.361e+00  -0.969 0.333452    
## V239                NA         NA      NA       NA    
## V240                NA         NA      NA       NA    
## V241                NA         NA      NA       NA    
## V242                NA         NA      NA       NA    
## V243         8.671e+01  4.078e+01   2.127 0.034601 *  
## V244        -5.857e+00  3.520e+00  -1.664 0.097580 .  
## V245        -2.236e+00  1.009e+00  -2.216 0.027759 *  
## V246         1.164e-02  1.986e-01   0.059 0.953306    
## V247         8.271e-03  4.320e-02   0.191 0.848339    
## V248        -1.827e-02  2.720e-02  -0.672 0.502574    
## V249        -4.287e-02  2.359e-02  -1.818 0.070514 .  
## V250         2.795e-02  2.622e-02   1.066 0.287729    
## V251         9.692e-02  6.094e-02   1.590 0.113226    
## V252        -1.629e-02  8.905e-02  -0.183 0.855024    
## V253         1.707e+00  6.381e-01   2.675 0.008044 ** 
## V254        -8.636e+00  3.440e+00  -2.511 0.012791 *  
## V255                NA         NA      NA       NA    
## V256                NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09679 on 215 degrees of freedom
## Multiple R-squared:  0.9823, Adjusted R-squared:  0.9619 
## F-statistic: 48.11 on 248 and 215 DF,  p-value: < 2.2e-16

Compute the training error and test error.

tr14.pred = g0$fitted > 0.5   # classify as digit 1 when the fitted value exceeds 0.5
table(tr14.pred, tr14$Y)
##          
## tr14.pred   0   1
##     FALSE 199   0
##     TRUE    1 264
training.err = mean(tr14.pred != tr14$Y)
training.err
## [1] 0.002155172
te14.pred = predict(g0, newdata=test14)>0.5;
table(te14.pred , test14$Y)
##          
## te14.pred    0    1
##     FALSE  476    3
##     TRUE   176 1002
test.err = mean(te14.pred != test14$Y)
test.err
## [1] 0.1080266
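
The linear model is used as a classifier via the 0.5 cutoff. Since the response is binary, logistic regression would be the more natural model; a minimal sketch for comparison, not part of the analysis above (with 256 pixel predictors and only a few hundred training images, expect glm to warn about separation and a rank-deficient fit):

# Sketch: logistic regression on the same features (warnings expected)
glm0 = glm(Y ~ ., data = tr14, family = binomial)
glm.pred = predict(glm0, newdata = test14, type = "response") > 0.5
mean(glm.pred != test14$Y)   # test misclassification error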

An ad hoc variable selection approach: refit the linear model keeping only the features whose p-values are below 5%.

g0.coef=summary(g0)$coef
g1.var = row.names(g0.coef)[g0.coef[,4]<0.05]
g1.var
##  [1] "V1"   "V17"  "V21"  "V28"  "V33"  "V45"  "V46"  "V50"  "V51"  "V71" 
## [11] "V87"  "V88"  "V89"  "V97"  "V100" "V105" "V114" "V123" "V127" "V141"
## [21] "V152" "V158" "V159" "V160" "V161" "V163" "V168" "V173" "V174" "V180"
## [31] "V186" "V190" "V191" "V197" "V203" "V222" "V243" "V245" "V253" "V254"
m = length(g1.var)
g1.model=paste("Y ~", g1.var[1])
g1.model
## [1] "Y ~ V1"
for(j in 2:m) g1.model=paste(g1.model, " + ", g1.var[j], sep="");
g1.model
## [1] "Y ~ V1 + V17 + V21 + V28 + V33 + V45 + V46 + V50 + V51 + V71 + V87 + V88 + V89 + V97 + V100 + V105 + V114 + V123 + V127 + V141 + V152 + V158 + V159 + V160 + V161 + V163 + V168 + V173 + V174 + V180 + V186 + V190 + V191 + V197 + V203 + V222 + V243 + V245 + V253 + V254"
g1 = lm(g1.model, data=tr14)
summary(g1)
## 
## Call:
## lm(formula = g1.model, data = tr14)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.94921 -0.05432  0.01591  0.09475  0.66311 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.547510   2.119045   0.730  0.46562    
## V1          -7.179337   8.094699  -0.887  0.37563    
## V17          8.439283   9.919466   0.851  0.39537    
## V21         -0.089348   0.029827  -2.996  0.00290 ** 
## V28         -0.018962   0.024691  -0.768  0.44292    
## V33          0.602115   0.919377   0.655  0.51288    
## V45          0.017541   0.029218   0.600  0.54859    
## V46          0.088061   0.041682   2.113  0.03521 *  
## V50         -0.033818   0.084968  -0.398  0.69083    
## V51         -0.042665   0.042344  -1.008  0.31423    
## V71          0.055050   0.030465   1.807  0.07147 .  
## V87         -0.154790   0.030390  -5.093 5.31e-07 ***
## V88          0.213171   0.024374   8.746  < 2e-16 ***
## V89          0.070630   0.036625   1.928  0.05447 .  
## V97         -0.045498   0.061490  -0.740  0.45976    
## V100        -0.136781   0.022114  -6.185 1.46e-09 ***
## V105        -0.004537   0.032680  -0.139  0.88965    
## V114        -0.033375   0.038467  -0.868  0.38610    
## V123        -0.136923   0.022895  -5.980 4.74e-09 ***
## V127        -0.046560   0.053718  -0.867  0.38657    
## V141        -0.018986   0.027985  -0.678  0.49785    
## V152        -0.075270   0.025924  -2.904  0.00388 ** 
## V158         0.006603   0.054221   0.122  0.90313    
## V159        -0.012274   0.079631  -0.154  0.87758    
## V160         0.082083   0.121024   0.678  0.49799    
## V161        -0.125155   0.101727  -1.230  0.21926    
## V163         0.080202   0.045433   1.765  0.07824 .  
## V168         0.132952   0.025758   5.162 3.78e-07 ***
## V173        -0.042634   0.046374  -0.919  0.35843    
## V174        -0.058764   0.063548  -0.925  0.35564    
## V180        -0.143194   0.053880  -2.658  0.00817 ** 
## V186         0.047889   0.017204   2.784  0.00562 ** 
## V190        -0.073812   0.096432  -0.765  0.44444    
## V191         0.063848   0.134599   0.474  0.63549    
## V197        -0.068569   0.056400  -1.216  0.22476    
## V203         0.013134   0.024263   0.541  0.58856    
## V222        -0.014410   0.072731  -0.198  0.84304    
## V243        -1.137426   0.674106  -1.687  0.09228 .  
## V245         0.072664   0.091163   0.797  0.42585    
## V253         0.045880   0.098799   0.464  0.64262    
## V254         0.843130   0.504645   1.671  0.09551 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2055 on 423 degrees of freedom
## Multiple R-squared:  0.843,  Adjusted R-squared:  0.8282 
## F-statistic: 56.78 on 40 and 423 DF,  p-value: < 2.2e-16

Compute the training error and test error.

tr14.pred = g1$fitted>0.5
table(tr14.pred, tr14$Y)
##          
## tr14.pred   0   1
##     FALSE 190   4
##     TRUE   10 260
training.err = mean(tr14.pred != tr14$Y)
training.err
## [1] 0.03017241
te14.pred = predict(g1, newdata=test14)>0.5;
table(te14.pred , test14$Y)
##          
## te14.pred    0    1
##     FALSE  579    2
##     TRUE    73 1003
test.err=mean(te14.pred != test14$Y)
test.err
## [1] 0.04526252

Reduce the model further: from the refitted model g1, again keep only the features with p-values below 5% (10 pixels remain).

g1.coef=summary(g1)$coef
g2.var = row.names(g1.coef)[g1.coef[,4]<0.05]
g2.var
##  [1] "V21"  "V46"  "V87"  "V88"  "V100" "V123" "V152" "V168" "V180" "V186"
m = length(g2.var)
g2.model=paste("Y ~", g2.var[1])
for(j in 2:m) g2.model=paste(g2.model, " + ", g2.var[j], sep="");
g2.model
## [1] "Y ~ V21 + V46 + V87 + V88 + V100 + V123 + V152 + V168 + V180 + V186"
g2 = lm(g2.model, data=tr14)
summary(g2)
## 
## Call:
## lm(formula = g2.model, data = tr14)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.90122 -0.06603  0.01198  0.11272  0.71738 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.08005    0.03929   2.038 0.042174 *  
## V21         -0.08565    0.02330  -3.676 0.000266 ***
## V46          0.02471    0.02441   1.012 0.311931    
## V87         -0.13147    0.01688  -7.787 4.70e-14 ***
## V88          0.25273    0.02071  12.202  < 2e-16 ***
## V100        -0.16534    0.01815  -9.108  < 2e-16 ***
## V123        -0.14689    0.01973  -7.446 4.91e-13 ***
## V152        -0.06082    0.02236  -2.720 0.006772 ** 
## V168         0.12940    0.02183   5.927 6.12e-09 ***
## V180        -0.19545    0.03121  -6.263 8.79e-10 ***
## V186         0.04772    0.01606   2.971 0.003131 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2101 on 453 degrees of freedom
## Multiple R-squared:  0.8242, Adjusted R-squared:  0.8203 
## F-statistic: 212.4 on 10 and 453 DF,  p-value: < 2.2e-16

Compute the training error and test error.

tr14.pred = g2$fitted>0.5
table(tr14.pred, tr14$Y)
##          
## tr14.pred   0   1
##     FALSE 188   8
##     TRUE   12 256
training.err = mean(tr14.pred != tr14$Y)
training.err
## [1] 0.04310345
te14.pred = predict(g2, newdata=test14)>0.5;
table(te14.pred , test14$Y)
##          
## te14.pred    0    1
##     FALSE  591    4
##     TRUE    61 1001
test.err=mean(te14.pred != test14$Y)
test.err
## [1] 0.03922752

Even though the training error goes up (0.2% to 3.0% to 4.3%), the test error drops from 10.8% (all 256 pixels) to 4.5% (40 pixels) to 3.9% (10 pixels): fewer predictors give better prediction here. Take a look at where the selected variables (i.e., pixels) are located.

par(mfrow=c(2,3))

for(i in 1:4){
 tmp=matrix(as.numeric(tr14[i,2:257]),16,16);
 tmp=tmp[,16:1];
 image(tmp,col=gray((32:0)/32));
 }

# Mark which of the 256 pixels were selected by g1 and g2; unselected pixels
# are set to NA so they show up as blank in the image.
var.id = rep(0, 256)
var.id[names(tr14)[-1] %in% g1.var] = 1
var.id[var.id==0] = NA
tmp = matrix(var.id, 16, 16)
tmp = tmp[, 16:1]
image(tmp, col=gray((32:0)/32))   # the 40 pixels selected by g1

var.id = rep(0, 256)
var.id[names(tr14)[-1] %in% g2.var] = 1
var.id[var.id==0] = NA
tmp = matrix(var.id, 16, 16)
tmp = tmp[, 16:1]
image(tmp, col=gray((32:0)/32))   # the 10 pixels selected by g2

state.x77 Data: a matrix with 50 rows (states) and 8 columns (variables)

Fit a linear regression model to predict Life Expectancy using the other 7 variables as predictors.

library(faraway)
# ?state.x77
statedata=data.frame(state.x77,row.names=state.abb,check.names=T)
g = lm(Life.Exp ~., data=statedata)
summary(g)
## 
## Call:
## lm(formula = Life.Exp ~ ., data = statedata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.48895 -0.51232 -0.02747  0.57002  1.49447 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.094e+01  1.748e+00  40.586  < 2e-16 ***
## Population   5.180e-05  2.919e-05   1.775   0.0832 .  
## Income      -2.180e-05  2.444e-04  -0.089   0.9293    
## Illiteracy   3.382e-02  3.663e-01   0.092   0.9269    
## Murder      -3.011e-01  4.662e-02  -6.459 8.68e-08 ***
## HS.Grad      4.893e-02  2.332e-02   2.098   0.0420 *  
## Frost       -5.735e-03  3.143e-03  -1.825   0.0752 .  
## Area        -7.383e-08  1.668e-06  -0.044   0.9649    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7448 on 42 degrees of freedom
## Multiple R-squared:  0.7362, Adjusted R-squared:  0.6922 
## F-statistic: 16.74 on 7 and 42 DF,  p-value: 2.534e-10

Check for outliers

n=dim(statedata)[1]
sort(abs(rstudent(g)), decreasing=TRUE)[1:5]
##       HI       ME       SC       AK       AR 
## 2.735242 2.232206 1.615402 1.606163 1.562068
qt(0.05/(n*2), 41)
## [1] -3.544184
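
The largest |studentized residual| (2.74, for HI) is well below the Bonferroni-corrected cutoff of 3.54, so no state is flagged as an outlier. A compact sketch of the same check:

which(abs(rstudent(g)) > abs(qt(0.05/(n*2), g$df.residual - 1)))   # expect none flagged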

Check for high-influence points

cook = cooks.distance(g)
max(cook)
## [1] 1.320804
#sort(cook)
halfnorm(cook)
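
The point that stands out in the half-normal plot is AK. A quick check of which state has the largest Cook's distance (AK's Area is far larger than any other state's, giving it high leverage):

names(which.max(cook))   # "AK"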

Let’s first keep AK in our model, and then repeat the analysis without AK.

Variable selection with the leaps package.

library(leaps)
b=regsubsets(Life.Exp~., data = statedata)
rs = summary(b)
rs$which   # row k shows the best model with k predictors
##   (Intercept) Population Income Illiteracy Murder HS.Grad Frost  Area
## 1        TRUE      FALSE  FALSE      FALSE   TRUE   FALSE FALSE FALSE
## 2        TRUE      FALSE  FALSE      FALSE   TRUE    TRUE FALSE FALSE
## 3        TRUE      FALSE  FALSE      FALSE   TRUE    TRUE  TRUE FALSE
## 4        TRUE       TRUE  FALSE      FALSE   TRUE    TRUE  TRUE FALSE
## 5        TRUE       TRUE   TRUE      FALSE   TRUE    TRUE  TRUE FALSE
## 6        TRUE       TRUE   TRUE       TRUE   TRUE    TRUE  TRUE FALSE
## 7        TRUE       TRUE   TRUE       TRUE   TRUE    TRUE  TRUE  TRUE
n=dim(statedata)[1]; msize = 2:8;
par(mfrow=c(2,2))
plot(msize, rs$adjr2, xlab="No. of Parameters", ylab = "Adjusted Rsquare");
plot(msize, rs$cp, xlab="No. of Parameters", ylab = "Mallow's Cp");

# Up to an additive constant, for a Gaussian linear model
# AIC = n*log(RSS/n) + 2*p and BIC = n*log(RSS/n) + p*log(n),
# where p (= msize) counts the intercept plus the selected predictors.
Aic = n*log(rs$rss/n) + 2*msize;
Bic = n*log(rs$rss/n) + msize*log(n);
plot(msize, Aic, xlab="No. of Parameters", ylab = "AIC");
plot(msize, Bic, xlab="No. of Parameters", ylab = "BIC");

For this particular data set, AIC and BIC end up selecting the same model. Sometimes, the models selected by different criteria may be different.

rs$which[which.min(Aic),]
## (Intercept)  Population      Income  Illiteracy      Murder     HS.Grad 
##        TRUE        TRUE       FALSE       FALSE        TRUE        TRUE 
##       Frost        Area 
##        TRUE       FALSE
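# BIC picks the same four predictors here, as claimed above:
rs$which[which.min(Bic),]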
select.var = colnames(rs$which)[rs$which[which.min(Aic),]]
select.var = select.var[-1]
myfit = lm(Life.Exp ~ . , data=statedata[, c(select.var, "Life.Exp")])
summary(myfit)
## 
## Call:
## lm(formula = Life.Exp ~ ., data = statedata[, c(select.var, "Life.Exp")])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.47095 -0.53464 -0.03701  0.57621  1.50683 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.103e+01  9.529e-01  74.542  < 2e-16 ***
## Population   5.014e-05  2.512e-05   1.996  0.05201 .  
## Murder      -3.001e-01  3.661e-02  -8.199 1.77e-10 ***
## HS.Grad      4.658e-02  1.483e-02   3.142  0.00297 ** 
## Frost       -5.943e-03  2.421e-03  -2.455  0.01802 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7197 on 45 degrees of freedom
## Multiple R-squared:  0.736,  Adjusted R-squared:  0.7126 
## F-statistic: 31.37 on 4 and 45 DF,  p-value: 1.696e-12

leaps does not return AIC, only BIC. Its BIC values differ from the ones computed above, but the difference is a constant (-28.44 here), so the two BIC formulas (mine and the one used by leaps) rank the models identically.

names(rs)
## [1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"
cbind(rs$bic, Bic,rs$bic-Bic )
##                      Bic          
## [1,] -39.22051 -10.78522 -28.43529
## [2,] -42.62472 -14.18943 -28.43529
## [3,] -46.70678 -18.27149 -28.43529
## [4,] -47.03640 -18.60111 -28.43529
## [5,] -43.13738 -14.70209 -28.43529
## [6,] -39.23342 -10.79813 -28.43529
## [7,] -35.32373  -6.88844 -28.43529

Stepwise selection with step(), based on AIC and BIC. The default penalty k = 2 corresponds to AIC; setting k = log(n) uses the BIC penalty instead (the printout still labels the criterion "AIC").

 step(g, direction="both")   # default k = 2: AIC penalty
## Start:  AIC=-22.18
## Life.Exp ~ Population + Income + Illiteracy + Murder + HS.Grad + 
##     Frost + Area
## 
##              Df Sum of Sq    RSS     AIC
## - Area        1    0.0011 23.298 -24.182
## - Income      1    0.0044 23.302 -24.175
## - Illiteracy  1    0.0047 23.302 -24.174
## <none>                    23.297 -22.185
## - Population  1    1.7472 25.044 -20.569
## - Frost       1    1.8466 25.144 -20.371
## - HS.Grad     1    2.4413 25.738 -19.202
## - Murder      1   23.1411 46.438  10.305
## 
## Step:  AIC=-24.18
## Life.Exp ~ Population + Income + Illiteracy + Murder + HS.Grad + 
##     Frost
## 
##              Df Sum of Sq    RSS     AIC
## - Illiteracy  1    0.0038 23.302 -26.174
## - Income      1    0.0059 23.304 -26.170
## <none>                    23.298 -24.182
## - Population  1    1.7599 25.058 -22.541
## + Area        1    0.0011 23.297 -22.185
## - Frost       1    2.0488 25.347 -21.968
## - HS.Grad     1    2.9804 26.279 -20.163
## - Murder      1   26.2721 49.570  11.569
## 
## Step:  AIC=-26.17
## Life.Exp ~ Population + Income + Murder + HS.Grad + Frost
## 
##              Df Sum of Sq    RSS     AIC
## - Income      1     0.006 23.308 -28.161
## <none>                    23.302 -26.174
## - Population  1     1.887 25.189 -24.280
## + Illiteracy  1     0.004 23.298 -24.182
## + Area        1     0.000 23.302 -24.174
## - Frost       1     3.037 26.339 -22.048
## - HS.Grad     1     3.495 26.797 -21.187
## - Murder      1    34.739 58.041  17.456
## 
## Step:  AIC=-28.16
## Life.Exp ~ Population + Murder + HS.Grad + Frost
## 
##              Df Sum of Sq    RSS     AIC
## <none>                    23.308 -28.161
## + Income      1     0.006 23.302 -26.174
## + Illiteracy  1     0.004 23.304 -26.170
## + Area        1     0.001 23.307 -26.163
## - Population  1     2.064 25.372 -25.920
## - Frost       1     3.122 26.430 -23.877
## - HS.Grad     1     5.112 28.420 -20.246
## - Murder      1    34.816 58.124  15.528
## 
## Call:
## lm(formula = Life.Exp ~ Population + Murder + HS.Grad + Frost, 
##     data = statedata)
## 
## Coefficients:
## (Intercept)   Population       Murder      HS.Grad        Frost  
##   7.103e+01    5.014e-05   -3.001e-01    4.658e-02   -5.943e-03
 step(g, direction="both", k=log(n))   # k = log(n): BIC penalty
## Start:  AIC=-6.89
## Life.Exp ~ Population + Income + Illiteracy + Murder + HS.Grad + 
##     Frost + Area
## 
##              Df Sum of Sq    RSS      AIC
## - Area        1    0.0011 23.298 -10.7981
## - Income      1    0.0044 23.302 -10.7910
## - Illiteracy  1    0.0047 23.302 -10.7903
## - Population  1    1.7472 25.044  -7.1846
## - Frost       1    1.8466 25.144  -6.9866
## <none>                    23.297  -6.8884
## - HS.Grad     1    2.4413 25.738  -5.8178
## - Murder      1   23.1411 46.438  23.6891
## 
## Step:  AIC=-10.8
## Life.Exp ~ Population + Income + Illiteracy + Murder + HS.Grad + 
##     Frost
## 
##              Df Sum of Sq    RSS      AIC
## - Illiteracy  1    0.0038 23.302 -14.7021
## - Income      1    0.0059 23.304 -14.6975
## - Population  1    1.7599 25.058 -11.0691
## <none>                    23.298 -10.7981
## - Frost       1    2.0488 25.347 -10.4960
## - HS.Grad     1    2.9804 26.279  -8.6912
## + Area        1    0.0011 23.297  -6.8884
## - Murder      1   26.2721 49.570  23.0406
## 
## Step:  AIC=-14.7
## Life.Exp ~ Population + Income + Murder + HS.Grad + Frost
## 
##              Df Sum of Sq    RSS     AIC
## - Income      1     0.006 23.308 -18.601
## - Population  1     1.887 25.189 -14.720
## <none>                    23.302 -14.702
## - Frost       1     3.037 26.339 -12.488
## - HS.Grad     1     3.495 26.797 -11.627
## + Illiteracy  1     0.004 23.298 -10.798
## + Area        1     0.000 23.302 -10.790
## - Murder      1    34.739 58.041  27.017
## 
## Step:  AIC=-18.6
## Life.Exp ~ Population + Murder + HS.Grad + Frost
## 
##              Df Sum of Sq    RSS     AIC
## <none>                    23.308 -18.601
## - Population  1     2.064 25.372 -18.271
## - Frost       1     3.122 26.430 -16.228
## + Income      1     0.006 23.302 -14.702
## + Illiteracy  1     0.004 23.304 -14.697
## + Area        1     0.001 23.307 -14.691
## - HS.Grad     1     5.112 28.420 -12.598
## - Murder      1    34.816 58.124  23.176
## 
## Call:
## lm(formula = Life.Exp ~ Population + Murder + HS.Grad + Frost, 
##     data = statedata)
## 
## Coefficients:
## (Intercept)   Population       Murder      HS.Grad        Frost  
##   7.103e+01    5.014e-05   -3.001e-01    4.658e-02   -5.943e-03

Repeat the above variable selection algorithm without AK.

b = regsubsets(Life.Exp ~., data=statedata[-2, ])
rs = summary(b)
n=49; msize = 2:8;

par(mfrow=c(2,2))
plot(msize, rs$adjr2, xlab="No. of Parameters", ylab = "Adjusted R-square");
plot(msize, rs$cp, xlab="No. of Parameters", ylab = "Mallow's Cp");
Aic = n*log(rs$rss/n) + 2*msize;
Bic = n*log(rs$rss/n) + msize*log(n);
plot(msize, Aic, xlab="No. of Parameters", ylab = "AIC");
plot(msize, Bic, xlab="No. of Parameters", ylab = "BIC")
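
As before, the AIC-minimizing subset can be extracted and refit to see whether the selected model changes once AK is removed; a sketch (output not shown):

select.var = colnames(rs$which)[rs$which[which.min(Aic),]]
select.var = select.var[-1]
myfit = lm(Life.Exp ~ ., data = statedata[-2, c(select.var, "Life.Exp")])
summary(myfit)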