This is actually NOT a good example for regression, since the response is binary rather than numeric. I use it to illustrate the point that less can be better for prediction (less = fewer predictors).
# Load the training (tr14) and test (test14) digit data
load(file="dig14.Rdata")
ntrain=dim(tr14)[1]
ntest = dim(test14)[1]
# Display 12 randomly chosen training images (each is 16 x 16 pixels)
par(mfrow=c(3,4))
for(i in 1:12){
  tmp = matrix(as.numeric(tr14[sample(ntrain,1), 2:257]), 16, 16)
  tmp = tmp[, 16:1]   # flip the columns so the digit is displayed upright
  image(tmp, col=gray((32:0)/32))
}
# ggplot2 alternative (sketch; assumes the matrix has been reshaped into a long
# data frame with a 'rescale' column, which is not done here):
# ggplot(tmp) + geom_tile(aes(fill = rescale), colour = "white") +
#   scale_fill_gradient(low = "white", high = "steelblue")
# Recode the response as a 0/1 indicator (1 if the digit is 1, 0 otherwise)
tr14$Y = ifelse(tr14$Y==1, 1, 0)
test14$Y = ifelse(test14$Y==1, 1, 0)
# Fit a linear model using all 256 pixel values as predictors
g0 = lm(Y ~ ., data=tr14)
summary(g0)
##
## Call:
## lm(formula = Y ~ ., data = tr14)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.70832 -0.02442 0.00000 0.02902 0.24802
##
## Coefficients: (8 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.941e+02 2.862e+02 -0.678 0.498386
## V1 2.471e+03 1.240e+03 1.993 0.047568 *
## V2 -1.934e+02 9.913e+01 -1.951 0.052357 .
## V3 2.155e+00 1.771e+00 1.217 0.225069
## V4 2.769e-01 5.004e-01 0.553 0.580597
## V5 -1.152e-01 1.631e-01 -0.706 0.480924
## V6 7.183e-02 9.576e-02 0.750 0.454006
## V7 2.429e-02 2.747e-02 0.884 0.377601
## ......
## V235 3.741e-02 8.855e-02 0.422 0.673124
## V236 1.857e-01 1.915e-01 0.970 0.333225
## V237 -1.061e+00 6.006e-01 -1.767 0.078612 .
## V238 -2.288e+00 2.361e+00 -0.969 0.333452
## V239 NA NA NA NA
## V240 NA NA NA NA
## V241 NA NA NA NA
## V242 NA NA NA NA
## V243 8.671e+01 4.078e+01 2.127 0.034601 *
## V244 -5.857e+00 3.520e+00 -1.664 0.097580 .
## V245 -2.236e+00 1.009e+00 -2.216 0.027759 *
## V246 1.164e-02 1.986e-01 0.059 0.953306
## V247 8.271e-03 4.320e-02 0.191 0.848339
## V248 -1.827e-02 2.720e-02 -0.672 0.502574
## V249 -4.287e-02 2.359e-02 -1.818 0.070514 .
## V250 2.795e-02 2.622e-02 1.066 0.287729
## V251 9.692e-02 6.094e-02 1.590 0.113226
## V252 -1.629e-02 8.905e-02 -0.183 0.855024
## V253 1.707e+00 6.381e-01 2.675 0.008044 **
## V254 -8.636e+00 3.440e+00 -2.511 0.012791 *
## V255 NA NA NA NA
## V256 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09679 on 215 degrees of freedom
## Multiple R-squared: 0.9823, Adjusted R-squared: 0.9619
## F-statistic: 48.11 on 248 and 215 DF, p-value: < 2.2e-16
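As noted at the top, the response is binary, so lm is being used here as a linear probability model. The more natural alternative is logistic regression; a minimal sketch follows (not used in the rest of this analysis, and with this many correlated pixel predictors glm may warn about fitted probabilities of 0 or 1).
# Logistic-regression alternative (sketch only)
g0.logit = glm(Y ~ ., data=tr14, family=binomial)
tr14.prob = predict(g0.logit, type="response")   # fitted class-1 probabilities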
Compute the training error and test error.
tr14.pred = g0$fitted>0.5
table(tr14.pred, tr14$Y)
##
## tr14.pred 0 1
## FALSE 199 0
## TRUE 1 264
training.err = mean(tr14.pred != tr14$Y)
training.err
## [1] 0.002155172
te14.pred = predict(g0, newdata=test14)>0.5;
table(te14.pred , test14$Y)
##
## te14.pred 0 1
## FALSE 476 3
## TRUE 176 1002
test.err = mean(te14.pred != test14$Y)
test.err
## [1] 0.1080266
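The same threshold-at-0.5 and compare pattern is repeated for every model below. A small helper keeps it in one place (the function name class.err is mine, not part of the original code):
# Hypothetical helper: misclassification rate of an lm fit used as a classifier
class.err = function(fit, dat) {
  pred = predict(fit, newdata=dat) > 0.5
  mean(pred != dat$Y)
}
# Usage: class.err(g0, tr14); class.err(g0, test14)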
g0.coef = summary(g0)$coef
# Keep the predictors whose p-values are below 5%
g1.var = row.names(g0.coef)[g0.coef[,4] < 0.05]
g1.var
## [1] "V1" "V17" "V21" "V28" "V33" "V45" "V46" "V50" "V51" "V71"
## [11] "V87" "V88" "V89" "V97" "V100" "V105" "V114" "V123" "V127" "V141"
## [21] "V152" "V158" "V159" "V160" "V161" "V163" "V168" "V173" "V174" "V180"
## [31] "V186" "V190" "V191" "V197" "V203" "V222" "V243" "V245" "V253" "V254"
m = length(g1.var)
g1.model=paste("Y ~", g1.var[1])
g1.model
## [1] "Y ~ V1"
for(j in 2:m) g1.model=paste(g1.model, " + ", g1.var[j], sep="");
g1.model
## [1] "Y ~ V1 + V17 + V21 + V28 + V33 + V45 + V46 + V50 + V51 + V71 + V87 + V88 + V89 + V97 + V100 + V105 + V114 + V123 + V127 + V141 + V152 + V158 + V159 + V160 + V161 + V163 + V168 + V173 + V174 + V180 + V186 + V190 + V191 + V197 + V203 + V222 + V243 + V245 + V253 + V254"
g1 = lm(g1.model, data=tr14)
summary(g1)
##
## Call:
## lm(formula = g1.model, data = tr14)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.94921 -0.05432 0.01591 0.09475 0.66311
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.547510 2.119045 0.730 0.46562
## V1 -7.179337 8.094699 -0.887 0.37563
## V17 8.439283 9.919466 0.851 0.39537
## V21 -0.089348 0.029827 -2.996 0.00290 **
## V28 -0.018962 0.024691 -0.768 0.44292
## V33 0.602115 0.919377 0.655 0.51288
## V45 0.017541 0.029218 0.600 0.54859
## V46 0.088061 0.041682 2.113 0.03521 *
## V50 -0.033818 0.084968 -0.398 0.69083
## V51 -0.042665 0.042344 -1.008 0.31423
## V71 0.055050 0.030465 1.807 0.07147 .
## V87 -0.154790 0.030390 -5.093 5.31e-07 ***
## V88 0.213171 0.024374 8.746 < 2e-16 ***
## V89 0.070630 0.036625 1.928 0.05447 .
## V97 -0.045498 0.061490 -0.740 0.45976
## V100 -0.136781 0.022114 -6.185 1.46e-09 ***
## V105 -0.004537 0.032680 -0.139 0.88965
## V114 -0.033375 0.038467 -0.868 0.38610
## V123 -0.136923 0.022895 -5.980 4.74e-09 ***
## V127 -0.046560 0.053718 -0.867 0.38657
## V141 -0.018986 0.027985 -0.678 0.49785
## V152 -0.075270 0.025924 -2.904 0.00388 **
## V158 0.006603 0.054221 0.122 0.90313
## V159 -0.012274 0.079631 -0.154 0.87758
## V160 0.082083 0.121024 0.678 0.49799
## V161 -0.125155 0.101727 -1.230 0.21926
## V163 0.080202 0.045433 1.765 0.07824 .
## V168 0.132952 0.025758 5.162 3.78e-07 ***
## V173 -0.042634 0.046374 -0.919 0.35843
## V174 -0.058764 0.063548 -0.925 0.35564
## V180 -0.143194 0.053880 -2.658 0.00817 **
## V186 0.047889 0.017204 2.784 0.00562 **
## V190 -0.073812 0.096432 -0.765 0.44444
## V191 0.063848 0.134599 0.474 0.63549
## V197 -0.068569 0.056400 -1.216 0.22476
## V203 0.013134 0.024263 0.541 0.58856
## V222 -0.014410 0.072731 -0.198 0.84304
## V243 -1.137426 0.674106 -1.687 0.09228 .
## V245 0.072664 0.091163 0.797 0.42585
## V253 0.045880 0.098799 0.464 0.64262
## V254 0.843130 0.504645 1.671 0.09551 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2055 on 423 degrees of freedom
## Multiple R-squared: 0.843, Adjusted R-squared: 0.8282
## F-statistic: 56.78 on 40 and 423 DF, p-value: < 2.2e-16
Compute the training error and test error.
tr14.pred = g1$fitted>0.5
table(tr14.pred, tr14$Y)
##
## tr14.pred 0 1
## FALSE 190 4
## TRUE 10 260
training.err = mean(tr14.pred != tr14$Y)
training.err
## [1] 0.03017241
te14.pred = predict(g1, newdata=test14)>0.5;
table(te14.pred , test14$Y)
##
## te14.pred 0 1
## FALSE 579 2
## TRUE 73 1003
test.err=mean(te14.pred != test14$Y)
test.err
## [1] 0.04526252
g1.coef=summary(g1)$coef
g2.var = row.names(g1.coef)[g1.coef[,4]<0.05]
g2.var
## [1] "V21" "V46" "V87" "V88" "V100" "V123" "V152" "V168" "V180" "V186"
m = length(g2.var)
g2.model=paste("Y ~", g2.var[1])
for(j in 2:m) g2.model=paste(g2.model, " + ", g2.var[j], sep="");
g2.model
## [1] "Y ~ V21 + V46 + V87 + V88 + V100 + V123 + V152 + V168 + V180 + V186"
g2 = lm(g2.model, data=tr14)
summary(g2)
##
## Call:
## lm(formula = g2.model, data = tr14)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.90122 -0.06603 0.01198 0.11272 0.71738
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.08005 0.03929 2.038 0.042174 *
## V21 -0.08565 0.02330 -3.676 0.000266 ***
## V46 0.02471 0.02441 1.012 0.311931
## V87 -0.13147 0.01688 -7.787 4.70e-14 ***
## V88 0.25273 0.02071 12.202 < 2e-16 ***
## V100 -0.16534 0.01815 -9.108 < 2e-16 ***
## V123 -0.14689 0.01973 -7.446 4.91e-13 ***
## V152 -0.06082 0.02236 -2.720 0.006772 **
## V168 0.12940 0.02183 5.927 6.12e-09 ***
## V180 -0.19545 0.03121 -6.263 8.79e-10 ***
## V186 0.04772 0.01606 2.971 0.003131 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2101 on 453 degrees of freedom
## Multiple R-squared: 0.8242, Adjusted R-squared: 0.8203
## F-statistic: 212.4 on 10 and 453 DF, p-value: < 2.2e-16
Compute the training error and test error.
tr14.pred = g2$fitted>0.5
table(tr14.pred, tr14$Y)
##
## tr14.pred 0 1
## FALSE 188 8
## TRUE 12 256
training.err = mean(tr14.pred != tr14$Y)
training.err
## [1] 0.04310345
te14.pred = predict(g2, newdata=test14)>0.5;
table(te14.pred , test14$Y)
##
## te14.pred 0 1
## FALSE 591 4
## TRUE 61 1001
test.err=mean(te14.pred != test14$Y)
test.err
## [1] 0.03922752
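Collecting the error rates reported above (rounded) makes the point of this example explicit: the model with only 10 pixels predicts best on the test set even though its training error is the largest.
# Training/test error summary (values copied from the output above)
data.frame(model = c("g0 (all 256 pixels)", "g1 (40 pixels)", "g2 (10 pixels)"),
           train.err = c(0.0022, 0.0302, 0.0431),
           test.err  = c(0.1080, 0.0453, 0.0392))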
# 2 x 3 layout: the first four training images, followed by the pixel maps for g1 and g2
par(mfrow=c(2,3))
for(i in 1:4){
  tmp = matrix(as.numeric(tr14[i, 2:257]), 16, 16)
  tmp = tmp[, 16:1]
  image(tmp, col=gray((32:0)/32))
}
# Mark the pixels (predictors) retained by g1; unselected pixels are set to NA (blank)
var.id = rep(0, 256)
var.id[names(tr14)[-1] %in% g1.var] = 1
var.id[var.id==0] = NA
tmp = matrix(var.id, 16, 16)
tmp = tmp[, 16:1]
image(tmp, col=gray((32:0)/32))

# Same map for the pixels retained by g2
var.id = rep(0, 256)
var.id[names(tr14)[-1] %in% g2.var] = 1
var.id[var.id==0] = NA
tmp = matrix(var.id, 16, 16)
tmp = tmp[, 16:1]
image(tmp, col=gray((32:0)/32))
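The two pixel maps can also be summarized by a simple count, which just recovers the sizes of the two selected sets (40 and 10 pixels):
# Number of pixels (predictors) retained by each reduced model
c(g1 = length(g1.var), g2 = length(g2.var))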
library(faraway)
# ?state.x77
statedata=data.frame(state.x77,row.names=state.abb,check.names=T)
g = lm(Life.Exp ~., data=statedata)
summary(g)
##
## Call:
## lm(formula = Life.Exp ~ ., data = statedata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.48895 -0.51232 -0.02747 0.57002 1.49447
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.094e+01 1.748e+00 40.586 < 2e-16 ***
## Population 5.180e-05 2.919e-05 1.775 0.0832 .
## Income -2.180e-05 2.444e-04 -0.089 0.9293
## Illiteracy 3.382e-02 3.663e-01 0.092 0.9269
## Murder -3.011e-01 4.662e-02 -6.459 8.68e-08 ***
## HS.Grad 4.893e-02 2.332e-02 2.098 0.0420 *
## Frost -5.735e-03 3.143e-03 -1.825 0.0752 .
## Area -7.383e-08 1.668e-06 -0.044 0.9649
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7448 on 42 degrees of freedom
## Multiple R-squared: 0.7362, Adjusted R-squared: 0.6922
## F-statistic: 16.74 on 7 and 42 DF, p-value: 2.534e-10
n=dim(statedata)[1]
# Largest absolute studentized residuals (possible outliers)
sort(abs(rstudent(g)), decreasing=TRUE)[1:5]
## HI ME SC AK AR
## 2.735242 2.232206 1.615402 1.606163 1.562068
qt(0.05/(n*2), 41)   # Bonferroni-corrected critical value for the outlier test
## [1] -3.544184
cook = cooks.distance(g)
max(cook)
## [1] 1.320804
#sort(cook)
halfnorm(cook)
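To see which state the largest Cook's distance belongs to:
which.max(cook)   # returns the state name and its index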
Let's first keep AK in our model, and then repeat the analysis without AK.
Load the leaps package.
library(leaps)
b=regsubsets(Life.Exp~., data = statedata)
rs = summary(b)
rs$which
## (Intercept) Population Income Illiteracy Murder HS.Grad Frost Area
## 1 TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## 2 TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE
## 3 TRUE FALSE FALSE FALSE TRUE TRUE TRUE FALSE
## 4 TRUE TRUE FALSE FALSE TRUE TRUE TRUE FALSE
## 5 TRUE TRUE TRUE FALSE TRUE TRUE TRUE FALSE
## 6 TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## 7 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
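leaps also provides a plot method for regsubsets objects that displays the same table graphically; an optional quick check (not used in what follows):
# Graphical view of the selected subsets, ordered by BIC
plot(b, scale="bic")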
n=dim(statedata)[1]; msize = 2:8;   # msize = number of coefficients, including the intercept
par(mfrow=c(2,2))
plot(msize, rs$adjr2, xlab="No. of Parameters", ylab = "Adjusted R-square");
plot(msize, rs$cp, xlab="No. of Parameters", ylab = "Mallow's Cp");
Aic = n*log(rs$rss/n) + 2*msize;
Bic = n*log(rs$rss/n) + msize*log(n);
plot(msize, Aic, xlab="No. of Parameters", ylab = "AIC");
plot(msize, Bic, xlab="No. of Parameters", ylab = "BIC");
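The AIC formula used above, n*log(RSS/n) + 2*(number of coefficients), is the same one step() reports below; as a quick sanity check on the full model g:
# Should agree with the "Start: AIC=-22.18" line in the step() output below
extractAIC(g)   # returns (equivalent df, AIC on the n*log(RSS/n) scale)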
rs$which[which.min(Aic),]
## (Intercept) Population Income Illiteracy Murder HS.Grad
## TRUE TRUE FALSE FALSE TRUE TRUE
## Frost Area
## TRUE FALSE
select.var = colnames(rs$which)[rs$which[which.min(Aic),]]
select.var = select.var[-1]
myfit = lm(Life.Exp ~ . , data=statedata[, c(select.var, "Life.Exp")])
summary(myfit)
##
## Call:
## lm(formula = Life.Exp ~ ., data = statedata[, c(select.var, "Life.Exp")])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.47095 -0.53464 -0.03701 0.57621 1.50683
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.103e+01 9.529e-01 74.542 < 2e-16 ***
## Population 5.014e-05 2.512e-05 1.996 0.05201 .
## Murder -3.001e-01 3.661e-02 -8.199 1.77e-10 ***
## HS.Grad 4.658e-02 1.483e-02 3.142 0.00297 **
## Frost -5.943e-03 2.421e-03 -2.455 0.01802 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7197 on 45 degrees of freedom
## Multiple R-squared: 0.736, Adjusted R-squared: 0.7126
## F-statistic: 31.37 on 4 and 45 DF, p-value: 1.696e-12
leaps does not return AIC, but it does return BIC. Its BIC differs from what I computed above, but the difference is a constant, so the two BIC formulae (mine and the one used by leaps) are essentially the same.
names(rs)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
cbind(rs$bic, Bic,rs$bic-Bic )
## Bic
## [1,] -39.22051 -10.78522 -28.43529
## [2,] -42.62472 -14.18943 -28.43529
## [3,] -46.70678 -18.27149 -28.43529
## [4,] -47.03640 -18.60111 -28.43529
## [5,] -43.13738 -14.70209 -28.43529
## [6,] -39.23342 -10.79813 -28.43529
## [7,] -35.32373 -6.88844 -28.43529
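Because the difference is a constant, both versions of BIC are minimized by the same subset (the 4-predictor model in row 4), which can be confirmed directly:
which.min(rs$bic)            # row of the best model under leaps' BIC
which.min(Bic)               # same row under the BIC computed above
rs$which[which.min(Bic), ]   # Population, Murder, HS.Grad, Frost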
step(g, direction="both")
## Start: AIC=-22.18
## Life.Exp ~ Population + Income + Illiteracy + Murder + HS.Grad +
## Frost + Area
##
## Df Sum of Sq RSS AIC
## - Area 1 0.0011 23.298 -24.182
## - Income 1 0.0044 23.302 -24.175
## - Illiteracy 1 0.0047 23.302 -24.174
## <none> 23.297 -22.185
## - Population 1 1.7472 25.044 -20.569
## - Frost 1 1.8466 25.144 -20.371
## - HS.Grad 1 2.4413 25.738 -19.202
## - Murder 1 23.1411 46.438 10.305
##
## Step: AIC=-24.18
## Life.Exp ~ Population + Income + Illiteracy + Murder + HS.Grad +
## Frost
##
## Df Sum of Sq RSS AIC
## - Illiteracy 1 0.0038 23.302 -26.174
## - Income 1 0.0059 23.304 -26.170
## <none> 23.298 -24.182
## - Population 1 1.7599 25.058 -22.541
## + Area 1 0.0011 23.297 -22.185
## - Frost 1 2.0488 25.347 -21.968
## - HS.Grad 1 2.9804 26.279 -20.163
## - Murder 1 26.2721 49.570 11.569
##
## Step: AIC=-26.17
## Life.Exp ~ Population + Income + Murder + HS.Grad + Frost
##
## Df Sum of Sq RSS AIC
## - Income 1 0.006 23.308 -28.161
## <none> 23.302 -26.174
## - Population 1 1.887 25.189 -24.280
## + Illiteracy 1 0.004 23.298 -24.182
## + Area 1 0.000 23.302 -24.174
## - Frost 1 3.037 26.339 -22.048
## - HS.Grad 1 3.495 26.797 -21.187
## - Murder 1 34.739 58.041 17.456
##
## Step: AIC=-28.16
## Life.Exp ~ Population + Murder + HS.Grad + Frost
##
## Df Sum of Sq RSS AIC
## <none> 23.308 -28.161
## + Income 1 0.006 23.302 -26.174
## + Illiteracy 1 0.004 23.304 -26.170
## + Area 1 0.001 23.307 -26.163
## - Population 1 2.064 25.372 -25.920
## - Frost 1 3.122 26.430 -23.877
## - HS.Grad 1 5.112 28.420 -20.246
## - Murder 1 34.816 58.124 15.528
##
## Call:
## lm(formula = Life.Exp ~ Population + Murder + HS.Grad + Frost,
## data = statedata)
##
## Coefficients:
## (Intercept) Population Murder HS.Grad Frost
## 7.103e+01 5.014e-05 -3.001e-01 4.658e-02 -5.943e-03
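If only the final AIC-selected model is needed, the step-by-step trace can be suppressed and the result stored:
# Same stepwise search, but silent; g.aic holds the final lm fit
g.aic = step(g, direction="both", trace=0)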
step(g, direction="both", k=log(n))
## Start: AIC=-6.89
## Life.Exp ~ Population + Income + Illiteracy + Murder + HS.Grad +
## Frost + Area
##
## Df Sum of Sq RSS AIC
## - Area 1 0.0011 23.298 -10.7981
## - Income 1 0.0044 23.302 -10.7910
## - Illiteracy 1 0.0047 23.302 -10.7903
## - Population 1 1.7472 25.044 -7.1846
## - Frost 1 1.8466 25.144 -6.9866
## <none> 23.297 -6.8884
## - HS.Grad 1 2.4413 25.738 -5.8178
## - Murder 1 23.1411 46.438 23.6891
##
## Step: AIC=-10.8
## Life.Exp ~ Population + Income + Illiteracy + Murder + HS.Grad +
## Frost
##
## Df Sum of Sq RSS AIC
## - Illiteracy 1 0.0038 23.302 -14.7021
## - Income 1 0.0059 23.304 -14.6975
## - Population 1 1.7599 25.058 -11.0691
## <none> 23.298 -10.7981
## - Frost 1 2.0488 25.347 -10.4960
## - HS.Grad 1 2.9804 26.279 -8.6912
## + Area 1 0.0011 23.297 -6.8884
## - Murder 1 26.2721 49.570 23.0406
##
## Step: AIC=-14.7
## Life.Exp ~ Population + Income + Murder + HS.Grad + Frost
##
## Df Sum of Sq RSS AIC
## - Income 1 0.006 23.308 -18.601
## - Population 1 1.887 25.189 -14.720
## <none> 23.302 -14.702
## - Frost 1 3.037 26.339 -12.488
## - HS.Grad 1 3.495 26.797 -11.627
## + Illiteracy 1 0.004 23.298 -10.798
## + Area 1 0.000 23.302 -10.790
## - Murder 1 34.739 58.041 27.017
##
## Step: AIC=-18.6
## Life.Exp ~ Population + Murder + HS.Grad + Frost
##
## Df Sum of Sq RSS AIC
## <none> 23.308 -18.601
## - Population 1 2.064 25.372 -18.271
## - Frost 1 3.122 26.430 -16.228
## + Income 1 0.006 23.302 -14.702
## + Illiteracy 1 0.004 23.304 -14.697
## + Area 1 0.001 23.307 -14.691
## - HS.Grad 1 5.112 28.420 -12.598
## - Murder 1 34.816 58.124 23.176
##
## Call:
## lm(formula = Life.Exp ~ Population + Murder + HS.Grad + Frost,
## data = statedata)
##
## Coefficients:
## (Intercept) Population Murder HS.Grad Frost
## 7.103e+01 5.014e-05 -3.001e-01 4.658e-02 -5.943e-03
# Repeat the exhaustive search without AK (row 2 of statedata)
b = regsubsets(Life.Exp ~ ., data=statedata[-2, ])
rs = summary(b)
n = 49; msize = 2:8;
par(mfrow=c(2,2))
plot(msize, rs$adjr2, xlab="No. of Parameters", ylab = "Adjusted R-square");
plot(msize, rs$cp, xlab="No. of Parameters", ylab = "Mallow's Cp");
Aic = n*log(rs$rss/n) + 2*msize;
Bic = n*log(rs$rss/n) + msize*log(n);
plot(msize, Aic, xlab="No. of Parameters", ylab = "AIC");
plot(msize, Bic, xlab="No. of Parameters", ylab = "BIC")
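The same lookup as before identifies the AIC- and BIC-optimal subsets once AK is removed (output not shown here):
# Best subsets without AK, by AIC and by BIC
rs$which[which.min(Aic), ]
rs$which[which.min(Bic), ]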