The Coronary Risk‐Factor Study data involve 462 males between the ages of 15 and 64 from a heart‐disease high‐risk region of the Western Cape, South Africa. The response is “chd”, the presence (chd=1) or absence (chd=0) of coronary heart disease.
There are 9 covariates:
heart = read.table("http://statweb.stanford.edu/~tibs/ElemStatLearn/datasets/SAheart.data", sep=",",head=T,row.names=1)
heartfull = glm(chd~., data=heart, family=binomial)
summary(heartfull)
##
## Call:
## glm(formula = chd ~ ., family = binomial, data = heart)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7781 -0.8213 -0.4387 0.8889 2.5435
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.1507209 1.3082600 -4.701 2.58e-06 ***
## sbp 0.0065040 0.0057304 1.135 0.256374
## tobacco 0.0793764 0.0266028 2.984 0.002847 **
## ldl 0.1739239 0.0596617 2.915 0.003555 **
## adiposity 0.0185866 0.0292894 0.635 0.525700
## famhistPresent 0.9253704 0.2278940 4.061 4.90e-05 ***
## typea 0.0395950 0.0123202 3.214 0.001310 **
## obesity -0.0629099 0.0442477 -1.422 0.155095
## alcohol 0.0001217 0.0044832 0.027 0.978350
## age 0.0452253 0.0121298 3.728 0.000193 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 596.11 on 461 degrees of freedom
## Residual deviance: 472.14 on 452 degrees of freedom
## AIC: 492.14
##
## Number of Fisher Scoring iterations: 5
How to interprete the coefficient associated with variable “obesity”? It is strange that having excessive body fat can actually lower your chance of having a heart disease?
cor(heart$adi, heart$obe)
## [1] 0.7165563
The two procedures happen to return the same model.
heartstepA = step(heartfull, scope=list(upper=~., lower=~1))
## Start: AIC=492.14
## chd ~ sbp + tobacco + ldl + adiposity + famhist + typea + obesity +
## alcohol + age
##
## Df Deviance AIC
## - alcohol 1 472.14 490.14
## - adiposity 1 472.55 490.55
## - sbp 1 473.44 491.44
## <none> 472.14 492.14
## - obesity 1 474.23 492.23
## - ldl 1 481.07 499.07
## - tobacco 1 481.67 499.67
## - typea 1 483.05 501.05
## - age 1 486.53 504.53
## - famhist 1 488.89 506.89
##
## Step: AIC=490.14
## chd ~ sbp + tobacco + ldl + adiposity + famhist + typea + obesity +
## age
##
## Df Deviance AIC
## - adiposity 1 472.55 488.55
## - sbp 1 473.47 489.47
## <none> 472.14 490.14
## - obesity 1 474.24 490.24
## + alcohol 1 472.14 492.14
## - ldl 1 481.15 497.15
## - tobacco 1 482.06 498.06
## - typea 1 483.06 499.06
## - age 1 486.64 502.64
## - famhist 1 488.99 504.99
##
## Step: AIC=488.55
## chd ~ sbp + tobacco + ldl + famhist + typea + obesity + age
##
## Df Deviance AIC
## - sbp 1 473.98 487.98
## <none> 472.55 488.55
## - obesity 1 474.65 488.65
## + adiposity 1 472.14 490.14
## + alcohol 1 472.55 490.55
## - tobacco 1 482.54 496.54
## - ldl 1 482.95 496.95
## - typea 1 483.19 497.19
## - famhist 1 489.38 503.38
## - age 1 495.48 509.48
##
## Step: AIC=487.98
## chd ~ tobacco + ldl + famhist + typea + obesity + age
##
## Df Deviance AIC
## - obesity 1 475.69 487.69
## <none> 473.98 487.98
## + sbp 1 472.55 488.55
## + adiposity 1 473.47 489.47
## + alcohol 1 473.94 489.94
## - tobacco 1 484.18 496.18
## - typea 1 484.30 496.30
## - ldl 1 484.53 496.53
## - famhist 1 490.58 502.58
## - age 1 502.11 514.11
##
## Step: AIC=487.69
## chd ~ tobacco + ldl + famhist + typea + age
##
## Df Deviance AIC
## <none> 475.69 487.69
## + obesity 1 473.98 487.98
## + sbp 1 474.65 488.65
## + adiposity 1 475.44 489.44
## + alcohol 1 475.65 489.65
## - ldl 1 484.71 494.71
## - typea 1 485.44 495.44
## - tobacco 1 486.03 496.03
## - famhist 1 492.09 502.09
## - age 1 502.38 512.38
summary(heartstepA)
##
## Call:
## glm(formula = chd ~ tobacco + ldl + famhist + typea + age, family = binomial,
## data = heart)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9165 -0.8054 -0.4430 0.9329 2.6139
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.44644 0.92087 -7.000 2.55e-12 ***
## tobacco 0.08038 0.02588 3.106 0.00190 **
## ldl 0.16199 0.05497 2.947 0.00321 **
## famhistPresent 0.90818 0.22576 4.023 5.75e-05 ***
## typea 0.03712 0.01217 3.051 0.00228 **
## age 0.05046 0.01021 4.944 7.65e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 596.11 on 461 degrees of freedom
## Residual deviance: 475.69 on 456 degrees of freedom
## AIC: 487.69
##
## Number of Fisher Scoring iterations: 5
n=dim(heart)[1]
heartstepB = step(heartfull, scope=list(upper=~., lower=~1), trace = 0, k=log(n))
summary(heartstepB)
##
## Call:
## glm(formula = chd ~ tobacco + ldl + famhist + typea + age, family = binomial,
## data = heart)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9165 -0.8054 -0.4430 0.9329 2.6139
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.44644 0.92087 -7.000 2.55e-12 ***
## tobacco 0.08038 0.02588 3.106 0.00190 **
## ldl 0.16199 0.05497 2.947 0.00321 **
## famhistPresent 0.90818 0.22576 4.023 5.75e-05 ***
## typea 0.03712 0.01217 3.051 0.00228 **
## age 0.05046 0.01021 4.944 7.65e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 596.11 on 461 degrees of freedom
## Residual deviance: 475.69 on 456 degrees of freedom
## AIC: 487.69
##
## Number of Fisher Scoring iterations: 5
Estimation (probability of being 1) on the training samples
phat=heartstepA$fitted.values;
mypred = (phat>0.5)
table(heart$chd, mypred)
## mypred
## FALSE TRUE
## 0 256 46
## 1 73 87
Consider the following three males with the same value on all other predictors except age: min, max, and median. What’s the estimated chance of getting heart disease for each of them?
testsamples = heart[c(1, 1, 1),]
testsamples
## sbp tobacco ldl adiposity famhist typea obesity alcohol age chd
## 1 160 12 5.73 23.11 Present 49 25.3 97.2 52 1
## 1.1 160 12 5.73 23.11 Present 49 25.3 97.2 52 1
## 1.2 160 12 5.73 23.11 Present 49 25.3 97.2 52 1
testsamples$age = c(min(heart$age), median(heart$age), max(heart$age))
testsamples
## sbp tobacco ldl adiposity famhist typea obesity alcohol age chd
## 1 160 12 5.73 23.11 Present 49 25.3 97.2 15 1
## 1.1 160 12 5.73 23.11 Present 49 25.3 97.2 45 1
## 1.2 160 12 5.73 23.11 Present 49 25.3 97.2 64 1
predict(heartstepA, newdata=testsamples) # predict log-odds
## 1 1.1 1.2
## -1.0700021 0.4438094 1.4025567
?predict.glm
predict(heartstepA, newdata=testsamples, type="response") # predict probabilities
## 1 1.1 1.2
## 0.2554027 0.6091664 0.8025893
The motivation for collecting this database was the explosion of the USA Space Shuttle Challenger on 28 January 1986. The explosion was eventually traced to the failure of one of the three field joints on one of the two solid booster rockets. Each of these six field joints includes two O‐rings, designated as primary and secondary, which fail when phenomena called erosion and blowby both occur.
The night before the launch a decision had to be made regarding launch safety. The discussion among engineers and managers leading to this decision included concern that the probability of failure of the O‐rings depended on the temperature at launch, which was forecasted to be 31 degrees F. There are strong engineering reasons based on the composition of O‐rings to support the judgment that failure probability may rise monotonically as temperature drops. One other variable, the pressure at which safety testing for field join leaks was performed, was available, but its relevance to the failure process was unclear.
The data set includes the temperature and the number of O‐ring failures for the 23 shuttle flights previous to the Challenger disaster. No previous liftoff temperature was below 53 degrees F. So tremendous extrapolation must be done from the given data to assess risk at 31 degrees F. However it is obvious (from the plot) the risk is high at 31 degrees F.
The task is to predict the Chance of one of the O‐rings would fail when the launch temperature is below freezing.
orings = read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/space-shuttle/o-ring-erosion-or-blowby.data")
orings
## V1 V2 V3 V4 V5
## 1 6 0 66 50 1
## 2 6 1 70 50 2
## 3 6 0 69 50 3
## 4 6 0 68 50 4
## 5 6 0 67 50 5
## 6 6 0 72 50 6
## 7 6 0 73 100 7
## 8 6 0 70 100 8
## 9 6 1 57 200 9
## 10 6 1 63 200 10
## 11 6 1 70 200 11
## 12 6 0 78 200 12
## 13 6 0 67 200 13
## 14 6 2 53 200 14
## 15 6 0 67 200 15
## 16 6 0 75 200 16
## 17 6 0 70 200 17
## 18 6 0 81 200 18
## 19 6 0 76 200 19
## 20 6 0 79 200 20
## 21 6 2 75 200 21
## 22 6 0 76 200 22
## 23 6 1 58 200 23
orings=orings[, c(2:3)]
names(orings)=c("damage", "temp")
orings[order(orings$temp),]
## damage temp
## 14 2 53
## 9 1 57
## 23 1 58
## 10 1 63
## 1 0 66
## 5 0 67
## 13 0 67
## 15 0 67
## 4 0 68
## 3 0 69
## 2 1 70
## 8 0 70
## 11 1 70
## 17 0 70
## 6 0 72
## 7 0 73
## 16 0 75
## 21 2 75
## 19 0 76
## 22 0 76
## 12 0 78
## 20 0 79
## 18 0 81
logitmod=glm(cbind(damage, 6-damage) ~ temp, family=binomial, data=orings)
summary(logitmod)
##
## Call:
## glm(formula = cbind(damage, 6 - damage) ~ temp, family = binomial,
## data = orings)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.95227 -0.78299 -0.54117 -0.04379 2.65152
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.08498 3.05247 1.666 0.0957 .
## temp -0.11560 0.04702 -2.458 0.0140 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 24.230 on 22 degrees of freedom
## Residual deviance: 18.086 on 21 degrees of freedom
## AIC: 35.647
##
## Number of Fisher Scoring iterations: 5
predict(logitmod, data.frame(temp=31), type="response")
## 1
## 0.8177744
min.temp=31; max.temp=max(orings$temp)
plot(orings$temp, orings$damage/6,
xlim=c(min.temp, max.temp), ylim=c(0,1),
xlab="Temp", ylab="Chance of Damage")
newtemp = seq(min.temp, max.temp, length=100)
phat = predict(logitmod, data.frame(temp=newtemp), type="response")
lines(newtemp, phat, col="red")
Consider the following simple example: Y = 1 if x1 + x2 > 0, Y=0, otherwise.
x = matrix(runif(20), 10, 2)
y = ifelse(x[,1]+x[,2]>1, 1, 0);
plot(x[,1], x[,2], type="n", xlab="", ylab="");
text(x[,1], x[,2], y )
myfit=glm(y~x, family=binomial)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(myfit)
##
## Call:
## glm(formula = y ~ x, family = binomial)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.965e-05 -2.110e-08 -2.110e-08 2.110e-08 2.391e-05
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -345.5 395361.0 -0.001 0.999
## x1 353.5 408387.0 0.001 0.999
## x2 361.4 418630.2 0.001 0.999
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1.3460e+01 on 9 degrees of freedom
## Residual deviance: 1.1345e-09 on 7 degrees of freedom
## AIC: 6
##
## Number of Fisher Scoring iterations: 25
When the two group data (based on Y value) can be seperated by a linear function of x, i.e., linearly separable, then the logistic model would encounter some convergence issue, since the MLE of beta would converge to infinity or negative-infinity.