Cathedral nave heights and lengths in England

library(faraway)
?cathedral # cathedral is a dataset from faraway package
attach(cathedral)
cathedral 
##              style   x   y
## Durham           r  75 502
## Canterbury       r  80 522
## Gloucester       r  68 425
## Hereford         r  64 344
## Norwich          r  83 407
## Peterborough     r  80 451
## St.Albans        r  70 551
## Winchester       r  76 530
## Ely              r  74 547
## York             g 100 519
## Bath             g  75 225
## Bristol          g  52 300
## Chichester       g  62 418
## Exeter           g  68 409
## GloucesterG      g  86 425
## Lichfield        g  57 370
## Lincoln          g  82 506
## NorwichG         g  72 407
## Ripon            g  88 295
## Southwark        g  55 273
## Wells            g  67 415
## St.Asaph         g  45 182
## WinchesterG      g 103 530
## Old.St.Paul      g 103 611
## Salisbury        g  84 473
plot(x, y,type="n",xlab="Nave height",ylab="Length")
text(x,y,as.character(style))

Fit the Full Model

The full model: different intercepts and different slopes for the two styles. How should we interpret the coefficients?

g.full = lm(y~x+style+x:style, data=cathedral)
# same as lm(y~x*style, data=cathedral)
summary(g.full)
## 
## Call:
## lm(formula = y ~ x + style + x:style, data = cathedral)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -172.68  -30.22   23.75   55.78   89.50 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   37.111     85.675   0.433 0.669317    
## x              4.808      1.112   4.322 0.000301 ***
## styler       204.722    347.207   0.590 0.561733    
## x:styler      -1.669      4.641  -0.360 0.722657    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 79.11 on 21 degrees of freedom
## Multiple R-squared:  0.5412, Adjusted R-squared:  0.4757 
## F-statistic: 8.257 on 3 and 21 DF,  p-value: 0.0008072
model.matrix(g.full)
##              (Intercept)   x styler x:styler
## Durham                 1  75      1       75
## Canterbury             1  80      1       80
## Gloucester             1  68      1       68
## Hereford               1  64      1       64
## Norwich                1  83      1       83
## Peterborough           1  80      1       80
## St.Albans              1  70      1       70
## Winchester             1  76      1       76
## Ely                    1  74      1       74
## York                   1 100      0        0
## Bath                   1  75      0        0
## Bristol                1  52      0        0
## Chichester             1  62      0        0
## Exeter                 1  68      0        0
## GloucesterG            1  86      0        0
## Lichfield              1  57      0        0
## Lincoln                1  82      0        0
## NorwichG               1  72      0        0
## Ripon                  1  88      0        0
## Southwark              1  55      0        0
## Wells                  1  67      0        0
## St.Asaph               1  45      0        0
## WinchesterG            1 103      0        0
## Old.St.Paul            1 103      0        0
## Salisbury              1  84      0        0
## attr(,"assign")
## [1] 0 1 2 3
## attr(,"contrasts")
## attr(,"contrasts")$style
## [1] "contr.treatment"
plot(x,y,type="n",xlab="Nave height",ylab="Length")
text(x[style=='r'],y[style=='r'], 'r', col="red")
text(x[style=='g'],y[style=='g'], 'g', col="blue")
abline(g.full$coef[1], g.full$coef[2], col="blue", lty=1, lwd=1.5)
abline(sum(g.full$coef[c(1,3)]), sum(g.full$coef[c(2,4)]), col="red", lty=2, lwd=1.5)
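
For reference, the two fitted lines implied by the full-model coefficients can be tabulated directly. A small sketch; it just sums the same coefficients used in the abline() calls above:

# 'g' is the baseline level: intercept = coef 1, slope = coef 2
# 'r' adds the styler terms: intercept = coef 1 + coef 3, slope = coef 2 + coef 4
rbind(g = g.full$coef[1:2],
      r = c(sum(g.full$coef[c(1,3)]), sum(g.full$coef[c(2,4)])))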

The full model is NOT the same as fitting a SLR model separately in each group. The coefficients are the same, but the t-statistics differ because sigma-hat is not the same.

g.r=lm(y~x, data=cathedral[style=='r',]);
g.g=lm(y~x, data=cathedral[style=='g',]);
summary(g.r)
## 
## Call:
## lm(formula = y ~ x, data = cathedral[style == "r", ])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -98.67 -41.88  24.81  49.67  89.50 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  241.833    316.452   0.764    0.470
## x              3.138      4.238   0.740    0.483
## 
## Residual standard error: 74.4 on 7 degrees of freedom
## Multiple R-squared:  0.07264,    Adjusted R-squared:  -0.05984 
## F-statistic: 0.5483 on 1 and 7 DF,  p-value: 0.4831
summary(g.g)
## 
## Call:
## lm(formula = y ~ x, data = cathedral[style == "g", ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -172.68  -26.30   18.32   56.55   82.82 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   37.111     88.113   0.421 0.680025    
## x              4.808      1.144   4.202 0.000887 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 81.36 on 14 degrees of freedom
## Multiple R-squared:  0.5578, Adjusted R-squared:  0.5262 
## F-statistic: 17.66 on 1 and 14 DF,  p-value: 0.0008869
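
A quick way to see that sigma-hat differs is to pull out the three residual standard errors; a small check whose values should match the ones printed in the summaries above (roughly 79.1, 74.4, and 81.4):

c(full = summary(g.full)$sigma, r = summary(g.r)$sigma, g = summary(g.g)$sigma)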

Center the numerical predictor, then refit the model with interactions. Which coefficients would stay the same, and which would change? How should we interpret the coefficients now? Centering the numerical predictor usually leads to a more meaningful interpretation of the coefficients.

newdata = cathedral
newdata$x = newdata$x - mean(newdata$x)
summary(lm(y~x+style + x:style, data=newdata))
## 
## Call:
## lm(formula = y ~ x + style + x:style, data = newdata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -172.68  -30.22   23.75   55.78   89.50 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  396.522     19.777  20.049 3.57e-15 ***
## x              4.808      1.112   4.322 0.000301 ***
## styler        79.913     32.992   2.422 0.024559 *  
## x:styler      -1.669      4.641  -0.360 0.722657    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 79.11 on 21 degrees of freedom
## Multiple R-squared:  0.5412, Adjusted R-squared:  0.4757 
## F-statistic: 8.257 on 3 and 21 DF,  p-value: 0.0008072
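
Which coefficients change? The slope and interaction estimates (and their standard errors) are identical to the uncentered fit; only the intercept and the styler coefficient change, and they now describe the two fitted lines at the average nave height. A small sketch verifying this from the uncentered fit g.full (it should reproduce the 396.522 and 79.913 shown above):

xbar = mean(cathedral$x)
# intercept of the centered fit = height of the baseline ('g') line at x = xbar
g.full$coef[1] + g.full$coef[2]*xbar
# styler coefficient of the centered fit = gap between the two lines at x = xbar
g.full$coef[3] + g.full$coef[4]*xbar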

Fit the Additive Model

The interaction term is not significant, so we fit a smaller model, the (additive) model with two parallel lines: different intercepts but the same slope. How should we interpret the LS coefficients?

g.add =lm(y~x+style, data=cathedral)
summary(g.add)
## 
## Call:
## lm(formula = y ~ x + style, data = cathedral)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -172.67  -30.44   20.38   55.02   96.50 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   44.298     81.648   0.543   0.5929    
## x              4.712      1.058   4.452   0.0002 ***
## styler        80.393     32.306   2.488   0.0209 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 77.53 on 22 degrees of freedom
## Multiple R-squared:  0.5384, Adjusted R-squared:  0.4964 
## F-statistic: 12.83 on 2 and 22 DF,  p-value: 0.0002028
model.matrix(g.add)
##              (Intercept)   x styler
## Durham                 1  75      1
## Canterbury             1  80      1
## Gloucester             1  68      1
## Hereford               1  64      1
## Norwich                1  83      1
## Peterborough           1  80      1
## St.Albans              1  70      1
## Winchester             1  76      1
## Ely                    1  74      1
## York                   1 100      0
## Bath                   1  75      0
## Bristol                1  52      0
## Chichester             1  62      0
## Exeter                 1  68      0
## GloucesterG            1  86      0
## Lichfield              1  57      0
## Lincoln                1  82      0
## NorwichG               1  72      0
## Ripon                  1  88      0
## Southwark              1  55      0
## Wells                  1  67      0
## St.Asaph               1  45      0
## WinchesterG            1 103      0
## Old.St.Paul            1 103      0
## Salisbury              1  84      0
## attr(,"assign")
## [1] 0 1 2
## attr(,"contrasts")
## attr(,"contrasts")$style
## [1] "contr.treatment"
plot(x,y,type="n",xlab="Nave height",ylab="Length")
text(x,y,as.character(style))
abline(g.add$coef[1:2], col="red", lty=1, lwd=1.5)  # fitted line for the 'g' (baseline) group
abline(sum(g.add$coef[c(1,3)]), g.add$coef[2], col="blue", lty=2, lwd=1.5)  # fitted line for the 'r' group
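
We can also confirm the choice of the additive model with the usual F-test for two nested models. A quick check; since the models differ by a single parameter, this F statistic is just the square of the t-statistic for x:styler in the full model, so the p-value should again be about 0.72:

anova(g.add, g.full)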

Fit My Model

I’m a little suspicious about the line fit for the ‘r’ group due to its small sample size and fan-shaped pattern. Maybe the slope for that group won’t be significant. In other words, I’m considering the following model for the two groups:

  • y = a0 + a1*x, but a1=0, if style=‘r’

  • y = b0 + b1*x, if style=‘g’

Next, I construct a design matrix for the model (above) and test whether a1=0.

newX = cbind(rep(1,25), rep(1,25), x, x)
newX[style=='r',3]=0; newX[style=='g',4]=0  # col 3: x for the 'g' group only; col 4: x for the 'r' group only
newX[style=='g',2]=0                        # col 2: indicator for the 'r' group
newX
##             x  x
##  [1,] 1 1   0 75
##  [2,] 1 1   0 80
##  [3,] 1 1   0 68
##  [4,] 1 1   0 64
##  [5,] 1 1   0 83
##  [6,] 1 1   0 80
##  [7,] 1 1   0 70
##  [8,] 1 1   0 76
##  [9,] 1 1   0 74
## [10,] 1 0 100  0
## [11,] 1 0  75  0
## [12,] 1 0  52  0
## [13,] 1 0  62  0
## [14,] 1 0  68  0
## [15,] 1 0  86  0
## [16,] 1 0  57  0
## [17,] 1 0  82  0
## [18,] 1 0  72  0
## [19,] 1 0  88  0
## [20,] 1 0  55  0
## [21,] 1 0  67  0
## [22,] 1 0  45  0
## [23,] 1 0 103  0
## [24,] 1 0 103  0
## [25,] 1 0  84  0
# We already include the intercept column in newX, so run lm with this "-1" option
summary(lm(y~newX-1)) 
## 
## Call:
## lm(formula = y ~ newX - 1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -172.68  -30.22   23.75   55.78   89.50 
## 
## Coefficients:
##       Estimate Std. Error t value Pr(>|t|)    
## newX    37.111     85.675   0.433 0.669317    
## newX   204.722    347.207   0.590 0.561733    
## newXx    4.808      1.112   4.322 0.000301 ***
## newXx    3.138      4.506   0.696 0.493790    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 79.11 on 21 degrees of freedom
## Multiple R-squared:  0.9727, Adjusted R-squared:  0.9675 
## F-statistic:   187 on 4 and 21 DF,  p-value: 4.273e-16

The slope for the ‘r’ group (the last coefficient) is not significant, so I remove the last column and refit the model. Note that in g.new we assign only an intercept to the ‘r’ group, with no slope, i.e., a flat line for the ‘r’ group.

newX = newX[,-4]
g.new = lm(y~newX-1);
summary(g.new)
## 
## Call:
## lm(formula = y ~ newX - 1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -172.68  -28.52   23.75   55.78   82.82 
## 
## Coefficients:
##       Estimate Std. Error t value Pr(>|t|)    
## newX    37.111     84.667   0.438 0.665430    
## newX   438.334     88.586   4.948 5.97e-05 ***
## newXx    4.808      1.099   4.373 0.000242 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 78.17 on 22 degrees of freedom
## Multiple R-squared:  0.9721, Adjusted R-squared:  0.9683 
## F-statistic: 255.1 on 3 and 22 DF,  p-value: < 2.2e-16
plot(x,y,type="n",xlab="Nave height",ylab="Length")
text(x[style=='r'],y[style=='r'], 'r', col="red")
text(x[style=='g'],y[style=='g'], 'g', col="blue")
abline(sum(g.new$coef[c(1,2)]), 0, col="red", lty=1, lwd=1.5)
abline(g.new$coef[1], g.new$coef[3], col="blue", lty=2, lwd=1.5)
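
As an aside, the same model can be fit through the formula interface without building the design matrix by hand. A sketch (not part of the original construction) that gives the ‘r’ group an intercept only, and the ‘g’ group an intercept and a slope:

g.new.alt = lm(y ~ style + I(x * (style == "g")), data=cathedral)
# different parameterization, but the same column space as newX,
# so the fitted values (and residuals) should match those of g.new
all.equal(fitted(g.new.alt), fitted(g.new), check.attributes = FALSE)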

How can we compare ‘g.new’ and ‘g.add’? They are not nested; you can check their design matrices.

as.matrix(model.matrix(g.add))
##              (Intercept)   x styler
## Durham                 1  75      1
## Canterbury             1  80      1
## Gloucester             1  68      1
## Hereford               1  64      1
## Norwich                1  83      1
## Peterborough           1  80      1
## St.Albans              1  70      1
## Winchester             1  76      1
## Ely                    1  74      1
## York                   1 100      0
## Bath                   1  75      0
## Bristol                1  52      0
## Chichester             1  62      0
## Exeter                 1  68      0
## GloucesterG            1  86      0
## Lichfield              1  57      0
## Lincoln                1  82      0
## NorwichG               1  72      0
## Ripon                  1  88      0
## Southwark              1  55      0
## Wells                  1  67      0
## St.Asaph               1  45      0
## WinchesterG            1 103      0
## Old.St.Paul            1 103      0
## Salisbury              1  84      0
## attr(,"assign")
## [1] 0 1 2
## attr(,"contrasts")
## attr(,"contrasts")$style
## [1] "contr.treatment"
as.matrix(model.matrix(g.new))
##    newX newX newXx
## 1     1    1     0
## 2     1    1     0
## 3     1    1     0
## 4     1    1     0
## 5     1    1     0
## 6     1    1     0
## 7     1    1     0
## 8     1    1     0
## 9     1    1     0
## 10    1    0   100
## 11    1    0    75
## 12    1    0    52
## 13    1    0    62
## 14    1    0    68
## 15    1    0    86
## 16    1    0    57
## 17    1    0    82
## 18    1    0    72
## 19    1    0    88
## 20    1    0    55
## 21    1    0    67
## 22    1    0    45
## 23    1    0   103
## 24    1    0   103
## 25    1    0    84
## attr(,"assign")
## [1] 1 1 1
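
One way to verify the non-nesting numerically (a small sketch): if one model were nested in the other, combining the two design matrices would not raise the rank beyond that of the larger model. Here each matrix has rank 3, but together they should span a 4-dimensional space, so neither column space contains the other.

X1 = model.matrix(g.add)
X2 = model.matrix(g.new)
# expect 3, 3, 4: combining the columns increases the rank, so neither
# design is a sub-model of the other
c(qr(X1)$rank, qr(X2)$rank, qr(cbind(X1, X2))$rank)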

So we cannot use an F-test. But the two models have the same number of parameters, so we can compare their R-squares. Which model is better?

summary(g.add)$r.sq
## [1] 0.5383801
summary(g.new)$r.sq
## [1] 0.9720613

As I mentioned in class, the R-square for g.new is not computed correctly: because of the "-1" in the formula, R thought I was fitting a model without an intercept, even though newX already contains an intercept column.

We should remove the intercept column from newX and then re-call lm (R will add the intercept column back). You’ll find that all the results, coefficients and p-values, are the same as in the output from g.new, but the R-square is different: computed correctly it is 0.5306, slightly smaller than the 0.5384 of g.add, so the additive model is marginally better by this criterion.

g.new2=lm(y~newX[,-1])
summary(g.new2)
## 
## Call:
## lm(formula = y ~ newX[, -1])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -172.68  -28.52   23.75   55.78   82.82 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   37.111     84.667   0.438 0.665430    
## newX[, -1]   438.334     88.586   4.948 5.97e-05 ***
## newX[, -1]x    4.808      1.099   4.373 0.000242 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 78.17 on 22 degrees of freedom
## Multiple R-squared:  0.5306, Adjusted R-squared:  0.4879 
## F-statistic: 12.43 on 2 and 22 DF,  p-value: 0.0002437
# R-square should be computed this way
1 - sum(g.new2$res^2)/sum((y-mean(y))^2)
## [1] 0.5306104
summary(g.new2)$r.sq
## [1] 0.5306104
# For models without an intercept, R-square is computed this way
# (g.new2$res and g.new$res should be the same)
1 - sum(g.new2$res^2)/sum(y^2)
## [1] 0.9720613
summary(g.new)$r.sq
## [1] 0.9720613

Longevity of fruitflies depending on sexual activity and thorax length

The fruitfly data frame has 124 rows and 3 columns. 125 fruitflies were divided randomly into 5 groups of 25 each. The response was the longevity of the fruitfly in days.

Pregnant fruitflies will not mate. The thorax length of each male was measured, as this was known to affect longevity. One observation in the ‘many’ group has been lost, so the total sample size is 124.

data(fruitfly)
head(fruitfly)
##   thorax longevity activity
## 1   0.68        37     many
## 2   0.68        49     many
## 3   0.72        46     many
## 4   0.72        63     many
## 5   0.76        39     many
## 6   0.76        46     many
fruitfly$activity  # note the ordering of the factor levels
##   [1] many     many     many     many     many     many     many    
##   [8] many     many     many     many     many     many     many    
##  [15] many     many     many     many     many     many     many    
##  [22] many     many     many     isolated isolated isolated isolated
##  [29] isolated isolated isolated isolated isolated isolated isolated
##  [36] isolated isolated isolated isolated isolated isolated isolated
##  [43] isolated isolated isolated isolated isolated isolated isolated
##  [50] one      one      one      one      one      one      one     
##  [57] one      one      one      one      one      one      one     
##  [64] one      one      one      one      one      one      one     
##  [71] one      one      one      one      low      low      low     
##  [78] low      low      low      low      low      low      low     
##  [85] low      low      low      low      low      low      low     
##  [92] low      low      low      low      low      low      low     
##  [99] low      high     high     high     high     high     high    
## [106] high     high     high     high     high     high     high    
## [113] high     high     high     high     high     high     high    
## [120] high     high     high     high     high    
## Levels: isolated one low many high
dim(fruitfly)
## [1] 124   3
library(ggplot2)
ggplot(aes(x = thorax, y=longevity), data=fruitfly) + geom_point() +
  facet_wrap(~activity)

Fit the most general model, i.e., the model including the interaction. Then use the two sequential ANOVA tables to select an appropriate model for the data.

lmod= lm(longevity ~ thorax * activity, fruitfly)
# summary(lmod)  
anova(lmod)
## Analysis of Variance Table
## 
## Response: longevity
##                  Df  Sum Sq Mean Sq F value    Pr(>F)    
## thorax            1 15003.3 15003.3 130.733 < 2.2e-16 ***
## activity          4  9634.6  2408.6  20.988 5.503e-13 ***
## thorax:activity   4    24.3     6.1   0.053    0.9947    
## Residuals       114 13083.0   114.8                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(lm(longevity ~ activity * thorax, fruitfly))
## Analysis of Variance Table
## 
## Response: longevity
##                  Df  Sum Sq Mean Sq F value  Pr(>F)    
## activity          4 12269.5  3067.4  26.728 1.2e-15 ***
## thorax            1 12368.4 12368.4 107.774 < 2e-16 ***
## activity:thorax   4    24.3     6.1   0.053  0.9947    
## Residuals       114 13083.0   114.8                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The p-value from the F-test for the interaction term is the same in both tables. Apparently, the interaction term is not significant. Do we need both thorax and activity, or just one of them, or neither? thorax makes a significant contribution to the response whether or not the other variable, activity, is included; the same holds for activity. So we should pick the additive model.

lmod.add = lm(longevity ~ thorax + activity, fruitfly)
summary(lmod.add)
## 
## Call:
## lm(formula = longevity ~ thorax + activity, data = fruitfly)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -26.108  -7.014  -1.101   6.234  30.265 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -48.749     10.850  -4.493 1.65e-05 ***
## thorax        134.341     12.731  10.552  < 2e-16 ***
## activityone     2.637      2.984   0.884   0.3786    
## activitylow    -7.015      2.981  -2.353   0.0203 *  
## activitymany    4.139      3.027   1.367   0.1741    
## activityhigh  -20.004      3.016  -6.632 1.05e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.54 on 118 degrees of freedom
## Multiple R-squared:  0.6527, Adjusted R-squared:  0.638 
## F-statistic: 44.36 on 5 and 118 DF,  p-value: < 2.2e-16
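
As a cross-check of the reasoning above, drop1() tests each term given that the other one is already in the model; a small sketch, and both thorax and activity should remain highly significant:

drop1(lmod.add, test = "F")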

Back to the sequential ANOVA tables. As mentioned in the notes, some of the F-tests in a sequential ANOVA table are computed slightly differently from the F-test we learned before for comparing two nested models. They are not wrong, just constructed slightly differently, and they are theoretically sound procedures.

Suppose we want to compare the following two models: longevity ~ thorax versus longevity ~ thorax + activity.

We can use the F-test formula for comparing two nested models, which gives one F-statistic, or we can grab the F-statistic for activity from the sequential ANOVA table. The two numbers are different because the denominators differ: in the first, we use the sigma-hat from the alternative model longevity ~ thorax + activity, while in the second, we use the sigma-hat from the bigger model longevity ~ thorax + activity + thorax:activity.

anova(lm(longevity ~ thorax, fruitfly), lmod.add)
## Analysis of Variance Table
## 
## Model 1: longevity ~ thorax
## Model 2: longevity ~ thorax + activity
##   Res.Df   RSS Df Sum of Sq      F    Pr(>F)    
## 1    122 22742                                  
## 2    118 13107  4    9634.6 21.684 1.974e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
9634.6/4/(13107/118)  # follows F(4, 118) dist under the null
## [1] 21.68465
anova(lmod)
## Analysis of Variance Table
## 
## Response: longevity
##                  Df  Sum Sq Mean Sq F value    Pr(>F)    
## thorax            1 15003.3 15003.3 130.733 < 2.2e-16 ***
## activity          4  9634.6  2408.6  20.988 5.503e-13 ***
## thorax:activity   4    24.3     6.1   0.053    0.9947    
## Residuals       114 13083.0   114.8                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
9634.6/4/(13083/114)  # follows F(4, 114) dist under the null
## [1] 20.98801