Simple Linear Regression (I)

Load the Cats Data

Load packages. The Cats data is from the library MASS; we will try the plotting function qplot from the ggplot2 package.

library(ggplot2)    ## for qplot
library(MASS)       
help(cats)

The data “cats” is stored as a data frame in R, which is almost the same as a matrix, but its columns have names.

str(cats)

## 'data.frame':    144 obs. of  3 variables:
##  $ Sex: Factor w/ 2 levels "F","M": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Bwt: num  2 2 2 2.1 2.1 2.1 2.1 2.1 2.1 2.1 ...
##  $ Hwt: num  7 7.4 9.5 7.2 7.3 7.6 8.1 8.2 8.3 8.5 ...

dim(cats)

## [1] 144   3

head(cats)

##   Sex Bwt Hwt
## 1   F 2.0 7.0
## 2   F 2.0 7.4
## 3   F 2.0 9.5
## 4   F 2.1 7.2
## 5   F 2.1 7.3
## 6   F 2.1 7.6

tail(cats)

##     Sex Bwt  Hwt
## 139   M 3.6 15.0
## 140   M 3.7 11.0
## 141   M 3.8 14.8
## 142   M 3.8 16.8
## 143   M 3.9 14.4
## 144   M 3.9 20.5

names(cats)

## [1] "Sex" "Bwt" "Hwt"

attach(cats)
summary(cats)

##  Sex         Bwt            Hwt       
##  F:47   Min.   :2.00   Min.   : 6.30  
##  M:97   1st Qu.:2.30   1st Qu.: 8.95  
##         Median :2.70   Median :10.10  
##         Mean   :2.72   Mean   :10.63  
##         3rd Qu.:3.02   3rd Qu.:12.12  
##         Max.   :3.90   Max.   :20.50

Graphical Displays

Try various graphical commands with the data

pairs(cats)
plot(Bwt, Hwt)
par(mfrow=c(2,2))
hist(Bwt)
plot(density(Bwt))
hist(Hwt)
plot(density(Hwt))

The very first step: produce a scatter plot of the data.

plot(Bwt, Hwt,
       xlab="Body Weight in kg",
       ylab="Heart Weight in g", type="n");
points(Bwt[Sex=='F'], Hwt[Sex=='F'], col="red", pch=1)
points(Bwt[Sex=='M'], Hwt[Sex=='M'], col="blue", pch=2)

Try qplot from the ggplot2 package.

qplot(Bwt, Hwt, data=cats, colour=Sex, size=I(3))

?qplot
qplot(Bwt, Hwt, data=cats, size=I(3), facets=~Sex, alpha=I(1/2))
qplot(Bwt, Hwt, data=cats, size=I(3), facets=~Sex, alpha=I(1/2) , position="jitter")

Fit a Regression Model

Fit a linear regression model with command lm, and save the output.

out = lm(Hwt~Bwt, data = cats)
summary(out)

## 
## Call:
## lm(formula = Hwt ~ Bwt, data = cats)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.569 -0.963 -0.092  1.043  5.124 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -0.357      0.692   -0.52     0.61    
## Bwt            4.034      0.250   16.12   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.45 on 142 degrees of freedom
## Multiple R-squared:  0.647,  Adjusted R-squared:  0.644 
## F-statistic:  260 on 1 and 142 DF,  p-value: <2e-16

You can extract various information from the lm output, which is stored as a list in R.

names(out)

##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"

out$coef

## (Intercept)         Bwt 
##     -0.3567      4.0341

cor(Bwt, Hwt)*sd(Hwt)/sd(Bwt)  ## Slope = corr(X, Y) * sd(Y)/sd(X)

## [1] 4.034

out$rank

## [1] 2

out$df

## [1] 142

out$df.residual

## [1] 142

var(out$fitted)/var(Hwt)

## [1] 0.6466

1 - sum(out$res^2)/sum((Hwt-mean(Hwt))^2)

## [1] 0.6466

summary(out)$r.sq

## [1] 0.6466

What is adjusted R-square? Recall that R-sq = 1 - RSS/TSS; Adjusted R-square is defined similarly except that RSS is normalized by its df and TSS is normalized by its df.

n = length(Hwt)
1 - sum(out$res^2)/(n-2)/(sum((Hwt-mean(Hwt))^2)/(n-1))

## [1] 0.6441

summary(out)$adj.r

## [1] 0.6441

What’s the prediction of the Hwt for a cat with Bwt = 2.05? You can directly calculate it based on the estimated LS coefficients or use the function predict.

sum(out$coef*c(1, 2.05))

## [1] 7.913

predict(out, newdata=data.frame(Bwt = 2.05))

##     1 
## 7.913

## "predict" is a generic command and its return and options depend on the class
## of its argument. The particular command for prediction with output from the 
## command "lm" is "predict.lm"
?predict.lm  
predict(out, newdata=data.frame(Bwt = 2.05), se=TRUE)

## $fit
##     1 
## 7.913 
## 
## $se.fit
## [1] 0.2075
## 
## $df
## [1] 142
## 
## $residual.scale
## [1] 1.452

More Graphical Displays.

Add the regression line on the scatter plot using the basic plotting commands.

plot(Bwt,Hwt); 
abline(out, lwd=2, col="blue")
points(mean(Bwt), mean(Hwt), pch=4, cex=1.5, col="red")

Try ggplot to add regression results (line, CIs, etc) to the scatter plot.

ggplot(cats, aes(Bwt, Hwt)) + geom_point() + geom_smooth(method=lm)

Change the data points using hollow circles and remove the shaded confidence region.

ggplot(cats, aes(Bwt, Hwt)) +
  geom_point(shape=1, aes(color=Sex), size=3) + ## Use hollow circles; sex color by Sex 
  geom_smooth(method=lm,   ## Add linear regression line
              se=FALSE)    ## Don't add shaded confidence region

Try other variations

ggplot(cats, aes(Bwt, Hwt, color=Sex)) + ## set color by Sex and 
                            ## different regression lines for different Sex 
  geom_point(shape=1, aes(color=Sex)) + # Use hollow circles; 
  geom_smooth(method=lm, se=FALSE)   

## Same, but extend the regression line to the full display range of x. 
ggplot(cats, aes(Bwt, Hwt, color=Sex)) +
  geom_point(shape=1, aes(color=Sex)) + 
  geom_smooth(method=lm, se=FALSE, fullrange=TRUE)

How the regression coefficients would change if we rescale/shift X and/or Y?

tmp = cats$Bwt; 
cats$Bwt = 1000*cats$Bwt; 
summary(lm(Hwt ~ Bwt, data=cats))
cats$Bwt = tmp  ## transform the data back to the original

tmp = cats$Hwt; 
cats$Hwt = cats$Hwt + 1;
summary(lm(Hwt ~ Bwt, data=cats))
cats$Hwt = tmp ## transform the data back to the original

Regression through the origin.

Note how the R-square is computed.

tmp=lm(Hwt ~ 0 + Bwt, data=cats)
## same as "summary(lm(Hwt ~ Bwt -1, data=cats))"
summary(tmp)

## 
## Call:
## lm(formula = Hwt ~ 0 + Bwt, data = cats)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.456 -0.998 -0.100  1.004  5.262 
## 
## Coefficients:
##     Estimate Std. Error t value Pr(>|t|)    
## Bwt   3.9071     0.0436    89.5   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.45 on 143 degrees of freedom
## Multiple R-squared:  0.982,  Adjusted R-squared:  0.982 
## F-statistic: 8.02e+03 on 1 and 143 DF,  p-value: <2e-16

var(tmp$fitted)/var(Hwt)

## [1] 0.6066

sum(tmp$fitted^2)/sum(Hwt^2)

## [1] 0.9825