Load packages. The cats data set comes from the MASS package; we will try the plotting function qplot from the ggplot2 package.
library(ggplot2) ## for qplot
library(MASS) ## provides the "cats" data set
help(cats) ## open the documentation page for the cats data
The data "cats" is stored as a data frame in R, which is similar to a matrix except that its columns have names and may hold different types of data.
## Inspect the structure of the data: 144 observations, 3 columns
## (Sex is a factor; Bwt and Hwt are numeric).
str(cats)
## 'data.frame': 144 obs. of 3 variables:
## $ Sex: Factor w/ 2 levels "F","M": 1 1 1 1 1 1 1 1 1 1 ...
## $ Bwt: num 2 2 2 2.1 2.1 2.1 2.1 2.1 2.1 2.1 ...
## $ Hwt: num 7 7.4 9.5 7.2 7.3 7.6 8.1 8.2 8.3 8.5 ...
## Dimensions: rows then columns.
dim(cats)
## [1] 144 3
## First six rows.
head(cats)
## Sex Bwt Hwt
## 1 F 2.0 7.0
## 2 F 2.0 7.4
## 3 F 2.0 9.5
## 4 F 2.1 7.2
## 5 F 2.1 7.3
## 6 F 2.1 7.6
## Last six rows.
tail(cats)
## Sex Bwt Hwt
## 139 M 3.6 15.0
## 140 M 3.7 11.0
## 141 M 3.8 14.8
## 142 M 3.8 16.8
## 143 M 3.9 14.4
## 144 M 3.9 20.5
## Column names of the data frame.
names(cats)
## [1] "Sex" "Bwt" "Hwt"
## NOTE(review): attach() is generally discouraged (it copies the columns
## onto the search path, where they can silently mask other objects and
## do not track later changes to the data frame). It is kept here because
## subsequent commands refer to Bwt, Hwt and Sex directly.
attach(cats)
## Per-column summary: factor counts for Sex, five-number summaries
## plus mean for the numeric columns.
summary(cats)
## Sex Bwt Hwt
## F:47 Min. :2.00 Min. : 6.30
## M:97 1st Qu.:2.30 1st Qu.: 8.95
## Median :2.70 Median :10.10
## Mean :2.72 Mean :10.63
## 3rd Qu.:3.02 3rd Qu.:12.12
## Max. :3.90 Max. :20.50
Try various graphical commands with the data.
## Quick exploratory graphics.
pairs(cats) ## all pairwise scatter plots
plot(Bwt, Hwt) ## body weight vs heart weight
## Show the four univariate plots in a 2x2 layout. Save the old graphics
## settings and restore them afterwards, so the device is not left stuck
## in a 2x2 grid for the single-panel plots that follow.
op <- par(mfrow = c(2, 2))
hist(Bwt)
plot(density(Bwt))
hist(Hwt)
plot(density(Hwt))
par(op) ## restore the original single-panel layout
The very first step: produce a scatter plot of the data.
## Set up an empty plotting frame first (type = "n"), then add the
## points for each sex separately so each group gets its own color
## and plotting symbol.
with(cats, {
  plot(Bwt, Hwt,
       xlab = "Body Weight in kg",
       ylab = "Heart Weight in g",
       type = "n")
  is_female <- Sex == "F"
  points(Bwt[is_female], Hwt[is_female], col = "red", pch = 1)
  points(Bwt[!is_female], Hwt[!is_female], col = "blue", pch = 2)
})
Try qplot from the ggplot2 package.
## qplot() is ggplot2's quick-plot convenience wrapper.
## NOTE(review): qplot() is deprecated in recent ggplot2 releases in
## favour of ggplot() + geom_*(); it still works but may emit a
## warning -- confirm against the installed ggplot2 version.
qplot(Bwt, Hwt, data=cats, colour=Sex, size=I(3))
?qplot
## Facet by Sex (one panel per level); I(1/2) fixes the transparency
## rather than mapping it to a variable.
qplot(Bwt, Hwt, data=cats, size=I(3), facets=~Sex, alpha=I(1/2))
## Jitter the points slightly to reduce overplotting of tied weights.
qplot(Bwt, Hwt, data=cats, size=I(3), facets=~Sex, alpha=I(1/2) , position="jitter")
Fit a linear regression model with the command lm, and save the output.
## Regress heart weight on body weight and save the fitted model.
## (Use <- for assignment, the idiomatic R form.)
out <- lm(Hwt ~ Bwt, data = cats)
summary(out)
##
## Call:
## lm(formula = Hwt ~ Bwt, data = cats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.569 -0.963 -0.092 1.043 5.124
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.357 0.692 -0.52 0.61
## Bwt 4.034 0.250 16.12 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.45 on 142 degrees of freedom
## Multiple R-squared: 0.647, Adjusted R-squared: 0.644
## F-statistic: 260 on 1 and 142 DF, p-value: <2e-16
You can extract various pieces of information from the lm output, which is stored as a list in R.
## The lm output is a list; these are its components.
names(out)
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
## Prefer the accessor coef() to out$coef, which only works through
## partial matching of the list name "coefficients".
coef(out)
## (Intercept) Bwt
## -0.3567 4.0341
cor(Bwt, Hwt)*sd(Hwt)/sd(Bwt) ## Slope = corr(X, Y) * sd(Y)/sd(X)
## [1] 4.034
out$rank
## [1] 2
## out$df works only via R's partial matching of list names ($ expands
## the unique prefix "df" to "df.residual"); kept here as a demo, but
## spell out the full name in real code, as on the next line.
out$df
## [1] 142
out$df.residual
## [1] 142
## R-squared equals var(fitted)/var(y) for a model with an intercept
## (fitted() is the accessor for the fitted values).
var(fitted(out))/var(Hwt)
## [1] 0.6466
## Equivalently, R-squared = 1 - RSS/TSS.
1 - sum(residuals(out)^2)/sum((Hwt-mean(Hwt))^2)
## [1] 0.6466
summary(out)$r.squared
## [1] 0.6466
What is adjusted R-square? Recall that R-sq = 1 - RSS/TSS; adjusted R-square is defined similarly except that RSS is normalized by its degrees of freedom (n - 2) and TSS by its degrees of freedom (n - 1).
## Adjusted R-squared: 1 - (RSS/(n-2)) / (TSS/(n-1)), i.e. both sums
## of squares normalized by their degrees of freedom.
n <- length(Hwt)
1 - sum(residuals(out)^2)/(n-2)/(sum((Hwt-mean(Hwt))^2)/(n-1))
## [1] 0.6441
## Use the full component name (summary(out)$adj.r only works through
## partial matching of "adj.r.squared").
summary(out)$adj.r.squared
## [1] 0.6441
What is the prediction of Hwt for a cat with Bwt = 2.05? You can directly calculate it from the estimated LS coefficients or use the function predict.
## Manual prediction: x0' beta-hat with x0 = (1, 2.05); coef() avoids
## the partial-matching shortcut out$coef.
sum(coef(out)*c(1, 2.05))
## [1] 7.913
predict(out, newdata=data.frame(Bwt = 2.05))
## 1
## 7.913
## "predict" is a generic command and its return and options depend on the class
## of its argument. The particular command for prediction with output from the
## command "lm" is "predict.lm"
?predict.lm
## Spell out the full argument name se.fit; the original se=TRUE only
## worked through partial argument matching.
predict(out, newdata=data.frame(Bwt = 2.05), se.fit=TRUE)
## $fit
## 1
## 7.913
##
## $se.fit
## [1] 0.2075
##
## $df
## [1] 142
##
## $residual.scale
## [1] 1.452
Add the regression line on the scatter plot using the basic plotting commands.
## Scatter plot with the fitted regression line overlaid; the cross
## marks (mean(Bwt), mean(Hwt)), which the least-squares line passes
## through.
with(cats, plot(Bwt, Hwt))
abline(out, lwd = 2, col = "blue")
with(cats, points(mean(Bwt), mean(Hwt), pch = 4, cex = 1.5, col = "red"))
Try ggplot to add regression results (line, confidence intervals, etc.) to the scatter plot.
ggplot(cats, aes(Bwt, Hwt)) + geom_point() + geom_smooth(method=lm)
Change the data points using hollow circles and remove the shaded confidence region.
ggplot(cats, aes(Bwt, Hwt)) +
geom_point(shape=1, aes(color=Sex), size=3) + ## Use hollow circles; set color by Sex
geom_smooth(method=lm, ## Add linear regression line
se=FALSE) ## Don't add shaded confidence region
Try other variations
## Mapping color in the top-level aes() makes geom_smooth fit a
## separate regression line for each level of Sex.
ggplot(cats, aes(Bwt, Hwt, color=Sex)) + ## set color by Sex and
## different regression lines for different Sex
geom_point(shape=1, aes(color=Sex)) + # Use hollow circles;
geom_smooth(method=lm, se=FALSE)
## Same, but extend the regression line to the full display range of x.
ggplot(cats, aes(Bwt, Hwt, color=Sex)) +
geom_point(shape=1, aes(color=Sex)) +
geom_smooth(method=lm, se=FALSE, fullrange=TRUE)
How would the regression coefficients change if we rescale/shift X and/or Y?
## Rescale X: multiplying Bwt by 1000 divides the slope by 1000 while
## the intercept, t-statistics and R-squared are unchanged.
## (Use <- for assignment and drop the unnecessary trailing semicolons.)
tmp <- cats$Bwt
cats$Bwt <- 1000*cats$Bwt
summary(lm(Hwt ~ Bwt, data=cats))
cats$Bwt <- tmp ## transform the data back to the original
## Shift Y: adding a constant to Hwt shifts the intercept by that
## constant; the slope is unchanged.
tmp <- cats$Hwt
cats$Hwt <- cats$Hwt + 1
summary(lm(Hwt ~ Bwt, data=cats))
cats$Hwt <- tmp ## transform the data back to the original
Note how the R-square is computed for a model without an intercept.
## Regression through the origin (no intercept term).
tmp <- lm(Hwt ~ 0 + Bwt, data=cats)
## same as "summary(lm(Hwt ~ Bwt -1, data=cats))"
summary(tmp)
##
## Call:
## lm(formula = Hwt ~ 0 + Bwt, data = cats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.456 -0.998 -0.100 1.004 5.262
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## Bwt 3.9071 0.0436 89.5 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.45 on 143 degrees of freedom
## Multiple R-squared: 0.982, Adjusted R-squared: 0.982
## F-statistic: 8.02e+03 on 1 and 143 DF, p-value: <2e-16
## The with-intercept formula var(fitted)/var(y) does NOT reproduce the
## reported R-squared here (fitted() is the accessor; avoid the partial
## match tmp$fitted) ...
var(fitted(tmp))/var(Hwt)
## [1] 0.6066
## ... because for a model without an intercept R uses uncentered sums
## of squares: R-squared = sum(fitted^2)/sum(y^2).
sum(fitted(tmp)^2)/sum(Hwt^2)
## [1] 0.9825