How to cheat with R-square?
n=40;
x=runif(n, -5,5);
y=1+2*x+3*rnorm(n)
mydata = data.frame(x=x, y=y)
ggplot(mydata, aes(x,y)) + geom_point(shape=1) +
geom_smooth(method=lm, se=FALSE, lwd=1.2)
lab=rep(1, n)
x.quan=quantile(x)
lab[ (x < x.quan[2]) | (x > x.quan[4]) ]= 0;
table(lab)
## lab
## 0 1
## 20 20
mydata$lab = as.factor(lab) ## add "lab" to the data frame "mydata"
qplot(x,y, data=mydata, color=lab,size = I(3) )
myfit1=lm(y~x, mydata, subset=(lab==1))
myfit0 = lm(y~x, mydata, subset=(lab==0))
summary(myfit1)$r.sq ## points in the middle
## [1] 0.6546052
summary(myfit0)$r.sq ## points on the two sides
## [1] 0.9445071
## The two LS lines are similar, but R-squares are quite different
ggplot(mydata, aes(x, y, color=lab)) +
geom_point(size=I(3), aes(color=lab)) +
geom_smooth(method=lm, se=FALSE, fullrange=TRUE, lwd=1.2)
The Cats Example
out = lm(Hwt~Bwt, data = cats)
summary(out)
##
## Call:
## lm(formula = Hwt ~ Bwt, data = cats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.5694 -0.9634 -0.0921 1.0426 5.1238
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.3567 0.6923 -0.515 0.607
## Bwt 4.0341 0.2503 16.119 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.452 on 142 degrees of freedom
## Multiple R-squared: 0.6466, Adjusted R-squared: 0.6441
## F-statistic: 259.8 on 1 and 142 DF, p-value: < 2.2e-16
mycoef=summary(out)$coef
mycoef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.3566624 0.6922770 -0.5152019 6.072131e-01
## Bwt 4.0340627 0.2502615 16.1193908 6.969045e-34
2*pt(mycoef[1,1]/mycoef[1,2], 142) ## compute the p-value for beta0
## [1] 0.6072131
2*pt(-mycoef[2,1]/mycoef[2,2], 142) ## compute the p-value for beta1
## [1] 6.969045e-34
anova(out)
## Analysis of Variance Table
##
## Response: Hwt
## Df Sum Sq Mean Sq F value Pr(>F)
## Bwt 1 548.09 548.09 259.83 < 2.2e-16 ***
## Residuals 142 299.53 2.11
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
myanova = anova(out)
## The F-test is essentially the t-test for SLR.
myanova[1,5]
## [1] 6.969045e-34
mycoef[2,4]
## [1] 6.969045e-34
myanova[1,4]
## [1] 259.8348
mycoef[2,3]^2
## [1] 259.8348
## What's the prediction of the Hwt for a cat with Bwt = 2.05?
predict(out, newdata=data.frame(Bwt = 2.05))
## 1
## 7.913166
predict(out, newdata=data.frame(Bwt = 2.05), interval="confidence")
## fit lwr upr
## 1 7.913166 7.502925 8.323407
predict(out, newdata=data.frame(Bwt = 2.05), interval="prediction")
## fit lwr upr
## 1 7.913166 5.012937 10.81339
## The gray area is the connected "point-wise" confidence intervals
ggplot(cats, aes(Bwt, Hwt)) + geom_point() + geom_smooth(method=lm)+
geom_point(aes(x = mean(cats$Bwt), y=mean(cats$Hwt)), color="red") +
geom_vline(xintercept=mean(cats$Bwt), color="gray", linetype="dashed")