How to cheat with R-square?

n=40; 
x=runif(n, -5,5); 
y=1+2*x+3*rnorm(n)
mydata = data.frame(x=x, y=y)
ggplot(mydata, aes(x,y)) + geom_point(shape=1) + 
  geom_smooth(method=lm, se=FALSE, lwd=1.2)

lab=rep(1, n)
x.quan=quantile(x)
lab[ (x <  x.quan[2]) | (x > x.quan[4]) ]= 0; 
table(lab)
## lab
##  0  1 
## 20 20
mydata$lab = as.factor(lab)  ## add "lab" to the data frame "mydata"

qplot(x,y, data=mydata, color=lab,size = I(3) )

myfit1=lm(y~x, mydata, subset=(lab==1))
myfit0 = lm(y~x, mydata, subset=(lab==0))
summary(myfit1)$r.sq  ## points in the middle
## [1] 0.6546052
summary(myfit0)$r.sq  ## points on the two sides
## [1] 0.9445071
## The two LS lines are similar, but R-squares are quite different
ggplot(mydata, aes(x, y, color=lab)) +
  geom_point(size=I(3), aes(color=lab)) + 
  geom_smooth(method=lm, se=FALSE, fullrange=TRUE, lwd=1.2)   

The Cats Example

out = lm(Hwt~Bwt, data = cats)
summary(out)
## 
## Call:
## lm(formula = Hwt ~ Bwt, data = cats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5694 -0.9634 -0.0921  1.0426  5.1238 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -0.3567     0.6923  -0.515    0.607    
## Bwt           4.0341     0.2503  16.119   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.452 on 142 degrees of freedom
## Multiple R-squared:  0.6466, Adjusted R-squared:  0.6441 
## F-statistic: 259.8 on 1 and 142 DF,  p-value: < 2.2e-16
mycoef=summary(out)$coef
mycoef
##               Estimate Std. Error    t value     Pr(>|t|)
## (Intercept) -0.3566624  0.6922770 -0.5152019 6.072131e-01
## Bwt          4.0340627  0.2502615 16.1193908 6.969045e-34
2*pt(mycoef[1,1]/mycoef[1,2], 142)  ## compute the p-value for beta0
## [1] 0.6072131
2*pt(-mycoef[2,1]/mycoef[2,2], 142) ## compute the p-value for beta1
## [1] 6.969045e-34
anova(out)
## Analysis of Variance Table
## 
## Response: Hwt
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## Bwt         1 548.09  548.09  259.83 < 2.2e-16 ***
## Residuals 142 299.53    2.11                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
myanova = anova(out)

## The F-test is essentially the t-test for SLR. 
myanova[1,5]
## [1] 6.969045e-34
mycoef[2,4]
## [1] 6.969045e-34
myanova[1,4]
## [1] 259.8348
mycoef[2,3]^2
## [1] 259.8348
## What's the prediction of the Hwt for a cat with Bwt = 2.05? 
predict(out, newdata=data.frame(Bwt = 2.05))
##        1 
## 7.913166
predict(out, newdata=data.frame(Bwt = 2.05), interval="confidence")
##        fit      lwr      upr
## 1 7.913166 7.502925 8.323407
predict(out, newdata=data.frame(Bwt = 2.05), interval="prediction")
##        fit      lwr      upr
## 1 7.913166 5.012937 10.81339
## The gray area is the connected "point-wise" confidence intervals
ggplot(cats, aes(Bwt, Hwt)) + geom_point() + geom_smooth(method=lm)+
  geom_point(aes(x = mean(cats$Bwt), y=mean(cats$Hwt)), color="red") + 
  geom_vline(xintercept=mean(cats$Bwt), color="gray", linetype="dashed")