Simple Linear Regression (09/03/15)

How to cheat with R-square?

n=40; 
x=runif(n, -5,5); 
y=1+2*x+3*rnorm(n)
mydata = data.frame(x=x, y=y)
ggplot(mydata, aes(x,y)) + geom_point(shape=1) + 
  geom_smooth(method=lm, se=FALSE, lwd=1.2)

lab=rep(1, n)
x.quan=quantile(x)
lab[ (x <  x.quan[2]) | (x > x.quan[4]) ]= 0; 
table(lab)

## lab
##  0  1 
## 20 20

mydata$lab = as.factor(lab)  ## add "lab" to the data frame "mydata"

qplot(x,y, data=mydata, color=lab,size = I(3) )

myfit1=lm(y~x, mydata, subset=(lab==1))
myfit0 = lm(y~x, mydata, subset=(lab==0))
summary(myfit1)$r.sq  ## points in the middle

## [1] 0.6546052

summary(myfit0)$r.sq  ## points on the two sides

## [1] 0.9445071

## The two LS lines are similar, but R-squares are quite different
ggplot(mydata, aes(x, y, color=lab)) +
  geom_point(size=I(3), aes(color=lab)) + 
  geom_smooth(method=lm, se=FALSE, fullrange=TRUE, lwd=1.2)

The Cats Example

out = lm(Hwt~Bwt, data = cats)
summary(out)

## 
## Call:
## lm(formula = Hwt ~ Bwt, data = cats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5694 -0.9634 -0.0921  1.0426  5.1238 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -0.3567     0.6923  -0.515    0.607    
## Bwt           4.0341     0.2503  16.119   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.452 on 142 degrees of freedom
## Multiple R-squared:  0.6466, Adjusted R-squared:  0.6441 
## F-statistic: 259.8 on 1 and 142 DF,  p-value: < 2.2e-16

mycoef=summary(out)$coef
mycoef

##               Estimate Std. Error    t value     Pr(>|t|)
## (Intercept) -0.3566624  0.6922770 -0.5152019 6.072131e-01
## Bwt          4.0340627  0.2502615 16.1193908 6.969045e-34

2*pt(mycoef[1,1]/mycoef[1,2], 142)  ## compute the p-value for beta0

## [1] 0.6072131

2*pt(-mycoef[2,1]/mycoef[2,2], 142) ## compute the p-value for beta1

## [1] 6.969045e-34

anova(out)

## Analysis of Variance Table
## 
## Response: Hwt
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## Bwt         1 548.09  548.09  259.83 < 2.2e-16 ***
## Residuals 142 299.53    2.11                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

myanova = anova(out)

## The F-test is essentially the t-test for SLR. 
myanova[1,5]

## [1] 6.969045e-34

mycoef[2,4]

## [1] 6.969045e-34

myanova[1,4]

## [1] 259.8348

mycoef[2,3]^2

## [1] 259.8348

## What's the prediction of the Hwt for a cat with Bwt = 2.05? 
predict(out, newdata=data.frame(Bwt = 2.05))

##        1 
## 7.913166

predict(out, newdata=data.frame(Bwt = 2.05), interval="confidence")

##        fit      lwr      upr
## 1 7.913166 7.502925 8.323407

predict(out, newdata=data.frame(Bwt = 2.05), interval="prediction")

##        fit      lwr      upr
## 1 7.913166 5.012937 10.81339

## The gray area is the connected "point-wise" confidence intervals
ggplot(cats, aes(Bwt, Hwt)) + geom_point() + geom_smooth(method=lm)+
  geom_point(aes(x = mean(cats$Bwt), y=mean(cats$Hwt)), color="red") + 
  geom_vline(xintercept=mean(cats$Bwt), color="gray", linetype="dashed")

Simple Linear Regression (II)

How to cheat with R-square?

The Cats Example