library(faraway)
data(ozone)
?ozone
summary(ozone)
ozone[1:5,]

Show the pairwise correlation among variables.

library(corrplot)
M=cor(ozone)
corrplot(M, method="ellipse")

corrplot.mixed(M, lower="number", upper="ellipse")

The goal is to predict O3 using the other variables. The following pairwise plots reveal several nonlinear relationships, indicating that a linear regression might not be appropriate without some transformations.

pairs(ozone, pch=".")

Fit a regression tree

library(rpart)
## 
## Attaching package: 'rpart'
## 
## The following object is masked from 'package:faraway':
## 
##     solder
roz = rpart(O3 ~ .,ozone)

plot(roz,compress=T,uniform=T,branch=0.4,margin=.10)
text(roz)

Prediction with a tree model

x0 = apply(ozone[,-1],2,median)
predict(roz, data.frame(t(x0)))
##        1 
## 14.35484

Which sub-tree?

This is essentially a model selection problem. Can we use criteria like AIC/BIC? Indeed, we can define a model selection criterion similar to AIC/BIC: RSS + alpha*tree-size. Here the tree size is defined as the number of parameters a tree model has, which equals the number of terminal (or leaf) nodes. The penalty per extra leaf, i.e., alpha, is also referred to as the complexity parameter (CP).
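
As a minimal sketch, we can compute this criterion directly from the fitted tree roz above; the choice alpha = 2 here is only for illustration (an AIC-like penalty).

tree.size = sum(roz$frame$var == "<leaf>")   # number of terminal (leaf) nodes
RSS = sum(residuals(roz)^2)                  # residual sum of squares
alpha = 2                                    # illustrative AIC-like penalty
RSS + alpha * tree.size                      # the criterion RSS + alpha*tree-size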

Instead of setting alpha to 2 (as in AIC) or log(n) (as in BIC), we can try all possible values of CP.

It turns out that as we increase CP from 0 to infinity, the resulting optimal sub-trees form a nested sequence, i.e., the optimal trees get smaller as it costs more to have an additional node.
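
A quick sanity check of this nesting claim (a sketch using the tree roz from above): the node numbers of a tree pruned at a larger CP should be a subset of those of a tree pruned at a smaller CP.

t.small = prune.rpart(roz, cp=0.06)  # larger penalty, smaller tree
t.big   = prune.rpart(roz, cp=0.02)  # smaller penalty, bigger tree
# the smaller tree's nodes are a subset of the bigger tree's nodes
all(rownames(t.small$frame) %in% rownames(t.big$frame))  # TRUE if nested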

The optimal sub-tree (based on the cost-complexity criterion) stays the same over a range of CP values, and it is not difficult to find all the break-points, i.e., the CP values at which the returned optimal sub-tree changes. These break-points make up the cptable returned by rpart.

printcp(roz)
## 
## Regression tree:
## rpart(formula = O3 ~ ., data = ozone)
## 
## Variables actually used in tree construction:
## [1] doy      dpg      humidity ibh      ibt      temp     vis     
## 
## Root node error: 21115/330 = 63.986
## 
## n= 330 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.545699      0   1.00000 1.00264 0.076171
## 2 0.073659      1   0.45430 0.46883 0.041058
## 3 0.053542      2   0.38064 0.41305 0.038397
## 4 0.026756      3   0.32710 0.36964 0.035783
## 5 0.023276      4   0.30034 0.36599 0.035977
## 6 0.023102      5   0.27707 0.35832 0.035577
## 7 0.015325      6   0.25397 0.34008 0.034326
## 8 0.010914      7   0.23864 0.33047 0.032413
## 9 0.010000      8   0.22773 0.32247 0.032283
plotcp(roz)

The current tree is too small. Let’s fit a bigger tree (i.e., fit a tree with a smaller CP value), and then do the pruning.

roze = rpart(O3 ~ .,ozone, cp=0.001)
printcp(roze)
## 
## Regression tree:
## rpart(formula = O3 ~ ., data = ozone, cp = 0.001)
## 
## Variables actually used in tree construction:
## [1] doy      dpg      humidity ibh      ibt      temp     vh       vis     
## 
## Root node error: 21115/330 = 63.986
## 
## n= 330 
## 
##           CP nsplit rel error  xerror     xstd
## 1  0.5456993      0   1.00000 1.00652 0.077189
## 2  0.0736591      1   0.45430 0.48314 0.042643
## 3  0.0535415      2   0.38064 0.41860 0.037416
## 4  0.0267557      3   0.32710 0.37441 0.035033
## 5  0.0232760      4   0.30034 0.37421 0.035282
## 6  0.0231021      5   0.27707 0.38221 0.036033
## 7  0.0153249      6   0.25397 0.37653 0.034986
## 8  0.0109137      7   0.23864 0.37888 0.036591
## 9  0.0070746      8   0.22773 0.37656 0.035441
## 10 0.0059918      9   0.22065 0.38556 0.036511
## 11 0.0059317     10   0.21466 0.38597 0.037257
## 12 0.0049709     12   0.20280 0.38678 0.036954
## 13 0.0047996     15   0.18789 0.38980 0.037602
## 14 0.0044712     16   0.18309 0.38603 0.037590
## 15 0.0031921     17   0.17861 0.38934 0.039488
## 16 0.0022152     19   0.17223 0.39578 0.039327
## 17 0.0020733     20   0.17002 0.38786 0.039109
## 18 0.0020297     22   0.16587 0.38993 0.039110
## 19 0.0014432     23   0.16384 0.39080 0.039092
## 20 0.0011322     24   0.16240 0.38918 0.039116
## 21 0.0011035     25   0.16126 0.38761 0.039141
## 22 0.0010000     26   0.16016 0.38696 0.039153
plotcp(roze)

tmp1=rpart(O3~., ozone, cp=0.01)
tmp2=rpart(O3~., ozone, cp=0.008)
tmp3=rpart(O3~., ozone, cp=0.007)
par(mfrow=c(1,3))
plot(tmp1,compress=T,uniform=T,branch=0.4,margin=.10)
text(tmp1)
plot(tmp2,compress=T,uniform=T,branch=0.4,margin=.10)
text(tmp2)
plot(tmp3,compress=T,uniform=T,branch=0.4,margin=.10)
text(tmp3)

tmp1=rpart(O3~., ozone, cp=0.0055)
tmp2=rpart(O3~., ozone, cp=0.0049)
par(mfrow=c(1,2))
plot(tmp1,compress=T,uniform=T,branch=0.4,margin=.10)
text(tmp1, cex=0.5)
plot(tmp2,compress=T,uniform=T,branch=0.4,margin=.10)
text(tmp2, cex=0.5)

However, we still don’t know what the optimal choice of CP is. The cptable gives us all candidate tree sizes; for each tree size, we compute the corresponding CV error and then pick a subtree based on those CV errors. There are usually two approaches to determining the optimal tree size: size.min, which picks the tree size with the smallest CV error, and size.1se, which picks the smallest tree whose CV error is within one standard error of the minimum.

Sometimes these two approaches give the same result, but sometimes they do not. For this particular example, the CV error plot (produced by plotcp) shows that the CV error first drops dramatically as the tree size increases, and then stabilizes at roughly the same value over a wide range of tree sizes. The size.1se approach may work better here: any tree whose CV error is within one standard error of the smallest CV error would be a good choice, so we pick the smallest tree among them. The dashed line in the CV error plot marks the smallest CV error plus one standard error, so we simply pick the smallest tree among all trees below the dashed line.

myCPtable = roze$cptable
id.min = which.min(myCPtable[,'xerror'])
my1se.err = myCPtable[id.min,'xerror'] + myCPtable[id.min, 'xstd']
plotcp(roze)
abline(h=my1se.err, col="red") 

# the red line should cover the original dashed line

# The size.min approach: pick the tree size with the smallest CV error. This
# can be done by pruning the tree with a CP value between myCPtable[id.min, 'CP']
# and myCPtable[id.min-1, 'CP'].
# (The code below needs modification if id.min = 1.)

CP.min = (myCPtable[id.min, 'CP'] + myCPtable[(id.min-1), 'CP'])/2
tree.min = prune.rpart(roze, CP.min)
plot(tree.min)
text(tree.min)

# The size.1se approach: pick the smallest tree below the dashed line
id.1se = min(which(myCPtable[,'xerror'] < my1se.err)) 
CP.1se = (myCPtable[id.1se, 'CP'] + myCPtable[(id.1se-1), 'CP'])/2
tree.1se = prune.rpart(roze, CP.1se)
plot(tree.1se)
text(tree.1se)

Random Forest

library(randomForest);
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
?randomForest
rfModel = randomForest(O3~., data = ozone)

Items returned by randomForest

names(rfModel) 
##  [1] "call"            "type"            "predicted"      
##  [4] "mse"             "rsq"             "oob.times"      
##  [7] "importance"      "importanceSD"    "localImportance"
## [10] "proximity"       "ntree"           "mtry"           
## [13] "forest"          "coefs"           "y"              
## [16] "test"            "inbag"           "terms"
rfModel$ntree  # number of trees in the forest
## [1] 500
rfModel$mtry  
## [1] 3

How do we extract yhat for the training data? For linear regression, suppose the fitted model is stored in "lmout"; then we can use 1) lmout$fitted, or 2) predict(lmout, newdata=trainingdata), and the two vectors should be the same.
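
A quick check of this claim with a linear model (a sketch; the lm fit here is only for illustration):

lmfit = lm(O3 ~ ., data=ozone)
# fitted values and in-sample predictions agree for lm
all.equal(as.numeric(lmfit$fitted.values),
          as.numeric(predict(lmfit, newdata=ozone)))  # should be TRUE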

For randomForest, however, the two approaches return different results. Why?

yhat = predict(rfModel, newdata=ozone) 
sum((ozone$O3 - yhat)^2)
## [1] 999.8417
# which is different from rfModel$predicted. 
sum((ozone$O3- rfModel$predicted)^2)
## [1] 5264.606

How is rfModel$predicted computed?

length(rfModel$oob.times)
## [1] 330
rfModel$oob.times[1:5]
## [1] 187 196 162 204 183
length(rfModel$predicted)
## [1] 330
rfModel$predicted[1:5]
##        1        2        3        4        5 
## 5.255971 5.995068 9.149588 7.687663 5.328415
# rfModel$predicted[1] is the average prediction for the 1st sample over the
# rfModel$oob.times[1] trees for which that sample was out-of-bag. So
# sum((ozone$O3 - rfModel$predicted)^2) is more like a test error, and
# sum((ozone$O3 - predict(rfModel, newdata=ozone))^2) is like a training
# error, which is smaller.
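
We can confirm this interpretation using the OOB error that randomForest tracks per tree: for regression forests, rfModel$mse stores the OOB MSE after each tree, so its last element should match the OOB predictions above (a sketch).

mean((ozone$O3 - rfModel$predicted)^2)  # OOB mean squared error
rfModel$mse[rfModel$ntree]              # should match the value above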

Classification Tree

We have some training data consisting of 148 cases with the following variables: the species (one of three: Giganteus, Melanops, and Fuliginosus), the sex of the animal, and 18 skull measurements (Andrews and Herzberg, 1985). The goal is to identify the species of a historical specimen from the Rijksmuseum van Natuurlijke Historie in Leiden, whose skull measurements are stored in x0.

data(kanga)
x0 = c(1115, NA, 748, 182, NA, NA, 178, 311, 756, 226,
       NA, NA, NA, 48, 1009, NA, 204, 593)

# How to deal with the missing values in x0 and in the training data?
# First, keep only the measurements that are observed for x0 (and drop sex).
kanga = kanga[, c(TRUE, FALSE, !is.na(x0))]
kanga[1:2,]
##     species basilar.length palate.length palate.width squamosal.depth
## 1 giganteus           1312           882           NA             180
## 2 giganteus           1439           985          230             150
##   lacrymal.width zygomatic.width orbital.width foramina.length
## 1            394             782           249              88
## 2            416             824           233             100
##   mandible.length mandible.depth ramus.height
## 1            1086            179          591
## 2            1158            181          643

apply(kanga, 2, function(x) sum(is.na(x)))
##         species  basilar.length   palate.length    palate.width 
##               0               1               1              24 
## squamosal.depth  lacrymal.width zygomatic.width   orbital.width 
##               1               0               1               0 
## foramina.length mandible.length  mandible.depth    ramus.height 
##               0              12               0               0

round(cor(kanga[,-1], use="pairwise.complete.obs")[, c(3, 9)], 2)
##                 palate.width mandible.length
## basilar.length          0.77            0.98
## palate.length           0.81            0.98
## palate.width            1.00            0.81
## squamosal.depth         0.69            0.80
## lacrymal.width          0.77            0.92
## zygomatic.width         0.78            0.92
## orbital.width           0.12            0.25
## foramina.length         0.19            0.23
## mandible.length         0.81            1.00
## mandible.depth          0.62            0.85
## ramus.height            0.73            0.94

palate.width and mandible.length contain most of the remaining NAs and are highly correlated with the other measurements, so we drop these two columns and then remove the few remaining incomplete rows.

newko = na.omit(kanga[, -c(4, 10)])
dim(newko)
## [1] 144 10

Fit a classification tree.

kt = rpart(species ~ ., data=newko,cp=0.0001)
printcp(kt)
## 
## Classification tree:
## rpart(formula = species ~ ., data = newko, cp = 1e-04)
## 
## Variables actually used in tree construction:
## [1] basilar.length  foramina.length lacrymal.width  ramus.height   
## [5] squamosal.depth zygomatic.width
## 
## Root node error: 95/144 = 0.65972
## 
## n= 144 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.178947      0   1.00000 1.12632 0.055193
## 2 0.105263      1   0.82105 1.03158 0.058896
## 3 0.050000      2   0.71579 0.84211 0.062767
## 4 0.021053      6   0.51579 0.90526 0.061952
## 5 0.010526      7   0.49474 0.87368 0.062416
## 6 0.000100      8   0.48421 0.87368 0.062416
ktp = prune(kt,cp=0.0211)
plot(ktp,compress=T,uniform=T,branch=0.4,margin=.10)
text(ktp)

table (newko$species, predict(ktp,type="class"))
##              
##               fuliginosus giganteus melanops
##   fuliginosus          43         4        2
##   giganteus            12        29        7
##   melanops             15         9       23
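
Finally, back to the original goal: classify the Leiden specimen. A sketch, assuming the entries of x0 are in the same order as the skull-measurement columns of kanga (as the subsetting above presumed), and recalling that the 3rd and 9th remaining measurements were dropped when forming newko:

x0.obs = x0[!is.na(x0)]           # the 11 measurements observed for x0
x0.obs = x0.obs[-c(3, 9)]         # drop palate.width and mandible.length
x0.df = as.data.frame(t(x0.obs))
names(x0.df) = names(newko)[-1]   # align with the predictors in newko
predict(ktp, x0.df, type="class")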

Fit a RandomForest for classification

rfModel = randomForest(species~., data = newko)

table(newko$species,  predict(rfModel,newko))             
##              
##               fuliginosus giganteus melanops
##   fuliginosus          49         0        0
##   giganteus             0        48        0
##   melanops              0         0       47
table(newko$species, rfModel$predicted)
##              
##               fuliginosus giganteus melanops
##   fuliginosus          40         5        4
##   giganteus             5        22       21
##   melanops             13        22       12
rfModel$confusion
##             fuliginosus giganteus melanops class.error
## fuliginosus          40         5        4   0.1836735
## giganteus             5        22       21   0.5416667
## melanops             13        22       12   0.7446809
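
The class.error column is just the per-class OOB error rate, as a quick check shows (a sketch):

tab = table(newko$species, rfModel$predicted)   # OOB confusion counts
1 - diag(tab)/rowSums(tab)  # should match rfModel$confusion[, "class.error"]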