library(faraway)
data(ozone)
?ozone
summary(ozone)
ozone[1:5,]
Show the pairwise correlation among variables.
library(corrplot)
M=cor(ozone)
corrplot(M, method="ellipse")
corrplot.mixed(M, lower="number", upper="ellipse")
The goal is to predict O3
with the other variables. The following pairwise plots show reveal several nonlinear relationships indicating that a linear regression might not be appropriate without the addition of some transformations. pairs(ozone,pch=“.”)
pairs(ozone, pch=".")
library(rpart)
##
## Attaching package: 'rpart'
##
## The following object is masked from 'package:faraway':
##
## solder
roz = rpart(O3 ~ .,ozone)
plot(roz,compress=T,uniform=T,branch=0.4,margin=.10)
text(roz)
Prediction with a tree model
x0 = apply(ozone[,-1],2,median)
predict(roz, data.frame(t(x0)))
## 1
## 14.35484
This is essentially a model selection problem. Can we use criteria like AIC/BIC? Indeed we could define a model selection criterion similar to AIC/BIC: RSS + alpha*tree-size. Here the tree-size is defined to be the number of parameters a tree mode has, which is equal to the number of terminal (or leaf) nodes. The penalty per extra dimension, i.e., alpha, is also referred to as the complexity parameter (CP).
Instead of setting the CP to be 2 or log(n), try all possible values for CP.
It turns out when we increase CP from 0 to infinity, the resulting optimal sub-trees form a nested sequence, i.e., the optimal trees get smaller when it costs more to have an additional node.
The optimal subtree (based on the cost-complexity criterion) stays the same for a range of CP values. And it is not difficult to find all the break-points (of CP), i.e., values of CP with which the returned optimal sub-tree changes, which is the cptable
returned by rpart
.
printcp(roz)
##
## Regression tree:
## rpart(formula = O3 ~ ., data = ozone)
##
## Variables actually used in tree construction:
## [1] doy dpg humidity ibh ibt temp vis
##
## Root node error: 21115/330 = 63.986
##
## n= 330
##
## CP nsplit rel error xerror xstd
## 1 0.545699 0 1.00000 1.00264 0.076171
## 2 0.073659 1 0.45430 0.46883 0.041058
## 3 0.053542 2 0.38064 0.41305 0.038397
## 4 0.026756 3 0.32710 0.36964 0.035783
## 5 0.023276 4 0.30034 0.36599 0.035977
## 6 0.023102 5 0.27707 0.35832 0.035577
## 7 0.015325 6 0.25397 0.34008 0.034326
## 8 0.010914 7 0.23864 0.33047 0.032413
## 9 0.010000 8 0.22773 0.32247 0.032283
plotcp(roz)
The current tree is too small. Let’s fit a bigger tree (i.e., fit a tree with a smaller CP value), and then do the pruning.
roze = rpart(O3 ~ .,ozone, cp=0.001)
printcp(roze)
##
## Regression tree:
## rpart(formula = O3 ~ ., data = ozone, cp = 0.001)
##
## Variables actually used in tree construction:
## [1] doy dpg humidity ibh ibt temp vh vis
##
## Root node error: 21115/330 = 63.986
##
## n= 330
##
## CP nsplit rel error xerror xstd
## 1 0.5456993 0 1.00000 1.00652 0.077189
## 2 0.0736591 1 0.45430 0.48314 0.042643
## 3 0.0535415 2 0.38064 0.41860 0.037416
## 4 0.0267557 3 0.32710 0.37441 0.035033
## 5 0.0232760 4 0.30034 0.37421 0.035282
## 6 0.0231021 5 0.27707 0.38221 0.036033
## 7 0.0153249 6 0.25397 0.37653 0.034986
## 8 0.0109137 7 0.23864 0.37888 0.036591
## 9 0.0070746 8 0.22773 0.37656 0.035441
## 10 0.0059918 9 0.22065 0.38556 0.036511
## 11 0.0059317 10 0.21466 0.38597 0.037257
## 12 0.0049709 12 0.20280 0.38678 0.036954
## 13 0.0047996 15 0.18789 0.38980 0.037602
## 14 0.0044712 16 0.18309 0.38603 0.037590
## 15 0.0031921 17 0.17861 0.38934 0.039488
## 16 0.0022152 19 0.17223 0.39578 0.039327
## 17 0.0020733 20 0.17002 0.38786 0.039109
## 18 0.0020297 22 0.16587 0.38993 0.039110
## 19 0.0014432 23 0.16384 0.39080 0.039092
## 20 0.0011322 24 0.16240 0.38918 0.039116
## 21 0.0011035 25 0.16126 0.38761 0.039141
## 22 0.0010000 26 0.16016 0.38696 0.039153
plotcp(roze)
tmp1=rpart(O3~., ozone, cp=0.01)
tmp2=rpart(O3~., ozone, cp=0.008)
tmp3=rpart(O3~., ozone, cp=0.007)
par(mfrow=c(1,3))
plot(tmp1,compress=T,uniform=T,branch=0.4,margin=.10)
text(tmp1)
plot(tmp2,compress=T,uniform=T,branch=0.4,margin=.10)
text(tmp2)
plot(tmp3,compress=T,uniform=T,branch=0.4,margin=.10)
text(tmp3)
tmp1=rpart(O3~., ozone, cp=0.0055)
tmp2=rpart(O3~., ozone, cp=0.0049)
par(mfrow=c(1,2))
plot(tmp1,compress=T,uniform=T,branch=0.4,margin=.10)
text(tmp1, cex=0.5)
plot(tmp2,compress=T,uniform=T,branch=0.4,margin=.10)
text(tmp2, cex=0.5)
However, we still don’t know what the optimal choice of CP. The cptable
gives us all candidate tree sizes; for each tree size, we compute the corresponding CV error, and then pick the subtree with the CV errors. There are usually two approaches to determining the optimal tree size.
size.min: Pick the optimal tree size which has the smallest CV error.
size.1se: Pick the smallest tree size whose CV error is within 1se (one-standard-error) of the smallest CV error.
Sometimes, these two approaches would give the same result, but sometimes they may not. For this particular example, from the CV error plot (produced by plotcp
), we can see that the CV error first dropped dramatically when the tree size increases, and then became stablized at roughly the same value for a wide range of tree sizes. The size.1se approach may work better here: any tree, whose CV error is within one-standard error of the smallest CV error, would be a good choice, so we pick the smallest tree among them. The dashed line in the CV error plot marks the smallest CV error plus its one-standard-error, so we should just pick the smallest tree among all trees below the dashed line.
myCPtable = roze$cptable
id.min = which.min(myCPtable[,'xerror'])
my1se.err = myCPtable[id.min,'xerror'] + myCPtable[id.min, 'xstd']
plotcp(roze)
abline(h=my1se.err, col="red")
# the red line should cover the original dashed line
# The size.min approach: pick the tree size with the smallest CV error, this
# can be done by pruning your tree using a CP value between myCPtable[id.min, 'CP'] and myCPtable[id.min-1, 'CP']
# Of course, you need to modify the code if id.min =1.
CP.min = (myCPtable[id.min, 'CP'] + myCPtable[(id.min-1), 'CP'])/2
tree.min = prune.rpart(roze, CP.min)
plot(tree.min)
text(tree.min)
# The size.1se approach: pick the smallest tree below the dashed line
id.1se = min(which(myCPtable[,'xerror'] < my1se.err))
CP.1se = (myCPtable[id.1se, 'CP'] + myCPtable[(id.1se-1), 'CP'])/2
tree.1se = prune.rpart(roze, CP.1se)
plot(tree.1se)
text(tree.1se)
library(randomForest);
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
?randomForest
rfModel = randomForest(O3~., data = ozone)
Items returned by randomForest
ntree: Number of trees in the forest, default = 500.
mtry: at each split (of any tree), only “mtry” random chosen predictors are used as candidate predictors, default = 3.
oob.times: oob = out of bag; for each tree, we draw a random sample (with replacement) of the data from a bag and oob.times
stands for the number of times, a particular sample is not selected into the training set.
names(rfModel)
## [1] "call" "type" "predicted"
## [4] "mse" "rsq" "oob.times"
## [7] "importance" "importanceSD" "localImportance"
## [10] "proximity" "ntree" "mtry"
## [13] "forest" "coefs" "y"
## [16] "test" "inbag" "terms"
rfModel$ntree # number of tress in the forest
## [1] 500
rfModel$mtry
## [1] 3
How to extract yhat of the training data? For regression, suppose the output is stored in “lmout”, then we can use 1) lmout$fitted; or 2) predict(lmout, newdata=trainingdata), and the two vectors should be the same.
For randomForest
, however, the two approaches return different results. Why?
yhat = predict(rfModel, newdata=ozone)
sum((ozone$O3 - yhat)^2)
## [1] 999.8417
# which is different from rfModel$predicted.
sum((ozone$O3- rfModel$predicted)^2)
## [1] 5264.606
How is rfModel$predicted
computed?
length(rfModel$oob.times)
## [1] 330
rfModel$oob.times[1:5]
## [1] 187 196 162 204 183
length(rfModel$predicted)
## [1] 330
rfModel$predicted[1:5]
## 1 2 3 4 5
## 5.255971 5.995068 9.149588 7.687663 5.328415
# Then rfModel$predicted[1] is the average prediction of the 1st sample from the rfModel$oob.times[1] trees. So sum((ozone$O3- rfModel$predicted)^2) is more like a test error, and sum((ozone$O3 - predict(rfModel, newdata=ozone) )^2) is like a training error, which is smaller.
We have some training data consisting of 148 cases with the following variables: there are three possible species, Giganteus, Melanops and Fuliginosus, the sex of the animal and 18 skull measurements (Andrews and Herzberg, 1985). The goal is to identify the species of a historical specimen from the Rijksmuseum van Natuurlijkee in Leiden whose skull measurements is stored in x0.
```r data(kanga) x0=c(1115, NA, 748, 182, NA, NA, 178, 311, 756, 226, NA, NA, NA, 48, 1009, NA, 204, 593)
# How to deal with the missing values in x0 and the in the training data kanga = kanga[,c (T, F, !is.na(x0))] kanga[1:2,] ```
## species basilar.length palate.length palate.width squamosal.depth ## 1 giganteus 1312 882 NA 180 ## 2 giganteus 1439 985 230 150 ## lacrymal.width zygomatic.width orbital.width foramina.length ## 1 394 782 249 88 ## 2 416 824 233 100 ## mandible.length mandible.depth ramus.height ## 1 1086 179 591 ## 2 1158 181 643
r apply(kanga,2,function(x) sum(is.na(x)))
## species basilar.length palate.length palate.width ## 0 1 1 24 ## squamosal.depth lacrymal.width zygomatic.width orbital.width ## 1 0 1 0 ## foramina.length mandible.length mandible.depth ramus.height ## 0 12 0 0
r round(cor(kanga[,-1],use="pairwise.complete.obs")[,c(3, 9)],2)
## palate.width mandible.length ## basilar.length 0.77 0.98 ## palate.length 0.81 0.98 ## palate.width 1.00 0.81 ## squamosal.depth 0.69 0.80 ## lacrymal.width 0.77 0.92 ## zygomatic.width 0.78 0.92 ## orbital.width 0.12 0.25 ## foramina.length 0.19 0.23 ## mandible.length 0.81 1.00 ## mandible.depth 0.62 0.85 ## ramus.height 0.73 0.94
r newko = na.omit (kanga [, -c(4, 10)]) dim(newko)
## [1] 144 10
Fit a classification tree.
kt = rpart(species ~ ., data=newko,cp=0.0001)
printcp(kt)
##
## Classification tree:
## rpart(formula = species ~ ., data = newko, cp = 1e-04)
##
## Variables actually used in tree construction:
## [1] basilar.length foramina.length lacrymal.width ramus.height
## [5] squamosal.depth zygomatic.width
##
## Root node error: 95/144 = 0.65972
##
## n= 144
##
## CP nsplit rel error xerror xstd
## 1 0.178947 0 1.00000 1.12632 0.055193
## 2 0.105263 1 0.82105 1.03158 0.058896
## 3 0.050000 2 0.71579 0.84211 0.062767
## 4 0.021053 6 0.51579 0.90526 0.061952
## 5 0.010526 7 0.49474 0.87368 0.062416
## 6 0.000100 8 0.48421 0.87368 0.062416
ktp = prune(kt,cp=0.0211)
plot(ktp,compress=T,uniform=T,branch=0.4,margin=.10)
text(ktp)
table (newko$species, predict(ktp,type="class"))
##
## fuliginosus giganteus melanops
## fuliginosus 43 4 2
## giganteus 12 29 7
## melanops 15 9 23
Fit a RandomForest for classification
rfModel = randomForest(species~., data = newko)
table(newko$species, predict(rfModel,newko))
##
## fuliginosus giganteus melanops
## fuliginosus 49 0 0
## giganteus 0 48 0
## melanops 0 0 47
table(newko$species, rfModel$predicted)
##
## fuliginosus giganteus melanops
## fuliginosus 40 5 4
## giganteus 5 22 21
## melanops 13 22 12
rfModel$confusion
## fuliginosus giganteus melanops class.error
## fuliginosus 40 5 4 0.1836735
## giganteus 5 22 21 0.5416667
## melanops 13 22 12 0.7446809