library(splines);
library(ggplot2)
help(bs)
help(ns)
x=(1:199)/100;
n = length(x)
m=5;
myknots= 2*(1:m)/(m+1)
myknots
## [1] 0.3333 0.6667 1.0000 1.3333 1.6667
F=bs(x,knots=myknots, intercept=TRUE)
dim(F)
## [1] 199 9
mydf = m+4;
tmpdata = data.frame(t = rep(1:n, mydf),
basisfunc=as.vector(F),
type=as.factor(rep(1:mydf, each=n)))
ggplot(tmpdata, aes(x=t, y=basisfunc, color=type)) +
geom_path()
F=bs(x,knots=myknots)
dim(F)
## [1] 199 8
mydf = m+3;
tmpdata = data.frame(t = rep(1:n, mydf),
basisfunc=as.vector(F),
type=as.factor(rep(1:mydf, each=n)))
ggplot(tmpdata, aes(x=t, y=basisfunc, color=type)) +
geom_path()
F=ns(x, knots=myknots, Boundary.knots=c(0,2), intercept=TRUE)
dim(F)
## [1] 199 7
mydf = 7
tmpdata = data.frame(t = rep(1:n, mydf),
basisfunc=as.vector(F),
type=as.factor(rep(1:mydf, each=n)))
ggplot(tmpdata, aes(x=t, y=basisfunc, color=type)) +
geom_path()
This dataset lists the number of live births per 10,000 23-year-old women in the United States between 1917 and 2003.
source("birthrates.txt");
birthrates = as.data.frame(birthrates)
names(birthrates) = c("year", "rate")
ggplot(birthrates, aes(x=year, y=rate)) +
geom_point() + geom_smooth(method="lm", se=FALSE)
fit1=lm(rate~bs(year, knots=quantile(year, c(1/3, 2/3))), data=birthrates);
fit2=lm(rate~bs(year, df=5), data=birthrates);
fit3=lm(rate~bs(year, df=6, intercept=TRUE), data=birthrates)
fit4=lm(rate~bs(year, df=5, intercept=TRUE), data=birthrates)
plot(birthrates$year, birthrates$rate, ylim=c(90,280))
lines(spline(birthrates$year, predict(fit1)), col="red", lty=1)
lines(spline(birthrates$year, predict(fit2)), col="blue", lty=2)
lines(spline(birthrates$year, predict(fit3)), col="green", lty=3)
lines(spline(birthrates$year, predict(fit4)), lty=2, lwd=2)
# Alternatively, you can predict the spline fit on a fine grid, and then connect them
# plot(birthrates$year, birthrates$rate, ylim=c(90,280))
# year.grid = seq(from=min(birthrates$year), to=max(birthrates$year), length=200)
# ypred = predict(fit1, data.frame(year=year.grid))
# lines(year.grid, ypred, col="blue", lwd=2)
fit1=lm(rate~ns(year, knots=quantile(year, (1:4)/5)), data=birthrates);
fit2=lm(rate~ns(year, df=5), data=birthrates);
fit3=lm(rate~ns(year, df=6, intercept=TRUE), data=birthrates)
plot(birthrates$year, birthrates$rate, ylim=c(90,280))
lines(spline(birthrates$year, predict(fit1)), col="red", lty=1)
lines(spline(birthrates$year, predict(fit2)), col="blue", lty=2)
lines(spline(birthrates$year, predict(fit3)), col="green", lty=3)
plot(birthrates$year, birthrates$rate, ylim=c(90,280));
lines(spline(birthrates$year, predict(lm(rate~bs(year, df=4), data=birthrates))), col="blue");
lines(spline(birthrates$year, predict(lm(rate~bs(year, df=7), data=birthrates))), col="red");
lines(spline(birthrates$year, predict(lm(rate~bs(year, df=19), data=birthrates))), col="black");
legend("topright", lty=rep(1,3), col=c("blue", "red", "black"), legend=c("df=5", "df=8", "df=20"))
The location of knots will affect the performance of a spline model. But selecting the location of knots is computationally too expensive. Instead, we always place knots equally at quantiles of x, and then select just the number of knots, or equivalently, the df. Can we use F-test to select the number of knots?
For each df, we use 10-fold CV to calculate the CV error. When doing 10-fold CV, each time, based on 90% of the data, we place the (df-4) knots at the corresponding quantiles and then fit a regression spline,
First, we need to divide the data into K
folds. I’ve seen people use the following command, but when the sample size is relatively small, the foldsize could be very unbalanced.
K=10
fold.id = sample(1:K, nrow(birthrates), replace=TRUE)
fold.id
## [1] 6 7 4 8 8 9 8 3 1 2 9 3 7 9 6 8 3 4 7 5 10 10 8
## [24] 5 10 10 1 2 3 7 5 6 8 5 2 4 6 5 8 3 6 4 7 10 8 9
## [47] 8 6 7 7 9 10 9 6 9 2 9 10 9 3 2 3 7 9 3 7 1 2 6
## [70] 10 3 3 6 1 9 9 10 7 6 6 4 6 2 6 4 6 5
table(fold.id)
## fold.id
## 1 2 3 4 5 6 7 8 9 10
## 4 7 10 6 6 14 10 9 12 9
Instead, we use the following command to divide the data into roughly equal size of folds.
K=10
n = nrow(birthrates)
fold.size = c(rep(9, 7), rep(8, 3))
fold.size
## [1] 9 9 9 9 9 9 9 8 8 8
sum(fold.size)
## [1] 87
fold.id = rep(0, n);
for(i in 1:(K-1)){
tmp.id = sample((1:n)[fold.id == 0], fold.size[i], replace=FALSE)
print(tmp.id)
fold.id[tmp.id] = i
}
## [1] 52 79 43 64 5 41 12 86 39
## [1] 38 47 82 27 8 87 48 50 30
## [1] 65 84 34 2 76 21 6 67 28
## [1] 24 31 15 49 83 16 1 9 42
## [1] 66 69 36 61 32 40 78 44 46
## [1] 51 71 25 73 72 10 54 4 59
## [1] 13 77 57 74 23 53 68 11 81
## [1] 45 14 63 17 70 80 33 26
## [1] 7 60 85 19 75 37 18 62
fold.id[fold.id==0] = K
fold.id
## [1] 4 3 10 6 1 3 9 2 4 6 7 1 7 8 4 4 8 9 9 10 3 10 7
## [24] 4 6 8 2 3 10 2 4 5 8 3 10 5 9 2 1 5 1 4 1 5 8 5
## [47] 2 2 4 2 6 1 7 6 10 10 7 10 6 9 5 9 8 1 3 5 3 7 5
## [70] 8 6 6 6 7 9 3 7 5 1 8 7 2 4 3 9 1 2
table(fold.id)
## fold.id
## 1 2 3 4 5 6 7 8 9 10
## 9 9 9 9 9 9 9 8 8 8
mydf=10:30
mycv = rep(0, length(mydf))
for(i in 1:length(mydf)){
m = mydf[i]-4;
for(k in 1:K){
id = which(fold.id == k);
myknots = quantile(birthrates$year[-id], (1:m)/(m+1))
myfit = lm(rate ~ bs(year, knots=myknots), data=birthrates[-id,])
ypred = predict(myfit, newdata=birthrates[id,])
mycv[i]=mycv[i] + sum((birthrates$rate[id] - ypred)^2)
}
}
plot(mydf, mycv)
Re-run the 10-fold CV. The plot of mydf
versus mycv
may vary, but shouldn’t be too different.
fold.id = rep(0, n);
for(i in 1:(K-1)){
tmp.id = sample((1:n)[fold.id == 0], fold.size[i], replace=FALSE)
print(tmp.id)
fold.id[tmp.id] = i
}
## [1] 61 76 32 20 58 54 8 72 82
## [1] 86 47 55 3 12 49 79 37 35
## [1] 53 52 18 62 26 60 77 70 31
## [1] 64 66 34 33 13 39 81 57 28
## [1] 71 1 36 22 74 16 7 63 14
## [1] 46 21 6 42 56 40 51 2 84
## [1] 17 4 5 80 25 65 23 27 38
## [1] 30 48 11 85 87 78 45 41
## [1] 15 43 19 9 44 83 69 29
fold.id[fold.id==0] = K
mycv = rep(0, length(mydf))
for(i in 1:length(mydf)){
m = mydf[i]-4;
myknots = quantile(birthrates$year, (1:m)/(m+1))
for(k in 1:K){
id = which(fold.id == k);
myfit = lm(rate ~ bs(year, knots=myknots), data=birthrates[-id,])
#myspline = interpSpline(year[-id], predict(myfit))
ypred = predict(myfit, newdata=birthrates[id,])
mycv[i]=mycv[i] + sum((birthrates$rate[id] - ypred)^2)
}
}
plot(mydf, mycv)