Regression Splines

library(splines);
library(ggplot2)
help(bs)
help(ns)

Spline Basis Functions

Basis functions for cubic splined with knots with 5 knots and df = 9.

x=(1:199)/100;
n = length(x)
m=5;
myknots= 2*(1:m)/(m+1)
myknots

## [1] 0.3333 0.6667 1.0000 1.3333 1.6667

F=bs(x,knots=myknots, intercept=TRUE)
dim(F)

## [1] 199   9

mydf = m+4; 
tmpdata = data.frame(t = rep(1:n, mydf),
                     basisfunc=as.vector(F), 
                     type=as.factor(rep(1:mydf, each=n)))
ggplot(tmpdata, aes(x=t, y=basisfunc, color=type)) + 
  geom_path()

F=bs(x,knots=myknots)
dim(F)

## [1] 199   8

mydf = m+3; 
tmpdata = data.frame(t = rep(1:n, mydf),
                     basisfunc=as.vector(F), 
                     type=as.factor(rep(1:mydf, each=n)))
ggplot(tmpdata, aes(x=t, y=basisfunc, color=type)) +
  geom_path()

Basis functions for NCS with 7 knots (5 interior knots and 2 boundary knots) and df =7

F=ns(x, knots=myknots, Boundary.knots=c(0,2), intercept=TRUE)
dim(F)

## [1] 199   7

mydf = 7

tmpdata = data.frame(t = rep(1:n, mydf),
                     basisfunc=as.vector(F), 
                     type=as.factor(rep(1:mydf, each=n)))

ggplot(tmpdata, aes(x=t, y=basisfunc, color=type)) +
  geom_path()

The Birthrates Data

This dataset lists the number of live births per 10,000 23-year-old women in the United States between 1917 and 2003.

source("birthrates.txt");
birthrates = as.data.frame(birthrates)
names(birthrates) = c("year", "rate")
ggplot(birthrates, aes(x=year, y=rate)) + 
  geom_point() + geom_smooth(method="lm", se=FALSE)

Understand how R counts the df.

fit1=lm(rate~bs(year, knots=quantile(year, c(1/3, 2/3))), data=birthrates);
fit2=lm(rate~bs(year, df=5), data=birthrates);
fit3=lm(rate~bs(year, df=6, intercept=TRUE), data=birthrates) 
fit4=lm(rate~bs(year, df=5, intercept=TRUE), data=birthrates)

plot(birthrates$year, birthrates$rate, ylim=c(90,280))
lines(spline(birthrates$year, predict(fit1)), col="red", lty=1)
lines(spline(birthrates$year, predict(fit2)), col="blue", lty=2)
lines(spline(birthrates$year, predict(fit3)), col="green", lty=3)
lines(spline(birthrates$year, predict(fit4)), lty=2, lwd=2)

# Alternatively, you can predict the spline fit on a fine grid, and then connect them

# plot(birthrates$year, birthrates$rate, ylim=c(90,280))
# year.grid = seq(from=min(birthrates$year), to=max(birthrates$year), length=200)
# ypred = predict(fit1, data.frame(year=year.grid))
# lines(year.grid, ypred, col="blue", lwd=2)

fit1=lm(rate~ns(year, knots=quantile(year, (1:4)/5)), data=birthrates);
fit2=lm(rate~ns(year, df=5), data=birthrates);
fit3=lm(rate~ns(year, df=6, intercept=TRUE), data=birthrates) 

plot(birthrates$year, birthrates$rate, ylim=c(90,280))
lines(spline(birthrates$year, predict(fit1)), col="red", lty=1)
lines(spline(birthrates$year, predict(fit2)), col="blue", lty=2)
lines(spline(birthrates$year, predict(fit3)), col="green", lty=3)

Try cubic splines with different degree-of-freedoms

plot(birthrates$year, birthrates$rate, ylim=c(90,280));
lines(spline(birthrates$year, predict(lm(rate~bs(year, df=4), data=birthrates))), col="blue");
lines(spline(birthrates$year, predict(lm(rate~bs(year, df=7), data=birthrates))), col="red");
lines(spline(birthrates$year, predict(lm(rate~bs(year, df=19), data=birthrates))), col="black");
legend("topright", lty=rep(1,3), col=c("blue", "red", "black"), legend=c("df=5", "df=8", "df=20"))

Use 10-fold CV to select df (or equivalently the number of knots)

The location of knots will affect the performance of a spline model. But selecting the location of knots is computationally too expensive. Instead, we always place knots equally at quantiles of x, and then select just the number of knots, or equivalently, the df. Can we use F-test to select the number of knots?

For each df, we use 10-fold CV to calculate the CV error. When doing 10-fold CV, each time, based on 90% of the data, we place the (df-4) knots at the corresponding quantiles and then fit a regression spline,

First, we need to divide the data into K folds. I’ve seen people use the following command, but when the sample size is relatively small, the foldsize could be very unbalanced.

K=10
fold.id = sample(1:K, nrow(birthrates), replace=TRUE)
fold.id

##  [1]  6  7  4  8  8  9  8  3  1  2  9  3  7  9  6  8  3  4  7  5 10 10  8
## [24]  5 10 10  1  2  3  7  5  6  8  5  2  4  6  5  8  3  6  4  7 10  8  9
## [47]  8  6  7  7  9 10  9  6  9  2  9 10  9  3  2  3  7  9  3  7  1  2  6
## [70] 10  3  3  6  1  9  9 10  7  6  6  4  6  2  6  4  6  5

table(fold.id)

## fold.id
##  1  2  3  4  5  6  7  8  9 10 
##  4  7 10  6  6 14 10  9 12  9

Instead, we use the following command to divide the data into roughly equal size of folds.

K=10
n = nrow(birthrates)
fold.size = c(rep(9, 7), rep(8, 3))
fold.size

##  [1] 9 9 9 9 9 9 9 8 8 8

sum(fold.size)

## [1] 87

fold.id = rep(0, n); 
for(i in 1:(K-1)){
  tmp.id = sample((1:n)[fold.id == 0], fold.size[i], replace=FALSE)
  print(tmp.id)
  fold.id[tmp.id] = i
  }

## [1] 52 79 43 64  5 41 12 86 39
## [1] 38 47 82 27  8 87 48 50 30
## [1] 65 84 34  2 76 21  6 67 28
## [1] 24 31 15 49 83 16  1  9 42
## [1] 66 69 36 61 32 40 78 44 46
## [1] 51 71 25 73 72 10 54  4 59
## [1] 13 77 57 74 23 53 68 11 81
## [1] 45 14 63 17 70 80 33 26
## [1]  7 60 85 19 75 37 18 62

fold.id[fold.id==0] = K
fold.id

##  [1]  4  3 10  6  1  3  9  2  4  6  7  1  7  8  4  4  8  9  9 10  3 10  7
## [24]  4  6  8  2  3 10  2  4  5  8  3 10  5  9  2  1  5  1  4  1  5  8  5
## [47]  2  2  4  2  6  1  7  6 10 10  7 10  6  9  5  9  8  1  3  5  3  7  5
## [70]  8  6  6  6  7  9  3  7  5  1  8  7  2  4  3  9  1  2

table(fold.id)

## fold.id
##  1  2  3  4  5  6  7  8  9 10 
##  9  9  9  9  9  9  9  8  8  8

mydf=10:30
mycv = rep(0, length(mydf))

for(i in 1:length(mydf)){
  m = mydf[i]-4;  
  for(k in 1:K){
    id = which(fold.id == k);
    myknots = quantile(birthrates$year[-id], (1:m)/(m+1))
    myfit = lm(rate ~ bs(year, knots=myknots), data=birthrates[-id,])
    ypred = predict(myfit, newdata=birthrates[id,])
    mycv[i]=mycv[i] + sum((birthrates$rate[id] - ypred)^2)
  }
}

plot(mydf, mycv)

Re-run the 10-fold CV. The plot of mydf versus mycv may vary, but shouldn’t be too different.

fold.id = rep(0, n); 
for(i in 1:(K-1)){
  tmp.id = sample((1:n)[fold.id == 0], fold.size[i], replace=FALSE)
  print(tmp.id)
  fold.id[tmp.id] = i
  }

## [1] 61 76 32 20 58 54  8 72 82
## [1] 86 47 55  3 12 49 79 37 35
## [1] 53 52 18 62 26 60 77 70 31
## [1] 64 66 34 33 13 39 81 57 28
## [1] 71  1 36 22 74 16  7 63 14
## [1] 46 21  6 42 56 40 51  2 84
## [1] 17  4  5 80 25 65 23 27 38
## [1] 30 48 11 85 87 78 45 41
## [1] 15 43 19  9 44 83 69 29

fold.id[fold.id==0] = K
mycv = rep(0, length(mydf))

for(i in 1:length(mydf)){
  m = mydf[i]-4; 
  myknots = quantile(birthrates$year, (1:m)/(m+1))
  for(k in 1:K){
    id = which(fold.id == k);
    myfit = lm(rate ~ bs(year, knots=myknots), data=birthrates[-id,])
    #myspline = interpSpline(year[-id], predict(myfit))
    ypred = predict(myfit, newdata=birthrates[id,])
    mycv[i]=mycv[i] + sum((birthrates$rate[id] - ypred)^2)
  }
}
plot(mydf, mycv)