
[R] Non-Linear Models : Splines

1. What are Splines?

Regression splines are more flexible than polynomials and step functions, and are in fact an extension of the two. They divide the range of X into K distinct regions. Within each region, a polynomial function is fit to the data; the polynomials are constrained so that they join smoothly at the region boundaries, or knots.

 

With regression splines, instead of fitting a single high-degree polynomial over the entire range of X, we fit several low-degree polynomials over different regions of X. Each of these functions can be fit using least squares.
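
To make the idea concrete, here is a minimal sketch (assuming the Wage data from the ISLR package, which the later examples also use): two separate cubic polynomials fit by least squares on either side of age 50. Nothing forces the two pieces to agree at the boundary, so the fitted curve can jump at the knot; splines add exactly the constraints that make the pieces join smoothly.

# Unconstrained piecewise cubic: one least-squares fit below the knot, one above
library(ISLR)
data(Wage)
lo <- lm(wage ~ poly(age, 3), data=Wage, subset=(age < 50))
hi <- lm(wage ~ poly(age, 3), data=Wage, subset=(age >= 50))

plot(Wage$age, Wage$wage, col="darkgrey", xlab="Age", ylab="Wage")
grid.lo <- seq(min(Wage$age), 50)
grid.hi <- seq(50, max(Wage$age))
lines(grid.lo, predict(lo, newdata=list(age=grid.lo)), lwd=2, col="red")
lines(grid.hi, predict(hi, newdata=list(age=grid.hi)), lwd=2, col="red")
abline(v=50, lty=2)  # the two pieces need not meet here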

 

2. Linear Splines

  • A linear spline with knots at \(\zeta_k\), \(k = 1, \ldots, K\), is a piecewise linear polynomial that is continuous at each knot.
  • \(y_i = \beta_0 + \beta_1 b_1(x_i) + \cdots + \beta_{K+1} b_{K+1}(x_i) + \epsilon_i\)
    • \(b_1(x_i) = x_i\)
    • \(b_{k+1}(x_i) = (x_i - \zeta_k)_{+}\), \(k = 1, \ldots, K\)
    • \((x_i - \zeta_k)_{+} = x_i - \zeta_k\) if \(x_i > \zeta_k\), and 0 otherwise

 

# Linear spline with a single knot at age 50
# (nWage and age.grid are assumed to carry over from the previous post in this series)
library(splines)   # provides bs() and ns()
g6 <- lm(wage ~ bs(age, knots=50, degree=1), data=nWage)
pred6 <- predict(g6, newdata=list(age=age.grid))

plot(nWage[, 2], nWage[, 11], col="darkgrey", xlab="Age",
     ylab="Wage")
title(main="Linear Spline")
lines(age.grid, pred6, lwd=2, col="darkred")
abline(v=50, lty=2)
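
The truncated power basis from the bullet list can also be built by hand (again assuming nWage as above). Since \(\{1, x, (x - 50)_{+}\}\) spans the same space as the bs() basis, the fitted values agree up to numerical error:

# Manual linear spline basis: b1(x) = x, b2(x) = (x - 50)+
xl <- cbind(nWage$age, pmax(nWage$age - 50, 0))
g6m <- lm(wage ~ xl, data=nWage)
max(abs(fitted(g6m) - fitted(g6)))  # ~ 0: same fit, different basis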

 

 

3. Cubic Splines

  • A cubic spline with knots at \(\zeta_k\), \(k = 1, \ldots, K\), is a piecewise cubic polynomial with continuous derivatives up to order 2 at each knot.
  • \(y_i = \beta_0 + \beta_1 b_1(x_i) + \beta_2 b_2(x_i) + \cdots + \beta_{K+3} b_{K+3}(x_i) + \epsilon_i\)
    • \(b_1(x_i) = x_i\)
    • \(b_2(x_i) = x_i^2\)
    • \(b_3(x_i) = x_i^3\)
    • \(b_{k+3}(x_i) = (x_i - \zeta_k)_{+}^3\), \(k = 1, \ldots, K\)
    • \((x_i - \zeta_k)_{+}^3 = (x_i - \zeta_k)^3\) if \(x_i > \zeta_k\), and 0 otherwise

 

# Truncated power basis functions 
d <- 3
knots <- 50 
x1 <- outer(nWage$age, 1:d, "^") # columns x, x^2, x^3
x2 <- outer(nWage$age, knots, ">") * outer(nWage$age, knots, "-")^d # truncated term (x - 50)+^3
x <- cbind(x1, x2) # make form of x, x^2, x^3, (x-50)^3

# Train the model (the basis matrix x is taken from the calling environment)
g4 <- lm(wage ~ x, data=nWage)

# Make testing set and predictions 
nx1 <- outer(age.grid, 1:d, "^")
nx2 <- outer(age.grid, knots, ">") * outer(age.grid, knots, "-")^d
nx <- cbind(nx1, nx2)
pred4 <- predict(g4, newdata=list(x=nx))

# Visualize 
par(mfrow=c(1,2))
plot(nWage[, 2], nWage[, 11], col="darkgrey", xlab="Age", ylab="Wage")
title(main = "Cubic Spline")
lines(age.grid, pred4, lwd=2, col="red")
abline(v=50, lty = 2)

# Make automatic splines using bs function 
library(splines)
g5 <- lm(wage ~ bs(age, knots=50), data=nWage)
pred5 <- predict(g5, newdata=list(age=age.grid))
plot(nWage[, 2], nWage[, 11], col="darkgrey", xlab="Age", ylab="Wage")
title(main="Cubic Spline")
lines(age.grid, pred5, lwd=2, col="red")
abline(v=50, lty=2)

 

 

4. Natural Cubic Splines

  • Splines can have high variance at the outer range of the predictors.
  • A natural spline is a regression spline with additional boundary constraints.
  • The function is required to be linear at the boundary, i.e. where \(X\) is smaller than the smallest knot or larger than the largest knot.

 

# Cubic spline
fit <- lm(wage ~ bs(age, knots=c(25, 40, 60)), data=nWage)
pred <- predict(fit, newdata=list(age=age.grid), se=T)

# Natural spline: using the ns() function
fit2 <- lm(wage ~ ns(age, knots=c(25, 40, 60)), data=nWage)
pred2 <- predict(fit2, newdata=list(age=age.grid), se=T)
plot(nWage[, 2], nWage[, 11], col="darkgrey", xlab="Age", ylab="Wage")
lines(age.grid, pred$fit, lwd=2, col=4)
lines(age.grid, pred$fit + 2*pred$se, lty="dashed", col=4)
lines(age.grid, pred$fit - 2*pred$se, lty="dashed", col=4)
lines(age.grid, pred2$fit, lwd=2, col=2)
lines(age.grid, pred2$fit + 2*pred2$se, lty="dashed", col=2)
lines(age.grid, pred2$fit - 2*pred2$se, lty="dashed", col=2)
abline(v=c(25, 40, 60), lty=2)
legend("topright", c("Natural Cubic Spline", "Cubic Spline"), lty=1, lwd=2, col=c(2, 4))

 

 

4.1 [Ex] Natural Cubic Splines with Degrees of Freedom

  • We can fit natural cubic splines with knots placed at percentiles of the data by specifying the df argument, as in ns(age, df=k).
  • Here we fit a natural cubic spline with three knots, whose locations are chosen automatically as the 25th, 50th, and 75th percentiles of age.

 

# Use a complete Wage data 
age <- Wage$age 
wage <- Wage$wage 

# Make test dataset 
age.grid <- seq(min(age), max(age)) 

# Training model with natural cubic splines 
g1 <- lm(wage ~ ns(age, df=4), data=Wage) 
pred1 <- predict(g1, newdata=list(age=age.grid), se=T) 
se.bands1 <- cbind(pred1$fit + 2*pred1$se.fit, pred1$fit - 2*pred1$se.fit) 

# Training model with natural cubic splines on a binary response 
g2 <- glm(I(wage > 250) ~ ns(age, df=4), data=Wage, family="binomial") 
pred2 <- predict(g2, newdata=list(age=age.grid), se=T) 
pfit <- exp(pred2$fit)/(1 + exp(pred2$fit)) 
se.bands.logit <- cbind(pred2$fit + 2*pred2$se.fit, pred2$fit - 2*pred2$se.fit)
se.bands2 <- exp(se.bands.logit)/(1 + exp(se.bands.logit)) 

# Visualize predicted confidence bands : lm fit 
par(mfrow=c(1,2), mar=c(4.5 ,4.5 ,1 ,1), oma=c(0, 0, 4, 0))
plot(age, wage, cex=.5, col="darkgrey", xlab="Age", ylab="Wage")
title("Natural Cubic Spline", outer=T)
lines(age.grid, pred1$fit, lwd=3, col=2)
matlines(age.grid, se.bands1, lwd=2, col=2, lty=2)
ncs <- ns(age, df=4)
attr(ncs, "knots")
abline(v=attr(ncs, "knots"), lty=3)

# Visualize predicted confidence bands : glm fit 
plot(age, I(wage > 250), type ="n", ylim=c(0, .2), xlab="Age",
     ylab="Pr(Wage>250 | Age)")
points(jitter(age), I((wage >250)/5), cex=.5, pch ="|",
       col="darkgrey")
lines(age.grid, pfit, lwd=3, col=2)
matlines(age.grid, se.bands2, lwd=2, col=2, lty=2)
abline(v=attr(ncs, "knots"), lty=3)

 

  • A cubic spline with \(K\) knots has \(K+4\) parameters or degrees of freedom.
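
This count is easy to confirm with the age variable already loaded: with \(K = 3\) knots, bs() produces \(K + 3 = 6\) basis columns, and the intercept brings the total to \(K + 4 = 7\) parameters. The ns() basis is smaller because the boundary constraints remove parameters.

ncol(bs(age, knots=c(25, 40, 60)))  # 6 columns; +1 intercept = K+4 = 7 parameters
ncol(ns(age, knots=c(25, 40, 60)))  # 4 columns: boundary constraints save parameters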

 

 

4.2 [Ex] Optimized Natural Cubic Splines

  • Place more knots where the function might vary most rapidly.
  • Place fewer knots where it seems more stable.
  • How many knots should we use? Choose the number by cross-validation.

 

set.seed(1111)
CVE <- matrix(0, 20, 10) 

# Repeat 10-fold cross-validation 20 times 
for (k in 1:20) { 
    gr <- sample(rep(seq(10), length=nrow(Wage))) 
    pred <- matrix(NA, nrow(Wage), 10) 
    # 10-fold cross-validation 
    for (i in 1:10) {
        tran <- (gr != i) 
        test <- (gr == i) 
        # Natural cubic splines with j degrees of freedom 
        for (j in 1:10) {
            nsx <- ns(age, df=j) 
            g <- lm(wage ~ nsx, data=Wage, subset=tran) 
            # Supply the basis via newdata so predictions cover all rows
            mse <- (Wage$wage - predict(g, list(nsx=nsx)))^2
            pred[test, j] <- mse[test]
        } 
    }
    }
    CVE[k, ] <- apply(pred, 2, mean)
}
RES <- apply(CVE, 2, mean)
plot(seq(10), RES, type="b", col=2, pch=20, xlab="Degrees of Freedom of Natural Spline", 
     ylab="Mean Squared Error")

 

 

4.3 [Ex] Comparison to Polynomial Regression

  • Regression splines often give superior results to polynomial regression.
  • The extra flexibility in the polynomial produces undesirable results at the boundaries, while the natural cubic spline still provides a reasonable fit to the data.

 

# Training models: natural cubic spline with 15 df and a degree-15 polynomial 
g1 <- lm(wage ~ ns(age, df=15), data=Wage)
g2 <- lm(wage ~ poly(age, 15), data=Wage)

# Make prediction of testing set
pred1 <- predict(g1, newdata=list(age=age.grid), se=T)
pred2 <- predict(g2, newdata=list(age=age.grid), se=T)

# Visualize comparison results 
plot(age, wage, cex=.5, col="darkgrey", xlab="Age", ylab="Wage")
lines(age.grid, pred1$fit, lwd=2, col=2)
lines(age.grid, pred1$fit + 2*pred1$se, lty="dashed", col=2)
lines(age.grid, pred1$fit - 2*pred1$se, lty="dashed", col=2)
lines(age.grid, pred2$fit, lwd=2, col=4)
lines(age.grid, pred2$fit + 2*pred2$se, lty="dashed", col=4)
lines(age.grid, pred2$fit - 2*pred2$se, lty="dashed", col=4)
legend("topleft", c("Natural Cubic Spline", "Polynomial"), lty=1, lwd=2, col=c(2, 4))

 

 

5. Smoothing Splines

  • Find a function \(g(x)\) that makes the \(RSS\) small, but that is also smooth.
  • A function \(g(x)\) that minimizes the criterion below is a smoothing spline.
    • \(\sum_{i=1}^n (y_i - g(x_i))^2 + \lambda \int g''(t)^2 \, dt\)
    • \(\lambda\) is a non-negative tuning parameter.
    • \(\lambda\) controls how wiggly \(g(x)\) is.
    • The smaller \(\lambda\), the more wiggly the function.
    • As \(\lambda \to \infty\), the function \(g(x)\) becomes linear.
  • The solution is a natural cubic spline, with a knot at every unique value of \(x_i\).
    • Smoothing splines avoid the knot-selection issue, leaving a single \(\lambda\) to be chosen.
    • The function smooth.spline() fits a smoothing spline.
    • The leave-one-out cross-validation (LOOCV) error can be computed very efficiently for smoothing splines; see the check after this list.
    • \(LOOCV_{\lambda} = \sum_{i=1}^n \big(y_i - \hat{g}_{\lambda}^{[-i]}(x_i)\big)^2 = \sum_{i=1}^n \left[\frac{y_i - \hat{g}_{\lambda}(x_i)}{1 - \{S_{\lambda}\}_{ii}}\right]^2\)
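
The leverage identity can be checked directly, since smooth.spline() returns the diagonal elements \(\{S_{\lambda}\}_{ii}\) of the smoother matrix as fit$lev. A small sketch on simulated, tie-free data (note that smooth.spline() reports its CV score as a mean rather than a sum):

# LOOCV via the leverage shortcut, on toy data without ties in x
set.seed(1)
x <- sort(runif(100))
y <- sin(2*pi*x) + rnorm(100, sd=0.3)
ss <- smooth.spline(x, y, cv=TRUE)
yhat <- predict(ss, x)$y
loocv.manual <- mean(((y - yhat)/(1 - ss$lev))^2)  # (1/n) x the sum above
c(ss$cv.crit, loocv.manual)  # should agree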

 

# Preparing dataset
library(ISLR)
library(splines)
data(Wage)
age <- Wage$age
wage <- Wage$wage
age.grid <- seq(min(age), max(age)) 

# Training models fitted in smoothing splines 
# Smoothing spline with df fixed at 16
fit <- smooth.spline(age, wage, df=16) 

# Optimized smoothing splines with cross validation
fit2 <- smooth.spline(age, wage, cv=TRUE) 
fit2$df  # effective degrees of freedom chosen by LOOCV (about 6.8)

# Natural cubic splines with df=7
fit3 <- lm(wage ~ ns(age, df=7), data=Wage)
pred3 <- predict(fit3, newdata=list(age=age.grid)) 

# Visualize fits 
plot(age, wage, cex=.5, col="darkgrey") 
title("Smoothing Spline vs Natural Spline") 
lines(fit, col="red", lwd=2)
lines(fit2, col="blue", lwd=2) 
lines(age.grid, pred3, col="green", lwd=2) 
legend("topright", legend=c("SS (DF=16)", "SS (DF=6.8)", "NS (DF=7)"), col=c("red","blue","green"), lty=1, lwd=2)

# Cross Validation Simulation
set.seed(1234)
N <- 10 # Simulation replications
K <- 10 # 10-fold CV
df <- seq(2, 20) ## Degrees of freedom
CVE <- matrix(0, N, length(df))
for (k in 1:N) {
    gr <- sample(rep(seq(K), length=nrow(Wage)))
    pred <- matrix(NA, nrow(Wage), length(df))
    for (i in 1:K) {
        tran <- (gr != i)
        test <- (gr == i)
        for (j in 1:length(df)) {
            fit <- smooth.spline(age[tran], wage[tran], df=df[j])
            mse <- (wage-predict(fit, age)$y)^2
            pred[test, j] <- mse[test]
        }
    }
    CVE[k, ] <- apply(pred, 2, mean)
}
RES <- apply(CVE, 2, mean)

# Visualize Cross Validation Error 
par(mfrow=c(1,2))
matplot(df, t(CVE), type="b", col=2, lty=2, pch=20, ylab="CV errors", xlab="Degrees of freedom")
plot(df, RES, type="b", col=2, pch=20, ylab="averaged CV errors", xlab="Degrees of freedom")
abline(v=df[which.min(RES)], col="grey", lty=4)

 

 

# Smoothing splines vs natural cubic splines 
set.seed(1357)
MSE1 <- matrix(0, 100, 2)
for (i in 1:100) {
    tran <- sample(nrow(Wage), size=floor(nrow(Wage)*2/3))
    test <- setdiff(1:nrow(Wage), tran)
    g1 <- smooth.spline(age[tran], wage[tran], df=7)
    g2 <- lm(wage ~ ns(age, df=7), data=Wage, subset=tran)
    mse1 <- (wage-predict(g1, age)$y)[test]^2
    mse2 <- (wage-predict(g2, Wage))[test]^2
    MSE1[i,] <- c(mean(mse1), mean(mse2))
}
apply(MSE1, 2, mean)  # average test MSE: smoothing spline vs natural spline
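
A side-by-side boxplot of the 100 replications shows the spread behind the two averages (a small optional addition):

# Distribution of test MSE over the 100 random splits
boxplot(MSE1, names=c("Smoothing Spline", "Natural Spline"), ylab="Test MSE", col=c(2, 4))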