
[R] Variable Selection Methods : Ridge

See_the_forest 2022. 10. 5. 16:48

1. Variable Selection Methods

  • Subset selection cannot be used when \(n \ll p\) (high-dimensional data): the number of candidate subsets grows as \(2^p\), and least squares is not even defined once a subset has more predictors than observations.
  • In that setting a shrinkage method is recommended instead.

 

2. Shrinkage

  • A technique that constrains or regularizes the coefficient estimates \(\hat{\beta}\), shrinking them towards zero. The shrunken estimator is biased: \(E(\hat{\beta}^{sh}) \neq \beta\).
  • In exchange, shrinking \(\hat{\beta}\) can greatly reduce variance: \(Var(\hat{\beta}^{OLS}) \gg Var(\hat{\beta}^{sh})\)
  • Examples
    • Ridge
    • Lasso
    • Elastic Net : Ridge + Lasso combined (the three penalty forms are written out below)
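
For reference (a summary I am adding, not part of the original post), the three penalized criteria can be written as:

  • Ridge : \(RSS + \lambda\sum_{j=1}^{p}\beta_j^2\)
  • Lasso : \(RSS + \lambda\sum_{j=1}^{p}|\beta_j|\)
  • Elastic Net : \(RSS + \lambda\left[(1-\alpha)\sum_{j=1}^{p}\beta_j^2 + \alpha\sum_{j=1}^{p}|\beta_j|\right]\) with a mixing parameter \(0 \le \alpha \le 1\) (glmnet puts an extra factor of \(1/2\) on the quadratic term)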

 

3. Ridge Regression

  • Ridge chooses \(\hat{\beta}\) to minimize \(RSS + \lambda\sum_{j=1}^{p}\beta_j^2\),
  • where \(\lambda \geq 0\) is a tuning parameter.
  • For a decreasing grid of \(\lambda\) values \(\lambda_{max} = \lambda_1 > \dots > \lambda_m = \lambda_{min}\),
  • the \(l_2\)-norms of the estimates satisfy \(\|\hat{\beta}_{\lambda_1}\|^2 < \dots < \|\hat{\beta}_{\lambda_m}\|^2\): the larger \(\lambda\), the smaller the norm (see the closed form below).
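
To see why this ordering holds (a worked note I am adding, assuming centered predictors so the intercept can be ignored), the ridge criterion has a closed-form minimizer:

\(\hat{\beta}_{\lambda} = (X^{\top}X + \lambda I_p)^{-1}X^{\top}y\)

As \(\lambda \to \infty\) the penalty term dominates and every coefficient is pulled towards zero, so larger \(\lambda\) always gives a smaller \(l_2\)-norm; at \(\lambda = 0\) the formula reduces to ordinary least squares.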

 

# Import Dataset 
library(ISLR)
library(leaps) 
library(glmnet)
data(Hitters) 
Hitters <- na.omit(Hitters)

# model.matrix converts qualitative (factor) variables into dummy variables, so x is fully numeric 
x <- model.matrix(Salary~., Hitters)[, -1] 
y <- Hitters$Salary

# Grid of candidate values for the tuning parameter lambda 
grid <- 10^seq(10, -2, length = 100) 
# Fit the ridge model (alpha = 0) over the supplied lambda grid 
ridge.mod <- glmnet(x, y, alpha = 0, lambda = grid) 

# Rows of the coefficient matrix index the parameters, 
# columns index the lambda values on the grid 
dim(coef(ridge.mod)) 
ridge.mod$lambda 

# Calculate the l_2 norm of the coefficients at a given lambda: 
# as lambda increases, the l_2 norm decreases; 
# as lambda decreases, the l_2 norm increases 
ridge.mod$lambda[60]
coef(ridge.mod)[,60]
sqrt(sum(coef(ridge.mod)[-1, 60]^2))
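# (Added for contrast, not in the original post) the same check at the 
# smallest lambda on the grid, where the l_2 norm should be much larger 
ridge.mod$lambda[100]
sqrt(sum(coef(ridge.mod)[-1, 100]^2))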

# Squared l2-norm of the coefficients at each lambda on the grid 
l2.norm <- apply(ridge.mod$beta, 2, function(t) sum(t^2))
x.axis <- cbind(log(ridge.mod$lambda), l2.norm)
colnames(x.axis) <- c("log.lambda", "L2.norm")
x.axis

# Coefficient paths: against log(lambda) and against the l1-norm of the coefficients
par(mfrow=c(1,2))
plot(ridge.mod, "lambda", label=TRUE)
plot(ridge.mod, "norm", label=TRUE)
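
As a sanity check (my own addition, not in the original post), the glmnet fit can be compared with the textbook closed form. With alpha = 0, glmnet minimizes \(\frac{1}{2n}RSS + \frac{\lambda}{2}\sum_j\beta_j^2\), so on centered data the closed form uses \(n\lambda\) in place of \(\lambda\); standardize = FALSE is assumed so both sides work on the same scale, and agreement is only approximate because glmnet fits iteratively.

# Closed-form ridge coefficients vs. glmnet at a single lambda (sketch)
lam <- 1000
xc <- scale(x, center = TRUE, scale = FALSE)   # center the predictors
yc <- y - mean(y)                              # center the response
n  <- nrow(xc)
beta.closed <- solve(t(xc) %*% xc + n * lam * diag(ncol(xc)), t(xc) %*% yc)
beta.glmnet <- coef(glmnet(x, y, alpha = 0, lambda = lam,
                           standardize = FALSE, thresh = 1e-12))[-1]
comp <- cbind(closed.form = as.numeric(beta.closed),
              glmnet      = as.numeric(beta.glmnet))
rownames(comp) <- colnames(x)
round(comp, 4)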

 

4. Selecting the Tuning Parameter

4.1 Validation Set

# Make dataset 
library(glmnet)
library(ISLR) 
names(Hitters) 
Hitters <- na.omit(Hitters) 

set.seed(123)
x <- model.matrix(Salary~., Hitters)[, -1] 
y <- Hitters$Salary

# Train-test split: one third of the rows for training, the rest for testing
train <- sample(1:nrow(x), nrow(x)/3) 
test <- (-train) 
y.test <- y[test]

# Hyperparameter tuning 
grid <- 10^seq(10, -2, length=100) 
r1 <- glmnet(x[train, ], y[train], alpha=0, lambda=grid)
Err <- numeric(length(r1$lambda))

# Validation-set MSE on the held-out sample, for each lambda on the grid 
for (i in seq_along(r1$lambda)) { 
    r1.pred <- predict(r1, s = r1$lambda[i], newx = x[test, ])
    Err[i] <- mean((r1.pred - y.test)^2) 
} 
wh <- which.min(Err) 
lam.opt <- r1$lambda[wh] 

# Refit on the full data and extract the coefficients at the selected lambda 
r.full <- glmnet(x, y, alpha=0, lambda=grid) 
r.full$beta[, wh] 
predict(r.full, type="coefficients", s=lam.opt)
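
A quick check on the result (my own addition, not from the original post): the validation MSE at the selected lambda can be compared with the naive baseline that always predicts the training-set mean salary.

# Validation MSE of the selected ridge model vs. a constant-mean baseline
c(ridge = Err[wh],
  baseline = mean((mean(y[train]) - y.test)^2))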

 

4.2 One-Standard-Error Rule

set.seed(1234)
cv.r <- cv.glmnet(x, y, alpha=0, nfolds=10)
names(cv.r) 
# cvm : mean cross-validated error (CVE) for each lambda 
# cvsd : estimated standard error of cvm 
# cvup : upper band of the CVE -> cvm + cvsd 
# cvlo : lower band of the CVE -> cvm - cvsd 
# lambda.min : the lambda that minimizes the CVE 
# lambda.1se : the largest lambda whose CVE is within one standard error of the minimum 

cbind(cv.r$cvlo, cv.r$cvm, cv.r$cvup)
# CVE curve with one-standard-error bars 
# left vertical line : log(lambda.min) 
# right vertical line (the more heavily shrunken model) : log(lambda.1se) 
plot(cv.r) 

which(cv.r$lambda==cv.r$lambda.min)
which(cv.r$lambda==cv.r$lambda.1se)
# indices 100 and 54 here: the lambda sequence is decreasing, so lambda.min < lambda.1se 

b.min <- predict(cv.r, type="coefficients", s=cv.r$lambda.min)
b.1se <- predict(cv.r, type="coefficients", s=cv.r$lambda.1se)

# Compare the two coefficient vectors and their 
# squared l2-norms (intercept excluded) 
cbind(b.min, b.1se)
c(sum(b.min[-1]^2), sum(b.1se[-1]^2))
# sum(b.min[-1]^2) > sum(b.1se[-1]^2) : the lambda.1se model is shrunken more 
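
A small verification sketch (my own addition): by construction of the one-standard-error rule, the CVE at lambda.1se must lie within one standard error of the minimum CVE.

# CVE at lambda.min, CVE at lambda.1se, and the one-standard-error bound
i.min <- which(cv.r$lambda == cv.r$lambda.min)
i.1se <- which(cv.r$lambda == cv.r$lambda.1se)
c(cve.min = cv.r$cvm[i.min],
  cve.1se = cv.r$cvm[i.1se],
  one.se.bound = cv.r$cvm[i.min] + cv.r$cvsd[i.min])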