[R] Variable Selection Methods : Ridge
See_the_forest
2022. 10. 5. 16:48
1. Variable Selection Methods
- We cannot use subset selection methods when \(n \ll p\) (high-dimensional data).
- So a shrinkage method is recommended instead.
2. Shrinkage
- Technique that constrains or regularizes the coefficient estimates \(\hat{\beta}\), shrinking \(\hat{\beta}\) towards zero : \(E(\hat{\beta}^{sh}) \neq \beta\) (the shrunken estimator is biased)
- Shrinking \(\hat{\beta}\) can greatly reduce its variance : \(Var(\hat{\beta}^{OLS}) \gg Var(\hat{\beta}^{sh})\) (a small simulation sketch follows this list)
- Examples
- Ridge
- Lasso
- Elastic Net : Ridge + Lasso
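The variance reduction claimed above can be checked with a small simulation. This is a minimal sketch, not part of the original example: the data-generating setup, the seed, and the fixed lambda = 10 are illustrative assumptions.
# Minimal simulation sketch : sampling variance of beta_1 under OLS vs ridge
# (highly correlated predictors inflate the OLS variance; ridge trades bias for lower variance)
library(glmnet)
set.seed(1)
n <- 50; p <- 10; reps <- 200
b.ols <- b.ridge <- numeric(reps)
for (r in 1:reps) {
  x <- matrix(rnorm(n * p), n, p)
  x[, 2] <- x[, 1] + rnorm(n, sd = 0.1)   # x1 and x2 nearly collinear
  y <- 1 + 2 * x[, 1] + rnorm(n)
  b.ols[r] <- coef(lm(y ~ x))[2]                               # OLS estimate of beta_1
  b.ridge[r] <- coef(glmnet(x, y, alpha = 0, lambda = 10))[2]  # ridge estimate at a fixed lambda
}
c(var.ols = var(b.ols), var.ridge = var(b.ridge))     # Var(OLS) >> Var(ridge)
c(mean.ols = mean(b.ols), mean.ridge = mean(b.ridge)) # the ridge mean is biased away from the true 2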
3. Ridge Regression
- Ridge minimizes \(RSS + \lambda\sum_{j=1}^{p}\beta_j^2\),
- where \(\lambda \geq 0\) is a tuning parameter.
- For a grid of \(\lambda\) : \(\lambda_{max} = \lambda_1 > ... > \lambda_m = \lambda_{min}\).
- The \(l_2\)-norm of \(\hat{\beta}\) grows as \(\lambda\) shrinks : \(||\hat{\beta}_{\lambda_1}||^2 < ... < ||\hat{\beta}_{\lambda_m}||^2\)
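- For a fixed \(\lambda\) (with centered predictors and an unpenalized intercept), the criterion has the closed-form minimizer \(\hat{\beta}_{\lambda} = (X^TX + \lambda I)^{-1}X^Ty\), so a larger \(\lambda\) pulls \(\hat{\beta}_{\lambda}\) harder towards zero; glmnet below solves the same problem numerically, with its own internal standardization and scaling of the loss.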
# Import Dataset
library(ISLR)
library(leaps)
library(glmnet)
data(Hitters)
Hitters <- na.omit(Hitters)
# model.matrix converts categorical predictors into dummy (numeric) variables; drop the intercept column
x <- model.matrix(Salary~., Hitters)[, -1]
y <- Hitters$Salary
# Grid of candidate values for the tuning parameter lambda
grid <- 10^seq(10, -2, length = 100)
# Fit the ridge model over the custom lambda grid (alpha = 0 selects ridge)
ridge.mod <- glmnet(x, y, alpha = 0, lambda=grid)
# rows : number of coefficients (intercept + predictors)
# cols : number of lambda values in the grid
dim(coef(ridge.mod))
ridge.mod$lambda
# Calculate the l_2 norm of the coefficients at one lambda
# As lambda increases, the l_2 norm decreases
# As lambda decreases, the l_2 norm increases
ridge.mod$lambda[60]
coef(ridge.mod)[,60]
sqrt(sum(coef(ridge.mod)[-1, 60]^2))
# Calculate the l2-norm of the coefficients for each lambda
l2.norm <- apply(ridge.mod$beta, 2, function(t) sqrt(sum(t^2)))
x.axis <- cbind(log(ridge.mod$lambda), l2.norm)
colnames(x.axis) <- c("log.lambda", "L2.norm")
x.axis
# Plot the coefficient paths against log(lambda) and against the l1-norm of the coefficients
par(mfrow=c(1,2))
plot(ridge.mod, "lambda", label=TRUE)
plot(ridge.mod, "norm", label=TRUE)
4. Selecting the Tuning Parameter
4.1 Validation Set
# Make dataset
library(glmnet)
library(ISLR)
names(Hitters)
Hitters <- na.omit(Hitters)
set.seed(123)
x <- model.matrix(Salary~., Hitters)[, -1]
y <- Hitters$Salary
# Train-Test Split
train <- sample(1:nrow(x), nrow(x)/3)
test <- (-train)
y.test <- y[test]
# Fit ridge on the training set for each candidate lambda
grid <- 10^seq(10, -2, length=100)
r1 <- glmnet(x[train, ], y[train], alpha=0, lambda=grid)
Err <- NULL
# Validation-set MSE on the held-out observations for each lambda in the grid
for (i in 1:length(r1$lambda)) {
  r1.pred <- predict(r1, s = r1$lambda[i], newx = x[test, ])
  Err[i] <- mean((r1.pred - y.test)^2)
}
wh <- which.min(Err)
lam.opt <- r1$lambda[wh]
# Refit on the full data and extract the coefficients at the chosen lambda
r.full <- glmnet(x, y, alpha=0, lambda=grid)
r.full$beta[, wh]
predict(r.full, type="coefficients", s=lam.opt)
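To see where the chosen lambda sits on the grid, the validation errors computed above can be plotted against log(lambda). This is a small added sketch that only reuses the objects r1, Err, and lam.opt defined above.
# Reset the plotting layout, then plot validation-set MSE across the lambda grid;
# the dashed vertical line marks the chosen lambda
par(mfrow = c(1, 1))
plot(log(r1$lambda), Err, type = "b", xlab = "log(lambda)", ylab = "Validation MSE")
abline(v = log(lam.opt), lty = 2)
c(lam.opt, min(Err))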
4.2 One-Standard-Error Rule
set.seed(1234)
cv.r <- cv.glmnet(x, y, alpha=0, nfolds=10)
names(cv.r)
# cvm : mean cross-validation error (CVE) for each lambda
# cvsd : standard error of the CVE -> the "one standard error"
# cvup : upper bound of the CVE -> cvm + cvsd
# cvlo : lower bound of the CVE -> cvm - cvsd
# lambda.min : the lambda that minimizes the CVE
# lambda.1se : the largest lambda whose CVE is within one standard error of the minimum
cbind(cv.r$cvlo, cv.r$cvm, cv.r$cvup)
# CV error curve with one-standard-error bars
# left vertical line : log(lambda.min)
# right vertical line (more shrunken model) : log(lambda.1se)
plot(cv.r)
which(cv.r$lambda==cv.r$lambda.min)
which(cv.r$lambda==cv.r$lambda.1se)
# 100, 54 -> the lambda sequence is decreasing, so lambda.min < lambda.1se
b.min <- predict(cv.r, type="coefficients", s=cv.r$lambda.min)
b.1se <- predict(cv.r, type="coefficients", s=cv.r$lambda.1se)
# Compare the coefficient estimates at lambda.min and lambda.1se,
# then the squared l2-norm of each (intercept excluded)
cbind(b.min, b.1se)
c(sum(b.min[-1]^2), sum(b.1se[-1]^2))
# sum(b.min[-1]^2) > sum(b.1se[-1]^2) : the lambda.1se fit is more heavily shrunken
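As a final sketch (an added comparison, not part of the original analysis), the fits at lambda.min and lambda.1se can also be compared through their predictions: the more regularized lambda.1se fit pulls predictions harder towards the overall mean salary.
# Predictions at lambda.min vs lambda.1se on the full design matrix
pred.min <- drop(predict(cv.r, s = cv.r$lambda.min, newx = x))
pred.1se <- drop(predict(cv.r, s = cv.r$lambda.1se, newx = x))
# The lambda.1se predictions vary less around the mean (heavier shrinkage)
c(sd.min = sd(pred.min), sd.1se = sd(pred.1se), sd.y = sd(y))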