1. How to do simulation about prediction performance?
- If \(X\) has \(n \times p (200 \times 2000)\) size, we need to find which variables are selected and which variable affects target most(coefficients).
- So, we need to consider variable models predicting well.
- M1 : \(\hat{\beta}^{lasso} + \lambda_{min}\)
- M2 : \(\hat{\beta}^{lasso} + \lambda_{1se}\)
- M3 : \(\hat{\beta}^{lasso} + \lambda_{min} + \hat{\beta}^{ols}\)
- M4 : \(\hat{\beta}^{lasso} + \lambda_{1se} + \hat{\beta}^{ols}\)
- M5 : \(\hat{\beta}^{ols}[1:20]\) - After training model and with their regression coefficients, choose the best model with the lowest \(MSE_{test}\).
2. Quantitative Outcome
2.1 Generate X and Y
# install.packages('glmnet')
library(glmnet)
# Generate X matrix and Y vector
sim.fun <- function(n, p, beta, family=c("gaussian", "binomial")) {
family <- match.arg(family)
if (family=="gaussian") {
x <- matrix(rnorm(n*p), n, p)
y <- x %*% beta + rnorm(n)
}
else {
x <- matrix(rnorm(n*p), n, p)
xb <- x %*% beta
z <- exp(xb) / (1+exp(xb))
u <- runif(n)
y <- rep(0, n)
y[z > u] <- 1
}
list(x=x, y=y)
}
# Configure dimension of n, p and beta(모수)
set.seed(1234)
n <- 200
p <- 2000
beta <- rep(0, p)
beta[1:20] <- runif(20, -1, 1)
# Generate x and y
sim <- sim.fun(n, p, beta)
x <- sim$x; y <- sim$y
2.2 Regression Coefficients based on Model1 and Model2
# Fit the lasso with two different lambda values
# b hat is inferred regression coefficients : M1, M2
g <- cv.glmnet(x, y, alpha=1, nfolds = 10)
bhat1 <- coef(g, s="lambda.min")
bhat2 <- coef(g, s="lambda.1se")
# Check coefficients with value is not 0.
wh1 <- which(as.matrix(bhat1)!=0)
w1 <- wh1[-1]-1
wh2 <- which(as.matrix(bhat2)!=0)
w2 <- wh2[-1]-1
2.3 Regression Coefficients based on Model3, Model4, and Model5
# Compute ordinary least square estimates (unbiased estimates)
bhat3 <- bhat4 <- bhat5 <- rep(0, p+1)
bhat3[wh1] <- lm(y ~ x[, w1])$coef
bhat4[wh2] <- lm(y ~ x[, w2])$coef
bhat5[1:21] <- lm(y ~ x[, 1:20])$coef
2.4 Calculate Test Error based on MSE among Models
set.seed(56789)
# Generate test sets
test <- sim.fun(n, p, beta)
xt <- cbind(1, test$x)
yt <- test$y
# Test set prediction errors of 6 coefficient estimates
mean((yt - xt %*% bhat1)^2) # lasso_lambda.min (M1)
mean((yt - xt %*% bhat2)^2) # lasso_lambda.1se (M2)
mean((yt - xt %*% bhat3)^2) # least square_lambda.min (M3)
mean((yt - xt %*% bhat4)^2) # least square_lambda.1se (M4)
mean((yt - xt %*% bhat5)^2) # least square_nonzero beta (M5)
mean((yt - xt %*% c(0, beta))^2) # true beta (M6)
# Calculate TE repeatedly
set.seed(1)
# Generate new test sets 100 times
K <- 100
pred <- matrix(NA, K, 6)
for (i in 1:K) {
test <- sim.fun(n, p, beta)
xt <- cbind(1, test$x)
yt <- test$y
pred[i, 1] <- mean((yt - xt %*% bhat1)^2)
pred[i, 2] <- mean((yt - xt %*% bhat2)^2)
pred[i, 3] <- mean((yt - xt %*% bhat3)^2)
pred[i, 4] <- mean((yt - xt %*% bhat4)^2)
pred[i, 5] <- mean((yt - xt %*% bhat5)^2)
pred[i, 6] <- mean((yt - xt %*% c(0, beta))^2)
}
apply(pred, 2, mean)
boxplot(pred, col=c(2,2,4,4,3,"orange"), boxwex=0.6,
names=c("M1", "M2", "M3", "M4", "M5", "M6"),
ylab="Prediction Error")
- We can see that lasso model didn't select accurate variables rather than ols model.
3. Binary Outcome
3.1 Generate X and Y
set.seed(111)
n <- 200
p <- 2000
beta <- rep(0, p)
beta[1:20] <- runif(20, -1, 1)
sim <- sim.fun(n, p, beta, family="binomial")
x <- sim$x; y <- sim$y
3.2 Classification Error function
## Classification Error
class.fun <- function(test.x, test.y, beta, k=0.5) {
# xb : calculate x % coefficients
xb <- test.x %*% beta
# exb : inferred probability of xb
exb <- exp(xb) / (1 + exp(xb))
y <- rep(0, length(test.y))
y[as.logical(exb > k)] <- 1
min(mean(test.y!=y), mean(test.y!=(1-y)))
}
3.3 Training Models 1 ~ 5
g <- cv.glmnet(x, y, alpha=1, nfolds = 10, family="binomial")
bhat1 <- coef(g, s="lambda.min")
bhat2 <- coef(g, s="lambda.1se")
wh1 <- which(as.matrix(bhat1)!=0)
wh2 <- which(as.matrix(bhat2)!=0)
bhat3 <- bhat4 <- bhat5 <- rep(0, p+1)
w1 <- wh1[-1]-1; w2 <- wh2[-1]-1
bhat3[wh1] <- glm(y ~ x[, w1], family="binomial")$coef
bhat4[wh2] <- glm(y ~ x[, w2], family="binomial")$coef
bhat5[1:21] <- glm(y ~ x[, 1:20], family="binomial")$coef
3.4 Calculate Test Erro based on Missclassification Rate
# Generate test sets
set.seed(56789)
test <- sim.fun(n, p, beta, family="binomial")
xt <- cbind(1, test$x); yt <- test$y
# Prediction error comparison
class.fun(xt, yt, bhat1) # lasso_lambda.min (M1)
class.fun(xt, yt, bhat2) # lasso_lambda.1se (M2)
class.fun(xt, yt, bhat3) # least square_lambda.min (M3)
class.fun(xt, yt, bhat4) # least square_lambda.1se (M4)
class.fun(xt, yt, bhat5) # least square_nonzero beta (M5)
class.fun(xt, yt, c(0, beta)) # true beta (M6)
# Calculate TE repeatedly
set.seed(35791)
# Generate new test sets 100 times
K <- 100
pred <- matrix(NA, K, 6)
for (i in 1:K) {
test <- sim.fun(n, p, beta, family="binomial")
xt <- cbind(1, test$x)
yt <- test$y
pred[i, 1] <- class.fun(xt, yt, bhat1)
pred[i, 2] <- class.fun(xt, yt, bhat2)
pred[i, 3] <- class.fun(xt, yt, bhat3)
pred[i, 4] <- class.fun(xt, yt, bhat4)
pred[i, 5] <- class.fun(xt, yt, bhat5)
pred[i, 6] <- class.fun(xt, yt, c(0, beta))
}
apply(pred, 2, mean)
boxplot(pred, col=c(2,2,4,4,3,"orange"), boxwex=0.6,
names=c("M1", "M2", "M3", "M4", "M5", "M6"),
ylab="Prediction Error")
- With accurate variable selection(using ols model), we can get better test error.
'
'Data Science > R' 카테고리의 다른 글
[R] Elastic-Net Regression (0) | 2022.10.13 |
---|---|
[R] Regularization Methods (0) | 2022.10.11 |
[R] Useful Functions for Regression Problems (0) | 2022.10.07 |
[R] Regularization Methods : Binary (0) | 2022.10.05 |
[R] Variable Selection Methods : Lasso (0) | 2022.10.05 |