
[R] Simulation Study : Prediction Performance

1. How do we simulate prediction performance?

  • If \(X\) is an \(n \times p\) matrix (here \(200 \times 2000\)), we want to know which variables are selected and how strongly each selected variable affects the target (its coefficients).
  • So we compare several candidate models and see which one predicts well:
        - M1 : \(\hat{\beta}^{lasso}\) with \(\lambda_{min}\)
        - M2 : \(\hat{\beta}^{lasso}\) with \(\lambda_{1se}\)
        - M3 : \(\hat{\beta}^{ols}\) refit on the variables selected at \(\lambda_{min}\)
        - M4 : \(\hat{\beta}^{ols}\) refit on the variables selected at \(\lambda_{1se}\)
        - M5 : \(\hat{\beta}^{ols}\) fit on the 20 truly nonzero variables, \(\hat{\beta}^{ols}[1:20]\)
  • After training each model, we use its estimated coefficients to predict on a test set and choose the model with the lowest \(MSE_{test}\), defined below.
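For reference, the test error used throughout Section 2 is the mean squared error on a freshly simulated test set:

\[ MSE_{test} = \frac{1}{n_{test}} \sum_{i=1}^{n_{test}} \left( y_i - x_i^{\top} \hat{\beta} \right)^2 \]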

 

2. Quantitative Outcome

2.1 Generate X and Y

# install.packages('glmnet')   # uncomment if glmnet is not installed
library(glmnet)

# Generate an X matrix and Y vector under a linear or a logistic model
sim.fun <- function(n, p, beta, family=c("gaussian", "binomial")) {
  family <- match.arg(family)
  if (family=="gaussian") {
    x <- matrix(rnorm(n*p), n, p)
    y <- x %*% beta + rnorm(n)        # linear model with N(0, 1) noise
  }
  else {
    x <- matrix(rnorm(n*p), n, p)
    xb <- x %*% beta                  # linear predictor
    z <- exp(xb) / (1+exp(xb))        # inverse-logit success probability
    u <- runif(n)
    y <- rep(0, n)
    y[z > u] <- 1                     # Bernoulli draw with probability z
  }
  list(x=x, y=y)
}
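A note on the binomial branch: comparing the success probability z to a Uniform(0,1) draw is a standard Bernoulli sample, since \(P(z > U) = z\) for \(U \sim Uniform(0,1)\). An equivalent sketch (not the code used above, with toy probabilities) would be:

# Equivalent Bernoulli sampling for the binomial branch (a sketch)
pz <- c(0.1, 0.5, 0.9)                # example success probabilities
ty <- rbinom(length(pz), 1, prob=pz)  # same distribution as y[z > u] <- 1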

# Set the dimensions n, p and the true beta (parameters)
set.seed(1234)
n <- 200
p <- 2000
beta <- rep(0, p)
beta[1:20] <- runif(20, -1, 1)

# Generate x and y 
sim <- sim.fun(n, p, beta)
x <- sim$x; y <- sim$y

 

2.2 Regression Coefficients for Models 1 and 2

# Fit the lasso with 10-fold CV and extract coefficients at two lambda values
# bhat1 and bhat2 are the estimated regression coefficients for M1 and M2
g <- cv.glmnet(x, y, alpha=1, nfolds = 10)
bhat1 <- coef(g, s="lambda.min")
bhat2 <- coef(g, s="lambda.1se")
# Find the nonzero coefficients; coef() puts the intercept first, so drop
# it and subtract 1 to get the corresponding column indices of x
wh1 <- which(as.matrix(bhat1)!=0)
w1 <- wh1[-1]-1
wh2 <- which(as.matrix(bhat2)!=0)
w2 <- wh2[-1]-1
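A small hypothetical illustration of that index shift: if the nonzero entries of bhat1 sit at positions 1, 3, and 8, they correspond to the intercept plus columns 2 and 7 of x.

# Hypothetical example of the index shift
wh <- c(1, 3, 8)    # positions in the (p+1)-length coefficient vector
w <- wh[-1] - 1     # c(2, 7) : columns of x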

 

2.3 Regression Coefficients for Models 3, 4, and 5

# Refit ordinary least squares on the selected variables
# (unshrunken estimates, unlike the lasso's penalized coefficients)
bhat3 <- bhat4 <- bhat5 <- rep(0, p+1)
bhat3[wh1] <- lm(y ~ x[, w1])$coef      # OLS on the lambda.min selection (M3)
bhat4[wh2] <- lm(y ~ x[, w2])$coef      # OLS on the lambda.1se selection (M4)
bhat5[1:21] <- lm(y ~ x[, 1:20])$coef   # OLS on the true signal set (M5)
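Since the true signal sits in columns 1:20, we can also check how accurately each lambda recovered it. This diagnostic is a suggested addition, not part of the original workflow:

# How many truly nonzero variables (columns 1:20) did each lambda select?
length(intersect(w1, 1:20)); length(w1)   # recovered vs. total at lambda.min
length(intersect(w2, 1:20)); length(w2)   # recovered vs. total at lambda.1se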

 

2.4 Calculate Test Error (MSE) across Models

set.seed(56789)
# Generate test sets
test <- sim.fun(n, p, beta)
xt <- cbind(1, test$x)
yt <- test$y

# Test set prediction errors of 6 coefficient estimates
mean((yt - xt %*% bhat1)^2) # lasso_lambda.min (M1)
mean((yt - xt %*% bhat2)^2) # lasso_lambda.1se (M2)
mean((yt - xt %*% bhat3)^2) # least square_lambda.min (M3)
mean((yt - xt %*% bhat4)^2) # least square_lambda.1se (M4)
mean((yt - xt %*% bhat5)^2) # least square_nonzero beta (M5)
mean((yt - xt %*% c(0, beta))^2) # true beta (M6)

# Calculate TE repeatedly
set.seed(1)
# Generate new test sets 100 times
K <- 100
pred <- matrix(NA, K, 6)
for (i in 1:K) {
    test <- sim.fun(n, p, beta)
    xt <- cbind(1, test$x)
    yt <- test$y

    pred[i, 1] <- mean((yt - xt %*% bhat1)^2)
    pred[i, 2] <- mean((yt - xt %*% bhat2)^2)
    pred[i, 3] <- mean((yt - xt %*% bhat3)^2)
    pred[i, 4] <- mean((yt - xt %*% bhat4)^2)
    pred[i, 5] <- mean((yt - xt %*% bhat5)^2)
    pred[i, 6] <- mean((yt - xt %*% c(0, beta))^2)
}

apply(pred, 2, mean)
boxplot(pred, col=c(2,2,4,4,3,"orange"), boxwex=0.6,
        names=c("M1", "M2", "M3", "M4", "M5", "M6"),
        ylab="Prediction Error")

 

 

  • We can see that the lasso did not select the truly relevant variables as accurately as the oracle OLS model (M5), which is why the lasso-based fits show higher prediction error.

 

3. Binary Outcome

3.1 Generate X and Y

set.seed(111)
n <- 200
p <- 2000
beta <- rep(0, p)
beta[1:20] <- runif(20, -1, 1)
sim <- sim.fun(n, p, beta, family="binomial")
x <- sim$x; y <- sim$y

 

3.2 Classification Error function

## Classification Error
class.fun <- function(test.x, test.y, beta, k=0.5) {
    # xb : linear predictor (test.x %*% beta)
    xb <- test.x %*% beta
    # exb : predicted probability from the inverse logit
    exb <- exp(xb) / (1 + exp(xb))
    y <- rep(0, length(test.y))
    y[as.logical(exb > k)] <- 1   # classify as 1 when the probability exceeds k
    # return the smaller of the raw and label-flipped misclassification rates
    min(mean(test.y!=y), mean(test.y!=(1-y)))
}
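The final min(...) guards against label flipping: if the fitted coefficients separate the classes with reversed signs, the raw error rate can be close to 1 even though the classifier is informative. A minimal sketch with made-up labels:

# Hypothetical labels: perfect separation, but with the classes reversed
ty <- c(1, 1, 0, 0)   # true labels
py <- c(0, 0, 1, 1)   # predicted labels (flipped)
mean(ty != py)                              # 1.0 : raw error looks terrible
min(mean(ty != py), mean(ty != (1 - py)))   # 0.0 : flip-corrected error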

 

3.3 Training Models 1 ~ 5

# Fit the lasso with 10-fold CV under the logistic model
g <- cv.glmnet(x, y, alpha=1, nfolds = 10, family="binomial")
bhat1 <- coef(g, s="lambda.min")
bhat2 <- coef(g, s="lambda.1se")
wh1 <- which(as.matrix(bhat1)!=0)
wh2 <- which(as.matrix(bhat2)!=0)
bhat3 <- bhat4 <- bhat5 <- rep(0, p+1)
w1 <- wh1[-1]-1; w2 <- wh2[-1]-1
# Refit unpenalized logistic regressions on the selected variables
bhat3[wh1] <- glm(y ~ x[, w1], family="binomial")$coef      # lambda.min selection (M3)
bhat4[wh2] <- glm(y ~ x[, w2], family="binomial")$coef      # lambda.1se selection (M4)
bhat5[1:21] <- glm(y ~ x[, 1:20], family="binomial")$coef   # true signal set (M5)

 

3.4 Calculate Test Error based on Misclassification Rate

# Generate test sets
set.seed(56789)
test <- sim.fun(n, p, beta, family="binomial")
xt <- cbind(1, test$x); yt <- test$y

# Prediction error comparison
class.fun(xt, yt, bhat1) # lasso_lambda.min (M1)
class.fun(xt, yt, bhat2) # lasso_lambda.1se (M2)
class.fun(xt, yt, bhat3) # logistic refit_lambda.min (M3)
class.fun(xt, yt, bhat4) # logistic refit_lambda.1se (M4)
class.fun(xt, yt, bhat5) # logistic refit_nonzero beta (M5)
class.fun(xt, yt, c(0, beta)) # true beta (M6)

# Calculate TE repeatedly
set.seed(35791)

# Generate new test sets 100 times
K <- 100
pred <- matrix(NA, K, 6)
for (i in 1:K) {
    test <- sim.fun(n, p, beta, family="binomial")
    xt <- cbind(1, test$x)
    yt <- test$y

    pred[i, 1] <- class.fun(xt, yt, bhat1)
    pred[i, 2] <- class.fun(xt, yt, bhat2)
    pred[i, 3] <- class.fun(xt, yt, bhat3)
    pred[i, 4] <- class.fun(xt, yt, bhat4)
    pred[i, 5] <- class.fun(xt, yt, bhat5)
    pred[i, 6] <- class.fun(xt, yt, c(0, beta))
}

apply(pred, 2, mean)
boxplot(pred, col=c(2,2,4,4,3,"orange"), boxwex=0.6,
        names=c("M1", "M2", "M3", "M4", "M5", "M6"),
        ylab="Misclassification Rate")

 

 

  • With accurate variable selection (the oracle model M5, fit directly on the truly nonzero variables), we obtain a noticeably lower test error.


 
