
[R] Tree-Based Methods : Random Forest

1. What is Random Forest?

  • Random forests provide an improvement over bagged trees by way of a small tweak that decorrelates the trees. This reduces the variance when we average the trees.
  • When building these decision trees, each time a split in a tree is considered, a random selection of m predictors is chosen as split candidates from the full set of p predictors.
  • A fresh selection of m predictors is taken at each split, and typically we choose \(m = \sqrt{p}\) (see the quick check below).
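
For the Heart data used in the examples below, p = 13, so this rule gives m = 3; a quick check in R (here `p` is just the predictor count):

# Default m for a classification forest: the rounded-down square root of p
p <- 13
floor(sqrt(p))   # 3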

 

2. Building Random Forest with m predictors

2.1 Building Random Forest using bootstrapped samples

# Packages and data: the tree package supplies the single classification trees.
# Heart is the heart-disease data from ISL (13 predictors, response AHD in
# column 14) and is assumed to be loaded as a data frame with no missing values.
library(tree)

# Train-test split
set.seed(123)
train <- sample(1:nrow(Heart), nrow(Heart)/2)
test <- setdiff(1:nrow(Heart), train)

# Bagging: m = 13, Random forest: m = 1, 4, 6
B <- 500                                    # number of bootstrapped trees
m <- c(1, 4, 6, 13)                         # numbers of candidate predictors
Err <- matrix(0, B, length(m))              # test error after i trees, per m
Vote <- matrix(0, length(test), length(m))  # running "Yes" votes per test case

for (i in 1:B) {
    index <- sample(train, replace=TRUE)    # bootstrap sample of the training rows
    for (k in 1:length(m)) {
        # Note: the m[k] candidate predictors are drawn once per tree here;
        # a true random forest redraws them at every split within each tree.
        s <- c(sort(sample(1:13, m[k])), 14)
        tr <- tree(AHD ~ ., data=Heart[index, s])
        pr <- predict(tr, Heart[test, ], type="class")
        Vote[pr=="Yes", k] <- Vote[pr=="Yes", k] + 1
        PR <- rep("Yes", length(test))
        PR[Vote[, k] < i/2] <- "No"         # majority vote over the first i trees
        Err[i, k] <- mean(Heart$AHD[test] != PR)
    }
}

# Visualize results
labels <- c("m = 1", "m = 4", "m = 6", "m = 13")
matplot(Err, type="l", xlab="Number of Trees", lty=1,
        col=1:4, ylab="Classification Error Rate")
legend("topright", legend=labels, col=1:4, lty=1)

# Summary statistics of the error rates
colnames(Err) <- labels
apply(Err, 2, summary)

 

  • m = 1 : 0.2408725
  • m = 4 : 0.2242282
  • m = 6 : 0.2242550
  • m = 13 : 0.2466040
  • On average, the misclassification rate was lowest at m = 4, with m = 6 essentially tied; both outperform bagging (m = 13) and the single-predictor forest (m = 1).

 

2.2 [Ex] Random Forest using the randomForest package

library(randomForest)

# Bagging (m = 13): all predictors are split candidates at every node
bag.heart <- randomForest(x=Heart[train, -14], y=Heart[train, 14],
                          xtest=Heart[test, -14], ytest=Heart[test, 14],
                          mtry=13, importance=TRUE)
bag.heart
bag.conf <- bag.heart$test$confusion[1:2, 1:2]

# Misclassification error of m = 13
1 - sum(diag(bag.conf))/sum(bag.conf)

# Random forest with m = 1
rf1 <- randomForest(x=Heart[train, -14], y=Heart[train, 14],
                    xtest=Heart[test, -14], ytest=Heart[test, 14],
                    mtry=1, importance=TRUE)
rf1.conf <- rf1$test$confusion[1:2, 1:2]
1 - sum(diag(rf1.conf))/sum(rf1.conf)

# Random forest with m = 4
rf2 <- randomForest(x=Heart[train, -14], y=Heart[train, 14],
                    xtest=Heart[test, -14], ytest=Heart[test, 14],
                    mtry=4, importance=TRUE)
rf2.conf <- rf2$test$confusion[1:2, 1:2]
1 - sum(diag(rf2.conf))/sum(rf2.conf)

# Random forest with m = 6
rf3 <- randomForest(x=Heart[train, -14], y=Heart[train, 14],
                    xtest=Heart[test, -14], ytest=Heart[test, 14],
                    mtry=6, importance=TRUE)
rf3.conf <- rf3$test$confusion[1:2, 1:2]
1 - sum(diag(rf3.conf))/sum(rf3.conf)

 

  • Misclassification error of m = 13 : 0.2684564
  • Misclassification error of m = 1 : 0.2147651
  • Misclassification error of m = 4 : 0.2416107
  • Misclassification error of m = 6 : 0.2348993

 

randomForest(x, y=NULL, xtest=NULL, ytest=NULL, ntree=500, mtry, replace=TRUE)

  • x, y : the predictors and response can be supplied separately, as here; the formula interface (e.g., AHD ~ .) is also widely used (see the sketch below).
  • xtest, ytest : if a test set is supplied, the forest is evaluated on it as it is grown; the results are stored in $test.
  • ntree : the number of trees to grow (default 500).
  • mtry : the number of predictor candidates sampled at each split; the default is √p (rounded down) for classification and p/3 for regression.
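
For reference, a minimal sketch of the formula interface on the same split (assuming Heart, train, and test from above); when no xtest is supplied, the printed error is the out-of-bag (OOB) estimate:

# Formula interface; the OOB error is reported automatically
rf.f <- randomForest(AHD ~ ., data=Heart[train, ], mtry=4, ntree=500)
rf.f                                  # OOB confusion matrix and error rate
pred <- predict(rf.f, Heart[test, ])  # predictions on the held-out rows
mean(pred != Heart$AHD[test])         # test misclassification rate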

 

2.3 [Ex] Random forest using an mtry grid search

# Grid setup: 50 random splits × 13 values of m
set.seed(1111)
N <- 50
CER <- matrix(0, N, 13)

# Training models: 50 replications, redrawing the train/test split each time
for (i in 1:N) {
    train <- sample(1:nrow(Heart), floor(nrow(Heart)*2/3))
    test <- setdiff(1:nrow(Heart), train)
    for (k in 1:13) {
        # Fit a random forest with m = k candidate predictors
        rf <- randomForest(x=Heart[train, -14], y=Heart[train, 14],
                           xtest=Heart[test, -14], ytest=Heart[test, 14], mtry=k)
        rfc <- rf$test$confusion[1:2, 1:2]
        # Misclassification rate on the test set
        CER[i, k] <- 1 - sum(diag(rfc))/sum(rfc)
    }
}
apply(CER, 2, mean)

# Visualize results 
boxplot(CER, boxwex=0.5, main="Random Forest with m = 1 to 13",
        ylab="Classification Error Rates", col="orange", ylim=c(0, 0.4))
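
To read off the best m directly, take the column of CER with the smallest average error over the 50 replications (a small addition to the grid-search code above):

# Value of m with the lowest mean classification error
(best.m <- which.min(apply(CER, 2, mean)))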


2.4 [Ex] Variable Importance Reports

# Scatter plot of feature importance 
varImpPlot(rf1)
varImpPlot(rf2)
varImpPlot(rf3)

# Horizontal bar plot of feature importance 
(imp1 <- importance(rf1))
(imp2 <- importance(rf2))
(imp3 <- importance(rf3))

par(mfrow=c(1,3))
# Based on MeanDecreaseAccuracy
barplot(sort(imp1[,3]), main="RF (m=1)", horiz=TRUE, col=2)
barplot(sort(imp2[,3]), main="RF (m=4)", horiz=TRUE, col=2)
barplot(sort(imp3[,3]), main="RF (m=6)", horiz=TRUE, col=2)

# Based on MeanDecreaseGini
barplot(sort(imp1[,4]), main="RF (m=1)", horiz=TRUE, col=2)
barplot(sort(imp2[,4]), main="RF (m=4)", horiz=TRUE, col=2)
barplot(sort(imp3[,4]), main="RF (m=6)", horiz=TRUE, col=2)

  • MeanDecreaseAccuracy : permutation importance; the average drop in out-of-bag prediction accuracy when the variable's values are randomly permuted (sketched below).
  • MeanDecreaseGini : the total decrease in node impurity (Gini index) contributed by splits on the variable, summed over all trees.
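
To make the permutation idea behind MeanDecreaseAccuracy concrete, here is a minimal hand-rolled sketch. The package permutes each variable within each tree's out-of-bag samples; this approximation permutes within the test set instead, and refits a forest because models fitted with xtest do not keep their trees for later prediction by default:

# Fresh fit that keeps the forest, so predict() works afterwards
set.seed(1)
rf.keep <- randomForest(AHD ~ ., data=Heart[train, ], mtry=4)
base <- mean(predict(rf.keep, Heart[test, ]) == Heart$AHD[test])
drop.acc <- sapply(names(Heart)[-14], function(v) {
    shuffled <- Heart[test, ]
    shuffled[[v]] <- sample(shuffled[[v]])   # break the link between v and AHD
    base - mean(predict(rf.keep, shuffled) == Heart$AHD[test])
})
sort(drop.acc, decreasing=TRUE)              # larger drop = more important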


3. [Ex] Assignment

The Caravan insurance dataset consists of 85 quantitative predictors and one two-level factor, the response variable Purchase. Randomly separate the training, validation, and test sets as follows:

set.seed(1111)
data(Caravan)
train <- sample(1:nrow(Caravan), nrow(Caravan)/2)
others <- setdiff(1:nrow(Caravan), train)
validation <- sample(others, length(others)/2)
test <- setdiff(others, validation) 

Fit random forest models on the training set with the number of candidate predictors m ranging from 1 to 9. First, find the optimal m using the validation set, i.e., the value that minimizes the classification error rate (CER). Then compute the CER on the test set using a random forest with that optimal m.

 

library(randomForest)
library(ISLR)

set.seed(1111)
data(Caravan)
train <- sample(1:nrow(Caravan), nrow(Caravan)/2)
others <- setdiff(1:nrow(Caravan), train)
validation <- sample(others, length(others)/2)
test <- setdiff(others, validation) 

# Validation-set grid search over m = 1, ..., 9
CER <- numeric(9)
for (i in 1:9) {
  rf <- randomForest(x=Caravan[train, -86], y=Caravan[train, 86],
                     xtest=Caravan[validation, -86], ytest=Caravan[validation, 86], mtry=i)
  rfc <- rf$test$confusion[1:2, 1:2]
  CER[i] <- 1 - sum(diag(rfc))/sum(rfc)
}
wm <- which.min(CER)   # optimal m on the validation set

# Misclassification rate on the test set
rf.test <- randomForest(x=Caravan[train, -86], y=Caravan[train, 86], 
                   xtest=Caravan[test, -86], ytest=Caravan[test, 86], mtry=wm)
rfc.test <- rf.test$test$confusion[1:2,1:2]
CER.test <- 1-sum(diag(rfc.test))/sum(rfc.test)
CER.test

 

  • wm : 3
  • CER.test : 0.06593407 (see the baseline check below)
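
One caveat when reading this number: Purchase in the Caravan data is heavily imbalanced (roughly 6% "Yes"), so a test CER near 0.066 is close to what the trivial always-"No" classifier achieves. A quick baseline check:

# Baseline: error rate of always predicting "No"
mean(Caravan$Purchase[test] == "Yes")
# The confusion matrix shows how many actual "Yes" cases the forest catches
rfc.test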