1. What is Random Forest?
- Random forests provide an improvement over bagged trees by way of a small tweak that decorrelates the trees. This reduces the variance when we average the trees.
- When building these decision trees, each time a split in a tree is considered, a random selection of m predictors is chosen as split candidates from the full set of p predictors.
- A fresh selection of m predictors is taken at each split, and typically we choose \(m = \sqrt{p}\); see the quick check below.
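As a quick arithmetic check: the Heart data used below has p = 13 predictors, so the default rule gives m = 3 (the experiments below also try m = 1, 4, 6, and the bagging case m = 13):

p <- 13          # number of predictors in the Heart data
floor(sqrt(p))   # default m for classification: 3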
2. Building Random Forest with m predictors
2.1 Building Random Forest using bootstrapped sample
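The code below assumes the Heart data from the earlier posts in this series is already in the workspace. A minimal loading sketch (the file name and cleaning steps are assumptions):

library(tree)   # for tree()
# Heart.csv as distributed with "An Introduction to Statistical Learning"
Heart <- read.csv("Heart.csv", stringsAsFactors=TRUE)
Heart <- na.omit(Heart[, -1])   # drop the row-index column and incomplete rows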
# Train-test split
set.seed(123)
train <- sample(1:nrow(Heart), nrow(Heart)/2)
test <- setdiff(1:nrow(Heart), train)
# Bagging: m=13, Random forest: m=1, 4, 6
B <- 500                                    # number of bootstrapped trees
n <- nrow(Heart)
m <- c(1, 4, 6, 13)
Err <- matrix(0, B, length(m))              # test error after i trees, per m
Vote <- matrix(0, length(test), length(m))  # running "Yes" votes per test case
for (i in 1:B) {
  # Bootstrap sample of the training indices
  index <- sample(train, replace=TRUE)
  for (k in 1:length(m)) {
    # Randomly select m[k] of the 13 predictors, plus the response (column 14).
    # Note: predictors are sampled once per tree here, as an approximation;
    # a true random forest re-samples the candidates at every split.
    s <- c(sort(sample(1:13, m[k])), 14)
    tr <- tree(AHD ~ ., data=Heart[index, s])
    pr <- predict(tr, Heart[test, ], type="class")
    # Accumulate votes and classify by majority over the i trees so far
    Vote[pr=="Yes", k] <- Vote[pr=="Yes", k] + 1
    PR <- rep("Yes", length(test))
    PR[Vote[,k] < i/2] <- "No"
    Err[i, k] <- mean(Heart$AHD[test] != PR)
  }
}
# Visualize results
labels <- c("m = 1", "m = 4", "m = 6", "m = 13")
matplot(Err, type="l", xlab="Number of Trees", lty=1,
        col=1:4, ylab="Classification Error Rate")
legend("topright", legend=labels, col=1:4, lty=1)
# Statistical reports
colnames(Err) <- labels
apply(Err, 2, summary)
- m = 1 : 0.2408725
- m = 4 : 0.2242282
- m = 6 : 0.2242550
- m = 13 : 0.2466040
- The misclassification rate was lowest for m = 4, with m = 6 essentially tied; both clearly beat bagging (m = 13).
2.2 [Ex] Random Forest using the randomForest package
library(randomForest)
# Bagging (m=13): all 13 predictors are candidates at every split
bag.heart <- randomForest(x=Heart[train, -14], y=Heart[train, 14],
                          xtest=Heart[test, -14], ytest=Heart[test, 14],
                          mtry=13, importance=TRUE)
bag.heart
bag.conf <- bag.heart$test$confusion[1:2, 1:2]
# Misclassification error for m = 13
1 - sum(diag(bag.conf))/sum(bag.conf)
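The same confusion-matrix arithmetic repeats for every model in this post; purely as a convenience (not part of the original code), it can be wrapped in a one-line helper:

# Hypothetical helper: classification error rate from a 2x2 confusion matrix
cer <- function(conf) 1 - sum(diag(conf)) / sum(conf)
cer(bag.conf)   # identical to the expression above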
# Random forest with m=1
rf1 <- randomForest(x=Heart[train, -14], y=Heart[train, 14],
                    xtest=Heart[test, -14], ytest=Heart[test, 14],
                    mtry=1, importance=TRUE)
rf1.conf <- rf1$test$confusion[1:2, 1:2]
1 - sum(diag(rf1.conf))/sum(rf1.conf)
# Random forest with m=4
rf2 <- randomForest(x=Heart[train, -14], y=Heart[train, 14],
                    xtest=Heart[test, -14], ytest=Heart[test, 14],
                    mtry=4, importance=TRUE)
rf2.conf <- rf2$test$confusion[1:2, 1:2]
1 - sum(diag(rf2.conf))/sum(rf2.conf)
# Random forest with m=6
rf3 <- randomForest(x=Heart[train, -14], y=Heart[train, 14],
                    xtest=Heart[test, -14], ytest=Heart[test, 14],
                    mtry=6, importance=TRUE)
rf3.conf <- rf3$test$confusion[1:2, 1:2]
1 - sum(diag(rf3.conf))/sum(rf3.conf)
- Misclassification error of m = 13 : 0.2684564
- Misclassification error of m = 1 : 0.2147651
- Misclassification error of m = 4 : 0.2416107
- Misclassification error of m = 6 : 0.2348993
randomForest(x, y=NULL, xtest=NULL, ytest=NULL, ntree=500, mtry, replace=TRUE)
- x, y : the predictors and response, supplied separately; the formula interface is also commonly used
- xtest, ytest : if a test set is supplied, predictions for it (including a confusion matrix, stored in $test) are computed at the same time
- ntree : the number of trees to grow (default 500)
- mtry : the number of predictors randomly sampled as split candidates at each node (default ⌊√p⌋ for classification, ⌊p/3⌋ for regression)
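For reference, a sketch of the formula interface mentioned above, refitting the m = 4 model (results will differ slightly from rf2 because of randomness):

set.seed(123)
rf.f <- randomForest(AHD ~ ., data=Heart, subset=train, mtry=4, importance=TRUE)
pred <- predict(rf.f, Heart[test, ])
mean(pred != Heart$AHD[test])   # test-set misclassification rate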
2.3 [Ex] Random Forest using an mtry grid search
# Set grid
set.seed(1111)
N <- 50
CER <- matrix(0, N, 13)   # rows: replications, columns: mtry = 1, ..., 13
# Training models: 50 replications
for (i in 1:N) {
  train <- sample(1:nrow(Heart), floor(nrow(Heart)*2/3))
  test <- setdiff(1:nrow(Heart), train)
  for (k in 1:13) {
    # Apply random forest with mtry = k
    rf <- randomForest(x=Heart[train, -14], y=Heart[train, 14],
                       xtest=Heart[test, -14], ytest=Heart[test, 14], mtry=k)
    rfc <- rf$test$confusion[1:2, 1:2]
    # Calculate misclassification rate
    CER[i, k] <- 1 - sum(diag(rfc))/sum(rfc)
  }
}
apply(CER, 2, mean)
# Visualize results
boxplot(CER, boxwex=0.5, main="Random Forest with m = 1 to 13",
        ylab="Classification Error Rates", col="orange", ylim=c(0, 0.4))
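To read the winner off the grid (an extra step beyond the original code), take the column with the smallest average CER:

best.m <- which.min(apply(CER, 2, mean))   # mtry with the lowest mean CER
best.m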
2.4 [Ex] Variable Importance Reports
# Dot charts of feature importance
varImpPlot(rf1)
varImpPlot(rf2)
varImpPlot(rf3)
# Importance matrices (class-specific columns, MeanDecreaseAccuracy, MeanDecreaseGini)
(imp1 <- importance(rf1))
(imp2 <- importance(rf2))
(imp3 <- importance(rf3))
par(mfrow=c(1, 3))
# Horizontal bar plots based on MeanDecreaseAccuracy (column 3)
barplot(sort(imp1[,3]), main="RF (m=1)", horiz=TRUE, col=2)
barplot(sort(imp2[,3]), main="RF (m=4)", horiz=TRUE, col=2)
barplot(sort(imp3[,3]), main="RF (m=6)", horiz=TRUE, col=2)
# Horizontal bar plots based on MeanDecreaseGini (column 4)
barplot(sort(imp1[,4]), main="RF (m=1)", horiz=TRUE, col=2)
barplot(sort(imp2[,4]), main="RF (m=4)", horiz=TRUE, col=2)
barplot(sort(imp3[,4]), main="RF (m=6)", horiz=TRUE, col=2)
- MeanDecreaseAccuracy : the average drop in out-of-bag prediction accuracy when the variable's values are randomly permuted (computed when importance=TRUE)
- MeanDecreaseGini : the total decrease in node impurity (Gini index) from splits on the variable, averaged over all trees
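For example, the most influential predictors can be read directly off the importance matrix; here for rf2 (column names as returned by importance(); the values depend on the random seed):

# Five most important predictors by permutation importance
head(sort(imp2[, "MeanDecreaseAccuracy"], decreasing=TRUE), 5)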
3. [Ex] Assignment
The Caravan insurance dataset consists of 85 quantitative variables and 1 factor with 2 levels, the response variable Purchase. Randomly separate the training, validation, and test sets such that
set.seed(1111)
data(Caravan)
train <- sample(1:nrow(Caravan), nrow(Caravan)/2)
others <- setdiff(1:nrow(Caravan), train)
validation <- sample(others, length(others)/2)
test <- setdiff(others, validation)
Fit the random forest model using the training set, with the number of predictors ranging from 1 to 9. First, find the optimal number of predictors, i.e., the one that minimizes the classification error rate (CER) on the validation set. Then compute the CER on the test set using the random forest with that optimal number of predictors.
library(randomForest)
library(ISLR)
set.seed(1111)
data(Caravan)
train <- sample(1:nrow(Caravan), nrow(Caravan)/2)
others <- setdiff(1:nrow(Caravan), train)
validation <- sample(others, length(others)/2)
test <- setdiff(others, validation)
CER <- NULL
# Validation CER for mtry = 1, ..., 9
for (i in 1:9) {
  rf <- randomForest(x=Caravan[train, -86], y=Caravan[train, 86],
                     xtest=Caravan[validation, -86], ytest=Caravan[validation, 86],
                     mtry=i)
  rfc <- rf$test$confusion[1:2, 1:2]
  CER[i] <- 1 - sum(diag(rfc))/sum(rfc)
}
# Optimal number of predictors on the validation set
wm <- which.min(CER)
# Calculate misclassification rate on the test set
rf.test <- randomForest(x=Caravan[train, -86], y=Caravan[train, 86],
                        xtest=Caravan[test, -86], ytest=Caravan[test, 86], mtry=wm)
rfc.test <- rf.test$test$confusion[1:2, 1:2]
CER.test <- 1 - sum(diag(rfc.test))/sum(rfc.test)
CER.test
- wm : 3
- CER.test : 0.06593407
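As a quick sanity check on the chosen wm (not part of the original solution), the validation curve can be plotted:

plot(1:9, CER, type="b", xlab="Number of predictors (mtry)",
     ylab="Validation CER")
abline(v=wm, lty=2)   # optimal mtry found above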