PARISLAB Concrete Overdesign --- Optimal # of Outliers
- Angela Yu

- Sep 7, 2019
- 2 min read
The goal of this section is to select the optimal number of outliers to remove. We consider the two algorithms that yielded the largest R-squared in the previous sections: Random Forest and XGBoost.
(1) Remove 0 - 5000 data points based on their propensity of being outliers (scored with Angle-Based Outlier Detection, Function-Based Outlier Detection, and Subspace Outlier Detection)
(2) Split all the data into five iterations of testing and training, using the training data to fit the Random Forest and XGBoost models
(3) Calculate the average R-squared across the five iterations
(4) Plot the R-squared and find the algorithm and number of removed points that produce the highest R-squared
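Step (3) uses the out-of-sample coefficient of determination, computed against the mean of the test split (this matches the calculation in the code chunk):

```latex
R^2 = 1 - \frac{\sum_{k}\left(y_k - \hat{y}_k\right)^2}{\sum_{k}\left(y_k - \bar{y}_{\text{test}}\right)^2}
```

Here $y_k$ are the observed overdesign values in the test split, $\hat{y}_k$ the model predictions, and $\bar{y}_{\text{test}}$ the test-set mean (`mu` in the code).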
Due to its high computational cost, the operation below is run on UCLA's Hoffman2 supercomputer cluster. Visualizations of the changes in R-squared are shown below the code chunk.
###############################################################################
library(randomForest)
library(abodOutlier)
library(xgboost)
setwd("/u/home/h/huiziy/project-hduquant/Huizi_R_result")
concrete <- read.csv("Clean_data.csv")
abod <- read.csv("ABOD.csv")
result <- matrix(nrow = 101, ncol = 2)
for (i in seq(from = 0, to = 5000, by = 50)) {
  ## (1) Prepare the data: scale the predictors and flag the i most
  ## outlying points (lowest ABOD scores) as outliers
  input <- concrete[, 1:7]
  input2 <- scale(input)
  complete <- cbind(input2, concrete$overdesign)
  concrete2 <- as.data.frame(complete)
  colnames(concrete2) <- c("coarse_agg_weight", "fine_agg_weight", "current_weight",
                           "fly_ash_weight", "AEA_dose", "type_awra_dose",
                           "weight_ratio", "overdesign")
  if (i == 0) {
    concrete2$Ind <- "Inlier"
  } else {
    ## first column of the ABOD file holds the outlier scores
    concrete2[order(abod[, 1], decreasing = FALSE)[1:i], "Ind"] <- "Outlier"
    concrete2[is.na(concrete2$Ind), "Ind"] <- "Inlier"
  }
  removed_abod <- subset(concrete2, concrete2$Ind == "Inlier")
  ## (2) Decide whether any variables should be removed based on variable importance:
  ## tree_1 <- randomForest(y = concrete2$overdesign, x = concrete2[,1:7])
  ## importance(tree_1, type = 2)
  ## coarse_agg_weight, fine_agg_weight, current_weight, fly_ash_weight,
  ## AEA_dose, type_awra_dose, and W..C.P all have importance above 45,
  ## so we do not need to remove any variables
  ## (3) Split the data into five iterations of testing and training (80/20)
  ## and calculate the average R-squared
  Rsquared_rf_avg <- rep(NA, 5)
  Rsquared_xg_avg <- rep(NA, 5)
  for (j in 1:5) {
    set.seed(1234567 + j * 1000)
    ## 80/20 train/test split
    samp <- sample(1:nrow(removed_abod), nrow(removed_abod) * 0.8, replace = FALSE)
    train <- removed_abod[samp, ]
    test <- removed_abod[-samp, ]
    mu <- mean(test$overdesign)
    ## Random Forest
    tree_abod <- randomForest(y = train$overdesign, x = train[, 1:7],
                              ntree = 500, importance = TRUE)
    rf.pred_abod <- predict(tree_abod, newdata = test[, 1:7])
    Rsquared_rf_avg[j] <- 1 - sum((test$overdesign - rf.pred_abod)^2) /
      sum((test$overdesign - mu)^2)
    ## XGBoost
    dtrain <- xgb.DMatrix(data = as.matrix(train[, 1:7]), label = train$overdesign)
    dtest <- xgb.DMatrix(data = as.matrix(test[, 1:7]), label = test$overdesign)
    watchlist <- list(train = dtrain, test = dtest)
    xgb_train <- xgb.train(data = dtrain,
                           max.depth = 8,
                           eta = 0.3,
                           nthread = 2,
                           nrounds = 10000,
                           watchlist = watchlist,
                           objective = "reg:linear",
                           early_stopping_rounds = 50,
                           print_every_n = 500)
    pred_val_xgboost <- predict(xgb_train, as.matrix(test[, 1:7]))
    Rsquared_xg_avg[j] <- 1 - sum((test$overdesign - pred_val_xgboost)^2) /
      sum((test$overdesign - mu)^2)
  }
  result[i / 50 + 1, 1] <- mean(Rsquared_rf_avg)
  result[i / 50 + 1, 2] <- mean(Rsquared_xg_avg)
}
## Store the result
path_out <- '/u/home/h/huiziy/project-hduquant/Huizi_R_result/'
fileName <- paste0(path_out, "result_abod", ".csv")
write.table(result, file = fileName, sep = " ", row.names = FALSE, col.names = TRUE)
###############################################################################
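Step (4) can then be sketched as follows. This is a minimal illustration, not the exact plotting code: it assumes the result file written by the loop above (here `result_abod.csv`, with column 1 = Random Forest, column 2 = XGBoost, and row k corresponding to removing (k - 1) * 50 points).

```r
## Read the stored R-squared results and locate the optimal number of removals
res <- read.table("result_abod.csv", header = TRUE)
n_removed <- seq(from = 0, to = 5000, by = 50)

## Plot both algorithms' R-squared against the number of points removed
matplot(n_removed, res, type = "l", lty = 1,
        xlab = "Number of points removed",
        ylab = "Out-of-sample R-squared")
legend("bottomright", legend = c("Random Forest", "XGBoost"),
       lty = 1, col = 1:2)

## Number of removals that maximizes each algorithm's R-squared
n_removed[which.max(res[, 1])]  # Random Forest
n_removed[which.max(res[, 2])]  # XGBoost
```

The same plot can be repeated for the FBOD and SOD result files to compare all three outlier-detection methods.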


The maximum R-squared is achieved by
(1) the SOD outlier detection method,
(2) removing 2450 data points, and
(3) predicting with Random Forest.
