# chunk defaults for the whole report: do not show warnings or messages
knitr::opts_chunk$set(warning = FALSE, message = FALSE)

Libraries and Data Import

library(PRROC)
library(caret)
library(knitr)
library(tidyverse)
library(data.table)
library(caret)
library(ranger)
library(matrixStats)
library(Metrics)
library(ROCR)
library(pROC)

# read the cleaned train/test tables with fread() and convert each result
# to a plain data.frame
application_train_clean <- fread("data/application_train_clean.csv") %>%
  as.data.frame()
application_test_clean <- fread("data/application_test_clean.csv") %>%
  as.data.frame()

# read application_train_smote.csv the same way
application_train_smote <- fread("data/application_train_smote.csv") %>%
  as.data.frame()

Class Balance

# number of rows in each DEFAULT class
application_train_clean$DEFAULT %>%
  table()

## .
##  FALSE   TRUE 
## 282412  24804

# share of rows in each DEFAULT class, rounded to two decimals
application_train_clean$DEFAULT %>%
  table() %>%
  prop.table() %>%
  round(2)

## .
## FALSE  TRUE 
##  0.92  0.08

Data Partitioning

# createDataPartition() draws a random sample, so fix the RNG state first;
# otherwise the split (and every metric computed from it) changes on each run
set.seed(1234)

# split data set: row indices of a 70% sample, balanced on DEFAULT
creditinTrain <- createDataPartition(application_train_clean$DEFAULT,
                                     p = 0.7, list = FALSE)

# train set: the sampled rows
credit_train_set <- application_train_clean[creditinTrain, ]

# test set: every row that was not sampled
credit_test_set <- application_train_clean[-creditinTrain, ]

Training the Random Forest

# turn the DEFAULT target into a factor in both partitions
credit_train_set[["DEFAULT"]] <- as.factor(credit_train_set[["DEFAULT"]])
credit_test_set[["DEFAULT"]] <- as.factor(credit_test_set[["DEFAULT"]])

# inverse-frequency class weights for the unbalanced target: each class is
# weighted by the reciprocal of its share of the training rows
class_counts <- table(credit_train_set$DEFAULT)
class_proportions <- class_counts / nrow(credit_train_set)
class_weights <- 1 / class_proportions

# train RF model with 500 trees, 10 depth
# - SK_ID_CURR is removed from the predictors: it appears to be a row
#   identifier (TODO confirm), so splits on it can only memorise training rows
# - class.weights passes the inverse-frequency weights computed above
# - importance = "impurity" stores the impurity-based variable importance
# - seed fixes the forest's random number generator
rf_model <- ranger(
  formula = DEFAULT ~ . - SK_ID_CURR,
  data = credit_train_set,
  num.trees = 500,
  max.depth = 10,
  oob.error = TRUE,
  class.weights = class_weights,
  importance = "impurity",
  seed = 1234
)

## Growing trees.. Progress: 61%. Estimated remaining time: 19 seconds.

# rank the predictors from most to least important and print the ranking
importance <- rf_model$variable.importance %>%
  sort(decreasing = TRUE)
print(importance)

##                EXT_SOURCE_3                EXT_SOURCE_2 
##                 3915.331821                 3914.256113 
##                EXT_SOURCE_1                  DAYS_BIRTH 
##                 1371.953260                  932.734504 
##               DAYS_EMPLOYED             DAYS_ID_PUBLISH 
##                  784.079700                  628.624043 
##      DAYS_LAST_PHONE_CHANGE             AMT_GOODS_PRICE 
##                  606.893673                  562.112875 
##                  AMT_CREDIT                 AMT_ANNUITY 
##                  530.617750                  530.053620 
##           DAYS_REGISTRATION         NAME_EDUCATION_TYPE 
##                  510.023525                  473.922627 
##                  SK_ID_CURR            AMT_INCOME_TOTAL 
##                  429.132643                  380.145040 
##  REGION_POPULATION_RELATIVE           ORGANIZATION_TYPE 
##                  364.088854                  313.538331 
##     HOUR_APPR_PROCESS_START                 GENDER_MALE 
##                  295.323537                  264.774400 
##            NAME_INCOME_TYPE                 OWN_CAR_AGE 
##                  249.771147                  232.336864 
## REGION_RATING_CLIENT_W_CITY             OCCUPATION_TYPE 
##                  206.702786                  205.551901 
##  AMT_REQ_CREDIT_BUREAU_YEAR        REGION_RATING_CLIENT 
##                  190.947189                  185.535168 
##    OBS_60_CNT_SOCIAL_CIRCLE    OBS_30_CNT_SOCIAL_CIRCLE 
##                  168.415112                  167.207537 
##  WEEKDAY_APPR_PROCESS_START          NAME_FAMILY_STATUS 
##                  142.646987                  128.983164 
##             CNT_FAM_MEMBERS    DEF_30_CNT_SOCIAL_CIRCLE 
##                  124.026587                  115.013984 
##      REG_CITY_NOT_WORK_CITY           NAME_HOUSING_TYPE 
##                  112.303368                  112.185588 
##                IMPUTED_EXT1    DEF_60_CNT_SOCIAL_CIRCLE 
##                  111.123783                  101.656638 
##                CNT_CHILDREN   AMT_REQ_CREDIT_BUREAU_QRT 
##                   96.543462                   93.817822 
##                IMPUTED_EXT3      REG_CITY_NOT_LIVE_CITY 
##                   92.989174                   92.067500 
##                FLAG_OWN_CAR                   CASH_LOAN 
##                   78.697777                   77.571704 
##   AMT_REQ_CREDIT_BUREAU_MON              FLAG_EMP_PHONE 
##                   74.760871                   67.619116 
##             NAME_TYPE_SUITE             FLAG_WORK_PHONE 
##                   65.391212                   48.951156 
##     LIVE_CITY_NOT_WORK_CITY                  FLAG_PHONE 
##                   43.432376                   43.258732 
##             FLAG_OWN_REALTY  AMT_REQ_CREDIT_BUREAU_WEEK 
##                   37.792319                   33.045423 
##                  FLAG_EMAIL   AMT_REQ_CREDIT_BUREAU_DAY 
##                   27.542693                   24.588493 
##  REG_REGION_NOT_WORK_REGION LIVE_REGION_NOT_WORK_REGION 
##                   24.001468                   21.919742 
##  REG_REGION_NOT_LIVE_REGION  AMT_REQ_CREDIT_BUREAU_HOUR 
##                   21.778973                   18.535071 
##            FLAG_CONT_MOBILE                IMPUTED_EXT2 
##                    8.740591                    8.267135 
##                  FLAG_MOBIL 
##                    0.000000

Predictions on Train Data

# run the forest on the training rows and keep the predicted class labels
rf_pred_train <- predict(rf_model, data = credit_train_set)
predictions_train <- rf_pred_train$predictions

Train Metrics

# get confusion matrix for the training data
# DEFAULT == "TRUE" is treated as the event of interest, so it is set as the
# positive class. Without `positive`, caret uses the first factor level
# ("FALSE") and Sensitivity / Pos Pred Value describe that class instead.
conf_matrix_train <- confusionMatrix(
  data = predictions_train,
  reference = credit_train_set$DEFAULT,
  positive = "TRUE"
)
print(conf_matrix_train)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  FALSE   TRUE
##      FALSE 147401   5838
##      TRUE   50288  11525
##                                           
##                Accuracy : 0.739           
##                  95% CI : (0.7372, 0.7409)
##     No Information Rate : 0.9193          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1889          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.7456          
##             Specificity : 0.6638          
##          Pos Pred Value : 0.9619          
##          Neg Pred Value : 0.1864          
##              Prevalence : 0.9193          
##          Detection Rate : 0.6854          
##    Detection Prevalence : 0.7126          
##       Balanced Accuracy : 0.7047          
##                                           
##        'Positive' Class : FALSE           
##

Predictions on Test Data

# run the forest on the held-out rows and keep the predicted class labels
rf_pred_test <- predict(rf_model, data = credit_test_set)
predictions_test <- rf_pred_test$predictions

Test Metrics

# get confusion matrix for the test data
# DEFAULT == "TRUE" is treated as the event of interest, so it is set as the
# positive class. Without `positive`, caret uses the first factor level
# ("FALSE") and Sensitivity / Pos Pred Value describe that class instead.
conf_matrix_test <- confusionMatrix(
  data = predictions_test,
  reference = credit_test_set$DEFAULT,
  positive = "TRUE"
)
print(conf_matrix_test)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE  TRUE
##      FALSE 62746  2948
##      TRUE  21977  4493
##                                           
##                Accuracy : 0.7296          
##                  95% CI : (0.7267, 0.7324)
##     No Information Rate : 0.9193          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.159           
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.7406          
##             Specificity : 0.6038          
##          Pos Pred Value : 0.9551          
##          Neg Pred Value : 0.1697          
##              Prevalence : 0.9193          
##          Detection Rate : 0.6808          
##    Detection Prevalence : 0.7128          
##       Balanced Accuracy : 0.6722          
##                                           
##        'Positive' Class : FALSE           
##

AUC Train Set

# train RF for AUC calculation: same settings as the 500-tree / depth-10
# classification forest, but probability = TRUE makes predict() return class
# probabilities instead of class labels
# - SK_ID_CURR is removed from the predictors: it appears to be a row
#   identifier (TODO confirm), so splits on it can only memorise training rows
rf_AUC <- ranger(
  formula = DEFAULT ~ . - SK_ID_CURR,
  data = credit_train_set,
  num.trees = 500,
  max.depth = 10,
  oob.error = TRUE,
  probability = TRUE,
  class.weights = class_weights,
  importance = "impurity",
  seed = 1234
)

## Growing trees.. Progress: 66%. Estimated remaining time: 15 seconds.

# train probabilities of the "TRUE" class; the column is selected by name
# rather than by position so it cannot silently pick the other class
probs_train <- predict(rf_AUC, data = credit_train_set)$predictions[, "TRUE"]

# pair the predicted probabilities with the observed training labels
pred_train <- prediction(probs_train, credit_train_set$DEFAULT)

# train tpr and fpr performance metrics (ROC curve coordinates)
perf_train <- performance(pred_train, measure = "tpr", x.measure = "fpr")

# train AUC performance
auc_perf_train <- performance(pred_train, measure = "auc")

# extract the AUC number from the performance object
auc_value_train <- auc_perf_train@y.values[[1]]

# print AUC value (computed on the rows the forest was trained on)
print(paste("AUC =", round(auc_value_train, 4)))

## [1] "AUC = 0.8098"

AUC Test Set

# test set probabilities of the "TRUE" class; the column is selected by name
# rather than by position so it cannot silently pick the other class
probs_test <- predict(rf_AUC, data = credit_test_set)$predictions[, "TRUE"]

# pair the predicted probabilities with the observed test labels
pred_test <- prediction(probs_test, credit_test_set$DEFAULT)

# test set tpr and fpr performance (ROC curve coordinates)
perf_test <- performance(pred_test, measure = "tpr", x.measure = "fpr")

# test set AUC performance
auc_perf_test <- performance(pred_test, measure = "auc")

# extract the AUC number from the performance object
auc_value_test <- auc_perf_test@y.values[[1]]

# print AUC value
print(paste("AUC =", round(auc_value_test, 4)))

## [1] "AUC = 0.7393"

Model 2

# train RF with 800 trees and 20 depth
# - SK_ID_CURR is removed from the predictors: it appears to be a row
#   identifier (TODO confirm), so splits on it can only memorise training rows
# - class.weights passes the inverse-frequency weights computed earlier
# - importance = "impurity" stores the impurity-based variable importance
# - seed fixes the forest's random number generator
rf_model1 <- ranger(
  formula = DEFAULT ~ . - SK_ID_CURR,
  data = credit_train_set,
  importance = "impurity",
  num.trees = 800,
  max.depth = 20,
  class.weights = class_weights,
  seed = 1234
)

## Growing trees.. Progress: 19%. Estimated remaining time: 2 minutes, 13 seconds.
## Growing trees.. Progress: 40%. Estimated remaining time: 1 minute, 34 seconds.
## Growing trees.. Progress: 62%. Estimated remaining time: 58 seconds.
## Growing trees.. Progress: 85%. Estimated remaining time: 21 seconds.

# predicted class labels from the second forest for the training rows
rf_pred_train1 <- predict(rf_model1, data = credit_train_set)
predictions_train1 <- rf_pred_train1$predictions

# predicted class labels from the second forest for the held-out rows
rf_pred_test1 <- predict(rf_model1, data = credit_test_set)
predictions_test1 <- rf_pred_test1$predictions

# conf matrix train
# DEFAULT == "TRUE" is treated as the event of interest, so it is set as the
# positive class. Without `positive`, caret uses the first factor level
# ("FALSE") and Sensitivity / Pos Pred Value describe that class instead.
conf_matrix_train1 <- confusionMatrix(
  data = predictions_train1,
  reference = credit_train_set$DEFAULT,
  positive = "TRUE"
)
print(conf_matrix_train1)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  FALSE   TRUE
##      FALSE 194760   3638
##      TRUE    2929  13725
##                                           
##                Accuracy : 0.9695          
##                  95% CI : (0.9687, 0.9702)
##     No Information Rate : 0.9193          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7904          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9852          
##             Specificity : 0.7905          
##          Pos Pred Value : 0.9817          
##          Neg Pred Value : 0.8241          
##              Prevalence : 0.9193          
##          Detection Rate : 0.9056          
##    Detection Prevalence : 0.9226          
##       Balanced Accuracy : 0.8878          
##                                           
##        'Positive' Class : FALSE           
##

# conf matrix test
# DEFAULT == "TRUE" is treated as the event of interest, so it is set as the
# positive class. Without `positive`, caret uses the first factor level
# ("FALSE") and Sensitivity / Pos Pred Value describe that class instead.
conf_matrix_test1 <- confusionMatrix(
  data = predictions_test1,
  reference = credit_test_set$DEFAULT,
  positive = "TRUE"
)
print(conf_matrix_test1)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE  TRUE
##      FALSE 82097  6460
##      TRUE   2626   981
##                                           
##                Accuracy : 0.9014          
##                  95% CI : (0.8995, 0.9033)
##     No Information Rate : 0.9193          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1318          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9690          
##             Specificity : 0.1318          
##          Pos Pred Value : 0.9271          
##          Neg Pred Value : 0.2720          
##              Prevalence : 0.9193          
##          Detection Rate : 0.8908          
##    Detection Prevalence : 0.9609          
##       Balanced Accuracy : 0.5504          
##                                           
##        'Positive' Class : FALSE           
##

AUC Train Set 2

# train probability forest for the AUC of the 800 trees, 20 depth model;
# probability = TRUE makes predict() return class probabilities
# - SK_ID_CURR is removed from the predictors: it appears to be a row
#   identifier (TODO confirm), so splits on it can only memorise training rows
rf_AUC1 <- ranger(
  formula = DEFAULT ~ . - SK_ID_CURR,
  data = credit_train_set,
  num.trees = 800,
  max.depth = 20,
  probability = TRUE,
  importance = "impurity",
  class.weights = class_weights,
  seed = 1234
)

## Growing trees.. Progress: 22%. Estimated remaining time: 1 minute, 47 seconds.
## Growing trees.. Progress: 46%. Estimated remaining time: 1 minute, 13 seconds.
## Growing trees.. Progress: 70%. Estimated remaining time: 40 seconds.
## Growing trees.. Progress: 94%. Estimated remaining time: 8 seconds.

# train probabilities of the "TRUE" class; the column is selected by name
# rather than by position so it cannot silently pick the other class
probs_train1 <- predict(rf_AUC1, data = credit_train_set)$predictions[, "TRUE"]

# pair the predicted probabilities with the observed training labels
pred_train1 <- prediction(probs_train1, credit_train_set$DEFAULT)

# tpr and fpr (ROC curve coordinates)
perf_train1 <- performance(pred_train1, measure = "tpr", x.measure = "fpr")

# auc performance
auc_perf_train1 <- performance(pred_train1, measure = "auc")

# extract the AUC number from the performance object
auc_value_train1 <- auc_perf_train1@y.values[[1]]

# print AUC value (computed on the rows the forest was trained on)
print(paste("AUC =", round(auc_value_train1, 4)))

## [1] "AUC = 0.9952"

AUC Test Set 2

# test set probabilities of the "TRUE" class; the column is selected by name
# rather than by position so it cannot silently pick the other class
probs_test1 <- predict(rf_AUC1, data = credit_test_set)$predictions[, "TRUE"]

# pair the predicted probabilities with the observed test labels
pred_test1 <- prediction(probs_test1, credit_test_set$DEFAULT)

# tpr and fpr (ROC curve coordinates)
perf_test1 <- performance(pred_test1, measure = "tpr", x.measure = "fpr")

# auc performance
auc_perf_test1 <- performance(pred_test1, measure = "auc")

# extract the AUC number from the performance object
auc_value_test1 <- auc_perf_test1@y.values[[1]]

# print AUC value
print(paste("AUC =", round(auc_value_test1, 4)))

## [1] "AUC = 0.7369"

SMOTE Class Balance

# number of rows in each DEFAULT class of the SMOTE table
application_train_smote$DEFAULT %>%
  table()

## .
##  FALSE   TRUE 
## 282412 272844

# share of rows in each DEFAULT class of the SMOTE table, two decimals
application_train_smote$DEFAULT %>%
  table() %>%
  prop.table() %>%
  round(2)

## .
## FALSE  TRUE 
##  0.51  0.49

SMOTE Data Partitioning

# createDataPartition() draws a random sample, so fix the RNG state first;
# otherwise the split (and every metric computed from it) changes on each run
set.seed(1234)

# split data set: row indices of a 70% sample, balanced on DEFAULT
# NOTE(review): this split is made on the already-oversampled table. If that
# table contains synthetic rows, credit_test_smote will contain them too and
# metrics computed on it may be optimistic -- confirm how the file was built.
creditinTrainsmote <- createDataPartition(application_train_smote$DEFAULT,
                                          p = 0.7, list = FALSE)

# train set: the sampled rows
credit_train_smote <- application_train_smote[creditinTrainsmote, ]

# test set: every row that was not sampled
credit_test_smote <- application_train_smote[-creditinTrainsmote, ]

SMOTE Training the Random Forest

# turn the DEFAULT target into a factor in both SMOTE partitions
credit_train_smote[["DEFAULT"]] <- as.factor(credit_train_smote[["DEFAULT"]])
credit_test_smote[["DEFAULT"]] <- as.factor(credit_test_smote[["DEFAULT"]])

# train RF model with 500 trees, 10 depth on the SMOTE training partition
# - no class.weights are passed here
# - importance = "impurity" stores the impurity-based variable importance
# - seed fixes the forest's random number generator
rf_modelSMOTE <- ranger(
  formula = DEFAULT ~ .,
  data = credit_train_smote,
  num.trees = 500,
  max.depth = 10,
  oob.error = TRUE,
  importance = "impurity",
  seed = 1234
)

## Growing trees.. Progress: 13%. Estimated remaining time: 3 minutes, 26 seconds.
## Growing trees.. Progress: 28%. Estimated remaining time: 2 minutes, 44 seconds.
## Growing trees.. Progress: 43%. Estimated remaining time: 2 minutes, 7 seconds.
## Growing trees.. Progress: 58%. Estimated remaining time: 1 minute, 34 seconds.
## Growing trees.. Progress: 72%. Estimated remaining time: 1 minute, 1 seconds.
## Growing trees.. Progress: 86%. Estimated remaining time: 30 seconds.

# rank the SMOTE model's predictors from most to least important and print
importance_SMOTE <- rf_modelSMOTE$variable.importance %>%
  sort(decreasing = TRUE)
print(importance_SMOTE)

##                                     GENDER_MALE.Y 
##                                      7.444491e+03 
##                                     GENDER_MALE.N 
##                                      6.685673e+03 
##                                      EXT_SOURCE_3 
##                                      5.408013e+03 
## NAME_EDUCATION_TYPE.Secondary...secondary.special 
##                                      5.064758e+03 
##                                    IMPUTED_EXT1.N 
##                                      4.988043e+03 
##                                    FLAG_OWN_CAR.Y 
##                                      4.599549e+03 
##                                    IMPUTED_EXT1.Y 
##                                      4.534672e+03 
##              NAME_EDUCATION_TYPE.Higher.education 
##                                      4.476700e+03 
##                                    FLAG_OWN_CAR.N 
##                                      4.448189e+03 
##                          REG_CITY_NOT_WORK_CITY.Y 
##                                      4.195708e+03 
##                                      EXT_SOURCE_2 
##                                      3.862359e+03 
##                          REG_CITY_NOT_WORK_CITY.N 
##                                      3.693999e+03 
##                        NAME_FAMILY_STATUS.Married 
##                                      3.689392e+03 
##                          NAME_INCOME_TYPE.Working 
##                                      3.524069e+03 
##                                      FLAG_PHONE.N 
##                                      3.340081e+03 
##                                      FLAG_PHONE.Y 
##                                      3.254170e+03 
##                                      EXT_SOURCE_1 
##                                      2.304645e+03 
##                         LIVE_CITY_NOT_WORK_CITY.Y 
##                                      2.160710e+03 
##                                   CNT_FAM_MEMBERS 
##                                      2.126587e+03 
##                         LIVE_CITY_NOT_WORK_CITY.N 
##                                      2.093209e+03 
##                              REGION_RATING_CLIENT 
##                                      2.080974e+03 
##                                    IMPUTED_EXT3.N 
##                                      1.949196e+03 
##                                 FLAG_OWN_REALTY.N 
##                                      1.941928e+03 
##                          OBS_30_CNT_SOCIAL_CIRCLE 
##                                      1.933572e+03 
##                          OBS_60_CNT_SOCIAL_CIRCLE 
##                                      1.924480e+03 
##                                 FLAG_OWN_REALTY.Y 
##                                      1.898020e+03 
##                       REGION_RATING_CLIENT_W_CITY 
##                                      1.877450e+03 
##             NAME_INCOME_TYPE.Commercial.associate 
##                                      1.671184e+03 
##                                    IMPUTED_EXT3.Y 
##                                      1.659980e+03 
##                          OCCUPATION_TYPE.Laborers 
##                                      1.622203e+03 
##                                       OWN_CAR_AGE 
##                                      1.565750e+03 
##                                      CNT_CHILDREN 
##                                      1.548119e+03 
##                     NAME_TYPE_SUITE.Unaccompanied 
##                                      1.146839e+03 
##                                 FLAG_WORK_PHONE.N 
##                                      1.011400e+03 
##                                 FLAG_WORK_PHONE.Y 
##                                      1.009917e+03 
##                        AMT_REQ_CREDIT_BUREAU_YEAR 
##                                      9.868246e+02 
##          ORGANIZATION_TYPE.Business.Entity.Type.3 
##                                      9.637569e+02 
##           NAME_FAMILY_STATUS.Single...not.married 
##                                      9.252466e+02 
##                          REG_CITY_NOT_LIVE_CITY.Y 
##                                      8.178555e+02 
##                 WEEKDAY_APPR_PROCESS_START.MONDAY 
##                                      7.752334e+02 
##                          REG_CITY_NOT_LIVE_CITY.N 
##                                      7.689803e+02 
##              WEEKDAY_APPR_PROCESS_START.WEDNESDAY 
##                                      7.664687e+02 
##                WEEKDAY_APPR_PROCESS_START.TUESDAY 
##                                      7.431503e+02 
##                            NAME_TYPE_SUITE.Family 
##                                      5.959356e+02 
##               WEEKDAY_APPR_PROCESS_START.THURSDAY 
##                                      5.393774e+02 
##                 NAME_FAMILY_STATUS.Civil.marriage 
##                                      4.870037e+02 
##                 WEEKDAY_APPR_PROCESS_START.FRIDAY 
##                                      4.582626e+02 
##                          DEF_30_CNT_SOCIAL_CIRCLE 
##                                      4.403277e+02 
##                           OCCUPATION_TYPE.Drivers 
##                                      4.354296e+02 
##                   ORGANIZATION_TYPE.Self.employed 
##                                      4.195951e+02 
##                               OCCUPATION_TYPE.XNA 
##                                      4.125629e+02 
##                       OCCUPATION_TYPE.Sales.staff 
##                                      3.530636e+02 
##                          DEF_60_CNT_SOCIAL_CIRCLE 
##                                      2.644528e+02 
##               NAME_HOUSING_TYPE.House...apartment 
##                                      2.511268e+02 
##                      NAME_FAMILY_STATUS.Separated 
##                                      2.124115e+02 
##                        REGION_POPULATION_RELATIVE 
##                                      1.999970e+02 
##               WEEKDAY_APPR_PROCESS_START.SATURDAY 
##                                      1.906710e+02 
##                          NAME_FAMILY_STATUS.Widow 
##                                      1.855194e+02 
##                    NAME_INCOME_TYPE.State.servant 
##                                      1.732414e+02 
##                        OCCUPATION_TYPE.Core.staff 
##                                      1.687093e+02 
##             NAME_EDUCATION_TYPE.Incomplete.higher 
##                                      1.265142e+02 
##                            DAYS_LAST_PHONE_CHANGE 
##                                      1.219585e+02 
##                      REG_REGION_NOT_WORK_REGION.N 
##                                      9.229820e+01 
##                                        DAYS_BIRTH 
##                                      8.202299e+01 
##                          OCCUPATION_TYPE.Managers 
##                                      7.451275e+01 
##                      REG_REGION_NOT_WORK_REGION.Y 
##                                      6.004381e+01 
##                    NAME_HOUSING_TYPE.With.parents 
##                                      5.620894e+01 
##                         AMT_REQ_CREDIT_BUREAU_MON 
##                                      5.205232e+01 
##                                     DAYS_EMPLOYED 
##                                      5.136552e+01 
##                                   DAYS_ID_PUBLISH 
##                                      4.754036e+01 
##                           HOUR_APPR_PROCESS_START 
##                                      3.967130e+01 
##                      REG_REGION_NOT_LIVE_REGION.N 
##                                      3.939734e+01 
##               NAME_EDUCATION_TYPE.Lower.secondary 
##                                      3.714776e+01 
##                     LIVE_REGION_NOT_WORK_REGION.N 
##                                      3.489076e+01 
##                     LIVE_REGION_NOT_WORK_REGION.Y 
##                                      3.260822e+01 
##                                  AMT_INCOME_TOTAL 
##                                      2.977919e+01 
##                      REG_REGION_NOT_LIVE_REGION.Y 
##                                      2.973562e+01 
##                                        AMT_CREDIT 
##                                      2.937600e+01 
##                                   AMT_GOODS_PRICE 
##                                      2.905561e+01 
##                                 DAYS_REGISTRATION 
##                                      2.630463e+01 
##                                      FLAG_EMAIL.Y 
##                                      2.622571e+01 
##                   NAME_TYPE_SUITE.Spouse..partner 
##                                      2.534514e+01 
##                                       AMT_ANNUITY 
##                                      2.511945e+01 
##                                      FLAG_EMAIL.N 
##                                      2.425832e+01 
##             NAME_HOUSING_TYPE.Municipal.apartment 
##                                      2.240918e+01 
##                         AMT_REQ_CREDIT_BUREAU_QRT 
##                                      2.141124e+01 
##                        NAME_INCOME_TYPE.Pensioner 
##                                      1.911535e+01 
##                NAME_HOUSING_TYPE.Rented.apartment 
##                                      1.661527e+01 
##                                  FLAG_EMP_PHONE.Y 
##                                      1.607607e+01 
##                           ORGANIZATION_TYPE.Other 
##                                      1.580644e+01 
##                             ORGANIZATION_TYPE.XNA 
##                                      1.557004e+01 
##                    OCCUPATION_TYPE.Security.staff 
##                                      1.523310e+01 
##                 WEEKDAY_APPR_PROCESS_START.SUNDAY 
##                                      1.522241e+01 
##          ORGANIZATION_TYPE.Business.Entity.Type.2 
##                                      1.334574e+01 
##                                  FLAG_EMP_PHONE.N 
##                                      1.280340e+01 
##                    ORGANIZATION_TYPE.Construction 
##                                      1.209058e+01 
##                       OCCUPATION_TYPE.Accountants 
##                                      1.160384e+01 
##             OCCUPATION_TYPE.High.skill.tech.staff 
##                                      1.027786e+01 
##                        ORGANIZATION_TYPE.Medicine 
##                                      9.253402e+00 
##                   ORGANIZATION_TYPE.Trade..type.7 
##                                      8.839607e+00 
##                          NAME_TYPE_SUITE.Children 
##                                      7.177885e+00 
##                        AMT_REQ_CREDIT_BUREAU_WEEK 
##                                      6.396039e+00 
##                      ORGANIZATION_TYPE.Government 
##                                      5.756175e+00 
##                     OCCUPATION_TYPE.Cooking.staff 
##                                      5.673210e+00 
##               ORGANIZATION_TYPE.Transport..type.4 
##                                      5.656496e+00 
##                    OCCUPATION_TYPE.Medicine.staff 
##                                      5.498736e+00 
##                    ORGANIZATION_TYPE.Kindergarten 
##                                      5.023972e+00 
##                   ORGANIZATION_TYPE.Trade..type.3 
##                                      4.795816e+00 
##                OCCUPATION_TYPE.Low.skill.Laborers 
##                                      4.714950e+00 
##                        ORGANIZATION_TYPE.Security 
##                                      4.481458e+00 
##               ORGANIZATION_TYPE.Transport..type.3 
##                                      4.383485e+00 
##                                       CASH_LOAN.N 
##                                      4.378394e+00 
##                          ORGANIZATION_TYPE.School 
##                                      4.332550e+00 
##                                       CASH_LOAN.Y 
##                                      3.978249e+00 
##          ORGANIZATION_TYPE.Business.Entity.Type.1 
##                                      3.946746e+00 
##                ORGANIZATION_TYPE.Industry..type.3 
##                                      3.305940e+00 
##                    OCCUPATION_TYPE.Cleaning.staff 
##                                      3.299557e+00 
##                      ORGANIZATION_TYPE.Restaurant 
##                                      3.163058e+00 
##                         AMT_REQ_CREDIT_BUREAU_DAY 
##                                      3.046038e+00 
##                           NAME_TYPE_SUITE.Other_B 
##                                      3.030730e+00 
##                          ORGANIZATION_TYPE.Postal 
##                                      2.726748e+00 
##                        AMT_REQ_CREDIT_BUREAU_HOUR 
##                                      2.578747e+00 
##              OCCUPATION_TYPE.Waiters.barmen.staff 
##                                      2.320473e+00 
##                     ORGANIZATION_TYPE.Agriculture 
##                                      2.254574e+00 
##                         ORGANIZATION_TYPE.Telecom 
##                                      2.185969e+00 
##                   ORGANIZATION_TYPE.Trade..type.2 
##                                      2.168890e+00 
##                            ORGANIZATION_TYPE.Bank 
##                                      2.098756e+00 
##                        ORGANIZATION_TYPE.Military 
##                                      2.044563e+00 
##                NAME_HOUSING_TYPE.Office.apartment 
##                                      2.008419e+00 
##                 NAME_HOUSING_TYPE.Co.op.apartment 
##                                      1.789164e+00 
##                          ORGANIZATION_TYPE.Police 
##                                      1.764229e+00 
##                         ORGANIZATION_TYPE.Housing 
##                                      1.762673e+00 
##                         ORGANIZATION_TYPE.Realtor 
##                                      1.731531e+00 
##                       OCCUPATION_TYPE.Secretaries 
##                                      1.687042e+00 
##                ORGANIZATION_TYPE.Industry..type.4 
##                                      1.633106e+00 
##                ORGANIZATION_TYPE.Industry..type.1 
##                                      1.524414e+00 
##             OCCUPATION_TYPE.Private.service.staff 
##                                      1.486261e+00 
##               ORGANIZATION_TYPE.Industry..type.11 
##                                      1.428360e+00 
##                ORGANIZATION_TYPE.Industry..type.7 
##                                      1.423483e+00 
##                           NAME_TYPE_SUITE.Other_A 
##                                      1.415821e+00 
##                                    IMPUTED_EXT2.Y 
##                                      1.266612e+00 
##                                    IMPUTED_EXT2.N 
##                                      1.262543e+00 
##                        ORGANIZATION_TYPE.Services 
##                                      1.261261e+00 
##                     OCCUPATION_TYPE.Realty.agents 
##                                      1.259923e+00 
##               ORGANIZATION_TYPE.Transport..type.2 
##                                      1.252802e+00 
##                ORGANIZATION_TYPE.Industry..type.9 
##                                      1.238143e+00 
##                          ORGANIZATION_TYPE.Mobile 
##                                      1.184110e+00 
##                   NAME_TYPE_SUITE.Group.of.people 
##                                      1.177230e+00 
##                       ORGANIZATION_TYPE.Insurance 
##                                      1.150800e+00 
##                   ORGANIZATION_TYPE.Trade..type.1 
##                                      1.060392e+00 
##                  ORGANIZATION_TYPE.Legal.Services 
##                                      1.025855e+00 
##                      ORGANIZATION_TYPE.University 
##                                      9.743315e-01 
##                ORGANIZATION_TYPE.Industry..type.5 
##                                      9.693173e-01 
##                   ORGANIZATION_TYPE.Trade..type.6 
##                                      8.035085e-01 
##                          OCCUPATION_TYPE.HR.staff 
##                                      7.788360e-01 
##                     ORGANIZATION_TYPE.Advertising 
##                                      7.752790e-01 
##                                FLAG_CONT_MOBILE.Y 
##                                      7.474219e-01 
##             ORGANIZATION_TYPE.Security.Ministries 
##                                      7.448791e-01 
##                         ORGANIZATION_TYPE.Culture 
##                                      7.303800e-01 
##                                FLAG_CONT_MOBILE.N 
##                                      7.061707e-01 
##                     ORGANIZATION_TYPE.Electricity 
##                                      6.767039e-01 
##                       ORGANIZATION_TYPE.Emergency 
##                                      6.762616e-01 
##                           ORGANIZATION_TYPE.Hotel 
##                                      6.564189e-01 
##                        ORGANIZATION_TYPE.Cleaning 
##                                      6.155889e-01 
##                        ORGANIZATION_TYPE.Religion 
##                                      4.909636e-01 
##               ORGANIZATION_TYPE.Industry..type.10 
##                                      4.837548e-01 
##                ORGANIZATION_TYPE.Industry..type.2 
##                                      4.400813e-01 
##                ORGANIZATION_TYPE.Industry..type.6 
##                                      4.375614e-01 
##                          OCCUPATION_TYPE.IT.staff 
##                                      4.202654e-01 
##               ORGANIZATION_TYPE.Transport..type.1 
##                                      4.154291e-01 
##               ORGANIZATION_TYPE.Industry..type.12 
##                                      3.058339e-01 
##                       NAME_INCOME_TYPE.Unemployed 
##                                      2.847856e-01 
##                   ORGANIZATION_TYPE.Trade..type.5 
##                                      2.199512e-01 
##               NAME_EDUCATION_TYPE.Academic.degree 
##                                      1.509186e-01 
##                  NAME_INCOME_TYPE.Maternity.leave 
##                                      1.422529e-01 
##               ORGANIZATION_TYPE.Industry..type.13 
##                                      1.420683e-01 
##                   ORGANIZATION_TYPE.Trade..type.4 
##                                      8.897049e-02 
##                ORGANIZATION_TYPE.Industry..type.8 
##                                      7.190292e-02 
##                          NAME_INCOME_TYPE.Student 
##                                      2.666667e-03 
##                      NAME_INCOME_TYPE.Businessman 
##                                      0.000000e+00 
##                        NAME_FAMILY_STATUS.Unknown 
##                                      0.000000e+00

SMOTE Predictions on Train Data

# Score the SMOTE training rows with the fitted forest and keep only the
# predicted classes from the returned prediction object.
predictions_trainSMOTE <- predict(
  rf_modelSMOTE,
  data = credit_train_smote
)[["predictions"]]

SMOTE Train Metrics

# Cross-tabulate predicted against observed DEFAULT on the SMOTE training rows
# and print the resulting caret summary.
conf_matrix_trainSMOTE <- confusionMatrix(
  data = predictions_trainSMOTE,
  reference = credit_train_smote[["DEFAULT"]]
)
print(conf_matrix_trainSMOTE)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  FALSE   TRUE
##      FALSE 197681  22499
##      TRUE       8 168492
##                                           
##                Accuracy : 0.9421          
##                  95% CI : (0.9414, 0.9428)
##     No Information Rate : 0.5086          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8839          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.8822          
##          Pos Pred Value : 0.8978          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.5086          
##          Detection Rate : 0.5086          
##    Detection Prevalence : 0.5665          
##       Balanced Accuracy : 0.9411          
##                                           
##        'Positive' Class : FALSE           
##

SMOTE Predictions on Test Data

# Score the SMOTE test rows with the same fitted forest and keep only the
# predicted classes.
predictions_testSMOTE <- predict(
  rf_modelSMOTE,
  data = credit_test_smote
)[["predictions"]]

SMOTE Test Metrics

# Cross-tabulate predicted against observed DEFAULT on the SMOTE test rows
# and print the resulting caret summary.
conf_matrix_testSMOTE <- confusionMatrix(
  data = predictions_testSMOTE,
  reference = credit_test_smote[["DEFAULT"]]
)
print(conf_matrix_testSMOTE)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE  TRUE
##      FALSE 84719  9659
##      TRUE      4 72194
##                                           
##                Accuracy : 0.942           
##                  95% CI : (0.9409, 0.9431)
##     No Information Rate : 0.5086          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8837          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.8820          
##          Pos Pred Value : 0.8977          
##          Neg Pred Value : 0.9999          
##              Prevalence : 0.5086          
##          Detection Rate : 0.5086          
##    Detection Prevalence : 0.5666          
##       Balanced Accuracy : 0.9410          
##                                           
##        'Positive' Class : FALSE           
##

SMOTE AUC Train Set

# Fit a forest on the SMOTE training rows with probability = TRUE so that
# predict() yields a matrix of class probabilities for the AUC calculation.
# Same size as the first SMOTE model: 500 trees, depth capped at 10.
rf_AUCSMOTE <- ranger(
  formula = DEFAULT ~ .,
  data = credit_train_smote,
  num.trees = 500,
  max.depth = 10,
  oob.error = TRUE,
  probability = TRUE,
  importance = "impurity",
  seed = 1234
)

## Growing trees.. Progress: 13%. Estimated remaining time: 3 minutes, 20 seconds.
## Growing trees.. Progress: 28%. Estimated remaining time: 2 minutes, 41 seconds.
## Growing trees.. Progress: 42%. Estimated remaining time: 2 minutes, 7 seconds.
## Growing trees.. Progress: 57%. Estimated remaining time: 1 minute, 34 seconds.
## Growing trees.. Progress: 72%. Estimated remaining time: 1 minute, 1 seconds.
## Growing trees.. Progress: 86%. Estimated remaining time: 31 seconds.

# Predicted probabilities for the SMOTE training rows; only column 2 is kept
# (presumably the TRUE / default class -- confirm against levels of DEFAULT).
probs_trainSMOTE <- predict(
  rf_AUCSMOTE,
  data = credit_train_smote
)[["predictions"]][, 2]

# ROCR prediction object pairing the scores with the observed labels
pred_trainSMOTE <- prediction(
  predictions = probs_trainSMOTE,
  labels = credit_train_smote[["DEFAULT"]]
)

# ROC curve coordinates: true positive rate (tpr) vs. false positive rate (fpr)
perf_trainSMOTE <- performance(
  pred_trainSMOTE,
  measure = "tpr",
  x.measure = "fpr"
)

# area-under-the-curve performance object for the training rows
auc_perf_trainSMOTE <- performance(pred_trainSMOTE, measure = "auc")

# extract the AUC value from the performance object's y.values slot
auc_value_trainSMOTE <- slot(auc_perf_trainSMOTE, "y.values")[[1]]

# report the training AUC rounded to four decimals
print(paste("AUC =", round(auc_value_trainSMOTE, digits = 4)))

## [1] "AUC = 0.9738"

SMOTE AUC Test Set

# Predicted probabilities for the SMOTE test rows; only column 2 is kept
# (presumably the TRUE / default class -- confirm against levels of DEFAULT).
probs_testSMOTE <- predict(
  rf_AUCSMOTE,
  data = credit_test_smote
)[["predictions"]][, 2]

# ROCR prediction object pairing the scores with the observed labels
pred_testSMOTE <- prediction(
  predictions = probs_testSMOTE,
  labels = credit_test_smote[["DEFAULT"]]
)

# ROC curve coordinates: true positive rate (tpr) vs. false positive rate (fpr)
perf_testSMOTE <- performance(
  pred_testSMOTE,
  measure = "tpr",
  x.measure = "fpr"
)

# area-under-the-curve performance object for the test rows
auc_perf_testSMOTE <- performance(pred_testSMOTE, measure = "auc")

# extract the AUC value from the performance object's y.values slot
auc_value_testSMOTE <- slot(auc_perf_testSMOTE, "y.values")[[1]]

# report the test AUC rounded to four decimals
print(paste("AUC =", round(auc_value_testSMOTE, digits = 4)))

## [1] "AUC = 0.9714"

SMOTE Model 2

# Second SMOTE model: train a larger, deeper forest (800 trees, depth capped
# at 20) on the SMOTE training rows. Impurity-based variable importance is
# recorded and the seed is fixed so the fit is reproducible.
rf_model1smote <- ranger(
  formula = DEFAULT ~ .,
  data = credit_train_smote,
  num.trees = 800,
  max.depth = 20,
  importance = "impurity",
  seed = 1234
)

## Growing trees.. Progress: 5%. Estimated remaining time: 11 minutes, 20 seconds.
## Growing trees.. Progress: 10%. Estimated remaining time: 9 minutes, 53 seconds.
## Growing trees.. Progress: 16%. Estimated remaining time: 8 minutes, 38 seconds.
## Growing trees.. Progress: 21%. Estimated remaining time: 8 minutes, 1 seconds.
## Growing trees.. Progress: 27%. Estimated remaining time: 7 minutes, 18 seconds.
## Growing trees.. Progress: 32%. Estimated remaining time: 6 minutes, 48 seconds.
## Growing trees.. Progress: 38%. Estimated remaining time: 6 minutes, 11 seconds.
## Growing trees.. Progress: 43%. Estimated remaining time: 5 minutes, 37 seconds.
## Growing trees.. Progress: 49%. Estimated remaining time: 5 minutes, 1 seconds.
## Growing trees.. Progress: 55%. Estimated remaining time: 4 minutes, 26 seconds.
## Growing trees.. Progress: 60%. Estimated remaining time: 3 minutes, 55 seconds.
## Growing trees.. Progress: 66%. Estimated remaining time: 3 minutes, 19 seconds.
## Growing trees.. Progress: 71%. Estimated remaining time: 2 minutes, 48 seconds.
## Growing trees.. Progress: 77%. Estimated remaining time: 2 minutes, 15 seconds.
## Growing trees.. Progress: 82%. Estimated remaining time: 1 minute, 42 seconds.
## Growing trees.. Progress: 88%. Estimated remaining time: 1 minute, 10 seconds.
## Growing trees.. Progress: 93%. Estimated remaining time: 38 seconds.
## Growing trees.. Progress: 100%. Estimated remaining time: 2 seconds.

# Predicted classes from the 800-tree model on the SMOTE training rows
predictions_train1smote <- predict(
  rf_model1smote,
  data = credit_train_smote
)[["predictions"]]

# Predicted classes from the 800-tree model on the SMOTE test rows
predictions_test1smote <- predict(
  rf_model1smote,
  data = credit_test_smote
)[["predictions"]]

# Confusion matrix for the 800-tree model on the SMOTE training rows
conf_matrix_train1smote <- confusionMatrix(
  data = predictions_train1smote,
  reference = credit_train_smote[["DEFAULT"]]
)
print(conf_matrix_train1smote)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  FALSE   TRUE
##      FALSE 197689  16801
##      TRUE       0 174190
##                                           
##                Accuracy : 0.9568          
##                  95% CI : (0.9561, 0.9574)
##     No Information Rate : 0.5086          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9134          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.9120          
##          Pos Pred Value : 0.9217          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.5086          
##          Detection Rate : 0.5086          
##    Detection Prevalence : 0.5518          
##       Balanced Accuracy : 0.9560          
##                                           
##        'Positive' Class : FALSE           
##

# Confusion matrix for the 800-tree model on the SMOTE test rows
conf_matrix_test1smote <- confusionMatrix(
  data = predictions_test1smote,
  reference = credit_test_smote[["DEFAULT"]]
)
print(conf_matrix_test1smote)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE  TRUE
##      FALSE 84723  7568
##      TRUE      0 74285
##                                           
##                Accuracy : 0.9546          
##                  95% CI : (0.9536, 0.9556)
##     No Information Rate : 0.5086          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.909           
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.9075          
##          Pos Pred Value : 0.9180          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.5086          
##          Detection Rate : 0.5086          
##    Detection Prevalence : 0.5540          
##       Balanced Accuracy : 0.9538          
##                                           
##        'Positive' Class : FALSE           
##

SMOTE AUC Train Set 2

# Probability forest for the AUC of the second SMOTE model: same 800 trees and
# depth cap of 20, but probability = TRUE so predict() yields class
# probabilities instead of labels.
rf_AUC1smote <- ranger(
  formula = DEFAULT ~ .,
  data = credit_train_smote,
  num.trees = 800,
  max.depth = 20,
  probability = TRUE,
  importance = "impurity",
  seed = 1234
)

## Growing trees.. Progress: 5%. Estimated remaining time: 11 minutes, 20 seconds.
## Growing trees.. Progress: 11%. Estimated remaining time: 9 minutes, 6 seconds.
## Growing trees.. Progress: 16%. Estimated remaining time: 8 minutes, 24 seconds.
## Growing trees.. Progress: 22%. Estimated remaining time: 7 minutes, 40 seconds.
## Growing trees.. Progress: 28%. Estimated remaining time: 6 minutes, 59 seconds.
## Growing trees.. Progress: 33%. Estimated remaining time: 6 minutes, 25 seconds.
## Growing trees.. Progress: 39%. Estimated remaining time: 5 minutes, 57 seconds.
## Growing trees.. Progress: 44%. Estimated remaining time: 5 minutes, 22 seconds.
## Growing trees.. Progress: 50%. Estimated remaining time: 4 minutes, 48 seconds.
## Growing trees.. Progress: 56%. Estimated remaining time: 4 minutes, 14 seconds.
## Growing trees.. Progress: 61%. Estimated remaining time: 3 minutes, 46 seconds.
## Growing trees.. Progress: 66%. Estimated remaining time: 3 minutes, 13 seconds.
## Growing trees.. Progress: 72%. Estimated remaining time: 2 minutes, 43 seconds.
## Growing trees.. Progress: 77%. Estimated remaining time: 2 minutes, 11 seconds.
## Growing trees.. Progress: 83%. Estimated remaining time: 1 minute, 40 seconds.
## Growing trees.. Progress: 88%. Estimated remaining time: 1 minute, 7 seconds.
## Growing trees.. Progress: 94%. Estimated remaining time: 37 seconds.
## Growing trees.. Progress: 100%. Estimated remaining time: 2 seconds.

# Predicted probabilities for the SMOTE training rows; only column 2 is kept
# (presumably the TRUE / default class -- confirm against levels of DEFAULT).
probs_train1smote <- predict(
  rf_AUC1smote,
  data = credit_train_smote
)[["predictions"]][, 2]

# ROCR prediction object pairing the scores with the observed labels
pred_train1smote <- prediction(
  predictions = probs_train1smote,
  labels = credit_train_smote[["DEFAULT"]]
)

# ROC curve coordinates: true positive rate (tpr) vs. false positive rate (fpr)
perf_train1smote <- performance(
  pred_train1smote,
  measure = "tpr",
  x.measure = "fpr"
)

# area-under-the-curve performance object for the training rows
auc_perf_train1smote <- performance(pred_train1smote, measure = "auc")

# extract the AUC value from the performance object's y.values slot
auc_value_train1smote <- slot(auc_perf_train1smote, "y.values")[[1]]

# report the training AUC rounded to four decimals
print(paste("AUC =", round(auc_value_train1smote, digits = 4)))

## [1] "AUC = 0.9987"

SMOTE AUC Test Set 2

# Predicted probabilities for the SMOTE test rows; only column 2 is kept
# (presumably the TRUE / default class -- confirm against levels of DEFAULT).
probs_test1smote <- predict(
  rf_AUC1smote,
  data = credit_test_smote
)[["predictions"]][, 2]

# ROCR prediction object pairing the scores with the observed labels
pred_test1smote <- prediction(
  predictions = probs_test1smote,
  labels = credit_test_smote[["DEFAULT"]]
)

# ROC curve coordinates: true positive rate (tpr) vs. false positive rate (fpr)
perf_test1smote <- performance(
  pred_test1smote,
  measure = "tpr",
  x.measure = "fpr"
)

# area-under-the-curve performance object for the test rows
auc_perf_test1smote <- performance(pred_test1smote, measure = "auc")

# extract the AUC value from the performance object's y.values slot
auc_value_test1smote <- slot(auc_perf_test1smote, "y.values")[[1]]

# report the test AUC rounded to four decimals
print(paste("AUC =", round(auc_value_test1smote, digits = 4)))

## [1] "AUC = 0.9781"

Random Forest

Georgia Christodoulou

2024-10-20

Libraries and Data Import

Class Balance

Data Partitioning

Training the Random Forest

Predictions on Train Data

Train Metrics

Predictions on Test Data

Test Metrics

AUC Train Set

AUC Test Set

Model 2

AUC Train Set 2

AUC Test Set 2

SMOTE Class Balance

SMOTE Data Partitioning

SMOTE Training the Random Forest

SMOTE Predictions on Train Data

SMOTE Train Metrics

SMOTE Predictions on Test Data

SMOTE Test Metrics

SMOTE AUC Train Set

SMOTE AUC Test Set

SMOTE Model 2

SMOTE AUC Train Set 2

SMOTE AUC Test Set 2