knitr::opts_chunk$set(warning = FALSE, message = FALSE)

Libraries and Data Import

library(PRROC)
library(caret)
library(knitr)
library(tidyverse)
library(data.table)
library(caret)
library(ranger)
library(matrixStats)
library(Metrics)
library(ROCR)
library(pROC)

application_train_clean <- as.data.frame(fread("data/application_train_clean.csv"))
application_test_clean <- as.data.frame(fread("data/application_test_clean.csv"))

application_train_smote <- as.data.frame(fread("data/application_train_smote.csv"))

Class Balance

application_train_clean %>% pull(DEFAULT) %>% table()
## .
##  FALSE   TRUE 
## 282412  24804
application_train_clean %>% pull(DEFAULT) %>% table() %>% prop.table() %>% round(2)
## .
## FALSE  TRUE 
##  0.92  0.08

Data Partitioning

# split data set
creditinTrain <- createDataPartition(application_train_clean$DEFAULT, p=.7, list=FALSE)

# train set
credit_train_set <- application_train_clean[creditinTrain,]

# test set
credit_test_set <- application_train_clean[-creditinTrain,]

Training the Random Forest

# convert target column to factor
credit_train_set$DEFAULT <- as.factor(credit_train_set$DEFAULT)
credit_test_set$DEFAULT <- as.factor(credit_test_set$DEFAULT)

# calculate class weights for unbalanced dataset
class_proportions <- table(credit_train_set$DEFAULT) / nrow(credit_train_set)
class_weights <- 1 / class_proportions

# train RF model with 500 trees, 10 depth
rf_model <- ranger(formula = DEFAULT ~ ., data = credit_train_set, 
                   num.trees = 500, max.depth = 10, oob.error = TRUE, class.weights = class_weights, , importance = 'impurity', seed = 1234)
## Growing trees.. Progress: 61%. Estimated remaining time: 19 seconds.
# find and print important variables
importance <- sort(rf_model$variable.importance, decreasing = TRUE)
print(importance)
##                EXT_SOURCE_3                EXT_SOURCE_2 
##                 3915.331821                 3914.256113 
##                EXT_SOURCE_1                  DAYS_BIRTH 
##                 1371.953260                  932.734504 
##               DAYS_EMPLOYED             DAYS_ID_PUBLISH 
##                  784.079700                  628.624043 
##      DAYS_LAST_PHONE_CHANGE             AMT_GOODS_PRICE 
##                  606.893673                  562.112875 
##                  AMT_CREDIT                 AMT_ANNUITY 
##                  530.617750                  530.053620 
##           DAYS_REGISTRATION         NAME_EDUCATION_TYPE 
##                  510.023525                  473.922627 
##                  SK_ID_CURR            AMT_INCOME_TOTAL 
##                  429.132643                  380.145040 
##  REGION_POPULATION_RELATIVE           ORGANIZATION_TYPE 
##                  364.088854                  313.538331 
##     HOUR_APPR_PROCESS_START                 GENDER_MALE 
##                  295.323537                  264.774400 
##            NAME_INCOME_TYPE                 OWN_CAR_AGE 
##                  249.771147                  232.336864 
## REGION_RATING_CLIENT_W_CITY             OCCUPATION_TYPE 
##                  206.702786                  205.551901 
##  AMT_REQ_CREDIT_BUREAU_YEAR        REGION_RATING_CLIENT 
##                  190.947189                  185.535168 
##    OBS_60_CNT_SOCIAL_CIRCLE    OBS_30_CNT_SOCIAL_CIRCLE 
##                  168.415112                  167.207537 
##  WEEKDAY_APPR_PROCESS_START          NAME_FAMILY_STATUS 
##                  142.646987                  128.983164 
##             CNT_FAM_MEMBERS    DEF_30_CNT_SOCIAL_CIRCLE 
##                  124.026587                  115.013984 
##      REG_CITY_NOT_WORK_CITY           NAME_HOUSING_TYPE 
##                  112.303368                  112.185588 
##                IMPUTED_EXT1    DEF_60_CNT_SOCIAL_CIRCLE 
##                  111.123783                  101.656638 
##                CNT_CHILDREN   AMT_REQ_CREDIT_BUREAU_QRT 
##                   96.543462                   93.817822 
##                IMPUTED_EXT3      REG_CITY_NOT_LIVE_CITY 
##                   92.989174                   92.067500 
##                FLAG_OWN_CAR                   CASH_LOAN 
##                   78.697777                   77.571704 
##   AMT_REQ_CREDIT_BUREAU_MON              FLAG_EMP_PHONE 
##                   74.760871                   67.619116 
##             NAME_TYPE_SUITE             FLAG_WORK_PHONE 
##                   65.391212                   48.951156 
##     LIVE_CITY_NOT_WORK_CITY                  FLAG_PHONE 
##                   43.432376                   43.258732 
##             FLAG_OWN_REALTY  AMT_REQ_CREDIT_BUREAU_WEEK 
##                   37.792319                   33.045423 
##                  FLAG_EMAIL   AMT_REQ_CREDIT_BUREAU_DAY 
##                   27.542693                   24.588493 
##  REG_REGION_NOT_WORK_REGION LIVE_REGION_NOT_WORK_REGION 
##                   24.001468                   21.919742 
##  REG_REGION_NOT_LIVE_REGION  AMT_REQ_CREDIT_BUREAU_HOUR 
##                   21.778973                   18.535071 
##            FLAG_CONT_MOBILE                IMPUTED_EXT2 
##                    8.740591                    8.267135 
##                  FLAG_MOBIL 
##                    0.000000

Predictions on Train Data

# predict classes on train data
predictions_train <- predict(rf_model, credit_train_set)$predictions

Train Metrics

# get confusion matrix
conf_matrix_train <- confusionMatrix(predictions_train, credit_train_set$DEFAULT)
print(conf_matrix_train)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  FALSE   TRUE
##      FALSE 147401   5838
##      TRUE   50288  11525
##                                           
##                Accuracy : 0.739           
##                  95% CI : (0.7372, 0.7409)
##     No Information Rate : 0.9193          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1889          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.7456          
##             Specificity : 0.6638          
##          Pos Pred Value : 0.9619          
##          Neg Pred Value : 0.1864          
##              Prevalence : 0.9193          
##          Detection Rate : 0.6854          
##    Detection Prevalence : 0.7126          
##       Balanced Accuracy : 0.7047          
##                                           
##        'Positive' Class : FALSE           
## 

Predictions on Test Data

# predict on test data
predictions_test <- predict(rf_model, credit_test_set)$predictions

Test Metrics

# get confusion matrix
conf_matrix_test <- confusionMatrix(predictions_test, credit_test_set$DEFAULT)
print(conf_matrix_test)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE  TRUE
##      FALSE 62746  2948
##      TRUE  21977  4493
##                                           
##                Accuracy : 0.7296          
##                  95% CI : (0.7267, 0.7324)
##     No Information Rate : 0.9193          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.159           
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.7406          
##             Specificity : 0.6038          
##          Pos Pred Value : 0.9551          
##          Neg Pred Value : 0.1697          
##              Prevalence : 0.9193          
##          Detection Rate : 0.6808          
##    Detection Prevalence : 0.7128          
##       Balanced Accuracy : 0.6722          
##                                           
##        'Positive' Class : FALSE           
## 

AUC Train Set

# train RF for AUC calculation
rf_AUC <- ranger(formula = DEFAULT ~ ., data = credit_train_set, 
                   num.trees = 500, max.depth = 10, oob.error = TRUE, probability = TRUE, class.weights = class_weights, importance = 'impurity', seed = 1234)
## Growing trees.. Progress: 66%. Estimated remaining time: 15 seconds.
# train probabilities
probs_train <- predict(rf_AUC, data = credit_train_set)$predictions[, 2]

# train predictions
pred_train <- prediction(probs_train, credit_train_set$DEFAULT)

# train trp and fpr performance metrics
perf_train <- performance(pred_train, measure = "tpr", x.measure = "fpr")

# train AUC performance
auc_perf_train <- performance(pred_train, measure = "auc")

# train AUC calculation
auc_value_train <- auc_perf_train@y.values[[1]]

# print AUC value
print(paste("AUC =", round(auc_value_train, 4)))
## [1] "AUC = 0.8098"

AUC Test Set

# test test probabilities
probs_test <- predict(rf_AUC, data = credit_test_set)$predictions[, 2]

# test set predictions
pred_test <- prediction(probs_test, credit_test_set$DEFAULT)

# test set tpr and fpr performance
perf_test <- performance(pred_test, measure = "tpr", x.measure = "fpr")

# test set AUC performance
auc_perf_test <- performance(pred_test, measure = "auc")

# test set AUC calcualtion
auc_value_test <- auc_perf_test@y.values[[1]]

# print AUC value
print(paste("AUC =", round(auc_value_test, 4)))
## [1] "AUC = 0.7393"

Model 2

# train RF with 800 trees and 20 depth
rf_model1 <- ranger(formula = DEFAULT ~ ., data = credit_train_set, , importance = 'impurity',
                   num.trees = 800, max.depth = 20, class.weights = class_weights, seed = 1234)
## Growing trees.. Progress: 19%. Estimated remaining time: 2 minutes, 13 seconds.
## Growing trees.. Progress: 40%. Estimated remaining time: 1 minute, 34 seconds.
## Growing trees.. Progress: 62%. Estimated remaining time: 58 seconds.
## Growing trees.. Progress: 85%. Estimated remaining time: 21 seconds.
# train predictions
predictions_train1 <- predict(rf_model1, credit_train_set)$predictions

# test predictions
predictions_test1 <- predict(rf_model1, credit_test_set)$predictions

# conf matrix train
conf_matrix_train1 <- confusionMatrix(predictions_train1, credit_train_set$DEFAULT)
print(conf_matrix_train1)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  FALSE   TRUE
##      FALSE 194760   3638
##      TRUE    2929  13725
##                                           
##                Accuracy : 0.9695          
##                  95% CI : (0.9687, 0.9702)
##     No Information Rate : 0.9193          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7904          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9852          
##             Specificity : 0.7905          
##          Pos Pred Value : 0.9817          
##          Neg Pred Value : 0.8241          
##              Prevalence : 0.9193          
##          Detection Rate : 0.9056          
##    Detection Prevalence : 0.9226          
##       Balanced Accuracy : 0.8878          
##                                           
##        'Positive' Class : FALSE           
## 
# conf matrix train
conf_matrix_test1 <- confusionMatrix(predictions_test1, credit_test_set$DEFAULT)
print(conf_matrix_test1)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE  TRUE
##      FALSE 82097  6460
##      TRUE   2626   981
##                                           
##                Accuracy : 0.9014          
##                  95% CI : (0.8995, 0.9033)
##     No Information Rate : 0.9193          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1318          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9690          
##             Specificity : 0.1318          
##          Pos Pred Value : 0.9271          
##          Neg Pred Value : 0.2720          
##              Prevalence : 0.9193          
##          Detection Rate : 0.8908          
##    Detection Prevalence : 0.9609          
##       Balanced Accuracy : 0.5504          
##                                           
##        'Positive' Class : FALSE           
## 

AUC Train Set 2

# train AUC for 800 trees, 20 depth
rf_AUC1 <- ranger(formula = DEFAULT ~ ., data = credit_train_set, num.trees = 800, max.depth = 20, probability = TRUE, importance = 'impurity', class.weights = class_weights, seed = 1234)
## Growing trees.. Progress: 22%. Estimated remaining time: 1 minute, 47 seconds.
## Growing trees.. Progress: 46%. Estimated remaining time: 1 minute, 13 seconds.
## Growing trees.. Progress: 70%. Estimated remaining time: 40 seconds.
## Growing trees.. Progress: 94%. Estimated remaining time: 8 seconds.
# probabilities
probs_train1 <- predict(rf_AUC1, data = credit_train_set)$predictions[, 2]

# predictions
pred_train1 <- prediction(probs_train1, credit_train_set$DEFAULT)

# tpr and fpr
perf_train1 <- performance(pred_train1, measure = "tpr", x.measure = "fpr")

# auc performance
auc_perf_train1 <- performance(pred_train1, measure = "auc")

# AUC calculation
auc_value_train1 <- auc_perf_train1@y.values[[1]]

# print AUC value
print(paste("AUC =", round(auc_value_train1, 4)))
## [1] "AUC = 0.9952"

AUC Test Set 2

# probabilities
probs_test1 <- predict(rf_AUC1, data = credit_test_set)$predictions[, 2]

# predictions
pred_test1 <- prediction(probs_test1, credit_test_set$DEFAULT)

# tpr and fpr
perf_test1 <- performance(pred_test1, measure = "tpr", x.measure = "fpr")

# auc performance
auc_perf_test1 <- performance(pred_test1, measure = "auc")

# AUC calculation
auc_value_test1 <- auc_perf_test1@y.values[[1]]

# print AUC value
print(paste("AUC =", round(auc_value_test1, 4)))
## [1] "AUC = 0.7369"

SMOTE Class Balance

application_train_smote %>% pull(DEFAULT) %>% table()
## .
##  FALSE   TRUE 
## 282412 272844
application_train_smote %>% pull(DEFAULT) %>% table() %>% prop.table() %>% round(2)
## .
## FALSE  TRUE 
##  0.51  0.49

SMOTE Data Partitioning

# split data set
creditinTrainsmote <- createDataPartition(application_train_smote$DEFAULT, p=.7, list=FALSE)

# train set
credit_train_smote <- application_train_smote[creditinTrainsmote,]

# test set
credit_test_smote <- application_train_smote[-creditinTrainsmote,]

SMOTE Training the Random Forest

# convert target column to factor
credit_train_smote$DEFAULT <- as.factor(credit_train_smote$DEFAULT)
credit_test_smote$DEFAULT <- as.factor(credit_test_smote$DEFAULT)

# train RF model with 500 trees, 10 depth
rf_modelSMOTE <- ranger(formula = DEFAULT ~ ., data = credit_train_smote, 
                   num.trees = 500, max.depth = 10, oob.error = TRUE, , importance = 'impurity', seed = 1234)
## Growing trees.. Progress: 13%. Estimated remaining time: 3 minutes, 26 seconds.
## Growing trees.. Progress: 28%. Estimated remaining time: 2 minutes, 44 seconds.
## Growing trees.. Progress: 43%. Estimated remaining time: 2 minutes, 7 seconds.
## Growing trees.. Progress: 58%. Estimated remaining time: 1 minute, 34 seconds.
## Growing trees.. Progress: 72%. Estimated remaining time: 1 minute, 1 seconds.
## Growing trees.. Progress: 86%. Estimated remaining time: 30 seconds.
# find and print important variables
importance_SMOTE <- sort(rf_modelSMOTE$variable.importance, decreasing = TRUE)
print(importance_SMOTE)
##                                     GENDER_MALE.Y 
##                                      7.444491e+03 
##                                     GENDER_MALE.N 
##                                      6.685673e+03 
##                                      EXT_SOURCE_3 
##                                      5.408013e+03 
## NAME_EDUCATION_TYPE.Secondary...secondary.special 
##                                      5.064758e+03 
##                                    IMPUTED_EXT1.N 
##                                      4.988043e+03 
##                                    FLAG_OWN_CAR.Y 
##                                      4.599549e+03 
##                                    IMPUTED_EXT1.Y 
##                                      4.534672e+03 
##              NAME_EDUCATION_TYPE.Higher.education 
##                                      4.476700e+03 
##                                    FLAG_OWN_CAR.N 
##                                      4.448189e+03 
##                          REG_CITY_NOT_WORK_CITY.Y 
##                                      4.195708e+03 
##                                      EXT_SOURCE_2 
##                                      3.862359e+03 
##                          REG_CITY_NOT_WORK_CITY.N 
##                                      3.693999e+03 
##                        NAME_FAMILY_STATUS.Married 
##                                      3.689392e+03 
##                          NAME_INCOME_TYPE.Working 
##                                      3.524069e+03 
##                                      FLAG_PHONE.N 
##                                      3.340081e+03 
##                                      FLAG_PHONE.Y 
##                                      3.254170e+03 
##                                      EXT_SOURCE_1 
##                                      2.304645e+03 
##                         LIVE_CITY_NOT_WORK_CITY.Y 
##                                      2.160710e+03 
##                                   CNT_FAM_MEMBERS 
##                                      2.126587e+03 
##                         LIVE_CITY_NOT_WORK_CITY.N 
##                                      2.093209e+03 
##                              REGION_RATING_CLIENT 
##                                      2.080974e+03 
##                                    IMPUTED_EXT3.N 
##                                      1.949196e+03 
##                                 FLAG_OWN_REALTY.N 
##                                      1.941928e+03 
##                          OBS_30_CNT_SOCIAL_CIRCLE 
##                                      1.933572e+03 
##                          OBS_60_CNT_SOCIAL_CIRCLE 
##                                      1.924480e+03 
##                                 FLAG_OWN_REALTY.Y 
##                                      1.898020e+03 
##                       REGION_RATING_CLIENT_W_CITY 
##                                      1.877450e+03 
##             NAME_INCOME_TYPE.Commercial.associate 
##                                      1.671184e+03 
##                                    IMPUTED_EXT3.Y 
##                                      1.659980e+03 
##                          OCCUPATION_TYPE.Laborers 
##                                      1.622203e+03 
##                                       OWN_CAR_AGE 
##                                      1.565750e+03 
##                                      CNT_CHILDREN 
##                                      1.548119e+03 
##                     NAME_TYPE_SUITE.Unaccompanied 
##                                      1.146839e+03 
##                                 FLAG_WORK_PHONE.N 
##                                      1.011400e+03 
##                                 FLAG_WORK_PHONE.Y 
##                                      1.009917e+03 
##                        AMT_REQ_CREDIT_BUREAU_YEAR 
##                                      9.868246e+02 
##          ORGANIZATION_TYPE.Business.Entity.Type.3 
##                                      9.637569e+02 
##           NAME_FAMILY_STATUS.Single...not.married 
##                                      9.252466e+02 
##                          REG_CITY_NOT_LIVE_CITY.Y 
##                                      8.178555e+02 
##                 WEEKDAY_APPR_PROCESS_START.MONDAY 
##                                      7.752334e+02 
##                          REG_CITY_NOT_LIVE_CITY.N 
##                                      7.689803e+02 
##              WEEKDAY_APPR_PROCESS_START.WEDNESDAY 
##                                      7.664687e+02 
##                WEEKDAY_APPR_PROCESS_START.TUESDAY 
##                                      7.431503e+02 
##                            NAME_TYPE_SUITE.Family 
##                                      5.959356e+02 
##               WEEKDAY_APPR_PROCESS_START.THURSDAY 
##                                      5.393774e+02 
##                 NAME_FAMILY_STATUS.Civil.marriage 
##                                      4.870037e+02 
##                 WEEKDAY_APPR_PROCESS_START.FRIDAY 
##                                      4.582626e+02 
##                          DEF_30_CNT_SOCIAL_CIRCLE 
##                                      4.403277e+02 
##                           OCCUPATION_TYPE.Drivers 
##                                      4.354296e+02 
##                   ORGANIZATION_TYPE.Self.employed 
##                                      4.195951e+02 
##                               OCCUPATION_TYPE.XNA 
##                                      4.125629e+02 
##                       OCCUPATION_TYPE.Sales.staff 
##                                      3.530636e+02 
##                          DEF_60_CNT_SOCIAL_CIRCLE 
##                                      2.644528e+02 
##               NAME_HOUSING_TYPE.House...apartment 
##                                      2.511268e+02 
##                      NAME_FAMILY_STATUS.Separated 
##                                      2.124115e+02 
##                        REGION_POPULATION_RELATIVE 
##                                      1.999970e+02 
##               WEEKDAY_APPR_PROCESS_START.SATURDAY 
##                                      1.906710e+02 
##                          NAME_FAMILY_STATUS.Widow 
##                                      1.855194e+02 
##                    NAME_INCOME_TYPE.State.servant 
##                                      1.732414e+02 
##                        OCCUPATION_TYPE.Core.staff 
##                                      1.687093e+02 
##             NAME_EDUCATION_TYPE.Incomplete.higher 
##                                      1.265142e+02 
##                            DAYS_LAST_PHONE_CHANGE 
##                                      1.219585e+02 
##                      REG_REGION_NOT_WORK_REGION.N 
##                                      9.229820e+01 
##                                        DAYS_BIRTH 
##                                      8.202299e+01 
##                          OCCUPATION_TYPE.Managers 
##                                      7.451275e+01 
##                      REG_REGION_NOT_WORK_REGION.Y 
##                                      6.004381e+01 
##                    NAME_HOUSING_TYPE.With.parents 
##                                      5.620894e+01 
##                         AMT_REQ_CREDIT_BUREAU_MON 
##                                      5.205232e+01 
##                                     DAYS_EMPLOYED 
##                                      5.136552e+01 
##                                   DAYS_ID_PUBLISH 
##                                      4.754036e+01 
##                           HOUR_APPR_PROCESS_START 
##                                      3.967130e+01 
##                      REG_REGION_NOT_LIVE_REGION.N 
##                                      3.939734e+01 
##               NAME_EDUCATION_TYPE.Lower.secondary 
##                                      3.714776e+01 
##                     LIVE_REGION_NOT_WORK_REGION.N 
##                                      3.489076e+01 
##                     LIVE_REGION_NOT_WORK_REGION.Y 
##                                      3.260822e+01 
##                                  AMT_INCOME_TOTAL 
##                                      2.977919e+01 
##                      REG_REGION_NOT_LIVE_REGION.Y 
##                                      2.973562e+01 
##                                        AMT_CREDIT 
##                                      2.937600e+01 
##                                   AMT_GOODS_PRICE 
##                                      2.905561e+01 
##                                 DAYS_REGISTRATION 
##                                      2.630463e+01 
##                                      FLAG_EMAIL.Y 
##                                      2.622571e+01 
##                   NAME_TYPE_SUITE.Spouse..partner 
##                                      2.534514e+01 
##                                       AMT_ANNUITY 
##                                      2.511945e+01 
##                                      FLAG_EMAIL.N 
##                                      2.425832e+01 
##             NAME_HOUSING_TYPE.Municipal.apartment 
##                                      2.240918e+01 
##                         AMT_REQ_CREDIT_BUREAU_QRT 
##                                      2.141124e+01 
##                        NAME_INCOME_TYPE.Pensioner 
##                                      1.911535e+01 
##                NAME_HOUSING_TYPE.Rented.apartment 
##                                      1.661527e+01 
##                                  FLAG_EMP_PHONE.Y 
##                                      1.607607e+01 
##                           ORGANIZATION_TYPE.Other 
##                                      1.580644e+01 
##                             ORGANIZATION_TYPE.XNA 
##                                      1.557004e+01 
##                    OCCUPATION_TYPE.Security.staff 
##                                      1.523310e+01 
##                 WEEKDAY_APPR_PROCESS_START.SUNDAY 
##                                      1.522241e+01 
##          ORGANIZATION_TYPE.Business.Entity.Type.2 
##                                      1.334574e+01 
##                                  FLAG_EMP_PHONE.N 
##                                      1.280340e+01 
##                    ORGANIZATION_TYPE.Construction 
##                                      1.209058e+01 
##                       OCCUPATION_TYPE.Accountants 
##                                      1.160384e+01 
##             OCCUPATION_TYPE.High.skill.tech.staff 
##                                      1.027786e+01 
##                        ORGANIZATION_TYPE.Medicine 
##                                      9.253402e+00 
##                   ORGANIZATION_TYPE.Trade..type.7 
##                                      8.839607e+00 
##                          NAME_TYPE_SUITE.Children 
##                                      7.177885e+00 
##                        AMT_REQ_CREDIT_BUREAU_WEEK 
##                                      6.396039e+00 
##                      ORGANIZATION_TYPE.Government 
##                                      5.756175e+00 
##                     OCCUPATION_TYPE.Cooking.staff 
##                                      5.673210e+00 
##               ORGANIZATION_TYPE.Transport..type.4 
##                                      5.656496e+00 
##                    OCCUPATION_TYPE.Medicine.staff 
##                                      5.498736e+00 
##                    ORGANIZATION_TYPE.Kindergarten 
##                                      5.023972e+00 
##                   ORGANIZATION_TYPE.Trade..type.3 
##                                      4.795816e+00 
##                OCCUPATION_TYPE.Low.skill.Laborers 
##                                      4.714950e+00 
##                        ORGANIZATION_TYPE.Security 
##                                      4.481458e+00 
##               ORGANIZATION_TYPE.Transport..type.3 
##                                      4.383485e+00 
##                                       CASH_LOAN.N 
##                                      4.378394e+00 
##                          ORGANIZATION_TYPE.School 
##                                      4.332550e+00 
##                                       CASH_LOAN.Y 
##                                      3.978249e+00 
##          ORGANIZATION_TYPE.Business.Entity.Type.1 
##                                      3.946746e+00 
##                ORGANIZATION_TYPE.Industry..type.3 
##                                      3.305940e+00 
##                    OCCUPATION_TYPE.Cleaning.staff 
##                                      3.299557e+00 
##                      ORGANIZATION_TYPE.Restaurant 
##                                      3.163058e+00 
##                         AMT_REQ_CREDIT_BUREAU_DAY 
##                                      3.046038e+00 
##                           NAME_TYPE_SUITE.Other_B 
##                                      3.030730e+00 
##                          ORGANIZATION_TYPE.Postal 
##                                      2.726748e+00 
##                        AMT_REQ_CREDIT_BUREAU_HOUR 
##                                      2.578747e+00 
##              OCCUPATION_TYPE.Waiters.barmen.staff 
##                                      2.320473e+00 
##                     ORGANIZATION_TYPE.Agriculture 
##                                      2.254574e+00 
##                         ORGANIZATION_TYPE.Telecom 
##                                      2.185969e+00 
##                   ORGANIZATION_TYPE.Trade..type.2 
##                                      2.168890e+00 
##                            ORGANIZATION_TYPE.Bank 
##                                      2.098756e+00 
##                        ORGANIZATION_TYPE.Military 
##                                      2.044563e+00 
##                NAME_HOUSING_TYPE.Office.apartment 
##                                      2.008419e+00 
##                 NAME_HOUSING_TYPE.Co.op.apartment 
##                                      1.789164e+00 
##                          ORGANIZATION_TYPE.Police 
##                                      1.764229e+00 
##                         ORGANIZATION_TYPE.Housing 
##                                      1.762673e+00 
##                         ORGANIZATION_TYPE.Realtor 
##                                      1.731531e+00 
##                       OCCUPATION_TYPE.Secretaries 
##                                      1.687042e+00 
##                ORGANIZATION_TYPE.Industry..type.4 
##                                      1.633106e+00 
##                ORGANIZATION_TYPE.Industry..type.1 
##                                      1.524414e+00 
##             OCCUPATION_TYPE.Private.service.staff 
##                                      1.486261e+00 
##               ORGANIZATION_TYPE.Industry..type.11 
##                                      1.428360e+00 
##                ORGANIZATION_TYPE.Industry..type.7 
##                                      1.423483e+00 
##                           NAME_TYPE_SUITE.Other_A 
##                                      1.415821e+00 
##                                    IMPUTED_EXT2.Y 
##                                      1.266612e+00 
##                                    IMPUTED_EXT2.N 
##                                      1.262543e+00 
##                        ORGANIZATION_TYPE.Services 
##                                      1.261261e+00 
##                     OCCUPATION_TYPE.Realty.agents 
##                                      1.259923e+00 
##               ORGANIZATION_TYPE.Transport..type.2 
##                                      1.252802e+00 
##                ORGANIZATION_TYPE.Industry..type.9 
##                                      1.238143e+00 
##                          ORGANIZATION_TYPE.Mobile 
##                                      1.184110e+00 
##                   NAME_TYPE_SUITE.Group.of.people 
##                                      1.177230e+00 
##                       ORGANIZATION_TYPE.Insurance 
##                                      1.150800e+00 
##                   ORGANIZATION_TYPE.Trade..type.1 
##                                      1.060392e+00 
##                  ORGANIZATION_TYPE.Legal.Services 
##                                      1.025855e+00 
##                      ORGANIZATION_TYPE.University 
##                                      9.743315e-01 
##                ORGANIZATION_TYPE.Industry..type.5 
##                                      9.693173e-01 
##                   ORGANIZATION_TYPE.Trade..type.6 
##                                      8.035085e-01 
##                          OCCUPATION_TYPE.HR.staff 
##                                      7.788360e-01 
##                     ORGANIZATION_TYPE.Advertising 
##                                      7.752790e-01 
##                                FLAG_CONT_MOBILE.Y 
##                                      7.474219e-01 
##             ORGANIZATION_TYPE.Security.Ministries 
##                                      7.448791e-01 
##                         ORGANIZATION_TYPE.Culture 
##                                      7.303800e-01 
##                                FLAG_CONT_MOBILE.N 
##                                      7.061707e-01 
##                     ORGANIZATION_TYPE.Electricity 
##                                      6.767039e-01 
##                       ORGANIZATION_TYPE.Emergency 
##                                      6.762616e-01 
##                           ORGANIZATION_TYPE.Hotel 
##                                      6.564189e-01 
##                        ORGANIZATION_TYPE.Cleaning 
##                                      6.155889e-01 
##                        ORGANIZATION_TYPE.Religion 
##                                      4.909636e-01 
##               ORGANIZATION_TYPE.Industry..type.10 
##                                      4.837548e-01 
##                ORGANIZATION_TYPE.Industry..type.2 
##                                      4.400813e-01 
##                ORGANIZATION_TYPE.Industry..type.6 
##                                      4.375614e-01 
##                          OCCUPATION_TYPE.IT.staff 
##                                      4.202654e-01 
##               ORGANIZATION_TYPE.Transport..type.1 
##                                      4.154291e-01 
##               ORGANIZATION_TYPE.Industry..type.12 
##                                      3.058339e-01 
##                       NAME_INCOME_TYPE.Unemployed 
##                                      2.847856e-01 
##                   ORGANIZATION_TYPE.Trade..type.5 
##                                      2.199512e-01 
##               NAME_EDUCATION_TYPE.Academic.degree 
##                                      1.509186e-01 
##                  NAME_INCOME_TYPE.Maternity.leave 
##                                      1.422529e-01 
##               ORGANIZATION_TYPE.Industry..type.13 
##                                      1.420683e-01 
##                   ORGANIZATION_TYPE.Trade..type.4 
##                                      8.897049e-02 
##                ORGANIZATION_TYPE.Industry..type.8 
##                                      7.190292e-02 
##                          NAME_INCOME_TYPE.Student 
##                                      2.666667e-03 
##                      NAME_INCOME_TYPE.Businessman 
##                                      0.000000e+00 
##                        NAME_FAMILY_STATUS.Unknown 
##                                      0.000000e+00

SMOTE Predictions on Train Data

# predict classes on train data
predictions_trainSMOTE <- predict(rf_modelSMOTE, credit_train_smote)$predictions

SMOTE Train Metrics

# get confusion matrix
conf_matrix_trainSMOTE <- confusionMatrix(predictions_trainSMOTE, credit_train_smote$DEFAULT)
print(conf_matrix_trainSMOTE)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  FALSE   TRUE
##      FALSE 197681  22499
##      TRUE       8 168492
##                                           
##                Accuracy : 0.9421          
##                  95% CI : (0.9414, 0.9428)
##     No Information Rate : 0.5086          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8839          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.8822          
##          Pos Pred Value : 0.8978          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.5086          
##          Detection Rate : 0.5086          
##    Detection Prevalence : 0.5665          
##       Balanced Accuracy : 0.9411          
##                                           
##        'Positive' Class : FALSE           
## 

SMOTE Predictions on Test Data

# predict on test data
predictions_testSMOTE <- predict(rf_modelSMOTE, credit_test_smote)$predictions

SMOTE Test Metrics

# get confusion matrix
conf_matrix_testSMOTE <- confusionMatrix(predictions_testSMOTE, credit_test_smote$DEFAULT)
print(conf_matrix_testSMOTE)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE  TRUE
##      FALSE 84719  9659
##      TRUE      4 72194
##                                           
##                Accuracy : 0.942           
##                  95% CI : (0.9409, 0.9431)
##     No Information Rate : 0.5086          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8837          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.8820          
##          Pos Pred Value : 0.8977          
##          Neg Pred Value : 0.9999          
##              Prevalence : 0.5086          
##          Detection Rate : 0.5086          
##    Detection Prevalence : 0.5666          
##       Balanced Accuracy : 0.9410          
##                                           
##        'Positive' Class : FALSE           
## 

SMOTE AUC Train Set

# train RF for AUC calculation
rf_AUCSMOTE <- ranger(formula = DEFAULT ~ ., data = credit_train_smote, 
                   num.trees = 500, max.depth = 10, oob.error = TRUE, probability = TRUE, importance = 'impurity', seed = 1234)
## Growing trees.. Progress: 13%. Estimated remaining time: 3 minutes, 20 seconds.
## Growing trees.. Progress: 28%. Estimated remaining time: 2 minutes, 41 seconds.
## Growing trees.. Progress: 42%. Estimated remaining time: 2 minutes, 7 seconds.
## Growing trees.. Progress: 57%. Estimated remaining time: 1 minute, 34 seconds.
## Growing trees.. Progress: 72%. Estimated remaining time: 1 minute, 1 seconds.
## Growing trees.. Progress: 86%. Estimated remaining time: 31 seconds.
# train probabilities
probs_trainSMOTE <- predict(rf_AUCSMOTE, data = credit_train_smote)$predictions[, 2]

# train predictions
pred_trainSMOTE <- prediction(probs_trainSMOTE, credit_train_smote$DEFAULT)

# train trp and fpr performance metrics
perf_trainSMOTE <- performance(pred_trainSMOTE, measure = "tpr", x.measure = "fpr")

# train AUC performance
auc_perf_trainSMOTE <- performance(pred_trainSMOTE, measure = "auc")

# train AUC calculation
auc_value_trainSMOTE <- auc_perf_trainSMOTE@y.values[[1]]

# print AUC value
print(paste("AUC =", round(auc_value_trainSMOTE, 4)))
## [1] "AUC = 0.9738"

SMOTE AUC Test Set

# test test probabilities
probs_testSMOTE <- predict(rf_AUCSMOTE, data = credit_test_smote)$predictions[, 2]

# test set predictions
pred_testSMOTE <- prediction(probs_testSMOTE, credit_test_smote$DEFAULT)

# test set tpr and fpr performance
perf_testSMOTE <- performance(pred_testSMOTE, measure = "tpr", x.measure = "fpr")

# test set AUC performance
auc_perf_testSMOTE <- performance(pred_testSMOTE, measure = "auc")

# test set AUC calculation
auc_value_testSMOTE <- auc_perf_testSMOTE@y.values[[1]]

# print AUC value
print(paste("AUC =", round(auc_value_testSMOTE, 4)))
## [1] "AUC = 0.9714"

SMOTE Model 2

# train RF with 800 trees and 20 depth
rf_model1smote <- ranger(formula = DEFAULT ~ ., data = credit_train_smote, , importance = 'impurity',
                   num.trees = 800, max.depth = 20, seed = 1234)
## Growing trees.. Progress: 5%. Estimated remaining time: 11 minutes, 20 seconds.
## Growing trees.. Progress: 10%. Estimated remaining time: 9 minutes, 53 seconds.
## Growing trees.. Progress: 16%. Estimated remaining time: 8 minutes, 38 seconds.
## Growing trees.. Progress: 21%. Estimated remaining time: 8 minutes, 1 seconds.
## Growing trees.. Progress: 27%. Estimated remaining time: 7 minutes, 18 seconds.
## Growing trees.. Progress: 32%. Estimated remaining time: 6 minutes, 48 seconds.
## Growing trees.. Progress: 38%. Estimated remaining time: 6 minutes, 11 seconds.
## Growing trees.. Progress: 43%. Estimated remaining time: 5 minutes, 37 seconds.
## Growing trees.. Progress: 49%. Estimated remaining time: 5 minutes, 1 seconds.
## Growing trees.. Progress: 55%. Estimated remaining time: 4 minutes, 26 seconds.
## Growing trees.. Progress: 60%. Estimated remaining time: 3 minutes, 55 seconds.
## Growing trees.. Progress: 66%. Estimated remaining time: 3 minutes, 19 seconds.
## Growing trees.. Progress: 71%. Estimated remaining time: 2 minutes, 48 seconds.
## Growing trees.. Progress: 77%. Estimated remaining time: 2 minutes, 15 seconds.
## Growing trees.. Progress: 82%. Estimated remaining time: 1 minute, 42 seconds.
## Growing trees.. Progress: 88%. Estimated remaining time: 1 minute, 10 seconds.
## Growing trees.. Progress: 93%. Estimated remaining time: 38 seconds.
## Growing trees.. Progress: 100%. Estimated remaining time: 2 seconds.
# train predictions
predictions_train1smote <- predict(rf_model1smote, credit_train_smote)$predictions

# test predictions
predictions_test1smote <- predict(rf_model1smote, credit_test_smote)$predictions

# conf matrix train
conf_matrix_train1smote <- confusionMatrix(predictions_train1smote, credit_train_smote$DEFAULT)
print(conf_matrix_train1smote)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  FALSE   TRUE
##      FALSE 197689  16801
##      TRUE       0 174190
##                                           
##                Accuracy : 0.9568          
##                  95% CI : (0.9561, 0.9574)
##     No Information Rate : 0.5086          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9134          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.9120          
##          Pos Pred Value : 0.9217          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.5086          
##          Detection Rate : 0.5086          
##    Detection Prevalence : 0.5518          
##       Balanced Accuracy : 0.9560          
##                                           
##        'Positive' Class : FALSE           
## 
# conf matrix train
conf_matrix_test1smote <- confusionMatrix(predictions_test1smote, credit_test_smote$DEFAULT)
print(conf_matrix_test1smote)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE  TRUE
##      FALSE 84723  7568
##      TRUE      0 74285
##                                           
##                Accuracy : 0.9546          
##                  95% CI : (0.9536, 0.9556)
##     No Information Rate : 0.5086          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.909           
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.9075          
##          Pos Pred Value : 0.9180          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.5086          
##          Detection Rate : 0.5086          
##    Detection Prevalence : 0.5540          
##       Balanced Accuracy : 0.9538          
##                                           
##        'Positive' Class : FALSE           
## 

SMOTE AUC Train Set 2

# train AUC for 800 trees, 20 depth
rf_AUC1smote <- ranger(formula = DEFAULT ~ ., data = credit_train_smote, num.trees = 800, max.depth = 20, probability = TRUE, importance = 'impurity', seed = 1234)
## Growing trees.. Progress: 5%. Estimated remaining time: 11 minutes, 20 seconds.
## Growing trees.. Progress: 11%. Estimated remaining time: 9 minutes, 6 seconds.
## Growing trees.. Progress: 16%. Estimated remaining time: 8 minutes, 24 seconds.
## Growing trees.. Progress: 22%. Estimated remaining time: 7 minutes, 40 seconds.
## Growing trees.. Progress: 28%. Estimated remaining time: 6 minutes, 59 seconds.
## Growing trees.. Progress: 33%. Estimated remaining time: 6 minutes, 25 seconds.
## Growing trees.. Progress: 39%. Estimated remaining time: 5 minutes, 57 seconds.
## Growing trees.. Progress: 44%. Estimated remaining time: 5 minutes, 22 seconds.
## Growing trees.. Progress: 50%. Estimated remaining time: 4 minutes, 48 seconds.
## Growing trees.. Progress: 56%. Estimated remaining time: 4 minutes, 14 seconds.
## Growing trees.. Progress: 61%. Estimated remaining time: 3 minutes, 46 seconds.
## Growing trees.. Progress: 66%. Estimated remaining time: 3 minutes, 13 seconds.
## Growing trees.. Progress: 72%. Estimated remaining time: 2 minutes, 43 seconds.
## Growing trees.. Progress: 77%. Estimated remaining time: 2 minutes, 11 seconds.
## Growing trees.. Progress: 83%. Estimated remaining time: 1 minute, 40 seconds.
## Growing trees.. Progress: 88%. Estimated remaining time: 1 minute, 7 seconds.
## Growing trees.. Progress: 94%. Estimated remaining time: 37 seconds.
## Growing trees.. Progress: 100%. Estimated remaining time: 2 seconds.
# probabilities
probs_train1smote <- predict(rf_AUC1smote, data = credit_train_smote)$predictions[, 2]

# predictions
pred_train1smote <- prediction(probs_train1smote, credit_train_smote$DEFAULT)

# tpr and fpr
perf_train1smote <- performance(pred_train1smote, measure = "tpr", x.measure = "fpr")

# auc performance
auc_perf_train1smote <- performance(pred_train1smote, measure = "auc")

# AUC calculation
auc_value_train1smote <- auc_perf_train1smote@y.values[[1]]

# print AUC value
print(paste("AUC =", round(auc_value_train1smote, 4)))
## [1] "AUC = 0.9987"

SMOTE AUC Test Set 2

# probabilities
probs_test1smote <- predict(rf_AUC1smote, data = credit_test_smote)$predictions[, 2]

# predictions
pred_test1smote <- prediction(probs_test1smote, credit_test_smote$DEFAULT)

# tpr and fpr
perf_test1smote <- performance(pred_test1smote, measure = "tpr", x.measure = "fpr")

# auc performance
auc_perf_test1smote <- performance(pred_test1smote, measure = "auc")

# AUC calculation
auc_value_test1smote <- auc_perf_test1smote@y.values[[1]]

# print AUC value
print(paste("AUC =", round(auc_value_test1smote, 4)))
## [1] "AUC = 0.9781"