Classification Trees - XGBoost

Author: Ricardo Accioly

Published: November 24, 2024

Libraries

library(tidyverse)
library(ISLR)

Data

data(Default)
summary(Default)
 default    student       balance           income     
 No :9667   No :7056   Min.   :   0.0   Min.   :  772  
 Yes: 333   Yes:2944   1st Qu.: 481.7   1st Qu.:21340  
                       Median : 823.6   Median :34553  
                       Mean   : 835.4   Mean   :33517  
                       3rd Qu.:1166.3   3rd Qu.:43808  
                       Max.   :2654.3   Max.   :73554  
str(Default)
'data.frame':   10000 obs. of  4 variables:
 $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
 $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
 $ balance: num  730 817 1074 529 786 ...
 $ income : num  44362 12106 31767 35704 38463 ...
head(Default)
  default student   balance    income
1      No      No  729.5265 44361.625
2      No     Yes  817.1804 12106.135
3      No      No 1073.5492 31767.139
4      No      No  529.2506 35704.494
5      No      No  785.6559 38463.496
6      No     Yes  919.5885  7491.559

Manipulating the data

credito <- as_tibble(Default)
summary(credito)
 default    student       balance           income     
 No :9667   No :7056   Min.   :   0.0   Min.   :  772  
 Yes: 333   Yes:2944   1st Qu.: 481.7   1st Qu.:21340  
                       Median : 823.6   Median :34553  
                       Mean   : 835.4   Mean   :33517  
                       3rd Qu.:1166.3   3rd Qu.:43808  
                       Max.   :2654.3   Max.   :73554  
# renaming the columns
credito <- credito %>% 
                rename(inadimplente = default, estudante = student,
                       balanco = balance, receita = income)
# recoding the response labels and turning them back into a factor
credito <- credito %>% mutate( inadimplente =  case_when(
                           inadimplente == "No"  ~ "Nao",
                           inadimplente == "Yes" ~ "Sim"
                          )) %>% mutate(inadimplente = factor(inadimplente))
# recoding estudante as 0/1, since xgboost only accepts numeric features
credito <- credito %>% mutate( estudante =  case_when(
                           estudante == "No"  ~ 0,
                           estudante == "Yes" ~ 1
                          )) 

str(credito)
tibble [10,000 × 4] (S3: tbl_df/tbl/data.frame)
 $ inadimplente: Factor w/ 2 levels "Nao","Sim": 1 1 1 1 1 1 1 1 1 1 ...
 $ estudante   : num [1:10000] 0 1 0 0 0 1 0 1 0 0 ...
 $ balanco     : num [1:10000] 730 817 1074 529 786 ...
 $ receita     : num [1:10000] 44362 12106 31767 35704 38463 ...
summary(credito)
 inadimplente   estudante         balanco          receita     
 Nao:9667     Min.   :0.0000   Min.   :   0.0   Min.   :  772  
 Sim: 333     1st Qu.:0.0000   1st Qu.: 481.7   1st Qu.:21340  
              Median :0.0000   Median : 823.6   Median :34553  
              Mean   :0.2944   Mean   : 835.4   Mean   :33517  
              3rd Qu.:1.0000   3rd Qu.:1166.3   3rd Qu.:43808  
              Max.   :1.0000   Max.   :2654.3   Max.   :73554  
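
XGBoost works on numeric matrices only, which is why estudante was recoded to 0/1 above. With factors of more than two levels, the usual route is a one-hot encoding via model.matrix. A minimal sketch of that alternative (x_credito is an illustrative name and is not used below):

# build a numeric design matrix, with dummy columns for any factor predictors
x_credito <- model.matrix(inadimplente ~ . - 1, data = credito)
head(x_credito)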

Train and Test

library(caret)
set.seed(21)
y <- credito$inadimplente
# stratified 80/20 split on the response
indice_teste <- createDataPartition(y, times = 1, p = 0.2, list = FALSE)

x_conj_treino <- credito %>% slice(-indice_teste) %>% select(-inadimplente)
x_conj_teste <- credito %>% slice(indice_teste) %>% select(-inadimplente)
# xgboost expects numeric labels, so the factor becomes 0 ("Nao") / 1 ("Sim")
y_treino <- credito %>% slice(-indice_teste) %>% select(inadimplente) 
y_treino <- as.integer(unlist(y_treino)) - 1
y_teste <- credito %>% slice(indice_teste) %>% select(inadimplente)
y_teste <- as.integer(unlist(y_teste)) - 1
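
Because createDataPartition samples within the levels of y, the strong class imbalance (333 "Sim" out of 10,000) is preserved in both sets. A quick check, shown here only for illustration:

# label proportions in each set -- both should be close to 0.967 / 0.033
prop.table(table(y_treino))
prop.table(table(y_teste))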

Training

## 1st attempt with XGBoost
library(xgboost)
num_class <- 2
params <- list(
  booster = "gbtree",
  eta = 0.001,                  # very low learning rate, hence many rounds
  max_depth = 5,
  gamma = 3,                    # minimum loss reduction required to split
  subsample = 0.75,             # row subsampling per tree
  colsample_bytree = 1,
  objective = "multi:softprob", # outputs one probability per class
  eval_metric = "mlogloss",
  num_class = num_class
)
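
For a two-class target, binary:logistic is the more common objective and returns a single probability per row; multi:softprob also works, as used here, but produces one column per class. An equivalent parameter list under that alternative, for comparison only (params_bin is an illustrative name, not used below):

params_bin <- list(
  booster = "gbtree",
  eta = 0.001,
  max_depth = 5,
  gamma = 3,
  subsample = 0.75,
  colsample_bytree = 1,
  objective = "binary:logistic", # single P(y = 1) per row, no num_class needed
  eval_metric = "logloss"
)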

set.seed(21)
cv <- xgb.cv(data = as.matrix(x_conj_treino), label = as.matrix(y_treino), params = params, 
             nrounds = 10000, nfold = 5, early_stopping_rounds = 10, 
             nthread = 1, verbose = FALSE)
# cv
elog <- as.data.frame(cv$evaluation_log)
elog %>% 
   summarize(ntrees.train = which.min(train_mlogloss_mean),  
             ntrees.test  = which.min(test_mlogloss_mean))   
  ntrees.train ntrees.test
1         6678        6668
(nrounds <- which.min(elog$test_mlogloss_mean))
[1] 6668
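
With eta = 0.001 the ensemble learns very slowly, which is why the test mlogloss only bottoms out after roughly 6,700 rounds. Plotting the evaluation log makes this choice of nrounds visible; a small sketch built on the elog data frame above:

# cross-validated mlogloss per boosting round, train vs. test folds
elog %>% 
  select(iter, train_mlogloss_mean, test_mlogloss_mean) %>% 
  pivot_longer(-iter, names_to = "conjunto", values_to = "mlogloss") %>% 
  ggplot(aes(x = iter, y = mlogloss, color = conjunto)) +
  geom_line()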

Final Model

modelo_xgb <- xgboost(data = as.matrix(x_conj_treino), label = as.matrix(y_treino),
             params=params, nrounds = nrounds, verbose = FALSE)
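
Training at this learning rate takes a while, so it can be worth persisting the fitted booster. xgb.save and xgb.load are the package's own serialization functions; the file name below is arbitrary and just for illustration:

# save the fitted booster to disk (and reload it later if needed)
xgb.save(modelo_xgb, "modelo_xgb.bin")
# modelo_xgb <- xgb.load("modelo_xgb.bin")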

Variable Importance

importancia <- xgb.importance(model = modelo_xgb)
importancia
     Feature        Gain       Cover  Frequency
      <char>       <num>       <num>      <num>
1:   balanco 0.936743463 0.938791966 0.74828024
2:   receita 0.057924670 0.052455552 0.22517788
3: estudante 0.005331867 0.008752483 0.02654188
xgb.plot.importance(importancia)

Predictions

# multi:softprob with reshape = TRUE returns an n x num_class probability matrix
x_conj_teste$prev <- predict(modelo_xgb, as.matrix(x_conj_teste), reshape = TRUE)
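
Each row of prev carries the two class probabilities: column 1 is P("Nao") and column 2 is P("Sim"), and they sum to 1. A quick sanity check, for illustration only:

# the two columns of the probability matrix should sum to 1 in every row
head(x_conj_teste$prev)
summary(rowSums(x_conj_teste$prev))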

Evaluating

# column 1 is P(class 0, "Nao"): above 0.5 predict 0, otherwise predict 1
xgb.ychapeu <- as.factor(ifelse(x_conj_teste$prev[,1] > 0.5, 0, 1))
confusionMatrix(xgb.ychapeu, as.factor(y_teste), positive = "1")
Confusion Matrix and Statistics

          Reference
Prediction    0    1
         0 1924   38
         1   10   29
                                          
               Accuracy : 0.976           
                 95% CI : (0.9683, 0.9823)
    No Information Rate : 0.9665          
    P-Value [Acc > NIR] : 0.008318        
                                          
                  Kappa : 0.5357          
                                          
 Mcnemar's Test P-Value : 9.735e-05       
                                          
            Sensitivity : 0.43284         
            Specificity : 0.99483         
         Pos Pred Value : 0.74359         
         Neg Pred Value : 0.98063         
             Prevalence : 0.03348         
         Detection Rate : 0.01449         
   Detection Prevalence : 0.01949         
      Balanced Accuracy : 0.71383         
                                          
       'Positive' Class : 1
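
The fixed 0.5 cutoff, combined with the class imbalance, explains the high accuracy but modest sensitivity (0.43). A threshold-free summary is the ROC curve and its AUC; a sketch assuming the pROC package is installed (it is not loaded elsewhere in this post, and roc_xgb is an illustrative name):

library(pROC)
# use the probability of class 1 ("Sim") as the score
roc_xgb <- roc(response = y_teste, predictor = x_conj_teste$prev[, 2])
auc(roc_xgb)
plot(roc_xgb)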