Regressão Logística - SMOTE

Author

Ricardo Accioly

Published

August 20, 2024

Carregando Bibliotecas

library(tidyverse)
library(ISLR)
data(Default)
summary(Default)

 default    student       balance           income     
 No :9667   No :7056   Min.   :   0.0   Min.   :  772  
 Yes: 333   Yes:2944   1st Qu.: 481.7   1st Qu.:21340  
                       Median : 823.6   Median :34553  
                       Mean   : 835.4   Mean   :33517  
                       3rd Qu.:1166.3   3rd Qu.:43808  
                       Max.   :2654.3   Max.   :73554

str(Default)

'data.frame':   10000 obs. of  4 variables:
 $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
 $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
 $ balance: num  730 817 1074 529 786 ...
 $ income : num  44362 12106 31767 35704 38463 ...

head(Default)

  default student   balance    income
1      No      No  729.5265 44361.625
2      No     Yes  817.1804 12106.135
3      No      No 1073.5492 31767.139
4      No      No  529.2506 35704.494
5      No      No  785.6559 38463.496
6      No     Yes  919.5885  7491.559

Manipulando os dados

credito <- tibble(Default)
summary(credito)

 default    student       balance           income     
 No :9667   No :7056   Min.   :   0.0   Min.   :  772  
 Yes: 333   Yes:2944   1st Qu.: 481.7   1st Qu.:21340  
                       Median : 823.6   Median :34553  
                       Mean   : 835.4   Mean   :33517  
                       3rd Qu.:1166.3   3rd Qu.:43808  
                       Max.   :2654.3   Max.   :73554

# renomeando colunas
credito <- credito %>% 
                rename( inadimplente = default, estudante = student, balanco = balance,
                receita = income)
credito <- credito %>% mutate( inadimplente =  case_when(
                           inadimplente == "No"  ~ "Nao",
                           inadimplente == "Yes" ~ "Sim"
                          )) %>% mutate(inadimplente = factor(inadimplente))
credito <- credito %>% mutate( estudante =  case_when(
                           estudante == "No"  ~ 0,
                           estudante == "Yes" ~ 1
                          ))

str(credito)

tibble [10,000 × 4] (S3: tbl_df/tbl/data.frame)
 $ inadimplente: Factor w/ 2 levels "Nao","Sim": 1 1 1 1 1 1 1 1 1 1 ...
 $ estudante   : num [1:10000] 0 1 0 0 0 1 0 1 0 0 ...
 $ balanco     : num [1:10000] 730 817 1074 529 786 ...
 $ receita     : num [1:10000] 44362 12106 31767 35704 38463 ...

summary(credito)

 inadimplente   estudante         balanco          receita     
 Nao:9667     Min.   :0.0000   Min.   :   0.0   Min.   :  772  
 Sim: 333     1st Qu.:0.0000   1st Qu.: 481.7   1st Qu.:21340  
              Median :0.0000   Median : 823.6   Median :34553  
              Mean   :0.2944   Mean   : 835.4   Mean   :33517  
              3rd Qu.:1.0000   3rd Qu.:1166.3   3rd Qu.:43808  
              Max.   :1.0000   Max.   :2654.3   Max.   :73554

Treino e Teste

library(caret)
set.seed(21)
y <- credito$inadimplente
indice_teste <- createDataPartition(y, times = 1, p = 0.2, list = FALSE)

conj_treino <- credito %>% slice(-indice_teste)
conj_teste <- credito %>% slice(indice_teste)

summary(conj_treino)

 inadimplente   estudante         balanco          receita     
 Nao:7733     Min.   :0.0000   Min.   :   0.0   Min.   :  772  
 Sim: 266     1st Qu.:0.0000   1st Qu.: 481.3   1st Qu.:21339  
              Median :0.0000   Median : 819.1   Median :34541  
              Mean   :0.2953   Mean   : 832.7   Mean   :33541  
              3rd Qu.:1.0000   3rd Qu.:1167.1   3rd Qu.:43840  
              Max.   :1.0000   Max.   :2654.3   Max.   :73554

summary(conj_teste)

 inadimplente   estudante         balanco          receita     
 Nao:1934     Min.   :0.0000   Min.   :   0.0   Min.   : 4755  
 Sim:  67     1st Qu.:0.0000   1st Qu.: 483.5   1st Qu.:21371  
              Median :0.0000   Median : 836.6   Median :34591  
              Mean   :0.2909   Mean   : 846.2   Mean   :33423  
              3rd Qu.:1.0000   3rd Qu.:1163.1   3rd Qu.:43646  
              Max.   :1.0000   Max.   :2461.5   Max.   :71239

SMOTE

library(smotefamily)
set.seed(123)
teste <- SMOTE(conj_treino[,-1], target = conj_treino$inadimplente, K=5)
conj_treinoS <- teste$data
conj_treinoS$class <- as.factor(conj_treinoS$class)
conj_treinoS <- conj_treinoS %>% rename( inadimplente = class)
prop.table(table(conj_treinoS$inadimplente))


     Nao      Sim 
0.500615 0.499385

summary(conj_treinoS)

   estudante         balanco          receita      inadimplente
 Min.   :0.0000   Min.   :   0.0   Min.   :  772   Nao:7733    
 1st Qu.:0.0000   1st Qu.: 795.5   1st Qu.:20166   Sim:7714    
 Median :0.0000   Median :1392.3   Median :33703               
 Mean   :0.3394   Mean   :1272.9   Mean   :32841               
 3rd Qu.:1.0000   3rd Qu.:1769.2   3rd Qu.:43745               
 Max.   :1.0000   Max.   :2654.3   Max.   :73554

1a Regressão logística

mod1 <- glm(inadimplente ~ balanco + receita + estudante,data=conj_treinoS,family=binomial)
summary(mod1)


Call:
glm(formula = inadimplente ~ balanco + receita + estudante, family = binomial, 
    data = conj_treinoS)

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept) -9.778e+00  2.279e-01 -42.898  < 2e-16 ***
balanco      7.115e-03  1.199e-04  59.356  < 2e-16 ***
receita      9.409e-06  3.645e-06   2.582  0.00984 ** 
estudante   -6.092e-01  1.071e-01  -5.688 1.28e-08 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 21414.1  on 15446  degrees of freedom
Residual deviance:  7490.3  on 15443  degrees of freedom
AIC: 7498.3

Number of Fisher Scoring iterations: 7

coef(mod1)

  (Intercept)       balanco       receita     estudante 
-9.778189e+00  7.114940e-03  9.409475e-06 -6.092070e-01

summary(mod1)$coef

                 Estimate   Std. Error    z value     Pr(>|z|)
(Intercept) -9.778189e+00 2.279388e-01 -42.898311 0.000000e+00
balanco      7.114940e-03 1.198694e-04  59.355775 0.000000e+00
receita      9.409475e-06 3.644856e-06   2.581576 9.835023e-03
estudante   -6.092070e-01 1.070945e-01  -5.688499 1.281605e-08

Avaliando o modelo novamente

prop.table(table(conj_teste$inadimplente))


       Nao        Sim 
0.96651674 0.03348326

p_chapeu <- predict(mod1, newdata = conj_teste, type = "response")
y_chapeu <- ifelse(p_chapeu > 0.5, "Sim", "Nao") %>% factor(levels = levels(conj_teste$inadimplente))
confusionMatrix(y_chapeu, conj_teste$inadimplente, positive="Sim")

Confusion Matrix and Statistics

          Reference
Prediction  Nao  Sim
       Nao 1696   11
       Sim  238   56
                                          
               Accuracy : 0.8756          
                 95% CI : (0.8603, 0.8897)
    No Information Rate : 0.9665          
    P-Value [Acc > NIR] : 1               
                                          
                  Kappa : 0.2705          
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.83582         
            Specificity : 0.87694         
         Pos Pred Value : 0.19048         
         Neg Pred Value : 0.99356         
             Prevalence : 0.03348         
         Detection Rate : 0.02799         
   Detection Prevalence : 0.14693         
      Balanced Accuracy : 0.85638         
                                          
       'Positive' Class : Sim

Curva ROC

library(pROC)
p_chapeu_log <- predict(mod1, newdata = conj_teste, type = "response")
head(p_chapeu_log)

           1            2            3            4            5            6 
1.710555e-01 3.991424e-04 9.094011e-05 9.398313e-02 3.185776e-03 8.710220e-04

roc_log <- roc(conj_teste$inadimplente ~ p_chapeu_log, plot = TRUE, print.auc=TRUE,
                 legacy.axes=TRUE)

as.numeric(roc_log$auc)

[1] 0.941186