Regressão Logística - SMOTE

Author

Ricardo Accioly

Published

October 28, 2025

Carregando Bibliotecas

#>  default    student       balance           income     
#>  No :9667   No :7056   Min.   :   0.0   Min.   :  772  
#>  Yes: 333   Yes:2944   1st Qu.: 481.7   1st Qu.:21340  
#>                        Median : 823.6   Median :34553  
#>                        Mean   : 835.4   Mean   :33517  
#>                        3rd Qu.:1166.3   3rd Qu.:43808  
#>                        Max.   :2654.3   Max.   :73554
str(Default)
#> 'data.frame':    10000 obs. of  4 variables:
#>  $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
#>  $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
#>  $ balance: num  730 817 1074 529 786 ...
#>  $ income : num  44362 12106 31767 35704 38463 ...
head(Default)
#>   default student   balance    income
#> 1      No      No  729.5265 44361.625
#> 2      No     Yes  817.1804 12106.135
#> 3      No      No 1073.5492 31767.139
#> 4      No      No  529.2506 35704.494
#> 5      No      No  785.6559 38463.496
#> 6      No     Yes  919.5885  7491.559

Manipulando os dados

credito <- tibble(Default)
summary(credito)
#>  default    student       balance           income     
#>  No :9667   No :7056   Min.   :   0.0   Min.   :  772  
#>  Yes: 333   Yes:2944   1st Qu.: 481.7   1st Qu.:21340  
#>                        Median : 823.6   Median :34553  
#>                        Mean   : 835.4   Mean   :33517  
#>                        3rd Qu.:1166.3   3rd Qu.:43808  
#>                        Max.   :2654.3   Max.   :73554
# renomeando colunas
credito <- credito %>% 
                rename( inadimplente = default, estudante = student, balanco = balance,
                receita = income)
credito <- credito %>% mutate( inadimplente =  case_when(
                           inadimplente == "No"  ~ "Nao",
                           inadimplente == "Yes" ~ "Sim"
                          )) %>% mutate(inadimplente = factor(inadimplente))
credito <- credito %>% mutate( estudante =  case_when(
                           estudante == "No"  ~ 0,
                           estudante == "Yes" ~ 1
                          ))

str(credito)
#> tibble [10,000 × 4] (S3: tbl_df/tbl/data.frame)
#>  $ inadimplente: Factor w/ 2 levels "Nao","Sim": 1 1 1 1 1 1 1 1 1 1 ...
#>  $ estudante   : num [1:10000] 0 1 0 0 0 1 0 1 0 0 ...
#>  $ balanco     : num [1:10000] 730 817 1074 529 786 ...
#>  $ receita     : num [1:10000] 44362 12106 31767 35704 38463 ...
summary(credito)
#>  inadimplente   estudante         balanco          receita     
#>  Nao:9667     Min.   :0.0000   Min.   :   0.0   Min.   :  772  
#>  Sim: 333     1st Qu.:0.0000   1st Qu.: 481.7   1st Qu.:21340  
#>               Median :0.0000   Median : 823.6   Median :34553  
#>               Mean   :0.2944   Mean   : 835.4   Mean   :33517  
#>               3rd Qu.:1.0000   3rd Qu.:1166.3   3rd Qu.:43808  
#>               Max.   :1.0000   Max.   :2654.3   Max.   :73554

Treino e Teste

set.seed(2025)
y <- credito$inadimplente
credito_split <- createDataPartition(y, times = 1, p = 0.1, list = FALSE)

conj_treino <- credito[-credito_split,]
conj_teste <- credito[credito_split,]

summary(conj_treino)
#>  inadimplente   estudante         balanco          receita     
#>  Nao:8700     Min.   :0.0000   Min.   :   0.0   Min.   :  772  
#>  Sim: 299     1st Qu.:0.0000   1st Qu.: 483.6   1st Qu.:21343  
#>               Median :0.0000   Median : 821.3   Median :34576  
#>               Mean   :0.2946   Mean   : 836.3   Mean   :33556  
#>               3rd Qu.:1.0000   3rd Qu.:1167.1   3rd Qu.:43854  
#>               Max.   :1.0000   Max.   :2654.3   Max.   :73554
summary(conj_teste)
#>  inadimplente   estudante         balanco          receita     
#>  Nao:967      Min.   :0.0000   Min.   :   0.0   Min.   : 5386  
#>  Sim: 34      1st Qu.:0.0000   1st Qu.: 453.6   1st Qu.:21319  
#>               Median :0.0000   Median : 839.9   Median :34086  
#>               Mean   :0.2927   Mean   : 827.4   Mean   :33170  
#>               3rd Qu.:1.0000   3rd Qu.:1156.3   3rd Qu.:42856  
#>               Max.   :1.0000   Max.   :2221.0   Max.   :70701

SMOTE

library(smotefamily)
set.seed(123)
teste <- SMOTE(conj_treino[,-1], target = conj_treino$inadimplente, K=5)
conj_treinoS <- teste$data
conj_treinoS$class <- as.factor(conj_treinoS$class)
conj_treinoS <- conj_treinoS %>% rename( inadimplente = class)
prop.table(table(conj_treinoS$inadimplente))
#> 
#>       Nao       Sim 
#> 0.5008347 0.4991653
summary(conj_treinoS)
#>    estudante         balanco          receita      inadimplente
#>  Min.   :0.0000   Min.   :   0.0   Min.   :  772   Nao:8700    
#>  1st Qu.:0.0000   1st Qu.: 798.1   1st Qu.:20071   Sim:8671    
#>  Median :0.0000   Median :1392.3   Median :33552               
#>  Mean   :0.3398   Mean   :1281.7   Mean   :32721               
#>  3rd Qu.:1.0000   3rd Qu.:1786.8   3rd Qu.:43584               
#>  Max.   :1.0000   Max.   :2654.3   Max.   :73554

1a Regressão logística

mod1 <- glm(inadimplente ~ balanco + receita + estudante,data=conj_treinoS,family=binomial)
summary(mod1)
#> 
#> Call:
#> glm(formula = inadimplente ~ balanco + receita + estudante, family = binomial, 
#>     data = conj_treinoS)
#> 
#> Coefficients:
#>               Estimate Std. Error z value Pr(>|z|)    
#> (Intercept) -9.289e+00  2.112e-01 -43.990   <2e-16 ***
#> balanco      7.111e-03  1.136e-04  62.586   <2e-16 ***
#> receita     -3.103e-06  3.532e-06  -0.879     0.38    
#> estudante   -9.580e-01  1.017e-01  -9.418   <2e-16 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> (Dispersion parameter for binomial family taken to be 1)
#> 
#>     Null deviance: 24081.3  on 17370  degrees of freedom
#> Residual deviance:  8348.3  on 17367  degrees of freedom
#> AIC: 8356.3
#> 
#> Number of Fisher Scoring iterations: 7
coef(mod1)
#>   (Intercept)       balanco       receita     estudante 
#> -9.288540e+00  7.110668e-03 -3.103058e-06 -9.580370e-01
summary(mod1)$coef
#>                  Estimate   Std. Error     z value     Pr(>|z|)
#> (Intercept) -9.288540e+00 2.111528e-01 -43.9896601 0.000000e+00
#> balanco      7.110668e-03 1.136140e-04  62.5862063 0.000000e+00
#> receita     -3.103058e-06 3.532109e-06  -0.8785285 3.796570e-01
#> estudante   -9.580370e-01 1.017207e-01  -9.4183074 4.584179e-21

Avaliando o modelo novamente

prop.table(table(conj_teste$inadimplente))
#> 
#>        Nao        Sim 
#> 0.96603397 0.03396603
p_chapeu <- predict(mod1, newdata = conj_teste, type = "response")
y_chapeu <- ifelse(p_chapeu > 0.5, "Sim", "Nao") %>% factor(levels = levels(conj_teste$inadimplente))
confusionMatrix(y_chapeu, conj_teste$inadimplente, positive="Sim") 
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction Nao Sim
#>        Nao 861   6
#>        Sim 106  28
#>                                          
#>                Accuracy : 0.8881         
#>                  95% CI : (0.8669, 0.907)
#>     No Information Rate : 0.966          
#>     P-Value [Acc > NIR] : 1              
#>                                          
#>                   Kappa : 0.2951         
#>                                          
#>  Mcnemar's Test P-Value : <2e-16         
#>                                          
#>             Sensitivity : 0.82353        
#>             Specificity : 0.89038        
#>          Pos Pred Value : 0.20896        
#>          Neg Pred Value : 0.99308        
#>              Prevalence : 0.03397        
#>          Detection Rate : 0.02797        
#>    Detection Prevalence : 0.13387        
#>       Balanced Accuracy : 0.85696        
#>                                          
#>        'Positive' Class : Sim            
#> 

Curva ROC

library(pROC)
p_chapeu_log <- predict(mod1, newdata = conj_teste, type = "response")
head(p_chapeu_log)
#>           1           2           3           4           5           6 
#> 0.088099402 0.002130191 0.019239847 0.070623827 0.133527974 0.005357011
roc_log <- roc(conj_teste$inadimplente ~ p_chapeu_log, plot = TRUE, print.auc=TRUE,
                 legacy.axes=TRUE) 

as.numeric(roc_log$auc)
#> [1] 0.9325081