Logistic Regression

Author

Ricardo Accioly

Published

August 20, 2024

Loading Libraries

library(tidyverse)
library(ISLR)
data(Default)
summary(Default)
 default    student       balance           income     
 No :9667   No :7056   Min.   :   0.0   Min.   :  772  
 Yes: 333   Yes:2944   1st Qu.: 481.7   1st Qu.:21340  
                       Median : 823.6   Median :34553  
                       Mean   : 835.4   Mean   :33517  
                       3rd Qu.:1166.3   3rd Qu.:43808  
                       Max.   :2654.3   Max.   :73554  
str(Default)
'data.frame':   10000 obs. of  4 variables:
 $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
 $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
 $ balance: num  730 817 1074 529 786 ...
 $ income : num  44362 12106 31767 35704 38463 ...
head(Default)
  default student   balance    income
1      No      No  729.5265 44361.625
2      No     Yes  817.1804 12106.135
3      No      No 1073.5492 31767.139
4      No      No  529.2506 35704.494
5      No      No  785.6559 38463.496
6      No     Yes  919.5885  7491.559

Manipulating the Data

credito <- tibble(Default)
summary(credito)
 default    student       balance           income     
 No :9667   No :7056   Min.   :   0.0   Min.   :  772  
 Yes: 333   Yes:2944   1st Qu.: 481.7   1st Qu.:21340  
                       Median : 823.6   Median :34553  
                       Mean   : 835.4   Mean   :33517  
                       3rd Qu.:1166.3   3rd Qu.:43808  
                       Max.   :2654.3   Max.   :73554  
# renaming columns
credito <- credito %>% 
                rename( inadimplente = default, estudante = student, balanco = balance,
                receita = income)
credito <- credito %>% mutate( inadimplente =  case_when(
                           inadimplente == "No"  ~ "Nao",
                           inadimplente == "Yes" ~ "Sim"
                          )) %>% mutate(inadimplente = factor(inadimplente))
credito <- credito %>% mutate( estudante =  case_when(
                           estudante == "No"  ~ 0,
                           estudante == "Yes" ~ 1
                          )) 

str(credito)
tibble [10,000 × 4] (S3: tbl_df/tbl/data.frame)
 $ inadimplente: Factor w/ 2 levels "Nao","Sim": 1 1 1 1 1 1 1 1 1 1 ...
 $ estudante   : num [1:10000] 0 1 0 0 0 1 0 1 0 0 ...
 $ balanco     : num [1:10000] 730 817 1074 529 786 ...
 $ receita     : num [1:10000] 44362 12106 31767 35704 38463 ...
summary(credito)
 inadimplente   estudante         balanco          receita     
 Nao:9667     Min.   :0.0000   Min.   :   0.0   Min.   :  772  
 Sim: 333     1st Qu.:0.0000   1st Qu.: 481.7   1st Qu.:21340  
              Median :0.0000   Median : 823.6   Median :34553  
              Mean   :0.2944   Mean   : 835.4   Mean   :33517  
              3rd Qu.:1.0000   3rd Qu.:1166.3   3rd Qu.:43808  
              Max.   :1.0000   Max.   :2654.3   Max.   :73554  

Training and Test Sets

library(caret)
set.seed(2024)
y <- credito$inadimplente
credito_split <- createDataPartition(y, times = 1, p = 0.80, list = FALSE)

conj_treino <- credito[credito_split,]
conj_teste <- credito[-credito_split,]
                           
summary(conj_treino)
 inadimplente   estudante         balanco          receita     
 Nao:7734     Min.   :0.0000   Min.   :   0.0   Min.   :  772  
 Sim: 267     1st Qu.:0.0000   1st Qu.: 479.5   1st Qu.:21309  
              Median :0.0000   Median : 825.9   Median :34468  
              Mean   :0.2945   Mean   : 835.9   Mean   :33437  
              3rd Qu.:1.0000   3rd Qu.:1170.0   3rd Qu.:43706  
              Max.   :1.0000   Max.   :2654.3   Max.   :72461  
summary(conj_teste)
 inadimplente   estudante         balanco          receita     
 Nao:1933     Min.   :0.0000   Min.   :   0.0   Min.   : 5083  
 Sim:  66     1st Qu.:0.0000   1st Qu.: 489.5   1st Qu.:21422  
              Median :0.0000   Median : 813.9   Median :35177  
              Mean   :0.2941   Mean   : 833.3   Mean   :33837  
              3rd Qu.:1.0000   3rd Qu.:1143.4   3rd Qu.:44208  
              Max.   :1.0000   Max.   :2502.7   Max.   :73554  
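
createDataPartition samples within each class of y, so this stratified split keeps the proportion of defaulters essentially identical in the training and test sets, as the two summaries above suggest. A minimal sketch of a direct check, using only base R:

# class proportions in each set -- both should be close to the 3.33% seen in the full data
prop.table(table(conj_treino$inadimplente))
prop.table(table(conj_teste$inadimplente))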

Scatterplot Matrix

library(psych)
pairs.panels(conj_treino, 
             method = "pearson", # metodo de correlação
             hist.col = "#00AFBB",
             density = TRUE,  # mostra graficos de densidade
             ellipses = FALSE # mostra elipses de correlação
             )

Evaluating the behavior of the variables by status (inadimplente / estudante)

library(patchwork)
p1 <- ggplot(credito, aes(x=inadimplente, y=balanco, color=inadimplente)) +
  geom_boxplot()
p2 <- ggplot(credito, aes(x=inadimplente, y=receita, color=inadimplente)) +
  geom_boxplot()
p3 <- ggplot(credito, aes(x=as.factor(estudante), y=balanco, color=as.factor(estudante))) +
  geom_boxplot()
p4 <- ggplot(credito, aes(x=as.factor(estudante), y=receita, color=as.factor(estudante))) +
  geom_boxplot()
(p1 + p2) / (p3 + p4)

Balance vs Income

ggplot(data = credito, aes(x=balanco,  y = receita, col = inadimplente)) +
  geom_point() 

Exploring default behavior

# proportion of defaulters
credito %>% select(inadimplente, balanco) %>% summarize(prop = mean(inadimplente == "Sim")) 
# A tibble: 1 × 1
    prop
   <dbl>
1 0.0333
# mean balance of the defaulters
credito %>% filter(inadimplente == "Sim") %>% summarize(valor= mean(balanco))   
# A tibble: 1 × 1
  valor
  <dbl>
1 1748.
quantis <- quantile(credito$balanco, probs = c(.1,.25, .50, .75, .9, .95, 0.97, 0.99))
quantis
      10%       25%       50%       75%       90%       95%       97%       99% 
 180.5753  481.7311  823.6370 1166.3084 1471.6253 1665.9626 1793.2910 2008.4709 
credito %>% 
            mutate(grupo_balanco = case_when(
               balanco<=quantis[1] ~ quantis[1],
               balanco>quantis[1] & balanco<=quantis[2] ~ quantis[2],
               balanco>quantis[2] & balanco<=quantis[3]  ~ quantis[3],
               balanco>quantis[3] & balanco<=quantis[4]  ~ quantis[4],
               balanco>quantis[4] & balanco<=quantis[5]  ~ quantis[5],
               balanco>quantis[5] & balanco<=quantis[6]  ~ quantis[6],
               balanco>quantis[6] & balanco<=quantis[7]  ~ quantis[7],
               balanco>quantis[7] ~ quantis[8])) %>%
           group_by(grupo_balanco) %>%
           summarize(prop = mean(inadimplente == "Sim")) %>%
           ggplot(aes(grupo_balanco, prop)) +
           geom_point() +
           geom_line()
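
The case_when ladder above can be written more compactly with cut(), using the same quantile breaks. A sketch of the equivalent binning (the bins come out as a factor, so for the line plot the labels would need to be converted back to numeric):

# equivalent binning: right-closed intervals labeled by the upper quantile
credito %>%
  mutate(grupo_balanco = cut(balanco,
                             breaks = c(-Inf, quantis[1:7], Inf),
                             labels = quantis)) %>%
  group_by(grupo_balanco) %>%
  summarize(prop = mean(inadimplente == "Sim"))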

1st Logistic Regression: balance only

mod1 <- glm(inadimplente ~ balanco,data=conj_treino,family=binomial)
summary(mod1)

Call:
glm(formula = inadimplente ~ balanco, family = binomial, data = conj_treino)

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept) -1.047e+01  3.931e-01  -26.64   <2e-16 ***
balanco      5.386e-03  2.405e-04   22.40   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2340.6  on 8000  degrees of freedom
Residual deviance: 1302.1  on 7999  degrees of freedom
AIC: 1306.1

Number of Fisher Scoring iterations: 8
coef(mod1)
  (Intercept)       balanco 
-10.471543192   0.005385567 
summary(mod1)$coef
                 Estimate   Std. Error   z value      Pr(>|z|)
(Intercept) -10.471543192 0.3931122212 -26.63754 2.495385e-156
balanco       0.005385567 0.0002404507  22.39780 4.134795e-111
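
On the odds scale the slope is easier to read: exponentiating the coefficient times an increment gives the multiplicative change in the odds of default. A sketch, using an illustrative increment of 100 in balanco (from the coefficient printed above this comes out to roughly 1.71, i.e. about a 71% increase in the odds):

# odds ratio for a 100-unit increase in balance (illustrative increment)
exp(100 * coef(mod1)["balanco"])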

Evaluating the model

p_chapeu <- predict(mod1, newdata = conj_teste, type = "response")
y_chapeu <- ifelse(p_chapeu > 0.5, "Sim", "Nao") %>% factor(levels = levels(conj_teste$inadimplente))
confusionMatrix(y_chapeu, conj_teste$inadimplente, positive="Sim") 
Confusion Matrix and Statistics

          Reference
Prediction  Nao  Sim
       Nao 1929   45
       Sim    4   21
                                          
               Accuracy : 0.9755          
                 95% CI : (0.9677, 0.9818)
    No Information Rate : 0.967           
    P-Value [Acc > NIR] : 0.01625         
                                          
                  Kappa : 0.4516          
                                          
 Mcnemar's Test P-Value : 1.102e-08       
                                          
            Sensitivity : 0.31818         
            Specificity : 0.99793         
         Pos Pred Value : 0.84000         
         Neg Pred Value : 0.97720         
             Prevalence : 0.03302         
         Detection Rate : 0.01051         
   Detection Prevalence : 0.01251         
      Balanced Accuracy : 0.65806         
                                          
       'Positive' Class : Sim             
                                          

See the default probabilities for balances of 1000, 2000 and 3000

predict(mod1, newdata = data.frame(balanco = c(1000,2000,3000)), type="response")
          1           2           3 
0.006144857 0.574342575 0.996615498 
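
These values are just the inverse logit of the linear predictor built from the coefficients printed above; computing them by hand should reproduce the output of predict(type = "response") (a minimal sketch):

# inverse logit of the linear predictor for the same three balances
plogis(coef(mod1)[1] + coef(mod1)[2] * c(1000, 2000, 3000))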

S Curve

inadimpl <- as.numeric(conj_treino$inadimplente) - 1
plot(inadimpl ~ balanco, data = conj_treino, 
     col = "darkorange", pch = "|", ylim = c(0, 1),
     main = "Regressão Logistica - Classificacão")
abline(h = 0, lty = 3)
abline(h = 1, lty = 3)
abline(h = 0.5, lty = 2)
curve(predict(mod1, data.frame(balanco = x),
        type = "response"), add = TRUE, lwd = 3, col = "dodgerblue")
abline(v = -coef(mod1)[1] / coef(mod1)[2], lwd = 2)

Balance value with a 50% default probability

\(-\beta_0/\beta_1\)
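
This follows from setting the predicted probability to 0.5, which is the same as setting the linear predictor to zero:

\(\dfrac{1}{1 + e^{-(\beta_0 + \beta_1 x)}} = 0.5 \;\Longleftrightarrow\; \beta_0 + \beta_1 x = 0 \;\Longleftrightarrow\; x = -\dfrac{\beta_0}{\beta_1}\)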

-coef(mod1)[1] / coef(mod1)[2]
(Intercept) 
   1944.371 

2nd Logistic Regression: all variables

mod2 <- glm(inadimplente ~ balanco + receita + estudante,data=conj_treino,family=binomial)
summary(mod2)

Call:
glm(formula = inadimplente ~ balanco + receita + estudante, family = binomial, 
    data = conj_treino)

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept) -1.075e+01  5.417e-01 -19.847   <2e-16 ***
balanco      5.628e-03  2.538e-04  22.180   <2e-16 ***
receita      4.416e-06  9.088e-06   0.486   0.6270    
estudante   -6.264e-01  2.612e-01  -2.398   0.0165 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2340.6  on 8000  degrees of freedom
Residual deviance: 1281.3  on 7997  degrees of freedom
AIC: 1289.3

Number of Fisher Scoring iterations: 8
coef(mod2)
  (Intercept)       balanco       receita     estudante 
-1.075144e+01  5.628328e-03  4.415834e-06 -6.263630e-01 
summary(mod2)$coef
                 Estimate   Std. Error     z value      Pr(>|z|)
(Intercept) -1.075144e+01 5.417102e-01 -19.8472260  1.164551e-87
balanco      5.628328e-03 2.537535e-04  22.1802958 5.322786e-109
receita      4.415834e-06 9.088132e-06   0.4858902  6.270450e-01
estudante   -6.263630e-01 2.611624e-01  -2.3983656  1.646842e-02

Note that receita is not significant (Wald p ≈ 0.63), while balanco and estudante are.
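
One way to see this more concretely is on the odds-ratio scale (a sketch, using Wald intervals from confint.default rather than profile-likelihood ones); the interval for receita should include 1:

# odds ratios with approximate (Wald) 95% confidence intervals
exp(cbind(OR = coef(mod2), confint.default(mod2)))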

3rd Logistic Regression (without receita)

mod3 <- glm(inadimplente ~ balanco + estudante,data=conj_treino,family=binomial)
summary(mod3)

Call:
glm(formula = inadimplente ~ balanco + estudante, family = binomial, 
    data = conj_treino)

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept) -1.058e+01  4.027e-01 -26.264  < 2e-16 ***
balanco      5.629e-03  2.537e-04  22.190  < 2e-16 ***
estudante   -7.246e-01  1.646e-01  -4.401 1.08e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2340.6  on 8000  degrees of freedom
Residual deviance: 1281.6  on 7998  degrees of freedom
AIC: 1287.6

Number of Fisher Scoring iterations: 8
coef(mod3)
  (Intercept)       balanco     estudante 
-10.577195871   0.005629461  -0.724574801 
summary(mod3)$coef
                 Estimate   Std. Error   z value      Pr(>|z|)
(Intercept) -10.577195871 0.4027279496 -26.26387 4.962858e-152
balanco       0.005629461 0.0002536895  22.19035 4.256243e-109
estudante    -0.724574801 0.1646214717  -4.40146  1.075250e-05

Comparing the models

anova(mod2,mod3,test='LR')
Analysis of Deviance Table

Model 1: inadimplente ~ balanco + receita + estudante
Model 2: inadimplente ~ balanco + estudante
  Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1      7997     1281.3                     
2      7998     1281.6 -1 -0.23621    0.627

StepAIC

Instead of using the Wald statistic to select the significant variables, we can use the AIC (equivalent to Mallows' Cp), as we did in multiple regression, to select the explanatory variables.

The stepAIC function has a parameter k that determines whether the AIC or the BIC is used for the selection: with k = 2 we get the AIC, and with k = log(n) we get the BIC.

library(MASS)
mod3a <- stepAIC(mod2, k=2, trace=FALSE)
summary(mod3a)

Call:
glm(formula = inadimplente ~ balanco + estudante, family = binomial, 
    data = conj_treino)

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept) -1.058e+01  4.027e-01 -26.264  < 2e-16 ***
balanco      5.629e-03  2.537e-04  22.190  < 2e-16 ***
estudante   -7.246e-01  1.646e-01  -4.401 1.08e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2340.6  on 8000  degrees of freedom
Residual deviance: 1281.6  on 7998  degrees of freedom
AIC: 1287.6

Number of Fisher Scoring iterations: 8
(k <- log(nrow(conj_treino)))
[1] 8.987322
mod3b <- stepAIC(mod2, k=k, trace=FALSE)
summary(mod3b)

Call:
glm(formula = inadimplente ~ balanco + estudante, family = binomial, 
    data = conj_treino)

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept) -1.058e+01  4.027e-01 -26.264  < 2e-16 ***
balanco      5.629e-03  2.537e-04  22.190  < 2e-16 ***
estudante   -7.246e-01  1.646e-01  -4.401 1.08e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2340.6  on 8000  degrees of freedom
Residual deviance: 1281.6  on 7998  degrees of freedom
AIC: 1287.6

Number of Fisher Scoring iterations: 8
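
For reference, the same comparison can be made by computing the criteria directly; both should point to the model without receita, matching the stepAIC results above (a minimal sketch):

# information criteria computed directly (AIC penalty = 2, BIC penalty = log(n) per parameter)
AIC(mod2, mod3)
BIC(mod2, mod3)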

Evaluating the model again

p_chapeu <- predict(mod3, newdata = conj_teste, type = "response")
y_chapeu <- ifelse(p_chapeu > 0.5, "Sim", "Nao") %>% factor(levels = levels(conj_teste$inadimplente))
confusionMatrix(y_chapeu, conj_teste$inadimplente, positive="Sim") 
Confusion Matrix and Statistics

          Reference
Prediction  Nao  Sim
       Nao 1927   43
       Sim    6   23
                                          
               Accuracy : 0.9755          
                 95% CI : (0.9677, 0.9818)
    No Information Rate : 0.967           
    P-Value [Acc > NIR] : 0.01625         
                                          
                  Kappa : 0.4736          
                                          
 Mcnemar's Test P-Value : 2.706e-07       
                                          
            Sensitivity : 0.34848         
            Specificity : 0.99690         
         Pos Pred Value : 0.79310         
         Neg Pred Value : 0.97817         
             Prevalence : 0.03302         
         Detection Rate : 0.01151         
   Detection Prevalence : 0.01451         
      Balanced Accuracy : 0.67269         
                                          
       'Positive' Class : Sim             
                                          

Changing the probability threshold to increase sensitivity

p_chapeu <- predict(mod3, newdata = conj_teste, type = "response")
y_chapeu <- ifelse(p_chapeu > 0.1, "Sim", "Nao") %>% 
             factor(levels = levels(conj_teste$inadimplente))
confusionMatrix(y_chapeu, conj_teste$inadimplente, positive="Sim")
Confusion Matrix and Statistics

          Reference
Prediction  Nao  Sim
       Nao 1823   15
       Sim  110   51
                                          
               Accuracy : 0.9375          
                 95% CI : (0.9259, 0.9477)
    No Information Rate : 0.967           
    P-Value [Acc > NIR] : 1               
                                          
                  Kappa : 0.4223          
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.77273         
            Specificity : 0.94309         
         Pos Pred Value : 0.31677         
         Neg Pred Value : 0.99184         
             Prevalence : 0.03302         
         Detection Rate : 0.02551         
   Detection Prevalence : 0.08054         
      Balanced Accuracy : 0.85791         
                                          
       'Positive' Class : Sim             
                                          

ROC curve for the balance-only model

library(pROC)
p_chapeu_log <- predict(mod1, newdata = conj_teste, type = "response")
head(p_chapeu_log)
           1            2            3            4            5            6 
0.0023045487 0.0145058938 0.0000283305 0.0048096860 0.0028238187 0.0116350810 
roc_log <- roc(conj_teste$inadimplente ~ p_chapeu_log, plot = TRUE, print.auc=FALSE, legacy.axes=TRUE)

# Area under the curve
as.numeric(roc_log$auc)
[1] 0.9579708

ROC curve 2: model with balance + student

p_chapeu_log <- predict(mod3, newdata = conj_teste, type = "response")
head(p_chapeu_log)
           1            2            3            4            5            6 
1.227576e-03 1.727504e-02 2.549008e-05 5.457908e-03 3.128840e-03 6.698177e-03 
roc_log2 <- roc(conj_teste$inadimplente ~ p_chapeu_log, plot = TRUE, print.auc=FALSE, legacy.axes=TRUE)

AUC

# Area under the curve
as.numeric(roc_log2$auc)
[1] 0.9586919

Best threshold

m_limite <- coords(roc_log2, "best", ret = "threshold")$threshold
m_limite
[1] 0.03752365
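
By default coords(..., "best") uses Youden's J, i.e. it picks the threshold maximizing sensitivity + specificity. Asking for those quantities along with the threshold makes the trade-off explicit (a sketch):

# best threshold together with the sensitivity and specificity it achieves
coords(roc_log2, "best", ret = c("threshold", "sensitivity", "specificity"))
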
p_chapeu <- predict(mod3, newdata = conj_teste, type = "response")
y_chapeu <- ifelse(p_chapeu > m_limite, "Sim", "Nao") %>% 
             factor(levels = levels(conj_teste$inadimplente))
confusionMatrix(y_chapeu, conj_teste$inadimplente, positive="Sim")
Confusion Matrix and Statistics

          Reference
Prediction  Nao  Sim
       Nao 1701    6
       Sim  232   60
                                          
               Accuracy : 0.8809          
                 95% CI : (0.8659, 0.8948)
    No Information Rate : 0.967           
    P-Value [Acc > NIR] : 1               
                                          
                  Kappa : 0.2974          
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.90909         
            Specificity : 0.87998         
         Pos Pred Value : 0.20548         
         Neg Pred Value : 0.99649         
             Prevalence : 0.03302         
         Detection Rate : 0.03002         
   Detection Prevalence : 0.14607         
      Balanced Accuracy : 0.89454         
                                          
       'Positive' Class : Sim             
                                          

The two ROC curves together

plot(roc_log)
plot(roc_log2, add=TRUE, col="blue")
legend("bottomright", legend=c("Mod 1", "Mod2"),
       col=c(par("fg"), "blue"), lwd=2)

ROC curve 3 with KNN

# Fitting KNN
set.seed(23)
conj_treino[,3:4] <- scale(conj_treino[,3:4]) # scale standardizes (mean 0, sd 1)
conj_teste[,3:4] <- scale(conj_teste[, 3:4])
ctrl <- trainControl(method = "cv")
treina_knn <- train(inadimplente ~ balanco + estudante, method = "knn",
                    trControl = ctrl, tuneGrid = data.frame(k = seq(5, 140, by = 5)),
                    data = conj_treino)
# treina_knn
plot(treina_knn)
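
The value of k chosen by cross-validation can be retrieved directly from the caret object (a quick sketch):

# k selected by cross-validation
treina_knn$bestTune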

prev_knn <- predict(treina_knn, conj_teste,type = "prob")

## ROC
roc_log2 <- roc(conj_teste$inadimplente ~ p_chapeu_log, plot = TRUE, print.auc=FALSE, col= "black", legacy.axes=TRUE) 
roc_knn1 <- roc(conj_teste$inadimplente ~ prev_knn[,2], plot = TRUE, print.auc=FALSE, col="green", legacy.axes=TRUE, add=TRUE)

legend("bottomright",legend=c("Reg. Log", "KNN"), 
       col=c("black","green"),lwd=4)

# Area under the curve
# Logistic Regression
as.numeric(roc_log2$auc)
[1] 0.9586919
## KNN
as.numeric(roc_knn1$auc)
[1] 0.943462