Carregando Bibliotecas
library (tidyverse)
library (ISLR)
data (Default)
summary (Default)
default student balance income
No :9667 No :7056 Min. : 0.0 Min. : 772
Yes: 333 Yes:2944 1st Qu.: 481.7 1st Qu.:21340
Median : 823.6 Median :34553
Mean : 835.4 Mean :33517
3rd Qu.:1166.3 3rd Qu.:43808
Max. :2654.3 Max. :73554
'data.frame': 10000 obs. of 4 variables:
$ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
$ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
$ balance: num 730 817 1074 529 786 ...
$ income : num 44362 12106 31767 35704 38463 ...
default student balance income
1 No No 729.5265 44361.625
2 No Yes 817.1804 12106.135
3 No No 1073.5492 31767.139
4 No No 529.2506 35704.494
5 No No 785.6559 38463.496
6 No Yes 919.5885 7491.559
Manipulando os dados
credito <- tibble (Default)
summary (credito)
default student balance income
No :9667 No :7056 Min. : 0.0 Min. : 772
Yes: 333 Yes:2944 1st Qu.: 481.7 1st Qu.:21340
Median : 823.6 Median :34553
Mean : 835.4 Mean :33517
3rd Qu.:1166.3 3rd Qu.:43808
Max. :2654.3 Max. :73554
# renomeando colunas
credito <- credito %>%
rename ( inadimplente = default, estudante = student, balanco = balance,
receita = income)
credito <- credito %>% mutate ( inadimplente = case_when (
inadimplente == "No" ~ "Nao" ,
inadimplente == "Yes" ~ "Sim"
)) %>% mutate (inadimplente = factor (inadimplente))
credito <- credito %>% mutate ( estudante = case_when (
estudante == "No" ~ 0 ,
estudante == "Yes" ~ 1
))
str (credito)
tibble [10,000 × 4] (S3: tbl_df/tbl/data.frame)
$ inadimplente: Factor w/ 2 levels "Nao","Sim": 1 1 1 1 1 1 1 1 1 1 ...
$ estudante : num [1:10000] 0 1 0 0 0 1 0 1 0 0 ...
$ balanco : num [1:10000] 730 817 1074 529 786 ...
$ receita : num [1:10000] 44362 12106 31767 35704 38463 ...
inadimplente estudante balanco receita
Nao:9667 Min. :0.0000 Min. : 0.0 Min. : 772
Sim: 333 1st Qu.:0.0000 1st Qu.: 481.7 1st Qu.:21340
Median :0.0000 Median : 823.6 Median :34553
Mean :0.2944 Mean : 835.4 Mean :33517
3rd Qu.:1.0000 3rd Qu.:1166.3 3rd Qu.:43808
Max. :1.0000 Max. :2654.3 Max. :73554
Treino e Teste
library (caret)
set.seed (2024 )
y <- credito$ inadimplente
credito_split <- createDataPartition (y, times = 1 , p = 0.80 , list = FALSE )
conj_treino <- credito[credito_split,]
conj_teste <- credito[- credito_split,]
summary (conj_treino)
inadimplente estudante balanco receita
Nao:7734 Min. :0.0000 Min. : 0.0 Min. : 772
Sim: 267 1st Qu.:0.0000 1st Qu.: 479.5 1st Qu.:21309
Median :0.0000 Median : 825.9 Median :34468
Mean :0.2945 Mean : 835.9 Mean :33437
3rd Qu.:1.0000 3rd Qu.:1170.0 3rd Qu.:43706
Max. :1.0000 Max. :2654.3 Max. :72461
inadimplente estudante balanco receita
Nao:1933 Min. :0.0000 Min. : 0.0 Min. : 5083
Sim: 66 1st Qu.:0.0000 1st Qu.: 489.5 1st Qu.:21422
Median :0.0000 Median : 813.9 Median :35177
Mean :0.2941 Mean : 833.3 Mean :33837
3rd Qu.:1.0000 3rd Qu.:1143.4 3rd Qu.:44208
Max. :1.0000 Max. :2502.7 Max. :73554
Matriz de dispersão
library (psych)
pairs.panels (conj_treino,
method = "pearson" , # metodo de correlação
hist.col = "#00AFBB" ,
density = TRUE , # mostra graficos de densidade
ellipses = FALSE # mostra elipses de correlação
)
Avaliando o comportamento das variáveis em função do status (inadimplente / estudante)
library (patchwork)
p1 <- ggplot (credito, aes (x= inadimplente, y= balanco, color= inadimplente)) +
geom_boxplot ()
p2 <- ggplot (credito, aes (x= inadimplente, y= receita, color= inadimplente)) +
geom_boxplot ()
p3 <- ggplot (credito, aes (x= as.factor (estudante), y= balanco, color= as.factor (estudante))) +
geom_boxplot ()
p4 <- ggplot (credito, aes (x= as.factor (estudante), y= receita, color= as.factor (estudante))) +
geom_boxplot ()
(p1 + p2) / (p3 + p4)
Balanço vs Receita
ggplot (data = credito, aes (x= balanco, y = receita, col = inadimplente)) +
geom_point ()
Avaliando comportamento
# proporção de inadimplentes
credito %>% select (inadimplente, balanco) %>% summarize (prop = mean (inadimplente == "Sim" ))
# A tibble: 1 × 1
prop
<dbl>
1 0.0333
# media do balanço dos inadimplentes
credito %>% filter (inadimplente == "Sim" ) %>% summarize (valor= mean (balanco))
# A tibble: 1 × 1
valor
<dbl>
1 1748.
quantis <- quantile (credito$ balanco, probs = c (.1 ,.25 , .50 , .75 , .9 , .95 , 0.97 , 0.99 ))
quantis
10% 25% 50% 75% 90% 95% 97% 99%
180.5753 481.7311 823.6370 1166.3084 1471.6253 1665.9626 1793.2910 2008.4709
credito %>%
mutate (grupo_balanco = case_when (
balanco<= quantis[1 ] ~ quantis[1 ],
balanco> quantis[1 ] & balanco<= quantis[2 ] ~ quantis[2 ],
balanco> quantis[2 ] & balanco<= quantis[3 ] ~ quantis[3 ],
balanco> quantis[3 ] & balanco<= quantis[4 ] ~ quantis[4 ],
balanco> quantis[4 ] & balanco<= quantis[5 ] ~ quantis[5 ],
balanco> quantis[5 ] & balanco<= quantis[6 ] ~ quantis[6 ],
balanco> quantis[6 ] & balanco<= quantis[7 ] ~ quantis[7 ],
balanco> quantis[7 ] ~ quantis[8 ])) %>%
group_by (grupo_balanco) %>%
summarize (prop = mean (inadimplente == "Sim" )) %>%
ggplot (aes (grupo_balanco, prop)) +
geom_point () +
geom_line ()
1a Regressão logística: só balanço
mod1 <- glm (inadimplente ~ balanco,data= conj_treino,family= binomial)
summary (mod1)
Call:
glm(formula = inadimplente ~ balanco, family = binomial, data = conj_treino)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.047e+01 3.931e-01 -26.64 <2e-16 ***
balanco 5.386e-03 2.405e-04 22.40 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2340.6 on 8000 degrees of freedom
Residual deviance: 1302.1 on 7999 degrees of freedom
AIC: 1306.1
Number of Fisher Scoring iterations: 8
(Intercept) balanco
-10.471543192 0.005385567
Estimate Std. Error z value Pr(>|z|)
(Intercept) -10.471543192 0.3931122212 -26.63754 2.495385e-156
balanco 0.005385567 0.0002404507 22.39780 4.134795e-111
Avaliando o modelo
p_chapeu <- predict (mod1, newdata = conj_teste, type = "response" )
y_chapeu <- ifelse (p_chapeu > 0.5 , "Sim" , "Nao" ) %>% factor (levels = levels (conj_teste$ inadimplente))
confusionMatrix (y_chapeu, conj_teste$ inadimplente, positive= "Sim" )
Confusion Matrix and Statistics
Reference
Prediction Nao Sim
Nao 1929 45
Sim 4 21
Accuracy : 0.9755
95% CI : (0.9677, 0.9818)
No Information Rate : 0.967
P-Value [Acc > NIR] : 0.01625
Kappa : 0.4516
Mcnemar's Test P-Value : 1.102e-08
Sensitivity : 0.31818
Specificity : 0.99793
Pos Pred Value : 0.84000
Neg Pred Value : 0.97720
Prevalence : 0.03302
Detection Rate : 0.01051
Detection Prevalence : 0.01251
Balanced Accuracy : 0.65806
'Positive' Class : Sim
Veja as probabilidade de inadimplencia para balanços de 1000, 2000 e 3000
predict (mod1, newdata = data.frame (balanco = c (1000 ,2000 ,3000 )), type= "response" )
1 2 3
0.006144857 0.574342575 0.996615498
Curva S
inadimpl <- as.numeric (conj_treino$ inadimplente) - 1
plot (inadimpl ~ balanco, data = conj_treino,
col = "darkorange" , pch = "|" , ylim = c (0 , 1 ),
main = "Regressão Logistica - Classificacão" )
abline (h = 0 , lty = 3 )
abline (h = 1 , lty = 3 )
abline (h = 0.5 , lty = 2 )
curve (predict (mod1, data.frame (balanco = x),
type = "response" ), add = TRUE , lwd = 3 , col = "dodgerblue" )
abline (v = - coef (mod1)[1 ] / coef (mod1)[2 ], lwd = 2 )
2a Regressão logística: todas as variáveis
mod2 <- glm (inadimplente ~ balanco + receita + estudante,data= conj_treino,family= binomial)
summary (mod2)
Call:
glm(formula = inadimplente ~ balanco + receita + estudante, family = binomial,
data = conj_treino)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.075e+01 5.417e-01 -19.847 <2e-16 ***
balanco 5.628e-03 2.538e-04 22.180 <2e-16 ***
receita 4.416e-06 9.088e-06 0.486 0.6270
estudante -6.264e-01 2.612e-01 -2.398 0.0165 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2340.6 on 8000 degrees of freedom
Residual deviance: 1281.3 on 7997 degrees of freedom
AIC: 1289.3
Number of Fisher Scoring iterations: 8
(Intercept) balanco receita estudante
-1.075144e+01 5.628328e-03 4.415834e-06 -6.263630e-01
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.075144e+01 5.417102e-01 -19.8472260 1.164551e-87
balanco 5.628328e-03 2.537535e-04 22.1802958 5.322786e-109
receita 4.415834e-06 9.088132e-06 0.4858902 6.270450e-01
estudante -6.263630e-01 2.611624e-01 -2.3983656 1.646842e-02
É possível se ver que receita não é significativa
3a Regressão Logística (sem receita)
mod3 <- glm (inadimplente ~ balanco + estudante,data= conj_treino,family= binomial)
summary (mod3)
Call:
glm(formula = inadimplente ~ balanco + estudante, family = binomial,
data = conj_treino)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.058e+01 4.027e-01 -26.264 < 2e-16 ***
balanco 5.629e-03 2.537e-04 22.190 < 2e-16 ***
estudante -7.246e-01 1.646e-01 -4.401 1.08e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2340.6 on 8000 degrees of freedom
Residual deviance: 1281.6 on 7998 degrees of freedom
AIC: 1287.6
Number of Fisher Scoring iterations: 8
(Intercept) balanco estudante
-10.577195871 0.005629461 -0.724574801
Estimate Std. Error z value Pr(>|z|)
(Intercept) -10.577195871 0.4027279496 -26.26387 4.962858e-152
balanco 0.005629461 0.0002536895 22.19035 4.256243e-109
estudante -0.724574801 0.1646214717 -4.40146 1.075250e-05
Comparando os modelos
anova (mod2,mod3,test= 'LR' )
Analysis of Deviance Table
Model 1: inadimplente ~ balanco + receita + estudante
Model 2: inadimplente ~ balanco + estudante
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 7997 1281.3
2 7998 1281.6 -1 -0.23621 0.627
StepAIC
Ao invé de usarmos a estatística de Wald para selecionar as variáveis significativas, podemos usar o AIC (equivalente ao Cp ) como usamos na regressão múltipla para selecionar as variáveis explicativas.
A função stepAIC tem um parametro k que define se vamos usar o AIC ou o BIC para fazer a seleção. Quando k=2 temos o AIC e quando k=log(n) temos o BIC.
library (MASS)
mod3a <- stepAIC (mod2, k= 2 , trace= FALSE )
summary (mod3a)
Call:
glm(formula = inadimplente ~ balanco + estudante, family = binomial,
data = conj_treino)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.058e+01 4.027e-01 -26.264 < 2e-16 ***
balanco 5.629e-03 2.537e-04 22.190 < 2e-16 ***
estudante -7.246e-01 1.646e-01 -4.401 1.08e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2340.6 on 8000 degrees of freedom
Residual deviance: 1281.6 on 7998 degrees of freedom
AIC: 1287.6
Number of Fisher Scoring iterations: 8
(k <- log (nrow (conj_treino)))
mod3b <- stepAIC (mod2, k= k, trace= FALSE )
summary (mod3b)
Call:
glm(formula = inadimplente ~ balanco + estudante, family = binomial,
data = conj_treino)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.058e+01 4.027e-01 -26.264 < 2e-16 ***
balanco 5.629e-03 2.537e-04 22.190 < 2e-16 ***
estudante -7.246e-01 1.646e-01 -4.401 1.08e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2340.6 on 8000 degrees of freedom
Residual deviance: 1281.6 on 7998 degrees of freedom
AIC: 1287.6
Number of Fisher Scoring iterations: 8
Avaliando o modelo novamente
p_chapeu <- predict (mod3, newdata = conj_teste, type = "response" )
y_chapeu <- ifelse (p_chapeu > 0.5 , "Sim" , "Nao" ) %>% factor (levels = levels (conj_teste$ inadimplente))
confusionMatrix (y_chapeu, conj_teste$ inadimplente, positive= "Sim" )
Confusion Matrix and Statistics
Reference
Prediction Nao Sim
Nao 1927 43
Sim 6 23
Accuracy : 0.9755
95% CI : (0.9677, 0.9818)
No Information Rate : 0.967
P-Value [Acc > NIR] : 0.01625
Kappa : 0.4736
Mcnemar's Test P-Value : 2.706e-07
Sensitivity : 0.34848
Specificity : 0.99690
Pos Pred Value : 0.79310
Neg Pred Value : 0.97817
Prevalence : 0.03302
Detection Rate : 0.01151
Detection Prevalence : 0.01451
Balanced Accuracy : 0.67269
'Positive' Class : Sim
Mudando a probabilidade (limite) para aumentar a sensibilidade
p_chapeu <- predict (mod3, newdata = conj_teste, type = "response" )
y_chapeu <- ifelse (p_chapeu > 0.1 , "Sim" , "Nao" ) %>%
factor (levels = levels (conj_teste$ inadimplente))
confusionMatrix (y_chapeu, conj_teste$ inadimplente, positive= "Sim" )
Confusion Matrix and Statistics
Reference
Prediction Nao Sim
Nao 1823 15
Sim 110 51
Accuracy : 0.9375
95% CI : (0.9259, 0.9477)
No Information Rate : 0.967
P-Value [Acc > NIR] : 1
Kappa : 0.4223
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.77273
Specificity : 0.94309
Pos Pred Value : 0.31677
Neg Pred Value : 0.99184
Prevalence : 0.03302
Detection Rate : 0.02551
Detection Prevalence : 0.08054
Balanced Accuracy : 0.85791
'Positive' Class : Sim
AUC
# Area debaixo da curva
as.numeric (roc_log2$ auc)
Melhor limite
m_limite <- coords (roc_log2, "best" , ret = "threshold" )$ threshold
m_limite
p_chapeu <- predict (mod3, newdata = conj_teste, type = "response" )
y_chapeu <- ifelse (p_chapeu > m_limite, "Sim" , "Nao" ) %>%
factor (levels = levels (conj_teste$ inadimplente))
confusionMatrix (y_chapeu, conj_teste$ inadimplente, positive= "Sim" )
Confusion Matrix and Statistics
Reference
Prediction Nao Sim
Nao 1701 6
Sim 232 60
Accuracy : 0.8809
95% CI : (0.8659, 0.8948)
No Information Rate : 0.967
P-Value [Acc > NIR] : 1
Kappa : 0.2974
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.90909
Specificity : 0.87998
Pos Pred Value : 0.20548
Neg Pred Value : 0.99649
Prevalence : 0.03302
Detection Rate : 0.03002
Detection Prevalence : 0.14607
Balanced Accuracy : 0.89454
'Positive' Class : Sim
Duas ROCs juntas
plot (roc_log)
plot (roc_log2, add= TRUE , col= "blue" )
legend ("bottomright" , legend= c ("Mod 1" , "Mod2" ),
col= c (par ("fg" ), "blue" ), lwd= 2 )