Carregando Bibliotecas
default student balance income
No :9667 No :7056 Min. : 0.0 Min. : 772
Yes: 333 Yes:2944 1st Qu.: 481.7 1st Qu.:21340
Median : 823.6 Median :34553
Mean : 835.4 Mean :33517
3rd Qu.:1166.3 3rd Qu.:43808
Max. :2654.3 Max. :73554
'data.frame': 10000 obs. of 4 variables:
$ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
$ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
$ balance: num 730 817 1074 529 786 ...
$ income : num 44362 12106 31767 35704 38463 ...
default student balance income
1 No No 729.5265 44361.625
2 No Yes 817.1804 12106.135
3 No No 1073.5492 31767.139
4 No No 529.2506 35704.494
5 No No 785.6559 38463.496
6 No Yes 919.5885 7491.559
Manipulando os dados
default student balance income
No :9667 No :7056 Min. : 0.0 Min. : 772
Yes: 333 Yes:2944 1st Qu.: 481.7 1st Qu.:21340
Median : 823.6 Median :34553
Mean : 835.4 Mean :33517
3rd Qu.:1166.3 3rd Qu.:43808
Max. :2654.3 Max. :73554
# renomeando colunas
credito <- credito %>%
rename ( inadimplente = default , estudante = student , balanco = balance ,
receita = income )
credito <- credito %>% mutate ( inadimplente = case_when (
inadimplente == "No" ~ "Nao" ,
inadimplente == "Yes" ~ "Sim"
) ) %>% mutate ( inadimplente = factor ( inadimplente ) )
credito <- credito %>% mutate ( estudante = case_when (
estudante == "No" ~ 0 ,
estudante == "Yes" ~ 1
) )
str ( credito )
tibble [10,000 × 4] (S3: tbl_df/tbl/data.frame)
$ inadimplente: Factor w/ 2 levels "Nao","Sim": 1 1 1 1 1 1 1 1 1 1 ...
$ estudante : num [1:10000] 0 1 0 0 0 1 0 1 0 0 ...
$ balanco : num [1:10000] 730 817 1074 529 786 ...
$ receita : num [1:10000] 44362 12106 31767 35704 38463 ...
inadimplente estudante balanco receita
Nao:9667 Min. :0.0000 Min. : 0.0 Min. : 772
Sim: 333 1st Qu.:0.0000 1st Qu.: 481.7 1st Qu.:21340
Median :0.0000 Median : 823.6 Median :34553
Mean :0.2944 Mean : 835.4 Mean :33517
3rd Qu.:1.0000 3rd Qu.:1166.3 3rd Qu.:43808
Max. :1.0000 Max. :2654.3 Max. :73554
Treino e Teste
inadimplente estudante balanco receita
Nao:7734 Min. :0.0000 Min. : 0.0 Min. : 772
Sim: 267 1st Qu.:0.0000 1st Qu.: 479.5 1st Qu.:21309
Median :0.0000 Median : 825.9 Median :34468
Mean :0.2945 Mean : 835.9 Mean :33437
3rd Qu.:1.0000 3rd Qu.:1170.0 3rd Qu.:43706
Max. :1.0000 Max. :2654.3 Max. :72461
inadimplente estudante balanco receita
Nao:1933 Min. :0.0000 Min. : 0.0 Min. : 5083
Sim: 66 1st Qu.:0.0000 1st Qu.: 489.5 1st Qu.:21422
Median :0.0000 Median : 813.9 Median :35177
Mean :0.2941 Mean : 833.3 Mean :33837
3rd Qu.:1.0000 3rd Qu.:1143.4 3rd Qu.:44208
Max. :1.0000 Max. :2502.7 Max. :73554
Matriz de dispersão
library ( psych )
pairs.panels ( conj_treino ,
method = "pearson" , # metodo de correlação
hist.col = "#00AFBB" ,
density = TRUE , # mostra graficos de densidade
ellipses = FALSE # mostra elipses de correlação
)
Avaliando o comportamento das variáveis em função do status (inadimplente / estudante)
library ( patchwork )
p1 <- ggplot ( credito , aes ( x= inadimplente , y= balanco , color= inadimplente ) ) +
geom_boxplot ( )
p2 <- ggplot ( credito , aes ( x= inadimplente , y= receita , color= inadimplente ) ) +
geom_boxplot ( )
p3 <- ggplot ( credito , aes ( x= as.factor ( estudante ) , y= balanco , color= as.factor ( estudante ) ) ) +
geom_boxplot ( )
p4 <- ggplot ( credito , aes ( x= as.factor ( estudante ) , y= receita , color= as.factor ( estudante ) ) ) +
geom_boxplot ( )
( p1 + p2 ) / ( p3 + p4 )
Avaliando comportamento
# A tibble: 1 × 1
prop
<dbl>
1 0.0333
# A tibble: 1 × 1
valor
<dbl>
1 1748.
quantis <- quantile ( credito $ balanco , probs = c ( .1 ,.25 , .50 , .75 , .9 , .95 , 0.97 , 0.99 ) )
quantis
10% 25% 50% 75% 90% 95% 97% 99%
180.5753 481.7311 823.6370 1166.3084 1471.6253 1665.9626 1793.2910 2008.4709
credito %>%
mutate ( grupo_balanco = case_when (
balanco <= quantis [ 1 ] ~ quantis [ 1 ] ,
balanco > quantis [ 1 ] & balanco <= quantis [ 2 ] ~ quantis [ 2 ] ,
balanco > quantis [ 2 ] & balanco <= quantis [ 3 ] ~ quantis [ 3 ] ,
balanco > quantis [ 3 ] & balanco <= quantis [ 4 ] ~ quantis [ 4 ] ,
balanco > quantis [ 4 ] & balanco <= quantis [ 5 ] ~ quantis [ 5 ] ,
balanco > quantis [ 5 ] & balanco <= quantis [ 6 ] ~ quantis [ 6 ] ,
balanco > quantis [ 6 ] & balanco <= quantis [ 7 ] ~ quantis [ 7 ] ,
balanco > quantis [ 7 ] ~ quantis [ 8 ] ) ) %>%
group_by ( grupo_balanco ) %>%
summarize ( prop = mean ( inadimplente == "Sim" ) ) %>%
ggplot ( aes ( grupo_balanco , prop ) ) +
geom_point ( ) +
geom_line ( )
1a Regressão logística: só balanço
mod1 <- glm ( inadimplente ~ balanco ,data= conj_treino ,family= binomial )
summary ( mod1 )
Call:
glm(formula = inadimplente ~ balanco, family = binomial, data = conj_treino)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.047e+01 3.931e-01 -26.64 <2e-16 ***
balanco 5.386e-03 2.405e-04 22.40 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2340.6 on 8000 degrees of freedom
Residual deviance: 1302.1 on 7999 degrees of freedom
AIC: 1306.1
Number of Fisher Scoring iterations: 8
(Intercept) balanco
-10.471543192 0.005385567
Estimate Std. Error z value Pr(>|z|)
(Intercept) -10.471543192 0.3931122212 -26.63754 2.495385e-156
balanco 0.005385567 0.0002404507 22.39780 4.134795e-111
Avaliando o modelo
p_chapeu <- predict ( mod1 , newdata = conj_teste , type = "response" )
y_chapeu <- ifelse ( p_chapeu > 0.5 , "Sim" , "Nao" ) %>% factor ( levels = levels ( conj_teste $ inadimplente ) )
confusionMatrix ( y_chapeu , conj_teste $ inadimplente , positive= "Sim" )
Confusion Matrix and Statistics
Reference
Prediction Nao Sim
Nao 1929 45
Sim 4 21
Accuracy : 0.9755
95% CI : (0.9677, 0.9818)
No Information Rate : 0.967
P-Value [Acc > NIR] : 0.01625
Kappa : 0.4516
Mcnemar's Test P-Value : 1.102e-08
Sensitivity : 0.31818
Specificity : 0.99793
Pos Pred Value : 0.84000
Neg Pred Value : 0.97720
Prevalence : 0.03302
Detection Rate : 0.01051
Detection Prevalence : 0.01251
Balanced Accuracy : 0.65806
'Positive' Class : Sim
Veja as probabilidade de inadimplencia para balanços de 1000, 2000 e 3000
1 2 3
0.006144857 0.574342575 0.996615498
Curva S
inadimpl <- as.numeric ( conj_treino $ inadimplente ) - 1
plot ( inadimpl ~ balanco , data = conj_treino ,
col = "darkorange" , pch = "|" , ylim = c ( 0 , 1 ) ,
main = "Regressão Logistica - Classificacão" )
abline ( h = 0 , lty = 3 )
abline ( h = 1 , lty = 3 )
abline ( h = 0.5 , lty = 2 )
curve ( predict ( mod1 , data.frame ( balanco = x ) ,
type = "response" ) , add = TRUE , lwd = 3 , col = "dodgerblue" )
abline ( v = - coef ( mod1 ) [ 1 ] / coef ( mod1 ) [ 2 ] , lwd = 2 )
2a Regressão logística: todas as variáveis
mod2 <- glm ( inadimplente ~ balanco + receita + estudante ,data= conj_treino ,family= binomial )
summary ( mod2 )
Call:
glm(formula = inadimplente ~ balanco + receita + estudante, family = binomial,
data = conj_treino)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.075e+01 5.417e-01 -19.847 <2e-16 ***
balanco 5.628e-03 2.538e-04 22.180 <2e-16 ***
receita 4.416e-06 9.088e-06 0.486 0.6270
estudante -6.264e-01 2.612e-01 -2.398 0.0165 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2340.6 on 8000 degrees of freedom
Residual deviance: 1281.3 on 7997 degrees of freedom
AIC: 1289.3
Number of Fisher Scoring iterations: 8
(Intercept) balanco receita estudante
-1.075144e+01 5.628328e-03 4.415834e-06 -6.263630e-01
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.075144e+01 5.417102e-01 -19.8472260 1.164551e-87
balanco 5.628328e-03 2.537535e-04 22.1802958 5.322786e-109
receita 4.415834e-06 9.088132e-06 0.4858902 6.270450e-01
estudante -6.263630e-01 2.611624e-01 -2.3983656 1.646842e-02
É possível se ver que receita não é significativa
3a Regressão Logística (sem receita)
mod3 <- glm ( inadimplente ~ balanco + estudante ,data= conj_treino ,family= binomial )
summary ( mod3 )
Call:
glm(formula = inadimplente ~ balanco + estudante, family = binomial,
data = conj_treino)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.058e+01 4.027e-01 -26.264 < 2e-16 ***
balanco 5.629e-03 2.537e-04 22.190 < 2e-16 ***
estudante -7.246e-01 1.646e-01 -4.401 1.08e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2340.6 on 8000 degrees of freedom
Residual deviance: 1281.6 on 7998 degrees of freedom
AIC: 1287.6
Number of Fisher Scoring iterations: 8
(Intercept) balanco estudante
-10.577195871 0.005629461 -0.724574801
Estimate Std. Error z value Pr(>|z|)
(Intercept) -10.577195871 0.4027279496 -26.26387 4.962858e-152
balanco 0.005629461 0.0002536895 22.19035 4.256243e-109
estudante -0.724574801 0.1646214717 -4.40146 1.075250e-05
Comparando os modelos
anova ( mod2 ,mod3 ,test= 'LR' )
Analysis of Deviance Table
Model 1: inadimplente ~ balanco + receita + estudante
Model 2: inadimplente ~ balanco + estudante
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 7997 1281.3
2 7998 1281.6 -1 -0.23621 0.627
StepAIC
Ao invé de usarmos a estatística de Wald para selecionar as variáveis significativas, podemos usar o AIC (equivalente ao Cp ) como usamos na regressão múltipla para selecionar as variáveis explicativas.
A função stepAIC tem um parametro k que define se vamos usar o AIC ou o BIC para fazer a seleção. Quando k=2 temos o AIC e quando k=log(n) temos o BIC.
Call:
glm(formula = inadimplente ~ balanco + estudante, family = binomial,
data = conj_treino)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.058e+01 4.027e-01 -26.264 < 2e-16 ***
balanco 5.629e-03 2.537e-04 22.190 < 2e-16 ***
estudante -7.246e-01 1.646e-01 -4.401 1.08e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2340.6 on 8000 degrees of freedom
Residual deviance: 1281.6 on 7998 degrees of freedom
AIC: 1287.6
Number of Fisher Scoring iterations: 8
Call:
glm(formula = inadimplente ~ balanco + estudante, family = binomial,
data = conj_treino)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.058e+01 4.027e-01 -26.264 < 2e-16 ***
balanco 5.629e-03 2.537e-04 22.190 < 2e-16 ***
estudante -7.246e-01 1.646e-01 -4.401 1.08e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2340.6 on 8000 degrees of freedom
Residual deviance: 1281.6 on 7998 degrees of freedom
AIC: 1287.6
Number of Fisher Scoring iterations: 8
Avaliando o modelo novamente
p_chapeu <- predict ( mod3 , newdata = conj_teste , type = "response" )
y_chapeu <- ifelse ( p_chapeu > 0.5 , "Sim" , "Nao" ) %>% factor ( levels = levels ( conj_teste $ inadimplente ) )
confusionMatrix ( y_chapeu , conj_teste $ inadimplente , positive= "Sim" )
Confusion Matrix and Statistics
Reference
Prediction Nao Sim
Nao 1927 43
Sim 6 23
Accuracy : 0.9755
95% CI : (0.9677, 0.9818)
No Information Rate : 0.967
P-Value [Acc > NIR] : 0.01625
Kappa : 0.4736
Mcnemar's Test P-Value : 2.706e-07
Sensitivity : 0.34848
Specificity : 0.99690
Pos Pred Value : 0.79310
Neg Pred Value : 0.97817
Prevalence : 0.03302
Detection Rate : 0.01151
Detection Prevalence : 0.01451
Balanced Accuracy : 0.67269
'Positive' Class : Sim
Mudando a probabilidade (limite) para aumentar a sensibilidade
p_chapeu <- predict ( mod3 , newdata = conj_teste , type = "response" )
y_chapeu <- ifelse ( p_chapeu > 0.1 , "Sim" , "Nao" ) %>%
factor ( levels = levels ( conj_teste $ inadimplente ) )
confusionMatrix ( y_chapeu , conj_teste $ inadimplente , positive= "Sim" )
Confusion Matrix and Statistics
Reference
Prediction Nao Sim
Nao 1823 15
Sim 110 51
Accuracy : 0.9375
95% CI : (0.9259, 0.9477)
No Information Rate : 0.967
P-Value [Acc > NIR] : 1
Kappa : 0.4223
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.77273
Specificity : 0.94309
Pos Pred Value : 0.31677
Neg Pred Value : 0.99184
Prevalence : 0.03302
Detection Rate : 0.02551
Detection Prevalence : 0.08054
Balanced Accuracy : 0.85791
'Positive' Class : Sim
Melhor limite
m_limite <- coords ( roc_log2 , "best" , ret = "threshold" ) $ threshold
m_limite
p_chapeu <- predict ( mod3 , newdata = conj_teste , type = "response" )
y_chapeu <- ifelse ( p_chapeu > m_limite , "Sim" , "Nao" ) %>%
factor ( levels = levels ( conj_teste $ inadimplente ) )
confusionMatrix ( y_chapeu , conj_teste $ inadimplente , positive= "Sim" )
Confusion Matrix and Statistics
Reference
Prediction Nao Sim
Nao 1701 6
Sim 232 60
Accuracy : 0.8809
95% CI : (0.8659, 0.8948)
No Information Rate : 0.967
P-Value [Acc > NIR] : 1
Kappa : 0.2974
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.90909
Specificity : 0.87998
Pos Pred Value : 0.20548
Neg Pred Value : 0.99649
Prevalence : 0.03302
Detection Rate : 0.03002
Detection Prevalence : 0.14607
Balanced Accuracy : 0.89454
'Positive' Class : Sim
Duas ROCs juntas
plot ( roc_log )
plot ( roc_log2 , add= TRUE , col= "blue" )
legend ( "bottomright" , legend= c ( "Mod 1" , "Mod2" ) ,
col= c ( par ( "fg" ) , "blue" ) , lwd= 2 )