Carregando Bibliotecas
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ dplyr 1.1.4 ✔ readr 2.1.5
#> ✔ forcats 1.0.0 ✔ stringr 1.5.1
#> ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
#> ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
#> ✔ purrr 1.0.4
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag() masks stats::lag()
#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#> default student balance income
#> No :9667 No :7056 Min. : 0.0 Min. : 772
#> Yes: 333 Yes:2944 1st Qu.: 481.7 1st Qu.:21340
#> Median : 823.6 Median :34553
#> Mean : 835.4 Mean :33517
#> 3rd Qu.:1166.3 3rd Qu.:43808
#> Max. :2654.3 Max. :73554
#> 'data.frame': 10000 obs. of 4 variables:
#> $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
#> $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
#> $ balance: num 730 817 1074 529 786 ...
#> $ income : num 44362 12106 31767 35704 38463 ...
#> default student balance income
#> 1 No No 729.5265 44361.625
#> 2 No Yes 817.1804 12106.135
#> 3 No No 1073.5492 31767.139
#> 4 No No 529.2506 35704.494
#> 5 No No 785.6559 38463.496
#> 6 No Yes 919.5885 7491.559
Manipulando os dados
#> default student balance income
#> No :9667 No :7056 Min. : 0.0 Min. : 772
#> Yes: 333 Yes:2944 1st Qu.: 481.7 1st Qu.:21340
#> Median : 823.6 Median :34553
#> Mean : 835.4 Mean :33517
#> 3rd Qu.:1166.3 3rd Qu.:43808
#> Max. :2654.3 Max. :73554
# renomeando colunas
credito <- credito %>%
rename ( inadimplente = default , estudante = student , balanco = balance ,
receita = income )
credito <- credito %>% mutate ( inadimplente = case_when (
inadimplente == "No" ~ "Nao" ,
inadimplente == "Yes" ~ "Sim"
) ) %>% mutate ( inadimplente = factor ( inadimplente ) )
credito <- credito %>% mutate ( estudante = case_when (
estudante == "No" ~ 0 ,
estudante == "Yes" ~ 1
) )
str ( credito )
#> tibble [10,000 × 4] (S3: tbl_df/tbl/data.frame)
#> $ inadimplente: Factor w/ 2 levels "Nao","Sim": 1 1 1 1 1 1 1 1 1 1 ...
#> $ estudante : num [1:10000] 0 1 0 0 0 1 0 1 0 0 ...
#> $ balanco : num [1:10000] 730 817 1074 529 786 ...
#> $ receita : num [1:10000] 44362 12106 31767 35704 38463 ...
#> inadimplente estudante balanco receita
#> Nao:9667 Min. :0.0000 Min. : 0.0 Min. : 772
#> Sim: 333 1st Qu.:0.0000 1st Qu.: 481.7 1st Qu.:21340
#> Median :0.0000 Median : 823.6 Median :34553
#> Mean :0.2944 Mean : 835.4 Mean :33517
#> 3rd Qu.:1.0000 3rd Qu.:1166.3 3rd Qu.:43808
#> Max. :1.0000 Max. :2654.3 Max. :73554
Treino e Teste
#> Carregando pacotes exigidos: lattice
#>
#> Anexando pacote: 'caret'
#> O seguinte objeto é mascarado por 'package:purrr':
#>
#> lift
set.seed ( 2024 )
y <- credito $ inadimplente
indice_teste <- createDataPartition ( y , times = 1 , p = 0.2 , list = FALSE )
conj_treino <- credito [ - indice_teste ,]
conj_teste <- credito [ indice_teste ,]
summary ( conj_treino )
#> inadimplente estudante balanco receita
#> Nao:7733 Min. :0.0000 Min. : 0.0 Min. : 772
#> Sim: 266 1st Qu.:0.0000 1st Qu.: 481.8 1st Qu.:21536
#> Median :0.0000 Median : 818.0 Median :34690
#> Mean :0.2929 Mean : 833.4 Mean :33610
#> 3rd Qu.:1.0000 3rd Qu.:1164.6 3rd Qu.:43850
#> Max. :1.0000 Max. :2654.3 Max. :73554
#> inadimplente estudante balanco receita
#> Nao:1934 Min. :0.0000 Min. : 0.0 Min. : 1498
#> Sim: 67 1st Qu.:0.0000 1st Qu.: 481.6 1st Qu.:20655
#> Median :0.0000 Median : 840.1 Median :33812
#> Mean :0.3003 Mean : 843.1 Mean :33144
#> 3rd Qu.:1.0000 3rd Qu.:1171.4 3rd Qu.:43593
#> Max. :1.0000 Max. :2578.5 Max. :68722
Balanço e receita
featurePlot ( x = conj_treino [ , c ( "balanco" , "receita" , "estudante" ) ] ,
y = conj_treino $ inadimplente ,
plot = "density" ,
scales = list ( x = list ( relation = "free" ) ,
y = list ( relation = "free" ) ) ,
adjust = 1.5 ,
pch = "|" ,
layout = c ( 2 , 1 ) ,
auto.key = list ( columns = 2 ) )
Avaliando o comportamento das variáveis em função do status (inadimplente / estudante)
library ( patchwork )
p1 <- ggplot ( credito , aes ( x= inadimplente , y= balanco , color= inadimplente ) ) +
geom_boxplot ( )
p2 <- ggplot ( credito , aes ( x= inadimplente , y= receita , color= inadimplente ) ) +
geom_boxplot ( )
p3 <- ggplot ( credito , aes ( x= as.factor ( estudante ) , y= balanco , color= as.factor ( estudante ) ) ) +
geom_boxplot ( )
p4 <- ggplot ( credito , aes ( x= as.factor ( estudante ) , y= receita , color= as.factor ( estudante ) ) ) +
geom_boxplot ( )
( p1 + p2 ) / ( p3 + p4 )
Calcula Erro
# Este valor é igual a 1 - Accuracy da matriz de confusão
calc_erro_class <- function ( real , previsto ) {
mean ( real != previsto )
}
Treino e Teste Normalizado
library ( caret )
set.seed ( 2024 )
# Normalizando os dados
credito <- credito %>% mutate ( balanco = scale ( balanco ) , receita = scale ( receita ) )
y <- credito $ inadimplente
indice_teste <- createDataPartition ( y , times = 1 , p = 0.2 , list = FALSE )
conj_treino <- credito [ - indice_teste ,]
conj_teste <- credito [ indice_teste ,]
summary ( conj_treino )
#> inadimplente estudante balanco.V1
#> Nao:7733 Min. :0.0000 Min. :-1.72699815214000
#> Sim: 266 1st Qu.:0.0000 1st Qu.:-0.73093769169700
#> Median :0.0000 Median :-0.03595597796600
#> Mean :0.2929 Mean :-0.00401639039487
#> 3rd Qu.:1.0000 3rd Qu.: 0.68056023172100
#> Max. :1.0000 Max. : 3.76037076853000
#> receita.V1
#> Min. :-2.45526723525000
#> 1st Qu.:-0.89836473917200
#> Median : 0.08796491781240
#> Mean : 0.00699812328583
#> 3rd Qu.: 0.77481662870600
#> Max. : 3.00204946164000
#> inadimplente estudante balanco.V1
#> Nao:1934 Min. :0.0000 Min. :-1.72699815214000
#> Sim: 67 1st Qu.:0.0000 1st Qu.:-0.73139456297700
#> Median :0.0000 Median : 0.00984385369764
#> Mean :0.3003 Mean : 0.01605552562150
#> 3rd Qu.:1.0000 3rd Qu.: 0.69476143173600
#> Max. :1.0000 Max. : 3.60355620415000
#> receita.V1
#> Min. :-2.4008112726600
#> 1st Qu.:-0.9643945022460
#> Median : 0.0220960712686
#> Mean :-0.0279750065784
#> 3rd Qu.: 0.7555020572170
#> Max. : 2.6397198805100
LDA
#>
#> Anexando pacote: 'MASS'
#> O seguinte objeto é mascarado por 'package:patchwork':
#>
#> area
#> O seguinte objeto é mascarado por 'package:dplyr':
#>
#> select
treina_lda <- lda ( inadimplente ~ balanco + estudante + receita , data = conj_treino )
treina_lda
#> Call:
#> lda(inadimplente ~ balanco + estudante + receita, data = conj_treino)
#>
#> Prior probabilities of groups:
#> Nao Sim
#> 0.96674584 0.03325416
#>
#> Group means:
#> balanco estudante receita
#> Nao -0.06884968 0.2903142 0.008870876
#> Sim 1.88077996 0.3684211 -0.047445476
#>
#> Coefficients of linear discriminants:
#> LD1
#> balanco 1.09199528
#> estudante -0.14668654
#> receita 0.07531296
#> [1] "class" "posterior" "x"
y_chapeu <- predict ( treina_lda , conj_teste ) $ class %>%
factor ( levels = levels ( conj_teste $ inadimplente ) )
confusionMatrix ( data = y_chapeu , reference = conj_teste $ inadimplente , positive= "Sim" )
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction Nao Sim
#> Nao 1926 48
#> Sim 8 19
#>
#> Accuracy : 0.972
#> 95% CI : (0.9638, 0.9788)
#> No Information Rate : 0.9665
#> P-Value [Acc > NIR] : 0.09339
#>
#> Kappa : 0.3926
#>
#> Mcnemar's Test P-Value : 1.872e-07
#>
#> Sensitivity : 0.283582
#> Specificity : 0.995863
#> Pos Pred Value : 0.703704
#> Neg Pred Value : 0.975684
#> Prevalence : 0.033483
#> Detection Rate : 0.009495
#> Detection Prevalence : 0.013493
#> Balanced Accuracy : 0.639723
#>
#> 'Positive' Class : Sim
#>
# Este valor é igual a 1 - Accuracy da matriz de confusão
calc_erro_class ( conj_teste $ inadimplente , y_chapeu )
LDA - Ajustando probabilidade limite
p_chapeu <- predict ( treina_lda , conj_teste ) $ posterior
head ( p_chapeu )
#> Nao Sim
#> 1 0.9857954 0.0142045675
#> 2 0.9981836 0.0018164269
#> 3 0.9180153 0.0819846555
#> 4 0.9988081 0.0011919412
#> 5 0.9519939 0.0480061272
#> 6 0.9998932 0.0001067956
y_chapeu <- ifelse ( p_chapeu [ , 2 ] > 0.11 , "Sim" , "Nao" ) %>%
factor ( levels = levels ( conj_teste $ inadimplente ) )
confusionMatrix ( data = y_chapeu , reference = conj_teste $ inadimplente , positive= "Sim" )
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction Nao Sim
#> Nao 1815 19
#> Sim 119 48
#>
#> Accuracy : 0.931
#> 95% CI : (0.919, 0.9417)
#> No Information Rate : 0.9665
#> P-Value [Acc > NIR] : 1
#>
#> Kappa : 0.3807
#>
#> Mcnemar's Test P-Value : <2e-16
#>
#> Sensitivity : 0.71642
#> Specificity : 0.93847
#> Pos Pred Value : 0.28743
#> Neg Pred Value : 0.98964
#> Prevalence : 0.03348
#> Detection Rate : 0.02399
#> Detection Prevalence : 0.08346
#> Balanced Accuracy : 0.82744
#>
#> 'Positive' Class : Sim
#>
# Este valor é igual a 1 - Accuracy da matriz de confusão
calc_erro_class ( conj_teste $ inadimplente , y_chapeu )
Seleção de variáveis
No LDA, a seleção de variáveis pode ser feita com o RFE (Recursive Feature Elimination). O RFE é um método de seleção de variáveis que utiliza a validação cruzada para avaliar o desempenho do modelo com diferentes subconjuntos de variáveis. O RFE é implementado na função rfe()
do pacote caret
.
# Usar o RFE para selecionar as variáveis
# Definir controle para RFE
control <- rfeControl ( functions = ldaFuncs , method = "cv" , number = 10 )
# Aplicar o RFE
set.seed ( 2024 )
result <- rfe ( conj_treino [ , 2 : 4 ] , conj_treino $ inadimplente , sizes = c ( 1 : 3 ) , rfeControl = control )
# Resultados
print ( result )
#>
#> Recursive feature selection
#>
#> Outer resampling method: Cross-Validated (10 fold)
#>
#> Resampling performance over subset size:
#>
#> Variables Accuracy Kappa AccuracySD KappaSD Selected
#> 1 0.9724 0.3451 0.003761 0.12676
#> 2 0.9729 0.3562 0.003398 0.10548
#> 3 0.9735 0.3804 0.002942 0.09046 *
#>
#> The top 3 variables (out of 3):
#> balanco, estudante, receita
Outra Opção
Podemos usar os gráfico exploratórios iniciais e também o resultado da regressão logística como ponto de partida para a seleção de variáveis.
treina_lda2 <- lda ( inadimplente ~ balanco + estudante , data = conj_treino )
treina_lda2
#> Call:
#> lda(inadimplente ~ balanco + estudante, data = conj_treino)
#>
#> Prior probabilities of groups:
#> Nao Sim
#> 0.96674584 0.03325416
#>
#> Group means:
#> balanco estudante
#> Nao -0.06884968 0.2903142
#> Sim 1.88077996 0.3684211
#>
#> Coefficients of linear discriminants:
#> LD1
#> balanco 1.0935003
#> estudante -0.2716957
y_chapeu <- predict ( treina_lda2 , conj_teste ) $ class %>%
factor ( levels = levels ( conj_teste $ inadimplente ) )
confusionMatrix ( data = y_chapeu , reference = conj_teste $ inadimplente , positive= "Sim" )
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction Nao Sim
#> Nao 1926 48
#> Sim 8 19
#>
#> Accuracy : 0.972
#> 95% CI : (0.9638, 0.9788)
#> No Information Rate : 0.9665
#> P-Value [Acc > NIR] : 0.09339
#>
#> Kappa : 0.3926
#>
#> Mcnemar's Test P-Value : 1.872e-07
#>
#> Sensitivity : 0.283582
#> Specificity : 0.995863
#> Pos Pred Value : 0.703704
#> Neg Pred Value : 0.975684
#> Prevalence : 0.033483
#> Detection Rate : 0.009495
#> Detection Prevalence : 0.013493
#> Balanced Accuracy : 0.639723
#>
#> 'Positive' Class : Sim
#>
# Este valor é igual a 1 - Accuracy da matriz de confusão
calc_erro_class ( conj_teste $ inadimplente , y_chapeu )
Podemos observar que não houve mudança nos resultados ao retirar a variável receita.
QDA
treina_qda <- qda ( inadimplente ~ balanco + estudante + receita , data = conj_treino )
treina_qda
#> Call:
#> qda(inadimplente ~ balanco + estudante + receita, data = conj_treino)
#>
#> Prior probabilities of groups:
#> Nao Sim
#> 0.96674584 0.03325416
#>
#> Group means:
#> balanco estudante receita
#> Nao -0.06884968 0.2903142 0.008870876
#> Sim 1.88077996 0.3684211 -0.047445476
y_chapeu <- predict ( treina_qda , conj_teste ) $ class %>%
factor ( levels = levels ( conj_teste $ inadimplente ) )
confusionMatrix ( data = y_chapeu , reference = conj_teste $ inadimplente , positive= "Sim" )
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction Nao Sim
#> Nao 1922 47
#> Sim 12 20
#>
#> Accuracy : 0.9705
#> 95% CI : (0.9621, 0.9775)
#> No Information Rate : 0.9665
#> P-Value [Acc > NIR] : 0.1762
#>
#> Kappa : 0.3909
#>
#> Mcnemar's Test P-Value : 9.581e-06
#>
#> Sensitivity : 0.298507
#> Specificity : 0.993795
#> Pos Pred Value : 0.625000
#> Neg Pred Value : 0.976130
#> Prevalence : 0.033483
#> Detection Rate : 0.009995
#> Detection Prevalence : 0.015992
#> Balanced Accuracy : 0.646151
#>
#> 'Positive' Class : Sim
#>
QDA - Ajustando probabilidade limite
p_chapeu <- predict ( treina_qda , conj_teste ) $ posterior
head ( p_chapeu )
#> Nao Sim
#> 1 0.9904625 9.537473e-03
#> 2 0.9997798 2.202256e-04
#> 3 0.9062854 9.371459e-02
#> 4 0.9999365 6.346083e-05
#> 5 0.9526017 4.739833e-02
#> 6 0.9999996 3.588934e-07
y_chapeu <- ifelse ( p_chapeu [ , 2 ] > 0.11 , "Sim" , "Nao" ) %>%
factor ( levels = levels ( conj_teste $ inadimplente ) )
confusionMatrix ( data = y_chapeu , reference = conj_teste $ inadimplente , positive= "Sim" )
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction Nao Sim
#> Nao 1803 17
#> Sim 131 50
#>
#> Accuracy : 0.926
#> 95% CI : (0.9137, 0.9371)
#> No Information Rate : 0.9665
#> P-Value [Acc > NIR] : 1
#>
#> Kappa : 0.3726
#>
#> Mcnemar's Test P-Value : <2e-16
#>
#> Sensitivity : 0.74627
#> Specificity : 0.93226
#> Pos Pred Value : 0.27624
#> Neg Pred Value : 0.99066
#> Prevalence : 0.03348
#> Detection Rate : 0.02499
#> Detection Prevalence : 0.09045
#> Balanced Accuracy : 0.83927
#>
#> 'Positive' Class : Sim
#>
Curva ROC
library ( pROC )
# KNN
set.seed ( 21 )
ctrl <- trainControl ( method = "cv" )
treina_knn <- train ( inadimplente ~ balanco + estudante , method = "knn" , trControl= ctrl , preProcess= c ( "center" , "scale" ) , tuneGrid = data.frame ( k = seq ( 21 ,140 , by= 4 ) ) , data = conj_treino )
prev_knn <- predict ( treina_knn , conj_teste ,type = "prob" )
# Reg Log
mod2 <- glm ( inadimplente ~ balanco + estudante ,data= conj_treino ,family= binomial )
p_chapeu_log <- predict ( mod2 , newdata = conj_teste , type = "response" )
# LDA e QDA
p_chapeu_lda <- predict ( treina_lda , conj_teste ) $ posterior
p_chapeu_qda <- predict ( treina_qda , conj_teste ) $ posterior
roc_log <- roc ( conj_teste $ inadimplente ~ p_chapeu_log , plot = TRUE , print.auc= FALSE ,
col= "black" , legacy.axes= TRUE )
roc_lda <- roc ( conj_teste $ inadimplente ~ p_chapeu_lda [ ,2 ] , plot = TRUE , print.auc= FALSE , col= "green" , legacy.axes= TRUE , add= TRUE )
roc_qda <- roc ( conj_teste $ inadimplente ~ p_chapeu_qda [ ,2 ] , plot = TRUE , print.auc= FALSE , col= "blue" , legacy.axes= TRUE , add= TRUE )
roc_knn1 <- roc ( conj_teste $ inadimplente ~ prev_knn [ ,2 ] , plot = TRUE , print.auc= FALSE , col= "red" , legacy.axes= TRUE , add= TRUE )
legend ( "bottomright" ,legend= c ( "Reg Log" ,"LDA" ,"QDA" , "KNN" ) ,
col= c ( "black" , "green" ,"blue" , "red" ) ,lwd= 4 )