LDA e QDA

Author

Ricardo Accioly

Published

August 20, 2024

LDA e QDA

Carregando Bibliotecas

#>  default    student       balance           income     
#>  No :9667   No :7056   Min.   :   0.0   Min.   :  772  
#>  Yes: 333   Yes:2944   1st Qu.: 481.7   1st Qu.:21340  
#>                        Median : 823.6   Median :34553  
#>                        Mean   : 835.4   Mean   :33517  
#>                        3rd Qu.:1166.3   3rd Qu.:43808  
#>                        Max.   :2654.3   Max.   :73554
str(Default)
#> 'data.frame':    10000 obs. of  4 variables:
#>  $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
#>  $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
#>  $ balance: num  730 817 1074 529 786 ...
#>  $ income : num  44362 12106 31767 35704 38463 ...
head(Default)
#>   default student   balance    income
#> 1      No      No  729.5265 44361.625
#> 2      No     Yes  817.1804 12106.135
#> 3      No      No 1073.5492 31767.139
#> 4      No      No  529.2506 35704.494
#> 5      No      No  785.6559 38463.496
#> 6      No     Yes  919.5885  7491.559

Manipulando os dados

credito <- tibble(Default)
summary(credito)
#>  default    student       balance           income     
#>  No :9667   No :7056   Min.   :   0.0   Min.   :  772  
#>  Yes: 333   Yes:2944   1st Qu.: 481.7   1st Qu.:21340  
#>                        Median : 823.6   Median :34553  
#>                        Mean   : 835.4   Mean   :33517  
#>                        3rd Qu.:1166.3   3rd Qu.:43808  
#>                        Max.   :2654.3   Max.   :73554
# renomeando colunas
credito <- credito %>% 
                rename( inadimplente = default, estudante = student, balanco = balance,
                receita = income)
credito <- credito %>% mutate( inadimplente =  case_when(
                           inadimplente == "No"  ~ "Nao",
                           inadimplente == "Yes" ~ "Sim"
                          )) %>% mutate(inadimplente = factor(inadimplente))
credito <- credito %>% mutate( estudante =  case_when(
                           estudante == "No"  ~ 0,
                           estudante == "Yes" ~ 1
                          )) 

str(credito)
#> tibble [10,000 × 4] (S3: tbl_df/tbl/data.frame)
#>  $ inadimplente: Factor w/ 2 levels "Nao","Sim": 1 1 1 1 1 1 1 1 1 1 ...
#>  $ estudante   : num [1:10000] 0 1 0 0 0 1 0 1 0 0 ...
#>  $ balanco     : num [1:10000] 730 817 1074 529 786 ...
#>  $ receita     : num [1:10000] 44362 12106 31767 35704 38463 ...
summary(credito)
#>  inadimplente   estudante         balanco          receita     
#>  Nao:9667     Min.   :0.0000   Min.   :   0.0   Min.   :  772  
#>  Sim: 333     1st Qu.:0.0000   1st Qu.: 481.7   1st Qu.:21340  
#>               Median :0.0000   Median : 823.6   Median :34553  
#>               Mean   :0.2944   Mean   : 835.4   Mean   :33517  
#>               3rd Qu.:1.0000   3rd Qu.:1166.3   3rd Qu.:43808  
#>               Max.   :1.0000   Max.   :2654.3   Max.   :73554

Treino e Teste

library(caret)
set.seed(2024)
y <- credito$inadimplente
indice_teste <- createDataPartition(y, times = 1, p = 0.2, list = FALSE)

conj_treino <- credito[-indice_teste,]
conj_teste <- credito[indice_teste,]

summary(conj_treino)
#>  inadimplente   estudante         balanco          receita     
#>  Nao:7733     Min.   :0.0000   Min.   :   0.0   Min.   :  772  
#>  Sim: 266     1st Qu.:0.0000   1st Qu.: 481.8   1st Qu.:21536  
#>               Median :0.0000   Median : 818.0   Median :34690  
#>               Mean   :0.2929   Mean   : 833.4   Mean   :33610  
#>               3rd Qu.:1.0000   3rd Qu.:1164.6   3rd Qu.:43850  
#>               Max.   :1.0000   Max.   :2654.3   Max.   :73554
summary(conj_teste)
#>  inadimplente   estudante         balanco          receita     
#>  Nao:1934     Min.   :0.0000   Min.   :   0.0   Min.   : 1498  
#>  Sim:  67     1st Qu.:0.0000   1st Qu.: 481.6   1st Qu.:20655  
#>               Median :0.0000   Median : 840.1   Median :33812  
#>               Mean   :0.3003   Mean   : 843.1   Mean   :33144  
#>               3rd Qu.:1.0000   3rd Qu.:1171.4   3rd Qu.:43593  
#>               Max.   :1.0000   Max.   :2578.5   Max.   :68722

Balanço e receita

featurePlot(x = conj_treino[, c("balanco", "receita", "estudante")], 
            y = conj_treino$inadimplente,
            plot = "density", 
            scales = list(x = list(relation = "free"), 
                          y = list(relation = "free")), 
            adjust = 1.5, 
            pch = "|", 
            layout = c(2, 1), 
            auto.key = list(columns = 2))

Avaliando o comportamento das variáveis em função do status (inadimplente / estudante)

library(patchwork)
p1 <- ggplot(credito, aes(x=inadimplente, y=balanco, color=inadimplente)) +
  geom_boxplot()
p2 <- ggplot(credito, aes(x=inadimplente, y=receita, color=inadimplente)) +
  geom_boxplot()
p3 <- ggplot(credito, aes(x=as.factor(estudante), y=balanco, color=as.factor(estudante))) +
  geom_boxplot()
p4 <- ggplot(credito, aes(x=as.factor(estudante), y=receita, color=as.factor(estudante))) +
  geom_boxplot()
(p1 + p2) / (p3 + p4)

Calcula Erro

# Este valor é igual a 1 - Accuracy da matriz de confusão
calc_erro_class <- function(real, previsto) {
  mean(real != previsto)
}

LDA

#> 
#> Anexando pacote: 'MASS'
#> O seguinte objeto é mascarado por 'package:patchwork':
#> 
#>     area
#> O seguinte objeto é mascarado por 'package:dplyr':
#> 
#>     select
treina_lda <- lda(inadimplente ~ balanco + estudante + receita, data = conj_treino)
treina_lda
#> Call:
#> lda(inadimplente ~ balanco + estudante + receita, data = conj_treino)
#> 
#> Prior probabilities of groups:
#>        Nao        Sim 
#> 0.96674584 0.03325416 
#> 
#> Group means:
#>       balanco estudante  receita
#> Nao  802.0713 0.2903142 33635.29
#> Sim 1745.1363 0.3684211 32884.22
#> 
#> Coefficients of linear discriminants:
#>                     LD1
#> balanco    2.257518e-03
#> estudante -1.466865e-01
#> receita    5.647071e-06
plot(treina_lda)

names(predict(treina_lda, conj_treino))
#> [1] "class"     "posterior" "x"
y_chapeu <- predict(treina_lda, conj_teste)$class %>% 
             factor(levels = levels(conj_teste$inadimplente))
confusionMatrix(data = y_chapeu, reference = conj_teste$inadimplente,  positive="Sim")
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction  Nao  Sim
#>        Nao 1926   48
#>        Sim    8   19
#>                                           
#>                Accuracy : 0.972           
#>                  95% CI : (0.9638, 0.9788)
#>     No Information Rate : 0.9665          
#>     P-Value [Acc > NIR] : 0.09339         
#>                                           
#>                   Kappa : 0.3926          
#>                                           
#>  Mcnemar's Test P-Value : 1.872e-07       
#>                                           
#>             Sensitivity : 0.283582        
#>             Specificity : 0.995863        
#>          Pos Pred Value : 0.703704        
#>          Neg Pred Value : 0.975684        
#>              Prevalence : 0.033483        
#>          Detection Rate : 0.009495        
#>    Detection Prevalence : 0.013493        
#>       Balanced Accuracy : 0.639723        
#>                                           
#>        'Positive' Class : Sim             
#> 
# Este valor é igual a 1 - Accuracy da matriz de confusão
calc_erro_class(conj_teste$inadimplente, y_chapeu)
#> [1] 0.02798601

LDA - Ajustando probabilidade limite

p_chapeu <- predict(treina_lda, conj_teste)$posterior
head(p_chapeu)
#>         Nao          Sim
#> 1 0.9857954 0.0142045675
#> 2 0.9981836 0.0018164269
#> 3 0.9180153 0.0819846555
#> 4 0.9988081 0.0011919412
#> 5 0.9519939 0.0480061272
#> 6 0.9998932 0.0001067956
y_chapeu <- ifelse(p_chapeu[, 2] > 0.11, "Sim", "Nao") %>% 
             factor(levels = levels(conj_teste$inadimplente))
confusionMatrix(data = y_chapeu, reference = conj_teste$inadimplente,  positive="Sim") 
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction  Nao  Sim
#>        Nao 1815   19
#>        Sim  119   48
#>                                          
#>                Accuracy : 0.931          
#>                  95% CI : (0.919, 0.9417)
#>     No Information Rate : 0.9665         
#>     P-Value [Acc > NIR] : 1              
#>                                          
#>                   Kappa : 0.3807         
#>                                          
#>  Mcnemar's Test P-Value : <2e-16         
#>                                          
#>             Sensitivity : 0.71642        
#>             Specificity : 0.93847        
#>          Pos Pred Value : 0.28743        
#>          Neg Pred Value : 0.98964        
#>              Prevalence : 0.03348        
#>          Detection Rate : 0.02399        
#>    Detection Prevalence : 0.08346        
#>       Balanced Accuracy : 0.82744        
#>                                          
#>        'Positive' Class : Sim            
#> 
# Este valor é igual a 1 - Accuracy da matriz de confusão
calc_erro_class(conj_teste$inadimplente, y_chapeu)
#> [1] 0.06896552

QDA

treina_qda <- qda(inadimplente ~ balanco + estudante + receita, data = conj_treino)
treina_qda
#> Call:
#> qda(inadimplente ~ balanco + estudante + receita, data = conj_treino)
#> 
#> Prior probabilities of groups:
#>        Nao        Sim 
#> 0.96674584 0.03325416 
#> 
#> Group means:
#>       balanco estudante  receita
#> Nao  802.0713 0.2903142 33635.29
#> Sim 1745.1363 0.3684211 32884.22
y_chapeu <- predict(treina_qda, conj_teste)$class %>% 
             factor(levels = levels(conj_teste$inadimplente))
confusionMatrix(data = y_chapeu, reference = conj_teste$inadimplente,  positive="Sim") 
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction  Nao  Sim
#>        Nao 1922   47
#>        Sim   12   20
#>                                           
#>                Accuracy : 0.9705          
#>                  95% CI : (0.9621, 0.9775)
#>     No Information Rate : 0.9665          
#>     P-Value [Acc > NIR] : 0.1762          
#>                                           
#>                   Kappa : 0.3909          
#>                                           
#>  Mcnemar's Test P-Value : 9.581e-06       
#>                                           
#>             Sensitivity : 0.298507        
#>             Specificity : 0.993795        
#>          Pos Pred Value : 0.625000        
#>          Neg Pred Value : 0.976130        
#>              Prevalence : 0.033483        
#>          Detection Rate : 0.009995        
#>    Detection Prevalence : 0.015992        
#>       Balanced Accuracy : 0.646151        
#>                                           
#>        'Positive' Class : Sim             
#> 

QDA - Ajustando probabilidade limite

p_chapeu <- predict(treina_qda, conj_teste)$posterior
head(p_chapeu)
#>         Nao          Sim
#> 1 0.9904625 9.537473e-03
#> 2 0.9997798 2.202256e-04
#> 3 0.9062854 9.371459e-02
#> 4 0.9999365 6.346083e-05
#> 5 0.9526017 4.739833e-02
#> 6 0.9999996 3.588934e-07
y_chapeu <- ifelse(p_chapeu[, 2] > 0.11, "Sim", "Nao") %>% 
             factor(levels = levels(conj_teste$inadimplente))
confusionMatrix(data = y_chapeu, reference = conj_teste$inadimplente,  positive="Sim") 
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction  Nao  Sim
#>        Nao 1803   17
#>        Sim  131   50
#>                                           
#>                Accuracy : 0.926           
#>                  95% CI : (0.9137, 0.9371)
#>     No Information Rate : 0.9665          
#>     P-Value [Acc > NIR] : 1               
#>                                           
#>                   Kappa : 0.3726          
#>                                           
#>  Mcnemar's Test P-Value : <2e-16          
#>                                           
#>             Sensitivity : 0.74627         
#>             Specificity : 0.93226         
#>          Pos Pred Value : 0.27624         
#>          Neg Pred Value : 0.99066         
#>              Prevalence : 0.03348         
#>          Detection Rate : 0.02499         
#>    Detection Prevalence : 0.09045         
#>       Balanced Accuracy : 0.83927         
#>                                           
#>        'Positive' Class : Sim             
#> 

Curva ROC

library(pROC)

# KNN
set.seed(21)
ctrl <- trainControl(method = "cv")
treina_knn <- train(inadimplente ~ balanco + estudante, method = "knn", trControl= ctrl, preProcess=c("center", "scale"), tuneGrid = data.frame(k = seq(21,140, by=4)), data = conj_treino)
prev_knn <- predict(treina_knn, conj_teste,type = "prob")

# Reg Log
mod2 <- glm(inadimplente ~ balanco + estudante,data=conj_treino,family=binomial)
p_chapeu_log <- predict(mod2, newdata = conj_teste, type = "response")

# LDA e QDA
p_chapeu_lda <- predict(treina_lda, conj_teste)$posterior
p_chapeu_qda <- predict(treina_qda, conj_teste)$posterior

roc_log <- roc(conj_teste$inadimplente ~ p_chapeu_log, plot = TRUE, print.auc=FALSE,
                 col="black", legacy.axes=TRUE)
roc_lda <- roc(conj_teste$inadimplente ~ p_chapeu_lda[,2], plot = TRUE, print.auc=FALSE, col="green", legacy.axes=TRUE, add=TRUE)
roc_qda <- roc(conj_teste$inadimplente ~ p_chapeu_qda[,2], plot = TRUE, print.auc=FALSE, col="blue", legacy.axes=TRUE, add=TRUE)
roc_knn1 <- roc(conj_teste$inadimplente ~ prev_knn[,2], plot = TRUE, print.auc=FALSE, col="red", legacy.axes=TRUE, add=TRUE)

legend("bottomright",legend=c("Reg Log","LDA","QDA", "KNN"), 
       col=c("black", "green","blue", "red"),lwd=4)

as.numeric(roc_log$auc)
#> [1] 0.9505009
as.numeric(roc_lda$auc)
#> [1] 0.9498063
as.numeric(roc_qda$auc)
#> [1] 0.9497909
as.numeric(roc_knn1$auc)
#> [1] 0.9423397