LDA e QDA

Author

Ricardo Accioly

Published

May 26, 2025

LDA e QDA

Carregando Bibliotecas

library(tidyverse)

#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ dplyr     1.1.4     ✔ readr     2.1.5
#> ✔ forcats   1.0.0     ✔ stringr   1.5.1
#> ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
#> ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
#> ✔ purrr     1.0.4     
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag()    masks stats::lag()
#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ISLR)
data(Default)
summary(Default)

#>  default    student       balance           income     
#>  No :9667   No :7056   Min.   :   0.0   Min.   :  772  
#>  Yes: 333   Yes:2944   1st Qu.: 481.7   1st Qu.:21340  
#>                        Median : 823.6   Median :34553  
#>                        Mean   : 835.4   Mean   :33517  
#>                        3rd Qu.:1166.3   3rd Qu.:43808  
#>                        Max.   :2654.3   Max.   :73554

str(Default)

#> 'data.frame':    10000 obs. of  4 variables:
#>  $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
#>  $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
#>  $ balance: num  730 817 1074 529 786 ...
#>  $ income : num  44362 12106 31767 35704 38463 ...

head(Default)

#>   default student   balance    income
#> 1      No      No  729.5265 44361.625
#> 2      No     Yes  817.1804 12106.135
#> 3      No      No 1073.5492 31767.139
#> 4      No      No  529.2506 35704.494
#> 5      No      No  785.6559 38463.496
#> 6      No     Yes  919.5885  7491.559

Manipulando os dados

credito <- tibble(Default)
summary(credito)

#>  default    student       balance           income     
#>  No :9667   No :7056   Min.   :   0.0   Min.   :  772  
#>  Yes: 333   Yes:2944   1st Qu.: 481.7   1st Qu.:21340  
#>                        Median : 823.6   Median :34553  
#>                        Mean   : 835.4   Mean   :33517  
#>                        3rd Qu.:1166.3   3rd Qu.:43808  
#>                        Max.   :2654.3   Max.   :73554

# renomeando colunas
credito <- credito %>% 
                rename( inadimplente = default, estudante = student, balanco = balance,
                receita = income)
credito <- credito %>% mutate( inadimplente =  case_when(
                           inadimplente == "No"  ~ "Nao",
                           inadimplente == "Yes" ~ "Sim"
                          )) %>% mutate(inadimplente = factor(inadimplente))
credito <- credito %>% mutate( estudante =  case_when(
                           estudante == "No"  ~ 0,
                           estudante == "Yes" ~ 1
                          )) 

str(credito)

#> tibble [10,000 × 4] (S3: tbl_df/tbl/data.frame)
#>  $ inadimplente: Factor w/ 2 levels "Nao","Sim": 1 1 1 1 1 1 1 1 1 1 ...
#>  $ estudante   : num [1:10000] 0 1 0 0 0 1 0 1 0 0 ...
#>  $ balanco     : num [1:10000] 730 817 1074 529 786 ...
#>  $ receita     : num [1:10000] 44362 12106 31767 35704 38463 ...

summary(credito)

#>  inadimplente   estudante         balanco          receita     
#>  Nao:9667     Min.   :0.0000   Min.   :   0.0   Min.   :  772  
#>  Sim: 333     1st Qu.:0.0000   1st Qu.: 481.7   1st Qu.:21340  
#>               Median :0.0000   Median : 823.6   Median :34553  
#>               Mean   :0.2944   Mean   : 835.4   Mean   :33517  
#>               3rd Qu.:1.0000   3rd Qu.:1166.3   3rd Qu.:43808  
#>               Max.   :1.0000   Max.   :2654.3   Max.   :73554

Treino e Teste

library(caret)

#> Carregando pacotes exigidos: lattice

#> 
#> Anexando pacote: 'caret'

#> O seguinte objeto é mascarado por 'package:purrr':
#> 
#>     lift

set.seed(2024)
y <- credito$inadimplente
indice_teste <- createDataPartition(y, times = 1, p = 0.2, list = FALSE)

conj_treino <- credito[-indice_teste,]
conj_teste <- credito[indice_teste,]

summary(conj_treino)

#>  inadimplente   estudante         balanco          receita     
#>  Nao:7733     Min.   :0.0000   Min.   :   0.0   Min.   :  772  
#>  Sim: 266     1st Qu.:0.0000   1st Qu.: 481.8   1st Qu.:21536  
#>               Median :0.0000   Median : 818.0   Median :34690  
#>               Mean   :0.2929   Mean   : 833.4   Mean   :33610  
#>               3rd Qu.:1.0000   3rd Qu.:1164.6   3rd Qu.:43850  
#>               Max.   :1.0000   Max.   :2654.3   Max.   :73554

summary(conj_teste)

#>  inadimplente   estudante         balanco          receita     
#>  Nao:1934     Min.   :0.0000   Min.   :   0.0   Min.   : 1498  
#>  Sim:  67     1st Qu.:0.0000   1st Qu.: 481.6   1st Qu.:20655  
#>               Median :0.0000   Median : 840.1   Median :33812  
#>               Mean   :0.3003   Mean   : 843.1   Mean   :33144  
#>               3rd Qu.:1.0000   3rd Qu.:1171.4   3rd Qu.:43593  
#>               Max.   :1.0000   Max.   :2578.5   Max.   :68722

Balanço e receita

featurePlot(x = conj_treino[, c("balanco", "receita", "estudante")], 
            y = conj_treino$inadimplente,
            plot = "density", 
            scales = list(x = list(relation = "free"), 
                          y = list(relation = "free")), 
            adjust = 1.5, 
            pch = "|", 
            layout = c(2, 1), 
            auto.key = list(columns = 2))

Avaliando o comportamento das variáveis em função do status (inadimplente / estudante)

library(patchwork)
p1 <- ggplot(credito, aes(x=inadimplente, y=balanco, color=inadimplente)) +
  geom_boxplot()
p2 <- ggplot(credito, aes(x=inadimplente, y=receita, color=inadimplente)) +
  geom_boxplot()
p3 <- ggplot(credito, aes(x=as.factor(estudante), y=balanco, color=as.factor(estudante))) +
  geom_boxplot()
p4 <- ggplot(credito, aes(x=as.factor(estudante), y=receita, color=as.factor(estudante))) +
  geom_boxplot()
(p1 + p2) / (p3 + p4)

Calcula Erro

# Este valor é igual a 1 - Accuracy da matriz de confusão
calc_erro_class <- function(real, previsto) {
  mean(real != previsto)
}

Treino e Teste Normalizado

library(caret)
set.seed(2024)
# Normalizando os dados
credito <- credito %>% mutate(balanco = scale(balanco), receita = scale(receita))

y <- credito$inadimplente
indice_teste <- createDataPartition(y, times = 1, p = 0.2, list = FALSE)

conj_treino <- credito[-indice_teste,]
conj_teste <- credito[indice_teste,]

summary(conj_treino)

#>  inadimplente   estudante              balanco.V1         
#>  Nao:7733     Min.   :0.0000   Min.   :-1.72699815214000  
#>  Sim: 266     1st Qu.:0.0000   1st Qu.:-0.73093769169700  
#>               Median :0.0000   Median :-0.03595597796600  
#>               Mean   :0.2929   Mean   :-0.00401639039487  
#>               3rd Qu.:1.0000   3rd Qu.: 0.68056023172100  
#>               Max.   :1.0000   Max.   : 3.76037076853000  
#>          receita.V1         
#>  Min.   :-2.45526723525000  
#>  1st Qu.:-0.89836473917200  
#>  Median : 0.08796491781240  
#>  Mean   : 0.00699812328583  
#>  3rd Qu.: 0.77481662870600  
#>  Max.   : 3.00204946164000

summary(conj_teste)

#>  inadimplente   estudante              balanco.V1         
#>  Nao:1934     Min.   :0.0000   Min.   :-1.72699815214000  
#>  Sim:  67     1st Qu.:0.0000   1st Qu.:-0.73139456297700  
#>               Median :0.0000   Median : 0.00984385369764  
#>               Mean   :0.3003   Mean   : 0.01605552562150  
#>               3rd Qu.:1.0000   3rd Qu.: 0.69476143173600  
#>               Max.   :1.0000   Max.   : 3.60355620415000  
#>          receita.V1        
#>  Min.   :-2.4008112726600  
#>  1st Qu.:-0.9643945022460  
#>  Median : 0.0220960712686  
#>  Mean   :-0.0279750065784  
#>  3rd Qu.: 0.7555020572170  
#>  Max.   : 2.6397198805100

LDA

library(MASS)

#> 
#> Anexando pacote: 'MASS'

#> O seguinte objeto é mascarado por 'package:patchwork':
#> 
#>     area

#> O seguinte objeto é mascarado por 'package:dplyr':
#> 
#>     select

treina_lda <- lda(inadimplente ~ balanco + estudante + receita, data = conj_treino)
treina_lda

#> Call:
#> lda(inadimplente ~ balanco + estudante + receita, data = conj_treino)
#> 
#> Prior probabilities of groups:
#>        Nao        Sim 
#> 0.96674584 0.03325416 
#> 
#> Group means:
#>         balanco estudante      receita
#> Nao -0.06884968 0.2903142  0.008870876
#> Sim  1.88077996 0.3684211 -0.047445476
#> 
#> Coefficients of linear discriminants:
#>                   LD1
#> balanco    1.09199528
#> estudante -0.14668654
#> receita    0.07531296

plot(treina_lda)

names(predict(treina_lda, conj_treino))

#> [1] "class"     "posterior" "x"

y_chapeu <- predict(treina_lda, conj_teste)$class %>% 
             factor(levels = levels(conj_teste$inadimplente))
confusionMatrix(data = y_chapeu, reference = conj_teste$inadimplente,  positive="Sim")

#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction  Nao  Sim
#>        Nao 1926   48
#>        Sim    8   19
#>                                           
#>                Accuracy : 0.972           
#>                  95% CI : (0.9638, 0.9788)
#>     No Information Rate : 0.9665          
#>     P-Value [Acc > NIR] : 0.09339         
#>                                           
#>                   Kappa : 0.3926          
#>                                           
#>  Mcnemar's Test P-Value : 1.872e-07       
#>                                           
#>             Sensitivity : 0.283582        
#>             Specificity : 0.995863        
#>          Pos Pred Value : 0.703704        
#>          Neg Pred Value : 0.975684        
#>              Prevalence : 0.033483        
#>          Detection Rate : 0.009495        
#>    Detection Prevalence : 0.013493        
#>       Balanced Accuracy : 0.639723        
#>                                           
#>        'Positive' Class : Sim             
#>

# Este valor é igual a 1 - Accuracy da matriz de confusão
calc_erro_class(conj_teste$inadimplente, y_chapeu)

#> [1] 0.02798601

LDA - Ajustando probabilidade limite

p_chapeu <- predict(treina_lda, conj_teste)$posterior
head(p_chapeu)

#>         Nao          Sim
#> 1 0.9857954 0.0142045675
#> 2 0.9981836 0.0018164269
#> 3 0.9180153 0.0819846555
#> 4 0.9988081 0.0011919412
#> 5 0.9519939 0.0480061272
#> 6 0.9998932 0.0001067956

y_chapeu <- ifelse(p_chapeu[, 2] > 0.11, "Sim", "Nao") %>% 
             factor(levels = levels(conj_teste$inadimplente))
confusionMatrix(data = y_chapeu, reference = conj_teste$inadimplente,  positive="Sim")

#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction  Nao  Sim
#>        Nao 1815   19
#>        Sim  119   48
#>                                          
#>                Accuracy : 0.931          
#>                  95% CI : (0.919, 0.9417)
#>     No Information Rate : 0.9665         
#>     P-Value [Acc > NIR] : 1              
#>                                          
#>                   Kappa : 0.3807         
#>                                          
#>  Mcnemar's Test P-Value : <2e-16         
#>                                          
#>             Sensitivity : 0.71642        
#>             Specificity : 0.93847        
#>          Pos Pred Value : 0.28743        
#>          Neg Pred Value : 0.98964        
#>              Prevalence : 0.03348        
#>          Detection Rate : 0.02399        
#>    Detection Prevalence : 0.08346        
#>       Balanced Accuracy : 0.82744        
#>                                          
#>        'Positive' Class : Sim            
#>

# Este valor é igual a 1 - Accuracy da matriz de confusão
calc_erro_class(conj_teste$inadimplente, y_chapeu)

#> [1] 0.06896552

Seleção de variáveis

No LDA, a seleção de variáveis pode ser feita com o RFE (Recursive Feature Elimination). O RFE é um método de seleção de variáveis que utiliza a validação cruzada para avaliar o desempenho do modelo com diferentes subconjuntos de variáveis. O RFE é implementado na função rfe() do pacote caret.

# Usar o RFE para selecionar as variáveis
# Definir controle para RFE
control <- rfeControl(functions = ldaFuncs, method = "cv", number = 10)

# Aplicar o RFE
set.seed(2024)
result <- rfe(conj_treino[, 2:4], conj_treino$inadimplente, sizes = c(1:3), rfeControl = control)

# Resultados
print(result)

#> 
#> Recursive feature selection
#> 
#> Outer resampling method: Cross-Validated (10 fold) 
#> 
#> Resampling performance over subset size:
#> 
#>  Variables Accuracy  Kappa AccuracySD KappaSD Selected
#>          1   0.9724 0.3451   0.003761 0.12676         
#>          2   0.9729 0.3562   0.003398 0.10548         
#>          3   0.9735 0.3804   0.002942 0.09046        *
#> 
#> The top 3 variables (out of 3):
#>    balanco, estudante, receita

Outra Opção

Podemos usar os gráfico exploratórios iniciais e também o resultado da regressão logística como ponto de partida para a seleção de variáveis.

treina_lda2 <- lda(inadimplente ~ balanco + estudante, data = conj_treino)
treina_lda2

#> Call:
#> lda(inadimplente ~ balanco + estudante, data = conj_treino)
#> 
#> Prior probabilities of groups:
#>        Nao        Sim 
#> 0.96674584 0.03325416 
#> 
#> Group means:
#>         balanco estudante
#> Nao -0.06884968 0.2903142
#> Sim  1.88077996 0.3684211
#> 
#> Coefficients of linear discriminants:
#>                  LD1
#> balanco    1.0935003
#> estudante -0.2716957

plot(treina_lda2)

y_chapeu <- predict(treina_lda2, conj_teste)$class %>% 
             factor(levels = levels(conj_teste$inadimplente))
confusionMatrix(data = y_chapeu, reference = conj_teste$inadimplente,  positive="Sim")

#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction  Nao  Sim
#>        Nao 1926   48
#>        Sim    8   19
#>                                           
#>                Accuracy : 0.972           
#>                  95% CI : (0.9638, 0.9788)
#>     No Information Rate : 0.9665          
#>     P-Value [Acc > NIR] : 0.09339         
#>                                           
#>                   Kappa : 0.3926          
#>                                           
#>  Mcnemar's Test P-Value : 1.872e-07       
#>                                           
#>             Sensitivity : 0.283582        
#>             Specificity : 0.995863        
#>          Pos Pred Value : 0.703704        
#>          Neg Pred Value : 0.975684        
#>              Prevalence : 0.033483        
#>          Detection Rate : 0.009495        
#>    Detection Prevalence : 0.013493        
#>       Balanced Accuracy : 0.639723        
#>                                           
#>        'Positive' Class : Sim             
#>

# Este valor é igual a 1 - Accuracy da matriz de confusão
calc_erro_class(conj_teste$inadimplente, y_chapeu)

#> [1] 0.02798601

Podemos observar que não houve mudança nos resultados ao retirar a variável receita.

QDA

treina_qda <- qda(inadimplente ~ balanco + estudante + receita, data = conj_treino)
treina_qda

#> Call:
#> qda(inadimplente ~ balanco + estudante + receita, data = conj_treino)
#> 
#> Prior probabilities of groups:
#>        Nao        Sim 
#> 0.96674584 0.03325416 
#> 
#> Group means:
#>         balanco estudante      receita
#> Nao -0.06884968 0.2903142  0.008870876
#> Sim  1.88077996 0.3684211 -0.047445476

y_chapeu <- predict(treina_qda, conj_teste)$class %>% 
             factor(levels = levels(conj_teste$inadimplente))
confusionMatrix(data = y_chapeu, reference = conj_teste$inadimplente,  positive="Sim")

#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction  Nao  Sim
#>        Nao 1922   47
#>        Sim   12   20
#>                                           
#>                Accuracy : 0.9705          
#>                  95% CI : (0.9621, 0.9775)
#>     No Information Rate : 0.9665          
#>     P-Value [Acc > NIR] : 0.1762          
#>                                           
#>                   Kappa : 0.3909          
#>                                           
#>  Mcnemar's Test P-Value : 9.581e-06       
#>                                           
#>             Sensitivity : 0.298507        
#>             Specificity : 0.993795        
#>          Pos Pred Value : 0.625000        
#>          Neg Pred Value : 0.976130        
#>              Prevalence : 0.033483        
#>          Detection Rate : 0.009995        
#>    Detection Prevalence : 0.015992        
#>       Balanced Accuracy : 0.646151        
#>                                           
#>        'Positive' Class : Sim             
#>

QDA - Ajustando probabilidade limite

p_chapeu <- predict(treina_qda, conj_teste)$posterior
head(p_chapeu)

#>         Nao          Sim
#> 1 0.9904625 9.537473e-03
#> 2 0.9997798 2.202256e-04
#> 3 0.9062854 9.371459e-02
#> 4 0.9999365 6.346083e-05
#> 5 0.9526017 4.739833e-02
#> 6 0.9999996 3.588934e-07

y_chapeu <- ifelse(p_chapeu[, 2] > 0.11, "Sim", "Nao") %>% 
             factor(levels = levels(conj_teste$inadimplente))
confusionMatrix(data = y_chapeu, reference = conj_teste$inadimplente,  positive="Sim")

#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction  Nao  Sim
#>        Nao 1803   17
#>        Sim  131   50
#>                                           
#>                Accuracy : 0.926           
#>                  95% CI : (0.9137, 0.9371)
#>     No Information Rate : 0.9665          
#>     P-Value [Acc > NIR] : 1               
#>                                           
#>                   Kappa : 0.3726          
#>                                           
#>  Mcnemar's Test P-Value : <2e-16          
#>                                           
#>             Sensitivity : 0.74627         
#>             Specificity : 0.93226         
#>          Pos Pred Value : 0.27624         
#>          Neg Pred Value : 0.99066         
#>              Prevalence : 0.03348         
#>          Detection Rate : 0.02499         
#>    Detection Prevalence : 0.09045         
#>       Balanced Accuracy : 0.83927         
#>                                           
#>        'Positive' Class : Sim             
#>

Curva ROC

library(pROC)

# KNN
set.seed(21)
ctrl <- trainControl(method = "cv")
treina_knn <- train(inadimplente ~ balanco + estudante, method = "knn", trControl= ctrl, preProcess=c("center", "scale"), tuneGrid = data.frame(k = seq(21,140, by=4)), data = conj_treino)
prev_knn <- predict(treina_knn, conj_teste,type = "prob")

# Reg Log
mod2 <- glm(inadimplente ~ balanco + estudante,data=conj_treino,family=binomial)
p_chapeu_log <- predict(mod2, newdata = conj_teste, type = "response")

# LDA e QDA
p_chapeu_lda <- predict(treina_lda, conj_teste)$posterior
p_chapeu_qda <- predict(treina_qda, conj_teste)$posterior

roc_log <- roc(conj_teste$inadimplente ~ p_chapeu_log, plot = TRUE, print.auc=FALSE,
                 col="black", legacy.axes=TRUE)
roc_lda <- roc(conj_teste$inadimplente ~ p_chapeu_lda[,2], plot = TRUE, print.auc=FALSE, col="green", legacy.axes=TRUE, add=TRUE)
roc_qda <- roc(conj_teste$inadimplente ~ p_chapeu_qda[,2], plot = TRUE, print.auc=FALSE, col="blue", legacy.axes=TRUE, add=TRUE)
roc_knn1 <- roc(conj_teste$inadimplente ~ prev_knn[,2], plot = TRUE, print.auc=FALSE, col="red", legacy.axes=TRUE, add=TRUE)

legend("bottomright",legend=c("Reg Log","LDA","QDA", "KNN"), 
       col=c("black", "green","blue", "red"),lwd=4)

as.numeric(roc_log$auc)

#> [1] 0.9505009

as.numeric(roc_lda$auc)

#> [1] 0.9498063

as.numeric(roc_qda$auc)

#> [1] 0.9497909

as.numeric(roc_knn1$auc)

#> [1] 0.9423397