Problemas na Regressão Multipla

Author

Ricardo Accioly

Published

September 3, 2025

Carregando bibliotecas

#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ dplyr     1.1.4     ✔ readr     2.1.5
#> ✔ forcats   1.0.0     ✔ stringr   1.5.1
#> ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
#> ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
#> ✔ purrr     1.1.0     
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag()    masks stats::lag()
#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#> Carregando pacotes exigidos: lattice
#> 
#> Anexando pacote: 'caret'
#> 
#> O seguinte objeto é mascarado por 'package:purrr':
#> 
#>     lift

Dados de pressão sanguinea

  • BP = Pressão sanguínea (em mm Hg)

  • Age = idade (em anos)

  • Weight = peso (em kg)

  • BSA = area superficial do corpo (em m2)

  • Dur = duração da hipertensão (em anos)

  • Pulse = batimentos (batidas por minuto)

  • Stress = índice de stress

pressao_sangue <- read_delim("bloodpress.txt", col_names = TRUE)
#> Rows: 20 Columns: 8
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: "\t"
#> dbl (8): Pt, BP, Age, Weight, BSA, Dur, Pulse, Stress
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Renomeando

glimpse(pressao_sangue)
#> Rows: 20
#> Columns: 8
#> $ Pt     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, …
#> $ BP     <dbl> 105, 115, 116, 117, 112, 121, 121, 110, 110, 114, 114, 115, 114…
#> $ Age    <dbl> 47, 49, 49, 50, 51, 48, 49, 47, 49, 48, 47, 49, 50, 45, 52, 46,…
#> $ Weight <dbl> 85.4, 94.2, 95.3, 94.7, 89.4, 99.5, 99.8, 90.9, 89.2, 92.7, 94.…
#> $ BSA    <dbl> 1.75, 2.10, 1.98, 2.01, 1.89, 2.25, 2.25, 1.90, 1.83, 2.07, 2.0…
#> $ Dur    <dbl> 5.1, 3.8, 8.2, 5.8, 7.0, 9.3, 2.5, 6.2, 7.1, 5.6, 5.3, 5.6, 10.…
#> $ Pulse  <dbl> 63, 70, 72, 73, 72, 71, 69, 66, 69, 64, 74, 71, 68, 67, 76, 69,…
#> $ Stress <dbl> 33, 14, 10, 99, 95, 10, 42, 8, 62, 35, 90, 21, 47, 80, 98, 95, …
pressao_sangue <- pressao_sangue %>% rename(PS = BP, Idade = Age,
                                            Peso = Weight, Acorp = BSA,
                                            Pulso = Pulse) %>% select(-Pt)

Sumario

summary(pressao_sangue)
#>        PS            Idade            Peso            Acorp      
#>  Min.   :105.0   Min.   :45.00   Min.   : 85.40   Min.   :1.750  
#>  1st Qu.:110.0   1st Qu.:47.00   1st Qu.: 90.22   1st Qu.:1.897  
#>  Median :114.0   Median :48.50   Median : 94.15   Median :1.980  
#>  Mean   :114.0   Mean   :48.60   Mean   : 93.09   Mean   :1.998  
#>  3rd Qu.:116.2   3rd Qu.:49.25   3rd Qu.: 94.85   3rd Qu.:2.075  
#>  Max.   :125.0   Max.   :56.00   Max.   :101.30   Max.   :2.250  
#>       Dur            Pulso           Stress     
#>  Min.   : 2.50   Min.   :62.00   Min.   : 8.00  
#>  1st Qu.: 5.25   1st Qu.:67.75   1st Qu.:17.00  
#>  Median : 6.00   Median :70.00   Median :44.50  
#>  Mean   : 6.43   Mean   :69.60   Mean   :53.35  
#>  3rd Qu.: 7.60   3rd Qu.:72.00   3rd Qu.:95.00  
#>  Max.   :10.20   Max.   :76.00   Max.   :99.00
#> corrplot 0.95 loaded
mat_corr <- cor(cor(pressao_sangue))
corrplot(mat_corr)

cor(pressao_sangue)
#>               PS     Idade       Peso      Acorp       Dur     Pulso     Stress
#> PS     1.0000000 0.6590930 0.95006765 0.86587887 0.2928336 0.7214132 0.16390139
#> Idade  0.6590930 1.0000000 0.40734926 0.37845460 0.3437921 0.6187643 0.36822369
#> Peso   0.9500677 0.4073493 1.00000000 0.87530481 0.2006496 0.6593399 0.03435475
#> Acorp  0.8658789 0.3784546 0.87530481 1.00000000 0.1305400 0.4648188 0.01844634
#> Dur    0.2928336 0.3437921 0.20064959 0.13054001 1.0000000 0.4015144 0.31163982
#> Pulso  0.7214132 0.6187643 0.65933987 0.46481881 0.4015144 1.0000000 0.50631008
#> Stress 0.1639014 0.3682237 0.03435475 0.01844634 0.3116398 0.5063101 1.00000000

Aqui vemos que a presssão sanguinea tem uma correlação forte com o peso e também com a área corporal. O peso e a area corporal tem uma correlação forte. Esta correlação alta pode indicar a existencia de multicolinearidade.

library(psych)
pairs.panels(pressao_sangue)

Criando conjunto de treino e teste

library(caret)
set.seed(25)
y <- pressao_sangue$PS
indice_teste <- createDataPartition(y, times = 1, p = 0.20, list = FALSE)

conj_treino <- pressao_sangue[-indice_teste, ]
conj_teste <- pressao_sangue[indice_teste, ]

glimpse(conj_treino)
#> Rows: 14
#> Columns: 7
#> $ PS     <dbl> 105, 116, 117, 121, 110, 110, 114, 114, 115, 114, 125, 114, 110…
#> $ Idade  <dbl> 47, 49, 50, 49, 47, 49, 48, 47, 49, 50, 52, 46, 48, 56
#> $ Peso   <dbl> 85.4, 95.3, 94.7, 99.8, 90.9, 89.2, 92.7, 94.4, 94.1, 91.6, 101…
#> $ Acorp  <dbl> 1.75, 1.98, 2.01, 2.25, 1.90, 1.83, 2.07, 2.07, 1.98, 2.05, 2.1…
#> $ Dur    <dbl> 5.1, 8.2, 5.8, 2.5, 6.2, 7.1, 5.6, 5.3, 5.6, 10.2, 10.0, 7.4, 9…
#> $ Pulso  <dbl> 63, 72, 73, 69, 66, 69, 64, 74, 71, 68, 76, 69, 71, 75
#> $ Stress <dbl> 33, 10, 99, 42, 8, 62, 35, 90, 21, 47, 98, 95, 99, 99
glimpse(conj_teste)
#> Rows: 6
#> Columns: 7
#> $ PS     <dbl> 115, 112, 121, 106, 106, 113
#> $ Idade  <dbl> 49, 51, 48, 45, 46, 46
#> $ Peso   <dbl> 94.2, 89.4, 99.5, 87.1, 87.0, 94.5
#> $ Acorp  <dbl> 2.10, 1.89, 2.25, 1.92, 1.87, 1.90
#> $ Dur    <dbl> 3.8, 7.0, 9.3, 5.6, 3.6, 4.3
#> $ Pulso  <dbl> 70, 72, 71, 67, 62, 70
#> $ Stress <dbl> 14, 95, 10, 80, 18, 12

1a. Avaliação

mod1 <- lm(PS ~ ., data=conj_treino)
summary(mod1)
#> 
#> Call:
#> lm(formula = PS ~ ., data = conj_treino)
#> 
#> Residuals:
#>      Min       1Q   Median       3Q      Max 
#> -0.78334 -0.06849  0.09515  0.19505  0.47278 
#> 
#> Coefficients:
#>               Estimate Std. Error t value Pr(>|t|)    
#> (Intercept) -14.704848   4.004145  -3.672  0.00794 ** 
#> Idade         0.756631   0.067774  11.164 1.03e-05 ***
#> Peso          1.014897   0.104982   9.667 2.67e-05 ***
#> Acorp         2.426007   2.723376   0.891  0.40261    
#> Dur           0.036057   0.067581   0.534  0.61018    
#> Pulso        -0.115978   0.070242  -1.651  0.14270    
#> Stress        0.006775   0.004722   1.435  0.19450    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 0.4551 on 7 degrees of freedom
#> Multiple R-squared:  0.9961, Adjusted R-squared:  0.9927 
#> F-statistic: 295.3 on 6 and 7 DF,  p-value: 4.705e-08
#> Carregando pacotes exigidos: carData
#> 
#> Anexando pacote: 'car'
#> O seguinte objeto é mascarado por 'package:psych':
#> 
#>     logit
#> O seguinte objeto é mascarado por 'package:dplyr':
#> 
#>     recode
#> O seguinte objeto é mascarado por 'package:purrr':
#> 
#>     some
vif(mod1)
#>     Idade      Peso     Acorp       Dur     Pulso    Stress 
#>  1.839128 11.506832  8.452385  1.255999  4.764421  1.798917

2a. Avaliação

mod2 <- update(mod1,. ~ . -Acorp) 
summary(mod2)
#> 
#> Call:
#> lm(formula = PS ~ Idade + Peso + Dur + Pulso + Stress, data = conj_treino)
#> 
#> Residuals:
#>      Min       1Q   Median       3Q      Max 
#> -0.78828 -0.09161  0.03481  0.28998  0.42216 
#> 
#> Coefficients:
#>               Estimate Std. Error t value Pr(>|t|)    
#> (Intercept) -16.636422   3.322506  -5.007  0.00104 ** 
#> Idade         0.779649   0.061841  12.607 1.47e-06 ***
#> Peso          1.099617   0.043878  25.061 6.88e-09 ***
#> Dur           0.028417   0.066164   0.429  0.67890    
#> Pulso        -0.148601   0.059162  -2.512  0.03627 *  
#> Stress        0.007890   0.004494   1.756  0.11724    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 0.4492 on 8 degrees of freedom
#> Multiple R-squared:  0.9956, Adjusted R-squared:  0.9929 
#> F-statistic: 363.5 on 5 and 8 DF,  p-value: 3.31e-09
vif(mod2)
#>    Idade     Peso      Dur    Pulso   Stress 
#> 1.571811 2.063392 1.235775 3.469415 1.672564

3a. Avaliação

mod3 <- lm(PS ~ Idade + Peso + Pulso + Stress, data = conj_treino)
summary(mod3)
#> 
#> Call:
#> lm(formula = PS ~ Idade + Peso + Pulso + Stress, data = conj_treino)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -0.7491 -0.1267  0.0447  0.2459  0.4706 
#> 
#> Coefficients:
#>               Estimate Std. Error t value Pr(>|t|)    
#> (Intercept) -16.572609   3.165229  -5.236 0.000538 ***
#> Idade         0.783241   0.058431  13.405 2.98e-07 ***
#> Peso          1.094261   0.040118  27.276 5.80e-10 ***
#> Pulso        -0.142220   0.054610  -2.604 0.028537 *  
#> Stress        0.008011   0.004277   1.873 0.093850 .  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 0.4284 on 9 degrees of freedom
#> Multiple R-squared:  0.9955, Adjusted R-squared:  0.9935 
#> F-statistic: 499.6 on 4 and 9 DF,  p-value: 1.482e-10

#>            Test stat Pr(>|Test stat|)
#> Idade        -0.3779           0.7153
#> Peso         -0.8326           0.4292
#> Pulso         0.5699           0.5844
#> Stress       -0.7313           0.4855
#> Tukey test   -0.9748           0.3296

Comparando Previsão vs Real

conj_treino$Previsoes <- predict(mod3, data=conj_treino)
ggplot(conj_treino, aes(x=Previsoes, y=PS)) + 
  geom_point() +
  geom_abline(color = "darkblue") +
  ggtitle("Pressão sanguinea vs. Previsões do modelo linear")

Teste dos resíduos

Teste de normalidade Teste de heterocedasticidade (Bresch-Pagan) Teste de autocorrelação (Durbin-Watson)

library(lmtest)
mod3_sum <- summary(mod3)
# Teste de normalidade
shapiro.test(mod3_sum$residuals)
#> 
#>  Shapiro-Wilk normality test
#> 
#> data:  mod3_sum$residuals
#> W = 0.93949, p-value = 0.4118
# Teste de hetrocedasticidade
bptest(mod3)
#> 
#>  studentized Breusch-Pagan test
#> 
#> data:  mod3
#> BP = 1.4231, df = 4, p-value = 0.8402
# Teste de autocorrelação
dwtest(mod3)
#> 
#>  Durbin-Watson test
#> 
#> data:  mod3
#> DW = 1.7244, p-value = 0.3101
#> alternative hypothesis: true autocorrelation is greater than 0

Avaliação com conjunto de teste

sqrt(mean((conj_teste$PS - predict(mod3, conj_teste)) ^ 2)) 
#> [1] 0.5975429