library(tidyverse)
library(ISLR)
# Árvores de Classificação - XGBoost ----

# Bibliotecas ----

# Dados ----
# Load the ISLR credit-default data: 10,000 customers, with default status,
# student status, average balance and income.
data(Default)

summary(Default)
#>  default    student       balance           income
#>  No :9667   No :7056   Min.   :   0.0   Min.   :  772
#>  Yes: 333   Yes:2944   1st Qu.: 481.7   1st Qu.:21340
#>                        Median : 823.6   Median :34553
#>                        Mean   : 835.4   Mean   :33517
#>                        3rd Qu.:1166.3   3rd Qu.:43808
#>                        Max.   :2654.3   Max.   :73554

str(Default)
#> 'data.frame': 10000 obs. of 4 variables:
#>  $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
#>  $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
#>  $ balance: num 730 817 1074 529 786 ...
#>  $ income : num 44362 12106 31767 35704 38463 ...

head(Default)
#>   default student   balance    income
#> 1      No      No  729.5265 44361.625
#> 2      No     Yes  817.1804 12106.135
#> 3      No      No 1073.5492 31767.139
#> 4      No      No  529.2506 35704.494
#> 5      No      No  785.6559 38463.496
#> 6      No     Yes  919.5885  7491.559
# Manipulando os dados ----
# Work on a tibble copy (stricter subsetting, nicer printing).
credito <- tibble(Default)

summary(credito)
#> (same as summary(Default) above)

# Rename columns to the Portuguese names used throughout the analysis.
credito <- credito %>%
  rename(
    inadimplente = default,
    estudante = student,
    balanco = balance,
    receita = income
  )

# Recode the outcome labels ("No"/"Yes" -> "Nao"/"Sim") and store as a factor,
# so "Nao" is the first level and maps to 0 when converted to integer later.
credito <- credito %>%
  mutate(inadimplente = case_when(
    inadimplente == "No" ~ "Nao",
    inadimplente == "Yes" ~ "Sim"
  )) %>%
  mutate(inadimplente = factor(inadimplente))

# Encode the student indicator numerically (0 = No, 1 = Yes): xgboost needs a
# fully numeric feature matrix.
credito <- credito %>%
  mutate(estudante = case_when(
    estudante == "No" ~ 0,
    estudante == "Yes" ~ 1
  ))
# Sanity check: structure and summary of the transformed data.
str(credito)
#> tibble [10,000 x 4] (S3: tbl_df/tbl/data.frame)
#>  $ inadimplente: Factor w/ 2 levels "Nao","Sim": 1 1 1 1 1 1 1 1 1 1 ...
#>  $ estudante   : num [1:10000] 0 1 0 0 0 1 0 1 0 0 ...
#>  $ balanco     : num [1:10000] 730 817 1074 529 786 ...
#>  $ receita     : num [1:10000] 44362 12106 31767 35704 38463 ...

summary(credito)
#>  inadimplente   estudante         balanco          receita
#>  Nao:9667     Min.   :0.0000   Min.   :   0.0   Min.   :  772
#>  Sim: 333     1st Qu.:0.0000   1st Qu.: 481.7   1st Qu.:21340
#>               Median :0.0000   Median : 823.6   Median :34553
#>               Mean   :0.2944   Mean   : 835.4   Mean   :33517
#>               3rd Qu.:1.0000   3rd Qu.:1166.3   3rd Qu.:43808
#>               Max.   :1.0000   Max.   :2654.3   Max.   :73554
# Treino e Teste ----
library(caret)

set.seed(21)

# Stratified 80/20 split on the outcome: createDataPartition keeps the
# class proportions of `inadimplente` similar in train and test.
y <- credito$inadimplente
indice_teste <- createDataPartition(y, times = 1, p = 0.2, list = FALSE)

# Feature matrices: drop the outcome column.
x_conj_treino <- credito %>% slice(-indice_teste) %>% select(-inadimplente)
x_conj_teste <- credito %>% slice(indice_teste) %>% select(-inadimplente)

# Outcome vectors as 0/1 integers (factor levels "Nao"/"Sim" -> 0/1),
# the label encoding xgboost expects for multi-class objectives.
y_treino <- credito %>% slice(-indice_teste) %>% select(inadimplente)
y_treino <- as.integer(unlist(y_treino)) - 1
y_teste <- credito %>% slice(indice_teste) %>% select(inadimplente)
y_teste <- as.integer(unlist(y_teste)) - 1
# Treinando ----
## First attempt: XGBoost
library(xgboost)

num_class <- 2

# Boosting hyperparameters: small learning rate (eta), shallow trees,
# gamma/subsample as regularization; multi-class soft probabilities.
params <- list(
  booster = "gbtree",
  eta = 0.001,
  max_depth = 5,
  gamma = 3,
  subsample = 0.75,
  colsample_bytree = 1,
  objective = "multi:softprob",
  eval_metric = "mlogloss",
  num_class = num_class
)

set.seed(21)
# 5-fold cross-validation to choose the number of boosting rounds,
# stopping early after 10 rounds without improvement.
cv <- xgb.cv(
  data = as.matrix(x_conj_treino), label = as.matrix(y_treino),
  params = params,
  nrounds = 10000, nfold = 5, early_stopping_rounds = 10,
  nthreads = 1, verbose = FALSE
)
# cv

elog <- as.data.frame(cv$evaluation_log)
elog %>%
  summarize(ntrees.train = which.min(train_mlogloss_mean),
            ntrees.test = which.min(test_mlogloss_mean))
#>   ntrees.train ntrees.test
#> 1         6678        6668

# Number of rounds that minimizes the CV test log-loss.
(nrounds <- which.min(elog$test_mlogloss_mean))
#> [1] 6668
# Modelo Final ----
# Final model: retrain on the full training set with the CV-selected rounds.
modelo_xgb <- xgboost(
  data = as.matrix(x_conj_treino), label = as.matrix(y_treino),
  params = params, nrounds = nrounds, verbose = FALSE
)

# Variable importance
importancia <- xgb.importance(model = modelo_xgb)
importancia
#>       Feature        Gain       Cover  Frequency
#>        <char>       <num>       <num>      <num>
#> 1:    balanco 0.936743463 0.938791966 0.74828024
#> 2:    receita 0.057924670 0.052455552 0.22517788
#> 3:  estudante 0.005331867 0.008752483 0.02654188
xgb.plot.importance(importancia)
# Previsões ----
# Class probabilities on the test set; with multi:softprob and reshape = TRUE
# predict() returns one column per class (column 1 = P(class 0)).
x_conj_teste$prev <- predict(modelo_xgb, as.matrix(x_conj_teste), reshape = TRUE)

# Avaliando ----
# Predicted class: if P(class 0) > 0.5 predict 0 (no default), else 1.
xgb.ychapeu <- as.factor(ifelse(x_conj_teste$prev[, 1] > 0.5, 0, 1))
confusionMatrix(xgb.ychapeu, as.factor(y_teste), positive = "1")
#> Confusion Matrix and Statistics
#>           Reference
#> Prediction    0    1
#>          0 1924   38
#>          1   10   29
#>                Accuracy : 0.976
#>                  95% CI : (0.9683, 0.9823)
#>     No Information Rate : 0.9665
#>     P-Value [Acc > NIR] : 0.008318
#>                   Kappa : 0.5357
#>  Mcnemar's Test P-Value : 9.735e-05
#>             Sensitivity : 0.43284
#>             Specificity : 0.99483
#>          Pos Pred Value : 0.74359
#>          Neg Pred Value : 0.98063
#>              Prevalence : 0.03348
#>          Detection Rate : 0.01449
#>    Detection Prevalence : 0.01949
#>       Balanced Accuracy : 0.71383
#>        'Positive' Class : 1