---
title: "Model stacking for boosted trees"
output:
  rmarkdown::html_document:
    toc: true
    toc_float: false
    toc_depth: 4
    number_sections: false
    highlight: "textmate"
    css: "custom.css"
bibliography: stackgbm.bib
vignette: >
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteIndexEntry{Model stacking for boosted trees}
---

```{r, include=FALSE}
knitr::opts_chunk$set(
  comment = "#>",
  collapse = TRUE
)

run <- if (rlang::is_installed(c("catboost", "lightgbm", "xgboost"))) TRUE else FALSE
knitr::opts_chunk$set(eval = run)
```

## Introduction

Model stacking [@wolpert1992stacked] is a method for ensemble learning that
combines the strength of multiple base learners to drive up predictive performance.
It is a particularly popular and effective strategy used in machine learning competitions.

stackgbm implements a two-layer stacking model: the first layer generates
"features" produced by gradient boosting trees. The boosted tree models are
built by
xgboost [@chen2016xgboost], lightgbm [@ke2017lightgbm], and catboost [@prokhorenkova2018catboost].
The second layer is a logistic regression that uses these features as inputs.

```{r, message=FALSE}
library("stackgbm")
```

## Generate data

Let's generate some data for demonstrate purposes.
The simulated data has a $1000 \times 50$ predictor matrix with a
binary outcome vector. 800 samples will be in the training set and
the rest 200 will be in the (independent) test set.
25 out of the 50 features will be informative and follows $N(0, 10)$.

```{r}
sim_data <- msaenet::msaenet.sim.binomial(
  n = 5000,
  p = 100,
  rho = 0.8,
  coef = c(
    rnorm(20, mean = 0, sd = 5),
    rnorm(20, mean = 0, sd = 2),
    rnorm(20, mean = 0, sd = 1)
  ),
  snr = 0.5,
  p.train = 0.8,
  seed = 42
)

x_train <- sim_data$x.tr
x_test <- sim_data$x.te
y_train <- as.vector(sim_data$y.tr)
y_test <- as.vector(sim_data$y.te)
```

## Parameter tuning

`cv_xgboost()`, `cv_lightgbm()` and `cv_catboost()` provide wrappers for
tuning the most essential hyperparameters for each type of boosted tree models
with k-fold cross-validation. The "optimal" parameters will be
used to fit the stacking model later.

```{r, eval=FALSE}
params_xgboost <- cv_xgboost(x_train, y_train)
params_lightgbm <- cv_lightgbm(x_train, y_train)
params_catboost <- cv_catboost(x_train, y_train)
```

```{r, eval=FALSE, echo=FALSE}
saveRDS(params_xgboost, file = "vignettes/params_xgboost.rds")
saveRDS(params_lightgbm, file = "vignettes/params_lightgbm.rds")
saveRDS(params_catboost, file = "vignettes/params_catboost.rds")

temp_dir <- "catboost_info"
temp_file <- "lightgbm.model"
if (dir.exists(temp_dir)) unlink(temp_dir, recursive = TRUE)
if (file.exists(temp_file)) unlink(temp_file)
```

```{r, echo=FALSE}
params_xgboost <- readRDS("params_xgboost.rds")
params_lightgbm <- readRDS("params_lightgbm.rds")
params_catboost <- readRDS("params_catboost.rds")
```

## Train the stackgbm model

```{r}
model_stackgbm <- stackgbm(
  sim_data$x.tr,
  sim_data$y.tr,
  params = list(
    params_xgboost,
    params_lightgbm,
    params_catboost
  )
)
```

## Inference

```{r}
roc_stackgbm_train <- pROC::roc(
  y_train,
  predict(model_stackgbm, x_train)$prob,
  quiet = TRUE
)
roc_stackgbm_test <- pROC::roc(
  y_test,
  predict(model_stackgbm, x_test)$prob,
  quiet = TRUE
)
roc_stackgbm_train$auc
roc_stackgbm_test$auc
```

## Performance evaluation

Let's compare the predictive performance between the stacking model and
the three types of tree boosting models (base learners) fitted individually.
Note that the models and performance metrics should be (bitwise) reproducible
on the same operating system but they might vary on different platforms.

```{r, message=FALSE}
model_xgboost <- xgboost_train(
  params = list(
    objective = "binary:logistic",
    eval_metric = "auc",
    max_depth = params_xgboost$max_depth,
    eta = params_xgboost$eta
  ),
  data = xgboost_dmatrix(x_train, label = y_train),
  nrounds = params_xgboost$nrounds
)

model_lightgbm <- lightgbm_train(
  data = x_train,
  label = y_train,
  params = list(
    objective = "binary",
    learning_rate = params_lightgbm$learning_rate,
    num_iterations = params_lightgbm$num_iterations,
    max_depth = params_lightgbm$max_depth,
    num_leaves = 2^params_lightgbm$max_depth - 1
  ),
  verbose = -1
)

model_catboost <- catboost_train(
  catboost_load_pool(data = x_train, label = y_train),
  NULL,
  params = list(
    loss_function = "Logloss",
    iterations = params_catboost$iterations,
    depth = params_catboost$depth,
    logging_level = "Silent"
  )
)
```

### xgboost

```{r}
roc_xgboost_train <- pROC::roc(
  y_train,
  predict(model_xgboost, x_train),
  quiet = TRUE
)
roc_xgboost_test <- pROC::roc(
  y_test,
  predict(model_xgboost, x_test),
  quiet = TRUE
)
roc_xgboost_train$auc
roc_xgboost_test$auc
```

### lightgbm

```{r}
roc_lightgbm_train <- pROC::roc(
  y_train,
  predict(model_lightgbm, x_train),
  quiet = TRUE
)
roc_lightgbm_test <- pROC::roc(
  y_test,
  predict(model_lightgbm, x_test),
  quiet = TRUE
)
roc_lightgbm_train$auc
roc_lightgbm_test$auc
```

### catboost

```{r}
roc_catboost_train <- pROC::roc(
  y_train,
  catboost_predict(
    model_catboost,
    catboost_load_pool(data = x_train, label = NULL)
  ),
  quiet = TRUE
)
roc_catboost_test <- pROC::roc(
  y_test,
  catboost_predict(
    model_catboost,
    catboost_load_pool(data = x_test, label = NULL)
  ),
  quiet = TRUE
)
roc_catboost_train$auc
roc_catboost_test$auc
```

### Tabular summary

We can summarize the AUC values in a table.

```{r, echo=FALSE}
df <- as.data.frame(matrix(NA, ncol = 4, nrow = 2))
names(df) <- c("stackgbm", "xgboost", "lightgbm", "catboost")
rownames(df) <- c("Training", "Testing")

df$stackgbm <- c(roc_stackgbm_train$auc, roc_stackgbm_test$auc)
df$xgboost <- c(roc_xgboost_train$auc, roc_xgboost_test$auc)
df$lightgbm <- c(roc_lightgbm_train$auc, roc_lightgbm_test$auc)
df$catboost <- c(roc_catboost_train$auc, roc_catboost_test$auc)

knitr::kable(
  df,
  digits = 4,
  caption = "AUC values from four models on training and testing set"
)
```

### ROC curves

Plot the ROC curves of all models on the independent test set.

```{r}
#| roc-curves,
#| message=FALSE,
#| fig.asp=1,
#| fig.width=5,
#| fig.dpi=300,
#| fig.align="center",
#| out.width="65%"
pal <- c("#e15759", "#f28e2c", "#59a14f", "#4e79a7", "#76b7b2")

plot(pROC::smooth(roc_stackgbm_test), col = pal[1], lwd = 1)
plot(pROC::smooth(roc_xgboost_test), col = pal[2], lwd = 1, add = TRUE)
plot(pROC::smooth(roc_lightgbm_test), col = pal[3], lwd = 1, add = TRUE)
plot(pROC::smooth(roc_catboost_test), col = pal[4], lwd = 1, add = TRUE)
legend(
  "bottomright",
  col = pal,
  lwd = 2,
  legend = c("stackgbm", "xgboost", "lightgbm", "catboost")
)
```

## Notes on categorical features

[xgboost](https://cran.r-project.org/package=xgboost/vignettes/discoverYourData.html#conversion-from-categorical-to-numeric-variables) and [lightgbm](https://lightgbm.readthedocs.io/en/latest/Advanced-Topics.html#categorical-feature-support)
both prefer the categorical features to be encoded as integers.
For [catboost](https://catboost.ai/en/docs/concepts/algorithm-main-stages_cat-to-numberic),
the categorical features can be encoded as character factors.

To avoid possible confusions, if your data has any categorical features,
we recommend converting them to integers or use one-hot encoding, and
use a numerical matrix as the input.

## References