---
title: "More Tidymodels"
subtitle: "Lecture 23"
author: "Dr. Colin Rundel"
footer: "Sta 523 - Fall 2022"
format:
  revealjs:
    theme: slides.scss
    transition: fade
    slide-number: true
    self-contained: true
execute: 
  echo: true
---

```{r setup, message=FALSE, warning=FALSE, include=FALSE}
options(
  htmltools.dir.version = FALSE, # for blogdown
  width=50
)

knitr::opts_chunk$set(
  fig.align = "center", fig.retina = 2, dpi = 150,
  out.width = "100%"
)

library(tidymodels)
library(tidyverse)
library(patchwork)

ggplot2::theme_set(ggplot2::theme_bw())

options(width=60)
options(tibble.width = 60)
```

#

![](imgs/hex_tidymodels.png){fig-align="center" width="50%"}


## Hotels Data

Original data from [Antonio, Almeida, and Nunes (2019)](https://doi.org/10.1016/j.dib.2018.11.126), [Data dictionary](https://github.com/rfordatascience/tidytuesday/tree/master/data/2020/2020-02-11#data-dictionary)


```{r}
hotels = read_csv(
  'https://tidymodels.org/start/case-study/hotels.csv'
) %>%
  mutate(
    across(where(is.character), as.factor)
  )
```

::: {.aside}
This version of the data is slightly modified from the original data - see [gist](https://gist.github.com/topepo/05a74916c343e57a71c51d6bc32a21ce) for the cleanup steps
:::

## The data

::: {.small}
```{r}
glimpse(hotels)
```
:::

## The model

Our goal is to develop a predictive model that is able to predict whether a booking will include children or not based on the other characteristics of the booking.

. . .

```{r}
hotels %>%
  count(children) %>%
  mutate(prop = n/sum(n))
```

## Clustering the test/train split

:::: {.columns .small}
::: {.column width='50%'}
```{r}
set.seed(123)

splits = initial_split(
  hotels, strata = children
)

hotel_train = training(splits)
hotel_test = testing(splits)
```

```{r}
dim(hotel_train)
dim(hotel_test)
```
:::

::: {.column width='50%'}
```{r}
hotel_train %>%
  count(children) %>%
  mutate(prop = n/sum(n))

hotel_test %>%
  count(children) %>%
  mutate(prop = n/sum(n))
```
:::
::::

## Logistic Regression model

::: {.small}
```{r}
show_engines("logistic_reg")
```
:::

. . .

::: {.small}
```{r}
lr_model = logistic_reg() %>%
  set_engine("glm")
translate(lr_model)
```
:::

## Recipe

::: {.small}
```{r}
holidays = c("AllSouls", "AshWednesday", "ChristmasEve", "Easter", 
              "ChristmasDay", "GoodFriday", "NewYearsDay", "PalmSunday")

lr_recipe = recipe(children ~ ., data = hotel_train) %>% 
  step_date(arrival_date) %>% 
  step_holiday(arrival_date, holidays = holidays) %>% 
  step_rm(arrival_date) %>% 
  step_rm(country) %>%
  step_dummy(all_nominal_predictors()) %>% 
  step_zv(all_predictors())

lr_recipe
```
:::

##

```{r include=FALSE}
options(width=100)
options(tibble.width = 100)
```

::: {.small}
```{r}
lr_recipe %>%
  prep() %>%
  bake(new_data = hotel_train)
```
:::

```{r include=FALSE}
options(width=60)
options(tibble.width = 60)
```

## Workflow

::: {.small}
```{r}
( lr_work = workflow() %>%
    add_model(lr_model) %>%
    add_recipe(lr_recipe) 
)
```
:::

## Fit

::: {.small}
```{r}
( lr_fit = lr_work %>%
    fit(data = hotel_train) )
```
:::

## Logistic regression predictions

::: {.small}
```{r}
( lr_perf = lr_fit %>%
    augment(new_data = hotel_train) %>%
    select(children, starts_with(".pred")) )
```
:::

## Performance metrics (within-sample)

:::: {.columns .small}
::: {.column width='50%'}
```{r}
lr_perf %>%
  conf_mat(children, .pred_class)

lr_perf %>%
  precision(children, .pred_class)

lr_perf %>%
  roc_auc(children, .pred_children)
```
:::

::: {.column width='50%' .fragment}
```{r}
lr_perf %>%
  yardstick::roc_curve(
    children,
    .pred_children
  ) %>%
  autoplot()
```
:::
::::

## Performance metrics (out-of-sample)

:::: {.columns .small}
::: {.column width='50%'}
```{r}
lr_test_perf = lr_fit %>%
  augment(new_data = hotel_test) %>%
  select(children, starts_with(".pred"))

lr_test_perf %>%
  conf_mat(children, .pred_class)

lr_test_perf %>%
  precision(children, .pred_class)

lr_test_perf %>%
  roc_auc(children, .pred_children)
```
:::

::: {.column width='50%'}
```{r}
lr_test_perf %>%
  yardstick::roc_curve(
    children,
    .pred_children
  ) %>%
  autoplot()
```
:::
::::


## Combining ROC curves

:::: {.columns .small}
::: {.column width='50%'}
```{r}
lr_roc_train = lr_perf %>%
  yardstick::roc_curve(children, .pred_children) %>% 
  mutate(name="logistic - train")

lr_roc_test = lr_test_perf %>%
  yardstick::roc_curve(children, .pred_children) %>% 
  mutate(name="logistic - test")
```
:::

::: {.column width='50%'}
```{r}
bind_rows(
  lr_roc_train,
  lr_roc_test
) %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity, col = name)) + 
    geom_path(lwd = 1.5, alpha = 0.8) +
    geom_abline(lty = 3) + 
    coord_equal()
```
:::
::::


# Lasso 

## Lasso Model

For this we will be using the `glmnet` package which supports fitting lasso, ridge and elastic net models. 

. . .


::: {.small}
```{r}
lasso_model = logistic_reg(penalty = tune(), mixture = 1) %>%
  set_engine("glmnet")
```
:::

* `mixture` determines the type of model fit with: `1` -> Lasso, `0` -> Ridge, other -> elastic net.

* `penalty` is $\lambda$ in the lasso model, scales the penalty for coefficient size.

##

```{r}
lasso_model %>% 
  parameters()
```

. . .

```{r}
lasso_model %>%
  translate()
```


## Lasso Recipe

Lasso (and Ridge) models are sensitive to the scale of the model features, and so a standard approach is to normalize all features before fitting the model.

::: {.small}
```{r}
lasso_recipe = lr_recipe %>%
  step_normalize(all_predictors())
```
:::

. . .

::: {.small}
```{r}
lasso_recipe %>%
  prep() %>%
  bake(new_data = hotel_train)
```
:::

## Lasso workflow

::: {.small}
```{r}
( lasso_work = workflow() %>%
    add_model(lasso_model) %>%
    add_recipe(lasso_recipe)
)
```
:::


## v-folds for hyperparameter tuning

```{r}
( hotel_vf = rsample::vfold_cv(hotel_train, v=5, strata = children) )
```

## grid search

::: {.small}
```{r}
#| code-line-numbers: "|1|3|4-6|7|8"
( lasso_grid = lasso_work %>%
    tune_grid(
      hotel_vf,
      grid = tibble(
        penalty = 10^seq(-4, -1, length.out = 10)
      ),
      control = control_grid(save_pred = TRUE),
      metrics = metric_set(roc_auc)
    )
)
```
:::

## Results

```{r include=FALSE}
options(width=50)
options(tibble.width = 50)
```

:::: {.columns .small}
::: {.column width='50%'}
```{r}
lasso_grid %>%
  collect_metrics()
```
:::

::: {.column width='50%'}
```{r}
lasso_grid %>% 
  collect_metrics() %>% 
  ggplot(aes(x = penalty, y = mean)) + 
    geom_point() + 
    geom_line() + 
    ylab("Area under the ROC Curve") +
    scale_x_log10(labels = scales::label_number())
```
:::
::::


```{r include=FALSE}
options(width=60)
options(tibble.width = 60)
```

## "Best" models

::: {.small}
```{r}
lasso_grid %>%
  show_best("roc_auc", n=10)
```
:::

## "Best" model

::: {.small}
```{r}
( lasso_best = lasso_grid %>%
    collect_metrics() %>%
    mutate(mean = round(mean, 2)) %>%
    arrange(desc(mean), desc(penalty)) %>%
    slice(1) )
```
:::

## Extracting predictions

Since we used `control_grid(save_pred = TRUE)` with `tune_grid()` we can recover the predictions for the out-of-sample values for each fold:

::: {.small}
```{r}
( lasso_train_perf = lasso_grid %>%
    collect_predictions(parameters = lasso_best) )
```
:::

##

::: {.small}
```{r}
lasso_train_perf %>%
  roc_auc(children, .pred_children)
```
:::

## Re-fitting

Typically with a tuned model we will refit using the complete test data and the "best" parameter value(s),

```{r}
lasso_work_tuned = update_model(
  lasso_work, 
  logistic_reg(
    mixture = 1, 
    penalty = lasso_best$penalty
  ) %>%
    set_engine("glmnet")
)

lasso_fit = lasso_work_tuned %>%
  fit(data=hotel_train)
```


## Test Performance (out-of-sample)

:::: {.columns .small}
::: {.column width='50%'}
```{r}
lasso_test_perf = lasso_fit %>%
  augment(new_data = hotel_test) %>%
  select(children, starts_with(".pred"))

lasso_test_perf %>%
  conf_mat(children, .pred_class)

lasso_test_perf %>%
  precision(children, .pred_class)

lasso_test_perf %>%
  roc_auc(children, .pred_children)
```
:::

::: {.column width='50%'}
```{r}
lasso_roc = lasso_test_perf %>%
  yardstick::roc_curve(
    children,
    .pred_children
  ) %>%
  mutate(name = "lasso - test")
lasso_roc %>%
  autoplot()
```
:::
::::

## Comparing models

```{r out.width="55%", echo=FALSE }
bind_rows(
  lr_roc_test,
  lasso_roc
) %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity, col = name)) + 
    geom_path(lwd = 1.5, alpha = 0.8) +
    geom_abline(lty = 3) + 
    coord_equal()
```

# Random Forest

## Random forest models

```{r}
show_engines("rand_forest")
```

. . .

```{r}
rf_model = rand_forest(mtry = tune(), min_n = tune(), trees = 100) %>% 
  set_engine("ranger", num.threads = 8) %>% 
  set_mode("classification")
```

## Recipe & workflow

We skip dummy coding in the recipe as it is not needed by ranger,

```{r}
rf_recipe = recipe(children ~ ., data = hotel_train) %>% 
  step_date(arrival_date) %>% 
  step_holiday(arrival_date, holidays = holidays) %>% 
  step_rm(arrival_date) %>%
  step_rm(country)
```

. . .

```{r}
rf_work = workflow() %>% 
  add_model(rf_model) %>% 
  add_recipe(rf_recipe)
```

## Tuning

:::: {.columns .small}
::: {.column width='50%'}
```{r}
rf_work %>%
  parameters()
```
:::

::: {.column width='50%'}
```{r}
rf_grid = rf_work %>% 
  tune_grid(
    hotel_vf,
    grid = 10,
    control = control_grid(save_pred = TRUE),
    metrics = metric_set(roc_auc)
  )
```
:::
::::


## "Best" parameters

```{r include=FALSE}
options(width=50)
options(tibble.width = 50)
```

:::: {.columns .small}
::: {.column width='50%'}
```{r}
rf_grid %>% 
  show_best(metric = "roc_auc")
```
:::

::: {.column width='50%'}
```{r}
autoplot(rf_grid)
```
:::
::::


```{r include=FALSE}
options(width=60)
options(tibble.width = 60)
```

## Refitting

::: {.small}
```{r}
(rf_best = rf_grid %>%
  select_best(metric = "roc_auc"))
```

```{r}
rf_work_tuned = update_model(
  rf_work, 
  rand_forest(
    trees=100,
    mtry = rf_best$mtry, 
    min_n = rf_best$min_n
  ) %>%
    set_engine("ranger", num.threads = 8) %>%
    set_mode("classification")
)

rf_fit = rf_work_tuned %>%
  fit(data=hotel_train)
```
:::

## Test Performance (out-of-sample)

:::: {.columns .small}
::: {.column width='50%'}
```{r}
rf_test_perf = rf_fit %>%
  augment(new_data = hotel_test) %>%
  select(children, starts_with(".pred"))

rf_test_perf %>%
  conf_mat(children, .pred_class)

rf_test_perf %>%
  precision(children, .pred_class)

rf_test_perf %>%
  roc_auc(children, .pred_children)
```
:::

::: {.column width='50%'}
```{r}
rf_roc = rf_test_perf %>%
  yardstick::roc_curve(
    children,
    .pred_children
  ) %>%
  mutate(name = "RF - test")
rf_roc %>%
  autoplot()
```
:::
::::


## Comparing models

```{r out.width="55%", echo=FALSE }
bind_rows(
  lr_roc_test,
  lasso_roc,
  rf_roc
) %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity, col = name)) + 
    geom_path(lwd = 1.5, alpha = 0.8) +
    geom_abline(lty = 3) + 
    coord_equal()
```