Flu Analysis: Machine Learning

Loading in Necessary Libraries

library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(ggplot2)
library(glmnet)
Warning: package 'glmnet' was built under R version 4.2.3
Loading required package: Matrix
Loaded glmnet 4.1-7
library(here)
here() starts at C:/GitHub/MADA/kimberlyperez-MADA-portfolio
library(ranger)
Warning: package 'ranger' was built under R version 4.2.3
library(rpart)
library(rpart.plot)
Warning: package 'rpart.plot' was built under R version 4.2.3
library(tidymodels)
── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
✔ broom        1.0.2     ✔ rsample      1.1.1
✔ dials        1.1.0     ✔ tibble       3.1.8
✔ infer        1.0.4     ✔ tidyr        1.3.0
✔ modeldata    1.1.0     ✔ tune         1.0.1
✔ parsnip      1.0.3     ✔ workflows    1.1.2
✔ purrr        1.0.1     ✔ workflowsets 1.0.0
✔ recipes      1.0.4     ✔ yardstick    1.1.0
Warning: package 'tidyr' was built under R version 4.2.3
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ purrr::discard()  masks scales::discard()
✖ tidyr::expand()   masks Matrix::expand()
✖ dplyr::filter()   masks stats::filter()
✖ dplyr::lag()      masks stats::lag()
✖ tidyr::pack()     masks Matrix::pack()
✖ dials::prune()    masks rpart::prune()
✖ recipes::step()   masks stats::step()
✖ tidyr::unpack()   masks Matrix::unpack()
✖ recipes::update() masks Matrix::update(), stats::update()
• Learn how to get started at https://www.tidymodels.org/start/
library(tidyverse)
── Attaching packages
───────────────────────────────────────
tidyverse 1.3.2 ──
✔ readr   2.1.4     ✔ forcats 0.5.2
✔ stringr 1.5.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ readr::col_factor() masks scales::col_factor()
✖ purrr::discard()    masks scales::discard()
✖ tidyr::expand()     masks Matrix::expand()
✖ dplyr::filter()     masks stats::filter()
✖ stringr::fixed()    masks recipes::fixed()
✖ dplyr::lag()        masks stats::lag()
✖ tidyr::pack()       masks Matrix::pack()
✖ readr::spec()       masks yardstick::spec()
✖ tidyr::unpack()     masks Matrix::unpack()
library(vip)

Attaching package: 'vip'

The following object is masked from 'package:utils':

    vi

Reading in my Cleaned Data

flu<-readRDS(here("fluanalysis","processed_data", "SympAct_cleaned.rds")) #Loading in the data

glimpse(flu) #Looking at the Data 
Rows: 730
Columns: 32
$ SwollenLymphNodes <fct> Yes, Yes, Yes, Yes, Yes, No, No, No, Yes, No, Yes, Y…
$ ChestCongestion   <fct> No, Yes, Yes, Yes, No, No, No, Yes, Yes, Yes, Yes, Y…
$ ChillsSweats      <fct> No, No, Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, Yes, …
$ NasalCongestion   <fct> No, Yes, Yes, Yes, No, No, No, Yes, Yes, Yes, Yes, Y…
$ CoughYN           <fct> Yes, Yes, No, Yes, No, Yes, Yes, Yes, Yes, Yes, No, …
$ Sneeze            <fct> No, No, Yes, Yes, No, Yes, No, Yes, No, No, No, No, …
$ Fatigue           <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye…
$ SubjectiveFever   <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, Yes…
$ Headache          <fct> Yes, Yes, Yes, Yes, Yes, Yes, No, Yes, Yes, Yes, Yes…
$ Weakness          <fct> Mild, Severe, Severe, Severe, Moderate, Moderate, Mi…
$ WeaknessYN        <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye…
$ CoughIntensity    <fct> Severe, Severe, Mild, Moderate, None, Moderate, Seve…
$ CoughYN2          <fct> Yes, Yes, Yes, Yes, No, Yes, Yes, Yes, Yes, Yes, Yes…
$ Myalgia           <fct> Mild, Severe, Severe, Severe, Mild, Moderate, Mild, …
$ MyalgiaYN         <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye…
$ RunnyNose         <fct> No, No, Yes, Yes, No, No, Yes, Yes, Yes, Yes, No, No…
$ AbPain            <fct> No, No, Yes, No, No, No, No, No, No, No, Yes, Yes, N…
$ ChestPain         <fct> No, No, Yes, No, No, Yes, Yes, No, No, No, No, Yes, …
$ Diarrhea          <fct> No, No, No, No, No, Yes, No, No, No, No, No, No, No,…
$ EyePn             <fct> No, No, No, No, Yes, No, No, No, No, No, Yes, No, Ye…
$ Insomnia          <fct> No, No, Yes, Yes, Yes, No, No, Yes, Yes, Yes, Yes, Y…
$ ItchyEye          <fct> No, No, No, No, No, No, No, No, No, No, No, No, Yes,…
$ Nausea            <fct> No, No, Yes, Yes, Yes, Yes, No, No, Yes, Yes, Yes, Y…
$ EarPn             <fct> No, Yes, No, Yes, No, No, No, No, No, No, No, Yes, Y…
$ Hearing           <fct> No, Yes, No, No, No, No, No, No, No, No, No, No, No,…
$ Pharyngitis       <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, No, No, Yes, …
$ Breathless        <fct> No, No, Yes, No, No, Yes, No, No, No, Yes, No, Yes, …
$ ToothPn           <fct> No, No, Yes, No, No, No, No, No, Yes, No, No, Yes, N…
$ Vision            <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, …
$ Vomit             <fct> No, No, No, No, No, No, Yes, No, No, No, Yes, Yes, N…
$ Wheeze            <fct> No, No, No, Yes, No, Yes, No, No, No, No, No, Yes, N…
$ BodyTemp          <dbl> 98.3, 100.4, 100.8, 98.8, 100.5, 98.4, 102.5, 98.4, …

Let’s Split the Data

set.seed(321) #Makes analysis reproducible when using randomization


data_split<- initial_split(flu, prop= 8/10,  strata= BodyTemp) #Here I put 70% of data to training  

#Let's create some dataframes for both trained and test data
train_data<- training(data_split)
test_data<- testing(data_split)

Null Model: Cross Validation

#5 Fold Cross Validation
f_data_train<- vfold_cv(train_data, v=5, repeats= 5, strata= BodyTemp)

f_data_test<- vfold_cv(test_data, v=5, repeats= 5, strata= BodyTemp)

Train the Data

rec_train<- 
  recipe(BodyTemp~., data=train_data) %>%
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_zv(all_predictors())

rec_train
Recipe

Inputs:

      role #variables
   outcome          1
 predictor         31

Operations:

Dummy variables from all_nominal(), -all_outcomes()
Zero variance filter on all_predictors()
rec_test<- 
  recipe(BodyTemp~., data=test_data) %>%
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_zv(all_predictors())

rec_test
Recipe

Inputs:

      role #variables
   outcome          1
 predictor         31

Operations:

Dummy variables from all_nominal(), -all_outcomes()
Zero variance filter on all_predictors()

Setting the Model: Train Data

lm_model1<- linear_reg() %>% #Let's define the model
  set_engine("lm") %>%
  set_mode("regression")

#Let's create the recipe for the train data
null_rec_tr<- recipe(BodyTemp ~1, data= train_data)

null_rec_tr
Recipe

Inputs:

    role #variables
 outcome          1

Workflow & Fit: Train Data

null_tr_wf<- 
  workflow () %>%
  add_model(lm_model1) %>%
  add_recipe(null_rec_tr)

#I want to fit the null to train workflow to folds
train_lm_n<- fit_resamples(null_tr_wf, resamples= f_data_train)
! Fold1, Repeat1: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold2, Repeat1: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold3, Repeat1: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold4, Repeat1: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold5, Repeat1: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold1, Repeat2: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold2, Repeat2: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold3, Repeat2: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold4, Repeat2: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold5, Repeat2: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold1, Repeat3: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold2, Repeat3: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold3, Repeat3: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold4, Repeat3: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold5, Repeat3: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold1, Repeat4: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold2, Repeat4: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold3, Repeat4: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold4, Repeat4: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold5, Repeat4: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold1, Repeat5: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold2, Repeat5: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold3, Repeat5: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold4, Repeat5: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold5, Repeat5: internal: A correlation computation is required, but `estimate` is constant and ha...

Calculating RMSE: Train Data

null_metric_tr<- collect_metrics(train_lm_n)

null_metric_tr #By calculating RMSE and gathering mean (1.2) and standard deviation (0.013), we can utilize this information 
# A tibble: 2 × 6
  .metric .estimator   mean     n std_err .config             
  <chr>   <chr>       <dbl> <int>   <dbl> <chr>               
1 rmse    standard     1.19    25  0.0202 Preprocessor1_Model1
2 rsq     standard   NaN        0 NA      Preprocessor1_Model1

Creating the Recipe: Test Data

Now, let’s do the same for our test data! We already created our recipe for the test data above. I will start with setting the model

lm_model1<- linear_reg() %>% #Let's define the model
  set_engine("lm") %>%
  set_mode("regression")

#Let's create the recipe for the test data
null_rec_test<- recipe(BodyTemp ~1, data= test_data)

Workflow and Fit: Test Data

#Now we can move to workflow
null_test_wf<- 
  workflow () %>%
  add_model(lm_model1) %>%
  add_recipe(null_rec_test)

#I want to fit the null to the test workflow to folds
test_lm_n<- fit_resamples(null_test_wf, resamples= f_data_test)
! Fold1, Repeat1: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold2, Repeat1: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold3, Repeat1: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold4, Repeat1: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold5, Repeat1: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold1, Repeat2: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold2, Repeat2: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold3, Repeat2: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold4, Repeat2: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold5, Repeat2: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold1, Repeat3: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold2, Repeat3: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold3, Repeat3: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold4, Repeat3: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold5, Repeat3: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold1, Repeat4: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold2, Repeat4: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold3, Repeat4: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold4, Repeat4: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold5, Repeat4: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold1, Repeat5: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold2, Repeat5: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold3, Repeat5: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold4, Repeat5: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold5, Repeat5: internal: A correlation computation is required, but `estimate` is constant and ha...

Calculating RMSE: Test Data

null_metric_test<- collect_metrics(test_lm_n)

null_metric_test #By calculating RMSE and gathering mean (1.2) and standard deviation (0.03), we can utilize this information 
# A tibble: 2 × 6
  .metric .estimator   mean     n std_err .config             
  <chr>   <chr>       <dbl> <int>   <dbl> <chr>               
1 rmse    standard     1.18    25  0.0381 Preprocessor1_Model1
2 rsq     standard   NaN        0 NA      Preprocessor1_Model1

Tuning & Fitting: Tree Model

Let’s try to identify parameters

tunespec_dtree <- 
  decision_tree(
    cost_complexity = tune(),
    tree_depth = tune()
  ) %>% 
  set_engine("rpart") %>% 
  set_mode("regression")

tunespec_dtree
Decision Tree Model Specification (regression)

Main Arguments:
  cost_complexity = tune()
  tree_depth = tune()

Computational engine: rpart 
dtree_wf <- workflow() %>%
  add_model(tunespec_dtree) %>%
  add_recipe(rec_train)

Now let’s tune grid specs

tree_grid_dtree <-
  dials::grid_regular(
    cost_complexity(), 
    tree_depth(), 
    levels = 5)

tree_grid_dtree
# A tibble: 25 × 2
   cost_complexity tree_depth
             <dbl>      <int>
 1    0.0000000001          1
 2    0.0000000178          1
 3    0.00000316            1
 4    0.000562              1
 5    0.1                   1
 6    0.0000000001          4
 7    0.0000000178          4
 8    0.00000316            4
 9    0.000562              4
10    0.1                   4
# … with 15 more rows

Now, as we have done above, let’s create a workflow for the decision tree

dt_wf <- workflow() %>%
  add_model(tunespec_dtree) %>%
  add_recipe(rec_train)

dt_wf
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: decision_tree()

── Preprocessor ────────────────────────────────────────────────────────────────
2 Recipe Steps

• step_dummy()
• step_zv()

── Model ───────────────────────────────────────────────────────────────────────
Decision Tree Model Specification (regression)

Main Arguments:
  cost_complexity = tune()
  tree_depth = tune()

Computational engine: rpart 
dt_resample <- 
  dt_wf %>% 
  tune_grid(
    resamples = f_data_train,
    grid = tree_grid_dtree)
! Fold1, Repeat1: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold2, Repeat1: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold3, Repeat1: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold4, Repeat1: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold5, Repeat1: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold1, Repeat2: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold2, Repeat2: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold3, Repeat2: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold4, Repeat2: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold5, Repeat2: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold1, Repeat3: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold2, Repeat3: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold3, Repeat3: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold4, Repeat3: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold5, Repeat3: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold1, Repeat4: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold2, Repeat4: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold3, Repeat4: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold4, Repeat4: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold5, Repeat4: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold1, Repeat5: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold2, Repeat5: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold3, Repeat5: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold4, Repeat5: internal: A correlation computation is required, but `estimate` is constant and ha...
! Fold5, Repeat5: internal: A correlation computation is required, but `estimate` is constant and ha...
#Awesome, now that we have these results we can do some exploration and visualization!

#Checking out the model's metrics
dt_resample %>%
  collect_metrics()
# A tibble: 50 × 8
   cost_complexity tree_depth .metric .estimator     mean     n  std_err .config
             <dbl>      <int> <chr>   <chr>         <dbl> <int>    <dbl> <chr>  
 1    0.0000000001          1 rmse    standard     1.17      25  0.0207  Prepro…
 2    0.0000000001          1 rsq     standard     0.0489    25  0.00738 Prepro…
 3    0.0000000178          1 rmse    standard     1.17      25  0.0207  Prepro…
 4    0.0000000178          1 rsq     standard     0.0489    25  0.00738 Prepro…
 5    0.00000316            1 rmse    standard     1.17      25  0.0207  Prepro…
 6    0.00000316            1 rsq     standard     0.0489    25  0.00738 Prepro…
 7    0.000562              1 rmse    standard     1.17      25  0.0207  Prepro…
 8    0.000562              1 rsq     standard     0.0489    25  0.00738 Prepro…
 9    0.1                   1 rmse    standard     1.19      25  0.0202  Prepro…
10    0.1                   1 rsq     standard   NaN          0 NA       Prepro…
# … with 40 more rows
#Checking Model Performance Here
dt_resample %>%
  autoplot()

#Let's select for the best performing model using show_best
dt_resample %>%
  show_best(n=1) #This will show us top 5
Warning: No value of `metric` was given; metric 'rmse' will be used.
# A tibble: 1 × 8
  cost_complexity tree_depth .metric .estimator  mean     n std_err .config     
            <dbl>      <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>       
1    0.0000000001          1 rmse    standard    1.17    25  0.0207 Preprocesso…

Selecting the Best Performing Model

best<- dt_resample %>%
  select_best() #This function will retrieve one set of hyperparameters for the 
Warning: No value of `metric` was given; metric 'rmse' will be used.
best
# A tibble: 1 × 3
  cost_complexity tree_depth .config              
            <dbl>      <int> <chr>                
1    0.0000000001          1 Preprocessor1_Model01

Final Fit

Now that we have done the leg work, we can create the final fit

dtfinal_wf <- 
  dt_wf %>% 
  finalize_workflow(best)

dtfinal_wf
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: decision_tree()

── Preprocessor ────────────────────────────────────────────────────────────────
2 Recipe Steps

• step_dummy()
• step_zv()

── Model ───────────────────────────────────────────────────────────────────────
Decision Tree Model Specification (regression)

Main Arguments:
  cost_complexity = 1e-10
  tree_depth = 1

Computational engine: rpart 
dt_final_fit <- 
  dtfinal_wf %>%
  fit(train_data) 

Residuals and Plotting

dt_res <- dt_final_fit %>%
  augment(train_data) %>% 
  select(c(.pred, BodyTemp)) %>%
  mutate(.resid = BodyTemp - .pred)

dt_res
# A tibble: 583 × 3
   .pred BodyTemp .resid
   <dbl>    <dbl>  <dbl>
 1  98.6     97.8 -0.771
 2  99.1     98.2 -0.915
 3  99.1     97.9 -1.21 
 4  98.6     98.1 -0.471
 5  98.6     98.1 -0.471
 6  99.1     98.2 -0.915
 7  98.6     97.8 -0.771
 8  99.1     98   -1.11 
 9  99.1     98.1 -1.01 
10  99.1     98.1 -1.01 
# … with 573 more rows

Predictions v. Actual

dt_pred_plot <- ggplot(dt_res, 
                          aes(x = BodyTemp, 
                              y = .pred)) + 
  geom_point() + 
  labs(title = "Predictions vs Actual: Decision Tree", 
       x = "Body Temperature Outcome", 
       y = "Body Temperature Prediction")
dt_pred_plot

Predictions v. Residuals

dt_resplot <- ggplot(dt_res, 
                              aes(y = .resid, 
                                  x = .pred)) + 
  geom_point() + 
  labs(title = "Predictions vs Residuals: Decision Tree", 
       x = "Body Temperature Prediction", 
       y = "Residuals")
plot(dt_resplot)

Lasso

#Specifying Model
lasso_mod <- 
  linear_reg(penalty = tune(), mixture = 1) %>% 
  set_engine("glmnet")

#Creating Workflow
lasso_wf <- workflow() %>%
  add_model(lasso_mod) %>%
  add_recipe(rec_train)

#Tuning Grid Creation
lasso_grid <- tibble(penalty = 10^seq(-3, 0, length.out = 30))

#Cross Validation and tune_grid()

lasso_resample <- 
  lasso_wf %>%
  tune_grid(resamples = f_data_train,
            grid = lasso_grid,
            control = control_grid(verbose = FALSE, save_pred = TRUE),
            metrics = metric_set(rmse))

lasso_resample %>%
  collect_metrics()
# A tibble: 30 × 7
   penalty .metric .estimator  mean     n std_err .config              
     <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
 1 0.001   rmse    standard    1.19    25  0.0179 Preprocessor1_Model01
 2 0.00127 rmse    standard    1.19    25  0.0179 Preprocessor1_Model02
 3 0.00161 rmse    standard    1.19    25  0.0179 Preprocessor1_Model03
 4 0.00204 rmse    standard    1.19    25  0.0179 Preprocessor1_Model04
 5 0.00259 rmse    standard    1.18    25  0.0180 Preprocessor1_Model05
 6 0.00329 rmse    standard    1.18    25  0.0180 Preprocessor1_Model06
 7 0.00418 rmse    standard    1.18    25  0.0180 Preprocessor1_Model07
 8 0.00530 rmse    standard    1.18    25  0.0181 Preprocessor1_Model08
 9 0.00672 rmse    standard    1.18    25  0.0181 Preprocessor1_Model09
10 0.00853 rmse    standard    1.18    25  0.0182 Preprocessor1_Model10
# … with 20 more rows

Model Plotting

lr_plot <- 
  lasso_resample %>% 
  collect_metrics() %>% 
  ggplot(aes(x = penalty, y = mean)) + 
  geom_point() + 
  geom_line() +
  scale_x_log10(labels = scales::label_number())

lr_plot

Again, let’s Select the Best Performing Model

lasso_resample %>%
  show_best(n=1)
# A tibble: 1 × 7
  penalty .metric .estimator  mean     n std_err .config              
    <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
1  0.0574 rmse    standard    1.16    25  0.0195 Preprocessor1_Model18
best_lasso <- lasso_resample %>%
  select_best()

#Final Fits...
lasso_final_wf <- 
  lasso_wf %>% 
  finalize_workflow(best_lasso)

lasso_final_wf
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: linear_reg()

── Preprocessor ────────────────────────────────────────────────────────────────
2 Recipe Steps

• step_dummy()
• step_zv()

── Model ───────────────────────────────────────────────────────────────────────
Linear Regression Model Specification (regression)

Main Arguments:
  penalty = 0.0573615251044868
  mixture = 1

Computational engine: glmnet 
#Follow same process as above
lasso_final_fit <- 
  lasso_final_wf %>%
  fit(train_data) 

Residuals

lasso_residuals <- lasso_final_fit %>%
  augment(train_data) %>% #use augment() to make predictions from train data
  select(c(.pred, BodyTemp)) %>%
  mutate(.resid = BodyTemp - .pred) #add row here

lasso_residuals
# A tibble: 583 × 3
   .pred BodyTemp .resid
   <dbl>    <dbl>  <dbl>
 1  98.5     97.8 -0.745
 2  99.2     98.2 -1.01 
 3  99.4     97.9 -1.49 
 4  98.5     98.1 -0.399
 5  98.7     98.1 -0.583
 6  98.9     98.2 -0.736
 7  98.8     97.8 -0.955
 8  99.0     98   -0.958
 9  99.4     98.1 -1.34 
10  99.2     98.1 -1.12 
# … with 573 more rows

Model Predictions

lasso_pred_plot <- ggplot(lasso_residuals, 
                          aes(x = BodyTemp, 
                              y = .pred)) + 
  geom_point() + 
  labs(title = "Predictions v. Actual: Lasso", 
       x = "Body Temperature Outcome", 
       y = "Body Temperature Prediction")
lasso_pred_plot

lasso_residual_plot <- ggplot(lasso_residuals, 
                              aes(y = .resid, 
                                  x = .pred)) + 
  geom_point() + 
  labs(title = "Predictions v. Residuals: Lasso", 
       x = "Body Temperature Prediction", 
       y = "Residuals")
plot(lasso_residual_plot)

Random Forest

cores <- parallel::detectCores()
cores
[1] 8
#Specify
rf_mod <- 
  rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>% 
  set_engine("ranger", num.threads = cores) %>% 
  set_mode("regression")

#WorkFlow
rf_wf <- workflow() %>%
  add_model(rf_mod) %>%
  add_recipe(rec_train)

#Tuning 
rf_grid  <- expand.grid(mtry = c(3, 4, 5, 6),
                        min_n = c(40,50,60), 
                        trees = c(500,1000))

#Cross validation
rf_resample <- 
  rf_wf %>% 
  tune_grid(f_data_train,
            grid = 25,
            control = control_grid(save_pred = TRUE),
            metrics = metric_set(rmse))
i Creating pre-processing data to finalize unknown parameter: mtry
rf_resample %>%
  collect_metrics()
# A tibble: 25 × 8
    mtry min_n .metric .estimator  mean     n std_err .config              
   <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
 1    22    28 rmse    standard    1.19    25  0.0171 Preprocessor1_Model01
 2    15    10 rmse    standard    1.20    25  0.0165 Preprocessor1_Model02
 3    31     6 rmse    standard    1.23    25  0.0150 Preprocessor1_Model03
 4    21    33 rmse    standard    1.18    25  0.0176 Preprocessor1_Model04
 5    13    18 rmse    standard    1.19    25  0.0172 Preprocessor1_Model05
 6    29    37 rmse    standard    1.19    25  0.0175 Preprocessor1_Model06
 7     1    23 rmse    standard    1.17    25  0.0200 Preprocessor1_Model07
 8    28    22 rmse    standard    1.20    25  0.0167 Preprocessor1_Model08
 9    27    16 rmse    standard    1.21    25  0.0164 Preprocessor1_Model09
10    10    35 rmse    standard    1.17    25  0.0182 Preprocessor1_Model10
# … with 15 more rows
rf_resample %>%
  autoplot()

rf_resample %>%
  show_best(n=1)
# A tibble: 1 × 8
   mtry min_n .metric .estimator  mean     n std_err .config              
  <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
1     4    37 rmse    standard    1.17    25  0.0190 Preprocessor1_Model16
best_rf <- rf_resample %>%
  select_best(method = "rmse")

Final Fit & Residuals

rf_final_wf <- 
  rf_wf %>% 
  finalize_workflow(best_rf)

rf_final_fit <- 
  rf_final_wf %>%
  fit(train_data) 

rf_residuals <- rf_final_fit %>%
  augment(train_data) %>% 
  select(c(.pred, BodyTemp)) %>%
  mutate(.resid = BodyTemp - .pred) 

rf_residuals
# A tibble: 583 × 3
   .pred BodyTemp .resid
   <dbl>    <dbl>  <dbl>
 1  98.6     97.8 -0.767
 2  99.2     98.2 -0.990
 3  99.3     97.9 -1.38 
 4  98.5     98.1 -0.430
 5  98.8     98.1 -0.708
 6  98.8     98.2 -0.619
 7  98.9     97.8 -1.14 
 8  98.9     98   -0.902
 9  99.3     98.1 -1.21 
10  99.0     98.1 -0.947
# … with 573 more rows

Predictions (Tuned v. Actual)

rf_pred_plot <- ggplot(rf_residuals, 
                          aes(x = BodyTemp, 
                              y = .pred)) + 
  geom_point() + 
  labs(title = "Predictions vs Actual: Random Forest", 
       x = "Body Temperature Actual", 
       y = "Body Temperature Prediction")
rf_pred_plot

rf_residual_plot <- ggplot(rf_residuals, 
                              aes(y = .resid, 
                                  x = .pred)) + 
  geom_point() + 
  labs(title = "Predictions vs Residuals: Random Forest", 
       x = "Body Temperature Prediction", 
       y = "Residuals")
plot(rf_residual_plot)

Model Selection

Overall, the majority of the RMSE models performed similarly. Overall, Lasso and RF models seemed to display a relationship between actual and predicted body temperature.Given what we know, I would select the Lasso model as it is more accurate.

Final Evaluation

lasso_last_fit <- 
  lasso_final_wf %>% 
  last_fit(data_split)

lasso_last_fit %>% 
  collect_metrics()
# A tibble: 2 × 4
  .metric .estimator .estimate .config             
  <chr>   <chr>          <dbl> <chr>               
1 rmse    standard      1.16   Preprocessor1_Model1
2 rsq     standard      0.0555 Preprocessor1_Model1