modeleval

Libraries and Loading Data:

Library

library(here)
here() starts at /Users/nathangreenslit/Desktop/UGA/Spring 2023/MADA/nathangreenslit-MADA-portfolio
library(tidyverse)
── Attaching packages
───────────────────────────────────────
tidyverse 1.3.2 ──
✔ ggplot2 3.4.0     ✔ purrr   1.0.1
✔ tibble  3.1.8     ✔ dplyr   1.1.0
✔ tidyr   1.2.1     ✔ stringr 1.5.0
✔ readr   2.1.3     ✔ forcats 0.5.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
library(rsample) #Data splitting
library(tidymodels)#Modeling
── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
✔ broom        1.0.2     ✔ recipes      1.0.5
✔ dials        1.1.0     ✔ tune         1.0.1
✔ infer        1.0.4     ✔ workflows    1.1.3
✔ modeldata    1.1.0     ✔ workflowsets 1.0.0
✔ parsnip      1.0.4     ✔ yardstick    1.1.0
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter()   masks stats::filter()
✖ recipes::fixed()  masks stringr::fixed()
✖ dplyr::lag()      masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step()   masks stats::step()
• Use tidymodels_prefer() to resolve common conflicts.

Data

d<- readRDS(here("fluanalysis", "data", "SypAct_clean.rds"))

Data Splitting:

Split Data

# Fix the random numbers by setting the seed 
# This enables the analysis to be reproducible when random numbers are used 
set.seed(222)
# Put 3/4 of the data into the training set 
data_split <- initial_split(d, prop = 3/4)

# Create data frames for the two sets:
train_data <- training(data_split)
test_data  <- testing(data_split)

Creating Recipe and Workflow:

Create New Recipe Between Nausea and All Other Variables

flu_rec<- 
  recipe(Nausea ~ ., data = train_data) %>%
  step_dummy(all_nominal(), - all_outcomes()) %>%
  step_nzv(all_predictors())
tidy(flu_rec)
# A tibble: 2 × 6
  number operation type  trained skip  id         
   <int> <chr>     <chr> <lgl>   <lgl> <chr>      
1      1 step      dummy FALSE   FALSE dummy_FvN9r
2      2 step      nzv   FALSE   FALSE nzv_XeMK7  

Fit the Model

lr_mod <- 
  logistic_reg() %>% 
  set_engine("glm")

Create Workflow: Model + Recipe

flu_wflow <- 
  workflow() %>% 
  add_model(lr_mod) %>% 
  add_recipe(flu_rec)

Prepare Recipe and Train Model from the resulting predictors

flu_fit <- 
  flu_wflow %>% 
  fit(data = train_data)

tidy(flu_fit)
# A tibble: 32 × 5
   term                  estimate std.error statistic p.value
   <chr>                    <dbl>     <dbl>     <dbl>   <dbl>
 1 (Intercept)             2.48      9.37       0.265  0.791 
 2 BodyTemp               -0.0587    0.0958    -0.613  0.540 
 3 SwollenLymphNodes_Yes  -0.240     0.231     -1.04   0.300 
 4 ChestCongestion_Yes     0.168     0.252      0.668  0.504 
 5 ChillsSweats_Yes        0.148     0.331      0.448  0.654 
 6 NasalCongestion_Yes     0.600     0.309      1.94   0.0521
 7 Sneeze_Yes              0.0974    0.247      0.395  0.693 
 8 Fatigue_Yes             0.180     0.438      0.411  0.681 
 9 SubjectiveFever_Yes     0.191     0.261      0.733  0.463 
10 Headache_Yes            0.482     0.351      1.37   0.169 
# … with 22 more rows

Using Trained Workflow to Predict:

Predict

predict(flu_fit, test_data)
# A tibble: 183 × 1
   .pred_class
   <fct>      
 1 No         
 2 No         
 3 No         
 4 No         
 5 No         
 6 Yes        
 7 Yes        
 8 No         
 9 No         
10 Yes        
# … with 173 more rows

Augment

flu_aug_test<- 
  augment(flu_fit, test_data)

flu_aug_train<- 
  augment(flu_fit, train_data)

This shows the probabilities of either Yes or No for flu symptoms.

Evaluate Performance:

Test Data

ROC Curve

flu_aug_test %>% 
  roc_curve(truth = Nausea, .pred_Yes, event_level = "second") %>% #> For binary classification, the first factor level is assumed to be the event. Use the argument `event_level = "second"` to alter this as needed.
  autoplot()
Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
dplyr 1.1.0.
ℹ Please use `reframe()` instead.
ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
  always returns an ungrouped data frame and adjust accordingly.
ℹ The deprecated feature was likely used in the yardstick package.
  Please report the issue at <https://github.com/tidymodels/yardstick/issues>.

Area Under the Curve

flu_aug_test %>% 
  roc_auc(truth = Nausea, .pred_Yes, event_level = "second")
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.731

Train Data

ROC Curve

flu_aug_train %>% 
  roc_curve(truth = Nausea, .pred_Yes, event_level = "second") %>% 
  autoplot()

Area Under the Curve

flu_aug_train %>% 
  roc_auc(truth = Nausea, .pred_Yes, event_level = "second")
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.785

ALTERNATIVE MODEL:

Create New Recipe

vom_rec<- 
  recipe(Nausea ~ Vomit, data = train_data)

Fit a Model

lr_mod <- 
  logistic_reg() %>% 
  set_engine("glm")

Create Workflow: Model + Recipe

vom_wflow <- 
  workflow() %>% 
  add_model(lr_mod) %>% 
  add_recipe(vom_rec)

Prepare Recipe and Train Model from the resulting predictors

vom_fit <- 
  vom_wflow %>% 
  fit(data = train_data)

tidy(vom_fit)
# A tibble: 2 × 5
  term        estimate std.error statistic  p.value
  <chr>          <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)   -0.945     0.101     -9.36 8.36e-21
2 VomitYes       2.57      0.360      7.15 8.97e-13

Using Trained Workflow to Predict:

Predict

predict(vom_fit, test_data)
# A tibble: 183 × 1
   .pred_class
   <fct>      
 1 No         
 2 No         
 3 Yes        
 4 No         
 5 No         
 6 No         
 7 Yes        
 8 No         
 9 No         
10 Yes        
# … with 173 more rows

Augment

vom_aug_test<- 
  augment(vom_fit, test_data)

vom_aug_train<- 
  augment(vom_fit, train_data)

Evaluate Performance:

Test Data

ROC Curve

vom_aug_test %>% 
  roc_curve(truth = Nausea, .pred_Yes, event_level = "second") %>% 
  autoplot()

Area Under the Curve

vom_aug_test %>% 
  roc_auc(truth = Nausea, .pred_Yes, event_level = "second")
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.578

Train Data

ROC Curve

vom_aug_train %>% 
  roc_curve(truth = Nausea, .pred_Yes, event_level = "second") %>% 
  autoplot()

Area Under the Curve

vom_aug_train %>% 
  roc_auc(truth = Nausea, .pred_Yes, event_level = "second")
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.622

This section added by Kimberly Perez

Making the Recipe [Training]

set.seed(321)
#Creating the recipe 
BT_recipe<- recipe(BodyTemp ~., data=train_data)

Workflow Creation and Prediction [Training]

#Now let's set or define our model
lr_BT<- linear_reg() %>%
  set_engine("lm")

#Creating Workflow
BT_WF<- workflow() %>% 
  add_model (lr_BT) %>%
  add_recipe(BT_recipe)

#Creation of Single Function
BT_fit<- BT_WF %>% 
  fit(data= train_data)

#Extracting 
BT_fit %>%
  extract_fit_parsnip() %>%
  tidy()
# A tibble: 32 × 5
   term                 estimate std.error statistic    p.value
   <chr>                   <dbl>     <dbl>     <dbl>      <dbl>
 1 (Intercept)           97.7        0.344   284.    0         
 2 SwollenLymphNodesYes  -0.190      0.108    -1.76  0.0789    
 3 ChestCongestionYes     0.146      0.115     1.27  0.203     
 4 ChillsSweatsYes        0.184      0.148     1.24  0.214     
 5 NasalCongestionYes    -0.182      0.136    -1.34  0.180     
 6 SneezeYes             -0.474      0.113    -4.18  0.0000338 
 7 FatigueYes             0.362      0.187     1.94  0.0529    
 8 SubjectiveFeverYes     0.564      0.118     4.79  0.00000223
 9 HeadacheYes            0.0675     0.150     0.448 0.654     
10 WeaknessMild           0.0756     0.211     0.358 0.720     
# … with 22 more rows
#Predicting 
predict(BT_fit, train_data)
# A tibble: 547 × 1
   .pred
   <dbl>
 1  99.2
 2  98.9
 3  98.7
 4  98.7
 5  98.9
 6  98.4
 7  98.7
 8  99.6
 9  99.3
10  98.7
# … with 537 more rows
pred_BT<- augment(BT_fit, train_data)

pred_BT %>% 
  select(BodyTemp, .pred)
# A tibble: 547 × 2
   BodyTemp .pred
      <dbl> <dbl>
 1    100.   99.2
 2     98.2  98.9
 3     98.1  98.7
 4    101.   98.7
 5     98    98.9
 6     98    98.4
 7     98    98.7
 8    103.   99.6
 9     99.4  99.3
10    102.   98.7
# … with 537 more rows

Model Assessment 1 [Training]

BT_rmse<- pred_BT %>% 
  rmse(truth=BodyTemp, .pred)

BT_rmse
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard        1.12

Model Assessment 2 [Training]

BT_rsq<- pred_BT %>% 
  rsq(truth=BodyTemp, .pred)

BT_rsq
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rsq     standard       0.153

Making the Recipe [Test]

set.seed(321)
#Creating the recipe 
BT_recipe_test<- recipe(BodyTemp ~., data=test_data)

Workflow Creation and Prediction [Test]

#Now let's set or define our model
lr_BT_test<- linear_reg() %>%
  set_engine("lm")

#Creating Workflow
BT_WF_test<- workflow() %>% 
  add_model (lr_BT_test) %>%
  add_recipe(BT_recipe)

#Creation of Single Function
BT_fit_test<- BT_WF_test %>% 
  fit(data= test_data)

#Extracting 
BT_fit_test %>%
  extract_fit_parsnip() %>%
  tidy()
# A tibble: 32 × 5
   term                 estimate std.error statistic   p.value
   <chr>                   <dbl>     <dbl>     <dbl>     <dbl>
 1 (Intercept)           99.0        0.732   135.    1.81e-159
 2 SwollenLymphNodesYes  -0.0296     0.213    -0.139 8.90e-  1
 3 ChestCongestionYes    -0.0838     0.225    -0.372 7.10e-  1
 4 ChillsSweatsYes        0.262      0.275     0.951 3.43e-  1
 5 NasalCongestionYes    -0.436      0.253    -1.72  8.68e-  2
 6 SneezeYes              0.0378     0.220     0.172 8.64e-  1
 7 FatigueYes            -0.0824     0.353    -0.234 8.16e-  1
 8 SubjectiveFeverYes     0.0292     0.234     0.125 9.01e-  1
 9 HeadacheYes           -0.217      0.246    -0.880 3.80e-  1
10 WeaknessMild          -0.377      0.484    -0.779 4.37e-  1
# … with 22 more rows
#Predicting 
predict(BT_fit_test, test_data)
# A tibble: 183 × 1
   .pred
   <dbl>
 1  99.1
 2  98.9
 3  99.9
 4  98.3
 5  99.5
 6  98.8
 7  99.8
 8  99.1
 9  98.9
10  99.5
# … with 173 more rows
pred_BT_test<- augment(BT_fit_test, test_data)

pred_BT_test %>% 
  select(BodyTemp, .pred)
# A tibble: 183 × 2
   BodyTemp .pred
      <dbl> <dbl>
 1     98.3  99.1
 2     98.8  98.9
 3    102.   99.9
 4     98.2  98.3
 5     97.8  99.5
 6     97.8  98.8
 7    100    99.8
 8    101.   99.1
 9     98.8  98.9
10    100.   99.5
# … with 173 more rows

Model Assessment 1 [Test]

BT_rmse_test<- pred_BT_test %>% 
  rmse(truth=BodyTemp, .pred)

BT_rmse_test
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard        1.05

Model Assessment 2 [Test]

BT_rsq_test<- pred_BT_test %>% 
  rsq(truth=BodyTemp, .pred)

BT_rsq_test
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rsq     standard       0.147

Making the Recipe: Body Temperature & Runny Nose [Training]

set.seed(321)
#Creating the recipe 
BTRN_recipe<- recipe(BodyTemp~RunnyNose, data=train_data)

Workflow Creation and Prediction [Training]

#Now let's set or define our model
lr_BTRN<- linear_reg() %>%
  set_engine("lm")

#Creating Workflow
BTRN_WF<- workflow() %>% 
  add_model (lr_BTRN) %>%
  add_recipe(BTRN_recipe)

#Creation of Single Function
BTRN_fit<- BTRN_WF %>% 
  fit(data= train_data)

#Extracting 
BTRN_fit %>%
  extract_fit_parsnip() %>%
  tidy()
# A tibble: 2 × 5
  term         estimate std.error statistic p.value
  <chr>           <dbl>     <dbl>     <dbl>   <dbl>
1 (Intercept)    99.1      0.0964   1028.    0     
2 RunnyNoseYes   -0.261    0.114      -2.29  0.0225
#Predicting 
predict(BTRN_fit, train_data)
# A tibble: 547 × 1
   .pred
   <dbl>
 1  98.9
 2  98.9
 3  98.9
 4  98.9
 5  98.9
 6  98.9
 7  98.9
 8  98.9
 9  99.1
10  98.9
# … with 537 more rows
pred_BTRN<- augment(BTRN_fit, train_data)

pred_BTRN %>% 
  select(BodyTemp, .pred)
# A tibble: 547 × 2
   BodyTemp .pred
      <dbl> <dbl>
 1    100.   98.9
 2     98.2  98.9
 3     98.1  98.9
 4    101.   98.9
 5     98    98.9
 6     98    98.9
 7     98    98.9
 8    103.   98.9
 9     99.4  99.1
10    102.   98.9
# … with 537 more rows

Model Assessment 1 [Training]

BTRN_rmse<- pred_BTRN %>% 
  rmse(truth=BodyTemp, .pred)

BTRN_rmse
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard        1.21

Making the Recipe: Body Temperature & Runny Nose [Training]

set.seed(321)
#Creating the recipe 
BTRN_recipe_test<- recipe(BodyTemp~RunnyNose, data=test_data)

Workflow Creation and Prediction [Test]

#Now let's set or define our model
lr_BTRN_test<- linear_reg() %>%
  set_engine("lm")

#Creating Workflow
BTRN_WF_test<- workflow() %>% 
  add_model (lr_BTRN_test) %>%
  add_recipe(BTRN_recipe)

#Creation of Single Function
BTRN_fit_test<- BTRN_WF %>% 
  fit(data= test_data)

#Extracting 
BTRN_fit_test %>%
  extract_fit_parsnip() %>%
  tidy()
# A tibble: 2 × 5
  term         estimate std.error statistic   p.value
  <chr>           <dbl>     <dbl>     <dbl>     <dbl>
1 (Intercept)    99.1       0.154    642.   7.60e-306
2 RunnyNoseYes   -0.388     0.184     -2.11 3.63e-  2
#Predicting 
predict(BTRN_fit_test, test_data)
# A tibble: 183 × 1
   .pred
   <dbl>
 1  99.1
 2  98.7
 3  98.7
 4  98.7
 5  99.1
 6  99.1
 7  98.7
 8  99.1
 9  99.1
10  99.1
# … with 173 more rows
pred_BTRN_test<- augment(BTRN_fit_test, test_data)

pred_BTRN_test %>% 
  select(BodyTemp, .pred)
# A tibble: 183 × 2
   BodyTemp .pred
      <dbl> <dbl>
 1     98.3  99.1
 2     98.8  98.7
 3    102.   98.7
 4     98.2  98.7
 5     97.8  99.1
 6     97.8  99.1
 7    100    98.7
 8    101.   99.1
 9     98.8  99.1
10    100.   99.1
# … with 173 more rows

Model Assessment 1 [Test]

BTRN_rmse_test<- pred_BTRN_test %>% 
  rmse(truth=BodyTemp, .pred)

BTRN_rmse_test
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard        1.13

Our training data out preformed our test data, estimates 1.2 and 1.12 respectively. Thus, runny nose does not seem like a predictor of body temperature.