Setup
First, loading tidymodels, along with a few additional tidymodels extension packages:
library(tidymodels)
── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
✔ broom 1.0.5 ✔ recipes 1.0.8.9000
✔ dials 1.2.0.9000 ✔ rsample 1.2.0.9000
✔ dplyr 1.1.3 ✔ tibble 3.2.1
✔ ggplot2 3.4.3 ✔ tidyr 1.3.0
✔ infer 1.0.5 ✔ tune 1.1.2.9000
✔ modeldata 1.2.0 ✔ workflows 1.1.3
✔ parsnip 1.1.1 ✔ workflowsets 1.0.1
✔ purrr 1.0.2 ✔ yardstick 1.2.0.9001
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ purrr::discard() masks scales::discard()
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
✖ recipes::step() masks stats::step()
• Learn how to get started at https://www.tidymodels.org/start/
library(baguette)
library(bundle)
library(finetune)
tidymodels supports a number of R frameworks for parallel computing:
library(doMC)
library(parallelly)

availableCores()
registerDoMC(cores = max(1, availableCores() - 1))
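Assuming the doMC backend registered as above, we can confirm how many workers foreach-based tuning will actually use:
# check the number of registered parallel workers
foreach::getDoParWorkers()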
Now, loading and cleaning the data:
library(QSARdata)
data(Mutagen)

mutagen_tbl <-
  bind_cols(
    as_tibble_col(Mutagen_Outcome, "outcome"),
    as_tibble(Mutagen_Dragon)
  )

mutagen_tbl
# A tibble: 4,335 × 1,580
outcome MW AMW Sv Se Sp Ss Mv Me Mp Ms nAT
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 mutagen 326. 7.59 29.3 42.6 30.6 50.7 0.68 0.99 0.71 2.03 43
2 mutagen 174. 9.17 13.2 19.6 13.4 38 0.7 1.03 0.71 2.92 19
3 nonmutagen 300. 9.39 20.0 33.6 21.0 61.2 0.63 1.05 0.66 3.06 32
4 nonmutagen 143. 6.23 12.6 23.1 13.5 26.2 0.55 1 0.59 2.62 23
5 nonmutagen 216. 18.0 10.6 13.0 11.7 27.1 0.88 1.08 0.98 2.71 12
6 mutagen 190. 7.93 15.4 24.4 16.0 36 0.64 1.02 0.67 2.57 24
7 mutagen 328. 12.6 18.8 27.1 20.0 49.4 0.72 1.04 0.77 2.75 26
8 nonmutagen 324. 8.11 26.3 40.7 27.4 59.2 0.66 1.02 0.68 2.47 40
9 mutagen 136. 7.56 11.3 18.2 11.8 25.7 0.63 1.01 0.65 2.57 18
10 mutagen 323. 7.89 26.8 41.5 27.9 54.9 0.65 1.01 0.68 2.29 41
# ℹ 4,325 more rows
# ℹ 1,568 more variables: nSK <int>, nBT <int>, nBO <int>, nBM <int>,
# SCBO <dbl>, ARR <dbl>, nCIC <int>, nCIR <int>, RBN <int>, RBF <dbl>,
# nDB <int>, nTB <int>, nAB <int>, nH <int>, nC <int>, nN <int>, nO <int>,
# nP <int>, nS <int>, nF <int>, nCL <int>, nBR <int>, nI <int>, nX <int>,
# nR03 <int>, nR04 <int>, nR05 <int>, nR06 <int>, nR07 <int>, nR08 <int>,
# nR09 <int>, nR10 <int>, nR11 <int>, nR12 <int>, nBnz <int>, ZM1 <int>, …
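Before splitting, it's worth checking how balanced the two outcome classes are; a quick sketch with dplyr:
# tally mutagens vs. nonmutagens
count(mutagen_tbl, outcome)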
Splitting up data
We split data into training and testing sets so that, once we’ve trained our final model, we can get an honest assessment of the model’s performance:
set.seed(1)
mutagen_split <- initial_split(mutagen_tbl)
mutagen_train <- training(mutagen_split)
mutagen_test <- testing(mutagen_split)

set.seed(1)
mutagen_folds <- vfold_cv(mutagen_train)
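By default, initial_split() allots three-quarters of the rows to training and vfold_cv() creates 10 folds. If class imbalance were more of a concern, both functions accept a strata argument; a sketch of that alternative (not used here):
# stratified versions of the split and resamples
mutagen_split_strat <- initial_split(mutagen_tbl, strata = outcome)
mutagen_folds_strat <- vfold_cv(training(mutagen_split_strat), strata = outcome)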
Defining our modeling strategies
Our basic strategy is to first try out a bunch of different modeling approaches, and once we have an initial sense for how they perform, delve further into the one that looks the most promising.
We first define a few recipes, which specify how to process the input data so that machine learning models will know how to work with the predictors:
recipe_basic <-
  recipe(outcome ~ ., mutagen_train) %>%
  step_nzv(all_predictors())

recipe_normalize <-
  recipe_basic %>%
  step_YeoJohnson(all_double_predictors()) %>%
  step_normalize(all_double_predictors())

recipe_pca <-
  recipe_normalize %>%
  step_pca(all_numeric_predictors(), num_comp = tune())
These recipes vary in complexity, from basic checks on the input data to advanced feature engineering techniques like principal component analysis.
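To see what a recipe does to the data before it ever reaches a model, you can estimate and apply it on its own; a quick sketch using recipes' prep() and bake():
# estimate the preprocessing on the training set, then apply it to that same data
recipe_normalize %>%
  prep() %>%
  bake(new_data = NULL)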
We also define several model specifications. tidymodels comes with support for all sorts of machine learning algorithms, from neural networks to XGBoost boosted trees to plain old logistic regression:
spec_lr <-
  logistic_reg() %>%
  set_mode("classification")

spec_bm <-
  bag_mars(num_terms = tune(), prod_degree = tune()) %>%
  set_engine("earth") %>%
  set_mode("classification")

spec_bt <-
  bag_tree(cost_complexity = tune(), tree_depth = tune(), min_n = tune()) %>%
  set_engine("rpart") %>%
  set_mode("classification")

spec_nn <-
  mlp(hidden_units = tune(), penalty = tune(), epochs = tune()) %>%
  set_engine("nnet", MaxNWts = 15000) %>%
  set_mode("classification")

spec_svm <-
  svm_rbf(cost = tune(), rbf_sigma = tune(), margin = tune()) %>%
  set_mode("classification")

spec_xgb <-
  boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(),
             learn_rate = tune(), stop_iter = 10) %>%
  set_engine("xgboost") %>%
  set_mode("classification")
Note how similar the code for each of these model specifications looks! tidymodels takes care of the “translation” from our unified syntax to the code that these algorithms expect.
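You can inspect that translation directly with parsnip's translate(), which prints the engine-specific call a specification maps to:
# show the underlying xgboost call that spec_xgb generates
spec_xgb %>% translate()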
If typing all of these out seems cumbersome to you, or you’re not sure how to define a model specification that makes sense for your data, the usemodels RStudio addin may help!
Evaluating models: round 1
We’ll pair machine learning models with the recipes that make the most sense for them:
wf_set <-
  workflow_set(
    preproc = list(basic = recipe_basic),
    models = list(boost_tree = spec_xgb, logistic_reg = spec_lr)
  ) %>%
  bind_rows(
    workflow_set(
      preproc = list(normalize = recipe_normalize),
      models = list(
        bag_tree = spec_bt,
        bag_mars = spec_bm,
        svm_rbf = spec_svm,
        mlp = spec_nn
      )
    )
  ) %>%
  bind_rows(
    workflow_set(
      preproc = list(pca = recipe_pca),
      models = list(
        bag_tree = spec_bt,
        bag_mars = spec_bm,
        svm_rbf = spec_svm,
        mlp = spec_nn
      )
    )
  )

wf_set
wf_set
# A workflow set/tibble: 10 × 4
wflow_id info option result
<chr> <list> <list> <list>
1 basic_boost_tree <tibble [1 × 4]> <opts[0]> <list [0]>
2 basic_logistic_reg <tibble [1 × 4]> <opts[0]> <list [0]>
3 normalize_bag_tree <tibble [1 × 4]> <opts[0]> <list [0]>
4 normalize_bag_mars <tibble [1 × 4]> <opts[0]> <list [0]>
5 normalize_svm_rbf <tibble [1 × 4]> <opts[0]> <list [0]>
6 normalize_mlp <tibble [1 × 4]> <opts[0]> <list [0]>
7 pca_bag_tree <tibble [1 × 4]> <opts[0]> <list [0]>
8 pca_bag_mars <tibble [1 × 4]> <opts[0]> <list [0]>
9 pca_svm_rbf <tibble [1 × 4]> <opts[0]> <list [0]>
10 pca_mlp <tibble [1 × 4]> <opts[0]> <list [0]>
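Each row of the workflow set pairs one recipe with one model specification, and any single pairing can be pulled back out by its id:
# retrieve the workflow combining the basic recipe with XGBoost
extract_workflow(wf_set, id = "basic_boost_tree")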
wf_set_fit <-
  workflow_map(
    wf_set,
    fn = "tune_grid",
    verbose = TRUE,
    seed = 1,
    resamples = mutagen_folds,
    control = control_grid(parallel_over = "everything")
  )
# keep only the workflows that tuned successfully, i.e. whose
# results contain a metrics tibble
wf_set_fit <-
  wf_set_fit[
    map_lgl(
      wf_set_fit$result,
      ~ pluck(., ".metrics", 1) %>% inherits("tbl_df")
    ),
  ]

# first look at metrics:
metrics_wf_set <- collect_metrics(wf_set_fit, summarize = FALSE)
Taking a look at how these models did:
metrics_wf_set %>%
  filter(.metric == "roc_auc") %>%
  arrange(desc(.estimate))
# A tibble: 860 × 8
wflow_id .config preproc model id .metric .estimator .estimate
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
1 basic_boost_tree Preprocess… recipe boos… Fold… roc_auc binary 0.924
2 basic_boost_tree Preprocess… recipe boos… Fold… roc_auc binary 0.919
3 basic_boost_tree Preprocess… recipe boos… Fold… roc_auc binary 0.917
4 basic_boost_tree Preprocess… recipe boos… Fold… roc_auc binary 0.915
5 basic_boost_tree Preprocess… recipe boos… Fold… roc_auc binary 0.915
6 basic_boost_tree Preprocess… recipe boos… Fold… roc_auc binary 0.915
7 basic_boost_tree Preprocess… recipe boos… Fold… roc_auc binary 0.915
8 basic_boost_tree Preprocess… recipe boos… Fold… roc_auc binary 0.914
9 basic_boost_tree Preprocess… recipe boos… Fold… roc_auc binary 0.914
10 basic_boost_tree Preprocess… recipe boos… Fold… roc_auc binary 0.914
# ℹ 850 more rows
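workflowsets also provides helpers that summarize performance across the whole set at once; a quick sketch using rank_results() (autoplot() gives a visual analogue):
# rank workflows by mean cross-validated ROC AUC
rank_results(wf_set_fit, rank_metric = "roc_auc")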
Evaluating models: round 2
It looks like XGBoost with minimal preprocessing was considerably more performant than the other proposed models. Let’s work with those XGBoost results and see if we can make any improvements to performance:
xgb_res <- extract_workflow_set_result(wf_set_fit, "basic_boost_tree")

xgb_wflow <-
  workflow() %>%
  add_recipe(recipe_basic) %>%
  add_model(spec_xgb)

# a metric set, assumed to match the metrics shown in the results below
mutagen_metrics <- metric_set(accuracy, kap, mcc, roc_auc)

xgb_sim_anneal_fit <-
  tune_sim_anneal(
    object = xgb_wflow,
    resamples = mutagen_folds,
    iter = 25,
    metrics = mutagen_metrics,
    initial = xgb_res,
    control = control_sim_anneal(verbose = TRUE, parallel_over = "everything")
  )
metrics_xgb <- collect_metrics(xgb_sim_anneal_fit, summarize = FALSE)
Looks like we did make a small improvement:
metrics_xgb
# A tibble: 1,400 × 10
id trees min_n tree_depth learn_rate .metric .estimator .estimate .config
<chr> <int> <int> <int> <dbl> <chr> <chr> <dbl> <chr>
1 Fold01 1860 6 8 0.00276 accura… binary 0.798 initia…
2 Fold01 1860 6 8 0.00276 kap binary 0.591 initia…
3 Fold01 1860 6 8 0.00276 mcc binary 0.591 initia…
4 Fold01 1860 6 8 0.00276 roc_auc binary 0.876 initia…
5 Fold02 1860 6 8 0.00276 accura… binary 0.837 initia…
6 Fold02 1860 6 8 0.00276 kap binary 0.669 initia…
7 Fold02 1860 6 8 0.00276 mcc binary 0.672 initia…
8 Fold02 1860 6 8 0.00276 roc_auc binary 0.914 initia…
9 Fold03 1860 6 8 0.00276 accura… binary 0.852 initia…
10 Fold03 1860 6 8 0.00276 kap binary 0.700 initia…
# ℹ 1,390 more rows
# ℹ 1 more variable: .iter <int>
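Rather than scanning raw per-fold metrics, tune's show_best() summarizes the top configurations from the search:
# top parameter combinations by mean ROC AUC
show_best(xgb_sim_anneal_fit, metric = "roc_auc")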
The final model fit
The last_fit() function will take care of fitting the most performant model specification to the whole training dataset and evaluating it against the test set:
xgb_final_fit <-
  last_fit(
    finalize_workflow(xgb_wflow, select_best(xgb_sim_anneal_fit, metric = "roc_auc")),
    mutagen_split
  )
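Because last_fit() also evaluates the finalized workflow on the held-out test set, we can retrieve that honest performance estimate directly:
# test-set metrics from the final fit
collect_metrics(xgb_final_fit)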
Deploying to Connect
From here, all we need to do is pass our fitted model off to vetiver for deployment to Posit Connect:
library(pins)
library(vetiver)

final_fit <- extract_workflow(xgb_final_fit)
final_fit_vetiver <- vetiver_model(final_fit, "mutagen")

board <- board_connect()
vetiver_pin_write(board, final_fit_vetiver)
vetiver_deploy_rsconnect(board, "simon.couch/mutagen")
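Once the API is live on Connect, predictions can be requested from R with vetiver_endpoint() and predict(); the URL below is a placeholder for whatever address Connect assigns to the deployed content:
# query the deployed model (hypothetical Connect URL)
endpoint <- vetiver_endpoint("https://connect.example.com/content/mutagen/predict")
predict(endpoint, mutagen_test %>% select(-outcome) %>% slice(1:5))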