The following scripts need to be run first before knitting the document corresponding to model_type specified below:
3_model_sculpt_compas.R
Sculpting - Main models chunk (need to manually run in the interactive mode, not run during knitting as it takes some time)
Setup and load
Show the code
library(dplyr)library(stringr)source(here::here("R/0_setup.R"))theme_set(theme_bw(base_size =12))theme_facets <-theme(text =element_text(size =16),legend.position ="inside",legend.position.inside =c(0.85, 0.27), legend.background =element_rect(colour ="black"), legend.title =element_blank())theme_single <-theme(text =element_text(size =16))# Function to check if the model is trainedload_model_if_trained <-function(model_type) { model_path <-file.path(storage_folder, sname(paste0(model_type, "-fit_final.rds")))if(file.exists(model_path)) {load_results(sname(paste0(model_type, "-fit_final.rds"))) } else {stop(paste0("Model ", model_type, " is not trained for compas, `", model_path, "` does not exist. ","Please train it first with: ","`Rscript R/2_train_models.R ", model_type, " compas FALSE`")) }}
Show the code
# set dataset (any with discrete response)dataset <-"compas"# set model_typemodel_type ="xgb_bayes"# model_type = "xgb"# set nr of features for a polished modeltop_k <-3# util function for storagesname <-function(x, prefix = dataset) {paste0(prefix, "-", x)}# logit functionslogit <-function(x) log(x / (1-x))inv.logit <-function(x) 1/ (1+exp(-x))# load datasetdd <-define_data(dataset)# load xgbxgb <-load_model_if_trained(model_type)xgb_fo <-load_model_if_trained("xgb_1_order_bayes")# get product marginalspm <-sample_marginals(dd$data$train[dd$covariates$all], n =1e4, seed =1234)
xgb model
Sculpting
Main models
Sculpting on product marginals on logit scale.
Show the code
# # get rough model - on pm# # Already generated in 3_model_sculpt_compas.R# rs_pm <- sculpt_rough(# pm,# seed = 1234,# model_predict_fun = function(x) {# p <- predict(xgb, new_data = x, type = "prob")$.pred_1# logit(p)# }# )# # detailed model on pm# Already generated in 3_model_sculpt_compas.R# ds_pm <- sculpt_detailed_gam(rs_pm)## Below we use custom smoother for even better fitting smoothingsgam_cgam_smoother <-function(x, y, is_discrete, column_name, na_ind =NULL) { use_cgam <-TRUEif (column_name =="age") { s.decr <- cgam::s.decr form <- y ~s.decr(x, numknots =3) } elseif (column_name =="priors") { s.incr <- cgam::s.incr form <- y ~s.incr(x, numknots =3) } elseif (column_name =="juvenile_crimes") {return(getPAVAest(outcome = y, score = x)) # from stats4phc } else { use_cgam <-FALSE }if (use_cgam) {tryCatch( cgam::cgam(form), error =function(e) { s <- mgcv::stryCatch( mgcv::gam(y ~s(x, k =-1)),error =function(e) { mgcv::gam(y ~ x) } ) } ) } else {if (is_discrete) { s <- mgcv::stryCatch( mgcv::gam(y ~ x),error =function(e) {lm(y ~0) } ) } else { s <- mgcv::stryCatch( mgcv::gam(y ~s(x, k =-1)),error =function(e) { mgcv::gam(y ~ x) } ) } }}gam_cgam_predict <-function(smoother, new_x, is_discrete, column_name, na_ind =NULL) {if (inherits(smoother, "cgam")) {# cgam fails on extrapolation - need to do this manuallyif (min(new_x) <min(smoother$xmat0[, 1])) { new_x[new_x <min(smoother$xmat0[, 1])] <-min(smoother$xmat0[, 1]) }if (max(new_x) >max(smoother$xmat0[, 1])) { new_x[new_x >max(smoother$xmat0[, 1])] <-max(smoother$xmat0[, 1]) } newdata <-data.frame(x = new_x)predict(smoother, newData = newdata)$fit } elseif (inherits(smoother, "gam")) { newdata <-data.frame(x = new_x)as.numeric(predict(smoother, newdata = newdata)) } elseif (is.data.frame(smoother)) {# specific for pava as there is no model returned, just a vectorifelse( new_x ==0, smoother$estimate[smoother$score ==0][1],ifelse( new_x ==1, smoother$estimate[smoother$score ==1][1], smoother$estimate[smoother$score ==2][1] ) ) }}polished_smoother <-function(x, y, is_discrete, column_name, na_ind =NULL) {if (column_name =="age") { s.decr <- cgam::s.decr form <- y ~s.decr(x, numknots =3) } elseif (column_name =="priors") { s.incr <- cgam::s.incr form <- y ~s.incr(x, numknots =3) } elseif (column_name =="juvenile_crimes") {return(getPAVAest(outcome = y, score = x)) # from stats4phc } else { out <-list()class(out) <-"constant"return(out) }tryCatch( cgam::cgam(form), error =function(e) { s <- mgcv::stryCatch( mgcv::gam(y ~s(x, k =-1)),error =function(e) { mgcv::gam(y ~ x) } ) } )}polished_smoother_predict <-function(smoother, new_x, is_discrete, column_name, na_ind =NULL) {if (inherits(smoother, "cgam")) {# cgam fails on extrapolation - need to do this manuallyif (min(new_x) <min(smoother$xmat0[, 1])) { new_x[new_x <min(smoother$xmat0[, 1])] <-min(smoother$xmat0[, 1]) }if (max(new_x) >max(smoother$xmat0[, 1])) { new_x[new_x >max(smoother$xmat0[, 1])] <-max(smoother$xmat0[, 1]) } newdata <-data.frame(x = new_x)predict(smoother, newData = newdata)$fit } elseif (inherits(smoother, "constant")) { 0 } elseif (is.data.frame(smoother)) {# specific for pava as there is no model returned, just a vectorifelse( new_x ==0, smoother$estimate[smoother$score ==0][1],ifelse( new_x ==1, smoother$estimate[smoother$score ==1][1], smoother$estimate[smoother$score ==2][1] ) ) }}rs_pm <-load_results(paste0(dataset, "-", model_type, "-sculpt_rough_pm.rds"))ds_pm_v2 <-sculpt_detailed_generic(rs = rs_pm,smoother_fit = gam_cgam_smoother, smoother_predict = gam_cgam_predict)# Select variables for polished modelcheckmate::assert_number(top_k, lower =1)vars <-levels(attr(rs_pm, "cumul_R2")$feature)[1:top_k]rs_pm_top_k <- rs_pm[vars]attr(rs_pm_top_k, "offset") <-attr(rs_pm, "offset")class(rs_pm_top_k) <-class(rs_pm)ps_pm_v2 <-sculpt_detailed_generic(rs = rs_pm_top_k,smoother_fit = polished_smoother, smoother_predict = polished_smoother_predict)store_results(ds_pm_v2, paste0(dataset, "-", model_type, "-sculpt_detailed_pm_v2.rds"))store_results(ps_pm_v2, paste0(dataset, "-", model_type, "-sculpt_polished_pm_v2.rds"))
# get rough model - on trainrs_train <-sculpt_rough( dd$data$train[dd$covariates$all],seed =1234,model_predict_fun =function(x) { p <-predict(xgb, new_data = x, type ="prob")$.pred_1logit(p) },data_as_marginals =TRUE)# rough model on pm on original scalers_pm_prob <-sculpt_rough( pm,seed =1234,model_predict_fun =function(x) {predict(xgb, new_data = x, type ="prob")$.pred_1 })# rough model on train on original scalers_train_prob <-sculpt_rough( dd$data$train[dd$covariates$all],seed =1234,model_predict_fun =function(x) {predict(xgb, new_data = x, type ="prob")$.pred_1 },data_as_marginals =TRUE)# First order modelrs_pm_xgb_fo <-sculpt_rough( pm, seed =1234,model_predict_fun =function(x) { p <-predict(xgb_fo, new_data = x, type ="prob")$.pred_1logit(p) })