# modsculpt coverage - 95.82%

# Files
# Source

# product marginals --------

#' Sample product marginals dataset
#'
#' @param dat Data.frame to sample from, must include only covariates.
#' @param n Number of observations to sample.
#' @param seed `NULL` or seed for exact reproducibility.
#'
#' @details The product marginals dataset is a grid of values that is sampled independently
#' per each column (feature) from the original dataset.
#' The aim here is to disentangle the correlations between features and assess
#' how each feature affects the model predictions individually.
#' It will not contain new values per column, but it may contain new combinations of values not
#' seen in the original data.
#' One can also check how the model behaves if there are unseen observations
#' (new combination of features).
#'
#' Note that the use of the product marginal dataset for model sculpting only works
#' if the features are approximately additive for model predictions.
#' In the quite rare case when they are not, the sculpted models using the product marginal
#' dataset are expected to have significantly lower performance and
#' the conclusions may be misleading.
#'
#' One can also try using the original data instead of the product marginals for model
#' sculpting and see how the results differ.
#'
#' @return `data.frame` with same number of columns and `n` rows.
#' @export
#'
#' @examples
#' sample_marginals(mtcars, n = 5, seed = 543)
sample_marginals <- function(dat, n, seed = NULL) {
  checkmate::assert_data_frame(dat, any.missing = FALSE)
  checkmate::assert_integerish(n, lower = 1, any.missing = FALSE, len = 1)

  # work on a plain data.frame
  dat <- as.data.frame(dat)
  stopifnot(ncol(dat) > 0, nrow(dat) > 0)
  cols <- colnames(dat)
  n_rows <- nrow(dat)

  # Seed only on request: `set.seed(NULL)` re-initialises the generator and would
  # thereby throw away a seed that the caller has set before calling this function.
  if (!is.null(seed)) {
    set.seed(seed)
  }

  # per column: draw n row indexes (with replacement, independently of the other
  # columns) and pick the values of that column at those indexes
  sampled_cols <- lapply(
    cols,
    function(col) {
      idx <- sample.int(n_rows, size = n, replace = TRUE)
      dat[idx, col, drop = FALSE]
    }
  )

  # bind the single-column data.frames together and reset the row names
  dat_sub <- do.call("cbind", c(sampled_cols, list(row.names = NULL)))

  return(dat_sub)
}



# ICE data --------

# calculate ICE data by using product marginals and prediction function
#
# Arguments:
#   sub          data.frame with one row per ICE curve holding the values of the
#                other covariates, or NULL when sculpting a single variable
#   predict_fun  function taking a data.frame and returning predictions
#   x            values of the variable of interest
#   x_name       column name of the variable of interest
#   col_order    column order of the data passed to `predict_fun`
#
# Returns a data.table with columns `x`, `ice`, `ice_centered` and `line_id`,
# where `ice_centered` is `ice` minus the mean of `ice` within the same `line_id`.
calculate_ice_data <- function(sub, predict_fun, x, x_name, col_order) {
  stopifnot(
    is.data.frame(sub) || is.null(sub),
    is.function(predict_fun),
    is.atomic(x),
    is.character(x_name),
    is.character(col_order)
  )

  # https://cran.r-project.org/web/packages/data.table/vignettes/datatable-importing.html#globals
  . <- ice <- line_id <- ..x <- ice_centered <- NULL # due to NSE notes in R CMD check

  # special case: sculpting performed on 1 variable
  if (is.null(sub)) {
    preds <- predict_fun(structure(data.frame(x), names = x_name))
    out <- data.table(
      x = x,
      ice = preds,
      ice_centered = preds - mean(preds),
      line_id = 1
    )

    # all other cases
  } else {
    stopifnot(
      !x_name %in% colnames(sub),
      nrow(sub) > 0 # at least one ICE curve is needed; fail early and clearly otherwise
    )
    # for every row of `sub`: combine all values of x with that row
    out <- rbindlist(
      lapply(seq_len(nrow(sub)), function(i) cbind(x, sub[i, , drop = FALSE], row.names = NULL))
    )
    # predictions need the original column name of the variable of interest
    setnames(out, "x", x_name)
    out[, ice := predict_fun(as.data.frame(out)[, col_order])]
    # the rows are stacked curve by curve, each curve having length(x) rows
    out[, line_id := rep(seq_len(nrow(sub)), each = length(..x))]
    # center every curve around its own mean
    out[, ice_centered := ice - mean(ice), line_id]
    setnames(out, x_name, "x")
    out <- out[, c("x", "ice", "ice_centered", "line_id")]
  }
  return(out)
}

# generate ICE data from stored ICE predictions
# the result is similar shape as the returned object from calculate_ice_data:
# a data.table with columns `x`, `y` and `line_id` (one `line_id` per element of `predictions`)
generate_ice_data <- function(predictions, x, logodds_to_prob = FALSE) {
  stopifnot(is.list(predictions), is.atomic(x))

  # https://cran.r-project.org/web/packages/data.table/vignettes/datatable-importing.html#globals
  ..x <- line_id <- NULL # due to NSE notes in R CMD check

  # turn one stored curve into a data.frame, optionally on the probability scale
  curve_to_df <- function(p) {
    y_values <- if (logodds_to_prob) inv.logit(p) else p
    data.frame(x = x, y = y_values, row.names = NULL)
  }

  # stack all curves and number them
  out <- rbindlist(lapply(predictions, curve_to_df))
  out[, line_id := rep(seq_along(predictions), each = length(..x))]

  return(out)
}


# PDP data -----------

# calculate PDP data from ICE data
#
# `id` is a data.table with (at least) the columns `x` and `ice_centered`.
# Fully duplicated rows are dropped first; afterwards, per value of `x`, the mean
# of `ice_centered` (`pdp_centered`) and its standard error (`pdp_centered_se`,
# standard deviation divided by the square root of the group size) are returned.
calculate_pdp_data <- function(id) {
  stopifnot(is.data.table(id))

  # https://cran.r-project.org/web/packages/data.table/vignettes/datatable-importing.html#globals
  . <- x <- ice_centered <- NULL # due to NSE notes in R CMD check

  unique(id)[
    ,
    .(
      pdp_centered = mean(ice_centered),
      # spelled out as FALSE: `F` is an ordinary variable that can be reassigned
      pdp_centered_se = sd(ice_centered, na.rm = FALSE) / sqrt(.N)
    ),
    .(x)
  ]
}

# generate PDP data from stored ICE predictions
# returns the output of calculate_pdp_data with the columns renamed to `y` and `y_se`
generate_pdp_data <- function(predictions, x, logodds_to_prob = FALSE) {
  ice_dt <- generate_ice_data(predictions = predictions, x = x, logodds_to_prob = logodds_to_prob)

  # calculate_pdp_data works on a column called "ice_centered"
  setnames(ice_dt, old = "y", new = "ice_centered")
  pdp_dt <- calculate_pdp_data(ice_dt)

  # rename the results back to the "y" naming
  setnames(pdp_dt, old = "pdp_centered", new = "y")
  setnames(pdp_dt, old = "pdp_centered_se", new = "y_se")
  pdp_dt
}


# rough sculpture ----------

# validate the input data (data.frame without missing values) and
# return it as a plain data.frame
check_data <- function(dat) {
  checkmate::assert_data_frame(dat, any.missing = FALSE)
  # remove tbl_df etc
  plain_dat <- as.data.frame(dat)
  plain_dat
}

# validate the user prediction function (`upf`): it must take exactly one argument
# and its output on `dat` must pass check_upf_output; the predictions are returned
check_upf <- function(upf, dat) {
  checkmate::assert_function(upf, nargs = 1)
  preds <- upf(dat)
  check_upf_output(dat = dat, output = preds)
  preds
}

# validate predictions: either a finite numeric vector or a factor,
# without missing values and with one element per row of `dat`
check_upf_output <- function(dat, output) {
  checkmate::assert(
    checkmate::check_numeric(output, finite = TRUE, any.missing = FALSE, len = nrow(dat)),
    checkmate::check_factor(output, any.missing = FALSE, len = nrow(dat))
  )
  # called for the check only
  invisible(NULL)
}


#' Create a rough model
#'
#' @param dat Data to create the rough model from.
#' Must be a product marginal dataset (see `sample_marginals`)
#' with covariates only (i.e. without response).
#' @param model_predict_fun Function that returns predictions given a dataset.
#' @param n_ice Number of ICE curves to generate. Defaults to 10.
#' @param seed (`NULL`) or seed for exact reproducibility.
#' @param verbose (`integer`) 0 for silent run, > 0 for messages.
#' @param allow_par (`logical`) Allow parallel computation? Defaults to `FALSE`.
#' @param model_predict_fun_export For parallel computation only.
#' If there is a parallel backend registered (see `parallel_set()`),
#' then use this to export variables used in `model_predict_fun` (like model).
#' This is passed to `foreach::foreach(..., .export = model_predict_fun_export)`.
#' @param data_as_marginals (`logical`) Use the provided data `dat` as already sampled dataset?
#' Defaults to `FALSE`.
#'
#' @details For parallel computation, use [parallel_set()] and set `allow_par` to `TRUE`.
#' Note that parallel computation may fail if the model is too big and there is not enough memory.
#'
#' @return Object of classes `rough` and `sculpture`.
#' @export
#'
#' @examples
#' df <- mtcars
#' df$vs <- as.factor(df$vs)
#' model <- rpart::rpart(
#'   hp ~ mpg + carb + vs,
#'   data = df,
#'   control = rpart::rpart.control(minsplit = 10)
#' )
#' model_predict <- function(x) predict(model, newdata = x)
#' covariates <- c("mpg", "carb", "vs")
#' pm <- sample_marginals(df[covariates], n = 50, seed = 5)
#'
#' rs <- sculpt_rough(
#'   dat = pm,
#'   model_predict_fun = model_predict,
#'   n_ice = 10,
#'   seed = 1,
#'   verbose = 0
#' )
#'
#' class(rs)
#' head(predict(rs))
#'
#' # lm model without interaction -> additive -> same predictions
#' model <- lm(hp ~ mpg + carb + vs, data = df)
#' model_predict <- function(x) predict(model, newdata = x)
#' covariates <- c("mpg", "carb", "vs")
#' pm <- sample_marginals(df[covariates], n = 50, seed = 5)
#'
#' rs <- sculpt_rough(
#'   dat = pm,
#'   model_predict_fun = model_predict,
#'   n_ice = 10,
#'   seed = 1,
#'   verbose = 0
#' )
#'
#' class(rs)
#' head(predict(rs))
#' head(predict(model, pm))
#'
sculpt_rough <- function(dat, model_predict_fun, n_ice = 10,
                         seed = NULL, verbose = 0,
                         allow_par = FALSE,
                         model_predict_fun_export = NULL,
                         data_as_marginals = FALSE) {
  # input checks; `predictions` are the model predictions on `dat` (used for the offset)
  dat <- check_data(dat)
  predictions <- check_upf(model_predict_fun, dat)
  checkmate::assert_integerish(n_ice, any.missing = FALSE, len = 1)
  checkmate::assert_integerish(verbose, lower = 0, any.missing = FALSE, len = 1)
  checkmate::assert_flag(allow_par)
  checkmate::assert_flag(data_as_marginals)

  covariates <- colnames(dat)

  # one iteration per covariate; each iteration returns one element of the sculpture
  `%operand%` <- define_foreach_operand(allow_par = allow_par)
  res <- foreach::foreach(col = covariates, .export = model_predict_fun_export) %operand% {
    # verbosity ...
    matched <- match(col, covariates)
    if (verbose > 0) {
      if (length(covariates) < 10) {
        message(paste("Sculpting variable:", matched, "/", length(covariates)))
      } else {
        # many variables: report only the first and every 10th one
        if ((matched == 1) || (matched %% 10 == 0)) {
          message(paste("Sculpting variable:", matched, "/", length(covariates)))
        }
      }
    }

    # generate product marginals
    if (data_as_marginals) {
      dat_subs <- dat[covariates[covariates != col]]
      # honour `seed` in this branch as well so that the result is reproducible
      if (!is.null(seed)) {
        set.seed(seed)
      }
      # Sample n_ice rows; `drop = FALSE` keeps a data.frame even if only one
      # other covariate is left (otherwise the result collapses to a vector)
      dat_subs <- dat_subs[sample(nrow(dat_subs), n_ice, replace = TRUE), , drop = FALSE]

    } else {
      if (length(covariates) > 1) {
        dat_subs <- sample_marginals(
          dat = dat[setdiff(covariates, col)],
          n = n_ice,
          seed = seed
        )
        stopifnot(nrow(dat_subs) == n_ice)
      } else {
        # single covariate: there is nothing to combine with
        dat_subs <- NULL
      }
    }

    # calculate ice
    ice <- calculate_ice_data(
      sub = dat_subs,
      predict_fun = model_predict_fun,
      x = dat[[col]],
      x_name = col,
      col_order = colnames(dat)
    )

    # calculate pdp
    pdp <- calculate_pdp_data(id = ice)

    # continuous flag
    is_continuous <- is.numeric(dat[[col]])

    # interpolation function - used for making predictions
    # The function lives in its own environment (parent: global environment)
    # that holds only the PDP values it needs.
    af <- y <- x <- NULL # due to NSE notes in R CMD check
    e_predict_fun <- new.env(parent = globalenv())
    e_predict_fun$x <- pdp[["x"]]
    e_predict_fun$y <- pdp[["pdp_centered"]]
    if (is_continuous && nrow(pdp) > 1) {
      # linear interpolation; `rule = 2` uses the value at the closest data extreme
      # outside of the observed range
      e_predict_fun$af <- approxfun(x = e_predict_fun$x, y = e_predict_fun$y, rule = 2)
      predict_fun <- function(v) af(v)
    } else {
      # lookup of known values; unknown values contribute 0
      predict_fun <- function(v) {
        ind <- match(v, x)
        ifelse(is.na(ind), 0, y[ind])
      }
    }
    environment(predict_fun) <- e_predict_fun

    return(list(
      subsets = dat_subs,
      predict = predict_fun,
      ice_centered = split(ice$ice_centered, ice$line_id),
      ice = split(ice$ice, ice$line_id),
      is_discrete = !is_continuous,
      x = dat[[col]],
      x_name = col
    ))
  }

  names(res) <- covariates
  attr(res, "offset") <- mean(predictions)
  class(res) <- c("rough", "sculpture", class(res))

  # evaluate the sculpture
  es <- eval_sculpture(
    sculpture = res,
    data = as.data.frame(as.data.table(lapply(res, "[[", "x")))
  )

  # calculate variable importance
  dat_var <- calc_dir_var_imp_pdp(es$pdp)
  feat_order <- levels(dat_var$feature)

  # calculate cumulative R2
  dat_R2_cumul <- calc_cumul_R2_pdp(
    dt = es$pdp,
    feat_order = feat_order,
    model_predictions = es$prediction$pred,
    model_offset = es$offset
  )

  # calculate range
  dat_range <- calc_range_pdp(es$pdp)

  attr(res, "var_imp") <- dat_var
  attr(res, "cumul_R2") <- dat_R2_cumul
  attr(res, "range") <- dat_range

  return(res)
}



# detailed sculpture --------

#' Create a detailed model with user defined smoother
#'
#' @param rs Rough model, i.e. object of classes `rough` and `sculpture`.
#' @param smoother_fit Smoother fitting function.
#' @param smoother_predict Smoother prediction function.
#' @param missings (`NULL`) or single value or a named vector.
#' Specifies the value(-s) that stand for the missing values.
#' If `NULL`, then no missing value handling is carried out.
#' If single value, then it is assumed that this value is used for flagging missing values across
#' all continuous variables.
#' If named vector, then the names are used to refer to continuous variables and the values for
#' flagging missing values in that variable.
#' @param verbose (`integer`) 0 for silent run, > 0 for messages.
#' @param allow_par (`logical`) Allow parallel computation? Defaults to `FALSE`.
#'
#' @details For parallel computation, use [parallel_set()] and set `allow_par` to `TRUE`.
#' Note that parallel computation may fail if the model is too big and there is not enough memory.
#'
#' @section Custom smoothers:
#' If none of the predefined smoothers ([sculpt_detailed_gam()], [sculpt_detailed_lm()])
#' suits your needs, you can define your own smoothers.
#' You need to define 2 functions: `smoother_fit` and `smoother_predict`:
#'
#' `smoother_fit` takes 5 arguments ("x", "y", "is_discrete", "column_name", "na_ind") and
#' returns a model fit. "x" are the feature values, "y" are the PDP values,
#' "is_discrete" flags a discrete feature, "column_name" holds the feature name,
#' and "na_ind" passes the NA value from `missings` (or NULL by default).
#'
#' `smoother_predict` takes also 5 arguments ("smoother", "new_x", "is_discrete", "column_name",
#' "na_ind") and returns predictions as a vector. "smoother" is the model fit returned from
#' `smoother_fit`, "new_x" are the feature values that we want to predict, "is_discrete",
#' "column_name", and "na_ind" have the same purpose as in `smoother_fit`.
#' See also Examples.
#'
#' @return Object of classes `detailed` and `sculpture`.
#' @export
#'
#' @examples
#' df <- mtcars
#' df$vs <- as.factor(df$vs)
#' model <- rpart::rpart(
#'   hp ~ mpg + carb + vs,
#'   data = df,
#'   control = rpart::rpart.control(minsplit = 10)
#' )
#' model_predict <- function(x) predict(model, newdata = x)
#' covariates <- c("mpg", "carb", "vs")
#' pm <- sample_marginals(df[covariates], n = 50, seed = 5)
#'
#' rs <- sculpt_rough(
#'   dat = pm,
#'   model_predict_fun = model_predict,
#'   n_ice = 10,
#'   seed = 1,
#'   verbose = 0
#' )
#'
#' # define custom smoother
#' # - gam with 3 knots for variable "mpg"
#' # - gam with 5 knots for variable "carb"
#' # - lm for any discrete variable
#' library(mgcv)
#' my_smoother <- function(x, y, is_discrete, column_name, na_ind = NULL) {
#'   if (column_name == "mpg") {
#'     gam(y ~ s(x, k = 3))
#'   } else if (column_name == "carb") {
#'     gam(y ~ s(x, k = 5))
#'   } else if (is_discrete) {
#'     lm(y ~ x)
#'   } else {
#'     stop("Undefined smoother")
#'   }
#' }
#'
#' # define appropriate predict function
#' # - predict.gam returns an array, we need to convert it to vector
#' # - if-else branch for illustration purposes
#' my_smoother_predict <- function(smoother, new_x, is_discrete, column_name, na_ind = NULL) {
#'   if (inherits(smoother, "gam")) {
#'     # as.numeric: convert array to vector
#'     as.numeric(predict(smoother, newdata = data.frame(x = new_x)))
#'   } else {
#'     predict(smoother, newdata = data.frame(x = new_x))
#'   }
#' }
#'
#' ds <- sculpt_detailed_generic(
#'   rs = rs,
#'   smoother_fit = my_smoother,
#'   smoother_predict = my_smoother_predict
#' )
#' class(ds)
#' \dontrun{
#' # see components
#' g_component(ds)$continuous
#' }
#'
#'
#' # another example with constrained gam (cgam) package
#' \dontrun{
#' library(cgam)
#'
#' cgam_smoother <- function(x, y, is_discrete, column_name, na_ind = NULL) {
#'   if (column_name == "carb") {
#'     cgam(y ~ s.incr(x, numknots = 3))
#'   } else if (column_name == "mpg") {
#'     cgam(y ~ s.decr(x, numknots = 3))
#'   } else {
#'     cgam(y ~ x)
#'   }
#' }
#'
#' cgam_predict <- function(smoother, new_x, is_discrete, column_name, na_ind = NULL) {
#'   predict(smoother, newData = data.frame(x = new_x))$fit
#' }
#'
#' ds2 <- sculpt_detailed_generic(
#'   rs = rs,
#'   smoother_fit = cgam_smoother,
#'   smoother_predict = cgam_predict
#' )
#'
#' # see components
#' g_component(ds2)$continuous
#' }
sculpt_detailed_generic <- function(rs, smoother_fit, smoother_predict,
                                    missings = NULL, verbose = 0, allow_par = FALSE) {
  checkmate::assert_class(rs, "sculpture")
  checkmate::assert_class(rs, "rough")
  checkmate::assert_function(
    smoother_fit,
    args = c("x", "y", "is_discrete", "column_name", "na_ind")
  )
  checkmate::assert_function(
    smoother_predict,
    args = c("smoother", "new_x", "is_discrete", "column_name", "na_ind")
  )
  checkmate::assert(
    checkmate::check_null(missings),
    checkmate::check_atomic(missings, any.missing = FALSE, len = 1),
    checkmate::check_atomic(missings, any.missing = FALSE, max.len = length(rs), names = "named")
  )

  # names of the continuous variables - only those can carry a missing value flag
  check_continuous <- vapply(rs, "[[", logical(1), "is_discrete")
  check_continuous <- names(Filter(isFALSE, check_continuous))
  if (length(missings) == 1) {
    # single value: use it for all continuous variables
    missings <- rep(list(missings), length(check_continuous))
    names(missings) <- check_continuous
  } else if (length(missings) != 0) {
    # named vector: the names must refer to continuous variables
    missings <- as.list(missings)
    checkmate::assert_subset(names(missings), check_continuous, .var.name = "missings")
  }
  checkmate::assert_integerish(verbose, lower = 0, any.missing = FALSE, len = 1)
  checkmate::assert_flag(allow_par)

  # https://cran.r-project.org/web/packages/data.table/vignettes/datatable-importing.html#globals
  x <- NULL # due to NSE notes in R CMD check

  `%operand%` <- define_foreach_operand(allow_par = allow_par)
  res <- foreach::foreach(col = names(rs)) %operand% {
    # verbosity ...
    matched <- match(col, names(rs))
    if (verbose > 0) {
      if (length(rs) < 10) {
        message(paste("Sculpting variable:", matched, "/", length(rs)))
      } else {
        # many variables: report only the first and every 10th one
        if ((matched == 1) || (matched %% 10 == 0)) {
          message(paste("Sculpting variable:", matched, "/", length(rs)))
        }
      }
    }

    # build the smoother from PDPs (based on original data, i.e. with duplicates)
    pdp_dupl <- data.table(x = rs[[col]]$x, pdp_centered = rs[[col]]$predict(rs[[col]]$x))
    pdp_dupl <- pdp_dupl[order(x)]

    # memory optimization: use a clean environment for the predict function.
    # The parent has to be set explicitly (as in `sculpt_rough`): the default
    # parent of `new.env()` is the calling frame, so the function would keep a
    # reference to everything in it.
    e_predict_fun <- new.env(parent = globalenv())

    # estimate smoothers
    e_predict_fun$smoother <- smoother_fit(
      x = pdp_dupl$x,
      y = pdp_dupl$pdp_centered,
      is_discrete = rs[[col]]$is_discrete,
      column_name = col,
      # logical indicator of the flagged values, or NULL if no flag is defined for `col`
      na_ind = `if`(!is.null(missings[[col]]), pdp_dupl$x == missings[[col]])
    )

    # add the rest of variables into the function environment
    e_predict_fun$smoother_predict <- smoother_predict
    e_predict_fun$is_discrete <- rs[[col]]$is_discrete
    e_predict_fun$col <- col
    e_predict_fun$missings_flag <- missings[[col]]

    # smoother prediction function
    smoother <- is_discrete <- missings_flag <- NULL # due to NSE notes in R CMD check
    predict_fun <- function(x) {
      smoother_predict(
        smoother = smoother,
        new_x = x,
        is_discrete = is_discrete,
        column_name = col,
        na_ind = `if`(!is.null(missings_flag), x == missings_flag)
      )
    }
    # use the defined environment as the environment of the function
    environment(predict_fun) <- e_predict_fun

    # check the output of the smoother prediction
    pf_check <- predict_fun(pdp_dupl$x)
    if (!is.vector(pf_check) || is.character(pf_check)) {
      stop("The output of the `smoother_predict` needs to be a numeric/factor vector.")
    }

    return(list(
      predict = predict_fun,
      is_discrete = rs[[col]]$is_discrete,
      x = rs[[col]]$x,
      x_name = rs[[col]]$x_name,
      missings_flag = missings[[col]]
    ))
  }

  names(res) <- names(rs)
  attr(res, "offset") <- attr(rs, "offset")
  class(res) <- c("detailed", "sculpture", class(res))

  # evaluate the sculpture
  es <- eval_sculpture(
    sculpture = res,
    data = as.data.frame(as.data.table(lapply(res, "[[", "x")))
  )

  # calculate variable importance
  dat_var <- calc_dir_var_imp_pdp(es$pdp)
  feat_order <- levels(dat_var$feature)

  # calculate cumulative R2
  dat_R2_cumul <- calc_cumul_R2_pdp(
    dt = es$pdp,
    feat_order = feat_order,
    model_predictions = es$prediction$pred,
    model_offset = es$offset
  )

  # calculate range
  dat_range <- calc_range_pdp(es$pdp)

  attr(res, "var_imp") <- dat_var
  attr(res, "cumul_R2") <- dat_R2_cumul
  attr(res, "range") <- dat_range

  return(res)
}


# gam smoother used by sculpt_detailed_gam
# - discrete feature: gam(y ~ x); if that fails and x has a single unique value,
#   an empty linear model is returned, otherwise an error is raised
# - continuous feature: gam with s(x, k = -1); on error gam with s(x, k = 3);
#   on error a linear fit (lm for a single observation, gam otherwise);
#   the term "na_ind" is added to the formula whenever `na_ind` is provided
smoother_gam <- function(x, y, is_discrete, column_name, na_ind = NULL) {
  # make `s` visible for the formulas built below
  s <- mgcv::s

  if (is_discrete) {
    discrete_fallback <- function(e) {
      if (length(unique(x)) != 1) {
        stop(paste(
          "Cannot fit a smoother for", column_name,
          "The error message is:", e$message
        ))
      }
      lm(y ~ 0)
    }
    return(tryCatch(mgcv::gam(y ~ x), error = discrete_fallback))
  }

  # last resort for continuous features: linear fit
  linear_fallback <- function(e) {
    if (length(x) == 1) {
      lm(as.formula(paste0("y ~ x", `if`(!is.null(na_ind), " + na_ind"))))
    } else {
      mgcv::gam(as.formula(paste0("y ~ x", `if`(!is.null(na_ind), " + na_ind"))))
    }
  }

  # second attempt for continuous features: spline with k = 3
  small_spline_fallback <- function(e) {
    tryCatch(
      mgcv::gam(as.formula(paste0("y ~ s(x, k = 3)", `if`(!is.null(na_ind), " + na_ind")))),
      error = linear_fallback
    )
  }

  # first attempt for continuous features: spline with k = -1
  tryCatch(
    mgcv::gam(as.formula(paste0("y ~ s(x, k = -1)", `if`(!is.null(na_ind), " + na_ind")))),
    error = small_spline_fallback
  )
}

# prediction function for the gam smoother
# A warning about factor levels that were not part of the original fit is handled by
# predicting the known values only and returning 0 for the others; any other warning
# is turned into an error.
smoother_gam_predict <- function(smoother, new_x, is_discrete, column_name, na_ind = NULL) {
  newdata <- data.frame(x = new_x)
  newdata$na_ind <- na_ind

  on_warning <- function(w) {
    unseen_levels <- grepl("^factor levels .* not in original fit$", w$message)
    if (!unseen_levels) {
      stop("Unknown value for prediction")
    }
    idx_known <- new_x %in% smoother$model$x
    y <- vector("numeric", length = length(new_x))
    y[idx_known] <- as.numeric(predict(smoother, newdata = newdata[idx_known, , drop = FALSE]))
    y[!idx_known] <- 0
    y
  }

  tryCatch(
    as.numeric(predict(smoother, newdata = newdata)),
    warning = on_warning
  )
}


#' Create a detailed model with gam smoother
#'
#' @inheritParams sculpt_detailed_generic
#'
#' @details For parallel computation, use [parallel_set()] and set `allow_par` to `TRUE`.
#' Note that parallel computation may fail if the model is too big and there is not enough memory.
#'
#' @return Object of classes `detailed` and `sculpture`.
#' @export
#'
#' @examples
#' df <- mtcars
#' df$vs <- as.factor(df$vs)
#' model <- rpart::rpart(
#'   hp ~ mpg + carb + vs,
#'   data = df,
#'   control = rpart::rpart.control(minsplit = 10)
#' )
#' model_predict <- function(x) predict(model, newdata = x)
#' covariates <- c("mpg", "carb", "vs")
#' pm <- sample_marginals(df[covariates], n = 50, seed = 5)
#'
#' rs <- sculpt_rough(
#'   dat = pm,
#'   model_predict_fun = model_predict,
#'   n_ice = 10,
#'   seed = 1,
#'   verbose = 0
#' )
#'
#' ds <- sculpt_detailed_gam(rs)
#' class(ds)
#'
sculpt_detailed_gam <- function(rs, missings = NULL, verbose = 0, allow_par = FALSE) {
  # `requireNamespace()` does not fail when the package is missing, it only
  # returns FALSE - check the result to stop early with a clear message
  if (!requireNamespace("mgcv", quietly = TRUE)) {
    stop("Package 'mgcv' is required for `sculpt_detailed_gam()`.", call. = FALSE)
  }

  # generic detailed sculpture with the gam smoother and its prediction function
  sculpt_detailed_generic(
    rs = rs, verbose = verbose,
    allow_par = allow_par,
    smoother_fit = smoother_gam,
    smoother_predict = smoother_gam_predict,
    missings = missings
  )
}




# lm smoother used by sculpt_detailed_lm
# Fits lm(y ~ x), adding the term "na_ind" whenever `na_ind` is provided.
# If the fit fails and x has a single unique value, an empty linear model is returned;
# otherwise an error naming the column is raised.
smoother_lm <- function(x, y, is_discrete, column_name, na_ind = NULL) {
  on_error <- function(e) {
    if (length(unique(x)) != 1) {
      stop(paste(
        "Cannot fit a smoother for", column_name,
        "The error message is:", e$message
      ))
    }
    lm(y ~ 0)
  }

  tryCatch(
    lm(as.formula(paste0("y ~ x", `if`(!is.null(na_ind), " + na_ind")))),
    error = on_error
  )
}

# prediction function for the lm smoother
# For discrete features, an error about a new factor level is handled by predicting
# the known values only and returning 0 for the others; any other error is re-raised
# as "Unknown value for prediction".
smoother_lm_predict <- function(smoother, new_x, is_discrete, column_name, na_ind = NULL) {
  newdata <- data.frame(x = new_x)
  newdata$na_ind <- na_ind

  on_error <- function(e) {
    new_level <- is_discrete && grepl("factor x has new level", e$message)
    if (!new_level) {
      stop("Unknown value for prediction")
    }
    idx_known <- new_x %in% smoother$model$x
    y <- vector("numeric", length = length(new_x))
    y[idx_known] <- as.numeric(predict(smoother, newdata = newdata[idx_known, , drop = FALSE]))
    y[!idx_known] <- 0
    y
  }

  tryCatch(
    unname(predict(smoother, newdata = newdata)),
    error = on_error
  )
}


#' Create a detailed model with lm smoother
#'
#' @inheritParams sculpt_detailed_generic
#'
#' @details For parallel computation, use [parallel_set()] and set `allow_par` to `TRUE`.
#' Note that parallel computation may fail if the model is too big and there is not enough memory.
#'
#' @return Object of classes `detailed` and `sculpture`.
#' @export
#'
#' @examples
#' df <- mtcars
#' df$vs <- as.factor(df$vs)
#' model <- rpart::rpart(
#'   hp ~ mpg + carb + vs,
#'   data = df,
#'   control = rpart::rpart.control(minsplit = 10)
#' )
#' model_predict <- function(x) predict(model, newdata = x)
#' covariates <- c("mpg", "carb", "vs")
#' pm <- sample_marginals(df[covariates], n = 50, seed = 5)
#'
#' rs <- sculpt_rough(
#'   dat = pm,
#'   model_predict_fun = model_predict,
#'   n_ice = 10,
#'   seed = 1,
#'   verbose = 0
#' )
#'
#' ds <- sculpt_detailed_lm(rs)
#' class(ds)
#'
sculpt_detailed_lm <- function(rs, missings = NULL, verbose = 0, allow_par = FALSE) {
  # generic detailed sculpture with the lm smoother and its prediction function
  sculpt_detailed_generic(
    rs = rs,
    smoother_fit = smoother_lm,
    smoother_predict = smoother_lm_predict,
    missings = missings,
    verbose = verbose,
    allow_par = allow_par
  )
}


# polished sculpture --------


#' Create a polished model
#'
#' @param object Object of class `sculpture`, either `rough` or `detailed`.
#' @param k Number of most important variables to keep.
#' @param vars Vector of variables to keep.
#'
#' @return Object of classes `rough` / `detailed` and `sculpture`.
#' @export
#'
#' @examples
#' df <- mtcars
#' df$vs <- as.factor(df$vs)
#' model <- rpart::rpart(
#'   hp ~ mpg + carb + vs,
#'   data = df,
#'   control = rpart::rpart.control(minsplit = 10)
#' )
#' model_predict <- function(x) predict(model, newdata = x)
#' covariates <- c("mpg", "carb", "vs")
#' pm <- sample_marginals(df[covariates], n = 50, seed = 5)
#'
#' rs <- sculpt_rough(
#'   dat = pm,
#'   model_predict_fun = model_predict,
#'   n_ice = 10,
#'   seed = 1,
#'   verbose = 0
#' )
#'
#' ds <- sculpt_detailed_gam(rs)
#'
#' # this keeps only "mpg"
#' ps <- sculpt_polished(ds, k = 1)
#'
sculpt_polished <- function(object, k = NULL, vars = NULL) {
  checkmate::assert_class(object, "sculpture")
  # at least one of `k` and `vars` has to be NULL
  checkmate::assert(
    checkmate::check_null(k),
    checkmate::check_null(vars)
  )

  if (is.null(k)) {
    # selection by name
    checkmate::assert_character(vars, min.len = 1, any.missing = FALSE)
    checkmate::assert_subset(vars, names(object))
  } else if (is.null(vars)) {
    # selection of the first k variables in the order of the cumulative R2 table
    checkmate::assert_number(k, lower = 1)
    ranked <- levels(attr(object, "cumul_R2")$feature)
    # cap k at the number of available variables: `ranked[1:k]` would contain NA
    # for a larger k (and `1:k` fails for an infinite k)
    vars <- ranked[seq_len(min(floor(k), length(ranked)))]
  }

  # keep the selected variables; subsetting drops the attributes, so restore them
  res <- object[vars]
  attr(res, "offset") <- attr(object, "offset")
  class(res) <- class(object)

  # evaluate the sculpture
  es <- eval_sculpture(
    sculpture = res,
    data = as.data.frame(as.data.table(lapply(res, "[[", "x")))
  )

  # calculate variable importance
  dat_var <- calc_dir_var_imp_pdp(es$pdp)
  feat_order <- levels(dat_var$feature)

  # calculate cumulative R2
  dat_R2_cumul <- calc_cumul_R2_pdp(
    dt = es$pdp,
    feat_order = feat_order,
    model_predictions = es$prediction$pred,
    model_offset = es$offset
  )

  # calculate range
  dat_range <- calc_range_pdp(es$pdp)

  attr(res, "var_imp") <- dat_var
  attr(res, "cumul_R2") <- dat_R2_cumul
  attr(res, "range") <- dat_range

  return(res)
}



# utils -----


# evaluate a sculpture on data
#
# Applies the predict function of every sculpture element to the column of the same
# name in `data` and returns a list with
# - pdp: long data.table with columns `rn` (row number), `feature` and `pdp_c`
# - offset: the "offset" attribute of the sculpture
# - prediction: data.table with columns `rn` and `pred` (sum of `pdp_c` plus offset)
eval_sculpture <- function(sculpture, data) {
  stopifnot(
    inherits(sculpture, "sculpture"),
    ncol(data) >= length(sculpture),
    all(names(sculpture) %in% colnames(data))
  )

  # https://cran.r-project.org/web/packages/data.table/vignettes/datatable-importing.html#globals
  . <- rn <- pdp_c <- NULL # due to NSE notes in R CMD check

  # get offset for predictions
  offset <- attr(sculpture, "offset")

  # get predict functions
  interp_funs <- lapply(sculpture, "[[", "predict")

  # PDPs at data
  pdp <- lapply(names(sculpture), function(col) interp_funs[[col]](data[[col]]))
  names(pdp) <- names(sculpture)

  # reshape PDPs to create predictions
  # (`seq_len` instead of `1:nrow(data)`, which would yield c(1, 0) for empty data)
  pdp <- cbind(data.table(rn = seq_len(nrow(data))), do.call("cbind", pdp))
  pdp <- melt(pdp, id.vars = "rn", variable.name = "feature", value.name = "pdp_c")
  pred <- pdp[, .(pred = sum(pdp_c) + offset), .(rn)][order(rn)]

  return(list(
    pdp = pdp,
    offset = offset,
    prediction = pred
  ))
}

#' @export
predict.sculpture <- function(object, newdata = NULL, ...) {
  if (!is.null(newdata)) {
    # user data: all sculpted variables have to be present
    checkmate::assert_subset(names(object), colnames(newdata))
    newdata <- as.data.frame(newdata)
  } else {
    # no data given: use the values stored in the sculpture
    newdata <- as.data.frame(as.data.table(lapply(object, "[[", "x")))
  }

  evaluated <- eval_sculpture(
    sculpture = object,
    data = newdata[names(object)]
  )

  # predictions named by row number
  preds <- evaluated$prediction
  structure(preds$pred, names = preds$rn)
}

#' @export
print.sculpture <- function(x, ...) {
  n_vars <- length(x)
  # e.g. "Rough sculpture with 3 variables" - the first class gives the sculpture type
  cat(
    paste(
      stringr::str_to_sentence(class(x)[1]), "sculpture with",
      n_vars, paste0("variable", `if`(n_vars > 1, "s"))
    )
  )
  # print methods return their argument invisibly
  invisible(x)
}

# for transforming log-odds to probability: 1 / (1 + exp(-x))
inv.logit <- function(x) {
  denominator <- 1 + exp(-x)
  1 / denominator
}

# sculpture metrics --------


#' Various metrics related to model sculpting
#'
#' @name var_imp
#'
#' @param object `sculpture`
#' @param newdata (Optional) Data to calculate the importance from.
#' If omitted, the data that were provided to build the sculpture are used.
#'
#' @return `data.table` with the requested metrics.
#'
#' @examples
#' df <- mtcars
#' df$vs <- as.factor(df$vs)
#' model <- rpart::rpart(
#'   hp ~ mpg + carb + vs,
#'   data = df,
#'   control = rpart::rpart.control(minsplit = 10)
#' )
#' model_predict <- function(x) predict(model, newdata = x)
#' covariates <- c("mpg", "carb", "vs")
#' pm <- sample_marginals(df[covariates], n = 50, seed = 5)
#'
#' rs <- sculpt_rough(
#'   dat = pm,
#'   model_predict_fun = model_predict,
#'   n_ice = 10,
#'   seed = 1,
#'   verbose = 0
#' )
#'
#' # show direct variable importance
#' calc_dir_var_imp(rs)
#'
#' # show cumulative approximation R^2
#' calc_cumul_R2(rs)
NULL


# direct variable importance from long PDP data (columns `rn`, `feature`, `pdp_c`)
# Returns one row per feature with its variance, the variance of the row-wise sum of
# all features and the ratio of the two, sorted by decreasing ratio.
calc_dir_var_imp_pdp <- function(dt) {
  stopifnot(
    all(c("rn", "feature", "pdp_c") %in% colnames(dt)),
    !"total" %in% tolower(unique(dt$feature)),
    nrow(dt) == nrow(unique(dt[, .(rn, feature)]))
  )

  # https://cran.r-project.org/web/packages/data.table/vignettes/datatable-importing.html#globals
  . <- rn <- feature <- pdp_c <- ratio <- variance <- variance_total <-
    NULL # due to NSE notes in R CMD check

  # calculate total variance of PDPs
  var_total <- dt[, .(pdp_c = sum(pdp_c)), .(rn)][, var(pdp_c)]

  # calculate variance per feature ...
  dat_var <- dt[, .(variance = var(pdp_c), variance_total = var_total), .(feature)]
  # ... relate it to the total variance ...
  dat_var[, ratio := variance / variance_total]
  # ... and sort, the most important feature first
  dat_var <- dat_var[order(ratio, decreasing = TRUE)]

  # define as factor to keep the order
  dat_var[, feature := factor(feature, levels = feature)]
  return(dat_var)
}

#' @describeIn var_imp Direct variable importance
#' @export
calc_dir_var_imp <- function(object, newdata = NULL) {
  checkmate::assert_class(object, "sculpture")

  if (!is.null(newdata)) {
    # recalculate the importance on the provided data
    checkmate::assert_data_frame(newdata, any.missing = FALSE)
    evaluated <- eval_sculpture(sculpture = object, data = newdata)
    return(calc_dir_var_imp_pdp(evaluated$pdp))
  }

  # no data given: use the importance stored in the sculpture
  attr(object, "var_imp")
}

# cumulative R2 from long PDP data (columns `rn`, `feature`, `pdp_c`)
# For i = 1, 2, ..., the first i features of `feat_order` are summed per row (plus
# `model_offset`) and scored against `model_predictions` with `metrics_R2`.
# Returns a data.table with columns `feature` (factor in the order of `feat_order`) and `R2`.
calc_cumul_R2_pdp <- function(dt, feat_order, model_predictions, model_offset) {
  stopifnot(
    is.data.table(dt),
    all(c("rn", "feature", "pdp_c") %in% colnames(dt)),
    !"total" %in% tolower(unique(dt$feature)),
    nrow(dt) == nrow(unique(dt[, .(rn, feature)])),
    length(model_predictions) == length(unique(dt$rn)),
    is.character(feat_order),
    is.numeric(model_offset)
  )

  # https://cran.r-project.org/web/packages/data.table/vignettes/datatable-importing.html#globals
  . <- rn <- feature <- pdp_c <- preds <- NULL # due to NSE notes in R CMD check

  # R2 of the predictions that use the given features only
  r2_of_features <- function(cols) {
    predictions <- dt[
      feature %in% cols,
      .(preds = sum(pdp_c) + model_offset),
      .(rn)
    ][
      order(rn), preds
    ]
    metrics_R2(score_fun = "score_quadratic", y = model_predictions, y_hat = predictions)
  }

  # prepare ordered features: the first one, the first two, ...
  cumul_features <- lapply(seq_along(feat_order), function(i) feat_order[seq_len(i)])

  # calculate R2
  R2_cumul <- vapply(cumul_features, r2_of_features, numeric(1))

  return(
    data.table(feature = factor(feat_order, levels = feat_order), R2 = R2_cumul)
  )
}


#' @describeIn var_imp Calculate cumulative approximation of R^2
#' @export
calc_cumul_R2 <- function(object, newdata = NULL) {
  checkmate::assert_class(object, "sculpture")

  # without new data, return what is stored in the `cumul_R2` attribute
  if (is.null(newdata)) {
    return(attr(object, "cumul_R2"))
  }
  checkmate::assert_data_frame(newdata, any.missing = FALSE)

  evaluated <- eval_sculpture(sculpture = object, data = newdata)

  # features are accumulated in the order of their direct importance on `newdata`
  importance <- calc_dir_var_imp_pdp(dt = evaluated$pdp)

  calc_cumul_R2_pdp(
    dt = evaluated$pdp,
    feat_order = levels(importance$feature),
    model_predictions = evaluated$prediction$pred,
    model_offset = evaluated$offset
  )
}


# Range (max - min) of the centered PDP values per feature - for plots (facet sorting).
#
# `dt` must provide the columns `rn`, `feature` and `pdp_c` without duplicated
# (`rn`, `feature`) pairs. The result is sorted by decreasing range and `feature`
# is a factor whose levels keep that order.
calc_range_pdp <- function(dt) {
  stopifnot(
    all(c("rn", "feature", "pdp_c") %in% colnames(dt)),
    nrow(dt) == nrow(unique(dt[, .(rn, feature)]))
  )

  # https://cran.r-project.org/web/packages/data.table/vignettes/datatable-importing.html#globals
  . <- rn <- feature <- pdp_c <- NULL # due to NSE notes in R CMD check

  # spread of the PDP values per feature, widest first
  dt_range <- dt[, .(range = max(pdp_c) - min(pdp_c)), by = .(feature)]
  dt_range <- dt_range[order(-range)]

  # store the sorted order in the factor levels
  dt_range[, feature := factor(feature, levels = feature)][]

  dt_range
}



# generic metrics --------


#' Various metrics for measuring model performance.
#'
#' @name metrics
#' @param score_fun A scoring function: `score_quadratic`, `score_log_loss`,
#' or a user-defined scoring rule. See below for more details.
#' @param y Vector of observations.
#' @param y_hat Vector of predictions.
#' @param y_hat_calib Vector of calibrated predictions. See below for more details.
#' @param na_rm Logical, defaults to `FALSE`. Should NAs be removed?
#' @param rev_fct Logical, defaults to `FALSE`. Switch the factor level of
#' the data before performing calibration. Only relevant for binary response.
#'
#' @section Scoring function:
#' One can use predefined scores like `score_quadratic` or `score_log_loss`.
#' If those do not fit the needs, a user-defined scoring function can also be used.
#' This function needs to take exactly 3 arguments: `y` (truth values),
#' `y_hat` (estimated values), and `na_rm` (should NAs be removed?):
#' - both `y` and `y_hat` are numeric (not factors!)
#' - `na_rm` is a scalar logical
#'
#' It needs to return a number.
#' There is a utility function `check_score_fun` to check if the user-defined function is
#' programmed correctly.
#' It checks the input and the output, but not if the actual returned value makes sense.
#'
#'
#' @section Calibration:
#' To obtain calibrated predictions,
#' fit a calibration model and predict based on that model.
#' Users can use their own calibration model or make use of `metrics_fit_calib`,
#' which fits an `mgcv::gam()` model with smoother `mgcv::s(., k = -1)` (automatic knot selection).
#' If the input `y` is a factor, then a binomial family is used, otherwise a gaussian.
#' NAs are always dropped.
#'
#' Continuous response example:
#' ```
#' calibration_model <- metrics_fit_calib(
#'   y = truth,
#'   y_hat = prediction
#' )
#' calib_pred <- predict(calibration_model)
#' ```
#'
#' Binary response example:
#' ```
#' calibration_model <- metrics_fit_calib(
#'   y = factor(truth, levels = c("0", "1")),
#'   y_hat = prediction
#' )
#' calib_pred <- predict(calibration_model, type = "response")
#' ```
#' In the binary case, make sure that:
#' - `y` is a factor with correct level setting.
#' Usually "0" is the reference (first) level and "1" is the event (second level).
#' This may clash with `yardstick` setting where
#' the first level is by default the "event" level.
#' - `y_hat` are probabilities (not a log of odds).
#' - returned calibrated predictions `calib_pred` are also probabilities by setting
#' `type = "response"`.
#'
#'
#' @return `metrics_fit_calib` returns an [mgcv::gam()] model fit, otherwise a number.
#'
#' @examples
#' # Scores
#' score_quadratic(y = c(1.34, 2.8), y_hat = c(1.34, 2.8)) # must be 0
#' score_quadratic(y = 0.5, 0) # must be 0.5**2 = 0.25
#'
#' score_log_loss(y = c(0, 1), y_hat = c(0.01, 0.9)) # must be close to 0
#' score_log_loss(y = 0, y_hat = 0) # undefined
#'
#' check_score_fun(score_quadratic) # passes without errors
#'
#' # Metrics based on `lm` model
#' mod <- lm(hp ~ ., data = mtcars)
#' truth <- mtcars$hp
#' pred <- predict(mod)
#'
#' # calibration fit and calibrated predictions
#' calib_mod <- metrics_fit_calib(y = truth, y_hat = pred)
#' calib_pred <- predict(calib_mod)
#'
#' metrics_unc(score_fun = "score_quadratic", y = truth)
#' metrics_R2(score_fun = "score_quadratic", y = truth, y_hat = pred)
#' metrics_DI(score_fun = "score_quadratic", y = truth, y_hat_calib = calib_pred)
#' metrics_MI(score_fun = "score_quadratic", y = truth, y_hat = pred, y_hat_calib = calib_pred)
#' # Note that R^2 = DI - MI
#' metrics_r2(y = truth, y_hat = pred, y_hat_calib = calib_pred)
#'
#' # Metrics based on `glm` model (logistic regression)
#' # Note the correct setting of levels
#' mod <- glm(factor(vs, levels = c("0", "1")) ~ hp + mpg, data = mtcars, family = "binomial")
#' truth_fct <- factor(mtcars$vs, levels = c("0", "1"))
#' truth_num <- mtcars$vs
#' pred <- predict(mod, type = "response") # type = "response" returns probabilities
#'
#' # calibration fit and calibrated predictions
#' calib_mod <- metrics_fit_calib(y = truth_fct, y_hat = pred)
#' calib_pred <- predict(calib_mod, type = "response") # type = "response" returns probabilities
#'
#' metrics_unc(score_fun = "score_quadratic", y = truth_num)
#' metrics_R2(score_fun = "score_quadratic", y = truth_num, y_hat = pred)
#' metrics_DI(score_fun = "score_quadratic", y = truth_num, y_hat_calib = calib_pred)
#' metrics_MI(score_fun = "score_quadratic", y = truth_num, y_hat = pred, y_hat_calib = calib_pred)
#' # Note that R^2 = DI - MI
#' metrics_r2(y = truth_num, y_hat = pred, y_hat_calib = calib_pred)
#'
NULL

# Drop the positions at which any of the supplied vectors is missing.
# All inputs are filtered with the same `complete.cases()` index and returned
# as a list that keeps the argument names.
remove_missing <- function(...) {
  inputs <- list(...)
  keep <- complete.cases(...)
  lapply(inputs, function(v) v[keep])
}

#' @describeIn metrics Binary log loss score
#' @export
score_log_loss <- function(y, y_hat, na_rm = FALSE) {
  checkmate::assert_numeric(y)
  # `y_hat` is either as long as `y` or a single value that is recycled
  checkmate::assert(
    checkmate::check_numeric(y_hat, len = length(y)),
    checkmate::check_numeric(y_hat, len = 1)
  )
  if (na_rm) {
    # Expand a scalar `y_hat` before filtering: `complete.cases()` (used by
    # `remove_missing()`) stops with an error when its inputs differ in length.
    rm <- remove_missing(y = y, y_hat = rep_len(y_hat, length(y)))
    y <- rm[["y"]]
    y_hat <- rm[["y_hat"]]
  }
  # negative mean log-likelihood of a binary outcome `y` under probabilities `y_hat`
  -mean(y * log(y_hat) + (1 - y) * log(1 - y_hat))
}

#' @describeIn metrics Quadratic score
#' @export
score_quadratic <- function(y, y_hat, na_rm = FALSE) {
  checkmate::assert_numeric(y)
  # `y_hat` is either as long as `y` or a single value that is recycled
  checkmate::assert(
    checkmate::check_numeric(y_hat, len = length(y)),
    checkmate::check_numeric(y_hat, len = 1)
  )
  if (na_rm) {
    # Expand a scalar `y_hat` before filtering: `complete.cases()` (used by
    # `remove_missing()`) stops with an error when its inputs differ in length.
    rm <- remove_missing(y = y, y_hat = rep_len(y_hat, length(y)))
    y <- rm[["y"]]
    y_hat <- rm[["y_hat"]]
  }
  # mean squared difference between observations and predictions
  mean((y - y_hat)**2)
}

#' @describeIn metrics Utility function for checking the properties of a user-defined `score_fun`.
#' @export
check_score_fun <- function(score_fun) {
  required_args <- c("y", "y_hat", "na_rm")
  if (is.character(score_fun)) {
    # Look the name up as a function instead of evaluating the string as R code:
    # `eval(str2lang(...))` would execute any expression passed in, while the
    # `do.call()` below only works with a plain function name anyway.
    checkmate::assert_string(score_fun)
    checkmate::assert_function(
      get(score_fun, envir = environment(), mode = "function"),
      args = required_args
    )
  } else if (is.function(score_fun)) {
    checkmate::assert_function(score_fun, args = required_args)
  } else {
    stop("`score_fun` must be a function or a character string naming a function.")
  }
  # smoke test: scoring two short numeric vectors must give a single number (NA allowed)
  out <- do.call(score_fun, list(y = c(0.5, 0.6), y_hat = c(0.5, 0.55)))
  if (!checkmate::test_number(out, na.ok = TRUE)) {
    stop("The return value of `score_fun` must be a number")
  }
}


#' @describeIn metrics Uncertainty
#' @export
metrics_unc <- function(score_fun, y, na_rm = FALSE) {
  check_score_fun(score_fun)
  if (na_rm) {
    y <- remove_missing(y = y)[["y"]]
  }
  # score of the constant prediction that equals the mean of the observations
  baseline <- rep_len(mean(y), length(y))
  do.call(score_fun, list(y = y, y_hat = baseline))
}

#' @describeIn metrics R^2 metric
#' @export
metrics_R2 <- function(score_fun, y, y_hat, na_rm = FALSE) {
  check_score_fun(score_fun)
  if (na_rm) {
    complete <- remove_missing(y = y, y_hat = y_hat)
    y <- complete[["y"]]
    y_hat <- complete[["y_hat"]]
  }
  # score of the predictions relative to the score of the constant mean prediction
  model_score <- do.call(score_fun, list(y = y, y_hat = y_hat))
  baseline_score <- do.call(
    score_fun,
    list(y = y, y_hat = rep_len(mean(y), length(y)))
  )
  1 - model_score / baseline_score
}

#' @describeIn metrics Fit calibration curve using [mgcv::gam()].
#' Note that NAs are always dropped.
#' @export
metrics_fit_calib <- function(y, y_hat, rev_fct = FALSE) {
  # `requireNamespace()` reports a missing package only through its return value,
  # so test it and fail with an actionable message
  if (!requireNamespace("mgcv", quietly = TRUE)) {
    stop(
      "Package `mgcv` is required for `metrics_fit_calib()`, please install it.",
      call. = FALSE
    )
  }
  # local alias so that `s()` used in the formulas below resolves to `mgcv::s`
  s <- mgcv::s

  # factor response -> binomial family, anything else -> gaussian
  if (is.factor(y)) {
    fam <- binomial()
    if (rev_fct) {
      y <- factor(y, levels = rev(levels(y)))
    }
  } else {
    fam <- gaussian()
  }

  # Try the smoother with `k = -1` first; if that fit errors, retry with `k = 3`,
  # and as a last resort fit a model that is linear in `y_hat`.
  tryCatch(
    mgcv::gam(y ~ s(y_hat, k = -1), family = fam, na.action = "na.omit"),
    error = \(e) tryCatch(
      mgcv::gam(y ~ s(y_hat, k = 3), family = fam, na.action = "na.omit"),
      error = \(e) mgcv::gam(y ~ y_hat, family = fam, na.action = "na.omit")
    )
  )
}

#' @describeIn metrics Discrimination index
#' @export
metrics_DI <- function(score_fun, y, y_hat_calib, na_rm = FALSE) {
  check_score_fun(score_fun)
  if (na_rm) {
    rm <- remove_missing(y = y, y_hat_calib = y_hat_calib)
    y <- rm[["y"]]
    y_hat_calib <- rm[["y_hat_calib"]]
  }
  # uncertainty: score of the constant mean prediction; it appears in both the
  # numerator and the denominator, so it is evaluated once and reused
  unc <- do.call(score_fun, list(y = y, y_hat = rep_len(mean(y), length(y))))
  calib_score <- do.call(score_fun, list(y = y, y_hat = y_hat_calib))
  (unc - calib_score) / unc
}

#' @describeIn metrics Miscalibration index
#' @export
metrics_MI <- function(score_fun, y, y_hat, y_hat_calib, na_rm = FALSE) {
  check_score_fun(score_fun)
  if (na_rm) {
    complete <- remove_missing(y = y, y_hat = y_hat, y_hat_calib = y_hat_calib)
    y <- complete[["y"]]
    y_hat <- complete[["y_hat"]]
    y_hat_calib <- complete[["y_hat_calib"]]
  }
  # score difference between raw and calibrated predictions, scaled by the
  # score of the constant mean prediction
  baseline <- rep_len(mean(y), length(y))
  raw_score <- do.call(score_fun, list(y = y, y_hat = y_hat))
  calib_score <- do.call(score_fun, list(y = y, y_hat = y_hat_calib))
  unc <- do.call(score_fun, list(y = y, y_hat = baseline))
  (raw_score - calib_score) / unc
}


#' @describeIn metrics r^2 metric based on slope of `lm`
#' @export
metrics_r2 <- function(y, y_hat, y_hat_calib, na_rm = FALSE) {
  if (na_rm) {
    complete <- remove_missing(y = y, y_hat = y_hat, y_hat_calib = y_hat_calib)
    y <- complete[["y"]]
    y_hat <- complete[["y_hat"]]
    y_hat_calib <- complete[["y_hat_calib"]]
  } else if (anyNA(y) || anyNA(y_hat) || anyNA(y_hat_calib)) {
    # missing values are present but must not be removed
    return(NA)
  }

  # slope of the calibrated predictions regressed on the raw predictions
  slope <- unname(coef(lm(y_hat_calib ~ y_hat))[2])
  r2 <- (slope * sd(y_hat) / sd(y))^2

  # an undefined value (e.g. constant predictions) is reported as 0
  if (is.na(r2)) 0 else r2
}

# colors -------

# Generate `n` colours.
# Up to 6 colours come from a fixed palette unless `hue_coloring` is `TRUE`;
# otherwise `n` hues are taken at equal steps via `hcl()`.
# `n = 0` returns an empty character vector.
ms_color <- function(n, hue_coloring = FALSE) {
  if (n < 7 && !hue_coloring) {
    fixed_palette <- c("#0a0a0a", "#14a3a8", "#e3211d", "#b15829", "#6a3d9a", "#34a02b")
    # `seq_len()` instead of `1:n`, which would select one colour for `n = 0`
    fixed_palette[seq_len(n)]
  } else {
    # n + 1 hues are generated and the last one is dropped
    hcl(h = seq(15, 375, length.out = n + 1), l = 35, c = 85)[seq_len(n)]
  }
}


# facets specification -------

#' Instructions for facet visualizations
#'
#' @param labels (`NULL`) or named character vector with variable labels.
#' @param ncol (`NULL`) or number of columns in the facet.
#' @param sort One of "alphabetical", "importance", or "range" - sorting of the facets.
#' @param top_k (`NULL`) or number of most important features to show.
#' @param subset (`NULL`) or a vector of variables to show.
#' @param scales One of "free", "free_x", or "free_y" - axis scales of the graphs.
#'
#' @return List of class `facet_specification`.
#' @export
#'
#' @examples
#' \dontrun{
#' g_ice(
#'   sculpture,
#'   facet_spec = facet_specification(
#'     ncol = 3, # display 3 columns
#'     sort = "importance" # sort by importance
#'   )
#' )
#' }
#'
facet_specification <- function(labels = NULL,
                                ncol = NULL,
                                sort = "alphabetical",
                                top_k = NULL,
                                subset = NULL,
                                scales = "free_x") {
  # labels: NULL or a fully named character vector
  checkmate::assert(
    checkmate::check_character(labels, any.missing = FALSE, names = "named"),
    checkmate::check_null(labels)
  )

  # ncol: NULL or a single positive whole number
  checkmate::assert(
    checkmate::check_integerish(ncol, any.missing = FALSE, len = 1, lower = 1),
    checkmate::check_null(ncol)
  )

  checkmate::assert_character(sort, any.missing = FALSE, len = 1)
  checkmate::assert_subset(sort, c("alphabetical", "importance", "range"))

  # top_k: NULL or a single number >= 1
  checkmate::assert(
    checkmate::check_number(top_k, lower = 1),
    checkmate::check_null(top_k)
  )

  checkmate::assert(
    checkmate::check_character(subset),
    checkmate::check_null(subset)
  )

  checkmate::assert_character(scales, len = 1, any.missing = FALSE)
  checkmate::assert_subset(scales, c("free", "free_y", "free_x"))

  # the two ways of restricting the shown features are mutually exclusive;
  # scalar `&&` is the appropriate operator inside `if`
  if (!is.null(top_k) && !is.null(subset)) {
    stop("Please use either `top_k` or `subset`, but not both together.")
  }

  out <- list(
    labels = labels, ncol = ncol, sort = sort, top_k = top_k,
    subset = subset, scales = scales
  )
  class(out) <- "facet_specification"

  return(out)
}


# Apply a `facet_specification` (`fs`) to a sculpture (`obj`).
# Returns a list with the (possibly subsetted and reordered) sculpture, the feature
# labels, the number of facet columns for continuous (`ncol_c`) and discrete (`ncol_d`)
# features, and the requested axis scales.
resolve_facet_specification <- function(obj, fs) {
  checkmate::assert_class(obj, "sculpture")
  checkmate::assert_class(fs, "facet_specification")

  # labels default to the feature names themselves
  labels <- fs$labels
  if (is.null(labels)) {
    labels <- structure(names(obj), names = names(obj))
  }
  checkmate::assert_character(
    labels,
    names = "named", len = length(obj), any.missing = FALSE,
    .var.name = "facet_specification$labels"
  )

  # order the features, then keep only the requested ones
  ordered_features <- resolve_facet_sort(obj = obj, facet_sort = fs$sort)
  obj <- resolve_facet_subset_topk(
    obj = obj, facet_subset = fs$subset, facet_top_k = fs$top_k,
    feat_ordered = ordered_features
  )

  # facet columns are resolved separately for continuous and discrete features
  is_continuous <- !vapply(obj, function(v) v[["is_discrete"]], logical(1))
  ncols <- resolve_facet_ncol(idx_c = is_continuous, facet_ncol = fs$ncol)

  list(
    object = obj,
    labels = labels,
    ncol_c = ncols$ncol_c,
    ncol_d = ncols$ncol_d,
    scales = fs$scales
  )
}


# Feature names of `obj` in the order requested by `facet_sort`:
# sorted names ("alphabetical"), or the factor levels of `feature` stored in the
# `var_imp` ("importance") or `range` ("range") attribute. Any other value is an error.
resolve_facet_sort <- function(obj, facet_sort) {
  switch(as.character(facet_sort),
    alphabetical = sort(names(obj)),
    importance = levels(attr(obj, "var_imp")$feature),
    range = levels(attr(obj, "range")$feature),
    stop("Unknown sorting")
  )
}

# Keep only the requested features of `obj`, in the order given by `feat_ordered`.
# Either the first `facet_top_k` features or those listed in `facet_subset` are kept
# (at most one of the two may be non-NULL); with both NULL all features are kept.
# All attributes of `obj` other than `names` are carried over to the result.
resolve_facet_subset_topk <- function(obj, facet_subset, facet_top_k, feat_ordered) {
  stopifnot(
    is.null(facet_subset) || is.null(facet_top_k),
    all(feat_ordered %in% names(obj))
  )
  if (!is.null(facet_top_k)) {
    # `seq_len()` instead of `1:k`: with no features `1:0` would select an NA entry
    vars <- feat_ordered[seq_len(min(facet_top_k, length(feat_ordered)))]
  } else if (!is.null(facet_subset)) {
    vars <- feat_ordered[feat_ordered %in% facet_subset]
  } else {
    vars <- feat_ordered
  }
  # save the attributes before subsetting and restore them afterwards,
  # with `names` updated to the kept features
  new_attrs <- attributes(obj)
  obj <- obj[vars]
  new_attrs$names <- names(obj)
  attributes(obj) <- new_attrs
  obj
}

# Number of facet columns for continuous (`ncol_c`) and discrete (`ncol_d`) features.
# `idx_c` flags the continuous features; the column count is capped by `facet_ncol`,
# or by 4 when `facet_ncol` is NULL, and never exceeds the number of features.
resolve_facet_ncol <- function(idx_c, facet_ncol) {
  max_cols <- if (is.null(facet_ncol)) 4 else facet_ncol
  list(
    ncol_c = min(c(sum(idx_c), max_cols)),
    ncol_d = min(c(sum(!idx_c), max_cols))
  )
}



# y-axis limits shared by the continuous (`dat_c`) and discrete (`dat_d`) plot data.
# With a free y scale no limits are set (both NA); otherwise the pooled range of `y`
# is widened outwards to one decimal place.
resolve_y_limits <- function(dat_c, dat_d, facet_scales) {
  if (!facet_scales %in% c("free", "free_y")) {
    y_all <- c(dat_c[["y"]], dat_d[["y"]])
    return(c(floor(min(y_all) * 10) / 10, ceiling(max(y_all) * 10) / 10))
  }
  c(NA_real_, NA_real_)
}



# missings specification -------


#' Instructions for missings visualizations
#'
#' @param vline (`logical`) Should the vertical line be shown? Defaults to `FALSE`.
#' @param hline (`logical`) Should the horizontal line be shown? Defaults to `FALSE`.
#' @param values (`NULL`) or single value or a named vector.
#' Specifies the value(-s) that stand for the missing values.
#' If `NULL`, then no missing value handling is carried out.
#' If single value, then it is assumed that this value is used for flagging missing values across
#' all continuous variables.
#' If named vector, then the names are used to refer to continuous variables and the values for
#' flagging missing values in that variable.
#' @param drop_from_plot (`logical`) Should the missing values be dropped from plot?
#' Defaults to `FALSE`.
#'
#' @return List of class `missings_specification`.
#' @export
#'
#' @examples
#' \dontrun{
#' g_ice(
#'   sculpture,
#'   missings_spec = missings_specification(
#'     vline = TRUE, # show vertical line
#'     values = -1 # NAs in all continuous variables displayed as -1
#'   )
#' )
#' }
#'
missings_specification <- function(vline = FALSE, hline = FALSE, values = NULL,
                                   drop_from_plot = FALSE) {
  checkmate::assert_flag(vline)
  checkmate::assert_flag(hline)
  checkmate::assert_flag(drop_from_plot)
  # values: NULL, a single value, or a named vector with at least 2 elements
  checkmate::assert(
    checkmate::check_null(values),
    checkmate::check_atomic(values, any.missing = FALSE, len = 1),
    checkmate::check_atomic(values, any.missing = FALSE, min.len = 2, names = "named")
  )

  # lines and dropping both need to know which values flag the missings
  no_values <- is.null(values)
  if (no_values && (vline || hline)) {
    stop("Specified to show lines, but no missing values provided.")
  }
  if (no_values && drop_from_plot) {
    stop("Specified to drop missings from plot area, but no missing values provided.")
  }
  if (drop_from_plot && vline) {
    stop("Please use either `drop_from_plot` or `vline`, but not both.")
  }

  structure(
    list(vline = vline, hline = hline, values = values, drop_from_plot = drop_from_plot),
    class = "missings_specification"
  )
}

# Match the values that flag missing observations to the plot data of the
# continuous features, and optionally remove them from that data.
#
# - `dat_c`: data.table with the plot data; the columns `feature`, `x` and `y` are used,
#   `line_id` and `Model` are used only when present.
# - `ms`: list whose `drop_from_plot` element is read
#   (presumably a `missings_specification` - confirm against the callers).
# - `missings`: `NULL`, or a data.table with columns `feature` and `x` holding the
#   flagging values.
#
# Returns a list with `dat_c` (without the flagged rows if `ms$drop_from_plot`) and
# `missings` (the flagged values together with their `y` from `dat_c`).
# With `missings = NULL` both inputs are returned unchanged.
resolve_missings_specification <- function(dat_c, ms, missings) {
  if (is.null(missings)) {
    return(list(dat_c = dat_c, missings = missings))
  }

  # https://cran.r-project.org/web/packages/data.table/vignettes/datatable-importing.html#globals
  line_id <- feature <- ..cols <- NULL # due to NSE notes in R CMD check

  # if several lines are stored per feature, look up the y-values on the "pdp" line only
  if ("line_id" %in% colnames(dat_c)) {
    dat_c_pdp <- dat_c[line_id == "pdp"]
  } else {
    dat_c_pdp <- dat_c
  }
  # join on (feature, x); `nomatch = NULL` drops rows of `dat_c_pdp` without a flagged value
  missings_new <- missings[dat_c_pdp, nomatch = NULL, on = c("feature", "x")]
  # keep the plotting columns only; `Model` is kept when the joined data has it
  cols <- c("feature", "x", "y", `if`("Model" %in% colnames(missings_new), "Model"))
  missings_new <- missings_new[, ..cols]
  # use the same factor levels as the plot data
  missings_new[, feature := factor(feature, levels = levels(dat_c$feature))]

  # remove missing observations if requested (anti-join on feature and x)
  if (ms$drop_from_plot) {
    dat_c <- dat_c[!missings, on = c("feature", "x")]
  }
  return(list(dat_c = dat_c, missings = missings_new))
}



# plots -------


#' Plot variable importances and cumulative approximation of R^2
#'
#' @param object (`sculpture`)
#' @param feat_labels (`NULL`) or named character vector providing the variable labels.
#' @param textsize Size of text.
#' @param top_k (`NULL`) or number to show only the most `k` important variables.
#' @param pdp_plot_sample (`logical`) Sample PDP for faster plotting? Defaults to `TRUE`.
#' @param show_pdp_plot (`logical`) Show plot with PDP ranges? Defaults to `TRUE`.
#' @param var_imp_type (`character`) One of `c("normalized", "absolute", "ice", "ice_orig_mod")`.
#' Defaults to "normalized". "ice" is only valid for a rough sculpture.
#' @param logodds_to_prob (`logical`) Only valid for binary response and sculptures built on
#' the log-odds scale. Defaults to `FALSE` (i.e. no effect).
#' If `TRUE`, then the y-values are transformed through inverse logit function 1 / (1 + exp(-x)).
#' @param plot_ratios (`numeric`) Used in the layout matrix of `gridExtra::arrangeGrob()`.
#' If `show_pdp_plot`, then the default is `c(3,2,2)`, making the first plot 3 units wide and
#' the other two plots 2 units wide.
#' If `!show_pdp_plot`, then the default is `c(3,2)`, making the first plot 3 units wide and
#' the second plot 2 units wide.
#' Note that the length needs to be 3 if `show_pdp_plot` or 2 if `!show_pdp_plot`.
#'
#' @return `grob`. Use `grid::grid.draw` to plot the output
#' (`grid::grid.newpage` resets the plotting area).
#'
#' @export
#'
#' @examples
#' df <- mtcars
#' df$vs <- as.factor(df$vs)
#' model <- rpart::rpart(
#'   hp ~ mpg + carb + vs,
#'   data = df,
#'   control = rpart::rpart.control(minsplit = 10)
#' )
#' model_predict <- function(x) predict(model, newdata = x)
#' covariates <- c("mpg", "carb", "vs")
#' pm <- sample_marginals(df[covariates], n = 50, seed = 5)
#'
#' rs <- sculpt_rough(
#'   dat = pm,
#'   model_predict_fun = model_predict,
#'   n_ice = 10,
#'   seed = 1,
#'   verbose = 0
#' )
#'
#' # optionally define labels
#' labels <- structure(
#'   toupper(covariates), # labels
#'   names = covariates # current (old) names
#' )
#' vi <- g_var_imp(rs, feat_labels = labels)
#' grid::grid.draw(vi)
#'
g_var_imp <- function(object, feat_labels = NULL, textsize = 16, top_k = NULL,
                      pdp_plot_sample = TRUE,
                      show_pdp_plot = TRUE,
                      var_imp_type = "normalized",
                      logodds_to_prob = FALSE,
                      plot_ratios = `if`(show_pdp_plot, c(3, 2, 2), c(3, 2))) {
  checkmate::assert_class(object, "sculpture")
  checkmate::assert_integerish(textsize, len = 1, any.missing = FALSE)
  checkmate::assert_flag(pdp_plot_sample)
  checkmate::assert_flag(show_pdp_plot)
  checkmate::assert_choice(var_imp_type, choices = c("normalized", "absolute", "ice", "ice_orig_mod"))
  checkmate::assert_flag(logodds_to_prob)

  # one ratio per panel: 3 panels with the PDP plot, 2 without
  if (show_pdp_plot) {
    checkmate::assert_integerish(plot_ratios, lower = 1, len = 3, any.missing = FALSE)
  } else {
    checkmate::assert_integerish(plot_ratios, lower = 1, len = 2, any.missing = FALSE)
  }

  # labels default to the feature names themselves
  if (is.null(feat_labels)) {
    feat_labels <- structure(names(object), names = names(object))
  }
  checkmate::assert_character(feat_labels, names = "named", len = length(object))

  checkmate::assert(
    checkmate::check_null(top_k),
    checkmate::check_integerish(top_k, any.missing = FALSE, len = 1, lower = 1)
  )

  # https://cran.r-project.org/web/packages/data.table/vignettes/datatable-importing.html#globals
  feature <- . <- pdp_c <- ice_centered <- line_id <- var_y <-
    ice <- y <-
    NULL # due to NSE notes in R CMD check

  if (logodds_to_prob) {
    # evaluate the sculpture
    es <- eval_sculpture(
      sculpture = object,
      data = as.data.frame(as.data.table(lapply(object, "[[", "x")))
    )
    dt <- es$pdp

    # convert log-odds scale to probability scale and center back to 0
    dt[, pdp_c := inv.logit(pdp_c) - 0.5]

    # get importance
    dat_var <- calc_dir_var_imp_pdp(dt)
    feat_order <- levels(dat_var$feature)

    # get cumul. R2
    dat_R2_cumul <- calc_cumul_R2_pdp(
      dt = dt,
      feat_order = feat_order,
      model_predictions = inv.logit(es$prediction$pred),
      model_offset = inv.logit(es$offset)
    )
  } else {
    # get importance
    dat_var <- attr(object, "var_imp")
    feat_order <- levels(dat_var$feature)

    # get cumul. R2
    dat_R2_cumul <- attr(object, "cumul_R2")

    # get PDPs and predictions
    if (show_pdp_plot) {
      eg <- eval_sculpture(
        sculpture = object,
        data = as.data.frame(as.data.table(lapply(object, "[[", "x")))
      )
      dt <- eg$pdp
    }
  }

  # subset top_k if requested
  if (!is.null(top_k)) {
    top_k <- min(top_k, nrow(dat_var))
    feat_order <- feat_order[seq_len(top_k)]
    dat_var <- dat_var[feature %in% feat_order]
    dat_R2_cumul <- dat_R2_cumul[feature %in% feat_order]
    if (show_pdp_plot) {
      # Restore the complete class vector after subsetting. Resetting it to
      # "sculpture" alone would drop "rough" and make the rough-sculpture check
      # of the ICE-based importance below fail for a rough sculpture.
      object_class <- class(object)
      object <- object[feat_order]
      class(object) <- object_class
    }
  }

  # g1 - PDP values
  if (show_pdp_plot) {
    # check centering
    check_dt <- dt[, .(mean_pdp_c = mean(pdp_c)), .(feature)]
    if (abs(mean(check_dt$mean_pdp_c)) > 1e-1) {
      stop(paste(
        "PDPs not centered, mean relative difference of",
        abs(mean(check_dt$mean_pdp_c))
      ))
    }
    # draw PDP plot
    dt$feature <- factor(dt$feature, levels = feat_order)
    g1 <- g_pdp(dt = dt, pdp_plot_sample = pdp_plot_sample, feat_labels = feat_labels)
  } else {
    g1 <- NULL
  }

  # g2 - variable importance
  if (var_imp_type == "normalized") {
    g2 <- g_imp_norm(dat_var = dat_var, show_pdp_plot = show_pdp_plot, textsize = textsize)
  } else if (var_imp_type == "absolute") {
    g2 <- g_imp_abs(dat_var = dat_var, show_pdp_plot = show_pdp_plot, textsize = textsize)
  } else if (var_imp_type == "ice_orig_mod") {
    # this branch reads the `ice` element stored with each feature
    if (!inherits(object, "rough")) {
      stop('`var_imp_type == "ice_orig_mod"` is only valid for a rough sculpture.')
    }
    # get ice curves
    dat_var_ice <- rbindlist(
      lapply(
        object,
        function(v) {
          generate_ice_data(
            predictions = v[["ice"]],
            x = v$x,
            logodds_to_prob = logodds_to_prob
          )[, .(y, line_id)]
        }
      ),
      idcol = "feature"
    )
    # convert to factor
    dat_var_ice$feature <- factor(dat_var_ice$feature, levels = feat_order)
    # calculate variance
    dat_var_ice <- dat_var_ice[, .(var_y = var(y)), by = .(feature, line_id)]
    # calculate mean of variances
    vars_mean <- dat_var_ice[, .(mean_var_y = mean(var_y)), by = .(feature)]
    # plot ice variances
    g2 <- g_imp_ice(vars = dat_var_ice, vars_mean = vars_mean)
  } else if (var_imp_type == "ice") {
    # ICE curves are recomputed from predictions of the sculpture itself
    model_predict_fun <- function(x) {
      if (logodds_to_prob) {
        p <- predict(object, newdata = x)
        inv.logit(p)
      } else {
        predict(object, newdata = x)
      }
    }

    dat_var_ice <- rbindlist(
      lapply(
        object,
        function(v) {
          calculate_ice_data(
            sub = v$subsets,
            predict_fun = model_predict_fun,
            x = v$x,
            x_name = v$x_name,
            col_order = names(object)
          )[, .(ice, line_id)]
        }
      ),
      idcol = "feature"
    )

    # convert to factor
    dat_var_ice$feature <- factor(dat_var_ice$feature, levels = feat_order)
    # calculate variance
    dat_var_ice <- dat_var_ice[, .(var_y = var(ice)), by = .(feature, line_id)]
    # calculate mean of variances
    vars_mean <- dat_var_ice[, .(mean_var_y = mean(var_y)), by = .(feature)]
    # plot ice variances
    g2 <- g_imp_ice(vars = dat_var_ice, vars_mean = vars_mean)
  }

  # with the PDP panel on the left the importance panel needs no feature labels
  # of its own; without it the labels are shown here
  if (show_pdp_plot) {
    g2 <- g2 + theme(axis.ticks.y = element_blank(), axis.text.y = element_blank())
  } else {
    g2 <- g2 + scale_y_discrete(labels = function(x) feat_labels[x])
  }

  # g3 - cumulative R2
  g3 <- g_cumulR2(dat_R2_cumul = dat_R2_cumul, textsize = textsize)

  # combined graph: panel i occupies plot_ratios[i] columns of a one-row layout
  if (show_pdp_plot) {
    g_var_imp <- gridExtra::arrangeGrob(
      g1 + theme(
        plot.margin = unit(c(0.8, 0.5, 0.3, 0.3), "cm"),
        text = element_text(size = textsize)
      ),
      g2 + theme(
        plot.margin = unit(c(0.8, 0.5, 0.3, 0.3), "cm"),
        text = element_text(size = textsize)
      ),
      g3 + theme(
        plot.margin = unit(c(0.8, 0.5, 0.05, 0.3), "cm"),
        text = element_text(size = textsize)
      ),
      layout_matrix = matrix(rep(1:3, plot_ratios), nrow = 1)
    )
  } else {
    g_var_imp <- gridExtra::arrangeGrob(
      g2 + theme(
        plot.margin = unit(c(0.8, 0.5, 0.3, 0.3), "cm"),
        text = element_text(size = textsize)
      ),
      g3 + theme(
        plot.margin = unit(c(0.8, 0.5, 0.05, 0.3), "cm"),
        text = element_text(size = textsize)
      ),
      layout_matrix = matrix(rep(1:2, plot_ratios), nrow = 1)
    )
  }

  return(g_var_imp)
}


#' Plot additivity scatterplot(-s) with R^2 value(-s)
#'
#' @param sp Sculpted predictions. Either as a vector or as a list of those.
#' @param lp Learner predictions. Either as a vector or as a list of those. Same size as `sp`.
#' @param descriptions (Optional) Descriptions of the models to be shown on the plot.
#' Same size as `sp` if `sp` is provided as a list.
#' @param cex `cex` graphical parameter.
#' @param plot_only (`logical`) Return plot only or plot with the R^2 value?
#' Defaults to the first (i.e. `TRUE`).
#'
#' @return If `plot_only`, then a plot. If `!plot_only`, then a plot and a data.frame.
#' @export
#'
#' @examples
#' df <- mtcars
#' df$vs <- as.factor(df$vs)
#' model <- rpart::rpart(
#'   hp ~ mpg + carb + vs,
#'   data = df,
#'   control = rpart::rpart.control(minsplit = 10)
#' )
#' model_predict <- function(x) predict(model, newdata = x)
#' covariates <- c("mpg", "carb", "vs")
#' pm <- sample_marginals(df[covariates], n = 50, seed = 5)
#'
#' rs <- sculpt_rough(
#'   dat = pm,
#'   model_predict_fun = model_predict,
#'   n_ice = 10,
#'   seed = 1,
#'   verbose = 0
#' )
#'
#' g_additivity(
#'   sp = predict(rs, pm),
#'   lp = model_predict(pm),
#'   descriptions = "Product Marginal"
#' )
#'
g_additivity <- function(sp, lp, descriptions = NULL, cex = 4, plot_only = TRUE) {
  # sp: a single vector or a list of vectors; always handled as a list below
  checkmate::assert(
    checkmate::check_atomic(sp),
    checkmate::check_list(sp, types = "atomic")
  )
  if (!is.list(sp)) {
    sp <- list(sp)
  }

  # lp: one vector (reused for every element of `sp`) or a list matching `sp`
  checkmate::assert(
    checkmate::check_atomic(lp, any.missing = FALSE, len = length(sp[[1]])),
    checkmate::check_list(lp, types = "atomic", len = length(sp))
  )
  if (is.list(lp)) {
    lapply(
      seq_along(lp),
      function(i) checkmate::assert_atomic(lp[[i]], any.missing = FALSE, len = length(sp[[i]]))
    )
  } else {
    lp <- rep(list(lp), length(sp))
  }

  # descriptions default to the names of `sp`, or to "Sculpture <i>" if unnamed
  checkmate::assert(
    checkmate::check_null(descriptions),
    checkmate::check_character(descriptions, any.missing = FALSE, len = length(sp))
  )
  if (is.null(descriptions)) {
    if (is.null(names(sp))) {
      descriptions <- paste("Sculpture", seq_along(sp))
    } else {
      descriptions <- names(sp)
    }
  }

  checkmate::assert_numeric(cex, lower = 0, any.missing = FALSE, len = 1)
  checkmate::assert_logical(plot_only, any.missing = FALSE, len = 1)

  # get plot data
  pd <- lapply(seq_along(sp), function(i) {
    data.frame(
      sculpted = sp[[i]],
      learner = lp[[i]],
      Model = descriptions[i]
    )
  })
  pd <- do.call("rbind", pd)
  pd$Model <- factor(pd$Model, levels = descriptions)

  # calculate R2 (vs strong learner)
  R2_mod_vs_approx <- vapply(
    seq_along(sp),
    function(i) metrics_R2(score_fun = "score_quadratic", y = lp[[i]], y_hat = sp[[i]]),
    FUN.VALUE = numeric(1)
  )

  # create R2 annotations
  # The label is positioned relative to the observed ranges (10% from the left,
  # 90% from the bottom). A position of `0.9 * max(learner)` would fall outside
  # the data range for negative predictions or predictions far from zero.
  annotations <- data.frame(
    R2 = paste0("R^2==", round(R2_mod_vs_approx, 4)),
    Model = factor(descriptions, levels = descriptions),
    sculpted = min(pd$sculpted) + (max(pd$sculpted) - min(pd$sculpted)) / 10,
    learner = min(pd$learner) + 0.9 * (max(pd$learner) - min(pd$learner))
  )

  g <- ggplot(pd) +
    geom_point(aes(x = .data$sculpted, y = .data$learner), alpha = 0.4, shape = 16) +
    geom_abline(slope = 1, intercept = 0) +
    facet_wrap("Model") +
    geom_label(
      data = annotations,
      mapping = aes(x = .data$sculpted, y = .data$learner, label = .data$R2),
      hjust = 0, parse = TRUE, size = cex
    ) +
    theme_bw() +
    labs(
      x = "Sculpted Model Predictions",
      y = "Learner Predictions"
    )

  if (plot_only) {
    return(g)
  } else {
    return(list(plot = g, R2 = data.frame(description = descriptions, R2 = R2_mod_vs_approx)))
  }
}



#' Plot centered ICE profiles with centered PDP curves
#'
#' @param object Object of classes `rough` and `sculpture`.
#' @param centered `logical`, centered ice plots? Defaults to `TRUE`.
#' @param show_PDP `logical`, show PDP line? Defaults to `TRUE`.
#' @param coloured `logical`, coloured curves? Defaults to `FALSE`.
#' @param rug_sides "" for none, "b" for bottom, "trbl" for all 4 sides (see `geom_rug`)
#' @param missings_spec Object of class `missings_specification`.
#' @param facet_spec Object of class `facet_specification`.
#' @param logodds_to_prob (`logical`) Only valid for binary response and sculptures built on
#' the log-odds scale. Defaults to `FALSE` (i.e. no effect).
#' If `TRUE`, then the y-values are transformed through inverse logit function 1 / (1 + exp(-x)).
#'
#' @return List of `ggplot`s (one for continuous features, one for discrete).
#' @export
#'
#' @examples
#' df <- mtcars
#' df$vs <- as.factor(df$vs)
#' model <- rpart::rpart(
#'   hp ~ mpg + carb + vs,
#'   data = df,
#'   control = rpart::rpart.control(minsplit = 10)
#' )
#' model_predict <- function(x) predict(model, newdata = x)
#' covariates <- c("mpg", "carb", "vs")
#' pm <- sample_marginals(df[covariates], n = 50, seed = 5)
#'
#' rs <- sculpt_rough(
#'   dat = pm,
#'   model_predict_fun = model_predict,
#'   n_ice = 10,
#'   seed = 1,
#'   verbose = 0
#' )
#'
#' g_ice(rs)$continuous
#'
g_ice <- function(object, centered = TRUE, show_PDP = TRUE, coloured = FALSE,
                  rug_sides = "b",
                  missings_spec = missings_specification(),
                  facet_spec = facet_specification(),
                  logodds_to_prob = FALSE) {
  # Builds ICE plots (optionally overlaid with the PDP line and its SE ribbon)
  # from a `rough` object. Returns a list with one ggplot for the continuous
  # features and one for the discrete features (each `NULL` when there is no
  # data for it). With `coloured = TRUE` the list has a third element `perc`:
  # the continuous plot drawn against an ECDF-transformed x axis.
  checkmate::assert_class(object, "rough")
  checkmate::assert_flag(centered)
  checkmate::assert_flag(show_PDP)
  checkmate::assert_flag(coloured)
  checkmate::assert_character(rug_sides, any.missing = FALSE, len = 1)
  checkmate::assert_class(facet_spec, "facet_specification")
  checkmate::assert_class(missings_spec, "missings_specification")
  checkmate::assert_flag(logodds_to_prob)

  # transform missings into a list of values per each continuous variable
  check_continuous <- vapply(object, "[[", logical(1), "is_discrete")
  check_continuous <- names(Filter(isFALSE, check_continuous))
  if (length(missings_spec$values) == 1) {
    # a single value is applied to every continuous feature
    missings <- data.table(feature = check_continuous, x = missings_spec$values)
  } else if (length(missings_spec$values) > 1) {
    # several values: each one is matched to a feature through its name
    missings <- data.table(feature = names(missings_spec$values), x = missings_spec$values)
    checkmate::assert_names(
      missings$feature,
      subset.of = check_continuous,
      .var.name = "missings_spec$values"
    )
  } else {
    missings <- NULL
  }

  # both arguments are validated flags, so the scalar operator is the right one
  if (coloured && show_PDP) {
    stop("Coloured lines are only available without PDP, so please set `show_PDP = FALSE`.")
  }

  # https://cran.r-project.org/web/packages/data.table/vignettes/datatable-importing.html#globals
  . <- x <- x_ribbon <- line_id <- y_se <- feature <- NULL # due to NSE notes in R CMD check

  # resolve facet specification
  rfs <- resolve_facet_specification(obj = object, fs = facet_spec)
  object <- rfs$object

  # get continuous vars
  idx_continuous_vars <- !vapply(object, "[[", logical(1), "is_discrete")
  has_continuous <- any(idx_continuous_vars)
  has_discrete <- any(!idx_continuous_vars)

  pred_var <- if (centered) "ice_centered" else "ice"

  # Apply `generator` (generate_ice_data or generate_pdp_data) to every variable
  # in `vars` and stack the results, keeping the variable name in column "feature".
  collect_profiles <- function(vars, generator) {
    rbindlist(
      lapply(
        vars,
        function(v) {
          generator(
            predictions = v[[pred_var]],
            x = v$x,
            logodds_to_prob = logodds_to_prob
          )
        }
      ),
      idcol = "feature"
    )
  }

  # continuous
  if (has_continuous) {
    # ICE
    ice_continuous <- unique(collect_profiles(object[idx_continuous_vars], generate_ice_data))
    ice_continuous[, `:=`(type = "ICE Profiles", line_id = as.character(line_id))]

    # PDP
    if (show_PDP) {
      pdp_continuous <- collect_profiles(object[idx_continuous_vars], generate_pdp_data)
      # a missing SE becomes 0 so that ymin / ymax of the ribbon stay defined
      pdp_continuous[, y_se := ifelse(is.na(y_se), 0, y_se)]
      pdp_continuous[, `:=`(line_id = "pdp", type = "Rough model (with SE)")]

      dat_c <- rbind(ice_continuous, pdp_continuous, fill = TRUE)
    } else {
      dat_c <- ice_continuous
    }
  } else {
    dat_c <- data.table(y = numeric(0), feature = character(0))
  }

  # discrete
  if (has_discrete) {
    # ICE
    ice_discrete <- unique(collect_profiles(object[!idx_continuous_vars], generate_ice_data))
    ice_discrete[, `:=`(type = "ICE Profiles", line_id = as.character(line_id))]

    # PDP
    if (show_PDP) {
      pdp_discrete <- collect_profiles(object[!idx_continuous_vars], generate_pdp_data)
      pdp_discrete[, `:=`(line_id = "pdp", type = "Rough model (with SE)")]

      dat_d <- rbind(ice_discrete, pdp_discrete, fill = TRUE)
    } else {
      dat_d <- ice_discrete
    }
  } else {
    dat_d <- data.table(y = numeric(0), feature = character(0))
  }

  # resolve y limits
  y_limits <- resolve_y_limits(dat_c = dat_c, dat_d = dat_d, facet_scales = facet_spec$scales)

  # resolve facet sort - need to convert to factor
  dat_c[, feature := factor(feature, levels = names(object)[idx_continuous_vars])]
  dat_d[, feature := factor(feature, levels = names(object)[!idx_continuous_vars])]

  # resolve missings specification
  rms <- resolve_missings_specification(dat_c = dat_c, ms = missings_spec, missings = missings)
  dat_c <- rms$dat_c
  missings <- rms$missings

  # colours of the non-coloured variant; the PDP entry is only kept when the PDP is drawn
  type_colours <- c(
    "ICE Profiles" = if (show_PDP) "gray60" else "black",
    "Rough model (with SE)" = "blue"
  )[c(TRUE, show_PDP)]

  # Defined up front so that the `perc` element of the result exists (as NULL)
  # even when `coloured = TRUE` and there is no continuous data to plot.
  gc2 <- NULL

  # graph for continuous
  if (nrow(dat_c) > 0) {
    gc <- ggplot()

    if (show_PDP) {
      gc <- gc +
        geom_ribbon(
          mapping = aes(
            x = .data$x,
            ymin = .data$y - .data$y_se,
            ymax = .data$y + .data$y_se
          ),
          data = dat_c[line_id == "pdp"],
          na.rm = TRUE, alpha = 0.4, colour = "lightblue", fill = "lightblue"
        )
    }

    if (coloured) {
      gc <- gc +
        geom_line(
          mapping = aes(x = .data$x, y = .data$y, colour = .data$line_id),
          linewidth = 1,
          alpha = 0.3,
          data = dat_c,
          na.rm = FALSE
        )
    } else {
      gc <- gc +
        geom_line(
          mapping = aes(
            x = .data$x, y = .data$y,
            group = .data$line_id, colour = .data$type
          ),
          data = dat_c,
          na.rm = FALSE
        )
    }

    gc <- gc +
      geom_rug(
        mapping = aes(x = .data$x, y = .data$y),
        # one ICE line is enough to mark the x positions
        data = dat_c[line_id == 1],
        na.rm = FALSE,
        sides = rug_sides
      ) +
      facet_wrap(
        "feature",
        scales = rfs$scales, ncol = rfs$ncol_c,
        labeller = as_labeller(rfs$labels)
      )

    if (!coloured) {
      gc <- gc + scale_color_manual(values = type_colours, name = "")
    }

    # add missings lines
    if (!is.null(missings)) {
      if (missings_spec$vline) {
        gc <- gc +
          geom_vline(
            mapping = aes(xintercept = .data$x),
            data = missings,
            linetype = "dotted"
          )
      }
      if (missings_spec$hline) {
        gc <- gc +
          geom_hline(
            mapping = aes(yintercept = .data$y, linetype = "Score for Missing Feature"),
            data = missings
          ) +
          scale_linetype_manual(values = c("Score for Missing Feature" = "dotted"), name = NULL)
      }
    }

    gc <- gc +
      labs(x = "Features", y = "Feature Score", caption = "", colour = NULL) +
      ylim(y_limits) +
      theme_bw()

    if (show_PDP) {
      gc <- gc + guides(colour = guide_legend(override.aes = list(size = 1)))
    } else {
      gc <- gc + guides(colour = guide_none())
    }

    if (coloured) {
      # NOTE(review): the ECDF is computed on `x` pooled over all features, not
      # separately per facet - confirm that this is intended.
      dat_c$x_perc <- ecdf(dat_c$x)(dat_c$x)
      gc2 <- ggplot() +
        geom_line(
          mapping = aes(x = .data$x_perc, y = .data$y, colour = .data$line_id),
          linewidth = 1,
          alpha = 0.3,
          data = dat_c,
          na.rm = FALSE
        ) +
        geom_rug(
          mapping = aes(x = .data$x_perc, y = .data$y),
          data = dat_c[line_id == 1],
          na.rm = FALSE,
          sides = rug_sides
        ) +
        facet_wrap(
          "feature",
          scales = rfs$scales, ncol = rfs$ncol_c,
          labeller = as_labeller(rfs$labels)
        ) +
        labs(x = "Features", y = "Feature Score", caption = "", colour = NULL) +
        ylim(y_limits) +
        theme_bw() +
        guides(colour = guide_none())
    }
  } else {
    gc <- NULL
  }

  # graph for discrete
  if (nrow(dat_d) > 0) {
    gd <- ggplot()

    if (show_PDP) {
      # the ribbon needs a numeric x: use the position of each level within its feature
      dat_d[line_id == "pdp", x_ribbon := as.numeric(droplevels(as.factor(x))), by = .(feature)]
      gd <- gd +
        geom_point(
          mapping = aes(x = .data$x, y = .data$y, colour = .data$type),
          data = dat_d[line_id == "pdp"],
          na.rm = TRUE
        ) +
        geom_ribbon(
          mapping = aes(
            x = .data$x_ribbon,
            ymin = .data$y - .data$y_se,
            ymax = .data$y + .data$y_se
          ),
          data = dat_d[line_id == "pdp"],
          na.rm = TRUE, alpha = 0.4, colour = "lightblue", fill = "lightblue"
        )
    }

    if (coloured) {
      gd <- gd +
        geom_line(
          mapping = aes(x = .data$x, y = .data$y, colour = .data$line_id, group = .data$line_id),
          linewidth = 1,
          alpha = 0.3,
          data = dat_d,
          na.rm = FALSE
        )
    } else {
      gd <- gd +
        geom_line(
          mapping = aes(
            x = .data$x, y = .data$y,
            colour = .data$type, group = .data$line_id
          ),
          data = dat_d,
          na.rm = FALSE
        )
    }

    gd <- gd +
      facet_wrap(
        "feature", scales = rfs$scales, ncol = rfs$ncol_d,
        labeller = as_labeller(rfs$labels)
      )

    if (!coloured) {
      gd <- gd + scale_color_manual(values = type_colours, name = "")
    }

    gd <- gd +
      labs(x = "Features", y = "Feature Score", caption = "", colour = NULL, linetype = NULL) +
      ylim(y_limits) +
      theme_bw()

    if (show_PDP) {
      gd <- gd + guides(colour = guide_legend(override.aes = list(size = 1)))
    } else {
      gd <- gd + guides(colour = guide_none())
    }
  } else {
    gd <- NULL
  }

  if (coloured) {
    return(list(continuous = gc, discrete = gd, perc = gc2))
  } else {
    return(list(continuous = gc, discrete = gd))
  }
}


#' Plot component functions
#'
#' @param object Object of class `sculpture`.
#' @inheritParams g_ice
#'
#' @return List of `ggplot`s (one for continuous features, one for discrete).
#' @export
#'
#' @examples
#' df <- mtcars
#' df$vs <- as.factor(df$vs)
#' model <- rpart::rpart(
#'   hp ~ mpg + carb + vs,
#'   data = df,
#'   control = rpart::rpart.control(minsplit = 10)
#' )
#' model_predict <- function(x) predict(model, newdata = x)
#' covariates <- c("mpg", "carb", "vs")
#' pm <- sample_marginals(df[covariates], n = 50, seed = 5)
#'
#' rs <- sculpt_rough(
#'   dat = pm,
#'   model_predict_fun = model_predict,
#'   n_ice = 10,
#'   seed = 1,
#'   verbose = 0
#' )
#'
#' ds <- sculpt_detailed_gam(rs)
#'
#' g_component(ds)$continuous
#'
g_component <- function(object, rug_sides = "b",
                        missings_spec = missings_specification(),
                        facet_spec = facet_specification(),
                        logodds_to_prob = FALSE) {
  # Plots the component function of every feature of a `sculpture`: each
  # variable's `predict()` is evaluated at its own `x` values. Returns a list
  # with one ggplot for continuous and one for discrete features (`NULL` when
  # there is no data for it).
  checkmate::assert_class(object, "sculpture")
  checkmate::assert_character(rug_sides, any.missing = FALSE, len = 1)
  checkmate::assert_class(missings_spec, "missings_specification")
  checkmate::assert_class(facet_spec, "facet_specification")
  checkmate::assert_flag(logodds_to_prob)

  # transform missings into a list of values per each continuous variable
  check_continuous <- vapply(object, "[[", logical(1), "is_discrete")
  check_continuous <- names(Filter(isFALSE, check_continuous))
  if (length(missings_spec$values) == 1) {
    # a single value is applied to every continuous feature
    missings <- data.table(feature = check_continuous, x = missings_spec$values)
  } else if (length(missings_spec$values) > 1) {
    # several values: each one is matched to a feature through its name
    missings <- data.table(feature = names(missings_spec$values), x = missings_spec$values)
    checkmate::assert_names(
      missings$feature,
      subset.of = check_continuous,
      .var.name = "missings_spec$values"
    )
  } else {
    missings <- NULL
  }

  # https://cran.r-project.org/web/packages/data.table/vignettes/datatable-importing.html#globals
  feature <- NULL # due to NSE notes in R CMD check

  # resolve facet specification
  rfs <- resolve_facet_specification(obj = object, fs = facet_spec)
  object <- rfs$object

  # get continuous vars
  idx_continuous_vars <- !vapply(object, "[[", logical(1), "is_discrete")
  has_continuous <- any(idx_continuous_vars)
  has_discrete <- any(!idx_continuous_vars)

  # Evaluate each variable's component function at its own x values and stack
  # the results, keeping the variable name in column "feature".
  component_data <- function(vars) {
    rbindlist(
      lapply(
        vars,
        function(v) {
          y <- v$predict(v$x)
          if (logodds_to_prob) {
            y <- inv.logit(y)
          }
          data.table(x = v$x, y = y)
        }
      ),
      idcol = "feature"
    )
  }

  if (has_continuous) {
    dat_c <- component_data(object[idx_continuous_vars])
  } else {
    dat_c <- data.table(feature = character(0))
  }

  if (has_discrete) {
    dat_d <- component_data(object[!idx_continuous_vars])
  } else {
    dat_d <- data.table(feature = character(0))
  }

  # resolve y limits
  y_limits <- resolve_y_limits(dat_c = dat_c, dat_d = dat_d, facet_scales = facet_spec$scales)

  # resolve facet sort - need to convert to factor
  dat_c[, feature := factor(feature, levels = names(object)[idx_continuous_vars])]
  dat_d[, feature := factor(feature, levels = names(object)[!idx_continuous_vars])]

  # resolve missings specification
  rms <- resolve_missings_specification(dat_c = dat_c, ms = missings_spec, missings = missings)
  dat_c <- rms$dat_c
  missings <- rms$missings

  if (missings_spec$hline) {
    legend_model_name <- paste(stringr::str_to_title(class(object)[1]), "Model Component")
    # `legend_model_name` is a local constant, not a column of the plot data, so
    # it must not go through the `.data` pronoun (that lookup fails at render
    # time). Its value is the key used in `scale_linetype_manual()` below.
    line_mapping <- aes(
      x = .data$x, y = .data$y,
      group = .data$feature, linetype = legend_model_name
    )
  } else {
    line_mapping <- aes(x = .data$x, y = .data$y, group = .data$feature)
  }

  if (nrow(dat_c) > 0) {
    gc <- ggplot(dat_c) +
      geom_line(mapping = line_mapping) +
      geom_rug(
        mapping = aes(x = .data$x, y = .data$y),
        na.rm = FALSE,
        sides = rug_sides
      ) +
      facet_wrap(
        "feature", scales = rfs$scales, ncol = rfs$ncol_c,
        labeller = as_labeller(rfs$labels)
      )

    # add missings lines
    if (!is.null(missings)) {
      if (missings_spec$vline) {
        gc <- gc +
          geom_vline(
            mapping = aes(xintercept = .data$x),
            data = missings,
            linetype = "dotted"
          )
      }
      if (missings_spec$hline) {
        gc <- gc +
          geom_hline(
            mapping = aes(yintercept = .data$y, linetype = "Score for Missing Feature"),
            data = missings
          ) +
          scale_linetype_manual(
            values = structure(
              c("dotted", "solid"),
              names = c("Score for Missing Feature", legend_model_name)
            ),
            name = NULL
          )
      }
    }

    gc <- gc +
      labs(x = "Features", y = "Feature Score") +
      ylim(y_limits) +
      theme_bw()
  } else {
    gc <- NULL
  }

  if (nrow(dat_d) > 0) {
    gd <- ggplot(dat_d) +
      geom_line(aes(x = .data$x, y = .data$y, group = .data$feature)) +
      facet_wrap(
        "feature", scales = rfs$scales, ncol = rfs$ncol_d,
        labeller = as_labeller(rfs$labels)
      ) +
      labs(x = "Features", y = "Feature Score") +
      ylim(y_limits) +
      theme_bw()
  } else {
    gd <- NULL
  }

  return(list(continuous = gc, discrete = gd))
}

#' Plot comparison of component functions
#'
#' @param sculptures List of objects of class `sculpture`.
#' @param descriptions Character vector with model names. Same length as `sculptures`.
#' @inheritParams g_ice
#' @param hue_coloring Logical, use hue-based coloring?
#' Defaults to FALSE, meaning that predefined colors will be used instead.
#'
#' @details The first element of `sculptures` serves as the reference sculpture.
#' All other sculptures must contain a subset of the variables of the first one
#' (i.e. the same variables or fewer, but no new ones).
#' This makes it possible to visualize polished and non-polished sculptures together,
#' provided the non-polished one is specified first.
#'
#' @return List of `ggplot`s (one for continuous features, one for discrete).
#' @export
#'
#' @examples
#' df <- mtcars
#' df$vs <- as.factor(df$vs)
#' model <- rpart::rpart(
#'   hp ~ mpg + carb + vs,
#'   data = df,
#'   control = rpart::rpart.control(minsplit = 10)
#' )
#' model_predict <- function(x) predict(model, newdata = x)
#' covariates <- c("mpg", "carb", "vs")
#' pm <- sample_marginals(df[covariates], n = 50, seed = 5)
#'
#' rs <- sculpt_rough(
#'   dat = pm,
#'   model_predict_fun = model_predict,
#'   n_ice = 10,
#'   seed = 1,
#'   verbose = 0
#' )
#'
#' ds <- sculpt_detailed_gam(rs)
#'
#' # this keeps only "mpg"
#' ps <- sculpt_polished(ds, k = 1)
#'
#' # also define simple labels
#' labels <- structure(
#'   toupper(covariates), # labels
#'   names = covariates # current (old) names
#' )
#'
#' # Component functions of "Detailed" and "Polished" are the same for "mpg" variable,
#' # therefore red curve overlays the blue one for "mpg"
#' comp <- g_comparison(
#'   sculptures = list(rs, ds, ps),
#'   descriptions = c("Rough", "Detailed", "Polished"),
#'   facet_spec = facet_specification(ncol = 2, labels = labels)
#' )
#' comp$continuous
#' comp$discrete
#'
g_comparison <- function(sculptures, descriptions, rug_sides = "b",
                         missings_spec = missings_specification(),
                         facet_spec = facet_specification(),
                         hue_coloring = FALSE,
                         logodds_to_prob = FALSE) {
  # Overlays the component functions of several sculptures, one colour per
  # entry of `descriptions`. The first sculpture is the reference: it defines
  # the set of features, the continuous / discrete split and the facet order.
  # Returns a list with one ggplot for continuous and one for discrete features
  # (`NULL` when there is no data for it).
  # `min.len = 1`: the first sculpture is indexed unconditionally below.
  checkmate::assert_list(sculptures, types = "sculpture", min.len = 1)
  checkmate::assert_character(descriptions, len = length(sculptures))
  checkmate::assert_character(rug_sides, any.missing = FALSE, len = 1)
  # validated as in g_ice() / g_component(); its fields are read below
  checkmate::assert_class(missings_spec, "missings_specification")
  checkmate::assert_class(facet_spec, "facet_specification")
  checkmate::assert_flag(hue_coloring)
  checkmate::assert_flag(logodds_to_prob)

  names_sc_1 <- names(sculptures[[1]])
  check_names <- vapply(sculptures, function(sc) all(names(sc) %in% names_sc_1), logical(1))
  if (!all(check_names)) {
    stop("All sculptures must be subsets of the first sculpture (in terms of variables).")
  }

  # transform missings into a list of values per each continuous variable
  check_continuous <- vapply(sculptures[[1]], "[[", logical(1), "is_discrete")
  check_continuous <- names(Filter(isFALSE, check_continuous))
  if (length(missings_spec$values) == 1) {
    # a single value is applied to every continuous feature
    missings <- data.table(feature = check_continuous, x = missings_spec$values)
  } else if (length(missings_spec$values) > 1) {
    # several values: each one is matched to a feature through its name
    missings <- data.table(feature = names(missings_spec$values), x = missings_spec$values)
    checkmate::assert_names(
      missings$feature,
      subset.of = check_continuous,
      .var.name = "missings_spec$values"
    )
  } else {
    missings <- NULL
  }

  # https://cran.r-project.org/web/packages/data.table/vignettes/datatable-importing.html#globals
  feature <- Model <- NULL # due to NSE notes in R CMD check

  # resolve facet specification
  rfs <- resolve_facet_specification(obj = sculptures[[1]], fs = facet_spec)
  sculptures[[1]] <- rfs$object

  # get continuous vars
  idx_continuous_vars <- !vapply(sculptures[[1]], "[[", logical(1), "is_discrete")
  has_continuous <- any(idx_continuous_vars)
  has_discrete <- any(!idx_continuous_vars)

  # Stack the component functions of all sculptures for either the discrete
  # (`discrete = TRUE`) or the continuous (`discrete = FALSE`) variables; every
  # row is tagged with its feature name and the description of its sculpture.
  stack_components <- function(discrete) {
    stacked <- rbindlist(
      lapply(
        seq_along(sculptures),
        function(i) {
          sc <- sculptures[[i]]
          is_discrete <- vapply(sc, "[[", logical(1), "is_discrete")
          rbindlist(
            lapply(
              sc[if (discrete) is_discrete else !is_discrete],
              function(v) {
                y <- v$predict(v$x)
                if (logodds_to_prob) {
                  y <- inv.logit(y)
                }
                data.table(x = v$x, y = y, Model = descriptions[i])
              }
            ),
            idcol = "feature"
          )
        }
      )
    )
    stacked$Model <- factor(stacked$Model, levels = descriptions)
    stacked
  }

  if (has_discrete) {
    dat_d <- stack_components(discrete = TRUE)
  } else {
    dat_d <- data.table(feature = character(0))
  }

  if (has_continuous) {
    dat_c <- stack_components(discrete = FALSE)
  } else {
    dat_c <- data.table(feature = character(0))
  }

  # resolve y limits
  y_limits <- resolve_y_limits(dat_c = dat_c, dat_d = dat_d, facet_scales = facet_spec$scales)

  # resolve facet sort - need to convert to factor
  dat_c[, feature := factor(feature, levels = names(sculptures[[1]])[idx_continuous_vars])]
  dat_d[, feature := factor(feature, levels = names(sculptures[[1]])[!idx_continuous_vars])]

  # resolve missings specification
  rms <- resolve_missings_specification(dat_c = dat_c, ms = missings_spec, missings = missings)
  dat_c <- rms$dat_c
  missings <- rms$missings

  # one colour per sculpture, keyed by its description
  colours <- structure(
    ms_color(length(sculptures), hue_coloring = hue_coloring),
    names = descriptions
  )

  if (nrow(dat_c) > 0) {
    gc <- ggplot(dat_c) +
      geom_line(
        aes(
          x = .data$x,
          y = .data$y,
          colour = .data$Model,
          group = interaction(.data$feature, .data$Model)
        )
      ) +
      geom_rug(
        mapping = aes(x = .data$x, y = .data$y),
        # the rug shows the x values of the reference (first) sculpture only
        data = dat_c[Model == descriptions[1]],
        na.rm = FALSE,
        sides = rug_sides
      ) +
      facet_wrap(
        "feature", scales = rfs$scales, ncol = rfs$ncol_c,
        labeller = as_labeller(rfs$labels)
      ) +
      scale_color_manual(values = colours)

    # add missings lines
    if (!is.null(missings)) {
      if (missings_spec$vline) {
        gc <- gc +
          geom_vline(
            mapping = aes(xintercept = .data$x),
            data = missings,
            linetype = "dotted"
          )
      }
      if (missings_spec$hline) {
        # NOTE(review): `color = Model` presumably relies on
        # resolve_missings_specification() adding a `Model` column to
        # `missings` - verify against that helper.
        gc <- gc +
          geom_hline(
            mapping = aes(
              yintercept = .data$y, linetype = "Score for Missing Feature",
              color = Model
            ),
            data = missings
          ) +
          scale_linetype_manual(values = c("Score for Missing Feature" = "dotted"), name = NULL)
      }
    }

    gc <- gc +
      labs(x = "Features", y = "Feature Score") +
      ylim(y_limits) +
      theme_bw()
  } else {
    gc <- NULL
  }

  if (nrow(dat_d) > 0) {
    gd <- ggplot(dat_d) +
      geom_line(
        aes(
          x = .data$x,
          y = .data$y,
          colour = .data$Model,
          group = interaction(.data$feature, .data$Model)
        )
      ) +
      facet_wrap(
        "feature", scales = rfs$scales, ncol = rfs$ncol_d,
        labeller = as_labeller(rfs$labels)
      ) +
      scale_color_manual(values = colours) +
      labs(x = "Features", y = "Feature Score") +
      ylim(y_limits) +
      theme_bw()
  } else {
    gd <- NULL
  }

  return(list(continuous = gc, discrete = gd))
}

#' Create ICE curves at quantiles
#' @keywords internal
#'
#' @param object Object of class sculpture (rough, detailed)
#' @param new_data Data to make quantiles on
#' @param var_name String specifying which variable to generate ICE
#' @param qtiles Quantiles to generate ICE curves
#' @param task Prediction task type (regression or classification)
#'
#' @return Predictions
#'
#' @details
#' In principle this should work for any first-order model without interaction terms;
#' however, this is not implemented yet (for example, handling the output of the
#' `predict()` function for a binary endpoint).
#'
calc_ice_quantile <- function(object, new_data, var_name, qtiles = seq(0, 1, by = 0.1),
                              task = "regression") {
  # Builds ICE-like curves at the requested quantiles for `var_name`:
  #   1. predict for all rows with `var_name` fixed at its first value and take
  #      the `qtiles` quantiles of those predictions;
  #   2. predict along the unique values of `var_name` (other columns taken from
  #      row 1) and express each prediction as a shift from the first one;
  #   3. cross-join both and add the shift to every quantile level.
  # Returns a data.frame with columns `pred_at_1st`, `qtile`, `<var_name>`,
  # `pred_adjust` and `pred`.
  checkmate::assert_class(object, "sculpture")
  # Keep the matched value: without the assignment a partially matched or
  # vector-valued `task` passes validation but breaks the comparison below.
  task <- match.arg(task, c("regression", "classification"))

  # https://cran.r-project.org/web/packages/data.table/vignettes/datatable-importing.html#globals
  # NOTE(review): `quantile()` is still callable below because R skips
  # non-function bindings when it resolves a name in call position.
  median <- quantile <- rgb <- NULL # due to NSE notes in R CMD check

  # Predict for all samples after replacing the variable with 1st value,
  # then take quantiles
  cov_1st_val <- new_data[[var_name]][1]

  new_data_with_1st_val <- new_data
  new_data_with_1st_val[[var_name]] <- cov_1st_val

  pred_at_1st_val <- predict(object, new_data_with_1st_val)

  pred_qtile_at_1st_qtiles <- quantile(pred_at_1st_val, qtiles)

  pred_at_1st_qtiles <- data.frame(pred_at_1st = pred_qtile_at_1st_qtiles, qtile = qtiles)

  # Separately predict for all values of the cov of interest.
  # The two inputs share no column, so merge() returns their cross product.
  # `drop = FALSE` keeps a data.frame (and the column name) even when only one
  # other covariate is left.
  preds_for_adjust_1 <- merge(
    new_data[1, setdiff(colnames(new_data), var_name), drop = FALSE],
    unique(new_data[var_name])
  )
  preds_for_adjust_1$pred <- predict(object, newdata = preds_for_adjust_1)


  # Get the pred at the first element, because
  # it is what was selected for cov_1st_val
  pred_for_adjust_at_1st <- preds_for_adjust_1$pred[1]

  preds_for_adjust <- preds_for_adjust_1[var_name]
  preds_for_adjust$pred_adjust <- preds_for_adjust_1$pred - pred_for_adjust_at_1st

  # Combine the above 2 to make quantile lines (again a cross product)
  pred_ice_qtile <- merge(
    pred_at_1st_qtiles,
    preds_for_adjust
  )
  pred_ice_qtile$pred <- pred_ice_qtile$pred_at_1st + pred_ice_qtile$pred_adjust

  # Convert to probabilities if classification
  if (task == "classification") {
    pred_ice_qtile$pred <- inv.logit(pred_ice_qtile$pred)
  }

  return(pred_ice_qtile)
}


#' Create density curves
#' @keywords internal
#'
#' @param new_data_with_pred Data with prediction to make density calculations on
#' @param var_name String specifying which variable to calculate density
#' @param vec_y_expand Optional values to expand y-axis
#' @return Density data for plotting
#'
#' @details
#' It should be amenable to any 1st-order model without interaction terms,
#' however not implemented yet, such as handling `predict()` function output
#' for binary endpoint
#'
calc_density <- function(new_data_with_pred, var_name,
                         vec_y_expand = NULL) {
  # Estimates the 2d kernel density of (`var_name`, `pred`) on a 100 x 100 grid
  # and returns it in long format with columns `x`, `y` and `z`.
  covariate <- new_data_with_pred[[var_name]]
  prediction <- new_data_with_pred$pred

  # grid limits: data range widened by 50% (x) and 10% (y) on both sides;
  # `vec_y_expand` can stretch the y range beyond the predictions
  x_axis_range_data <- range(covariate)
  y_axis_range_data <- range(c(prediction, vec_y_expand))
  grid_limits <- c(
    expand_range(x_axis_range_data, 0.5, 0.5),
    expand_range(y_axis_range_data, 0.1, 0.1)
  )

  # MASS::bandwidth.nrd is quantile based, so it returns 0 when most of the
  # data shares one value (e.g. covariate == 0); use 25% of the range instead
  bandwidth_x <- MASS::bandwidth.nrd(covariate)
  if (isTRUE(bandwidth_x == 0)) {
    bandwidth_x <- diff(x_axis_range_data) * 0.25
  }
  bandwidth_y <- MASS::bandwidth.nrd(prediction)
  if (isTRUE(bandwidth_y == 0)) {
    bandwidth_y <- diff(y_axis_range_data) * 0.25
  }

  density_est <- MASS::kde2d(
    x = covariate,
    y = prediction,
    n = 100,
    lims = grid_limits,
    h = c(bandwidth_x, bandwidth_y)
  )

  # long format, one row per grid cell, sorted by x then y
  density_data <- expand.grid(x = density_est$x, y = density_est$y)
  density_data$z <- as.vector(density_est$z)
  density_data <- density_data[order(density_data$x, density_data$y), ]

  # dummy value so that the fill legend goes down to 0.0
  density_data$z[1] <- 0

  density_data
}


#' Widen the range of a numeric vector (used for density plot limits)
#' @keywords internal
#'
#' @param x Numeric vector; only its minimum and maximum are used.
#' @param expand_left_side Amount by which the lower limit is moved to the left.
#' @param expand_right_side Amount by which the upper limit is moved to the right.
#' @param type `"relative"` (default): both amounts are fractions of the span
#' `max(x) - min(x)`. `"absolute"`: both amounts are used as given.
#'
#' @return Numeric vector of 2 values: the widened lower and upper limit.
#'
#'
expand_range <- function(x, expand_left_side = 0.1, expand_right_side = 0.2,
                         type = c("relative", "absolute")) {
  type <- match.arg(type)

  limits <- range(x)
  # relative amounts are scaled by the span of the data, absolute ones are not
  unit <- switch(type,
    relative = diff(limits),
    absolute = 1
  )

  c(limits[1] - expand_left_side * unit, limits[2] + expand_right_side * unit)
}

#' Density plots overlaid with ICE curves
#'
#' Create density plot for the data, overlaid with ICE curves at quantiles
#' of the variable(s) of interest.
#'
#'
#' @name g_density_ice
NULL


#' @rdname g_density_ice
#' @export
#'
#' @param object Object of class sculpture (rough, detailed)
#' @param new_data Data to make quantiles on
#' @param var_name String specifying which variable to generate ICE
#' @param var_label String (optional) specifying variable label (x label of the plot)
#' @param qtiles Quantiles to generate ICE curves
#' @param task Prediction task type (regression or classification)
#'
#' @return [g_density_ice_plot()]: ggplot object
#'
#' @details
#' [g_density_ice_plot()] creates a density plot for a single variable.
#'
#' [g_density_ice_plot_list()] creates a list of density plots for multiple variables.
#'
#' These functions should be amenable to any 1st-order model without interaction terms,
#' however not implemented yet, such as handling `predict()` function output
#' for binary endpoint
#'
#'
#' @examples
#' \dontrun{
#' df <- mtcars
#' df$cyl <- as.factor(df$cyl)
#' model <- lm(hp ~ ., data = df)
#' model_predict <- function(x) predict(model, newdata = x)
#' covariates <- setdiff(colnames(df), "hp")
#' pm <- sample_marginals(df[covariates], n = 50, seed = 5)
#'
#' rs <- sculpt_rough(
#'   dat = pm,
#'   model_predict_fun = model_predict,
#'   n_ice = 5,
#'   seed = 1,
#'   verbose = 0
#' )
#'
#' g_density_ice_plot(rs, new_data = pm, var_name = "mpg")
#' g_list <- g_density_ice_plot_list(
#'   rs, new_data = pm, var_names = c("mpg", "cyl", "disp", "drat")
#' )
#' grid::grid.draw(gridExtra::arrangeGrob(grobs = g_list))
#' }
#'
g_density_ice_plot <- function(object, new_data, var_name, var_label = NULL,
                               qtiles = seq(0, 1, by = 0.1),
                               task = c("regression", "classification")) {
  checkmate::assert_class(object, "sculpture")
  new_data <- check_data(new_data)
  checkmate::assert_string(var_name)
  # Fail early with a clear message: an unknown column would otherwise give
  # `new_data[[var_name]] == NULL`, be treated as a discrete variable and only
  # break further down with an unrelated error.
  checkmate::assert_choice(var_name, colnames(new_data))
  checkmate::assert_string(var_label, null.ok = TRUE)
  checkmate::assert_numeric(qtiles, lower = 0, upper = 1)
  checkmate::assert_character(task)

  task <- match.arg(task)

  # https://cran.r-project.org/web/packages/data.table/vignettes/datatable-importing.html#globals
  x <- y <- z <- pred <- qtile <- NULL # due to NSE notes in R CMD check

  # The x-axis label defaults to the variable name
  if (is.null(var_label)) {
    var_label <- var_name
  }

  # Everything that is not numeric is plotted on integer positions (see below)
  is_var_discrete <- !is.numeric(new_data[[var_name]])

  # Predictions of the sculpture for the supplied data
  new_data_with_pred <- new_data
  new_data_with_pred$pred <- predict(object, newdata = new_data)

  if (task == "classification") {
    new_data_with_pred$pred <- inv.logit(new_data_with_pred$pred)
  }

  # ICE curves at the requested quantiles
  pred_ice_qtile <- calc_ice_quantile(
    object, new_data,
    var_name = var_name, qtiles = qtiles, task = task
  )

  # Discrete variables are replaced by their factor codes, so that the density
  # and the lines can be drawn on a continuous x scale
  if (is_var_discrete) {
    new_data_with_pred[[var_name]] <- as.numeric(as.factor(new_data_with_pred[[var_name]]))
    pred_ice_qtile[[var_name]] <- as.numeric(as.factor(pred_ice_qtile[[var_name]]))
  }

  # 2d density of (variable, prediction); the y range also covers the ICE curves
  density_data <- calc_density(
    new_data_with_pred,
    var_name = var_name,
    vec_y_expand = pred_ice_qtile$pred
  )

  # Plot limits: discrete variables get 0.3 units on both sides, continuous
  # variables get 15% of extra room on the right hand side only
  x_axis_range_data <- range(new_data_with_pred[[var_name]])
  if (is_var_discrete) {
    x_axis_range_plot <- expand_range(x_axis_range_data, 0.3, 0.3, type = "absolute")
  } else {
    x_axis_range_plot <- expand_range(x_axis_range_data, 0, 0.15)
  }
  y_axis_range_plot <- range(c(new_data_with_pred$pred, pred_ice_qtile$pred))

  # Text labels are attached to the right-most point of the 0%, 50% and 100% curves
  ggrepel_data <- pred_ice_qtile[
    pred_ice_qtile$qtile %in% c(0, 0.5, 1) &
      pred_ice_qtile[[var_name]] == max(pred_ice_qtile[[var_name]]),
  ]

  density_plot <- ggplot(density_data, aes(x = x, y = y)) +
    geom_raster(aes(fill = z), interpolate = TRUE) +
    labs(x = var_label, y = "Predicted Value", fill = "Density") +
    scale_fill_viridis_c() +
    coord_cartesian(xlim = x_axis_range_plot, ylim = y_axis_range_plot) +
    theme(
      panel.ontop = TRUE,
      panel.background = element_rect(color = NA, fill = NA),
      panel.grid.major = element_line(color = grDevices::rgb(1, 1, 1, 0.1)),
      panel.grid.minor = element_line(color = grDevices::rgb(1, 1, 1, 0.1))
    ) +
    # thin lines: all quantiles
    geom_line(
      data = pred_ice_qtile,
      aes(x = .data[[var_name]], y = pred, group = qtile), linewidth = 0.3,
      color = "grey70"
    ) +
    # thick lines: 0%, 50% and 100% quantiles
    geom_line(
      data = pred_ice_qtile[pred_ice_qtile$qtile %in% c(0, 0.5, 1), ],
      aes(x = .data[[var_name]], y = pred, group = qtile), linewidth = 0.7,
      color = "grey70"
    ) +
    ggrepel::geom_text_repel(
      data = ggrepel_data,
      aes(x = .data[[var_name]], y = pred, label = paste0(round(qtile * 100), "%")),
      color = "grey70",
      box.padding = unit(0.25, "lines"),
      point.padding = unit(0.25, "lines"),
      segment.linetype = "dotted",
      min.segment.length = unit(0, "lines"),
      nudge_x = diff(range(x_axis_range_plot)) / 6,
      direction = "y", hjust = "right"
    )

  if (is_var_discrete) {
    # Label the integer positions with the original levels
    var_levels <- levels(as.factor(new_data[[var_name]]))
    density_plot <- density_plot +
      scale_x_continuous(
        breaks = seq_along(var_levels),
        labels = var_levels,
        minor_breaks = NULL
      )
  }

  density_plot
}


#' @rdname g_density_ice
#' @export
#'
#' @param var_names Vector of strings specifying the variables to generate ICE plots for.
#' @param var_labels Named vector of strings specifying variable labels; the names
#' are matched against `var_names`. Variables without a (non-missing) entry are
#' labelled with the variable name.
#'
#' @return [g_density_ice_plot_list()]: list of ggplot objects
#'
g_density_ice_plot_list <- function(object, new_data, var_names, var_labels = NULL,
                                    qtiles = seq(0, 1, by = 0.1),
                                    task = c("regression", "classification")) {
  checkmate::assert_class(object, "sculpture")
  new_data <- check_data(new_data)
  checkmate::assert_character(var_names)
  checkmate::assert_character(var_labels, null.ok = TRUE)
  checkmate::assert_numeric(qtiles, lower = 0, upper = 1)
  checkmate::assert_character(task)

  task <- match.arg(task)

  out <- vector("list", length(var_names))
  names(out) <- var_names

  for (var_name in var_names) {
    # `var_labels[var_name]` is NULL when no labels were supplied and NA when the
    # vector is unnamed or has no entry for this variable. Pass NULL in both cases
    # so that the single-plot function falls back to the variable name instead of
    # failing its string assertion on NA.
    var_label <- var_labels[var_name]
    if (length(var_label) == 0 || is.na(var_label)) {
      var_label <- NULL
    }

    out[[var_name]] <-
      g_density_ice_plot(object, new_data, var_name, var_label, qtiles, task)
  }

  out
}

# Private store for the cluster created by `parallel_set()`. Keeping a reference
# allows `parallel_end()` to shut the worker processes down again.
.parallel_state <- new.env(parent = emptyenv())

#' Set and end parallel computation
#'
#' @param num_cores (`integer`) Number of cores.
#' @param cluster_type (`character`) Type of cluster. One of `c("fork", "psock")`.
#'
#' @details
#' `parallel_set()` first calls `parallel_end()`, then creates a cluster of the
#' requested type and registers it as the parallel backend.
#'
#' `parallel_end()` registers the sequential backend again and stops the cluster
#' that was created by the last call of `parallel_set()`, if there is one.
#'
#' @export
#' @examples
#' \dontrun{
#' parallel_set(num_cores = 2)
#' # now the code will run on parallel with 2 cores
#' parallel_end()
#' # now the code will run sequentially
#' }
parallel_set <- function(num_cores = 10, cluster_type = "fork") {
  checkmate::assert_integerish(num_cores, lower = 1, any.missing = FALSE, len = 1)
  cluster_type <- match.arg(cluster_type, choices = c("fork", "psock"))

  # Tear down whatever was set up before, so repeated calls do not pile up workers
  parallel_end()

  if (cluster_type == "fork") {
    cl <- parallel::makeForkCluster(num_cores)
  } else {
    cl <- parallel::makePSOCKcluster(num_cores)
  }

  # Remember the cluster before registering it, so that it can be stopped later
  # even if the registration fails
  .parallel_state$cluster <- cl

  doParallel::registerDoParallel(cl)

  message(paste("Using", foreach::getDoParWorkers(), "cores")) # should be == num_cores
}

#' @rdname parallel_set
#' @export
parallel_end <- function() {
  if (foreach::getDoParRegistered()) {
    foreach::registerDoSEQ()
  }

  # Stop the workers started by `parallel_set()`; without this the processes
  # would stay alive until the R session ends
  cl <- .parallel_state$cluster
  if (!is.null(cl)) {
    .parallel_state$cluster <- NULL
    tryCatch(
      parallel::stopCluster(cl),
      error = function(e) {
        warning(
          "Failed to stop the parallel cluster: ", conditionMessage(e),
          call. = FALSE
        )
      }
    )
  }

  invisible(NULL)
}


# Pick the foreach operator: `%dopar%` only if a parallel backend is registered
# and the caller allows parallel execution, `%do%` otherwise.
define_foreach_operand <- function(allow_par = FALSE) {
  run_parallel <- foreach::getDoParRegistered() && allow_par
  if (!run_parallel) {
    return(foreach::`%do%`)
  }
  foreach::`%dopar%`
}

# Jitter plot of `pdp_c` per feature.
# If `pdp_plot_sample` is set and `dt` has more than 4e4 rows, only 4e4 rows
# (drawn with a fixed seed) are plotted - this ensures faster rendering.
g_pdp <- function(dt, pdp_plot_sample, feat_labels) {
  if (pdp_plot_sample && nrow(dt) > 4e4) {
    set.seed(101)
    plot_data <- dt[sample(nrow(dt), 4e4), ]
    point_alpha <- 0.7
  } else {
    plot_data <- dt
    point_alpha <- 0.2
  }

  # y axis: features, with the level order reversed
  ggplot(
    data = plot_data,
    mapping = aes(y = factor(.data$feature, levels = rev(levels(.data$feature))), x = .data$pdp_c)
  ) +
    geom_jitter(
      shape = 16, size = 1.5, alpha = point_alpha,
      position = position_jitter(seed = 1)
    ) +
    # axis labels are looked up in `feat_labels`
    scale_y_discrete(labels = function(x) feat_labels[x]) +
    labs(x = "Feature Score", y = "Feature") +
    theme_bw()
}

# Dot plot of the variance per feature with the value printed next to each point.
g_imp_abs <- function(dat_var, show_pdp_plot, textsize) {
  top_variance <- max(dat_var$variance)
  nudge_x <- top_variance / 5
  dat_var$variance_vs_top <- dat_var$variance / top_variance

  # The y-axis title is dropped when `show_pdp_plot` is set
  y_title <- ifelse(show_pdp_plot, "", "Feature")

  # Labels are nudged to the right by `nudge_x`; for points above half of the
  # largest variance the position is first moved left by 2 * nudge_x, so these
  # labels end up on the left of the point
  value_labels <- geom_text(
    aes(
      x = ifelse(
        .data$variance_vs_top > 0.5,
        .data$variance - 2 * nudge_x,
        .data$variance
      ),
      label = format(round(.data$variance, 3), nsmall = 3, digits = 3)
    ),
    nudge_x = nudge_x,
    size = round(textsize / 3)
  )

  ggplot(
    dat_var,
    aes(y = factor(.data$feature, levels = rev(levels(.data$feature))), x = .data$variance)
  ) +
    geom_point() +
    value_labels +
    labs(x = "Direct Variable Importance", y = y_title) +
    theme_bw()
}

# Dot plot of `ratio` per feature on a fixed [0, 1] axis, labelled in percent.
g_imp_norm <- function(dat_var, show_pdp_plot, textsize) {
  # The y-axis title is dropped when `show_pdp_plot` is set
  y_title <- ifelse(show_pdp_plot, "", "Feature")

  # Labels are nudged right by 0.2; for ratios above 0.75 the position is first
  # moved left by 0.4, so these labels end up on the left of the point
  percent_labels <- geom_text(
    aes(
      x = ifelse(.data$ratio > 0.75, .data$ratio - 0.4, .data$ratio),
      label = sprintf("%.1f%%", round(.data$ratio * 100, 1))
    ),
    nudge_x = 0.2,
    size = round(textsize / 3)
  )

  ggplot(
    dat_var,
    aes(y = factor(.data$feature, levels = rev(levels(.data$feature))), x = .data$ratio)
  ) +
    geom_point() +
    percent_labels +
    xlim(c(0, 1)) +
    labs(x = "Direct Variable Importance", y = y_title) +
    theme_bw()
}

# Dot plot with two layers per feature: small grey points for `var_y` taken from
# `vars` and larger black points for `mean_var_y` taken from `vars_mean`.
g_imp_ice <- function(vars, vars_mean) {
  single_points <- geom_point(
    aes(y = factor(.data$feature, levels = rev(levels(.data$feature))), x = .data$var_y),
    data = vars,
    size = 1,
    colour = "gray50"
  )

  mean_points <- geom_point(
    aes(y = factor(.data$feature, levels = rev(levels(.data$feature))), x = .data$mean_var_y),
    data = vars_mean,
    size = 2,
    colour = "black"
  )

  # The mean layer is added last and is therefore drawn on top
  ggplot() +
    single_points +
    mean_points +
    labs(x = "Direct Variable Importance", y = "Feature") +
    theme_bw()
}

# Dot plot of `R2` per feature on a fixed [0, 1] axis, labelled in percent.
# The y-axis ticks and texts are removed.
g_cumulR2 <- function(dat_R2_cumul, textsize) {
  # Labels are nudged left by 0.2; for R2 below 0.25 the position is first
  # moved right by 0.4, so these labels end up on the right of the point
  percent_labels <- geom_text(
    aes(
      x = ifelse(.data$R2 < 0.25, .data$R2 + 0.4, .data$R2),
      label = sprintf("%.1f%%", round(.data$R2 * 100, 1))
    ),
    nudge_x = -0.2,
    size = round(textsize / 3)
  )

  ggplot(
    dat_R2_cumul,
    aes(y = factor(.data$feature, levels = rev(levels(.data$feature))), x = round(.data$R2, 4))
  ) +
    geom_point() +
    percent_labels +
    xlim(c(0, 1)) +
    labs(x = expression("Cumulative Approximation " * R^2), y = "") +
    theme_bw() +
    theme(axis.ticks.y = element_blank(), axis.text.y = element_blank())
}

1		g_pdp <- function(dt, pdp_plot_sample, feat_labels) {
2		# pdp_plot_sample ensures faster rendering
3	2x	if (pdp_plot_sample && nrow(dt) > 4e4) {
4	!	set.seed(101)
5	!	g <- ggplot(
6	!	data = dt[sample(nrow(dt), 4e4), ],
7	!	mapping = aes(y = factor(.data$feature, levels = rev(levels(.data$feature))), x = .data$pdp_c)
8		) +
9	!	geom_jitter(shape = 16, size = 1.5, alpha = 0.7, position = position_jitter(seed = 1))
10		} else {
11	2x	g <- ggplot(
12	2x	data = dt,
13	2x	mapping = aes(y = factor(.data$feature, levels = rev(levels(.data$feature))), x = .data$pdp_c)
14		) +
15	2x	geom_jitter(shape = 16, size = 1.5, alpha = 0.2, position = position_jitter(seed = 1))
16		}
17
18	2x	g <- g +
19	2x	scale_y_discrete(labels = function(x) feat_labels[x]) +
20	2x	labs(x = "Feature Score", y = "Feature") +
21	2x	theme_bw()
22
23	2x	return(g)
24		}
25
26		g_imp_abs <- function(dat_var, show_pdp_plot, textsize) {
27	1x	nudge_x <- max(dat_var$variance) / 5
28	1x	dat_var$variance_vs_top <- dat_var$variance / max(dat_var$variance)
29
30	1x	g <- ggplot(
31	1x	dat_var,
32	1x	aes(y = factor(.data$feature, levels = rev(levels(.data$feature))), x = .data$variance)
33		) +
34	1x	geom_point() +
35	1x	geom_text(
36	1x	aes(
37	1x	x = ifelse(
38	1x	.data$variance_vs_top > 0.5,
39	1x	.data$variance - 2 * nudge_x,
40	1x	.data$variance
41		),
42	1x	label = format(round(.data$variance, 3), nsmall = 3, digits = 3)
43		),
44	1x	nudge_x = nudge_x,
45	1x	size = round(textsize / 3)
46		) +
47	1x	labs(
48	1x	x = "Direct Variable Importance",
49	1x	y = ifelse(show_pdp_plot, "", "Feature")
50		) +
51	1x	theme_bw()
52	1x	return(g)
53		}
54
55		g_imp_norm <- function(dat_var, show_pdp_plot, textsize) {
56	6x	g <- ggplot(
57	6x	dat_var,
58	6x	aes(y = factor(.data$feature, levels = rev(levels(.data$feature))), x = .data$ratio)
59		) +
60	6x	geom_point() +
61	6x	geom_text(
62	6x	aes(
63	6x	x = ifelse(.data$ratio > 0.75, .data$ratio - 0.4, .data$ratio),
64	6x	label = sprintf("%.1f%%", round(.data$ratio * 100, 1))
65		),
66	6x	nudge_x = 0.2,
67	6x	size = round(textsize / 3)
68		) +
69	6x	xlim(c(0, 1)) +
70	6x	labs(
71	6x	x = "Direct Variable Importance",
72	6x	y = ifelse(show_pdp_plot, "", "Feature")
73		) +
74	6x	theme_bw()
75	6x	return(g)
76		}
77
78		g_imp_ice <- function(vars, vars_mean) {
79	4x	g <- ggplot() +
80	4x	geom_point(
81	4x	aes(y = factor(.data$feature, levels = rev(levels(.data$feature))), x = .data$var_y),
82	4x	data = vars,
83	4x	size = 1,
84	4x	colour = "gray50"
85		) +
86	4x	geom_point(
87	4x	aes(y = factor(.data$feature, levels = rev(levels(.data$feature))), x = .data$mean_var_y),
88	4x	data = vars_mean,
89	4x	size = 2,
90	4x	colour = "black"
91		) +
92	4x	labs(
93	4x	x = "Direct Variable Importance",
94	4x	y = "Feature"
95		) +
96	4x	theme_bw()
97	4x	return(g)
98		}
99
100		g_cumulR2 <- function(dat_R2_cumul, textsize) {
101	11x	g <- ggplot(
102	11x	dat_R2_cumul,
103	11x	aes(y = factor(.data$feature, levels = rev(levels(.data$feature))), x = round(.data$R2, 4))
104		) +
105	11x	geom_point() +
106	11x	geom_text(
107	11x	aes(
108	11x	x = ifelse(.data$R2 < 0.25, .data$R2 + 0.4, .data$R2),
109	11x	label = sprintf("%.1f%%", round(.data$R2 * 100, 1))
110		),
111	11x	nudge_x = -0.2,
112	11x	size = round(textsize / 3)
113		) +
114	11x	xlim(c(0, 1)) +
115	11x	labs(
116	11x	x = expression("Cumulative Approximation " * R^2),
117	11x	y = ""
118		) +
119	11x	theme_bw() +
120	11x	theme(axis.ticks.y = element_blank(), axis.text.y = element_blank())
121	11x	return(g)
122		}

1		# sculpture metrics --------
2
3
4		#' Various metrics related to model sculpting
5		#'
6		#' @name var_imp
7		#'
8		#' @param object `sculpture`
9		#' @param newdata (Optional) Data to calculate the importance from.
10		#' If omitted, the data that were provided to build the sculpture are used.
11		#'
12		#' @return `data.table` with direct requested metrics.
13		#'
14		#' @examples
15		#' df <- mtcars
16		#' df$vs <- as.factor(df$vs)
17		#' model <- rpart::rpart(
18		#' hp ~ mpg + carb + vs,
19		#' data = df,
20		#' control = rpart::rpart.control(minsplit = 10)
21		#' )
22		#' model_predict <- function(x) predict(model, newdata = x)
23		#' covariates <- c("mpg", "carb", "vs")
24		#' pm <- sample_marginals(df[covariates], n = 50, seed = 5)
25		#'
26		#' rs <- sculpt_rough(
27		#' dat = pm,
28		#' model_predict_fun = model_predict,
29		#' n_ice = 10,
30		#' seed = 1,
31		#' verbose = 0
32		#' )
33		#'
34		#' # show direct variable importance
35		#' calc_dir_var_imp(rs)
36		#'
37		#' # show cumulative approximation R^2
38		#' calc_cumul_R2(rs)
39		NULL
40
41
42		calc_dir_var_imp_pdp <- function(dt) {
43	83x	stopifnot(
44	83x	all(c("rn", "feature", "pdp_c") %in% colnames(dt)),
45	83x	!"total" %in% tolower(unique(dt$feature)),
46	83x	nrow(dt) == nrow(unique(dt[, .(rn, feature)]))
47		)
48
49		# https://cran.r-project.org/web/packages/data.table/vignettes/datatable-importing.html#globals
50	83x	. <- rn <- feature <- pdp_c <- ratio <- variance <- variance_total <-
51	83x	NULL # due to NSE notes in R CMD check
52
53		# calculate total variance of PDPs
54	83x	var_total <- dt[, .(pdp_c = sum(pdp_c)), .(rn)][, var(pdp_c)]
55
56		# calculate variance per feature
57	83x	dat_var <- dt[
58		,
59	83x	.(variance = var(pdp_c), variance_total = var_total),
60	83x	.(feature)
61		][
62		,
63	83x	ratio := variance / variance_total
64		][
65	83x	order(ratio, decreasing = TRUE)
66		]
67
68		# define as factor to keep the order
69	83x	dat_var[, feature := factor(feature, levels = feature)]
70	83x	return(dat_var)
71		}
72
73		#' @describeIn var_imp Direct variable importance
74		#' @export
75		calc_dir_var_imp <- function(object, newdata = NULL) {
76	2x	checkmate::assert_class(object, "sculpture")
77	2x	if (is.null(newdata)) {
78	1x	return(attr(object, "var_imp"))
79		}
80	1x	checkmate::assert_data_frame(newdata, any.missing = FALSE)
81	1x	calc_dir_var_imp_pdp(
82	1x	eval_sculpture(
83	1x	sculpture = object,
84	1x	data = newdata
85	1x	)$pdp
86		)
87		}
88
89		calc_cumul_R2_pdp <- function(dt, feat_order, model_predictions, model_offset) {
90	82x	stopifnot(
91	82x	is.data.table(dt),
92	82x	all(c("rn", "feature", "pdp_c") %in% colnames(dt)),
93	82x	!"total" %in% tolower(unique(dt$feature)),
94	82x	nrow(dt) == nrow(unique(dt[, .(rn, feature)])),
95	82x	length(model_predictions) == length(unique(dt$rn)),
96	82x	is.character(feat_order),
97	82x	is.numeric(model_offset)
98		)
99
100		# https://cran.r-project.org/web/packages/data.table/vignettes/datatable-importing.html#globals
101	82x	. <- rn <- feature <- pdp_c <- preds <- NULL # due to NSE notes in R CMD check
102
103		# prepare ordered features
104	82x	cumul_features <- lapply(seq_along(feat_order), function(i) feat_order[1:i])
105
106		# calculate R2
107	82x	R2_cumul <- vapply(
108	82x	cumul_features,
109	82x	function(cols) {
110	217x	predictions <- dt[
111	217x	feature %in% cols,
112	217x	.(preds = sum(pdp_c) + model_offset),
113	217x	.(rn)
114		][
115	217x	order(rn), preds
116		]
117	217x	metrics_R2(score_fun = "score_quadratic", y = model_predictions, y_hat = predictions)
118		},
119	82x	numeric(1)
120		)
121	82x	return(
122	82x	data.table(feature = factor(feat_order, levels = feat_order), R2 = R2_cumul)
123		)
124		}
125
126
127		#' @describeIn var_imp Calculate cumulative approximation of R^2
128		#' @export
129		calc_cumul_R2 <- function(object, newdata = NULL) {
130	2x	checkmate::assert_class(object, "sculpture")
131	2x	if (is.null(newdata)) {
132	1x	return(attr(object, "cumul_R2"))
133		}
134	1x	checkmate::assert_data_frame(newdata, any.missing = FALSE)
135
136	1x	eg <- eval_sculpture(
137	1x	sculpture = object,
138	1x	data = newdata
139		)
140
141	1x	dat_var <- calc_dir_var_imp_pdp(dt = eg$pdp)
142
143	1x	calc_cumul_R2_pdp(
144	1x	dt = eg$pdp,
145	1x	feat_order = levels(dat_var$feature),
146	1x	model_predictions = eg$prediction$pred,
147	1x	model_offset = eg$offset
148		)
149		}
150
151
152		# calculate range - for plots (facet sorting)
153		calc_range_pdp <- function(dt) {
154	76x	stopifnot(
155	76x	all(c("rn", "feature", "pdp_c") %in% colnames(dt)),
156	76x	nrow(dt) == nrow(unique(dt[, .(rn, feature)]))
157		)
158
159		# https://cran.r-project.org/web/packages/data.table/vignettes/datatable-importing.html#globals
160	76x	. <- rn <- feature <- pdp_c <- NULL # due to NSE notes in R CMD check
161
162	76x	dt_range <- dt[
163	76x	, .(range = max(pdp_c) - min(pdp_c)),
164	76x	.(feature)
165		][
166	76x	order(-range)
167		]
168	76x	dt_range[, feature := factor(feature, levels = feature)][]
169
170	76x	return(dt_range)
171		}
172
173
174
175		# generic metrics --------
176
177
178		#' Various metrics for measuring model performance.
179		#'
180		#' @name metrics
181		#' @param score_fun A scoring function: `score_quadratic`, `score_log_loss`,
182		#' or a user-defined scoring rule. See below for more details.
183		#' @param y Vector of observations.
184		#' @param y_hat Vector of predictions.
185		#' @param y_hat_calib Vector of calibrated predictions. See below for more details.
186		#' @param na_rm Logical, defaults to `FALSE`. Should NAs be removed?
187		#' @param rev_fct Logical, defaults to `FALSE`. Switch the factor level of
188		#' the data before performing calibration. Only relevant for binary response.
189		#'
190		#' @section Scoring function:
191		#' One can use predefined scores like `score_quadratic` or `score_log_loss`.
192		#' If those do not fit the needs, a user-defined scoring function can also be used.
193		#' This function needs to take exactly 3 arguments: `y` (truth values),
194		#' `y_hat` (estimated values), and `na_rm` (should NAs be removed?):
195		#' - both `y` and `y_hat` are numeric (not factors!)
196		#' - `na_rm` is a scalar logical
197		#'
198		#' It needs to return a number.
199		#' There is a utility function `check_score_fun` to check if the user-defined function is
200		#' programmed correctly.
201		#' It checks the input and the output, but not if the actual returned value makes sense.
202		#'
203		#'
204		#' @section Calibration:
205		#' To obtain calibrated predictions,
206		#' fit a calibration model and predict based on that model.
207		#' Users can use their own calibration model or make use of `metrics_fit_calib`,
208		#' which fits an `mgcv::gam()` model with smoother `mgcv::s(., k = -1)` (automatic knot selection).
209		#' If the input `y` is a factor, then a binomial family is used, otherwise a gaussian.
210		#' NAs are always dropped.
211		#'
212		#' Continuous response example:
213		#' ```
214		#' calibration_model <- metrics_fit_calib(
215		#' y = truth,
216		#' y_hat = prediction
217		#' )
218		#' calib_pred <- predict(calibration_model)
219		#' ```
220		#'
221		#' Binary response example:
222		#' ```
223		#' calibration_model <- metrics_fit_calib(
224		#' y = factor(truth, levels = c("0", "1")),
225		#' y_hat = prediction
226		#' )
227		#' calib_pred <- predict(calibration_model, type = "response")
228		#' ```
229		#' In the binary case, make sure that:
230		#' - `y` is a factor with correct level setting.
231		#' Usually "0" is the reference (first) level and "1" is the event (second level).
232		#' This may clash with `yardstick` setting where
233		#' the first level is by default the "event" level.
234		#' - `y_hat` are probabilities (not a log of odds).
235		#' - returned calibrated predictions `calib_pred` are also probabilities by setting
236		#' `type = "response"`.
237		#'
238		#'
239		#' @return `metrics_fit_calib` returns an [mgcv::gam()] model fit, otherwise a number.
240		#'
241		#' @examples
242		#' # Scores
243		#' score_quadratic(y = c(1.34, 2.8), y_hat = c(1.34, 2.8)) # must be 0
244		#' score_quadratic(y = 0.5, 0) # must be 0.5**2 = 0.25
245		#'
246		#' score_log_loss(y = c(0, 1), y_hat = c(0.01, 0.9)) # must be close to 0
247		#' score_log_loss(y = 0, y_hat = 0) # undefined
248		#'
249		#' check_score_fun(score_quadratic) # passes without errors
250		#'
251		#' # Metrics based on `lm` model
252		#' mod <- lm(hp ~ ., data = mtcars)
253		#' truth <- mtcars$hp
254		#' pred <- predict(mod)
255		#'
256		#' # calibration fit and calibrated predictions
257		#' calib_mod <- metrics_fit_calib(y = truth, y_hat = pred)
258		#' calib_pred <- predict(calib_mod)
259		#'
260		#' metrics_unc(score_fun = "score_quadratic", y = truth)
261		#' metrics_R2(score_fun = "score_quadratic", y = truth, y_hat = pred)
262		#' metrics_DI(score_fun = "score_quadratic", y = truth, y_hat_calib = calib_pred)
263		#' metrics_MI(score_fun = "score_quadratic", y = truth, y_hat = pred, y_hat_calib = calib_pred)
264		#' # Note that R^2 = DI - MI
265		#' metrics_r2(y = truth, y_hat = pred, y_hat_calib = calib_pred)
266		#'
267		#' # Metrics based on `glm` model (logistic regression)
268		#' # Note the correct setting of levels
269		#' mod <- glm(factor(vs, levels = c("0", "1")) ~ hp + mpg, data = mtcars, family = "binomial")
270		#' truth_fct <- factor(mtcars$vs, levels = c("0", "1"))
271		#' truth_num <- mtcars$vs
272		#' pred <- predict(mod, type = "response") # type = "response" returns probabilities
273		#'
274		#' # calibration fit and calibrated predictions
275		#' calib_mod <- metrics_fit_calib(y = truth_fct, y_hat = pred)
276		#' calib_pred <- predict(calib_mod, type = "response") # type = "response" returns probabilities
277		#'
278		#' metrics_unc(score_fun = "score_quadratic", y = truth_num)
279		#' metrics_R2(score_fun = "score_quadratic", y = truth_num, y_hat = pred)
280		#' metrics_DI(score_fun = "score_quadratic", y = truth_num, y_hat_calib = calib_pred)
281		#' metrics_MI(score_fun = "score_quadratic", y = truth_num, y_hat = pred, y_hat_calib = calib_pred)
282		#' # Note that R^2 = DI - MI
283		#' metrics_r2(y = truth_num, y_hat = pred, y_hat_calib = calib_pred)
284		#'
285		NULL
286
287		remove_missing <- function(...) {
288	7x	idx <- complete.cases(...)
289	7x	lapply(list(...), \(x) x[idx])
290		}
291
292		#' @describeIn metrics Binary log loss score
293		#' @export
294		score_log_loss <- function(y, y_hat, na_rm = FALSE) {
295	14x	checkmate::assert_numeric(y)
296	14x	checkmate::assert(
297	14x	checkmate::check_numeric(y_hat, len = length(y)),
298	14x	checkmate::check_numeric(y_hat, len = 1)
299		)
300	14x	if (na_rm) {
301	1x	rm <- remove_missing(y = y, y_hat = y_hat)
302	1x	y <- rm[["y"]]
303	1x	y_hat <- rm[["y_hat"]]
304		}
305	14x	-mean(y * log(y_hat) + (1 - y) * log(1 - y_hat))
306		}
307
308		#' @describeIn metrics Quadratic score
309		#' @export
310		score_quadratic <- function(y, y_hat, na_rm = FALSE) {
311	733x	checkmate::assert_numeric(y)
312	733x	checkmate::assert(
313	733x	checkmate::check_numeric(y_hat, len = length(y)),
314	733x	checkmate::check_numeric(y_hat, len = 1)
315		)
316	733x	if (na_rm) {
317	1x	rm <- remove_missing(y = y, y_hat = y_hat)
318	1x	y <- rm[["y"]]
319	1x	y_hat <- rm[["y_hat"]]
320		}
321	733x	mean((y - y_hat)**2)
322		}
323
324		#' @describeIn metrics Utility function for checking the properties of a user-defined `score_fun`.
325		#' @export
326		check_score_fun <- function(score_fun) {
327	247x	if (is.character(score_fun)) {
328	245x	checkmate::assert_function(eval(str2lang(score_fun)), args = c("y", "y_hat", "na_rm"))
329	2x	} else if (is.function(score_fun)) {
330	2x	checkmate::assert_function(score_fun, args = c("y", "y_hat", "na_rm"))
331		} else {
332	!	stop("`score_fun` must be a function.")
333		}
334	246x	out <- do.call(score_fun, list(y = c(0.5, 0.6), y_hat = c(0.5, 0.55)))
335	246x	if (!checkmate::test_number(out, na.ok = TRUE)) {
336	!	stop("The return value of `score_fun` must be a number")
337		}
338		}
339
340
341		#' @describeIn metrics Uncertainty
342		#' @export
343		metrics_unc <- function(score_fun, y, na_rm = FALSE) {
344	2x	check_score_fun(score_fun)
345	2x	if (na_rm) {
346	1x	rm <- remove_missing(y = y)
347	1x	y <- rm[["y"]]
348		}
349	2x	do.call(score_fun, list(y = y, y_hat = rep_len(mean(y), length(y))))
350		}
351
352		#' @describeIn metrics R^2 metric
353		#' @export
354		metrics_R2 <- function(score_fun, y, y_hat, na_rm = FALSE) {
355	233x	check_score_fun(score_fun)
356	233x	if (na_rm) {
357	1x	rm <- remove_missing(y = y, y_hat = y_hat)
358	1x	y <- rm[["y"]]
359	1x	y_hat <- rm[["y_hat"]]
360		}
361	233x	1 -
362	233x	do.call(score_fun, list(y = y, y_hat = y_hat)) /
363	233x	do.call(score_fun, list(y = y, y_hat = rep_len(mean(y), length(y))))
364		}
365
366		#' @describeIn metrics Fit calibration curve using [mgcv::gam()].
367		#' Note that NAs are always dropped.
368		#' @export
369		metrics_fit_calib <- function(y, y_hat, rev_fct = FALSE) {
370	5x	requireNamespace("mgcv")
371	5x	s <- mgcv::s
372	5x	if (is.factor(y)) {
373	1x	fam <- binomial()
374	!	if(rev_fct) y <- factor(y, levels=rev(levels(y)))
375		} else {
376	4x	fam <- gaussian()
377		}
378	5x	tryCatch(
379	5x	mgcv::gam(y ~ s(y_hat, k = -1), family = fam, na.action = "na.omit"),
380	5x	error = \(e) tryCatch(
381	5x	mgcv::gam(y ~ s(y_hat, k = 3), family = fam, na.action = "na.omit"),
382	5x	error = \(e) mgcv::gam(y ~ y_hat, family = fam, na.action = "na.omit")
383		)
384		)
385		}
386
387		#' @describeIn metrics Discrimination index
388		#' @export
389		metrics_DI <- function(score_fun, y, y_hat_calib, na_rm = FALSE) {
390	4x	check_score_fun(score_fun)
391	4x	if (na_rm) {
392	1x	rm <- remove_missing(y = y, y_hat_calib = y_hat_calib)
393	1x	y <- rm[["y"]]
394	1x	y_hat_calib <- rm[["y_hat_calib"]]
395		}
396		(
397	4x	do.call(score_fun, list(y = y, y_hat = rep_len(mean(y), length(y)))) -
398	4x	do.call(score_fun, list(y = y, y_hat = y_hat_calib))
399		) /
400	4x	do.call(score_fun, list(y = y, y_hat = rep_len(mean(y), length(y))))
401		}
402
403		#' @describeIn metrics Miscalibration index
404		#' @export
405		metrics_MI <- function(score_fun, y, y_hat, y_hat_calib, na_rm = FALSE) {
406	5x	check_score_fun(score_fun)
407	5x	if (na_rm) {
408	1x	rm <- remove_missing(y = y, y_hat = y_hat, y_hat_calib = y_hat_calib)
409	1x	y <- rm[["y"]]
410	1x	y_hat <- rm[["y_hat"]]
411	1x	y_hat_calib <- rm[["y_hat_calib"]]
412		}
413		(
414	5x	do.call(score_fun, list(y = y, y_hat = y_hat)) -
415	5x	do.call(score_fun, list(y = y, y_hat = y_hat_calib))
416		) /
417	5x	do.call(score_fun, list(y = y, y_hat = rep_len(mean(y), length(y))))
418		}
419
420
421		#' @describeIn metrics r^2 metric based on slope of `lm`
422		#' @export
423		metrics_r2 <- function(y, y_hat, y_hat_calib, na_rm = FALSE) {
424	4x	if (na_rm) {
425	1x	rm <- remove_missing(y = y, y_hat = y_hat, y_hat_calib = y_hat_calib)
426	1x	y <- rm[["y"]]
427	1x	y_hat <- rm[["y_hat"]]
428	1x	y_hat_calib <- rm[["y_hat_calib"]]
429	3x	} else if (anyNA(y) || anyNA(y_hat) || anyNA(y_hat_calib)) {
430	1x	return(NA)
431		}
432	3x	lm_mod <- lm(y_hat_calib ~ y_hat)
433	3x	res <- (coef(lm_mod)[2] * sd(y_hat) / sd(y))**2
434	3x	if (is.na(res)) {
435	1x	res <- 0
436		}
437	3x	return(unname(res))
438		}

1		#' Create ICE curves at quantiles
2		#' @keywords internal
3		#'
4		#' @param object Object of class sculpture (rough, detailed)
5		#' @param new_data Data to make quantiles on
6		#' @param var_name String specifying which variable to generate ICE
7		#' @param qtiles Quantiles to generate ICE curves
8		#' @param task Prediction task type (regression or classification)
9		#'
10		#' @return Predictions
11		#'
12		#' @details
13		#' It should be amenable to any 1st-order model without interaction terms,
14		#' however not implemented yet, such as handling `predict()` function output
15		#' for binary endpoint
16		#'
17		calc_ice_quantile <- function(object, new_data, var_name, qtiles = seq(0, 1, by = 0.1),
18		task = "regression") {
19	6x	checkmate::assert_class(object, "sculpture")
20	6x	match.arg(task, c("regression", "classification"))
21
22		# https://cran.r-project.org/web/packages/data.table/vignettes/datatable-importing.html#globals
23	6x	median <- quantile <- rgb <- NULL # due to NSE notes in R CMD check
24
25		# Predict for all samples after replacing the variable with 1st value,
26		# then take quantiles
27	6x	cov_1st_val <- new_data[[var_name]][1]
28
29	6x	new_data_with_1st_val <- new_data
30	6x	new_data_with_1st_val[[var_name]] <- cov_1st_val
31
32	6x	pred_at_1st_val <- predict(object, new_data_with_1st_val)
33
34	6x	pred_qtile_at_1st_qtiles <- quantile(pred_at_1st_val, qtiles)
35
36	6x	pred_at_1st_qtiles <- data.frame(pred_at_1st = pred_qtile_at_1st_qtiles, qtile = qtiles)
37
38		# Separately predict for all values of the cov of interest
39	6x	preds_for_adjust_1 <- merge(
40	6x	new_data[1, setdiff(colnames(new_data), var_name)],
41	6x	unique(new_data[var_name])
42		)
43	6x	preds_for_adjust_1$pred <- predict(object, newdata = preds_for_adjust_1)
44
45
46		# Get the pred at the first element, because
47		# it is what was selected for cov_1st_val
48	6x	pred_for_adjust_at_1st <- preds_for_adjust_1$pred[1]
49
50	6x	preds_for_adjust <- preds_for_adjust_1[var_name]
51	6x	preds_for_adjust$pred_adjust <- preds_for_adjust_1$pred - pred_for_adjust_at_1st
52
53		# Combine the above 2 to make quantile lines
54	6x	pred_ice_qtile <- merge(
55	6x	pred_at_1st_qtiles,
56	6x	preds_for_adjust
57		)
58	6x	pred_ice_qtile$pred <- pred_ice_qtile$pred_at_1st + pred_ice_qtile$pred_adjust
59
60		# Convert to probabilities if classification
61	6x	if (task == "classification") {
62	1x	pred_ice_qtile$pred <- inv.logit(pred_ice_qtile$pred)
63		}
64
65	6x	return(pred_ice_qtile)
66		}
67
68
69		#' Create density curves
70		#' @keywords internal
71		#'
72		#' @param new_data_with_pred Data with prediction to make density calculations on
73		#' @param var_name String specifying which variable to calculate density
74		#' @param vec_y_expand Optional values to expand y-axis
75		#' @return Density data for plotting
76		#'
77		#' @details
78		#' It should be amenable to any 1st-order model without interaction terms,
79		#' however not implemented yet, such as handling `predict()` function output
80		#' for binary endpoint
81		#'
82		calc_density <- function(new_data_with_pred, var_name,
83		vec_y_expand = NULL) {
84	6x	x_axis_range_data <- range(new_data_with_pred[[var_name]])
85	6x	x_axis_range_density <- expand_range(x_axis_range_data, 0.5, 0.5)
86
87
88	6x	y_axis_range_data <- range(c(new_data_with_pred$pred, vec_y_expand))
89	6x	y_axis_range_density <- expand_range(y_axis_range_data, 0.1, 0.1)
90
91
92		# Estimate 2d density
93		# Calculate bandwidth manually if MASS::bandwidth.nrd fails
94		# (happens when most of data has same covariate, e.g. ==0.
95		# MASS::bandwidth.nrd uses quantiles to calculate bandwidth)
96	6x	bandwidth_x <- MASS::bandwidth.nrd(new_data_with_pred[[var_name]])
97	6x	bandwidth_y <- MASS::bandwidth.nrd(new_data_with_pred$pred)
98		# If bandwidth is 0, set to 25% of range
99	6x	bandwidth_x <- ifelse(bandwidth_x == 0, diff(x_axis_range_data) * 0.25, bandwidth_x)
100	6x	bandwidth_y <- ifelse(bandwidth_y == 0, diff(y_axis_range_data) * 0.25, bandwidth_y)
101
102	6x	density_est <- MASS::kde2d(
103	6x	x = new_data_with_pred[[var_name]],
104	6x	y = new_data_with_pred$pred,
105	6x	n = 100,
106	6x	lims = c(x_axis_range_density, y_axis_range_density),
107	6x	h = c(bandwidth_x, bandwidth_y)
108		)
109
110		# Convert to data frame
111	6x	density_data <- expand.grid(x = density_est$x, y = density_est$y)
112	6x	density_data$z <- as.vector(density_est$z)
113	6x	density_data <- density_data[order(density_data$x, density_data$y), ]
114
115		# Dummy data to make legend go down to 0.0, replace 1st row z value with 0
116	6x	density_data[1, 3] <- 0
117
118	6x	return(density_data)
119		}
120
121
#' Expand the range of values for density plot
#' @keywords internal
#'
#' @param x Numeric vector; only its minimum and maximum are used.
#' @param expand_left_side Amount subtracted from the minimum of `x`.
#' @param expand_right_side Amount added to the maximum of `x`.
#' @param type How the expansion amounts are interpreted: `"relative"`
#' (default) multiplies them by the width of the range of `x`, `"absolute"`
#' uses them as given.
#'
#' @return Vector of 2 values: the expanded lower and upper limit.
#'
#'
expand_range <- function(x, expand_left_side = 0.1, expand_right_side = 0.2,
                         type = c("relative", "absolute")) {
  type <- match.arg(type)

  limits <- range(x)

  # Relative expansion: turn the fractions into absolute amounts.
  if (type == "relative") {
    width <- diff(limits)
    expand_left_side <- expand_left_side * width
    expand_right_side <- expand_right_side * width
  }

  c(limits[1] - expand_left_side, limits[2] + expand_right_side)
}
143
144		#' Density plots overlaid with ICE curves
145		#'
146		#' Create density plot for the data, overlaid with ICE curves at quantiles
147		#' of the variable(s) of interest.
148		#'
149		#'
150		#' @name g_density_ice
151		NULL
152
153
#' @rdname g_density_ice
#' @export
#'
#' @param object Object of class sculpture (rough, detailed)
#' @param new_data Data to make quantiles on
#' @param var_name String specifying which variable to generate ICE for.
#' Must be a column of `new_data`.
#' @param var_label String (optional) specifying variable label (x label of the plot).
#' Defaults to `var_name`.
#' @param qtiles Quantiles (values between 0 and 1) to generate ICE curves
#' @param task Prediction task type (regression or classification).
#' For classification the predictions are transformed with `inv.logit()`.
#'
#' @return [g_density_ice_plot()]: ggplot object
#'
#' @details
#' [g_density_ice_plot()] creates a density plot for a single variable.
#'
#' [g_density_ice_plot_list()] creates a list of density plots for multiple variables.
#'
#' In principle these functions apply to any 1st-order model without
#' interaction terms; however, support beyond `sculpture` objects is not
#' implemented yet (for example, handling `predict()` function output
#' for a binary endpoint).
#'
#'
#' @examples
#' \dontrun{
#' df <- mtcars
#' df$cyl <- as.factor(df$cyl)
#' model <- lm(hp ~ ., data = df)
#' model_predict <- function(x) predict(model, newdata = x)
#' covariates <- setdiff(colnames(df), "hp")
#' pm <- sample_marginals(df[covariates], n = 50, seed = 5)
#'
#' rs <- sculpt_rough(
#'   dat = pm,
#'   model_predict_fun = model_predict,
#'   n_ice = 5,
#'   seed = 1,
#'   verbose = 0
#' )
#'
#' g_density_ice_plot(rs, new_data = pm, var_name = "mpg")
#' g_list <- g_density_ice_plot_list(
#'   rs, new_data = pm, var_names = c("mpg", "cyl", "disp", "drat")
#' )
#' grid::grid.draw(gridExtra::arrangeGrob(grobs = g_list))
#' }
#'
g_density_ice_plot <- function(object, new_data, var_name, var_label = NULL,
                               qtiles = seq(0, 1, by = 0.1),
                               task = c("regression", "classification")) {
  checkmate::assert_class(object, "sculpture")
  new_data <- check_data(new_data)
  checkmate::assert_string(var_name)
  # Fail early with a clear message: an unknown column would otherwise be
  # extracted as NULL, treated as "discrete" and break further down.
  checkmate::assert_choice(var_name, colnames(new_data))
  checkmate::assert_string(var_label, null.ok = TRUE)
  checkmate::assert_numeric(qtiles, lower = 0, upper = 1)
  checkmate::assert_character(task)

  task <- match.arg(task)

  # https://cran.r-project.org/web/packages/data.table/vignettes/datatable-importing.html#globals
  x <- y <- z <- pred <- qtile <- NULL # due to NSE notes in R CMD check

  if (is.null(var_label)) {
    var_label <- var_name
  }

  # Anything that is not numeric is plotted on integer positions 1, 2, ...
  is_var_discrete <- !is.numeric(new_data[[var_name]])

  new_data_with_pred <- new_data
  new_data_with_pred$pred <- predict(object, newdata = new_data)

  if (task == "classification") {
    new_data_with_pred$pred <- inv.logit(new_data_with_pred$pred)
  }

  pred_ice_qtile <- calc_ice_quantile(
    object, new_data,
    var_name = var_name, qtiles = qtiles, task = task
  )

  if (is_var_discrete) {
    # Replace the values by their factor codes in both datasets
    new_data_with_pred[[var_name]] <- as.numeric(as.factor(new_data_with_pred[[var_name]]))
    pred_ice_qtile[[var_name]] <- as.numeric(as.factor(pred_ice_qtile[[var_name]]))
  }

  # Density grid; its y range also covers the ICE predictions
  density_data <- calc_density(
    new_data_with_pred,
    var_name = var_name,
    vec_y_expand = pred_ice_qtile$pred
  )

  # Visible plot area: discrete variables get a fixed margin on both sides,
  # continuous ones only extra room on the right (where the labels are nudged)
  x_axis_range_data <- range(new_data_with_pred[[var_name]])
  if (is_var_discrete) {
    x_axis_range_plot <- expand_range(x_axis_range_data, 0.3, 0.3, type = "absolute")
  } else {
    x_axis_range_plot <- expand_range(x_axis_range_data, 0, 0.15)
  }
  y_axis_range_plot <- range(c(new_data_with_pred$pred, pred_ice_qtile$pred))

  # Text labels: only the 0%, 50% and 100% curves, placed at the largest x value
  ggrepel_data <- pred_ice_qtile[
    pred_ice_qtile$qtile %in% c(0, 0.5, 1) &
      pred_ice_qtile[[var_name]] == max(pred_ice_qtile[[var_name]]),
  ]

  density_plot <- ggplot(density_data, aes(x = x, y = y)) +
    geom_raster(aes(fill = z), interpolate = TRUE) +
    labs(x = var_label, y = "Predicted Value", fill = "Density") +
    scale_fill_viridis_c() +
    coord_cartesian(xlim = x_axis_range_plot, ylim = y_axis_range_plot) +
    theme(
      panel.ontop = TRUE,
      panel.background = element_rect(color = NA, fill = NA),
      panel.grid.major = element_line(color = grDevices::rgb(1, 1, 1, 0.1)),
      panel.grid.minor = element_line(color = grDevices::rgb(1, 1, 1, 0.1))
    ) +
    # all ICE quantile curves (thin)
    geom_line(
      data = pred_ice_qtile,
      aes(x = .data[[var_name]], y = pred, group = qtile), linewidth = 0.3,
      color = "grey70"
    ) +
    # 0%, 50% and 100% curves (thick)
    geom_line(
      data = pred_ice_qtile[pred_ice_qtile$qtile %in% c(0, 0.5, 1), ],
      aes(x = .data[[var_name]], y = pred, group = qtile), linewidth = 0.7,
      color = "grey70"
    ) +
    ggrepel::geom_text_repel(
      data = ggrepel_data,
      aes(x = .data[[var_name]], y = pred, label = paste0(round(qtile * 100), "%")),
      color = "grey70",
      box.padding = unit(0.25, "lines"),
      point.padding = unit(0.25, "lines"),
      segment.linetype = "dotted",
      min.segment.length = unit(0, "lines"),
      nudge_x = diff(range(x_axis_range_plot)) / 6,
      direction = "y", hjust = "right"
    )

  if (is_var_discrete) {
    # Show the original levels instead of the integer positions
    level_labels <- levels(as.factor(new_data[[var_name]]))
    density_plot <- density_plot +
      scale_x_continuous(
        breaks = seq_along(level_labels),
        labels = level_labels,
        minor_breaks = NULL
      )
  }

  density_plot
}
303
304
#' @rdname g_density_ice
#' @export
#'
#' @param var_names Vector of strings specifying which variables to generate ICE
#' @param var_labels Named vector of strings specifying variable labels
#' (names are variable names). Variables without a label use the variable
#' name as label.
#'
#' @return [g_density_ice_plot_list()]: list of ggplot objects,
#' one per element of `var_names` and named accordingly
#'
g_density_ice_plot_list <- function(object, new_data, var_names, var_labels = NULL,
                                    qtiles = seq(0, 1, by = 0.1),
                                    task = c("regression", "classification")) {
  checkmate::assert_class(object, "sculpture")
  new_data <- check_data(new_data)
  checkmate::assert_character(var_names)
  checkmate::assert_character(var_labels, null.ok = TRUE)
  checkmate::assert_numeric(qtiles, lower = 0, upper = 1)
  checkmate::assert_character(task)

  task <- match.arg(task)

  # Label for one variable, or NULL when none was supplied.
  # `var_labels[var_name]` would give NA for a variable without an entry,
  # which is rejected by `g_density_ice_plot()`; NULL makes it fall back
  # to the variable name instead.
  lookup_label <- function(var_name) {
    if (!(var_name %in% names(var_labels))) {
      return(NULL)
    }
    label <- var_labels[[var_name]]
    if (is.na(label)) NULL else label
  }

  # Build the plots by position, so that repeated entries in `var_names`
  # each get their own plot.
  out <- lapply(var_names, function(var_name) {
    g_density_ice_plot(
      object, new_data, var_name, lookup_label(var_name), qtiles, task
    )
  })
  names(out) <- var_names

  out
}

# Private state: holds the cluster created by parallel_set() so that
# parallel_end() can shut its worker processes down again.
.parallel_state <- new.env(parent = emptyenv())

#' Set and end parallel computation
#'
#' `parallel_set()` creates a cluster, registers it as the parallel backend
#' and reports the number of workers. `parallel_end()` stops the cluster
#' created by `parallel_set()` (if any) and registers the sequential backend.
#'
#' @param num_cores (`integer`) Number of cores.
#' @param cluster_type (`character`) Type of cluster. One of `c("fork", "psock")`.
#'
#' @export
#' @examples
#' \dontrun{
#' parallel_set(num_cores = 2)
#' # now the code will run on parallel with 2 cores
#' parallel_end()
#' # now the code will run sequentially
#' }
parallel_set <- function(num_cores = 10, cluster_type = "fork") {
  checkmate::assert_integerish(num_cores, lower = 1, any.missing = FALSE, len = 1)
  cluster_type <- match.arg(cluster_type, choices = c("fork", "psock"))

  # Clean up a previous setup (stops the old workers) before starting a new one
  parallel_end()

  cl <- if (cluster_type == "fork") {
    parallel::makeForkCluster(num_cores)
  } else {
    parallel::makePSOCKcluster(num_cores)
  }
  # Remember the cluster right away so that it can always be stopped later
  .parallel_state$cluster <- cl

  doParallel::registerDoParallel(cl)

  message(paste("Using", foreach::getDoParWorkers(), "cores")) # should be == num_cores
}

#' @rdname parallel_set
#' @export
parallel_end <- function() {
  # Stop the workers started by parallel_set(); registering the sequential
  # backend alone would leave them running.
  cl <- .parallel_state$cluster
  if (!is.null(cl)) {
    .parallel_state$cluster <- NULL
    tryCatch(
      parallel::stopCluster(cl),
      error = function(e) {
        warning(
          "Could not stop the parallel cluster: ", conditionMessage(e),
          call. = FALSE
        )
      }
    )
  }

  if (foreach::getDoParRegistered()) {
    foreach::registerDoSEQ()
  }
}
38
39
# Pick the foreach operator: `%dopar%` when a foreach backend is registered
# and the caller allows parallel execution, `%do%` otherwise.
define_foreach_operand <- function(allow_par = FALSE) {
  use_dopar <- foreach::getDoParRegistered() && allow_par
  if (!use_dopar) {
    return(foreach::`%do%`)
  }
  foreach::`%dopar%`
}