| 1 | #' Check Input Arguments | |
| 2 | #' | |
| 3 | #' Given outcomes and prediction scores, checks for obvious issues | |
| 4 | #' and raises errors or warnings as needed. | |
| 5 | #' Possible issues include unequal lengths, missing values, non-binary outcomes, | |
| 6 | #' and non-numeric scores. | |
| 7 | #' | |
| 8 | #' @inheritParams riskProfile | |
| 9 | #' | |
| 10 | #' @return A list containing standardized outcomes and predicted scores. | |
| 11 | #' | |
| 12 | #' @keywords internal | |
| 13 | #' @noRd | |
| 14 | #' | |
| 15 | #' @examples | |
| 16 | #' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc")) | |
| 17 | #' rscore <- auroc$predicted | |
| 18 | #' truth <- as.numeric(auroc$actual) | |
| 19 | #' inputCheck(truth, rscore) | |
| 20 | #' | |
| 21 | inputCheck <- function(outcome, score) { | |
| 22 | ||
| 23 | # Predscore - check numeric | |
| 24 | 41x |   if (!is.numeric(score)) { | 
| 25 | 1x |     stop("'score' vector must be represented as a numeric.") | 
| 26 | } | |
| 27 | ||
| 28 | # Outcome - logical to numeric | |
| 29 | 40x |   if (is.logical(outcome)) { | 
| 30 | 33x | outcome <- as.numeric(outcome) | 
| 31 | } | |
| 32 | ||
| 33 | # Outcome - check numeric | |
| 34 | 40x |   if (!is.numeric(outcome)) { | 
| 35 | 1x |     stop("'outcome' vector needs to be a numeric vector.") | 
| 36 | } | |
| 37 | ||
| 38 | # Check matching lengths | |
| 39 | 39x |   if (length(outcome) != length(score)) { | 
| 40 | 1x |     stop("'outcome' and 'score' must have the same lengths.") | 
| 41 | } | |
| 42 | ||
| 43 | # Need at least 3 observations | |
| 44 | 38x |   if (length(outcome) < 3) { | 
| 45 | 1x |     stop("Need at least 3 observations.") | 
| 46 | } | |
| 47 | ||
| 48 | # DROP NA's for data pairs | |
| 49 | 37x |   if (any(is.na(outcome)) || any(is.na(score))) { | 
| 50 | 2x |     warning("Observations with NA's are dropped") | 
| 51 | 2x | ind <- intersect( | 
| 52 | 2x | which(complete.cases(outcome)), | 
| 53 | 2x | which(complete.cases(score)) | 
| 54 | ) | |
| 55 | 2x | outcome <- outcome[ind] | 
| 56 | 2x | score <- score[ind] | 
| 57 | } | |
| 58 | ||
| 59 | # Outcome - check incorrect values | |
| 60 | 37x |   if (!all(outcome %in% 0:1)) { | 
| 61 | 1x |     stop("'outcome' vector must be represented as binary 1 or 0.") | 
| 62 | } | |
| 63 | ||
| 64 | # Predscore - check low frequency of score values | |
| 65 | 36x |   if (length(unique(score)) <= 5) { | 
| 66 | 2x | tbl <- table(score) | 
| 67 | 2x |     if (any(tbl <= 3)) { | 
| 68 | 2x | warning( | 
| 69 | 2x | paste( | 
| 70 | 2x |           "There is a low-occurrence value in `score` (", names(tbl)[tbl <= 3][1], ").", | 
| 71 | 2x | "The results may be unreliable." | 
| 72 | ) | |
| 73 | ) | |
| 74 | } | |
| 75 | } | |
| 76 | ||
| 77 | # Outcome - check low frequency | |
| 78 | 36x |   if (any(prop.table(table(outcome)) <= 0.03)) { | 
| 79 | 1x | warning( | 
| 80 | 1x | paste( | 
| 81 | 1x | "There is a low frequency of one of the outcome classes (`prop.table(table(outcome))`).", | 
| 82 | 1x | "The results may be unreliable." | 
| 83 | ) | |
| 84 | ) | |
| 85 | } | |
| 86 | ||
| 87 | 36x | return(list(outcome = outcome, score = score)) | 
| 88 | } | |
| 89 | ||
| 90 | ||
| 91 | #' Order Inputs | |
| 92 | #' | |
| 93 | #' Given vectors of prediction scores and outcomes, the function orders them by score. | |
| 94 | #' An option exists to reverse the ordering if lower scores correspond to a higher rate of outcomes. | |
| 95 | #' | |
| 96 | #' @inheritParams riskProfile | |
| 97 | #' | |
| 98 | #' @return Returns a data frame of outcomes and scores ordered by score. | |
| 99 | #' | |
| 100 | #' @keywords internal | |
| 101 | #' @noRd | |
| 102 | #' | |
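| | #' @examples | |
| | #' # Internal sketch: rows are returned in ascending score order, e.g. | |
| | #' # orderInputs(outcome = c(1, 0, 1), score = c(0.9, 0.2, 0.5)) | |
| | #' # yields scores 0.2, 0.5, 0.9; rev.order = TRUE negates the scores before sorting. | |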
| 103 | orderInputs <- function(outcome, score, rev.order = FALSE) { | |
| 104 | 33x | tempdf <- data.frame(score = score, outcome = outcome) | 
| 105 | 33x |   if (rev.order) { | 
| 106 | 6x | tempdf$score <- -tempdf$score | 
| 107 | } | |
| 108 | 33x | tempdf <- tempdf[order(tempdf$score, tempdf$outcome), ] | 
| 109 | 33x | return(tempdf) | 
| 110 | } | |
| 111 | ||
| 112 | #' Method Check | |
| 113 | #' | |
| 114 | #' Checks and standardizes the `methods` argument. Intended usage: | |
| 115 | #' 1. methods <- methodCheck(methods) | |
| 116 | #' 2. getEstMethod(methods[[1]]) or getEstMethods(methods) | |
| 117 | #' 3. getEst(methods, ...) | |
| 118 | #' | |
| 119 | #' @inheritParams riskProfile | |
| 120 | #' | |
| 121 | #' @return A named list of method specifications (lists of method arguments or user-defined functions). | |
| 122 | #' | |
| 123 | #' @keywords internal | |
| 124 | #' @noRd | |
| 125 | #' | |
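| | #' @examples | |
| | #' # A minimal internal usage sketch; method names assume est.funs() offers "gam" and "pava": | |
| | #' # methodCheck(c("gam", "pava")) | |
| | #' # methodCheck(list(gam1 = list(method = "gam", k = 3), pv = list(method = "pava"))) | |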
| 126 | methodCheck <- function(methods) { | |
| 127 | ||
| 128 | # Drop empty strings | |
| 129 | 54x |   if (any(methods == "")) { | 
| 130 | 1x | methods <- methods[methods != ""] | 
| 131 | } | |
| 132 | ||
| 133 | # If character is supplied | |
| 134 | 54x |   if (is.character(methods)) { | 
| 135 | ||
| 136 | # Convert to lower-case | |
| 137 | 32x | orig.names <- methods | 
| 138 | 32x | methods <- tolower(methods) | 
| 139 | ||
| 140 | # Check uniqueness | |
| 141 | 32x |     if (length(unique(methods)) != length(methods)) { | 
| 142 | 1x |       stop("`methods` should be unique when specified as character.") | 
| 143 | } | |
| 144 | ||
| 145 | # Check available method | |
| 146 | 31x |     if (!all(methods %in% names(est.funs()))) { | 
| 147 | 2x | stop( | 
| 148 | 2x | paste( | 
| 149 | 2x | "Supplied method is not yet available. Try selecting from (case insensitive): ", | 
| 150 | 2x | paste(shQuote(names(est.funs())), collapse = ", ") | 
| 151 | ) | |
| 152 | ) | |
| 153 | } | |
| 154 | ||
| 155 | # Convert to list | |
| 156 | 29x | methods <- structure(as.list(methods), names = orig.names) | 
| 157 | ||
| 158 | # Otherwise, if a list is provided: | |
| 159 | # list(gam1 = list(method = "gam", k = 3), pv = list(method = "pava", ...), etc) | |
| 160 |     # or list(my_estimate = function(outcome, score) {...}) | |
| 161 | 22x |   } else if (is.list(methods)) { | 
| 162 | ||
| 163 | # Check named list | |
| 164 | 20x |     if (!checkmate::test_named(methods, type = "unique")) { | 
| 165 | 1x |       stop("'methods' should be a uniquely named list.") | 
| 166 | } | |
| 167 | ||
| 168 | # Check and unify estimation method names | |
| 169 | 19x | methods <- lapply(methods, listMethodCheck) | 
| 170 | ||
| 171 |   } else { | |
| 172 | 2x | stop( | 
| 173 | 2x | paste( | 
| 174 | 2x | "`methods` must be character", | 
| 175 | 2x | "or named list of estimation methods / user defined functions." | 
| 176 | ) | |
| 177 | ) | |
| 178 | } | |
| 179 | ||
| 180 | 42x | return(methods) | 
| 181 | } | |
| 182 | ||
| 183 | # x is a single method (element of an outer list) | |
| 184 | listMethodCheck <- function(x) { | |
| 185 | 25x |   UseMethod("listMethodCheck") | 
| 186 | } | |
| 187 | ||
| 188 | #' @export | |
| 189 | listMethodCheck.list <- function(x) { | |
| 190 | ||
| 191 | # Check named list | |
| 192 | 20x |   if (!checkmate::test_named(x, type = "unique")) { | 
| 193 | 1x |     stop("Inner lists in the 'methods' argument must be named.") | 
| 194 | } | |
| 195 | ||
| 196 | # Check "method" element existing in the list | |
| 197 | 19x |   if (is.null(x[["method"]]) || !is.character(x[["method"]])) { | 
| 198 | 2x | stop( | 
| 199 | 2x | paste( | 
| 200 | 2x | 'All lists must have a "method" element specifying one of the predefined', | 
| 201 | 2x | "estimation functions as a string. Please select from:", | 
| 202 | 2x | paste(shQuote(names(est.funs())), collapse = ", ") | 
| 203 | ) | |
| 204 | ) | |
| 205 | } | |
| 206 | ||
| 207 | # Update method name | |
| 208 | 17x | x[["method"]] <- tolower(x[["method"]]) | 
| 209 | ||
| 210 | # Check available method | |
| 211 | 17x | chck_available <- checkmate::test_subset( | 
| 212 | 17x | x[["method"]], | 
| 213 | 17x | choices = names(est.funs()), empty.ok = FALSE | 
| 214 | ) | |
| 215 | 17x |   if (!chck_available) { | 
| 216 | 2x | stop( | 
| 217 | 2x | paste0( | 
| 218 | 2x | "Supplied method '", | 
| 219 | 2x | x[["method"]], | 
| 220 | 2x | "' is not yet available. Try selecting from: ", | 
| 221 | 2x | paste(shQuote(names(est.funs())), collapse = ", ") | 
| 222 | ) | |
| 223 | ) | |
| 224 | } | |
| 225 | ||
| 226 | 15x | return(x) | 
| 227 | } | |
| 228 | ||
| 229 | #' @export | |
| 230 | listMethodCheck.function <- function(x) { | |
| 231 | ||
| 232 | # Check input arguments | |
| 233 | 4x |   if (!checkmate::test_function(x, args = c("outcome", "score"))) { | 
| 234 | 2x | stop( | 
| 235 | 2x | paste( | 
| 236 | 2x | "All user defined estimation functions need to take exactly two arguments:", | 
| 237 | 2x | "'outcome' and 'score'." | 
| 238 | ) | |
| 239 | ) | |
| 240 | } | |
| 241 | ||
| 242 | 2x | return(x) | 
| 243 | } | |
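| | | |
| | # A minimal sketch of a user-defined estimation function that passes the check above; | |
| | # the column layout mirrors the predefined estimators and the glm fit is illustrative only: | |
| | # my_est <- function(outcome, score) { | |
| | #   data.frame( | |
| | #     score = score, | |
| | #     percentile = ecdf(score)(score), | |
| | #     outcome = outcome, | |
| | #     estimate = glm(outcome ~ score, family = "binomial")$fitted.values | |
| | #   ) | |
| | # } | |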
| 244 | ||
| 245 | #' @export | |
| 246 | listMethodCheck.default <- function(x) { | |
| 247 | 1x | stop( | 
| 248 | 1x | paste( | 
| 249 | 1x | "The estimation method must be specified as a list or a function." | 
| 250 | ) | |
| 251 | ) | |
| 252 | } | |
| 253 | ||
| 254 | # compared to match.arg(..., several.ok = TRUE), | |
| 255 | # this does not allow NULL and matches every element, returning the unique matched values | |
| 256 | matchArgSubset <- function(x, choices) { | |
| 257 | 24x | checkmate::assert_character(x, any.missing = FALSE, min.len = 1) | 
| 258 | 24x | out <- c() | 
| 259 | 24x |   for (ag in x) { | 
| 260 | 60x | matched <- tryCatch( | 
| 261 | 60x | match.arg(ag, choices = choices), | 
| 262 | 60x |       error = function(e) stop(sub("'arg'", paste0("'", ag, "'"), e)) | 
| 263 | ) | |
| 264 | 60x | out <- c(out, matched) | 
| 265 | } | |
| 266 | 24x | return(unique(out)) | 
| 267 | } | |
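| | | |
| | # Illustrative sketch (choices chosen for illustration): | |
| | # matchArgSubset(c("PC", "PP"), choices = c("PC", "PPV", "NPV", "1-NPV")) | |
| | # #> [1] "PC"  "PPV"   (partial matches are expanded) | |
| | # matchArgSubset(NULL, choices = c("PC", "PPV"))  # errors, unlike match.arg(several.ok = TRUE) | |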
| 268 | ||
| 269 | ||
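| | # Prepend a 0th-percentile row per method to the predictiveness-curve (PC) data, | |
| | # carrying the estimate observed at the lowest percentile so curves start at percentile 0. | |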
| 270 | add0thPercPC <- function(x) { | |
| 271 | 11x | bind_rows( | 
| 272 | 11x | x, | 
| 273 | 11x | x %>% | 
| 274 | 11x | group_by(.data$method) %>% | 
| 275 | 11x | summarise( | 
| 276 | 11x | score = NA, | 
| 277 | 11x | percentile = 0, | 
| 278 | 11x | outcome = NA, | 
| 279 | 11x | estimate = .data$estimate[.data$percentile == min(.data$percentile)][1], | 
| 280 | 11x | pv = "PC", | 
| 281 | 11x | .groups = "drop" | 
| 282 | ) | |
| 283 | ) %>% | |
| 284 | 11x | arrange(.data$method, .data$percentile) | 
| 285 | } | |
| 286 | ||
| 287 | ||
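| | # Prepend a 0th-percentile row per method and PV type to the predictive-value data, | |
| | # carrying the first pvValue in each group so PV curves start at percentile 0. | |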
| 288 | add0thPercPV <- function(x) { | |
| 289 | 13x | bind_rows( | 
| 290 | 13x | x, | 
| 291 | 13x | x %>% | 
| 292 | 13x | group_by(.data$method, .data$pv) %>% | 
| 293 | 13x | summarise( | 
| 294 | 13x | score = NA, | 
| 295 | 13x | percentile = 0, | 
| 296 | 13x | estimate = NA, | 
| 297 | 13x | pvValue = dplyr::first(.data$pvValue), | 
| 298 | 13x | .groups = "drop" | 
| 299 | ) %>% | |
| 300 | 13x |       dplyr::relocate("pv", .before = "pvValue") | 
| 301 | ) %>% | |
| 302 | 13x | arrange(.data$method, .data$pv, .data$percentile) | 
| 303 | } | |
| 304 | ||
| 305 | ||
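| | # Prepend 0th-percentile rows per method to the sensitivity/specificity data: | |
| | # at percentile 0, sensitivity is set to 1 and specificity to 0. | |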
| 306 | add0thPercTR <- function(x) { | |
| 307 | 5x | bind_rows( | 
| 308 | 5x | x, | 
| 309 | 5x | x %>% | 
| 310 | 5x | group_by(.data$method) %>% | 
| 311 | 5x | summarise( | 
| 312 | 5x | score = NA, | 
| 313 | 5x | percentile = 0, | 
| 314 | 5x | pf = "Sensitivity", | 
| 315 | 5x | value = 1 | 
| 316 | ), | |
| 317 | 5x | x %>% | 
| 318 | 5x | group_by(.data$method) %>% | 
| 319 | 5x | summarise( | 
| 320 | 5x | score = NA, | 
| 321 | 5x | percentile = 0, | 
| 322 | 5x | pf = "Specificity", | 
| 323 | 5x | value = 0 | 
| 324 | ) | |
| 325 | ) %>% | |
| 326 | 5x | arrange(.data$method, .data$pf, .data$percentile) | 
| 327 | } | |
| 328 | ||
| 329 | ||
| 330 | #' For snapshot testing of graphs | |
| 331 | #' | |
| 332 | #' @param code Code to create a graph | |
| 333 | #' @param width Width of the plot. | |
| 334 | #' @param height Height of the plot. | |
| 335 | #' | |
| 336 | #' @return Filepath | |
| 337 | #' | |
| 338 | #' @keywords internal | |
| 339 | #' @noRd | |
| 340 | #' | |
| 341 | #' @examples | |
| 342 | #' expect_snapshot_file(save_png(ggplot(mtcars) + | |
| 343 | #' geom_point(aes(hp, mpg))), "riskProfile.png") | |
| 344 | #' | |
| 345 | save_png <- function(code, width = 400, height = 400) { # nocov start | |
| 346 | path <- tempfile(fileext = ".png") | |
| 347 | png(path, width = width, height = height) | |
| 348 | on.exit(dev.off()) | |
| 349 | print(code) | |
| 350 | return(path) | |
| 351 | } # nocov end | 
| 1 | #' Risk profile plot | |
| 2 | #' | |
| 3 | #' Predictiveness curve, PPV, NPV and 1-NPV risk estimates | |
| 4 | #' | |
| 5 | #' @param outcome Vector of binary outcome for each observation. | |
| 6 | #' @param score Numeric vector of continuous predicted risk score. | |
| 7 | #' @param methods Character vector of method names (case-insensitive) for plotting curves, or | |
| 8 | #' a named list whose elements specify an estimation method and its arguments. | |
| 9 | #' Default is set to `"asis"`. | |
| 10 | #' | |
| 11 | #' Full options are: `c("asis", "binned", "pava", "mspline", "gam", "cgam")`. | |
| 12 | #' | |
| 13 | #' To specify arguments per method, use lists. For example: | |
| 14 | #' ``` | |
| 15 | #' list( | |
| 16 | #' pava = list(method = "pava", ties = "primary"), | |
| 17 | #' mspline = list(method = "mspline", fitonPerc = TRUE), | |
| 18 | #' gam = list(method = "gam", bs = "tp", logscores = FALSE), | |
| 19 | #' bin = list(method = "binned", bins = 10), | |
| 20 | #' risk = list(method = "asis") | |
| 21 | #' ) | |
| 22 | #' ``` | |
| 23 | #' See section "Estimation" for more details. | |
| 24 | #' @param prev.adj `NULL` (default) or scalar numeric between 0 and 1 for prevalence adjustment. | |
| 25 | #' @param show.prev Logical, show prevalence value in the graph. Defaults to `TRUE`. | |
| 26 | #' @param show.nonparam.pv Logical, show non-parametric calculation of PVs. Defaults to `TRUE`. | |
| 27 | #' @param show.best.pv Logical, show best possible PVs. Defaults to `TRUE`. | |
| 28 | #' @param include Character vector (case-insensitive, partial matching) specifying what quantities | |
| 29 | #' to include in the plot. | |
| 30 | #' | |
| 31 | #' Default is: `c("PC", "PPV", "1-NPV")`. | |
| 32 | #' | |
| 33 | #' Full options are: `c("NPV", "PC", "PPV", "1-NPV")`. | |
| 34 | #' @param plot.raw Logical; if `TRUE`, plot raw score values instead of percentiles. | |
| 35 | #' Defaults to `FALSE` (i.e. percentiles). | |
| 36 | #' @param rev.order Logical, reverse ordering of scores. Defaults to `FALSE`. | |
| 37 | #' | |
| 38 | #' @section Estimation: | |
| 39 | #' The `methods` argument specifies the estimation method. | |
| 40 | #' You can provide either a vector of strings, any of | |
| 41 | #' ``` | |
| 42 | #' c("asis", "binned", "pava", "mspline", "gam", "cgam") | |
| 43 | #' ``` | |
| 44 | #' (`"asis"` is not available for `calibrationProfile`), | |
| 45 | #' or a named list of lists. | |
| 46 | #' In the latter case, the inner list must have an element "method", | |
| 47 | #' which specifies the estimation function (one of those above), | |
| 48 | #' and optionally other elements, which are passed to the estimation function. | |
| 49 | #' For example: | |
| 50 | #' ``` | |
| 51 | #' list( | |
| 52 | #' gam = list(method = "gam", k = 3), | |
| 53 | #' c_gam = list(method = "cgam", numknots = 3) | |
| 54 | #' ) | |
| 55 | #' ``` | |
| 56 | #' | |
| 57 | #' To see what arguments are available for each estimation method, | |
| 58 | #' see the documentation of that function. | |
| 59 | #' The naming convention is `getXest`, | |
| 60 | #' where `X` stands for the estimation method, for example [getGAMest()]. | |
| 61 | #' | |
| 62 | #' "gam", "cgam", and "mspline" always fit on percentiles by default. | |
| 63 | #' To change this, use `fitonPerc = FALSE`, for example | |
| 64 | #' ``` | |
| 65 | #' list(gam = list(method = "gam", fitonPerc = FALSE)) | |
| 66 | #' ``` | |
| 67 | #' | |
| 68 | #' "gam" and "cgam" methods are wrappers of [mgcv::gam()] and [cgam::cgam()], respectively. | |
| 69 | #' The default values of function arguments (like `k`, the number of knots in [mgcv::s()]) | |
| 70 | #' mirror the package defaults. | |
| 71 | #' | |
| 72 | #' @return A list containing the plot and data, plus `errorbar` data if requested | |
| 73 | #' (through the `"binned"` estimation method with the `errorbar.sem` parameter). | |
| 74 | #' | |
| 75 | #' @export | |
| 76 | #' | |
| 77 | #' @seealso [calibrationProfile()] [sensSpec()] | |
| 78 | #' | |
| 79 | #' [getPAVAest()] [getBINNEDest()] [getGAMest()] [getCGAMest()] [getMSPLINEest()] | |
| 80 | #' [getASISest()] | |
| 81 | #' | |
| 82 | #' @examples | |
| 83 | #' # Read in example data | |
| 84 | #' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc")) | |
| 85 | #' rscore <- auroc$predicted_calibrated | |
| 86 | #' truth <- as.numeric(auroc$actual) | |
| 87 | #' | |
| 88 | #' # Default plot includes 1-NPV, PPV, and a predictiveness curve (PC) based on risk-cutoff | |
| 89 | #' p1 <- riskProfile(outcome = truth, score = rscore) | |
| 90 | #' p1$plot | |
| 91 | #' p1$data | |
| 92 | #' | |
| 93 | #' # Show also NPV | |
| 94 | #' p2 <- riskProfile( | |
| 95 | #' outcome = truth, | |
| 96 | #' score = rscore, | |
| 97 | #'   include = c("PC", "NPV", "PPV", "1-NPV") | |
| 98 | #'   # or use partial matching: include = c("PC", "N", "PPV", "1") | |
| 99 | #' ) | |
| 100 | #' p2$plot | |
| 101 | #' p2$data | |
| 102 | #' | |
| 103 | #' # All estimates of prediction curve | |
| 104 | #' p3 <- riskProfile( | |
| 105 | #' outcome = truth, | |
| 106 | #' score = rscore, | |
| 107 | #'   methods = c("mspline", "gam", "cgam", "binned", "pava", "asis"), | |
| 108 | #'   include = c("PC", "PPV", "1-NPV") | |
| 109 | #' ) | |
| 110 | #' p3$plot | |
| 111 | #' | |
| 112 | #' # Specifying method arguments (note each list has a "method" element) | |
| 113 | #' p4 <- riskProfile( | |
| 114 | #' outcome = truth, | |
| 115 | #' score = rscore, | |
| 116 | #' methods = list( | |
| 117 | #' "gam" = list(method = "gam", bs = "tp", logscores = FALSE, fitonPerc = TRUE), | |
| 118 | #' "risk" = list(method = "asis"), # no available arguments for this method | |
| 119 | #' "bin" = list(method = "binned", quantiles = 10, errorbar.sem = 1.2) | |
| 120 | #' ) | |
| 121 | #' ) | |
| 122 | #' p4$plot | |
| 123 | #' | |
| 124 | #' # Compare multiple GAMs in terms of Predictiveness Curves | |
| 125 | #' p5 <- riskProfile( | |
| 126 | #' outcome = truth, | |
| 127 | #' score = rscore, | |
| 128 | #' methods = list( | |
| 129 | #' "gam_3" = list(method = "gam", k = 3), | |
| 130 | #' "gam_4" = list(method = "gam", k = 4), | |
| 131 | #' "gam_7" = list(method = "gam", k = 7) | |
| 132 | #' ), | |
| 133 | #' include = "PC" | |
| 134 | #' ) | |
| 135 | #' p5$plot | |
| 136 | #' | |
| 137 | #' # Using logistic regression as user-defined estimation function, fitting on percentiles | |
| 138 | #' # Function needs to take exactly these two arguments | |
| 139 | #' my_est <- function(outcome, score) { | |
| 140 | #' # Calculate percentiles | |
| 141 | #' perc <- ecdf(score)(score) | |
| 142 | #' # Fit | |
| 143 | #' m <- glm(outcome ~ perc, family = "binomial") | |
| 144 | #' # Generate predictions | |
| 145 | #' preds <- predict(m, type = "response") | |
| 146 | #' # Return a data.frame with exactly these columns | |
| 147 | #' return( | |
| 148 | #' data.frame( | |
| 149 | #' score = score, | |
| 150 | #' percentile = perc, | |
| 151 | #' outcome = outcome, | |
| 152 | #' estimate = preds | |
| 153 | #' ) | |
| 154 | #' ) | |
| 155 | #' } | |
| 156 | #' p6 <- riskProfile( | |
| 157 | #' outcome = truth, | |
| 158 | #' score = rscore, | |
| 159 | #' methods = list(my_lr = my_est) | |
| 160 | #' ) | |
| 161 | #' p6$plot | |
| 162 | #' | |
| 163 | #' # Using cgam as user-defined estimation function | |
| 164 | #' # Note that you can also use the predefined cgam using methods = "cgam" | |
| 165 | #' # Attach needed library | |
| 166 | #' # Watch out for masking of mgcv::s and cgam::s if both are attached | |
| 167 | #' library(cgam, quietly = TRUE) | |
| 168 | #' # Function needs to take exactly these two arguments | |
| 169 | #' my_est <- function(outcome, score) { | |
| 170 | #' # Fit on raw predictions with space = "E" | |
| 171 | #' m <- cgam( | |
| 172 | #' outcome ~ s.incr(score, numknots = 5, space = "E"), | |
| 173 | #' family = "binomial" | |
| 174 | #' ) | |
| 175 | #' # Generate predictions and convert to vector | |
| 176 | #' preds <- predict(m, type = "response")$fit | |
| 177 | #' # Return a data.frame with exactly these columns | |
| 178 | #' out <- data.frame( | |
| 179 | #' score = score, | |
| 180 | #' percentile = ecdf(score)(score), | |
| 181 | #' outcome = outcome, | |
| 182 | #' estimate = preds | |
| 183 | #' ) | |
| 184 | #' return(out) | |
| 185 | #' } | |
| 186 | #' | |
| 187 | #' p7 <- riskProfile( | |
| 188 | #' outcome = truth, | |
| 189 | #' score = rscore, | |
| 190 | #' methods = list(my_cgam = my_est) | |
| 191 | #' ) | |
| 192 | #' p7$plot | |
| 193 | #' | |
| 194 | #' # Prevalence adjustment to 0.1 | |
| 195 | #' p8 <- riskProfile(outcome = truth, score = rscore, prev.adj = 0.1) | |
| 196 | #' p8$plot | |
| 197 | #' | |
| 198 | riskProfile <- function(outcome, | |
| 199 | score, | |
| 200 | methods = "asis", | |
| 201 | prev.adj = NULL, | |
| 202 | show.prev = TRUE, | |
| 203 | show.nonparam.pv = TRUE, | |
| 204 | show.best.pv = TRUE, | |
| 205 |                         include = c("PC", "PPV", "1-NPV"), | |
| 206 | plot.raw = FALSE, | |
| 207 |                         rev.order = FALSE) { | |
| 208 | ||
| 209 | # Argument checks (except outcome, score, methods - below) | |
| 210 | 15x | checkmate::assert_number(prev.adj, lower = 0, upper = 1, null.ok = TRUE) | 
| 211 | 15x | checkmate::assert_flag(show.nonparam.pv) | 
| 212 | 15x | checkmate::assert_flag(show.best.pv) | 
| 213 | 15x | checkmate::assert_flag(show.prev) | 
| 214 | 15x |   include <- matchArgSubset(toupper(include), choices = c("PC", "PPV", "NPV", "1-NPV")) | 
| 215 | 15x | checkmate::assert_flag(plot.raw) | 
| 216 | 15x | checkmate::assert_flag(rev.order) | 
| 217 | ||
| 218 | # Standardize/Check outcome, scores | |
| 219 | 15x | op <- inputCheck(outcome = outcome, score = score) | 
| 220 | ||
| 221 | # Order Data by scores | |
| 222 | 15x | tempdf <- orderInputs(outcome = op$outcome, score = op$score, rev.order = rev.order) | 
| 223 | 15x | score <- tempdf$score | 
| 224 | 15x | outcome <- tempdf$outcome | 
| 225 | ||
| 226 | # Check methods | |
| 227 | 15x | methods <- methodCheck(methods = methods) | 
| 228 | 15x | method.names <- names(methods) | 
| 229 | ||
| 230 | # Calculate prevalence and percentiles | |
| 231 | 15x | prev <- mean(outcome, na.rm = TRUE) | 
| 232 | ||
| 233 | # Get the plot settings | |
| 234 | 15x | show.pc <- "PC" %in% include | 
| 235 | 15x |   show.pv <- any(c("PPV", "NPV", "1-NPV") %in% include) | 
| 236 | 15x |   show.one.only <- sum(c("PC", "PPV", "NPV", "1-NPV") %in% include) == 1 | 
| 237 | ||
| 238 | 15x |   if (plot.raw) { | 
| 239 | 2x | xvar <- "score" | 
| 240 |   } else { | |
| 241 | 13x | xvar <- "percentile" | 
| 242 | } | |
| 243 | ||
| 244 | # Prediction Curve Data | |
| 245 | 15x |   if (show.pc) { | 
| 246 | 12x | pc.ests <- getEsts(methods = methods, outcome = outcome, score = score) | 
| 247 | 12x | PC.data <- mutate(pc.ests$plotdata, pv = "PC") | 
| 248 | 12x | errorbar.data <- pc.ests$errorbardata | 
| 249 | 12x | step.methods <- method.names[pc.ests$idx.step] | 
| 250 |   } else { | |
| 251 | 3x | PC.data <- data.frame( | 
| 252 | 3x | method = character(0), pv = character(0), | 
| 253 | 3x | percentile = numeric(0), score = numeric(0), estimate = numeric(0) | 
| 254 | ) | |
| 255 | 3x | pc.ests <- errorbar.data <- NULL | 
| 256 | 3x | step.methods <- character(0) | 
| 257 | } | |
| 258 | ||
| 259 | # Predictive Value Data | |
| 260 | 15x |   if (show.pv) { | 
| 261 | 14x | PV.data <- getPVdata(outcome = outcome, score = score, methods = methods, pc.ests = pc.ests) | 
| 262 |   } else { | |
| 263 | 1x | PV.data <- data.frame( | 
| 264 | 1x | method = character(0), score = numeric(0), percentile = numeric(0), | 
| 265 | 1x | outcome = numeric(0), estimate = numeric(0), | 
| 266 | 1x | MNPV = numeric(0), NPV = numeric(0), PPV = numeric(0) | 
| 267 | ) | |
| 268 | } | |
| 269 | ||
| 270 | # show.nonparam.pv | |
| 271 | 15x |   if (show.nonparam.pv) { | 
| 272 | 10x | tmp <- nonParametricPV(outcome = outcome, score = score) %>% | 
| 273 | 10x | mutate(method = "non-parametric", estimate = NA) | 
| 274 | 10x | PV.data <- bind_rows(PV.data, tmp) | 
| 275 | } | |
| 276 | ||
| 277 | # Dataset of inputs | |
| 278 | 15x | df.in <- data.frame(outcome, score, percentile = ecdf(score)(score)) | 
| 279 | ||
| 280 | # Adjust based on user defined prevalence | |
| 281 | 15x |   if (!is.null(prev.adj)) { | 
| 282 | ||
| 283 | 1x | cdf.cases <- ecdf(score[outcome == 1]) | 
| 284 | 1x | cdf.controls <- ecdf(score[outcome == 0]) | 
| 285 | ||
| 286 | 1x | df.in$percentile <- adjPrevPerc( | 
| 287 | 1x | perc = df.in$score, prev.new = prev.adj, | 
| 288 | 1x | cdf.case = cdf.cases, cdf.control = cdf.controls | 
| 289 | ) | |
| 290 | ||
| 291 | 1x |     if (show.pc) { | 
| 292 | 1x | PC.data <- adjPrevPC( | 
| 293 | 1x | dat = PC.data, prev = prev, prev.new = prev.adj, | 
| 294 | 1x | cdf.case = cdf.cases, cdf.control = cdf.controls | 
| 295 | ) | |
| 296 | } | |
| 297 | ||
| 298 | 1x |     if (show.pv) { | 
| 299 | 1x | PV.data <- adjPrevPV( | 
| 300 | 1x | dat = PV.data, prev = prev, prev.new = prev.adj, | 
| 301 | 1x | cdf.case = cdf.cases, cdf.control = cdf.controls | 
| 302 | ) | |
| 303 | } | |
| 304 | ||
| 305 | 1x | prev <- prev.adj | 
| 306 | } | |
| 307 | ||
| 308 | # pivot PV data | |
| 309 | 15x |   if (show.pv) { | 
| 310 | 14x | PV.data <- PV.data %>% | 
| 311 | 14x | rename(`1-NPV` = "MNPV") %>% | 
| 312 | 14x | tidyr::pivot_longer( | 
| 313 | 14x |         cols = all_of(c("1-NPV", "NPV", "PPV")), names_to = "pv", values_to = "pvValue" | 
| 314 | ) %>% | |
| 315 | 14x |       mutate(pv = factor(.data$pv, levels = c("NPV", "1-NPV", "PPV"))) %>% | 
| 316 | 14x | arrange(.data$method, .data$pv, .data$percentile) | 
| 317 |   } else { | |
| 318 | 1x | PV.data <- data.frame( | 
| 319 | 1x | method = character(0), score = numeric(0), percentile = numeric(0), | 
| 320 | 1x | outcome = numeric(0), estimate = numeric(0), | 
| 321 | 1x | pv = character(0), pvValue = numeric(0) | 
| 322 | ) | |
| 323 | } | |
| 324 | ||
| 325 | # If showing percentiles, add row with 0th percentile | |
| 326 | 15x |   if (!plot.raw) { | 
| 327 | 13x |     if (show.pc) { | 
| 328 | 10x | PC.data <- add0thPercPC(PC.data) | 
| 329 | } | |
| 330 | 13x |     if (show.pv) { | 
| 331 | 12x | PV.data <- add0thPercPV(PV.data) | 
| 332 | } | |
| 333 | } | |
| 334 | ||
| 335 | # Subset PC data | |
| 336 | 15x | smoothPC <- PC.data[!PC.data$method %in% step.methods, , drop = FALSE] | 
| 337 | 15x | stepPC <- PC.data[PC.data$method %in% step.methods, , drop = FALSE] | 
| 338 | ||
| 339 | # Subset PV data | |
| 340 | 15x | smoothPV <- PV.data[ | 
| 341 | 15x | PV.data$pv %in% include & !PV.data$method %in% step.methods, , | 
| 342 | 15x | drop = FALSE | 
| 343 | ] | |
| 344 | 15x | stepPV <- PV.data[ | 
| 345 | 15x | PV.data$pv %in% include & PV.data$method %in% step.methods, , | 
| 346 | 15x | drop = FALSE | 
| 347 | ] | |
| 348 | ||
| 349 | # Different aes based on what is to be shown | |
| 350 | # (if one kind of PV value, use both coloring and linetype for distinguishing estimation methods) | |
| 351 | 15x |   if (show.one.only) { | 
| 352 | 2x | aes.pc <- aes( | 
| 353 | 2x | x = .data[[xvar]], y = .data$estimate, | 
| 354 | 2x | colour = .data$method, linetype = .data$method | 
| 355 | ) | |
| 356 | 2x | aes.pv <- aes( | 
| 357 | 2x | x = .data[[xvar]], y = .data$pvValue, | 
| 358 | 2x | colour = .data$method, linetype = .data$method | 
| 359 | ) | |
| 360 |   } else { | |
| 361 | 13x | aes.pc <- aes( | 
| 362 | 13x | x = .data[[xvar]], y = .data$estimate, | 
| 363 | 13x | colour = .data$pv, linetype = .data$method | 
| 364 | ) | |
| 365 | 13x | aes.pv <- aes( | 
| 366 | 13x | x = .data[[xvar]], y = .data$pvValue, | 
| 367 | 13x | colour = .data$pv, linetype = .data$method | 
| 368 | ) | |
| 369 | } | |
| 370 | ||
| 371 | # Build plot | |
| 372 | 15x | p <- ggplot() + | 
| 373 | 15x | geom_line(aes.pc, data = smoothPC, alpha = 0.8) + | 
| 374 | 15x | geom_step(aes.pc, data = stepPC, alpha = 0.8, direction = "vh") + | 
| 375 | 15x | geom_line(aes.pv, data = smoothPV, alpha = 0.8) + | 
| 376 | 15x | geom_step(aes.pv, data = stepPV, alpha = 0.8, direction = "vh") + | 
| 377 | 15x | geom_hline(yintercept = prev, alpha = 0.8, col = "black", linetype = "dashed") | 
| 378 | ||
| 379 | # Best PVs | |
| 380 | 15x |   if (show.best.pv) { | 
| 381 | 11x |     if (!plot.raw) { | 
| 382 | 9x | df.in <- bind_rows(dplyr::tibble(percentile = 0, score = NA), df.in) | 
| 383 | } | |
| 384 | 11x | best <- df.in %>% | 
| 385 | 11x |       select(all_of(c("percentile", "score"))) %>% | 
| 386 | 11x | distinct() %>% | 
| 387 | 11x | arrange(.data$percentile, .data$score) %>% | 
| 388 | 11x | mutate( | 
| 389 | 11x | PC = ifelse(.data$percentile <= 1 - prev, 0, 1), | 
| 390 | 11x | PPV = bestPPV(perc = .data$percentile, prev = prev), | 
| 391 | 11x | `1-NPV` = bestMNPV(perc = .data$percentile, prev = prev), | 
| 392 | 11x | NPV = 1 - .data$`1-NPV` | 
| 393 | ) %>% | |
| 394 | 11x | tidyr::pivot_longer( | 
| 395 | 11x |         cols = c("PC", "PPV", "1-NPV", "NPV"), names_to = "pv", values_to = "pvValue" | 
| 396 | ) %>% | |
| 397 | 11x | filter(.data$pv %in% include) %>% | 
| 398 | 11x | mutate( | 
| 399 | 11x | method = "Best PVs", | 
| 400 | 11x |         pv = paste("Best", .data$pv) | 
| 401 | ) | |
| 402 | ||
| 403 | 11x |     if (show.one.only) { | 
| 404 | 1x | p <- p + | 
| 405 | 1x | geom_line( | 
| 406 | 1x | data = best, | 
| 407 | 1x | aes(x = .data[[xvar]], y = .data$pvValue, linewidth = .data$pv), | 
| 408 | 1x | colour = "gray60" | 
| 409 | ) + | |
| 410 | 1x | scale_linewidth_manual( | 
| 411 | 1x |           values = c("Best 1-NPV" = 0.3, "Best PPV" = 0.3, "Best PC" = 0.3, "Best NPV" = 0.3), | 
| 412 | 1x | name = "Best PVs" | 
| 413 | ) | |
| 414 |     } else { | |
| 415 | 10x | p <- p + | 
| 416 | 10x | geom_line(data = best, aes(x = .data[[xvar]], y = .data$pvValue, colour = .data$pv)) | 
| 417 | } | |
| 418 | } | |
| 419 | ||
| 420 | # Add errorbars | |
| 421 | 15x |   if (show.pc && !is.null(errorbar.data)) { | 
| 422 | 1x | p <- p + | 
| 423 | 1x | geom_point( | 
| 424 | 1x | data = errorbar.data, | 
| 425 | 1x | aes(x = .data$midquantile, y = .data$bin.mid), | 
| 426 | 1x | alpha = 0.8, | 
| 427 | 1x | size = 0.2 | 
| 428 | ) + | |
| 429 | 1x | geom_errorbar( | 
| 430 | 1x | data = errorbar.data, | 
| 431 | 1x | aes( | 
| 432 | 1x | x = .data$midquantile, | 
| 433 | 1x | ymin = .data$bin.low, | 
| 434 | 1x | ymax = .data$bin.high, | 
| 435 | 1x | width = .02 | 
| 436 | ), | |
| 437 | 1x | alpha = 0.7, | 
| 438 | 1x | linewidth = 0.2, | 
| 439 | 1x | inherit.aes = FALSE | 
| 440 | ) | |
| 441 | } | |
| 442 | ||
| 443 | # Set always the same colours for PVs | |
| 444 | 15x |   if (!show.one.only) { | 
| 445 | 13x | clrs <- predictionColours(include, show.best = show.best.pv) | 
| 446 | 13x | p <- p + scale_colour_manual(values = clrs, breaks = names(clrs)) | 
| 447 |   } else { | |
| 448 | 2x | p <- p + scale_colour_hue(l = 45) | 
| 449 | } | |
| 450 | ||
| 451 | # Finalize plot | |
| 452 | 15x | p <- p + | 
| 453 | 15x | labs( | 
| 454 | 15x | title = "Predictiveness Plot", | 
| 455 | 15x | x = ifelse(plot.raw, "Prediction Score", "Risk Percentile"), | 
| 456 | 15x | y = "Predicted Risk / Predictive Value", | 
| 457 | 15x | linetype = "Estimation Method", | 
| 458 | 15x | colour = ifelse(show.one.only, "Estimation Method", "Predictive Quantity") | 
| 459 | ) + | |
| 460 | 15x | scale_x_continuous(n.breaks = 6) + | 
| 461 | 15x | scale_y_continuous(n.breaks = 6) + | 
| 462 | 15x | theme_bw() + | 
| 463 | 15x | theme(legend.key.width = unit(2, "line")) | 
| 464 |  | |
| 465 | # Add prevalence annotation if requested | |
| 466 | 15x |   if (show.prev) { | 
| 467 | # x-value for plotting prevalence label | |
| 468 | 15x | prev_x <- ifelse(plot.raw, min(score), 0) | 
| 469 | 15x | prev_nudge_x <- ifelse(plot.raw, (max(score) - min(score)) / 10, 0.1) | 
| 470 | 15x | prev_nudge_y <- ggplot2::layer_scales(p)$y$get_limits()[2] / 10 | 
| 471 |  | |
| 472 | 15x | p <- p + annotate( | 
| 473 | 15x | geom = "text", | 
| 474 | 15x | x = prev_x + prev_nudge_x, | 
| 475 | 15x | y = ifelse(prev > 0.8, prev - prev_nudge_y, prev + prev_nudge_y), | 
| 476 | 15x |       label = paste0("Prevalence: ", "\n", round(prev, 3)), | 
| 477 | 15x | colour = "black", | 
| 478 | 15x | alpha = 0.8, | 
| 479 | 15x | size = 3.5 | 
| 480 | ) | |
| 481 | } | |
| 482 | ||
| 483 | 15x |   if (show.one.only) { | 
| 484 | 2x | p <- p + guides(colour = guide_legend(order = 1), linetype = guide_legend(order = 1)) | 
| 485 |   } else { | |
| 486 | 13x | p <- p + guides(colour = guide_legend(order = 1), linetype = guide_legend(order = 2)) | 
| 487 | } | |
| 488 | ||
| 489 | 15x |   if (show.best.pv) { | 
| 490 | 11x |     PV.data <- bind_rows(PV.data, mutate(best, pv = gsub("Best ", "", .data$pv))) | 
| 491 | } | |
| 492 | ||
| 493 | 15x | return( | 
| 494 | 15x | list( | 
| 495 | 15x | plot = p, | 
| 496 | 15x | data = bind_rows( | 
| 497 | 15x | dplyr::as_tibble(PC.data), | 
| 498 | 15x | dplyr::as_tibble(PV.data) | 
| 499 | ), | |
| 500 | 15x | errorbar = errorbar.data | 
| 501 | ) | |
| 502 | ) | |
| 503 | } | 
| 1 | ||
| 2 | # Helper function to check the user input - bins and quantiles arguments | |
| 3 | checkBinsQuantiles <- function(bins, quantiles, score) { | |
| 4 | ||
| 5 | 15x | checkmate::assert( | 
| 6 | 15x | checkmate::check_numeric(bins, any.missing = FALSE, sorted = TRUE), | 
| 7 | 15x | checkmate::check_null(bins) | 
| 8 | ) | |
| 9 | 15x | checkmate::assert( | 
| 10 | 15x | checkmate::check_integerish(quantiles, lower = 1), | 
| 11 | 15x | checkmate::check_null(quantiles) | 
| 12 | ) | |
| 13 | ||
| 14 | # Check combination of quantiles and bins | |
| 15 | 15x |   if (!is.null(quantiles) && !is.null(bins)) { | 
| 16 | 1x |     stop("bins and quantiles cannot be specified together, choose one and set the other to NULL") | 
| 17 | } | |
| 18 | ||
| 19 | 14x |   if (is.null(quantiles) && is.null(bins)) { | 
| 20 | 5x | quantiles <- 10 | 
| 21 | } | |
| 22 | ||
| 23 | # Further check for bins | |
| 24 | 14x |   if (length(bins) > 1) { | 
| 25 | 4x |     if (bins[1] > min(score)) { | 
| 26 | 1x |       stop(paste("The first element of bins must be <= min(score), not", bins[1])) | 
| 27 | } | |
| 28 | 3x |     if (bins[length(bins)] < max(score)) { | 
| 29 | 1x |       stop(paste("The last element of bins must be >= max(score), not", bins[length(bins)])) | 
| 30 | } | |
| 31 | } | |
| 32 | ||
| 33 | # Scalar bins -> number of intervals | |
| 34 | 12x |   if (length(bins) == 1 && bins <= 1) { | 
| 35 | 1x |     stop("bins must be > 1 when provided as a scalar (i.e. number of bins)") | 
| 36 | } | |
| 37 | ||
| 38 | # Check discrete score | |
| 39 | 11x | lu <- length(unique(score)) | 
| 40 | 11x |   if (length(bins) == 1 && bins > lu) { | 
| 41 | 1x | warning( | 
| 42 | 1x | paste0( | 
| 43 | 1x |         "The number of `bins` (", bins, ") ", | 
| 44 | 1x |         "is > the number of unique score values (", lu, "). ", | 
| 45 | 1x | "The results may be unreliable." | 
| 46 | ) | |
| 47 | ) | |
| 48 | } | |
| 49 | 11x |   if (length(bins) > 1 && length(bins) - 1 > lu) { | 
| 50 | 1x | warning( | 
| 51 | 1x | paste0( | 
| 52 | 1x |         "The number of `bins` (", length(bins) - 1, ") ", | 
| 53 | 1x |         "is > the number of unique score values (", lu, "). ", | 
| 54 | 1x | "The results may be unreliable." | 
| 55 | ) | |
| 56 | ) | |
| 57 | } | |
| 58 | 11x |   if (!is.null(quantiles) && lu <= 10) { | 
| 59 | 1x | warning( | 
| 60 | 1x | "Using the quantile method for non-continuous score. ", | 
| 61 | 1x | "The results may be unreliable." | 
| 62 | ) | |
| 63 | } | |
| 64 | ||
| 65 | 11x | return(list(bins = bins, quantiles = quantiles)) | 
| 66 | } | |
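| | | |
| | # Illustrative sketch (inputs chosen for illustration): | |
| | # checkBinsQuantiles(bins = NULL, quantiles = NULL, score = runif(50))  # defaults to 10 quantiles | |
| | # checkBinsQuantiles(bins = 5, quantiles = NULL, score = runif(50))     # 5 evenly spaced bins | |
| | # checkBinsQuantiles(bins = 5, quantiles = 10, score = runif(50))       # error: specify only one | |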
| 67 | ||
| 68 | ||
| 69 | # Helper function to check the user input - errorbar.sem argument | |
| 70 | checkErrorbarSem <- function(errorbar.sem) { | |
| 71 | 7x | checkmate::assert( | 
| 72 | 7x | checkmate::check_number(errorbar.sem, lower = 0), # this checks >= 0 | 
| 73 | 7x | checkmate::check_null(errorbar.sem) | 
| 74 | ) | |
| 75 | 7x |   if (!is.null(errorbar.sem)) { | 
| 76 | 2x |     stopifnot("`errorbar.sem` must be > 0" = errorbar.sem > 0) | 
| 77 | } | |
| 78 | 7x | return(errorbar.sem) | 
| 79 | } | |
| 80 | ||
| 81 | ||
| 82 | #' Binned Risk Estimates | |
| 83 | #' | |
| 84 | #' Calculates bins based on a given number of evenly spaced bins or n-tiles, | |
| 85 | #' then determines the average risk within each bin, which is used as the risk estimate. | |
| 86 | #' | |
| 87 | #' @inheritParams riskProfile | |
| 88 | #' @param quantiles Numeric; number of quantile-based bins (n-tiles). | |
| 89 | #' @param bins Numeric; a scalar giving the number of evenly spaced bins, or a vector of bin boundaries. | |
| 90 | #' @param right Logical indicating right-closed intervals. Defaults to `TRUE`. | |
| 91 | #' @param errorbar.sem Scalar numeric; the number of standard errors of the mean | |
| 92 | #' (SEM) used to calculate the risk error bars. | |
| 93 | #' | |
| 94 | #' @return A data frame with 4 columns | |
| 95 | #' (score, score percentile, outcome, estimate). | |
| 96 | #' Additionally, there is an attribute "errorbar" holding the error-bar data if | |
| 97 | #' `errorbar.sem` was specified. | |
| 98 | #' | |
| 99 | #' @seealso [getASISest()] [getCGAMest()] [getGAMest()] [getMSPLINEest()] [getPAVAest()] | |
| 100 | #' | |
| 101 | #' @export | |
| 102 | #' | |
| 103 | #' @examples | |
| 104 | #' # Read in example data | |
| 105 | #' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc")) | |
| 106 | #' rscore <- auroc$predicted | |
| 107 | #' truth <- as.numeric(auroc$actual) | |
| 108 | #' | |
| 109 | #' getBINNEDest(outcome = truth, score = rscore) | |
| 110 | #' | |
| 111 | getBINNEDest <- function(outcome, | |
| 112 | score, | |
| 113 | quantiles = NULL, | |
| 114 | bins = NULL, | |
| 115 | right = TRUE, | |
| 116 |                          errorbar.sem = NULL) { | |
| 117 | ||
| 118 | # Argument checks | |
| 119 | 7x | checkmate::assert_numeric(outcome) | 
| 120 | 7x | checkmate::assert_numeric(score, len = length(outcome)) | 
| 121 | 7x | checkmate::assert_flag(right) | 
| 122 | 7x | errorbar.sem <- checkErrorbarSem(errorbar.sem) | 
| 123 | ||
| 124 | # Check bins and quantiles | |
| 125 | 7x | bqp <- checkBinsQuantiles(bins = bins, quantiles = quantiles, score = score) | 
| 126 | ||
| 127 | # Retrieve data summaries | |
| 128 | 7x | df <- getSummaries( | 
| 129 | 7x | outcome = outcome, score = score, | 
| 130 | 7x | quantiles = bqp$quantiles, bins = bqp$bins, | 
| 131 | 7x | right = right | 
| 132 | ) | |
| 133 | ||
| 134 | 7x |   if (!is.null(errorbar.sem)) { | 
| 135 | 2x | errorbar <- getERRORest(binlvl = df[["binlvl"]], z = errorbar.sem) %>% | 
| 136 | 2x | mutate( | 
| 137 | 2x | percentile = df[["binlvl"]][["riskpercentile"]], | 
| 138 | 2x | bin.mid = df[["binlvl"]][["avg.outcome"]] | 
| 139 | ) | |
| 140 |   } else { | |
| 141 | 5x | errorbar <- NULL | 
| 142 | } | |
| 143 | ||
| 144 | # Create binned.data | |
| 145 | 7x | binned.data <- data.frame( | 
| 146 | 7x | score = df[["binlvl"]][["avg.risk"]], | 
| 147 | 7x | percentile = df[["binlvl"]][["riskpercentile"]], | 
| 148 | 7x | outcome = NA, | 
| 149 | 7x | estimate = df[["binlvl"]][["avg.outcome"]] | 
| 150 | ) | |
| 151 | 7x | attr(binned.data, "errorbar") <- errorbar | 
| 152 | ||
| 153 | 7x | return(binned.data) | 
| 154 | } | |
| 155 | ||
| 156 | ||
| 157 | ||
| 158 | #' Get Summaries: observation level & bin level dataframes | |
| 159 | #' | |
| 160 | #' Given vectors of outcomes and risk scores, the function returns a list of | |
| 161 | #' observation-level and bin-level summary data frames. | |
| 162 | #' | |
| 163 | #' Observation level dataframe has columns for outcome, riskscore, risk percentile, bin number, | |
| 164 | #' and corresponding minimum and maximum score for that bin. | |
| 165 | #' | |
| 166 | #' Bin level dataframe has columns indicating bin number and the observation count, | |
| 167 | #' number of events, average outcome, average risk, and standard deviation of risk, | |
| 168 | #' within each of the bins. Risk percentile and bin intervals are also provided. | |
| 169 | #' | |
| 170 | #' @return List of observation level and bin level dataframes. | |
| 171 | #' | |
| 172 | #' @keywords internal | |
| 173 | #' @noRd | |
| 174 | #' | |
| 175 | #' @examples | |
| 176 | #' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc")) | |
| 177 | #' truth <- as.numeric(auroc$actual) | |
| 178 | #' rscore <- auroc$predicted | |
| 179 | #' | |
| 180 | #' # Bin by quantiles | |
| 181 | #' getSummaries( | |
| 182 | #' outcome = truth, | |
| 183 | #' score = rscore, | |
| 184 | #' quantiles = 10 | |
| 185 | #' ) | |
| 186 | #' | |
| 187 | #' # Bin by specific percentiles | |
| 188 | #' getSummaries( | |
| 189 | #' outcome = truth, | |
| 190 | #' score = rscore, | |
| 191 | #'   quantiles = NULL, | |
| 192 | #'   bins = c(0, 0.25, 0.5, 0.8, 1) | |
| 193 | #' ) | |
| 194 | #' | |
| 195 | getSummaries <- function(outcome, | |
| 196 | score, | |
| 197 | quantiles = NULL, | |
| 198 | bins = NULL, | |
| 199 |                          right = TRUE) { | |
| 200 | ||
| 201 | 16x | stopifnot( | 
| 202 | 16x | !is.null(quantiles) | !is.null(bins), | 
| 203 | 16x | quantiles != 0 | bins != 0, | 
| 204 | 16x | is.logical(right) | 
| 205 | ) | |
| 206 | ||
| 207 | 15x | cdf.fit <- ecdf(score) | 
| 208 | ||
| 209 | 15x |   if (!is.null(quantiles)) { | 
| 210 | 12x | bin.int <- Hmisc::cut2(score, g = quantiles) | 
| 211 | ||
| 212 | 3x |   } else if (length(bins) > 1) { | 
| 213 | ! | bin.int <- cut(cdf.fit(score), breaks = bins, include.lowest = TRUE, right = right) | 
| 214 | ||
| 215 | 3x |   } else if (length(bins) == 1) { | 
| 216 | 2x | bin.int <- cut(score, breaks = bins, include.lowest = TRUE, right = right) | 
| 217 | ||
| 218 |   } else { | |
| 219 | 1x |     stop("Unrecognized option") | 
| 220 | } | |
| 221 |  | |
| 222 | # get numeric label of interval | |
| 223 | 14x | bin.num <- as.numeric(bin.int) | 
| 224 | ||
| 225 | # get interval borders (min, max) | |
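| | # (strip punctuation other than "-", "," and "." from labels like "[0.12,0.35)", split on ",", | |
| | #  and duplicate single-value labels so both a min and a max are always available) | |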
| 226 | 14x | min_max <- strsplit( | 
| 227 | 14x |     gsub("(?![-,.])[[:punct:]]", "", trimws(as.character(bin.int)), perl = TRUE), | 
| 228 | "," | |
| 229 | ) | |
| 230 | 14x | min_max <- lapply( | 
| 231 | 14x | min_max, \(x) `if`(length(x) < 2, rep(x, 2), x) | 
| 232 | ) | |
| 233 | ||
| 234 | # Observation level data: outcome, rscore, observation percentile, and interval | |
| 235 | 14x | obslvl <- data.frame( | 
| 236 | 14x | outcome = outcome, | 
| 237 | 14x | score = score, | 
| 238 | 14x | riskpercentile = cdf.fit(score), | 
| 239 | 14x | bin = bin.num, | 
| 240 | 14x | interval = as.character(bin.int), | 
| 241 | 14x | min = vapply(min_max, "[[", character(1), 1), | 
| 242 | 14x | max = vapply(min_max, "[[", character(1), 2) | 
| 243 | ) %>% | |
| 244 | 14x | arrange(.data$score) | 
| 245 | ||
| 246 | # Bin level data: within each bin: n, avg risk, sd risk, quantile, error bar | |
| 247 | 14x | binlvl <- obslvl %>% | 
| 248 | 14x | group_by(.data$bin, .data$interval) %>% | 
| 249 | 14x | summarise( | 
| 250 | 14x | n = dplyr::n(), | 
| 251 | 14x | events = sum(.data$outcome), | 
| 252 | 14x | avg.outcome = mean(.data$outcome), | 
| 253 | 14x | sd.outcome = sd(.data$outcome, na.rm = TRUE), | 
| 254 | 14x | avg.risk = mean(.data$score, na.rm = TRUE), | 
| 255 | 14x | sd.risk = sd(.data$score), | 
| 256 | 14x | riskpercentile = max(.data$riskpercentile), | 
| 257 | 14x | .groups = "drop" | 
| 258 | ) | |
| 259 | ||
| 260 | # Replace NAs with 0 | |
| 261 | 14x | binlvl[is.na(binlvl)] <- 0 | 
| 262 | ||
| 263 | 14x | return(list(obslvl = obslvl, binlvl = binlvl)) | 
| 264 | } | |
| 265 | ||
| 266 | ||
| 267 | # ERROR BAR Estimates | |
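| | # For each bin: bin.low = avg.outcome - z * sd.outcome / sqrt(n) (floored at 0) and | |
| | # bin.high = avg.outcome + z * sd.outcome / sqrt(n), | |
| | # with midquantile placed halfway through the bin on the risk-percentile scale. | |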
| 268 | getERRORest <- function(binlvl, z) { | |
| 269 | ||
| 270 | 2x | stopifnot( | 
| 271 | 2x | is.data.frame(binlvl), | 
| 272 | 2x | is.numeric(z) | 
| 273 | ) | |
| 274 | ||
| 275 | 2x | binlvl %>% | 
| 276 | 2x | mutate( | 
| 277 | 2x | bin.low = .data$avg.outcome - (z * (.data$sd.outcome / sqrt(.data$n))), | 
| 278 | 2x | bin.low = ifelse(.data$bin.low < 0, 0, .data$bin.low), | 
| 279 | 2x | bin.high = .data$avg.outcome + (z * (.data$sd.outcome / sqrt(.data$n))), | 
| 280 | 2x | midquantile = .data$riskpercentile - (diff(c(0, .data$riskpercentile)) / 2) | 
| 281 | ) %>% | |
| 282 | 2x |     select(all_of(c("midquantile", "bin.high", "bin.low"))) %>% | 
| 283 | 2x | tidyr::replace_na(list(bin.high = 0, bin.low = 0)) | 
| 284 | } | |
| 285 | ||
| 286 | ||
| 287 | #' PAVA Risk Estimates | |
| 288 | #' | |
| 289 | #' Determines isotonic regression estimates via PAVA, given a vector of binary outcomes | |
| 290 | #' and a vector of scores. | |
| 291 | #' | |
| 292 | #' @inheritParams riskProfile | |
| 293 | #' @param weights Vector of numerics to specify PAVA observation weighting. | |
| 294 | #' @param ties String to specify how ties should be handled for PAVA. | |
| 295 | #' @param low_events Numeric, specifying number of events in the lowest bin. | |
| 296 | #' @param low_nonevents Numeric, specifying number of nonevents in the lowest bin. | |
| 297 | #' @param high_events Numeric, specifying number of events in the highest bin. | |
| 298 | #' @param high_nonevents Numeric, specifying number of nonevents in the highest bin. | |
| 299 | #' @param hilo_obs Numeric, specifying number of observations in the highest and lowest bins. | |
| 300 | #' | |
| 301 | #' @return A data frame with 4 columns | |
| 302 | #' (score, score percentile, outcome, estimate). | |
| 303 | #' | |
| 304 | #' @seealso [getASISest()] [getBINNEDest()] [getCGAMest()] [getGAMest()] [getMSPLINEest()] | |
| 305 | #' | |
| 306 | #' @export | |
| 307 | #' | |
| 308 | #' @examples | |
| 309 | #' # Read in example data | |
| 310 | #' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc")) | |
| 311 | #' rscore <- auroc$predicted | |
| 312 | #' truth <- as.numeric(auroc$actual) | |
| 313 | #' | |
| 314 | #' tail(getPAVAest(outcome = truth, score = rscore), 10) | |
| 315 | #' | |
| 316 | getPAVAest <- function(outcome, | |
| 317 | score, | |
| 318 | weights = rep(1, length(outcome)), | |
| 319 | ties = "primary", | |
| 320 | low_events = NULL, | |
| 321 | low_nonevents = NULL, | |
| 322 | high_events = NULL, | |
| 323 | high_nonevents = NULL, | |
| 324 |                        hilo_obs = NULL) { | |
| 325 | ||
| 326 | 5x | checkmate::assert_numeric(outcome) | 
| 327 | 5x | checkmate::assert_numeric(score, len = length(outcome)) | 
| 328 | 5x | checkmate::assert_numeric(weights, any.missing = FALSE, len = length(outcome)) | 
| 329 | 5x | checkmate::assert_character(ties, any.missing = FALSE, len = 1) | 
| 330 | 5x | checkmate::assert( | 
| 331 | 5x | checkmate::check_integerish(low_events, lower = 1, any.missing = FALSE, len = 1), | 
| 332 | 5x | checkmate::check_null(low_events) | 
| 333 | ) | |
| 334 | 5x | checkmate::assert( | 
| 335 | 5x | checkmate::check_integerish(low_nonevents, lower = 1, any.missing = FALSE, len = 1), | 
| 336 | 5x | checkmate::check_null(low_nonevents) | 
| 337 | ) | |
| 338 | 5x | checkmate::assert( | 
| 339 | 5x | checkmate::check_integerish(high_events, lower = 1, any.missing = FALSE, len = 1), | 
| 340 | 5x | checkmate::check_null(high_events) | 
| 341 | ) | |
| 342 | 5x | checkmate::assert( | 
| 343 | 5x | checkmate::check_integerish(high_nonevents, lower = 1, any.missing = FALSE, len = 1), | 
| 344 | 5x | checkmate::check_null(high_nonevents) | 
| 345 | ) | |
| 346 | 5x | checkmate::assert( | 
| 347 | 5x | checkmate::check_integerish(hilo_obs, lower = 1, any.missing = FALSE, len = 1), | 
| 348 | 5x | checkmate::check_null(hilo_obs) | 
| 349 | ) | |
| 350 | ||
| 351 | 5x | pava.est <- isotone::gpava(z = score, y = outcome, weights = weights, ties = ties)$x | 
| 352 | ||
| 353 | # if constrained, then replace percentiles.... | |
| 354 | 5x | check <- any( | 
| 355 | 5x | is.numeric(low_events), is.numeric(low_nonevents), is.numeric(high_events), | 
| 356 | 5x | is.numeric(high_nonevents), is.numeric(hilo_obs) | 
| 357 | ) | |
| 358 | 5x |   if (check) { | 
| 359 | ! | percentile <- getConstraints( | 
| 360 | ! | outcome = outcome, rscore = score, | 
| 361 | ! | low_events = low_events, low_nonevents = low_nonevents, | 
| 362 | ! | high_events = high_events, high_nonevents = high_nonevents, | 
| 363 | ! | hilo_obs = hilo_obs | 
| 364 | ) | |
| 365 |   } else { | |
| 366 | 5x | percentile <- ecdf(score)(score) | 
| 367 | } | |
| 368 | ||
| 369 | 5x | return(data.frame(score, percentile, outcome, estimate = pava.est)) | 
| 370 | } | |
| 371 | ||
| 372 | ||
| 373 | #' Constrained Risk Percentile Estimates | |
| 374 | #' | |
| 375 | #' Adjusts PAVA risk percentile estimates for the first and last bin, to meet criteria | |
| 376 | #' for events, non-events, or total observation count. | |
| 377 | #' | |
| 378 | #' @inheritParams riskProfile | |
| 379 | #' | |
| 380 | #' @return A vector of constrained risk percentiles. | |
| 381 | #' | |
| 382 | #' @keywords internal | |
| 383 | #' @noRd | |
| 384 | #' | |
| 385 | #' @examples | |
| 386 | #' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc")) | |
| 387 | #' truth <- as.numeric(auroc$actual) | |
| 388 | #' rscore <- auroc$predicted | |
| 389 | #' | |
| 390 | #' getConstraints(outcome = truth, rscore = rscore, low_events = 3, high_nonevents = 3) | |
| 391 | #' | |
| 392 | getConstraints <- function(outcome, | |
| 393 | rscore, | |
| 394 | low_events = NULL, # min events in lower bin (useful for a PC) | |
| 395 | low_nonevents = NULL, # min non-events in lower bin | |
| 396 | high_events = NULL, # min events in upper bin | |
| 397 | high_nonevents = NULL, # min non-events in upper bin (useful for a PC) | |
| 398 |                            hilo_obs = NULL) { # min total obs in upper AND lower bin | |
| 399 | ||
| 400 | 1x | n <- length(rscore) | 
| 401 | 1x | rscore <- seq_along(rscore) / length(rscore) | 
| 402 | # rscore_cons: scores with binning constraint (same as rscore if no constraint specified) | |
| 403 | 1x | rscore_cons <- rscore | 
| 404 | ||
| 405 | ! | if (!is.null(low_events) && low_events == 0) low_events <- NULL | 
| 406 | ! | if (!is.null(low_nonevents) && low_nonevents == 0) low_nonevents <- NULL | 
| 407 | ! | if (!is.null(high_events) && high_events == 0) high_events <- NULL | 
| 408 | ! | if (!is.null(high_nonevents) && high_nonevents == 0) high_nonevents <- NULL | 
| 409 | ! | if (!is.null(hilo_obs) && hilo_obs == 0) hilo_obs <- NULL | 
| 410 | ||
| 411 | 1x |   if (is.numeric(low_events) && is.numeric(low_nonevents)) { | 
| 412 | ! | warning( | 
| 413 | ! | paste( | 
| 414 | ! | "Specified both a minimum number of events and non-events for the lower bin.", | 
| 415 | ! | "Combining for total observations instead." | 
| 416 | ) | |
| 417 | ) | |
| 418 | ! | hilo_obs <- round(low_events + low_nonevents) | 
| 419 | ! | rscore_cons[1:hilo_obs] <- min(rscore) | 
| 420 | ||
| 421 | 1x |   } else if (is.numeric(high_events) && is.numeric(high_nonevents)) { | 
| 422 | ! | warning( | 
| 423 | ! | paste( | 
| 424 | ! | "Specified both a minimum number of events and non-events for the upper bin.", | 
| 425 | ! | "Combining for total observations instead." | 
| 426 | ) | |
| 427 | ) | |
| 428 | ! | hilo_obs <- round(high_events + high_nonevents) | 
| 429 | ! | rscore_cons[(n + 1 - hilo_obs):n] <- max(rscore) | 
| 430 | ||
| 431 |   } else { | |
| 432 | # Apply upper / lower constraints | |
| 433 | 1x |     if (is.numeric(low_events)) { | 
| 434 | 1x | low_events <- round(low_events) | 
| 435 | 1x | indlo <- match(TRUE, cumsum(outcome) == low_events) | 
| 436 | 1x | rscore_cons[1:indlo] <- min(rscore) | 
| 437 | } | |
| 438 | 1x |     if (is.numeric(low_nonevents)) { | 
| 439 | ! | low_nonevents <- round(low_nonevents) | 
| 440 | ! | indlo <- match(TRUE, cumsum(1 - outcome) == low_nonevents) | 
| 441 | ! | rscore_cons[1:indlo] <- min(rscore) | 
| 442 | } | |
| 443 | 1x |     if (is.numeric(high_nonevents)) { | 
| 444 | 1x | high_nonevents <- round(high_nonevents) | 
| 445 | 1x | indhi <- match(TRUE, cumsum(1 - outcome[n:1]) == high_nonevents) | 
| 446 | 1x | rscore_cons[(n + 1 - indhi):n] <- max(rscore) | 
| 447 | } | |
| 448 | ||
| 449 | 1x |     if (is.numeric(high_events)) { | 
| 450 | ! | high_events <- round(high_events) | 
| 451 | ! | indhi <- match(TRUE, cumsum(outcome[n:1]) == high_events) | 
| 452 | ! | rscore_cons[(n + 1 - indhi):n] <- max(rscore) | 
| 453 | } | |
| 454 | ||
| 455 | 1x |     if (is.numeric(hilo_obs)) { | 
| 456 | ! | hilo_obs <- round(hilo_obs) | 
| 457 | ! | rscore_cons[1:hilo_obs] <- min(rscore) | 
| 458 | ! | rscore_cons[(n + 1 - hilo_obs):n] <- max(rscore) | 
| 459 | } | |
| 460 | } | |
| 461 | ||
| 462 | 1x | return(as.vector(rscore_cons)) | 
| 463 | } | |
| 464 | ||
| 465 | ||
| 466 | #' GAM Risk Estimates | |
| 467 | #' | |
| 468 | #' Fits a Generalized Additive Model to estimate risk, given a vector of binary outcomes | |
| 469 | #' and a vector of scores. | |
| 470 | #' | |
| 471 | #' @inheritParams riskProfile | |
| 472 | #' @param k Numeric to specify the upper limit of basis functions to fit for GAM. | |
| 473 | #' See [mgcv::s()] for more details. Defaults to -1. | |
| 474 | #' @param bs Character string to specify spline type. | |
| 475 | #' See [mgcv::s()] for more details. Defaults to `"tp"`. | |
| 476 | #' @param method Character string to specify the smoothing parameter estimation method. | |
| 477 | #' See [mgcv::gam()] for more details. Defaults to `"REML"`. | |
| 478 | #' @param logscores Logical; if `TRUE`, fit gam on log scores. Defaults to `FALSE`. | |
| 479 | #' @param fitonPerc Logical; if `TRUE`, fit gam on risk percentiles. Defaults to `TRUE`. | |
| 480 | #' | |
| 481 | #' @return A data frame with 4 columns | |
| 482 | #' (score, score percentile, outcome, estimate). | |
| 483 | #' | |
| 484 | #' @seealso [getASISest()] [getBINNEDest()] [getCGAMest()] [getMSPLINEest()] [getPAVAest()] | |
| 485 | #' | |
| 486 | #' @export | |
| 487 | #' | |
| 488 | #' @examples | |
| 489 | #' # Read in example data | |
| 490 | #' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc")) | |
| 491 | #' rscore <- auroc$predicted | |
| 492 | #' truth <- as.numeric(auroc$actual) | |
| 493 | #' | |
| 494 | #' tail(getGAMest(outcome = truth, score = rscore), 10) | |
| 495 | #' | |
| 496 | getGAMest <- function(outcome, | |
| 497 | score, | |
| 498 | k = -1, | |
| 499 | bs = "tp", | |
| 500 | method = "REML", | |
| 501 | logscores = FALSE, | |
| 502 |                       fitonPerc = TRUE) { | |
| 503 | ||
| 504 | 23x | checkmate::assert_numeric(outcome) | 
| 505 | 23x | checkmate::assert_numeric(score, len = length(outcome)) | 
| 506 | 23x | checkmate::assert_number(k) | 
| 507 | 23x | checkmate::assert_character(bs, any.missing = FALSE, len = 1) | 
| 508 | 23x | checkmate::assert_character(method, any.missing = FALSE, len = 1) | 
| 509 | 23x | checkmate::assert_flag(logscores) | 
| 510 | 23x | checkmate::assert_flag(fitonPerc) | 
| 511 | ||
| 512 | 23x | mygrid <- ecdf(score)(score) | 
| 513 | 23x | df <- data.frame(outcome = outcome, score = score, perc = mygrid) | 
| 514 | ||
| 515 | # mgcv::s does not work in formula, need to define it here | |
| 516 | 23x | s <- mgcv::s | 
| 517 | ||
| 518 | 23x |   if (fitonPerc && logscores) { | 
| 519 | 1x | formula <- outcome ~ s(log(perc), k = k, bs = bs) | 
| 520 | 22x |   } else if (fitonPerc && !logscores) { | 
| 521 | 15x | formula <- outcome ~ s(perc, k = k, bs = bs) | 
| 522 | 7x |   } else if (!fitonPerc && logscores) { | 
| 523 | 1x | formula <- outcome ~ s(log(score), k = k, bs = bs) | 
| 524 |   } else { | |
| 525 | 6x | formula <- outcome ~ s(score, k = k, bs = bs) | 
| 526 | } | |
| 527 | ||
| 528 | 23x | gam.fit <- mgcv::gam(formula, data = df, family = "binomial", method = method) | 
| 529 | ||
| 530 | 23x | gam.est <- mgcv::predict.gam(gam.fit, type = "response") | 
| 531 | ||
| 532 | 23x | return(data.frame(score, percentile = mygrid, outcome, estimate = gam.est)) | 
| 533 | } | |
| 534 | ||
| 535 | ||
| 536 | ||
| 537 | #' Constrained GAM (cgam) Risk Estimates | |
| 538 | #' | |
| 539 | #' Fits a Constrained Generalized Additive Model to estimate risk, | |
| 540 | #' given a vector of binary outcomes and a vector of scores. | |
| 541 | #' | |
| 542 | #' @inheritParams riskProfile | |
| 543 | #' @param numknots Numeric to specify the number of knots. | |
| 544 | #' Passed to the `smoother` function. Defaults to 0. | |
| 545 | #' @param smoother Character string to specify the smoother (from cgam package). | |
| 546 | #' Defaults to "s.incr". | |
| 547 | #' @param logscores Logical; if `TRUE`, fit gam on log scores. Defaults to `FALSE`. | |
| 548 | #' @param fitonPerc Logical; if `TRUE`, fit gam on risk percentiles. Defaults to `TRUE`. | |
| 549 | #' | |
| 550 | #' @return A data frame with 4 columns | |
| 551 | #' (score, score percentile, outcome, estimate). | |
| 552 | #' | |
| 553 | #' @seealso [getASISest()] [getBINNEDest()] [getGAMest()] [getMSPLINEest()] [getPAVAest()] | |
| 554 | #' | |
| 555 | #' @export | |
| 556 | #' | |
| 557 | #' @examples | |
| 558 | #' # Read in example data | |
| 559 | #' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc")) | |
| 560 | #' rscore <- auroc$predicted | |
| 561 | #' truth <- as.numeric(auroc$actual) | |
| 562 | #' | |
| 563 | #' tail(getCGAMest(outcome = truth, score = rscore), 10) | |
| 564 | #' | |
| 565 | getCGAMest <- function(outcome, | |
| 566 | score, | |
| 567 | numknots = 0, | |
| 568 | smoother = "s.incr", | |
| 569 | logscores = FALSE, | |
| 570 |                        fitonPerc = TRUE) { | |
| 571 | ||
| 572 | 3x | checkmate::assert_numeric(outcome) | 
| 573 | 3x | checkmate::assert_numeric(score, len = length(outcome)) | 
| 574 | 3x | checkmate::assert_number(numknots) | 
| 575 | 3x | checkmate::assert_character(smoother, any.missing = FALSE, len = 1) | 
| 576 | 3x | checkmate::assert_flag(logscores) | 
| 577 | 3x | checkmate::assert_flag(fitonPerc) | 
| 578 | ||
| 579 | 3x | mygrid <- ecdf(score)(score) | 
| 580 | 3x | df <- data.frame(outcome = outcome, score = score, perc = mygrid) | 
| 581 | ||
| 582 | # cgam smoothers (e.g. cgam::s.incr) cannot be referenced via :: inside the formula, need to assign them here | |
| 583 | 3x | assign(smoother, do.call(`::`, list(pkg = "cgam", name = smoother))) | 
| 584 | ||
| 585 | 3x | formula <- as.formula( | 
| 586 | 3x | paste0( | 
| 587 | 3x | "outcome ~ ", | 
| 588 | 3x |       smoother, "(", | 
| 589 | 3x |       `if`(logscores, "log("), | 
| 590 | 3x | `if`(fitonPerc, "perc", "score"), | 
| 591 | 3x | `if`(logscores, ")"), | 
| 592 | 3x | ", numknots = ", numknots, ")" | 
| 593 | ) | |
| 594 | ) | |
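| | # For illustration: with the defaults (smoother = "s.incr", fitonPerc = TRUE, | |
| | # logscores = FALSE, numknots = 0), the paste0() above yields the formula | |
| | #   outcome ~ s.incr(perc, numknots = 0) | |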
| 595 | ||
| 596 | 3x | cgam.fit <- cgam::cgam(formula, data = df, family = "binomial") | 
| 597 | ||
| 598 | 3x | cgam.est <- cgam::predict.cgam(cgam.fit, type = "response")$fit | 
| 599 | ||
| 600 | 3x | return(data.frame(score, percentile = mygrid, outcome, estimate = cgam.est)) | 
| 601 | } | |
| 602 | ||
| 603 | ||
| 604 | ### MSPLINE ESTIMATES ### | |
| 605 | ||
| 606 | # Function written by willtownes, joseph paulson. | |
| 607 | mspline <- function(x, y, k = 10, lower = NA, upper = NA) { | |
| 608 | # fits a monotonic spline to data | |
| 609 | # small values of k= more smoothing (flatter curves) | |
| 610 | # large values of k= more flexible (wiggly curves) | |
| 611 | # k is related to effective degrees of freedom and number of knots | |
| 612 | # use unconstrained gam to get rough parameter estimates | |
| 613 | # lower, upper optional bounds on the function | |
| 614 | # basically a slight modification of an example in the mgcv::pcls documentation | |
| 615 | 6x | dat <- data.frame(x = x, y = y) | 
| 616 | 6x | s <- mgcv::s # mgcv::s does not work in formula, need to define it here | 
| 617 | 6x | init_gam <- mgcv::gam(y ~ s(x, k = k, bs = "cr")) | 
| 618 | # Create Design matrix, constraints etc. for monotonic spline.... | |
| 619 | 6x | sm <- mgcv::smoothCon(s(x, k = k, bs = "cr"), dat, knots = NULL)[[1]] | 
| 620 | 6x | mc <- mgcv::mono.con(sm$xp, lower = lower, upper = upper) # monotonicity constraints | 
| 621 | 6x | M <- list( | 
| 622 | 6x | X = sm$X, y = y, # design matrix, outcome | 
| 623 | 6x | C = matrix(0, 0, 0), # equality constraints (none) | 
| 624 | 6x | Ain = mc$A, bin = mc$b, # inequality constraints | 
| 625 | 6x | sp = init_gam$sp, p = sm$xp, # initial guesses for param estimates | 
| 626 | 6x | S = sm$S, # smoothness penalty matrix | 
| 627 | 6x | w = y * 0 + 1, off = 0 # weights, offset | 
| 628 | ) | |
| 629 | # fit spline using penalized constrained least squares | |
| 630 | 6x | p <- mgcv::pcls(M) | 
| 631 | 6x | return(list(sm = sm, p = p)) | 
| 632 | } | |
| 633 | ||
| 634 | # Function written by joseph paulson | |
| 635 | predict.mspline <- function(msp, x) { | |
| 636 | # using the monotone spline msp, predict values for the vector x | |
| 637 | 6x | as.vector(mgcv::Predict.matrix(msp$sm, data.frame(x = x)) %*% msp$p) | 
| 638 | } | |
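||
| | # Minimal usage sketch of the two helpers above (illustrative only; the objects | |
| | # `x`, `y`, and `fit` below are hypothetical and not created by the package): | |
| | #   x <- seq(0, 1, length.out = 50) | |
| | #   y <- as.numeric(runif(50) < x) | |
| | #   fit <- mspline(x = x, y = y, k = 10) | |
| | #   head(predict.mspline(fit, x)) | |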
| 639 | ||
| 640 | ||
| 641 | ||
| 642 | #' Monotone Spline Risk Estimates | |
| 643 | #' | |
| 644 | #' Fits a Monotone constrained Generalized Additive Model (GAM) to estimate risk, | |
| 645 | #' given a vector of binary outcomes and a vector of scores. | |
| 646 | #' | |
| 647 | #' @inheritParams getGAMest | |
| 648 | #' | |
| 649 | #' @return A data frame with 4 columns | |
| 650 | #' (score, score percentile, outcome, estimate). | |
| 651 | #' | |
| 652 | #' @seealso [getASISest()] [getBINNEDest()] [getCGAMest()] [getGAMest()] [getPAVAest()] | |
| 653 | #' | |
| 654 | #' @export | |
| 655 | #' | |
| 656 | #' @examples | |
| 657 | #' # Read in example data | |
| 658 | #' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc")) | |
| 659 | #' rscore <- auroc$predicted | |
| 660 | #' truth <- as.numeric(auroc$actual) | |
| 661 | #' | |
| 662 | #' tail(getMSPLINEest(outcome = truth, score = rscore), 10) | |
| 663 | #' | |
| 664 | getMSPLINEest <- function(outcome, | |
| 665 | score, | |
| 666 | k = 10, | |
| 667 |                           fitonPerc = TRUE) { | |
| 668 | ||
| 669 | 6x | checkmate::assert_numeric(outcome) | 
| 670 | 6x | checkmate::assert_numeric(score, len = length(outcome)) | 
| 671 | 6x | checkmate::assert_integerish(k, len = 1, any.missing = FALSE) | 
| 672 | 6x | checkmate::assert_flag(fitonPerc) | 
| 673 | ||
| 674 | 6x | stopifnot(!is.null(k), is.logical(fitonPerc)) | 
| 675 | ||
| 676 | 6x | scorefit <- ecdf(score) | 
| 677 | 6x | mygrid <- scorefit(score) | 
| 678 | ||
| 679 | 6x |   if (!fitonPerc) { | 
| 680 | 1x | fitspl <- mspline(x = score, y = outcome, k = k) | 
| 681 | 1x | mspline.est <- predict.mspline(fitspl, score) | 
| 682 |   } else { | |
| 683 | 5x | fitspl <- mspline(x = mygrid, y = outcome, k = k) | 
| 684 | 5x | mspline.est <- predict.mspline(fitspl, mygrid) | 
| 685 | } | |
| 686 | ||
| 687 | 6x | mspline.est[mspline.est < 0] <- 0 | 
| 688 | 6x | mspline.est[mspline.est > 1] <- 1 | 
| 689 | ||
| 690 | 6x | return(data.frame(score, percentile = mygrid, outcome, estimate = mspline.est)) | 
| 691 | } | |
| 692 | ||
| 693 | ||
| 694 | #' "As is" estimates | |
| 695 | #' | |
| 696 | #' This function does no estimation, but uses the score as it is | |
| 697 | #' (it works like an identity function). | |
| 698 | #' | |
| 699 | #' @inheritParams getGAMest | |
| 700 | #' | |
| 701 | #' @return A data frame with 4 columns | |
| 702 | #' (score, score percentile, outcome, estimate). | |
| 703 | #' | |
| 704 | #' @seealso [getBINNEDest()] [getCGAMest()] [getGAMest()] [getMSPLINEest()] [getPAVAest()] | |
| 705 | #' | |
| 706 | #' @export | |
| 707 | #' | |
| 708 | #' @examples | |
| 709 | #' # Read in example data | |
| 710 | #' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc")) | |
| 711 | #' rscore <- auroc$predicted | |
| 712 | #' truth <- as.numeric(auroc$actual) | |
| 713 | #' | |
| 714 | #' tail(getASISest(outcome = truth, score = rscore), 10) | |
| 715 | #' | |
| 716 | getASISest <- function(outcome, score) { | |
| 717 | # Argument checks | |
| 718 | 17x | checkmate::assert_numeric(outcome) | 
| 719 | 17x | checkmate::assert_numeric(score, len = length(outcome)) | 
| 720 | 17x | return( | 
| 721 | 17x | data.frame( | 
| 722 | 17x | score = score, | 
| 723 | 17x | percentile = ecdf(score)(score), | 
| 724 | 17x | outcome, | 
| 725 | 17x | estimate = score | 
| 726 | ) | |
| 727 | ) | |
| 728 | } | |
| 729 | ||
| 730 | ||
| 731 | # Returns Risk Estimates. | |
| 732 | # This function calls all the other getXXXest functions | |
| 733 | # Used for Predictiveness Curve Data | |
| 734 | getEsts <- function(methods, outcome, score) { | |
| 735 | ||
| 736 | # check methods | |
| 737 | 41x | stopifnot(is.list(methods)) | 
| 738 | ||
| 739 | # Get function names (and perform checks) | |
| 740 | 38x | fun.names <- getEstMethods(methods, with.names = TRUE) | 
| 741 | ||
| 742 | # Run estimations | |
| 743 | 38x | m.est <- lapply(methods, \(x) getEst(x, outcome = outcome, score = score)) | 
| 744 | ||
| 745 | # Special case: get errorbar data if existing | |
| 746 | 36x | idx.binned <- fun.names == "binned" | 
| 747 | 36x | m.er <- lapply( | 
| 748 | 36x | which(idx.binned), | 
| 749 | 36x | \(i) attr(m.est[[i]], "errorbar") | 
| 750 | ) | |
| 751 | ||
| 752 | # Bind together and convert to long data frame | |
| 753 | 36x | m.est <- bind_rows(m.est, .id = "method") | 
| 754 | 36x | rownames(m.est) <- NULL | 
| 755 | ||
| 756 | # Bind error bar data together and convert to a long data frame; if empty, return NULL | |
| 757 | 36x | m.error <- bind_rows(m.er, .id = "method") | 
| 758 | 36x |   if (nrow(m.error) == 0) { | 
| 759 | 35x | m.error <- NULL | 
| 760 | } | |
| 761 | ||
| 762 | # Return indexes of step methods and asis methods | |
| 763 | 36x |   idx.step <- fun.names %in% c("binned", "pava") | 
| 764 | 36x | names(idx.step) <- names(fun.names) | 
| 765 | 36x | idx.asis <- fun.names == "asis" | 
| 766 | ||
| 767 | # Check if asis was called multiple times | |
| 768 | 36x |   if (sum(idx.asis) > 1) { | 
| 769 | 1x |     stop("Please use 'asis' just once (as it does not have any further arguments).") | 
| 770 | } | |
| 771 | ||
| 772 | 35x | return( | 
| 773 | 35x | list( | 
| 774 | 35x | plotdata = m.est, errorbardata = m.error, | 
| 775 | 35x | idx.step = idx.step, idx.asis = idx.asis, | 
| 776 | 35x | idx.binned = idx.binned, idx.pava = fun.names == "pava" | 
| 777 | ) | |
| 778 | ) | |
| 779 | } | |
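||
| | # Illustrative call (hypothetical `outcome` / `score` vectors), mixing a plain | |
| | # method name with a list specification: | |
| | #   getEsts( | |
| | #     methods = list(gam = "gam", pava = list(method = "pava")), | |
| | #     outcome = outcome, score = score | |
| | #   ) | |
| | # returns `plotdata` in long format, optional `errorbardata`, and the method-type indexes. | |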
| 780 | ||
| 781 | # Define estimation functions | |
| 782 | est.funs <- function() { | |
| 783 | 104x | list( | 
| 784 | 104x | gam = getGAMest, | 
| 785 | 104x | cgam = getCGAMest, | 
| 786 | 104x | mspline = getMSPLINEest, | 
| 787 | 104x | binned = getBINNEDest, | 
| 788 | 104x | pava = getPAVAest, | 
| 789 | 104x | asis = getASISest | 
| 790 | ) | |
| 791 | } | |
| 792 | ||
| 793 | # Generic and S3 methods for estimations | |
| 794 | getEst <- function(x, outcome, score) { | |
| 795 | 54x |   UseMethod("getEst") | 
| 796 | } | |
| 797 | ||
| 798 | ||
| 799 | getEst.list <- function(x, outcome, score) { | |
| 800 | 14x | do.call( | 
| 801 | 14x | est.funs()[[x[["method"]]]], | 
| 802 | 14x | append(list(outcome = outcome, score = score), x[names(x) != "method"]) | 
| 803 | ) | |
| 804 | } | |
| 805 | ||
| 806 | ||
| 807 | getEst.character <- function(x, outcome, score) { | |
| 808 | 36x | do.call( | 
| 809 | 36x | est.funs()[[x]], | 
| 810 | 36x | list(outcome = outcome, score = score) | 
| 811 | ) | |
| 812 | } | |
| 813 | ||
| 814 | ||
| 815 | getEst.function <- function(x, outcome, score) { | |
| 816 | ||
| 817 | # Run estimation | |
| 818 | 4x | out <- x(outcome = outcome, score = score) | 
| 819 | ||
| 820 | # Check output | |
| 821 | 4x | check1 <- is.data.frame(out) && | 
| 822 | 4x |     identical(c("estimate", "outcome", "percentile", "score"), sort(colnames(out))) | 
| 823 | 4x |   if (!check1) { | 
| 824 | 1x | stop( | 
| 825 | 1x | paste( | 
| 826 | 1x | "User defined estimation functions must return a data.frame of 4 columns:", | 
| 827 | 1x | "score - the predictions,", | 
| 828 | 1x | "percentile - the percentile of score,", | 
| 829 | 1x | "outcome - the original outcome,", | 
| 830 | 1x | "and estimate - the estimated value" | 
| 831 | ) | |
| 832 | ) | |
| 833 | } | |
| 834 |  | |
| 835 | 3x | check2 <- vapply(out, is.numeric, FUN.VALUE = logical(1), USE.NAMES = TRUE) | 
| 836 | 3x |   if (!all(check2)) { | 
| 837 | 1x | stop( | 
| 838 | 1x | paste( | 
| 839 | 1x | "All columns of the returned data.frame in user defined estimation function", | 
| 840 | 1x | "must be numeric.", | 
| 841 | 1x |         paste0("`", names(which(!check2)), "`", collapse = ", "), "is/are not numeric." | 
| 842 | ) | |
| 843 | ) | |
| 844 | } | |
| 845 | ||
| 846 | 2x | return(out) | 
| 847 | } | |
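||
| | # Sketch of a user-defined estimation function that satisfies the contract | |
| | # checked above (hypothetical example, not part of the package): | |
| | #   myEst <- function(outcome, score) { | |
| | #     data.frame( | |
| | #       score = score, | |
| | #       percentile = ecdf(score)(score), | |
| | #       outcome = outcome, | |
| | #       estimate = fitted(glm(outcome ~ score, family = "binomial")) | |
| | #     ) | |
| | #   } | |
| | # Such a function can then be supplied in `methods`, e.g. methods = list(lr = myEst). | |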
| 848 | ||
| 849 | # Generic and S3 methods for estimation method as string (actual estimation function used) | |
| 850 | getEstMethod <- function(x) { | |
| 851 | 67x |   UseMethod("getEstMethod") | 
| 852 | } | |
| 853 | ||
| 854 | #' @export | |
| 855 | getEstMethod.character <- function(x) { | |
| 856 | 43x | x | 
| 857 | } | |
| 858 | ||
| 859 | #' @export | |
| 860 | getEstMethod.list <- function(x) { | |
| 861 | 19x | x[["method"]] | 
| 862 | } | |
| 863 | ||
| 864 | #' @export | |
| 865 | getEstMethod.function <- function(x) { | |
| 866 | 5x | "udf" | 
| 867 | } | |
| 868 | ||
| 869 | getEstMethods <- function(x, with.names) { | |
| 870 | 48x | vapply(x, getEstMethod, FUN.VALUE = character(1), USE.NAMES = with.names) | 
| 871 | } | 
| 1 | #' Calibration plot | |
| 2 | #' | |
| 3 | #' Calibration curve risk estimates | |
| 4 | #' | |
| 5 | #' @inheritParams riskProfile | |
| 6 | #' @param methods Character vector of method names (case-insensitive) for plotting curves or | |
| 7 | #' a named list where elements are method function and its arguments. | |
| 8 | #' Default is set to `list(gam = list(method = "gam", fitonPerc = FALSE))`. | |
| 9 | #' | |
| 10 | #' Full options are: `c("binned", "pava", "mspline", "gam", "cgam")`. | |
| 11 | #' | |
| 12 | #' To specify arguments per method, use lists. For example: | |
| 13 | #' ``` | |
| 14 | #' list( | |
| 15 | #' pava = list(method = "pava", ties = "primary"), | |
| 16 | #' mspline = list(method = "mspline", fitonPerc = TRUE), | |
| 17 | #' gam = list(method = "gam", bs = "tp", logscores = FALSE), | |
| 18 | #' bin = list(method = "binned", bins = 10) | |
| 19 | #' ) | |
| 20 | #' ``` | |
| 21 | #' See section "Estimation" for more details. | |
| 22 | #' @param include Character vector (case-insensitive, partial matching) or `NULL` specifying | |
| 23 | #' what quantities to include in the plot. | |
| 24 | #' | |
| 25 | #' Default is: `c("loess", "citl")`. | |
| 26 | #' | |
| 27 | #' Full options are: `c("loess", "citl", "rug", "datapoints")` or `NULL`. | |
| 28 | #' "loess" adds a Loess fit, "citl" stands for "Calibration in the large", | |
| 29 | #' "rug" adds rug ticks of `score` by `outcome` (top x-axis: `score` for `outcome == 1`, | |
| 30 | #' bottom x-axis: `score` for `outcome == 0`), | |
| 31 | #' "datapoints" adds jittered `score` by `outcome` (slightly shifted away from 0 / 1 y-values), | |
| 32 | #' "`NULL`" stands for no extra information. | |
| 33 | #' @param plot.raw Logical; show raw `score` values (`TRUE`) or score percentiles (`FALSE`). | |
| 34 | #' Defaults to `TRUE` (i.e. raw `score`). | |
| 35 | #' @param rev.order Logical to reverse ordering of scores. Defaults to `FALSE`. | |
| 36 | #' @param margin.type Type of additional margin plot, can be one of | |
| 37 | #' `c("density", "histogram", "boxplot", "violin", "densigram")`. | |
| 38 | #' See [ggExtra::ggMarginal()] for more details. | |
| 39 | #' @param ... Additional arguments passed to [ggExtra::ggMarginal()]. | |
| 40 | #' | |
| 41 | #' @inheritSection riskProfile Estimation | |
| 42 | #' | |
| 43 | #' @return A list containing the plot, the plotting data, and the calibration-in-the-large (`citl`) data. | |
| 44 | #' | |
| 45 | #' @export | |
| 46 | #' | |
| 47 | #' @seealso [riskProfile()] [sensSpec()] | |
| 48 | #' | |
| 49 | #' [getPAVAest()] [getBINNEDest()] [getGAMest()] [getCGAMest()] [getMSPLINEest()] | |
| 50 | #' [getASISest()] | |
| 51 | #' | |
| 52 | #' @examples | |
| 53 | #' # Read in example data | |
| 54 | #' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc")) | |
| 55 | #' rscore <- auroc$predicted_calibrated | |
| 56 | #' truth <- as.numeric(auroc$actual) | |
| 57 | #' | |
| 58 | #' # Default calibration plot | |
| 59 | #' p1 <- calibrationProfile(outcome = truth, score = rscore) | |
| 60 | #' p1$plot | |
| 61 | #' | |
| 62 | #' # Specifying multiple estimation methods | |
| 63 | #' # By default, all the methods fit on percentiles | |
| 64 | #' calibrationProfile( | |
| 65 | #' outcome = truth, | |
| 66 | #' score = rscore, | |
| 67 | #'   methods = c("gam", "mspline", "binned") | |
| 68 | #' )$plot | |
| 69 | #' | |
| 70 | #' # Specifying multiple estimation methods with parameters | |
| 71 | #' calibrationProfile( | |
| 72 | #' outcome = truth, | |
| 73 | #' score = rscore, | |
| 74 | #' methods = list( | |
| 75 | #' gam = list(method = "gam", fitonPerc = FALSE, k = 3), | |
| 76 | #' mspline = list(method = "mspline"), | |
| 77 | #' bin = list(method = "binned", quantiles = 5) | |
| 78 | #' ) | |
| 79 | #' )$plot | |
| 80 | #' | |
| 81 | #' # Additional quantities and marginal histogram with specified number of bins | |
| 82 | #' calibrationProfile( | |
| 83 | #' outcome = truth, | |
| 84 | #' score = rscore, | |
| 85 | #'   include = c("rug", "datapoints", "citl"), | |
| 86 | #'   # or use partial matching: include = c("r", "d", "c"), | |
| 87 | #' margin.type = "histogram", | |
| 88 | #' bins = 100 # passed to ggExtra::ggMarginal | |
| 89 | #' )$plot | |
| 90 | #' | |
| 91 | calibrationProfile <- function(outcome, | |
| 92 | score, | |
| 93 | methods = list(gam = list(method = "gam", fitonPerc = FALSE)), | |
| 94 |                                include = c("loess", "citl"), | |
| 95 | plot.raw = TRUE, | |
| 96 | rev.order = FALSE, | |
| 97 | margin.type = NULL, | |
| 98 |                                ...) { | |
| 99 | ||
| 100 | # Argument checks (except outcome, score, methods - below) | |
| 101 | 10x | checkmate::assert( | 
| 102 | 10x | checkmate::check_character(include), | 
| 103 | 10x | checkmate::check_null(include) | 
| 104 | ) | |
| 105 | 10x |   if (is.character(include)) { | 
| 106 | 9x |     include <- matchArgSubset(tolower(include), choices = c("loess", "citl", "rug", "datapoints")) | 
| 107 | } | |
| 108 | 10x | checkmate::assert_flag(plot.raw) | 
| 109 | 10x | checkmate::assert_flag(rev.order) | 
| 110 | 10x | checkmate::assert( | 
| 111 | 10x | checkmate::check_character(margin.type, len = 1, any.missing = FALSE), | 
| 112 | 10x | checkmate::check_null(margin.type) | 
| 113 | ) | |
| 114 | ||
| 115 | 10x |   if (plot.raw) { | 
| 116 | 9x | xvar <- "score" | 
| 117 |   } else { | |
| 118 | 1x | xvar <- "percentile" | 
| 119 | } | |
| 120 | ||
| 121 | # Standardize/Check outcome, scores | |
| 122 | 10x | op <- inputCheck(outcome = outcome, score = score) | 
| 123 | ||
| 124 | # Order Data by scores | |
| 125 | 10x | tempdf <- orderInputs(outcome = op$outcome, score = op$score, rev.order = rev.order) | 
| 126 | 10x | score <- tempdf$score | 
| 127 | 10x | outcome <- tempdf$outcome | 
| 128 | ||
| 129 | # Check methods | |
| 130 | 10x | methods <- methodCheck(methods = methods) | 
| 131 | 10x | method.names <- names(methods) | 
| 132 | 10x |   if ("asis" %in% getEstMethods(methods, with.names = FALSE)) { | 
| 133 | 1x |     stop('"asis" method is not suitable for this plot. Please remove it.') | 
| 134 | } | |
| 135 | ||
| 136 | # Get estimates | |
| 137 | 9x | pc.ests <- getEsts(methods = methods, outcome = outcome, score = score) | 
| 138 | 9x | PC.data <- pc.ests$plotdata | 
| 139 | 9x | step.methods <- method.names[pc.ests$idx.step] | 
| 140 | ||
| 141 | # Calculate percentiles | |
| 142 | 9x | ecdf.score <- ecdf(score) | 
| 143 | 9x | percentile <- ecdf.score(score) | 
| 144 | ||
| 145 | # Calculate Calibration in the large | |
| 146 | 9x | citl.data <- data.frame( | 
| 147 | 9x | outcome = mean(outcome), | 
| 148 | 9x | score = mean(score), | 
| 149 | 9x | percentile = ecdf.score(mean(score)), | 
| 150 | 9x | method = "Calibration In The Large" | 
| 151 | ) | |
| 152 | ||
| 153 | # Subset PC data for plotting | |
| 154 | 9x | smoothPC <- PC.data[!PC.data$method %in% step.methods, , drop = FALSE] | 
| 155 | 9x | stepPC <- PC.data[PC.data$method %in% step.methods, , drop = FALSE] | 
| 156 | ||
| 157 | # data.frame with user inputs | |
| 158 | 9x | ddf <- data.frame(score, percentile, outcome) | 
| 159 | ||
| 160 | # Shape type storage | |
| 161 | 9x | shapes <- c() | 
| 162 | ||
| 163 | # Add empty scatterplot layer for ggMarginal | |
| 164 | 9x |   if (!is.null(margin.type)) { | 
| 165 | 1x | p <- ggplot() + | 
| 166 | 1x | geom_point( | 
| 167 | 1x | data = ddf, | 
| 168 | 1x | aes(x = .data[[xvar]], y = .data$outcome), shape = NA, na.rm = TRUE | 
| 169 | ) | |
| 170 |   } else { | |
| 171 | 8x | p <- ggplot() | 
| 172 | } | |
| 173 | ||
| 174 | # Build plot | |
| 175 | 9x | p <- p + | 
| 176 | 9x | geom_line( | 
| 177 | 9x | data = smoothPC, | 
| 178 | 9x | aes(x = .data[[xvar]], y = .data$estimate, linetype = .data$method, colour = .data$method), | 
| 179 | 9x | alpha = 0.8, | 
| 180 | 9x | linewidth = 0.5 | 
| 181 | ) + | |
| 182 | 9x | geom_step( | 
| 183 | 9x | data = stepPC, | 
| 184 | 9x | aes(x = .data[[xvar]], y = .data$estimate, linetype = .data$method, colour = .data$method), | 
| 185 | 9x | direction = "vh", | 
| 186 | 9x | alpha = 0.8, | 
| 187 | 9x | linewidth = 0.5 | 
| 188 | ) + | |
| 189 | 9x | geom_abline( | 
| 190 | 9x | aes(slope = 1, intercept = 0, linewidth = "Identity line"), | 
| 191 | 9x | colour = "gray50", | 
| 192 | 9x | linetype = "solid" | 
| 193 | ) | |
| 194 | ||
| 195 | # Add loess if requested | |
| 196 | 9x |   if ("loess" %in% include) { | 
| 197 | 8x | p <- p + geom_smooth( | 
| 198 | 8x | data = ddf, | 
| 199 | 8x | aes(x = .data[[xvar]], y = .data$outcome, linetype = "loess", colour = "loess"), | 
| 200 | 8x | method = "loess", formula = y ~ x, se = FALSE, | 
| 201 | 8x | linewidth = 0.5 | 
| 202 | ) | |
| 203 | } | |
| 204 | ||
| 205 | # Add calibration in the large if requested | |
| 206 | 9x |   if ("citl" %in% include) { | 
| 207 | 8x | p <- p + geom_point( | 
| 208 | 8x | data = citl.data, | 
| 209 | 8x | aes(x = .data[[xvar]], y = .data$outcome, shape = .data$method), | 
| 210 | 8x | colour = "red", | 
| 211 | 8x | size = 3, | 
| 212 | 8x | stroke = 1 | 
| 213 | ) | |
| 214 | 8x |     shapes <- c(shapes, c("Calibration In The Large" = 4)) | 
| 215 | } | |
| 216 | ||
| 217 | # Add datapoints if requested | |
| 218 | 9x |   if ("datapoints" %in% include) { | 
| 219 | 1x | p <- p + geom_jitter( | 
| 220 | 1x | data = data.frame( | 
| 221 | 1x | score, | 
| 222 | 1x | percentile, | 
| 223 | 1x | outcome = ifelse(outcome == 0, -0.1, 1.1), | 
| 224 | 1x | method = "Data points" | 
| 225 | ), | |
| 226 | 1x | aes(x = .data[[xvar]], y = .data$outcome, shape = .data$method), | 
| 227 | 1x | colour = "black", | 
| 228 | 1x | size = 1.5, | 
| 229 | 1x | alpha = 0.4, | 
| 230 | 1x | position = position_jitter(seed = 5, height = 0.03) | 
| 231 | ) | |
| 232 | 1x |     shapes <- c(shapes, c("Data points" = 16)) | 
| 233 | } | |
| 234 | ||
| 235 | # Add rug if requested | |
| 236 | 9x |   if ("rug" %in% include) { | 
| 237 | 1x | p <- p + geom_rug( | 
| 238 | 1x | data = ddf[ddf$outcome == 0, ], | 
| 239 | 1x | aes(x = .data[[xvar]]), | 
| 240 | 1x | sides = "b", | 
| 241 | 1x | show.legend = FALSE | 
| 242 | 1x | ) + geom_rug( | 
| 243 | 1x | data = ddf[ddf$outcome == 1, ], | 
| 244 | 1x | aes(x = .data[[xvar]]), | 
| 245 | 1x | sides = "t", | 
| 246 | 1x | show.legend = FALSE | 
| 247 | ) | |
| 248 | } | |
| 249 | ||
| 250 | # Fix shape legend ... | |
| 251 | 9x |   if (all(c("citl", "datapoints") %in% include)) { | 
| 252 | 1x | shape_guide <- guide_legend( | 
| 253 | 1x | override.aes = list( | 
| 254 | 1x |         colour = c("Calibration In The Large" = "red", "Data points" = "black"), | 
| 255 | 1x | alpha = 1, size = 2.5 | 
| 256 | ), | |
| 257 | 1x | order = 3 | 
| 258 | ) | |
| 259 | 8x |   } else if (any(c("citl", "datapoints") %in% include)) { | 
| 260 | 7x | shape_guide <- guide_legend( | 
| 261 | 7x | override.aes = list(alpha = 1, size = 2.5), | 
| 262 | 7x | order = 3 | 
| 263 | ) | |
| 264 |   } else { | |
| 265 | 1x | shape_guide <- NULL | 
| 266 | } | |
| 267 | ||
| 268 | # Finalize graph | |
| 269 | 9x | p <- p + | 
| 270 | 9x |     scale_linewidth_manual(values = c("Identity line" = 0.5)) + | 
| 271 | 9x | scale_shape_manual(values = shapes) + | 
| 272 | 9x | scale_colour_hue(l = 45) + | 
| 273 | 9x | scale_x_continuous(n.breaks = 6) + | 
| 274 | 9x | scale_y_continuous(n.breaks = 6) + | 
| 275 | 9x | labs( | 
| 276 | 9x | title = "Calibration Plot", | 
| 277 | 9x | x = "Predicted Probability", | 
| 278 | 9x | y = "Observed", | 
| 279 | 9x | linetype = "Estimation Method", | 
| 280 | 9x | linewidth = NULL, | 
| 281 | 9x | colour = "Estimation Method", | 
| 282 | 9x |       shape = `if`(all(c("citl", "datapoints") %in% include), "Points", NULL) | 
| 283 | ) + | |
| 284 | 9x | theme_bw() + | 
| 285 | 9x | theme(legend.key.width = unit(2, "line")) + | 
| 286 | 9x | guides( | 
| 287 | 9x | linetype = guide_legend(order = 1), | 
| 288 | 9x | colour = guide_legend(order = 1), | 
| 289 | 9x | linewidth = guide_legend(order = 2), | 
| 290 | 9x | shape = shape_guide | 
| 291 | ) | |
| 292 | ||
| 293 | # Add margin plot | |
| 294 | 9x |   if (!is.null(margin.type)) { | 
| 295 | 1x | p <- ggExtra::ggMarginal(p, type = margin.type, margins = "x", ...) | 
| 296 | } | |
| 297 | ||
| 298 | 9x | return(list(plot = p, data = dplyr::as_tibble(PC.data), citl = citl.data)) | 
| 299 | } | 
| 1 | #' Sensitivity and specificity plot | |
| 2 | #' | |
| 3 | #' Sensitivity and specificity risk estimates | |
| 4 | #' | |
| 5 | #' Given individual binary outcomes and scores, this function plots sensitivity and specificity | |
| 6 | #' (using each score value as a cutoff) against the corresponding score percentiles. | |
| 7 | #' | |
| 8 | #' @inheritParams riskProfile | |
| 9 | #' @param show.best Logical; Include best possible sensitivity and specificity? Defaults to `TRUE`. | |
| 10 | #' | |
| 11 | #' @inheritSection riskProfile Estimation | |
| 12 | #' | |
| 13 | #' @return A list containing the plot and data. | |
| 14 | #' | |
| 15 | #' @export | |
| 16 | #' | |
| 17 | #' @seealso [riskProfile()] [calibrationProfile()] | |
| 18 | #' | |
| 19 | #' [getPAVAest()] [getBINNEDest()] [getGAMest()] [getCGAMest()] [getMSPLINEest()] | |
| 20 | #' [getASISest()] | |
| 21 | #' | |
| 22 | #' @examples | |
| 23 | #' # Read in example data | |
| 24 | #' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc")) | |
| 25 | #' rscore <- auroc$predicted_calibrated | |
| 26 | #' truth <- as.numeric(auroc$actual) | |
| 27 | #' | |
| 28 | #' # Plot sensitivity and specificity | |
| 29 | #' p1 <- sensSpec(outcome = truth, score = rscore) | |
| 30 | #' p1$plot | |
| 31 | #' | |
| 32 | #' # Same with smoothed estimates | |
| 33 | #' p2 <- sensSpec(outcome = truth, score = rscore, methods = c("asis", "gam")) | |
| 34 | #' p2$plot | |
| 35 | #' | |
| 36 | sensSpec <- function(outcome, | |
| 37 | score, | |
| 38 | methods = "asis", | |
| 39 | show.best = TRUE, | |
| 40 | plot.raw = FALSE, | |
| 41 |                      rev.order = FALSE) { | |
| 42 | ||
| 43 | # Argument checks | |
| 44 | 7x | checkmate::assert_flag(show.best) | 
| 45 | 7x | checkmate::assert_flag(plot.raw) | 
| 46 | 7x | checkmate::assert_flag(rev.order) | 
| 47 | ||
| 48 | # Check methods | |
| 49 | 7x | methods <- methodCheck(methods = methods) | 
| 50 | ||
| 51 | 7x |   if (plot.raw) { | 
| 52 | 2x | xvar <- "score" | 
| 53 |   } else { | |
| 54 | 5x | xvar <- "percentile" | 
| 55 | } | |
| 56 | ||
| 57 | # Standardize/Check outcome, scores | |
| 58 | 7x | op <- inputCheck(outcome = outcome, score = score) | 
| 59 | ||
| 60 | # Order Data by scores | |
| 61 | 7x | tempdf <- orderInputs(outcome = op$outcome, score = op$score, rev.order = rev.order) | 
| 62 | 7x | score <- tempdf$score | 
| 63 | 7x | outcome <- tempdf$outcome | 
| 64 | ||
| 65 | # Calculate spec and sens | |
| 66 | 7x | dat <- getEsts(outcome = outcome, score = score, methods = methods)$plotdata | 
| 67 | 7x | dat <- split(dat, dat$method) %>% | 
| 68 | 7x | lapply(\(d) nonParametricTR(outcome = d$outcome, score = d$estimate)) %>% | 
| 69 | 7x | bind_rows(.id = "method") | 
| 70 |  | |
| 71 | 7x |   if (!plot.raw) { | 
| 72 | 5x | dat <- add0thPercTR(dat) | 
| 73 | } | |
| 74 | ||
| 75 | # Plot | |
| 76 | 7x | p <- ggplot(dat) + | 
| 77 | 7x | geom_step( | 
| 78 | 7x | aes(x = .data[[xvar]], y = .data$value, colour = .data$method, linetype = .data$pf), | 
| 79 | 7x | direction = "hv" | 
| 80 | ) | |
| 81 | ||
| 82 | # Add best possible sensitivity and specificity | |
| 83 | 7x |   if (show.best) { | 
| 84 | 7x | prev <- mean(outcome) | 
| 85 | 7x |     if (plot.raw) { | 
| 86 | 2x | best <- data.frame(percentile = ecdf(score)(score), score = score) | 
| 87 |     } else { | |
| 88 | 5x | best <- data.frame(percentile = c(0, ecdf(score)(score)), score = c(NA, score)) | 
| 89 | } | |
| 90 | 7x | best <- best %>% | 
| 91 | 7x | distinct() %>% | 
| 92 | 7x | mutate( | 
| 93 | 7x | Sensitivity = bestSens(perc = .data$percentile, prev = prev), | 
| 94 | 7x | Specificity = bestSpec(perc = .data$percentile, prev = prev) | 
| 95 | ) %>% | |
| 96 | 7x | tidyr::pivot_longer( | 
| 97 | 7x |         cols = c("Sensitivity", "Specificity"), names_to = "pf", values_to = "value" | 
| 98 | ) %>% | |
| 99 | 7x | mutate(method = "best") | 
| 100 | 7x | p <- p + | 
| 101 | 7x | geom_line( | 
| 102 | 7x | aes(x = .data[[xvar]], y = .data$value, linetype = .data$pf, linewidth = "Best Possible"), | 
| 103 | 7x | data = best, | 
| 104 | 7x | colour = "gray70" | 
| 105 | ) + | |
| 106 | 7x |       scale_linewidth_manual(values = c("Best Possible" = 0.5), name = NULL) | 
| 107 | } | |
| 108 | ||
| 109 | # Finalize plot | |
| 110 | 7x | p <- p + | 
| 111 | 7x | labs( | 
| 112 | 7x | title = "Sensitivity and Specificity Plot", | 
| 113 | 7x | x = ifelse(plot.raw, "Prediction Score", "Risk Percentile"), | 
| 114 | 7x | y = "True Positive / Negative Rate", | 
| 115 | 7x | colour = "Estimation Method", | 
| 116 | 7x | linetype = "Predictive Quantity" | 
| 117 | ) + | |
| 118 | 7x | scale_x_continuous(n.breaks = 6) + | 
| 119 | 7x | scale_y_continuous(n.breaks = 6) + | 
| 120 | 7x |     scale_linetype_manual(values = c("Sensitivity" = "solid", "Specificity" = "dashed")) + | 
| 121 | 7x | scale_colour_hue(l = 45) + | 
| 122 | 7x | theme_bw() + | 
| 123 | 7x | theme(legend.key.width = unit(2, "line")) + | 
| 124 | 7x | guides( | 
| 125 | 7x | linetype = guide_legend(order = 1, override.aes = list(colour = "black")), | 
| 126 | 7x | colour = guide_legend(order = 2), | 
| 127 | 7x | linewidth = guide_legend(order = 3) | 
| 128 | ) | |
| 129 | ||
| 130 | 7x |   if (show.best) { | 
| 131 | 7x | dat <- bind_rows(dat, best) | 
| 132 | } | |
| 133 | ||
| 134 | 7x | return(list(plot = p, data = dplyr::as_tibble(dat))) | 
| 135 | } | 
| 1 | ||
| 2 | # Calculate Predictive Value (PV) Estimates | |
| 3 | getPVdata <- function(methods, outcome, score, pc.ests = NULL) { | |
| 4 | ||
| 5 | # Calculate estimates if not provided | |
| 6 | 16x |   if (is.null(pc.ests)) { | 
| 7 | 5x | pc.ests <- getEsts(outcome = outcome, score = score, methods = methods) | 
| 8 | 5x | pc.data <- pc.ests$plotdata | 
| 9 |   } else { | |
| 10 | 11x | pc.data <- pc.ests$plotdata | 
| 11 | } | |
| 12 | ||
| 13 | # locf and zip for pava and/or binned data | |
| 14 | 16x |   if (any(pc.ests$idx.step)) { | 
| 15 | 4x | name.binned <- names(methods)[pc.ests$idx.binned] | 
| 16 | 4x | name.pava <- names(methods)[pc.ests$idx.pava] | 
| 17 | 4x | pc.data <- locf(pc.data, method.binned = name.binned, method.pava = name.pava) | 
| 18 | } | |
| 19 | ||
| 20 | # Calculate PV | |
| 21 | 16x | PV.data <- parametricPV(pc.data = pc.data) | 
| 22 | ||
| 23 | 16x | return(as.data.frame(PV.data)) | 
| 24 | } | |
| 25 | ||
| 26 | ||
| 27 | # Calculates PV values from cutoff thresholds | |
| 28 | nonParametricPV <- function(outcome, score) { | |
| 29 | ||
| 30 | 11x | prev <- mean(outcome, na.rm = TRUE) | 
| 31 | ||
| 32 | 11x | thresh.predictions <- lapply(score, function(x) as.numeric(score > x)) | 
| 33 | ||
| 34 | 11x | ppv <- vapply( | 
| 35 | 11x | thresh.predictions, | 
| 36 | 11x |     function(x) { | 
| 37 | 440x | tp <- sum(outcome == 1 & x == 1) | 
| 38 | 440x | fp <- sum(outcome == 0 & x == 1) | 
| 39 | 440x | tp / (tp + fp) | 
| 40 | }, | |
| 41 | 11x | numeric(1) | 
| 42 | ) | |
| 43 | ||
| 44 | 11x | npv <- vapply( | 
| 45 | 11x | thresh.predictions, | 
| 46 | 11x |     function(x) { | 
| 47 | 440x | tn <- sum(outcome == 0 & x == 0) | 
| 48 | 440x | fn <- sum(outcome == 1 & x == 0) | 
| 49 | 440x | tn / (tn + fn) | 
| 50 | }, | |
| 51 | 11x | numeric(1) | 
| 52 | ) | |
| 53 | ||
| 54 | 11x | threshold.data <- data.frame( | 
| 55 | 11x | score = score, | 
| 56 | 11x | percentile = ecdf(score)(score), | 
| 57 | 11x | PPV = ppv, | 
| 58 | 11x | NPV = npv | 
| 59 | ) %>% | |
| 60 | 11x | mutate(MNPV = 1 - .data$NPV) %>% | 
| 61 | 11x | tidyr::fill( | 
| 62 | 11x |       all_of(c("PPV", "NPV", "MNPV")), | 
| 63 | 11x | .direction = "downup" | 
| 64 | ) | |
| 65 | ||
| 66 | 11x | return(threshold.data) | 
| 67 | } | |
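||
| | # Quick illustrative check (hypothetical values): with outcome = c(0, 0, 1, 1) and | |
| | # score = c(.1, .2, .8, .9), the cutoff at .2 labels the two highest scores as | |
| | # positive, giving PPV = 1 and NPV = 1 at that threshold. | |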
| 68 | ||
| 69 | ||
| 70 | # Calculates PV values based on pracma::cumtrapz | |
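| | # For a risk curve r(p) over score percentiles p, the code below uses the identities | |
| | #   MNPV(p) = (1/p) * integral_0^p r(u) du         (mean risk at or below the cutoff) | |
| | #   NPV(p)  = 1 - MNPV(p) | |
| | #   PPV(p)  = (integral_p^1 r(u) du) / (1 - p)     (mean risk above the cutoff) | |
| | # with the integrals approximated on the observed grid via pracma::cumtrapz. | |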
| 71 | parametricPV <- function(pc.data) { | |
| 72 | ||
| 73 | # Calculate PVs | |
| 74 | 16x | PV.data <- pc.data %>% | 
| 75 | 16x | group_by(.data$method) %>% | 
| 76 | 16x | mutate( | 
| 77 | 16x | MNPV = pracma::cumtrapz(.data$percentile, .data$estimate)[, 1] / .data$percentile, | 
| 78 | 16x | NPV = 1 - .data$MNPV, | 
| 79 | 16x | PPV = | 
| 80 | ( | |
| 81 | 16x | max(pracma::cumtrapz(.data$percentile, .data$estimate), na.rm = TRUE) - | 
| 82 | 16x | pracma::cumtrapz(.data$percentile, .data$estimate)[, 1] | 
| 83 | 16x | ) / (max(.data$percentile) - .data$percentile) | 
| 84 | ) %>% | |
| 85 | 16x | ungroup() %>% | 
| 86 | 16x | arrange(.data$method, .data$percentile) | 
| 87 | ||
| 88 | # Fill NAs (starting points) | |
| 89 | 16x | PV.data <- PV.data %>% | 
| 90 | 16x | group_by(.data$method) %>% | 
| 91 | 16x | mutate( | 
| 92 | 16x | MNPV = ifelse(.data$percentile == min(.data$percentile) & is.na(.data$MNPV), 0, .data$MNPV), | 
| 93 | 16x | NPV = ifelse(.data$percentile == min(.data$percentile) & is.na(.data$NPV), 1, .data$NPV) | 
| 94 | ) %>% | |
| 95 | 16x | as.data.frame() | 
| 96 | ||
| 97 | # Fill NAs (ending points) | |
| 98 | 16x | PV.data <- tidyr::fill( | 
| 99 | 16x | PV.data, | 
| 100 | 16x |     all_of(c("estimate", "MNPV", "NPV", "PPV")), | 
| 101 | 16x | .direction = "down" | 
| 102 | ) | |
| 103 | ||
| 104 | 16x | return(as.data.frame(PV.data)) | 
| 105 | } | |
| 106 | ||
| 107 | ||
| 108 | # consistent colors for PVs | |
| 109 | predictionColours <- function(x, show.best) { | |
| 110 | 13x | clrs <- c( | 
| 111 | 13x | "Best NPV" = "gray80", | 
| 112 | 13x | "NPV" = "#53B400", | 
| 113 | 13x | "Best PPV" = "gray65", | 
| 114 | 13x | "PPV" = "red", | 
| 115 | 13x | "Best PC" = "black", | 
| 116 | 13x | "PC" = "plum", | 
| 117 | 13x | "1-NPV" = "royalblue2", | 
| 118 | 13x | "Best 1-NPV" = "gray50" | 
| 119 | ) | |
| 120 | 13x |   if (show.best) { | 
| 121 | 10x |     x <- c(x, paste("Best", x)) | 
| 122 | } | |
| 123 | 13x | return(clrs[names(clrs) %in% x]) # keep the ordering above | 
| 124 | } | |
| 125 | ||
| 126 | ||
| 127 | # Calculates sensitivity and specificity (true positive / negative rate) | |
| 128 | nonParametricTR <- function(outcome, score) { | |
| 129 | ||
| 130 | # thresh.predictions is a list of 0/1 vectors, one per score value used as a cutpoint | |
| 131 | 9x | thresh.predictions <- lapply(score, function(x) as.numeric(score > x)) | 
| 132 | ||
| 133 | # Calc sensitivities and specificities at each risk percentile threshold | |
| 134 | 9x | senses <- vapply( | 
| 135 | 9x | thresh.predictions, | 
| 136 | 9x |     function(x) { | 
| 137 | 360x | sum(outcome == 1 & x == 1) / sum(outcome == 1) | 
| 138 | }, | |
| 139 | 9x | numeric(1) | 
| 140 | ) | |
| 141 | ||
| 142 | 9x | specs <- vapply( | 
| 143 | 9x | thresh.predictions, | 
| 144 | 9x |     function(x) { | 
| 145 | 360x | sum(outcome == 0 & x == 0) / sum(outcome == 0) | 
| 146 | }, | |
| 147 | 9x | numeric(1) | 
| 148 | ) | |
| 149 | ||
| 150 | # Create a data.frame | |
| 151 | 9x | dat <- data.frame( | 
| 152 | 9x | score = score, | 
| 153 | 9x | percentile = ecdf(score)(score), | 
| 154 | 9x | Sensitivity = senses, | 
| 155 | 9x | Specificity = specs | 
| 156 | ) %>% | |
| 157 | 9x | tidyr::pivot_longer( | 
| 158 | 9x |       cols = c("Sensitivity", "Specificity"), | 
| 159 | 9x | names_to = "pf", | 
| 160 | 9x | values_to = "value" | 
| 161 | ) | |
| 162 | ||
| 163 | 9x | return(as.data.frame(dat)) | 
| 164 | } | |
| 165 | ||
| 166 | ||
| 167 | ||
| 168 | adjPrevPerc <- function(perc, prev.new, cdf.case, cdf.control) { | |
| 169 | 4x | prev.new * cdf.case(perc) + (1 - prev.new) * cdf.control(perc) | 
| 170 | } | |
| 171 | ||
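| | # adjPrevEst rescales a risk estimate from the observed prevalence `prev` to a | |
| | # target prevalence `prev.new` via Bayes' rule on the odds scale: | |
| | #   odds.new = odds(x) * (prev.new / (1 - prev.new)) / (prev / (1 - prev)) | |
| | # and then maps odds.new back to a probability, which is what the expression below does. | |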
| 172 | adjPrevEst <- function(x, prev, prev.new) { | |
| 173 | 5x | f2 <- prev / (1 - prev) | 
| 174 | 5x | f3 <- (1 - prev.new) / prev.new | 
| 175 | 5x | 1 / (((1 / x) - 1) * f2 * f3 + 1) | 
| 176 | } | |
| 177 | ||
| 178 | adjPrevPC <- function(dat, prev, prev.new, cdf.case, cdf.control) { | |
| 179 | 1x | mutate( | 
| 180 | 1x | dat, | 
| 181 | 1x | percentile = adjPrevPerc( | 
| 182 | 1x | perc = .data$score, prev.new = prev.new, | 
| 183 | 1x | cdf.case = cdf.case, cdf.control = cdf.control | 
| 184 | ), | |
| 185 | 1x | estimate = adjPrevEst(x = .data$estimate, prev = prev, prev.new = prev.new) | 
| 186 | ) | |
| 187 | } | |
| 188 | ||
| 189 | adjPrevPV <- function(dat, prev, prev.new, cdf.case, cdf.control) { | |
| 190 | 1x | mutate( | 
| 191 | 1x | dat, | 
| 192 | 1x | percentile = adjPrevPerc( | 
| 193 | 1x | perc = .data$score, prev.new = prev.new, | 
| 194 | 1x | cdf.case = cdf.case, cdf.control = cdf.control | 
| 195 | ), | |
| 196 | 1x | MNPV = adjPrevEst(x = .data$MNPV, prev = prev, prev.new = prev.new), | 
| 197 | 1x | NPV = 1 - .data$MNPV, | 
| 198 | 1x | PPV = adjPrevEst(x = .data$PPV, prev = prev, prev.new = prev.new) | 
| 199 | ) | |
| 200 | } | |
| 201 | ||
| 202 | ||
| 203 | ||
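| | # The best-possible values below assume a perfect risk score: all controls occupy the | |
| | # lowest 1 - prev percentiles and all cases the highest prev percentiles, so the | |
| | # ideal PPV, 1-NPV, sensitivity and specificity at a percentile cutoff `perc` | |
| | # follow directly from that ordering. | |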
| 204 | bestPPV <- function(perc, prev) { | |
| 205 | 12x | ifelse(perc > 1 - prev, 1, prev / (1 - perc)) | 
| 206 | } | |
| 207 | ||
| 208 | bestMNPV <- function(perc, prev) { | |
| 209 | 12x | ifelse(perc <= 1 - prev, 0, 1 - ((1 - prev) / perc)) | 
| 210 | } | |
| 211 | ||
| 212 | bestSens <- function(perc, prev) { | |
| 213 | 8x | ifelse(perc <= 1 - prev, 1, (1 - perc) / prev) | 
| 214 | } | |
| 215 | ||
| 216 | bestSpec <- function(perc, prev) { | |
| 217 | 8x | ifelse(perc > 1 - prev, 1, perc / (1 - prev)) | 
| 218 | } | 
| 1 | ||
| 2 | #' zipFastener for two dataframes of equal size | |
| 3 | #' | |
| 4 | #' This function acts like a “zip fastener” for combining two dataframes. | |
| 5 | #' It takes the first row of the first data frame and places it above | |
| 6 | #' the first row of the second data frame, and so on. | |
| 7 | #' | |
| 8 | #' @param df1 dataframe 1 | |
| 9 | #' @param df2 dataframe 2 | |
| 10 | #' | |
| 11 | #' @return Zipped data.frame | |
| 12 | #' | |
| 13 | #' @keywords internal | |
| 14 | #' @noRd | |
| 15 | #' | |
| 16 | #' @examples | |
| 17 | #' df1 <- data.frame(a = 1:3, b = 1:3, c = 1:3) | |
| 18 | #' df2 <- data.frame(a = letters[1:3], b = letters[1:3], c = letters[1:3]) | |
| 19 | #' | |
| 20 | #' zipFastener(df1, df2) | |
| 21 | #' | |
| 22 | zipFastener <- function(df1, df2) { | |
| 23 | ||
| 24 | 8x |   if (ncol(df1) != ncol(df2)) { | 
| 25 | 1x |     stop("the no. of columns has to be equal to merge them by zip feeding") | 
| 26 | } | |
| 27 | ||
| 28 | 7x | d1 <- nrow(df1) | 
| 29 | 7x | d2 <- nrow(df2) | 
| 30 | ||
| 31 | 7x |   if (d1 != d2) { | 
| 32 | 1x |     stop("the no. of rows has to be equal to merge them by zip feeding") | 
| 33 | } | |
| 34 | ||
| 35 | # zip fastener preparations | |
| 36 | 6x | i1 <- 1:d1 # index vector 1 | 
| 37 | 6x | i2 <- 1:d1 + d1 # index vector 2 | 
| 38 | ||
| 39 | # zip fastener operations | |
| 40 | 6x | index <- as.vector(matrix(c(i1, i2), ncol = d1, byrow = TRUE)) | 
| 41 | 6x | index <- index[!is.na(index)] # remove NAs | 
| 42 | ||
| 43 | 6x | colnames(df2) <- colnames(df1) # keep 1st colnames | 
| 44 | 6x | res <- rbind(df1, df2)[index, ] # reorder data frame | 
| 45 | ||
| 46 | 6x | return(res) | 
| 47 | } | |
| 48 | ||
| 49 | ||
| 50 | #' last observation carried forward: for binned | |
| 51 | #' | |
| 52 | #' Function assumes columns in data frame are: percentile, estimate, method | |
| 53 | #' | |
| 54 | #' @param df dataframe containing predcurve estimates | |
| 55 | #' | |
| 56 | #' @return dataframe with adjusted points | |
| 57 | #' | |
| 58 | #' @keywords internal | |
| 59 | #' @noRd | |
| 60 | #' | |
| 61 | locf.binned <- function(df) { | |
| 62 | ||
| 63 | 4x | df.copy <- df | 
| 64 | # keep same x, slide y back one spot | |
| 65 | 4x | df.copy$estimate <- df.copy$estimate[c(2:length(df.copy$estimate), NA)] | 
| 66 | 4x | combined.df <- zipFastener(df, df.copy) | 
| 67 | 4x | combined.df <- combined.df[-nrow(combined.df), ] # removes a redundant point | 
| 68 | ||
| 69 | 4x | return(combined.df) | 
| 70 | } | |
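||
| | # Illustrative effect (hypothetical values): percentiles c(.2, .5, 1) with | |
| | # estimates c(.1, .3, .6) become the step coordinates | |
| | # (.2, .1), (.2, .3), (.5, .3), (.5, .6), (1, .6). | |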
| 71 | ||
| 72 | #' last observation carried forward: for pava | |
| 73 | #' | |
| 74 | #' Function assumes columns in data frame are: percentile, estimate, method | |
| 75 | #' | |
| 76 | #' @param df dataframe containing predcurve estimates | |
| 77 | #' | |
| 78 | #' @return dataframe with adjusted points | |
| 79 | #' | |
| 80 | #' @keywords internal | |
| 81 | #' @noRd | |
| 82 | #' | |
| 83 | locf.pava <- function(df) { | |
| 84 | ||
| 85 | # save first and last points | |
| 86 | 2x | first <- df[1, ] | 
| 87 | 2x | last <- df[nrow(df) - 1, ] | 
| 88 | ||
| 89 | # adjust data to remove extra points | |
| 90 | 2x | diff_y <- diff(c(0, df$estimate)) | 
| 91 | 2x | pava.int <- df[diff_y != 0, ] | 
| 92 | ||
| 93 | # re-add first and last points | |
| 94 | 2x | df <- rbind(first, pava.int, last) | 
| 95 | 2x | df <- df[order(df$percentile), ] # order on percentile | 
| 96 | ||
| 97 | 2x | df.copy <- df | 
| 98 | # keep same x, slide y back one spot | |
| 99 | 2x | df.copy$percentile <- df.copy$percentile[ | 
| 100 | 2x | c(2:(length(df.copy$percentile)), NA) | 
| 101 | ] | |
| 102 | 2x | combined.df <- zipFastener(df, df.copy) | 
| 103 | ||
| 104 | # Drop rows with NA introduced by the shift | |
| 105 | 2x | combined.df <- na.omit(combined.df) | 
| 106 | 2x | combined.df <- combined.df[-nrow(combined.df), ] # removes a redundant point | 
| 107 | ||
| 108 | 2x | return(combined.df) | 
| 109 | } | |
| 110 | ||
| 111 | #' Last observation carried forward for specific estimation methods | |
| 112 | #' | |
| 113 | #' Iterates locf for multiple methods: calls locf.binned and locf.pava | |
| 114 | #' | |
| 115 | #' @param df dataframe containing predcurve estimates | |
| 116 | #' @param method.binned vector of "binned" methods | |
| 117 | #' @param method.pava vector of "pava" methods | |
| 118 | #' | |
| 119 | #' @return dataframe with adjusted points | |
| 120 | #' | |
| 121 | #' @keywords internal | |
| 122 | #' @noRd | |
| 123 | #' | |
| 124 | locf <- function(df, method.binned, method.pava) { | |
| 125 | ||
| 126 | 4x | complete.data <- df[!df$method %in% c(method.binned, method.pava), ] | 
| 127 | 4x | binned.data <- df[df$method %in% method.binned, ] | 
| 128 | 4x | pava.data <- df[df$method %in% method.pava, ] | 
| 129 | ||
| 130 | # pava locf | |
| 131 | 4x |   if (nrow(pava.data) >= 1) { | 
| 132 | 2x | pava.plotdata <- locf.pava(pava.data) | 
| 133 | 2x | complete.data <- bind_rows(complete.data, pava.plotdata) | 
| 134 | } | |
| 135 | ||
| 136 | # binned locf | |
| 137 | 4x |   if (nrow(binned.data) >= 1) { | 
| 138 | 4x | binned.plotdata <- locf.binned(binned.data) | 
| 139 | 4x | complete.data <- bind_rows(complete.data, binned.plotdata) | 
| 140 | } | |
| 141 | 4x | rownames(complete.data) <- NULL | 
| 142 | ||
| 143 | 4x | return(complete.data) | 
| 144 | } |