#' Calibration plot
#'
#' Calibration curve risk estimates
#'
#' @inheritParams riskProfile
#' @param methods Character vector of method names (case-insensitive) for plotting curves or
#' a named list whose elements specify an estimation method and its arguments.
#' Default is set to `list(gam = list(method = "gam", fitonPerc = FALSE))`.
#'
#' Full options are: `c("binned", "pava", "mspline", "gam", "cgam")`.
#'
#' To specify arguments per method, use lists. For example:
#' ```
#' list(
#'   pava = list(method = "pava", ties = "primary"),
#'   mspline = list(method = "mspline", fitonPerc = TRUE),
#'   gam = list(method = "gam", bs = "tp", logscores = FALSE),
#'   bin = list(method = "binned", bins = 10)
#' )
#' ```
#' See section "Estimation" for more details.
#' @param include Character vector (case-insensitive, partial matching) or `NULL` specifying
#' what quantities to include in the plot.
#'
#' Default is: `c("loess", "citl")`.
#'
#' Full options are: `c("loess", "citl", "rug", "datapoints")` or `NULL`.
#' "loess" adds a loess fit, "citl" stands for "Calibration in the large",
#' "rug" adds rug ticks of `score` by `outcome` (top x-axis: `score` for `outcome == 1`,
#' bottom x-axis: `score` for `outcome == 0`),
#' "datapoints" adds jittered `score` by `outcome` (slightly shifted away from 0 / 1 y-values),
#' `NULL` stands for no extra information.
#' @param plot.raw Logical; if `TRUE` (default), plot raw `score` values on the x-axis,
#' otherwise plot score percentiles.
#' @param rev.order Logical; reverse the ordering of scores. Defaults to `FALSE`.
#' @param margin.type Type of additional margin plot, can be one of
#' `c("density", "histogram", "boxplot", "violin", "densigram")`.
#' See [ggExtra::ggMarginal()] for more details.
#' @param ... Additional arguments passed to [ggExtra::ggMarginal()].
#'
#' @inheritSection riskProfile Estimation
#'
#' @return A list containing the plot, the underlying estimation data, and the `citl`
#' (calibration-in-the-large) data.
#'
#' @export
#'
#' @seealso [riskProfile()] [sensSpec()]
#'
#' [getPAVAest()] [getBINNEDest()] [getGAMest()] [getCGAMest()] [getMSPLINEest()]
#' [getASISest()]
#'
#' @examples
#' # Read in example data
#' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc"))
#' rscore <- auroc$predicted_calibrated
#' truth <- as.numeric(auroc$actual)
#'
#' # Default calibration plot
#' p1 <- calibrationProfile(outcome = truth, score = rscore)
#' p1$plot
#'
#' # Specifying multiple estimation methods
#' # By default, all the methods fit on percentiles
#' calibrationProfile(
#'   outcome = truth,
#'   score = rscore,
#'   methods = c("gam", "mspline", "binned")
#' )$plot
#'
#' # Specifying multiple estimation methods with parameters
#' calibrationProfile(
#'   outcome = truth,
#'   score = rscore,
#'   methods = list(
#'     gam = list(method = "gam", fitonPerc = FALSE, k = 3),
#'     mspline = list(method = "mspline"),
#'     bin = list(method = "binned", quantiles = 5)
#'   )
#' )$plot
#'
#' # Additional quantities and marginal histogram with specified number of bins
#' calibrationProfile(
#'   outcome = truth,
#'   score = rscore,
#'   include = c("rug", "datapoints", "citl"),
#'   # or use partial matching: include = c("r", "d", "c"),
#'   margin.type = "histogram",
#'   bins = 100 # passed to ggExtra::ggMarginal
#' )$plot
#'
calibrationProfile <- function(outcome,
                               score,
                               methods = list(gam = list(method = "gam", fitonPerc = FALSE)),
                               include = c("loess", "citl"),
                               plot.raw = TRUE,
                               rev.order = FALSE,
                               margin.type = NULL,
                               ...) {

  # Argument checks (except outcome, score, methods - below)
  checkmate::assert(
    checkmate::check_character(include),
    checkmate::check_null(include)
  )
  if (is.character(include)) {
    include <- matchArgSubset(tolower(include), choices = c("loess", "citl", "rug", "datapoints"))
  }
  checkmate::assert_flag(plot.raw)
  checkmate::assert_flag(rev.order)
  checkmate::assert(
    checkmate::check_character(margin.type, len = 1, any.missing = FALSE),
    checkmate::check_null(margin.type)
  )

  if (plot.raw) {
    xvar <- "score"
  } else {
    xvar <- "percentile"
  }

  # Standardize/Check outcome, scores
  op <- inputCheck(outcome = outcome, score = score)

  # Order data by scores
  tempdf <- orderInputs(outcome = op$outcome, score = op$score, rev.order = rev.order)
  score <- tempdf$score
  outcome <- tempdf$outcome

  # Check methods
  methods <- methodCheck(methods = methods)
  method.names <- names(methods)
  if ("asis" %in% getEstMethods(methods, with.names = FALSE)) {
    stop('"asis" method is not suitable for this plot. Please remove it.')
  }

  # Get estimates
  pc.ests <- getEsts(methods = methods, outcome = outcome, score = score)
  PC.data <- pc.ests$plotdata
  step.methods <- method.names[pc.ests$idx.step]

  # Calculate percentiles
  ecdf.score <- ecdf(score)
  percentile <- ecdf.score(score)

  # Calculate calibration in the large
  citl.data <- data.frame(
    outcome = mean(outcome),
    score = mean(score),
    percentile = ecdf.score(mean(score)),
    method = "Calibration In The Large"
  )

  # Subset PC data for plotting
  smoothPC <- PC.data[!PC.data$method %in% step.methods, , drop = FALSE]
  stepPC <- PC.data[PC.data$method %in% step.methods, , drop = FALSE]

  # data.frame with user inputs
  ddf <- data.frame(score, percentile, outcome)

  # Shape type storage
  shapes <- c()

  # Add empty scatterplot layer for ggMarginal
  if (!is.null(margin.type)) {
    p <- ggplot() +
      geom_point(
        data = ddf,
        aes(x = .data[[xvar]], y = .data$outcome), shape = NA, na.rm = TRUE
      )
  } else {
    p <- ggplot()
  }

  # Build plot
  p <- p +
    geom_line(
      data = smoothPC,
      aes(x = .data[[xvar]], y = .data$estimate, linetype = .data$method, colour = .data$method),
      alpha = 0.8,
      linewidth = 0.5
    ) +
    geom_step(
      data = stepPC,
      aes(x = .data[[xvar]], y = .data$estimate, linetype = .data$method, colour = .data$method),
      direction = "vh",
      alpha = 0.8,
      linewidth = 0.5
    ) +
    geom_abline(
      aes(slope = 1, intercept = 0, linewidth = "Identity line"),
      colour = "gray50",
      linetype = "solid"
    )

  # Add loess if requested
  if ("loess" %in% include) {
    p <- p + geom_smooth(
      data = ddf,
      aes(x = .data[[xvar]], y = .data$outcome, linetype = "loess", colour = "loess"),
      method = "loess", formula = y ~ x, se = FALSE,
      linewidth = 0.5
    )
  }

  # Add calibration in the large if requested
  if ("citl" %in% include) {
    p <- p + geom_point(
      data = citl.data,
      aes(x = .data[[xvar]], y = .data$outcome, shape = .data$method),
      colour = "red",
      size = 3,
      stroke = 1
    )
    shapes <- c(shapes, c("Calibration In The Large" = 4))
  }

  # Add datapoints if requested
  if ("datapoints" %in% include) {
    p <- p + geom_jitter(
      data = data.frame(
        score,
        percentile,
        outcome = ifelse(outcome == 0, -0.1, 1.1),
        method = "Data points"
      ),
      aes(x = .data[[xvar]], y = .data$outcome, shape = .data$method),
      colour = "black",
      size = 1.5,
      alpha = 0.4,
      position = position_jitter(seed = 5, height = 0.03)
    )
    shapes <- c(shapes, c("Data points" = 16))
  }

  # Add rug if requested
  if ("rug" %in% include) {
    p <- p + geom_rug(
      data = ddf[ddf$outcome == 0, ],
      aes(x = .data[[xvar]]),
      sides = "b",
      show.legend = FALSE
    ) + geom_rug(
      data = ddf[ddf$outcome == 1, ],
      aes(x = .data[[xvar]]),
      sides = "t",
      show.legend = FALSE
    )
  }

  # Fix shape legend
  if (all(c("citl", "datapoints") %in% include)) {
    shape_guide <- guide_legend(
      override.aes = list(
        colour = c("Calibration In The Large" = "red", "Data points" = "black"),
        alpha = 1, size = 2.5
      ),
      order = 3
    )
  } else if (any(c("citl", "datapoints") %in% include)) {
    shape_guide <- guide_legend(
      override.aes = list(alpha = 1, size = 2.5),
      order = 3
    )
  } else {
    shape_guide <- NULL
  }

  # Finalize graph
  p <- p +
    scale_linewidth_manual(values = c("Identity line" = 0.5)) +
    scale_shape_manual(values = shapes) +
    scale_colour_hue(l = 45) +
    scale_x_continuous(n.breaks = 6) +
    scale_y_continuous(n.breaks = 6) +
    labs(
      title = "Calibration Plot",
      x = "Predicted Probability",
      y = "Observed",
      linetype = "Estimation Method",
      linewidth = NULL,
      colour = "Estimation Method",
      shape = `if`(all(c("citl", "datapoints") %in% include), "Points", NULL)
    ) +
    theme_bw() +
    theme(legend.key.width = unit(2, "line")) +
    guides(
      linetype = guide_legend(order = 1),
      colour = guide_legend(order = 1),
      linewidth = guide_legend(order = 2),
      shape = shape_guide
    )

  # Add margin plot
  if (!is.null(margin.type)) {
    p <- ggExtra::ggMarginal(p, type = margin.type, margins = "x", ...)
  }

  return(list(plot = p, data = dplyr::as_tibble(PC.data), citl = citl.data))
}
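
# Illustrative usage sketch for the function above; kept inert in an `if (FALSE)` block
# so nothing runs at package load. It assumes the packaged example data and the default
# `margin.type = NULL`, in which case `$plot` is a plain ggplot object that can be
# extended with further ggplot2 layers.
if (FALSE) {
  auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc"))
  cp <- calibrationProfile(
    outcome = as.numeric(auroc$actual),
    score = auroc$predicted_calibrated
  )
  cp$plot + ggplot2::coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) # tweak the plot
  head(cp$data) # estimation data behind the curves
  cp$citl       # calibration-in-the-large point
}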

# Helper function to check the user input - bins and quantiles arguments
checkBinsQuantiles <- function(bins, quantiles, score) {

  checkmate::assert(
    checkmate::check_numeric(bins, any.missing = FALSE, sorted = TRUE),
    checkmate::check_null(bins)
  )
  checkmate::assert(
    checkmate::check_integerish(quantiles, lower = 1),
    checkmate::check_null(quantiles)
  )

  # Check combination of quantiles and bins
  if (!is.null(quantiles) && !is.null(bins)) {
    stop("bins and quantiles cannot be specified together, choose one and set the other to NULL")
  }

  if (is.null(quantiles) && is.null(bins)) {
    quantiles <- 10
  }

  # Further check for bins
  if (length(bins) > 1) {
    if (bins[1] > min(score)) {
      stop(paste("The first element of bins must be <= min(score), not", bins[1]))
    }
    if (bins[length(bins)] < max(score)) {
      stop(paste("The last element of bins must be >= max(score), not", bins[length(bins)]))
    }
  }

  # Scalar bins -> number of intervals
  if (length(bins) == 1 && bins <= 1) {
    stop("bins must be > 1 when provided as a scalar (i.e. number of bins)")
  }

  # Check discrete score
  lu <- length(unique(score))
  if (length(bins) == 1 && bins > lu) {
    warning(
      paste0(
        "The number of `bins` (", bins, ") ",
        "is > the number of unique score values (", lu, "). ",
        "The results may be unreliable."
      )
    )
  }
  if (length(bins) > 1 && length(bins) - 1 > lu) {
    warning(
      paste0(
        "The number of `bins` (", length(bins) - 1, ") ",
        "is > the number of unique score values (", lu, "). ",
        "The results may be unreliable."
      )
    )
  }
  if (!is.null(quantiles) && lu <= 10) {
    warning(
      "Using the quantile method for non-continuous score. ",
      "The results may be unreliable."
    )
  }

  return(list(bins = bins, quantiles = quantiles))
}
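
# Minimal sketch of how the helper above resolves `bins` vs `quantiles`
# (illustrative only, inert; `sc` is a hypothetical continuous score vector).
if (FALSE) {
  sc <- runif(100)
  checkBinsQuantiles(bins = NULL, quantiles = NULL, score = sc)         # quantiles falls back to 10
  checkBinsQuantiles(bins = 5, quantiles = NULL, score = sc)            # 5 evenly spaced bins
  checkBinsQuantiles(bins = c(0, 0.5, 1), quantiles = NULL, score = sc) # explicit bin borders
  # checkBinsQuantiles(bins = 5, quantiles = 10, score = sc)            # error: specify only one
}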

# Helper function to check the user input - errorbar.sem argument
checkErrorbarSem <- function(errorbar.sem) {
  checkmate::assert(
    checkmate::check_number(errorbar.sem, lower = 0), # this checks >= 0
    checkmate::check_null(errorbar.sem)
  )
  if (!is.null(errorbar.sem)) {
    stopifnot("`errorbar.sem` must be > 0" = errorbar.sem > 0)
  }
  return(errorbar.sem)
}


#' Binned Risk Estimates
#'
#' Calculates bins based on a number of evenly spaced bins or n-tiles and
#' determines the average risk within each bin, which is used as the risk estimate.
#'
#' @inheritParams riskProfile
#' @param quantiles Numeric; quantiles to split bins.
#' @param bins Numeric; number of evenly spaced bins or bin locations.
#' @param right Logical indicating right closed interval. Defaults to `TRUE`.
#' @param errorbar.sem Scalar numeric giving the number of standard errors of the mean
#' (SEM) used to calculate the risk error bars.
#'
#' @return A data frame with 4 columns
#' (score, score percentile, outcome, estimate).
#' Additionally, there is an attribute "errorbar" holding the error-bar data if
#' `errorbar.sem` was specified.
#'
#' @seealso [getASISest()] [getCGAMest()] [getGAMest()] [getMSPLINEest()] [getPAVAest()]
#'
#' @export
#'
#' @examples
#' # Read in example data
#' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc"))
#' rscore <- auroc$predicted
#' truth <- as.numeric(auroc$actual)
#'
#' getBINNEDest(outcome = truth, score = rscore)
#'
getBINNEDest <- function(outcome,
                         score,
                         quantiles = NULL,
                         bins = NULL,
                         right = TRUE,
                         errorbar.sem = NULL) {

  # Argument checks
  checkmate::assert_numeric(outcome)
  checkmate::assert_numeric(score, len = length(outcome))
  checkmate::assert_flag(right)
  errorbar.sem <- checkErrorbarSem(errorbar.sem)

  # Check bins and quantiles
  bqp <- checkBinsQuantiles(bins = bins, quantiles = quantiles, score = score)

  # Retrieve data summaries
  df <- getSummaries(
    outcome = outcome, score = score,
    quantiles = bqp$quantiles, bins = bqp$bins,
    right = right
  )

  if (!is.null(errorbar.sem)) {
    errorbar <- getERRORest(binlvl = df[["binlvl"]], z = errorbar.sem) %>%
      mutate(
        percentile = df[["binlvl"]][["riskpercentile"]],
        bin.mid = df[["binlvl"]][["avg.outcome"]]
      )
  } else {
    errorbar <- NULL
  }

  # Create binned.data
  binned.data <- data.frame(
    score = df[["binlvl"]][["avg.risk"]],
    percentile = df[["binlvl"]][["riskpercentile"]],
    outcome = NA,
    estimate = df[["binlvl"]][["avg.outcome"]]
  )
  attr(binned.data, "errorbar") <- errorbar

  return(binned.data)
}
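
# Sketch showing how the optional error-bar data travels with the result as an
# attribute (illustrative, inert; assumes the packaged example data).
if (FALSE) {
  auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc"))
  be <- getBINNEDest(
    outcome = as.numeric(auroc$actual),
    score = auroc$predicted,
    quantiles = 10,
    errorbar.sem = 1.5
  )
  head(be)             # binned risk estimates
  attr(be, "errorbar") # midquantile / bin.high / bin.low (+ percentile, bin.mid)
}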


#' Get Summaries: observation level & bin level dataframes
#'
#' Given vectors of outcomes and risk scores, the function returns a list of
#' observation level and bin level summary dataframes for the data.
#'
#' The observation level dataframe has columns for outcome, risk score, risk percentile,
#' bin number, and the corresponding minimum and maximum score for that bin.
#'
#' The bin level dataframe has columns indicating the bin number and the observation count,
#' number of events, average outcome, average risk, and standard deviation of risk
#' within each of the bins. Risk percentile and bin intervals are also provided.
#'
#' @return List of observation level and bin level dataframes.
#'
#' @keywords internal
#' @noRd
#'
#' @examples
#' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc"))
#' truth <- as.numeric(auroc$actual)
#' rscore <- auroc$predicted
#'
#' # Bin by quantiles
#' getSummaries(
#'   outcome = truth,
#'   score = rscore,
#'   quantiles = 10
#' )
#'
#' # Bin by specific percentiles
#' getSummaries(
#'   outcome = truth,
#'   score = rscore,
#'   quantiles = NULL,
#'   bins = c(0, 0.25, 0.5, 0.8, 1)
#' )
#'
getSummaries <- function(outcome,
                         score,
                         quantiles = NULL,
                         bins = NULL,
                         right = TRUE) {

  stopifnot(
    !is.null(quantiles) | !is.null(bins),
    quantiles != 0 | bins != 0,
    is.logical(right)
  )

  cdf.fit <- ecdf(score)

  if (!is.null(quantiles)) {
    bin.int <- Hmisc::cut2(score, g = quantiles)

  } else if (length(bins) > 1) {
    bin.int <- cut(cdf.fit(score), breaks = bins, include.lowest = TRUE, right = right)

  } else if (length(bins) == 1) {
    bin.int <- cut(score, breaks = bins, include.lowest = TRUE, right = right)

  } else {
    stop("Unrecognized option")
  }

  # get numeric label of interval
  bin.num <- as.numeric(bin.int)

  # get interval borders (min, max)
  min_max <- strsplit(
    gsub("(?![-,.])[[:punct:]]", "", trimws(as.character(bin.int)), perl = TRUE),
    ","
  )
  min_max <- lapply(
    min_max, \(x) `if`(length(x) < 2, rep(x, 2), x)
  )

  # Observation level data: outcome, rscore, observation percentile, and interval
  obslvl <- data.frame(
    outcome = outcome,
    score = score,
    riskpercentile = cdf.fit(score),
    bin = bin.num,
    interval = as.character(bin.int),
    min = vapply(min_max, "[[", character(1), 1),
    max = vapply(min_max, "[[", character(1), 2)
  ) %>%
    arrange(.data$score)

  # Bin level data: within each bin: n, avg risk, sd risk, quantile, error bar
  binlvl <- obslvl %>%
    group_by(.data$bin, .data$interval) %>%
    summarise(
      n = dplyr::n(),
      events = sum(.data$outcome),
      avg.outcome = mean(.data$outcome),
      sd.outcome = sd(.data$outcome, na.rm = TRUE),
      avg.risk = mean(.data$score, na.rm = TRUE),
      sd.risk = sd(.data$score),
      riskpercentile = max(.data$riskpercentile),
      .groups = "drop"
    )

  # Replace NAs with 0
  binlvl[is.na(binlvl)] <- 0

  return(list(obslvl = obslvl, binlvl = binlvl))
}


# ERROR BAR Estimates
getERRORest <- function(binlvl, z) {

  stopifnot(
    is.data.frame(binlvl),
    is.numeric(z)
  )

  binlvl %>%
    mutate(
      bin.low = .data$avg.outcome - (z * (.data$sd.outcome / sqrt(.data$n))),
      bin.low = ifelse(.data$bin.low < 0, 0, .data$bin.low),
      bin.high = .data$avg.outcome + (z * (.data$sd.outcome / sqrt(.data$n))),
      midquantile = .data$riskpercentile - (diff(c(0, .data$riskpercentile)) / 2)
    ) %>%
    select(all_of(c("midquantile", "bin.high", "bin.low"))) %>%
    tidyr::replace_na(list(bin.high = 0, bin.low = 0))
}
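
# Minimal numeric sketch of the error-bar calculation above: the half-width is
# z * SEM = z * sd.outcome / sqrt(n), with the lower limit clipped at 0
# (illustrative only, inert; `toy` is a hand-made bin-level data frame).
if (FALSE) {
  toy <- data.frame(
    bin = 1:2, interval = c("a", "b"),
    n = c(50, 50), events = c(5, 20),
    avg.outcome = c(0.10, 0.40), sd.outcome = c(0.30, 0.49),
    avg.risk = c(0.12, 0.35), sd.risk = c(0.05, 0.08),
    riskpercentile = c(0.5, 1)
  )
  getERRORest(binlvl = toy, z = 1.96)
  # e.g. first bin: 0.10 +/- 1.96 * 0.30 / sqrt(50) ~= 0.10 +/- 0.083
}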


#' PAVA Risk Estimates
#'
#' Determines isotonic regression estimates via PAVA, given a vector of binary outcomes
#' and a vector of scores.
#'
#' @inheritParams riskProfile
#' @param weights Vector of numerics to specify PAVA observation weighting.
#' @param ties String to specify how ties should be handled for PAVA.
#' @param low_events Numeric, specifying the number of events in the lowest bin.
#' @param low_nonevents Numeric, specifying the number of nonevents in the lowest bin.
#' @param high_events Numeric, specifying the number of events in the highest bin.
#' @param high_nonevents Numeric, specifying the number of nonevents in the highest bin.
#' @param hilo_obs Numeric, specifying the number of observations in the highest and lowest bins.
#'
#' @return A data frame with 4 columns
#' (score, score percentile, outcome, estimate).
#'
#' @seealso [getASISest()] [getBINNEDest()] [getCGAMest()] [getGAMest()] [getMSPLINEest()]
#'
#' @export
#'
#' @examples
#' # Read in example data
#' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc"))
#' rscore <- auroc$predicted
#' truth <- as.numeric(auroc$actual)
#'
#' tail(getPAVAest(outcome = truth, score = rscore), 10)
#'
getPAVAest <- function(outcome,
                       score,
                       weights = rep(1, length(outcome)),
                       ties = "primary",
                       low_events = NULL,
                       low_nonevents = NULL,
                       high_events = NULL,
                       high_nonevents = NULL,
                       hilo_obs = NULL) {

  checkmate::assert_numeric(outcome)
  checkmate::assert_numeric(score, len = length(outcome))
  checkmate::assert_numeric(weights, any.missing = FALSE, len = length(outcome))
  checkmate::assert_character(ties, any.missing = FALSE, len = 1)
  checkmate::assert(
    checkmate::check_integerish(low_events, lower = 1, any.missing = FALSE, len = 1),
    checkmate::check_null(low_events)
  )
  checkmate::assert(
    checkmate::check_integerish(low_nonevents, lower = 1, any.missing = FALSE, len = 1),
    checkmate::check_null(low_nonevents)
  )
  checkmate::assert(
    checkmate::check_integerish(high_events, lower = 1, any.missing = FALSE, len = 1),
    checkmate::check_null(high_events)
  )
  checkmate::assert(
    checkmate::check_integerish(high_nonevents, lower = 1, any.missing = FALSE, len = 1),
    checkmate::check_null(high_nonevents)
  )
  checkmate::assert(
    checkmate::check_integerish(hilo_obs, lower = 1, any.missing = FALSE, len = 1),
    checkmate::check_null(hilo_obs)
  )

  pava.est <- isotone::gpava(z = score, y = outcome, weights = weights, ties = ties)$x

  # if constrained, then replace percentiles
  check <- any(
    is.numeric(low_events), is.numeric(low_nonevents), is.numeric(high_events),
    is.numeric(high_nonevents), is.numeric(hilo_obs)
  )
  if (check) {
    percentile <- getConstraints(
      outcome = outcome, rscore = score,
      low_events = low_events, low_nonevents = low_nonevents,
      high_events = high_events, high_nonevents = high_nonevents,
      hilo_obs = hilo_obs
    )
  } else {
    percentile <- ecdf(score)(score)
  }

  return(data.frame(score, percentile, outcome, estimate = pava.est))
}


#' Constrained Risk Percentile Estimates
#'
#' Adjusts PAVA risk percentile estimates for the first and last bin, to meet criteria
#' for events, non-events, or total observation count.
#'
#' @inheritParams riskProfile
#'
#' @return A vector of constrained risk percentiles.
#'
#' @keywords internal
#' @noRd
#'
#' @examples
#' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc"))
#' truth <- as.numeric(auroc$actual)
#' rscore <- auroc$predicted
#'
#' getConstraints(outcome = truth, rscore = rscore, low_events = 3, high_nonevents = 3)
#'
getConstraints <- function(outcome,
                           rscore,
                           low_events = NULL, # min events in lower bin (useful for a PC)
                           low_nonevents = NULL, # min non-events in lower bin
                           high_events = NULL, # min events in upper bin
                           high_nonevents = NULL, # min non-events in upper bin (useful for a PC)
                           hilo_obs = NULL) { # min total obs in upper AND lower bin

  n <- length(rscore)
  rscore <- seq_along(rscore) / length(rscore)
  # rscore_cons: scores with binning constraint (same as rscore if no constraint specified)
  rscore_cons <- rscore

  if (!is.null(low_events) && low_events == 0) low_events <- NULL
  if (!is.null(low_nonevents) && low_nonevents == 0) low_nonevents <- NULL
  if (!is.null(high_events) && high_events == 0) high_events <- NULL
  if (!is.null(high_nonevents) && high_nonevents == 0) high_nonevents <- NULL
  if (!is.null(hilo_obs) && hilo_obs == 0) hilo_obs <- NULL

  if (is.numeric(low_events) && is.numeric(low_nonevents)) {
    warning(
      paste(
        "Specified both a minimum number of events and non-events for the lower bin.",
        "Combining for total observations instead."
      )
    )
    hilo_obs <- round(low_events + low_nonevents)
    rscore_cons[1:hilo_obs] <- min(rscore)

  } else if (is.numeric(high_events) && is.numeric(high_nonevents)) {
    warning(
      paste(
        "Specified both a minimum number of events and non-events for the upper bin.",
        "Combining for total observations instead."
      )
    )
    hilo_obs <- round(high_events + high_nonevents)
    rscore_cons[(n + 1 - hilo_obs):n] <- max(rscore)

  } else {
    # Apply upper / lower constraints
    if (is.numeric(low_events)) {
      low_events <- round(low_events)
      indlo <- match(TRUE, cumsum(outcome) == low_events)
      rscore_cons[1:indlo] <- min(rscore)
    }
    if (is.numeric(low_nonevents)) {
      low_nonevents <- round(low_nonevents)
      indlo <- match(TRUE, cumsum(1 - outcome) == low_nonevents)
      rscore_cons[1:indlo] <- min(rscore)
    }
    if (is.numeric(high_nonevents)) {
      high_nonevents <- round(high_nonevents)
      indhi <- match(TRUE, cumsum(1 - outcome[n:1]) == high_nonevents)
      rscore_cons[(n + 1 - indhi):n] <- max(rscore)
    }

    if (is.numeric(high_events)) {
      high_events <- round(high_events)
      indhi <- match(TRUE, cumsum(outcome[n:1]) == high_events)
      rscore_cons[(n + 1 - indhi):n] <- max(rscore)
    }

    if (is.numeric(hilo_obs)) {
      hilo_obs <- round(hilo_obs)
      rscore_cons[1:hilo_obs] <- min(rscore)
      rscore_cons[(n + 1 - hilo_obs):n] <- max(rscore)
    }
  }

  return(as.vector(rscore_cons))
}
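
# Tiny worked sketch of the constraint logic above (illustrative, inert): with 10
# observations and `low_events = 2`, every position up to and including the second
# event is collapsed to the lowest percentile.
if (FALSE) {
  out <- c(0, 1, 0, 1, 0, 0, 1, 0, 1, 1)
  getConstraints(outcome = out, rscore = seq_along(out), low_events = 2)
  # -> 0.1 0.1 0.1 0.1 0.5 0.6 0.7 0.8 0.9 1.0
}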


#' GAM Risk Estimates
#'
#' Fits a Generalized Additive Model to estimate risk, given a vector of binary outcomes
#' and a vector of scores.
#'
#' @inheritParams riskProfile
#' @param k Numeric to specify the upper limit of basis functions to fit for GAM.
#' See [mgcv::s()] for more details. Defaults to -1.
#' @param bs Character string to specify spline type.
#' See [mgcv::s()] for more details. Defaults to `"tp"`.
#' @param method Character string to specify method type.
#' See [mgcv::s()] for more details. Defaults to `"REML"`.
#' @param logscores Logical; if `TRUE`, fit the GAM on log scores. Defaults to `FALSE`.
#' @param fitonPerc Logical; if `TRUE`, fit the GAM on risk percentiles. Defaults to `TRUE`.
#'
#' @return A data frame with 4 columns
#' (score, score percentile, outcome, estimate).
#'
#' @seealso [getASISest()] [getBINNEDest()] [getCGAMest()] [getMSPLINEest()] [getPAVAest()]
#'
#' @export
#'
#' @examples
#' # Read in example data
#' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc"))
#' rscore <- auroc$predicted
#' truth <- as.numeric(auroc$actual)
#'
#' tail(getGAMest(outcome = truth, score = rscore), 10)
#'
getGAMest <- function(outcome,
                      score,
                      k = -1,
                      bs = "tp",
                      method = "REML",
                      logscores = FALSE,
                      fitonPerc = TRUE) {

  checkmate::assert_numeric(outcome)
  checkmate::assert_numeric(score, len = length(outcome))
  checkmate::assert_number(k)
  checkmate::assert_character(bs, any.missing = FALSE, len = 1)
  checkmate::assert_character(method, any.missing = FALSE, len = 1)
  checkmate::assert_flag(logscores)
  checkmate::assert_flag(fitonPerc)

  mygrid <- ecdf(score)(score)
  df <- data.frame(outcome = outcome, score = score, perc = mygrid)

  # mgcv::s does not work in formula, need to define it here
  s <- mgcv::s

  if (fitonPerc && logscores) {
    formula <- outcome ~ s(log(perc), k = k, bs = bs)
  } else if (fitonPerc && !logscores) {
    formula <- outcome ~ s(perc, k = k, bs = bs)
  } else if (!fitonPerc && logscores) {
    formula <- outcome ~ s(log(score), k = k, bs = bs)
  } else {
    formula <- outcome ~ s(score, k = k, bs = bs)
  }

  gam.fit <- mgcv::gam(formula, data = df, family = "binomial", method = method)

  gam.est <- mgcv::predict.gam(gam.fit, type = "response")

  return(data.frame(score, percentile = mygrid, outcome, estimate = gam.est))
}


#' Constrained GAM (cgam) Risk Estimates
#'
#' Fits a Constrained Generalized Additive Model to estimate risk,
#' given a vector of binary outcomes and a vector of scores.
#'
#' @inheritParams riskProfile
#' @param numknots Numeric to specify the number of knots.
#' Passed to the `smoother` function. Defaults to 0.
#' @param smoother Character string to specify the smoother (from the cgam package).
#' Defaults to `"s.incr"`.
#' @param logscores Logical; if `TRUE`, fit the model on log scores. Defaults to `FALSE`.
#' @param fitonPerc Logical; if `TRUE`, fit the model on risk percentiles. Defaults to `TRUE`.
#'
#' @return A data frame with 4 columns
#' (score, score percentile, outcome, estimate).
#'
#' @seealso [getASISest()] [getBINNEDest()] [getGAMest()] [getMSPLINEest()] [getPAVAest()]
#'
#' @export
#'
#' @examples
#' # Read in example data
#' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc"))
#' rscore <- auroc$predicted
#' truth <- as.numeric(auroc$actual)
#'
#' tail(getCGAMest(outcome = truth, score = rscore), 10)
#'
getCGAMest <- function(outcome,
                       score,
                       numknots = 0,
                       smoother = "s.incr",
                       logscores = FALSE,
                       fitonPerc = TRUE) {

  checkmate::assert_numeric(outcome)
  checkmate::assert_numeric(score, len = length(outcome))
  checkmate::assert_number(numknots)
  checkmate::assert_character(smoother, any.missing = FALSE, len = 1)
  checkmate::assert_flag(logscores)
  checkmate::assert_flag(fitonPerc)

  mygrid <- ecdf(score)(score)
  df <- data.frame(outcome = outcome, score = score, perc = mygrid)

  # cgam smoothers (e.g. cgam::s.incr) do not work inside the formula, need to define them here
  assign(smoother, do.call(`::`, list(pkg = "cgam", name = smoother)))

  formula <- as.formula(
    paste0(
      "outcome ~ ",
      smoother, "(",
      `if`(logscores, "log("),
      `if`(fitonPerc, "perc", "score"),
      `if`(logscores, ")"),
      ", numknots = ", numknots, ")"
    )
  )

  cgam.fit <- cgam::cgam(formula, data = df, family = "binomial")

  cgam.est <- cgam::predict.cgam(cgam.fit, type = "response")$fit

  return(data.frame(score, percentile = mygrid, outcome, estimate = cgam.est))
}
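
# Sketch of the formula string assembled above under the default settings
# (illustrative, inert): fitting on percentiles without log scores yields
# "outcome ~ s.incr(perc, numknots = 0)".
if (FALSE) {
  smoother <- "s.incr"; numknots <- 0; logscores <- FALSE; fitonPerc <- TRUE
  paste0(
    "outcome ~ ",
    smoother, "(",
    `if`(logscores, "log("),
    `if`(fitonPerc, "perc", "score"),
    `if`(logscores, ")"),
    ", numknots = ", numknots, ")"
  )
}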


### MSPLINE ESTIMATES ###

# Function written by willtownes, joseph paulson.
mspline <- function(x, y, k = 10, lower = NA, upper = NA) {
  # fits a monotonic spline to data
  # small values of k = more smoothing (flatter curves)
  # large values of k = more flexible (wiggly curves)
  # k is related to effective degrees of freedom and number of knots
  # use unconstrained gam to get rough parameter estimates
  # lower, upper optional bounds on the function
  # basically a slight modification of an example in the mgcv::pcls documentation
  dat <- data.frame(x = x, y = y)
  s <- mgcv::s # mgcv::s does not work in formula, need to define it here
  init_gam <- mgcv::gam(y ~ s(x, k = k, bs = "cr"))
  # Create design matrix, constraints etc. for monotonic spline
  sm <- mgcv::smoothCon(s(x, k = k, bs = "cr"), dat, knots = NULL)[[1]]
  mc <- mgcv::mono.con(sm$xp, lower = lower, upper = upper) # monotonicity constraints
  M <- list(
    X = sm$X, y = y, # design matrix, outcome
    C = matrix(0, 0, 0), # equality constraints (none)
    Ain = mc$A, bin = mc$b, # inequality constraints
    sp = init_gam$sp, p = sm$xp, # initial guesses for param estimates
    S = sm$S, # smoothness penalty matrix
    w = y * 0 + 1, off = 0 # weights, offset
  )
  # fit spline using penalized constrained least squares
  p <- mgcv::pcls(M)
  return(list(sm = sm, p = p))
}

# Function written by joseph paulson
predict.mspline <- function(msp, x) {
  # using the monotone spline msp, predict values for the vector x
  as.vector(mgcv::Predict.matrix(msp$sm, data.frame(x = x)) %*% msp$p)
}
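
# Illustrative round trip for the two internal helpers above (inert sketch):
# fit a monotone increasing spline to simulated binary data and evaluate it.
if (FALSE) {
  set.seed(1)
  x <- sort(runif(200))
  y <- rbinom(200, 1, plogis(4 * x - 2))
  fit <- mspline(x = x, y = y, k = 10)
  yhat <- predict.mspline(fit, x)
  plot(x, yhat, type = "l") # monotone (non-decreasing) risk estimate
}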


#' Monotone Spline Risk Estimates
#'
#' Fits a monotone-constrained Generalized Additive Model (GAM) to estimate risk,
#' given a vector of binary outcomes and a vector of scores.
#'
#' @inheritParams getGAMest
#'
#' @return A data frame with 4 columns
#' (score, score percentile, outcome, estimate).
#'
#' @seealso [getASISest()] [getBINNEDest()] [getCGAMest()] [getGAMest()] [getPAVAest()]
#'
#' @export
#'
#' @examples
#' # Read in example data
#' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc"))
#' rscore <- auroc$predicted
#' truth <- as.numeric(auroc$actual)
#'
#' tail(getMSPLINEest(outcome = truth, score = rscore), 10)
#'
getMSPLINEest <- function(outcome,
                          score,
                          k = 10,
                          fitonPerc = TRUE) {

  checkmate::assert_numeric(outcome)
  checkmate::assert_numeric(score, len = length(outcome))
  checkmate::assert_integerish(k, len = 1, any.missing = FALSE)
  checkmate::assert_flag(fitonPerc)

  stopifnot(!is.null(k), is.logical(fitonPerc))

  scorefit <- ecdf(score)
  mygrid <- scorefit(score)

  if (!fitonPerc) {
    fitspl <- mspline(x = score, y = outcome, k = k)
    mspline.est <- predict.mspline(fitspl, score)
  } else {
    fitspl <- mspline(x = mygrid, y = outcome, k = k)
    mspline.est <- predict.mspline(fitspl, mygrid)
  }

  mspline.est[mspline.est < 0] <- 0
  mspline.est[mspline.est > 1] <- 1

  return(data.frame(score, percentile = mygrid, outcome, estimate = mspline.est))
}


#' "As is" estimates
#'
#' This function does no estimation, but uses the score as it is
#' (it works like an identity function).
#'
#' @inheritParams getGAMest
#'
#' @return A data frame with 4 columns
#' (score, score percentile, outcome, estimate).
#'
#' @seealso [getBINNEDest()] [getCGAMest()] [getGAMest()] [getMSPLINEest()] [getPAVAest()]
#'
#' @export
#'
#' @examples
#' # Read in example data
#' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc"))
#' rscore <- auroc$predicted
#' truth <- as.numeric(auroc$actual)
#'
#' tail(getASISest(outcome = truth, score = rscore), 10)
#'
getASISest <- function(outcome, score) {
  # Argument checks
  checkmate::assert_numeric(outcome)
  checkmate::assert_numeric(score, len = length(outcome))
  return(
    data.frame(
      score = score,
      percentile = ecdf(score)(score),
      outcome,
      estimate = score
    )
  )
}


# Returns Risk Estimates.
# This function calls all the other getXXXest functions
# Used for Predictiveness Curve Data
getEsts <- function(methods, outcome, score) {

  # check methods
  stopifnot(is.list(methods))

  # Get function names (and perform checks)
  fun.names <- getEstMethods(methods, with.names = TRUE)

  # Run estimations
  m.est <- lapply(methods, \(x) getEst(x, outcome = outcome, score = score))

  # Special case: get errorbar data if existing
  idx.binned <- fun.names == "binned"
  m.er <- lapply(
    which(idx.binned),
    \(i) attr(m.est[[i]], "errorbar")
  )

  # Bind together and convert to long data frame
  m.est <- bind_rows(m.est, .id = "method")
  rownames(m.est) <- NULL

  # Bind together and convert to long data frame; otherwise return NULL
  m.error <- bind_rows(m.er, .id = "method")
  if (nrow(m.error) == 0) {
    m.error <- NULL
  }

  # Return indexes of step methods and risk methods
  idx.step <- fun.names %in% c("binned", "pava")
  names(idx.step) <- names(fun.names)
  idx.asis <- fun.names == "asis"

  # Check if asis was called multiple times
  if (sum(idx.asis) > 1) {
    stop("Please use 'asis' just once (as it does not have any further arguments).")
  }

  return(
    list(
      plotdata = m.est, errorbardata = m.error,
      idx.step = idx.step, idx.asis = idx.asis,
      idx.binned = idx.binned, idx.pava = fun.names == "pava"
    )
  )
}

# Define estimation functions
est.funs <- function() {
  list(
    gam = getGAMest,
    cgam = getCGAMest,
    mspline = getMSPLINEest,
    binned = getBINNEDest,
    pava = getPAVAest,
    asis = getASISest
  )
}

# Generic and S3 methods for estimations
getEst <- function(x, outcome, score) {
  UseMethod("getEst")
}


getEst.list <- function(x, outcome, score) {
  do.call(
    est.funs()[[x[["method"]]]],
    append(list(outcome = outcome, score = score), x[names(x) != "method"])
  )
}


getEst.character <- function(x, outcome, score) {
  do.call(
    est.funs()[[x]],
    list(outcome = outcome, score = score)
  )
}


getEst.function <- function(x, outcome, score) {

  # Run estimation
  out <- x(outcome = outcome, score = score)

  # Check output
  check1 <- is.data.frame(out) &&
    identical(c("estimate", "outcome", "percentile", "score"), sort(colnames(out)))
  if (!check1) {
    stop(
      paste(
        "User defined estimation functions must return a data.frame of 4 columns:",
        "score - the predictions,",
        "percentile - the percentile of score,",
        "outcome - the original outcome,",
        "and estimate - the estimated value"
      )
    )
  }

  check2 <- vapply(out, is.numeric, FUN.VALUE = logical(1), USE.NAMES = TRUE)
  if (!all(check2)) {
    stop(
      paste(
        "All columns of the returned data.frame in user defined estimation function",
        "must be numeric.",
        paste0("`", names(which(!check2)), "`", collapse = ", "), "is/are not numeric."
      )
    )
  }

  return(out)
}

# Generic and S3 methods for estimation method as string (actual estimation function used)
getEstMethod <- function(x) {
  UseMethod("getEstMethod")
}

#' @export
getEstMethod.character <- function(x) {
  x
}

#' @export
getEstMethod.list <- function(x) {
  x[["method"]]
}

#' @export
getEstMethod.function <- function(x) {
  "udf"
}

getEstMethods <- function(x, with.names) {
  vapply(x, getEstMethod, FUN.VALUE = character(1), USE.NAMES = with.names)
}
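
# Sketch of how the three supported `methods` specifications dispatch through the
# S3 generics above (illustrative, inert): a method name, a list with a "method"
# element plus arguments, and a user-supplied estimation function.
if (FALSE) {
  truth <- rbinom(50, 1, 0.3)
  sc <- runif(50)
  getEst("gam", outcome = truth, score = sc)                                  # character -> getGAMest()
  getEst(list(method = "binned", quantiles = 5), outcome = truth, score = sc) # list -> getBINNEDest()
  getEst(                                                                     # function -> used directly
    function(outcome, score) {
      data.frame(score, percentile = ecdf(score)(score), outcome, estimate = score)
    },
    outcome = truth, score = sc
  )
}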

#' Risk profile plot
#'
#' Predictiveness curve, PPV, NPV and 1-NPV risk estimates
#'
#' @param outcome Vector of binary outcomes for each observation.
#' @param score Numeric vector of continuous predicted risk score.
#' @param methods Character vector of method names (case-insensitive) for plotting curves or
#' a named list whose elements specify an estimation method and its arguments.
#' Default is set to `"asis"`.
#'
#' Full options are: `c("asis", "binned", "pava", "mspline", "gam", "cgam")`.
#'
#' To specify arguments per method, use lists. For example:
#' ```
#' list(
#'   pava = list(method = "pava", ties = "primary"),
#'   mspline = list(method = "mspline", fitonPerc = TRUE),
#'   gam = list(method = "gam", bs = "tp", logscores = FALSE),
#'   bin = list(method = "binned", bins = 10),
#'   risk = list(method = "asis")
#' )
#' ```
#' See section "Estimation" for more details.
#' @param prev.adj `NULL` (default) or scalar numeric between 0 and 1 for prevalence adjustment.
#' @param show.prev Logical; show the prevalence value in the graph. Defaults to `TRUE`.
#' @param show.nonparam.pv Logical; show non-parametric calculation of PVs. Defaults to `TRUE`.
#' @param show.best.pv Logical; show best possible PVs. Defaults to `TRUE`.
#' @param include Character vector (case-insensitive, partial matching) specifying what quantities
#' to include in the plot.
#'
#' Default is: `c("PC", "PPV", "1-NPV")`.
#'
#' Full options are: `c("NPV", "PC", "PPV", "1-NPV")`.
#' @param plot.raw Logical; if `TRUE`, plot raw `score` values, otherwise plot score percentiles.
#' Defaults to `FALSE` (i.e. percentiles).
#' @param rev.order Logical; reverse the ordering of scores. Defaults to `FALSE`.
#'
#' @section Estimation:
#' The `methods` argument specifies the estimation method.
#' You can provide either a vector of strings, any of
#' ```
#' c("asis", "binned", "pava", "mspline", "gam", "cgam")
#' ```
#' (`"asis"` is not available for `calibrationProfile`),
#' or a named list of lists.
#' In the latter case, the inner list must have an element "method",
#' which specifies the estimation function (one of those above),
#' and optionally other elements, which are passed to the estimation function.
#' For example:
#' ```
#' list(
#'   gam = list(method = "gam", k = 3),
#'   c_gam = list(method = "cgam", numknots = 3)
#' )
#' ```
#'
#' To see what arguments are available for each estimation method,
#' see the documentation of that function.
#' The naming convention is `getXest`,
#' where `X` stands for the estimation method, for example [getGAMest()].
#'
#' "gam", "cgam", and "mspline" always fit on percentiles by default.
#' To change this, use `fitonPerc = FALSE`, for example
#' ```
#' list(gam = list(method = "gam", fitonPerc = FALSE))
#' ```
#'
#' "gam" and "cgam" methods are wrappers of [mgcv::gam()] and [cgam::cgam()], respectively.
#' The default values of function arguments (like `k`, the number of knots in [mgcv::s()])
#' mirror the package defaults.
#'
#' @return A list containing the plot and data, plus `errorbar` data if they were requested
#' (through the `"binned"` estimation method with a parameter `errorbar.sem`).
#'
#' @export
#'
#' @seealso [calibrationProfile()] [sensSpec()]
#'
#' [getPAVAest()] [getBINNEDest()] [getGAMest()] [getCGAMest()] [getMSPLINEest()]
#' [getASISest()]
#'
#' @examples
#' # Read in example data
#' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc"))
#' rscore <- auroc$predicted_calibrated
#' truth <- as.numeric(auroc$actual)
#'
#' # Default plot includes 1-NPV, PPV, and a predictiveness curve (PC) based on risk-cutoff
#' p1 <- riskProfile(outcome = truth, score = rscore)
#' p1$plot
#' p1$data
#'
#' # Show also NPV
#' p2 <- riskProfile(
#'   outcome = truth,
#'   score = rscore,
#'   include = c("PC", "NPV", "PPV", "1-NPV")
#'   # or use partial matching: include = c("PC", "N", "PPV", "1")
#' )
#' p2$plot
#' p2$data
#'
#' # All estimates of prediction curve
#' p3 <- riskProfile(
#'   outcome = truth,
#'   score = rscore,
#'   methods = c("mspline", "gam", "cgam", "binned", "pava", "asis"),
#'   include = c("PC", "PPV", "1-NPV")
#' )
#' p3$plot
#'
#' # Specifying method arguments (note each list has a "method" element)
#' p4 <- riskProfile(
#'   outcome = truth,
#'   score = rscore,
#'   methods = list(
#'     "gam" = list(method = "gam", bs = "tp", logscores = FALSE, fitonPerc = TRUE),
#'     "risk" = list(method = "asis"), # no available arguments for this method
#'     "bin" = list(method = "binned", quantiles = 10, errorbar.sem = 1.2)
#'   )
#' )
#' p4$plot
#'
#' # Compare multiple GAMs in terms of Predictiveness Curves
#' p5 <- riskProfile(
#'   outcome = truth,
#'   score = rscore,
#'   methods = list(
#'     "gam_3" = list(method = "gam", k = 3),
#'     "gam_4" = list(method = "gam", k = 4),
#'     "gam_7" = list(method = "gam", k = 7)
#'   ),
#'   include = "PC"
#' )
#' p5$plot
#'
#' # Using logistic regression as user-defined estimation function, fitting on percentiles
#' # Function needs to take exactly these two arguments
#' my_est <- function(outcome, score) {
#'   # Calculate percentiles
#'   perc <- ecdf(score)(score)
#'   # Fit
#'   m <- glm(outcome ~ perc, family = "binomial")
#'   # Generate predictions
#'   preds <- predict(m, type = "response")
#'   # Return a data.frame with exactly these columns
#'   return(
#'     data.frame(
#'       score = score,
#'       percentile = perc,
#'       outcome = outcome,
#'       estimate = preds
#'     )
#'   )
#' }
#' p6 <- riskProfile(
#'   outcome = truth,
#'   score = rscore,
#'   methods = list(my_lr = my_est)
#' )
#' p6$plot
#'
#' # Using cgam as user-defined estimation function
#' # Note that you can also use the predefined cgam using methods = "cgam"
#' # Attach needed library
#' # Watch out for masking of mgcv::s and cgam::s if both are attached
#' library(cgam, quietly = TRUE)
#' # Function needs to take exactly these two arguments
#' my_est <- function(outcome, score) {
#'   # Fit on raw predictions with space = "E"
#'   m <- cgam(
#'     outcome ~ s.incr(score, numknots = 5, space = "E"),
#'     family = "binomial"
#'   )
#'   # Generate predictions and convert to vector
#'   preds <- predict(m, type = "response")$fit
#'   # Return a data.frame with exactly these columns
#'   out <- data.frame(
#'     score = score,
#'     percentile = ecdf(score)(score),
#'     outcome = outcome,
#'     estimate = preds
#'   )
#'   return(out)
#' }
#'
#' p7 <- riskProfile(
#'   outcome = truth,
#'   score = rscore,
#'   methods = list(my_cgam = my_est)
#' )
#' p7$plot
#'
#' # Prevalence adjustment to 0.1
#' p8 <- riskProfile(outcome = truth, score = rscore, prev.adj = 0.1)
#' p8$plot
#'
|
| 198 |
riskProfile <- function(outcome, |
|
| 199 |
score, |
|
| 200 |
methods = "asis", |
|
| 201 |
prev.adj = NULL, |
|
| 202 |
show.prev = TRUE, |
|
| 203 |
show.nonparam.pv = TRUE, |
|
| 204 |
show.best.pv = TRUE, |
|
| 205 |
include = c("PC", "PPV", "1-NPV"),
|
|
| 206 |
plot.raw = FALSE, |
|
| 207 |
rev.order = FALSE) {
|
|
| 208 | ||
| 209 |
# Argument checks (except outcome, score, methods - below) |
|
| 210 | 15x |
checkmate::assert_number(prev.adj, lower = 0, upper = 1, null.ok = TRUE) |
| 211 | 15x |
checkmate::assert_flag(show.nonparam.pv) |
| 212 | 15x |
checkmate::assert_flag(show.best.pv) |
| 213 | 15x |
checkmate::assert_flag(show.prev) |
| 214 | 15x |
include <- matchArgSubset(toupper(include), choices = c("PC", "PPV", "NPV", "1-NPV"))
|
| 215 | 15x |
checkmate::assert_flag(plot.raw) |
| 216 | 15x |
checkmate::assert_flag(rev.order) |
| 217 | ||
| 218 |
# Standardize/Check outcome, scores |
|
| 219 | 15x |
op <- inputCheck(outcome = outcome, score = score) |
| 220 | ||
| 221 |
# Order Data by scores |
|
| 222 | 15x |
tempdf <- orderInputs(outcome = op$outcome, score = op$score, rev.order = rev.order) |
| 223 | 15x |
score <- tempdf$score |
| 224 | 15x |
outcome <- tempdf$outcome |
| 225 | ||
| 226 |
# Check methods |
|
| 227 | 15x |
methods <- methodCheck(methods = methods) |
| 228 | 15x |
method.names <- names(methods) |
| 229 | ||
| 230 |
# Calculate prevalence |
|
| 231 | 15x |
prev <- mean(outcome, na.rm = TRUE) |
| 232 | ||
| 233 |
# Get the plot settings |
|
| 234 | 15x |
show.pc <- "PC" %in% include |
| 235 | 15x |
show.pv <- any(c("PPV", "NPV", "1-NPV") %in% include)
|
| 236 | 15x |
show.one.only <- sum(c("PC", "PPV", "NPV", "1-NPV") %in% include) == 1
|
| 237 | ||
| 238 | 15x |
if (plot.raw) {
|
| 239 | 2x |
xvar <- "score" |
| 240 |
} else {
|
|
| 241 | 13x |
xvar <- "percentile" |
| 242 |
} |
|
| 243 | ||
| 244 |
# Prediction Curve Data |
|
| 245 | 15x |
if (show.pc) {
|
| 246 | 12x |
pc.ests <- getEsts(methods = methods, outcome = outcome, score = score) |
| 247 | 12x |
PC.data <- mutate(pc.ests$plotdata, pv = "PC") |
| 248 | 12x |
errorbar.data <- pc.ests$errorbardata |
| 249 | 12x |
step.methods <- method.names[pc.ests$idx.step] |
| 250 |
} else {
|
|
| 251 | 3x |
PC.data <- data.frame( |
| 252 | 3x |
method = character(0), pv = character(0), |
| 253 | 3x |
percentile = numeric(0), score = numeric(0), estimate = numeric(0) |
| 254 |
) |
|
| 255 | 3x |
pc.ests <- errorbar.data <- NULL |
| 256 | 3x |
step.methods <- character(0) |
| 257 |
} |
|
| 258 | ||
| 259 |
# Predictive Value Data |
|
| 260 | 15x |
if (show.pv) {
|
| 261 | 14x |
PV.data <- getPVdata(outcome = outcome, score = score, methods = methods, pc.ests = pc.ests) |
| 262 |
} else {
|
|
| 263 | 1x |
PV.data <- data.frame( |
| 264 | 1x |
method = character(0), score = numeric(0), percentile = numeric(0), |
| 265 | 1x |
outcome = numeric(0), estimate = numeric(0), |
| 266 | 1x |
MNPV = numeric(0), NPV = numeric(0), PPV = numeric(0) |
| 267 |
) |
|
| 268 |
} |
|
| 269 | ||
| 270 |
# show.nonparam.pv |
|
| 271 | 15x |
if (show.nonparam.pv) {
|
| 272 | 10x |
tmp <- nonParametricPV(outcome = outcome, score = score) %>% |
| 273 | 10x |
mutate(method = "non-parametric", estimate = NA) |
| 274 | 10x |
PV.data <- bind_rows(PV.data, tmp) |
| 275 |
} |
|
| 276 | ||
| 277 |
# Dataset of inputs |
|
| 278 | 15x |
df.in <- data.frame(outcome, score, percentile = ecdf(score)(score)) |
| 279 | ||
| 280 |
# Adjust based on user defined prevalence |
|
| 281 | 15x |
if (!is.null(prev.adj)) {
|
| 282 | ||
| 283 | 1x |
cdf.cases <- ecdf(score[outcome == 1]) |
| 284 | 1x |
cdf.controls <- ecdf(score[outcome == 0]) |
| 285 | ||
| 286 | 1x |
df.in$percentile <- adjPrevPerc( |
| 287 | 1x |
perc = df.in$score, prev.new = prev.adj, |
| 288 | 1x |
cdf.case = cdf.cases, cdf.control = cdf.controls |
| 289 |
) |
|
| 290 | ||
| 291 | 1x |
if (show.pc) {
|
| 292 | 1x |
PC.data <- adjPrevPC( |
| 293 | 1x |
dat = PC.data, prev = prev, prev.new = prev.adj, |
| 294 | 1x |
cdf.case = cdf.cases, cdf.control = cdf.controls |
| 295 |
) |
|
| 296 |
} |
|
| 297 | ||
| 298 | 1x |
if (show.pv) {
|
| 299 | 1x |
PV.data <- adjPrevPV( |
| 300 | 1x |
dat = PV.data, prev = prev, prev.new = prev.adj, |
| 301 | 1x |
cdf.case = cdf.cases, cdf.control = cdf.controls |
| 302 |
) |
|
| 303 |
} |
|
| 304 | ||
| 305 | 1x |
prev <- prev.adj |
| 306 |
} |
|
| 307 | ||
| 308 |
# pivot PV data |
|
| 309 | 15x |
if (show.pv) {
|
| 310 | 14x |
PV.data <- PV.data %>% |
| 311 | 14x |
rename(`1-NPV` = "MNPV") %>% |
| 312 | 14x |
tidyr::pivot_longer( |
| 313 | 14x |
cols = all_of(c("1-NPV", "NPV", "PPV")), names_to = "pv", values_to = "pvValue"
|
| 314 |
) %>% |
|
| 315 | 14x |
mutate(pv = factor(.data$pv, levels = c("NPV", "1-NPV", "PPV"))) %>%
|
| 316 | 14x |
arrange(.data$method, .data$pv, .data$percentile) |
| 317 |
} else {
|
|
| 318 | 1x |
PV.data <- data.frame( |
| 319 | 1x |
method = character(0), score = numeric(0), percentile = numeric(0), |
| 320 | 1x |
outcome = numeric(0), estimate = numeric(0), |
| 321 | 1x |
pv = character(0), pvValue = numeric(0) |
| 322 |
) |
|
| 323 |
} |
|
| 324 | ||
| 325 |
# If showing percentiles, add row with 0th percentile |
|
| 326 | 15x |
if (!plot.raw) {
|
| 327 | 13x |
if (show.pc) {
|
| 328 | 10x |
PC.data <- add0thPercPC(PC.data) |
| 329 |
} |
|
| 330 | 13x |
if (show.pv) {
|
| 331 | 12x |
PV.data <- add0thPercPV(PV.data) |
| 332 |
} |
|
| 333 |
} |
|
| 334 | ||
| 335 |
# Subset PC data |
|
| 336 | 15x |
smoothPC <- PC.data[!PC.data$method %in% step.methods, , drop = FALSE] |
| 337 | 15x |
stepPC <- PC.data[PC.data$method %in% step.methods, , drop = FALSE] |
| 338 | ||
| 339 |
# Subset PV data |
|
| 340 | 15x |
smoothPV <- PV.data[ |
| 341 | 15x |
PV.data$pv %in% include & !PV.data$method %in% step.methods, , |
| 342 | 15x |
drop = FALSE |
| 343 |
] |
|
| 344 | 15x |
stepPV <- PV.data[ |
| 345 | 15x |
PV.data$pv %in% include & PV.data$method %in% step.methods, , |
| 346 | 15x |
drop = FALSE |
| 347 |
] |
|
| 348 | ||
| 349 |
# Different aes based on what is to be shown |
|
| 350 |
# (if one kind of PV value, use both coloring and linetype for distinguishing estimation methods) |
|
| 351 | 15x |
if (show.one.only) {
|
| 352 | 2x |
aes.pc <- aes( |
| 353 | 2x |
x = .data[[xvar]], y = .data$estimate, |
| 354 | 2x |
colour = .data$method, linetype = .data$method |
| 355 |
) |
|
| 356 | 2x |
aes.pv <- aes( |
| 357 | 2x |
x = .data[[xvar]], y = .data$pvValue, |
| 358 | 2x |
colour = .data$method, linetype = .data$method |
| 359 |
) |
|
| 360 |
} else {
|
|
| 361 | 13x |
aes.pc <- aes( |
| 362 | 13x |
x = .data[[xvar]], y = .data$estimate, |
| 363 | 13x |
colour = .data$pv, linetype = .data$method |
| 364 |
) |
|
| 365 | 13x |
aes.pv <- aes( |
| 366 | 13x |
x = .data[[xvar]], y = .data$pvValue, |
| 367 | 13x |
colour = .data$pv, linetype = .data$method |
| 368 |
) |
|
| 369 |
} |
|
| 370 | ||
| 371 |
# Build plot |
|
| 372 | 15x |
p <- ggplot() + |
| 373 | 15x |
geom_line(aes.pc, data = smoothPC, alpha = 0.8) + |
| 374 | 15x |
geom_step(aes.pc, data = stepPC, alpha = 0.8, direction = "vh") + |
| 375 | 15x |
geom_line(aes.pv, data = smoothPV, alpha = 0.8) + |
| 376 | 15x |
geom_step(aes.pv, data = stepPV, alpha = 0.8, direction = "vh") + |
| 377 | 15x |
geom_hline(yintercept = prev, alpha = 0.8, col = "black", linetype = "dashed") |
| 378 | ||
| 379 |
# Best PVs |
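# (Reference lines for the best attainable PVs; judging from bestPPV() and bestMNPV(), |
# they correspond to a perfectly discriminating score: PPV is capped at |
# min(prev / (1 - percentile), 1) and 1-NPV at max(0, 1 - (1 - prev) / percentile).) |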
|
| 380 | 15x |
if (show.best.pv) {
|
| 381 | 11x |
if (!plot.raw) {
|
| 382 | 9x |
df.in <- bind_rows(dplyr::tibble(percentile = 0, score = NA), df.in) |
| 383 |
} |
|
| 384 | 11x |
best <- df.in %>% |
| 385 | 11x |
select(all_of(c("percentile", "score"))) %>%
|
| 386 | 11x |
distinct() %>% |
| 387 | 11x |
arrange(.data$percentile, .data$score) %>% |
| 388 | 11x |
mutate( |
| 389 | 11x |
PC = ifelse(.data$percentile <= 1 - prev, 0, 1), |
| 390 | 11x |
PPV = bestPPV(perc = .data$percentile, prev = prev), |
| 391 | 11x |
`1-NPV` = bestMNPV(perc = .data$percentile, prev = prev), |
| 392 | 11x |
NPV = 1 - .data$`1-NPV` |
| 393 |
) %>% |
|
| 394 | 11x |
tidyr::pivot_longer( |
| 395 | 11x |
cols = c("PC", "PPV", "1-NPV", "NPV"), names_to = "pv", values_to = "pvValue"
|
| 396 |
) %>% |
|
| 397 | 11x |
filter(.data$pv %in% include) %>% |
| 398 | 11x |
mutate( |
| 399 | 11x |
method = "Best PVs", |
| 400 | 11x |
pv = paste("Best", .data$pv)
|
| 401 |
) |
|
| 402 | ||
| 403 | 11x |
if (show.one.only) {
|
| 404 | 1x |
p <- p + |
| 405 | 1x |
geom_line( |
| 406 | 1x |
data = best, |
| 407 | 1x |
aes(x = .data[[xvar]], y = .data$pvValue, linewidth = .data$pv), |
| 408 | 1x |
colour = "gray60" |
| 409 |
) + |
|
| 410 | 1x |
scale_linewidth_manual( |
| 411 | 1x |
values = c("Best 1-NPV" = 0.3, "Best PPV" = 0.3, "Best PC" = 0.3, "Best NPV" = 0.3),
|
| 412 | 1x |
name = "Best PVs" |
| 413 |
) |
|
| 414 |
} else {
|
|
| 415 | 10x |
p <- p + |
| 416 | 10x |
geom_line(data = best, aes(x = .data[[xvar]], y = .data$pvValue, colour = .data$pv)) |
| 417 |
} |
|
| 418 |
} |
|
| 419 | ||
| 420 |
# Add errorbars |
|
| 421 | 15x |
if (show.pc && !is.null(errorbar.data)) {
|
| 422 | 1x |
p <- p + |
| 423 | 1x |
geom_point( |
| 424 | 1x |
data = errorbar.data, |
| 425 | 1x |
aes(x = .data$midquantile, y = .data$bin.mid), |
| 426 | 1x |
alpha = 0.8, |
| 427 | 1x |
size = 0.2 |
| 428 |
) + |
|
| 429 | 1x |
geom_errorbar( |
| 430 | 1x |
data = errorbar.data, |
| 431 | 1x |
aes( |
| 432 | 1x |
x = .data$midquantile, |
| 433 | 1x |
ymin = .data$bin.low, |
| 434 | 1x |
ymax = .data$bin.high, |
| 435 | 1x |
width = .02 |
| 436 |
), |
|
| 437 | 1x |
alpha = 0.7, |
| 438 | 1x |
linewidth = 0.2, |
| 439 | 1x |
inherit.aes = FALSE |
| 440 |
) |
|
| 441 |
} |
|
| 442 | ||
| 443 |
# Always use the same colours for PVs |
|
| 444 | 15x |
if (!show.one.only) {
|
| 445 | 13x |
clrs <- predictionColours(include, show.best = show.best.pv) |
| 446 | 13x |
p <- p + scale_colour_manual(values = clrs, breaks = names(clrs)) |
| 447 |
} else {
|
|
| 448 | 2x |
p <- p + scale_colour_hue(l = 45) |
| 449 |
} |
|
| 450 | ||
| 451 |
# Finalize plot |
|
| 452 | 15x |
p <- p + |
| 453 | 15x |
labs( |
| 454 | 15x |
title = "Predictiveness Plot", |
| 455 | 15x |
x = ifelse(plot.raw, "Prediction Score", "Risk Percentile"), |
| 456 | 15x |
y = "Predicted Risk / Predictive Value", |
| 457 | 15x |
linetype = "Estimation Method", |
| 458 | 15x |
colour = ifelse(show.one.only, "Estimation Method", "Predictive Quantity") |
| 459 |
) + |
|
| 460 | 15x |
scale_x_continuous(n.breaks = 6) + |
| 461 | 15x |
scale_y_continuous(n.breaks = 6) + |
| 462 | 15x |
theme_bw() + |
| 463 | 15x |
theme(legend.key.width = unit(2, "line")) |
| 464 |
|
|
| 465 |
# Add prevalence annotation if requested |
|
| 466 | 15x |
if (show.prev) {
|
| 467 |
# x-value for plotting prevalence label |
|
| 468 | 15x |
prev_x <- ifelse(plot.raw, min(score), 0) |
| 469 | 15x |
prev_nudge_x <- ifelse(plot.raw, (max(score) - min(score)) / 10, 0.1) |
| 470 | 15x |
prev_nudge_y <- ggplot2::layer_scales(p)$y$get_limits()[2] / 10 |
| 471 |
|
|
| 472 | 15x |
p <- p + annotate( |
| 473 | 15x |
geom = "text", |
| 474 | 15x |
x = prev_x + prev_nudge_x, |
| 475 | 15x |
y = ifelse(prev > 0.8, prev - prev_nudge_y, prev + prev_nudge_y), |
| 476 | 15x |
label = paste0("Prevalence: ", "\n", round(prev, 3)),
|
| 477 | 15x |
colour = "black", |
| 478 | 15x |
alpha = 0.8, |
| 479 | 15x |
size = 3.5 |
| 480 |
) |
|
| 481 |
} |
|
| 482 | ||
| 483 | 15x |
if (show.one.only) {
|
| 484 | 2x |
p <- p + guides(colour = guide_legend(order = 1), linetype = guide_legend(order = 1)) |
| 485 |
} else {
|
|
| 486 | 13x |
p <- p + guides(colour = guide_legend(order = 1), linetype = guide_legend(order = 2)) |
| 487 |
} |
|
| 488 | ||
| 489 | 15x |
if (show.best.pv) {
|
| 490 | 11x |
PV.data <- bind_rows(PV.data, mutate(best, pv = gsub("Best ", "", .data$pv)))
|
| 491 |
} |
|
| 492 | ||
| 493 | 15x |
return( |
| 494 | 15x |
list( |
| 495 | 15x |
plot = p, |
| 496 | 15x |
data = bind_rows( |
| 497 | 15x |
dplyr::as_tibble(PC.data), |
| 498 | 15x |
dplyr::as_tibble(PV.data) |
| 499 |
), |
|
| 500 | 15x |
errorbar = errorbar.data |
| 501 |
) |
|
| 502 |
) |
|
| 503 |
} |
| 1 |
#' Sensitivity and specificity plot |
|
| 2 |
#' |
|
| 3 |
#' Sensitivity and specificity risk estimates |
|
| 4 |
#' |
|
| 5 |
#' Given individual binary outcomes and scores, this function plots sensitivity and specificity |
|
| 6 |
#' (using each score as a cutoff) against the corresponding score percentiles. |
|
| 7 |
#' |
|
| 8 |
#' @inheritParams riskProfile |
|
| 9 |
#' @param show.best Logical; include the best possible sensitivity and specificity? Defaults to `TRUE`. |
|
| 10 |
#' |
|
| 11 |
#' @inheritSection riskProfile Estimation |
|
| 12 |
#' |
|
| 13 |
#' @return A list containing the plot and data. |
|
| 14 |
#' |
|
| 15 |
#' @export |
|
| 16 |
#' |
|
| 17 |
#' @seealso [riskProfile()] [calibrationProfile()] |
|
| 18 |
#' |
|
| 19 |
#' [getPAVAest()] [getBINNEDest()] [getGAMest()] [getCGAMest()] [getMSPLINEest()] |
|
| 20 |
#' [getASISest()] |
|
| 21 |
#' |
|
| 22 |
#' @examples |
|
| 23 |
#' # Read in example data |
|
| 24 |
#' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc"))
|
|
| 25 |
#' rscore <- auroc$predicted_calibrated |
|
| 26 |
#' truth <- as.numeric(auroc$actual) |
|
| 27 |
#' |
|
| 28 |
#' # Plot sensitivity and specificity |
|
| 29 |
#' p1 <- sensSpec(outcome = truth, score = rscore) |
|
| 30 |
#' p1$plot |
|
| 31 |
#' |
|
| 32 |
#' # Same with smoothed estimates |
|
| 33 |
#' p2 <- sensSpec(outcome = truth, score = rscore, methods = c("asis", "gam"))
|
|
| 34 |
#' p2$plot |
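#' |
#' # The same plot on the raw score scale (illustrative call; plot.raw is inherited from riskProfile) |
#' p3 <- sensSpec(outcome = truth, score = rscore, plot.raw = TRUE) |
#' p3$plot |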
|
| 35 |
#' |
|
| 36 |
sensSpec <- function(outcome, |
|
| 37 |
score, |
|
| 38 |
methods = "asis", |
|
| 39 |
show.best = TRUE, |
|
| 40 |
plot.raw = FALSE, |
|
| 41 |
rev.order = FALSE) {
|
|
| 42 | ||
| 43 |
# Argument checks |
|
| 44 | 7x |
checkmate::assert_flag(show.best) |
| 45 | 7x |
checkmate::assert_flag(plot.raw) |
| 46 | 7x |
checkmate::assert_flag(rev.order) |
| 47 | ||
| 48 |
# Check methods |
|
| 49 | 7x |
methods <- methodCheck(methods = methods) |
| 50 | ||
| 51 | 7x |
if (plot.raw) {
|
| 52 | 2x |
xvar <- "score" |
| 53 |
} else {
|
|
| 54 | 5x |
xvar <- "percentile" |
| 55 |
} |
|
| 56 | ||
| 57 |
# Standardize/Check outcome, scores |
|
| 58 | 7x |
op <- inputCheck(outcome = outcome, score = score) |
| 59 | ||
| 60 |
# Order Data by scores |
|
| 61 | 7x |
tempdf <- orderInputs(outcome = op$outcome, score = op$score, rev.order = rev.order) |
| 62 | 7x |
score <- tempdf$score |
| 63 | 7x |
outcome <- tempdf$outcome |
| 64 | ||
| 65 |
# Calculate spec and sens |
|
| 66 | 7x |
dat <- getEsts(outcome = outcome, score = score, methods = methods)$plotdata |
| 67 | 7x |
dat <- split(dat, dat$method) %>% |
| 68 | 7x |
lapply(\(d) nonParametricTR(outcome = d$outcome, score = d$estimate)) %>% |
| 69 | 7x |
bind_rows(.id = "method") |
| 70 | ||
| 71 |
# Plot |
|
| 72 | 7x |
p <- ggplot(dat) + |
| 73 | 7x |
geom_step( |
| 74 | 7x |
aes(x = .data[[xvar]], y = .data$value, colour = .data$method, linetype = .data$pf), |
| 75 | 7x |
direction = "hv" |
| 76 |
) |
|
| 77 | ||
| 78 |
# Add best possible sensitivity and specificity |
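# (From bestSens()/bestSpec(): the "best possible" curves correspond to a score that |
# ranks all cases above all controls, so sensitivity stays at 1 until the cutoff |
# percentile exceeds 1 - prevalence, and specificity equals 1 from that point on.) |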
|
| 79 | 7x |
if (show.best) {
|
| 80 | 7x |
prev <- mean(outcome) |
| 81 | 7x |
if (plot.raw) {
|
| 82 | 2x |
best <- data.frame(percentile = ecdf(score)(score), score = score) |
| 83 |
} else {
|
|
| 84 | 5x |
best <- data.frame(percentile = c(0, ecdf(score)(score)), score = c(NA, score)) |
| 85 |
} |
|
| 86 | 7x |
best <- best %>% |
| 87 | 7x |
distinct() %>% |
| 88 | 7x |
mutate( |
| 89 | 7x |
Sensitivity = bestSens(perc = .data$percentile, prev = prev), |
| 90 | 7x |
Specificity = bestSpec(perc = .data$percentile, prev = prev) |
| 91 |
) %>% |
|
| 92 | 7x |
tidyr::pivot_longer( |
| 93 | 7x |
cols = c("Sensitivity", "Specificity"), names_to = "pf", values_to = "value"
|
| 94 |
) %>% |
|
| 95 | 7x |
mutate(method = "best") |
| 96 | 7x |
p <- p + |
| 97 | 7x |
geom_line( |
| 98 | 7x |
aes(x = .data[[xvar]], y = .data$value, linetype = .data$pf, linewidth = "Best Possible"), |
| 99 | 7x |
data = best, |
| 100 | 7x |
colour = "gray70" |
| 101 |
) + |
|
| 102 | 7x |
scale_linewidth_manual(values = c("Best Possible" = 0.5), name = NULL)
|
| 103 |
} |
|
| 104 | ||
| 105 |
# Finalize plot |
|
| 106 | 7x |
p <- p + |
| 107 | 7x |
labs( |
| 108 | 7x |
title = "Sensitivity and Specificity Plot", |
| 109 | 7x |
x = ifelse(plot.raw, "Prediction Score", "Risk Percentile"), |
| 110 | 7x |
y = "True Positive / Negative Rate", |
| 111 | 7x |
colour = "Estimation Method", |
| 112 | 7x |
linetype = "Predictive Quantity" |
| 113 |
) + |
|
| 114 | 7x |
scale_x_continuous(n.breaks = 6) + |
| 115 | 7x |
scale_y_continuous(n.breaks = 6) + |
| 116 | 7x |
scale_linetype_manual(values = c("Sensitivity" = "solid", "Specificity" = "dashed")) +
|
| 117 | 7x |
scale_colour_hue(l = 45) + |
| 118 | 7x |
theme_bw() + |
| 119 | 7x |
theme(legend.key.width = unit(2, "line")) + |
| 120 | 7x |
guides( |
| 121 | 7x |
linetype = guide_legend(order = 1, override.aes = list(colour = "black")), |
| 122 | 7x |
colour = guide_legend(order = 2), |
| 123 | 7x |
linewidth = guide_legend(order = 3) |
| 124 |
) |
|
| 125 | ||
| 126 | 7x |
if (show.best) {
|
| 127 | 7x |
dat <- bind_rows(dat, best) |
| 128 |
} |
|
| 129 | ||
| 130 | 7x |
return(list(plot = p, data = dplyr::as_tibble(dat))) |
| 131 |
} |
| 1 |
#' Check Input Arguments |
|
| 2 |
#' |
|
| 3 |
#' Given outcomes and prediction scores, checks for obvious issues |
|
| 4 |
#' and raises errors or warnings as needed. |
|
| 5 |
#' These include unequal lengths, missing values, non-binary outcomes, |
|
| 6 |
#' non-numeric scores, etc. |
|
| 7 |
#' |
|
| 8 |
#' @inheritParams riskProfile |
|
| 9 |
#' |
|
| 10 |
#' @return A list containing the standardized outcome and score vectors. |
|
| 11 |
#' |
|
| 12 |
#' @keywords internal |
|
| 13 |
#' @noRd |
|
| 14 |
#' |
|
| 15 |
#' @examples |
|
| 16 |
#' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc"))
|
|
| 17 |
#' rscore <- auroc$predicted |
|
| 18 |
#' truth <- as.numeric(auroc$actual) |
|
| 19 |
#' inputCheck(truth, rscore) |
|
| 20 |
#' |
|
| 21 |
inputCheck <- function(outcome, score) {
|
|
| 22 | ||
| 23 |
# Predscore - check numeric |
|
| 24 | 41x |
if (!is.numeric(score)) {
|
| 25 | 1x |
stop("'score' vector must be represented as a numeric.")
|
| 26 |
} |
|
| 27 | ||
| 28 |
# Outcome - logical to numeric |
|
| 29 | 40x |
if (is.logical(outcome)) {
|
| 30 | 33x |
outcome <- as.numeric(outcome) |
| 31 |
} |
|
| 32 | ||
| 33 |
# Outcome - check numeric |
|
| 34 | 40x |
if (!is.numeric(outcome)) {
|
| 35 | 1x |
stop("'outcome' vector needs to be a numeric vector.")
|
| 36 |
} |
|
| 37 | ||
| 38 |
# Check matching lengths |
|
| 39 | 39x |
if (length(outcome) != length(score)) {
|
| 40 | 1x |
stop("'outcome' and 'score' must have the same lengths.")
|
| 41 |
} |
|
| 42 | ||
| 43 |
# Need at least 3 observations |
|
| 44 | 38x |
if (length(outcome) < 3) {
|
| 45 | 1x |
stop("Need at least 3 observations.")
|
| 46 |
} |
|
| 47 | ||
| 48 |
# Drop observations with NAs in outcome or score |
|
| 49 | 37x |
if (any(is.na(outcome)) || any(is.na(score))) {
|
| 50 | 2x |
warning("Observations with NA's are dropped")
|
| 51 | 2x |
ind <- intersect( |
| 52 | 2x |
which(complete.cases(outcome)), |
| 53 | 2x |
which(complete.cases(score)) |
| 54 |
) |
|
| 55 | 2x |
outcome <- outcome[ind] |
| 56 | 2x |
score <- score[ind] |
| 57 |
} |
|
| 58 | ||
| 59 |
# Outcome - check incorrect values |
|
| 60 | 37x |
if (!all(outcome %in% 0:1)) {
|
| 61 | 1x |
stop("'outcome' vector must be represented as binary 1 or 0.")
|
| 62 |
} |
|
| 63 | ||
| 64 |
# Predscore - check low frequency of score values |
|
| 65 | 36x |
if (length(unique(score)) <= 5) {
|
| 66 | 2x |
tbl <- table(score) |
| 67 | 2x |
if (any(tbl <= 3)) {
|
| 68 | 2x |
warning( |
| 69 | 2x |
paste( |
| 70 | 2x |
"There is a low-occurrence value in `score` (", names(tbl)[tbl <= 3][1], ").",
|
| 71 | 2x |
"The results may be unreliable." |
| 72 |
) |
|
| 73 |
) |
|
| 74 |
} |
|
| 75 |
} |
|
| 76 | ||
| 77 |
# Outcome - check low frequency |
|
| 78 | 36x |
if (any(prop.table(table(outcome)) <= 0.03)) {
|
| 79 | 1x |
warning( |
| 80 | 1x |
paste( |
| 81 | 1x |
"There is a low frequency of one of the outcome classes (`prop.table(table(outcome))`).", |
| 82 | 1x |
"The results may be unreliable." |
| 83 |
) |
|
| 84 |
) |
|
| 85 |
} |
|
| 86 | ||
| 87 | 36x |
return(list(outcome = outcome, score = score)) |
| 88 |
} |
|
| 89 | ||
| 90 | ||
| 91 |
#' Order Inputs |
|
| 92 |
#' |
|
| 93 |
#' Given vectors of prediction scores and outcomes, the function orders both by score. |
|
| 94 |
#' The ordering can optionally be reversed if lower scores correspond to a higher rate of outcomes. |
|
| 95 |
#' |
|
| 96 |
#' @inheritParams riskProfile |
|
| 97 |
#' |
|
| 98 |
#' @return An ordered, complete-case data frame of scores and outcomes. |
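#' |
#' @examples |
#' # Minimal toy calls (illustrative only): |
#' orderInputs(outcome = c(1, 0, 1), score = c(0.2, 0.9, 0.1)) |
#' # rev.order = TRUE negates the scores before ordering |
#' orderInputs(outcome = c(1, 0, 1), score = c(0.2, 0.9, 0.1), rev.order = TRUE) |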
|
| 99 |
#' |
|
| 100 |
#' @keywords internal |
|
| 101 |
#' @noRd |
|
| 102 |
#' |
|
| 103 |
orderInputs <- function(outcome, score, rev.order = FALSE) {
|
|
| 104 | 33x |
tempdf <- data.frame(score = score, outcome = outcome) |
| 105 | 33x |
if (rev.order) {
|
| 106 | 6x |
tempdf$score <- -tempdf$score |
| 107 |
} |
|
| 108 | 33x |
tempdf <- tempdf[order(tempdf$score, tempdf$outcome), ] |
| 109 | 33x |
return(tempdf) |
| 110 |
} |
|
| 111 | ||
| 112 |
#' Method Check |
|
| 113 |
#' |
|
| 114 |
#' Usage: |
|
| 115 |
#' 1. methods <- methodCheck(methods) |
|
| 116 |
#' 2. getEstMethod(methods[[1]]) or getEstMethods(methods) |
|
| 117 |
#' 3. getEst(methods, ...) |
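#' |
#' For illustration (based only on the checks implemented below): both |
#' methodCheck(c("gam", "pava")) and |
#' methodCheck(list(g = list(method = "gam", k = 3))) |
#' return a named list, whereas an unnamed inner list, an unknown method name, or a |
#' user-defined function without the (outcome, score) arguments raises an error. |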
|
| 118 |
#' |
|
| 119 |
#' @inheritParams riskProfile |
|
| 120 |
#' |
|
| 121 |
#' @return A named list of estimation method specifications (method names, argument lists, or user-defined functions) |
|
| 122 |
#' |
|
| 123 |
#' @keywords internal |
|
| 124 |
#' @noRd |
|
| 125 |
#' |
|
| 126 |
methodCheck <- function(methods) {
|
|
| 127 | ||
| 128 |
# Drop empty strings |
|
| 129 | 54x |
if (any(methods == "")) {
|
| 130 | 1x |
methods <- methods[methods != ""] |
| 131 |
} |
|
| 132 | ||
| 133 |
# If character is supplied |
|
| 134 | 54x |
if (is.character(methods)) {
|
| 135 | ||
| 136 |
# Convert to lower-case |
|
| 137 | 32x |
orig.names <- methods |
| 138 | 32x |
methods <- tolower(methods) |
| 139 | ||
| 140 |
# Check uniqueness |
|
| 141 | 32x |
if (length(unique(methods)) != length(methods)) {
|
| 142 | 1x |
stop("`methods` should be unique when specified as character.")
|
| 143 |
} |
|
| 144 | ||
| 145 |
# Check available method |
|
| 146 | 31x |
if (!all(methods %in% names(est.funs()))) {
|
| 147 | 2x |
stop( |
| 148 | 2x |
paste( |
| 149 | 2x |
"Supplied method is not yet available. Try selecting from (case insensitive): ", |
| 150 | 2x |
paste(shQuote(names(est.funs())), collapse = ", ") |
| 151 |
) |
|
| 152 |
) |
|
| 153 |
} |
|
| 154 | ||
| 155 |
# Convert to list |
|
| 156 | 29x |
methods <- structure(as.list(methods), names = orig.names) |
| 157 | ||
| 158 |
# Otherwise, if a list is provided: |
|
| 159 |
# list(gam1 = list(method = "gam", k = 3), pv = list(method = "pava", ...), etc) |
|
| 160 |
# or list(my_estimate = function(outcome, score) {...})
|
|
| 161 | 22x |
} else if (is.list(methods)) {
|
| 162 | ||
| 163 |
# Check named list |
|
| 164 | 20x |
if (!checkmate::test_named(methods, type = "unique")) {
|
| 165 | 1x |
stop("'methods' should be a uniquely named list.")
|
| 166 |
} |
|
| 167 | ||
| 168 |
# Check and unify estimation method names |
|
| 169 | 19x |
methods <- lapply(methods, listMethodCheck) |
| 170 | ||
| 171 |
} else {
|
|
| 172 | 2x |
stop( |
| 173 | 2x |
paste( |
| 174 | 2x |
"`methods` must be character", |
| 175 | 2x |
"or named list of estimation methods / user defined functions." |
| 176 |
) |
|
| 177 |
) |
|
| 178 |
} |
|
| 179 | ||
| 180 | 42x |
return(methods) |
| 181 |
} |
|
| 182 | ||
| 183 |
# x is a single method (element of an outer list) |
|
| 184 |
listMethodCheck <- function(x) {
|
|
| 185 | 25x |
UseMethod("listMethodCheck")
|
| 186 |
} |
|
| 187 | ||
| 188 |
#' @export |
|
| 189 |
listMethodCheck.list <- function(x) {
|
|
| 190 | ||
| 191 |
# Check named list |
|
| 192 | 20x |
if (!checkmate::test_named(x, type = "unique")) {
|
| 193 | 1x |
stop("Inner lists in the 'methods' argument must be named.")
|
| 194 |
} |
|
| 195 | ||
| 196 |
# Check "method" element existing in the list |
|
| 197 | 19x |
if (is.null(x[["method"]]) || !is.character(x[["method"]])) {
|
| 198 | 2x |
stop( |
| 199 | 2x |
paste( |
| 200 | 2x |
'All lists must have a "method" element specifying one of the predefined', |
| 201 | 2x |
"estimation functions as a string. Please select from:", |
| 202 | 2x |
paste(shQuote(names(est.funs())), collapse = ", ") |
| 203 |
) |
|
| 204 |
) |
|
| 205 |
} |
|
| 206 | ||
| 207 |
# Update method name |
|
| 208 | 17x |
x[["method"]] <- tolower(x[["method"]]) |
| 209 | ||
| 210 |
# Check available method |
|
| 211 | 17x |
chck_available <- checkmate::test_subset( |
| 212 | 17x |
x[["method"]], |
| 213 | 17x |
choices = names(est.funs()), empty.ok = FALSE |
| 214 |
) |
|
| 215 | 17x |
if (!chck_available) {
|
| 216 | 2x |
stop( |
| 217 | 2x |
paste0( |
| 218 | 2x |
"Supplied method '", |
| 219 | 2x |
x[["method"]], |
| 220 | 2x |
"' is not yet available. Try selecting from: ", |
| 221 | 2x |
paste(shQuote(names(est.funs())), collapse = ", ") |
| 222 |
) |
|
| 223 |
) |
|
| 224 |
} |
|
| 225 | ||
| 226 | 15x |
return(x) |
| 227 |
} |
|
| 228 | ||
| 229 |
#' @export |
|
| 230 |
listMethodCheck.function <- function(x) {
|
|
| 231 | ||
| 232 |
# Check input arguments |
|
| 233 | 4x |
if (!checkmate::test_function(x, args = c("outcome", "score"))) {
|
| 234 | 2x |
stop( |
| 235 | 2x |
paste( |
| 236 | 2x |
"All user defined estimation functions need to take exactly two arguments:", |
| 237 | 2x |
"'outcome' and 'score'." |
| 238 |
) |
|
| 239 |
) |
|
| 240 |
} |
|
| 241 | ||
| 242 | 2x |
return(x) |
| 243 |
} |
|
| 244 | ||
| 245 |
#' @export |
|
| 246 |
listMethodCheck.default <- function(x) {
|
|
| 247 | 1x |
stop( |
| 248 | 1x |
paste( |
| 249 | 1x |
"The estimation method must be specified as a list or a function." |
| 250 |
) |
|
| 251 |
) |
|
| 252 |
} |
|
| 253 | ||
| 254 |
# Compared to match.arg(..., several.ok = TRUE), |
|
| 255 |
# this does not allow NULL and returns the unique matched choices |
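# e.g. matchArgSubset(c("PC", "PP"), choices = c("PC", "PPV", "NPV", "1-NPV")) is |
# expected to return c("PC", "PPV"); an unmatched value errors with a message naming |
# that value (illustration only, inferred from the implementation below) |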
|
| 256 |
matchArgSubset <- function(x, choices) {
|
|
| 257 | 24x |
checkmate::assert_character(x, any.missing = FALSE, min.len = 1) |
| 258 | 24x |
out <- c() |
| 259 | 24x |
for (ag in x) {
|
| 260 | 60x |
matched <- tryCatch( |
| 261 | 60x |
match.arg(ag, choices = choices), |
| 262 | 60x |
error = function(e) stop(sub("'arg'", paste0("'", ag, "'"), e))
|
| 263 |
) |
|
| 264 | 60x |
out <- c(out, matched) |
| 265 |
} |
|
| 266 | 24x |
return(unique(out)) |
| 267 |
} |
|
| 268 | ||
| 269 | ||
| 270 |
add0thPercPC <- function(x) {
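# Prepend, per method, a percentile-0 row carrying the left-most PC estimate so that |
# plotted curves start at the 0th percentile (add0thPercPV() below does the analogous |
# step per method and PV type) |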
|
|
| 271 | 11x |
bind_rows( |
| 272 | 11x |
x, |
| 273 | 11x |
x %>% |
| 274 | 11x |
group_by(.data$method) %>% |
| 275 | 11x |
summarise( |
| 276 | 11x |
score = NA, |
| 277 | 11x |
percentile = 0, |
| 278 | 11x |
outcome = NA, |
| 279 | 11x |
estimate = .data$estimate[.data$percentile == min(.data$percentile)][1], |
| 280 | 11x |
pv = "PC", |
| 281 | 11x |
.groups = "drop" |
| 282 |
) |
|
| 283 |
) %>% |
|
| 284 | 11x |
arrange(.data$method, .data$percentile) |
| 285 |
} |
|
| 286 | ||
| 287 | ||
| 288 |
add0thPercPV <- function(x) {
|
|
| 289 | 13x |
bind_rows( |
| 290 | 13x |
x, |
| 291 | 13x |
x %>% |
| 292 | 13x |
group_by(.data$method, .data$pv) %>% |
| 293 | 13x |
summarise( |
| 294 | 13x |
score = NA, |
| 295 | 13x |
percentile = 0, |
| 296 | 13x |
estimate = NA, |
| 297 | 13x |
pvValue = dplyr::first(.data$pvValue), |
| 298 | 13x |
.groups = "drop" |
| 299 |
) %>% |
|
| 300 | 13x |
dplyr::relocate("pv", .before = "pvValue")
|
| 301 |
) %>% |
|
| 302 | 13x |
arrange(.data$method, .data$pv, .data$percentile) |
| 303 |
} |
|
| 304 | ||
| 305 | ||
| 306 |
#' For snapshot testing of graphs |
|
| 307 |
#' |
|
| 308 |
#' @param code Code to create a graph |
|
| 309 |
#' @param width Width of the plot. |
|
| 310 |
#' @param height Height of the plot. |
|
| 311 |
#' |
|
| 312 |
#' @return Filepath |
|
| 313 |
#' |
|
| 314 |
#' @keywords internal |
|
| 315 |
#' @noRd |
|
| 316 |
#' |
|
| 317 |
#' @examples |
|
| 318 |
#' expect_snapshot_file(save_png(ggplot(mtcars) + |
|
| 319 |
#' geom_point(aes(hp, mpg))), "riskProfile.png") |
|
| 320 |
#' |
|
| 321 |
save_png <- function(code, width = 400, height = 400) { # nocov start
|
|
| 322 |
path <- tempfile(fileext = ".png") |
|
| 323 |
png(path, width = width, height = height) |
|
| 324 |
on.exit(dev.off()) |
|
| 325 |
print(code) |
|
| 326 |
return(path) |
|
| 327 |
} # nocov end |
| 1 | ||
| 2 |
#' zipFastener for two dataframes of equal length |
|
| 3 |
#' |
|
| 4 |
#' This function acts like a “zip fastener” for combining two dataframes. |
|
| 5 |
#' It takes the first row of the first data frame and places it above |
|
| 6 |
#' the first row of the second data frame and so on. |
|
| 7 |
#' |
|
| 8 |
#' @param df1 dataframe 1 |
|
| 9 |
#' @param df2 dataframe 2 |
|
| 10 |
#' |
|
| 11 |
#' @return Zipped data.frame |
|
| 12 |
#' |
|
| 13 |
#' @keywords internal |
|
| 14 |
#' @noRd |
|
| 15 |
#' |
|
| 16 |
#' @examples |
|
| 17 |
#' df1 <- data.frame(a = 1:3, b = 1:3, c = 1:3) |
|
| 18 |
#' df2 <- data.frame(a = letters[1:3], b = letters[1:3], c = letters[1:3]) |
|
| 19 |
#' |
|
| 20 |
#' zipFastener(df1, df2) |
|
| 21 |
#' |
|
| 22 |
zipFastener <- function(df1, df2) {
|
|
| 23 | ||
| 24 | 8x |
if (ncol(df1) != ncol(df2)) {
|
| 25 | 1x |
stop("the no. of columns has to be equal to merge them by zip feeding")
|
| 26 |
} |
|
| 27 | ||
| 28 | 7x |
d1 <- nrow(df1) |
| 29 | 7x |
d2 <- nrow(df2) |
| 30 | ||
| 31 | 7x |
if (d1 != d2) {
|
| 32 | 1x |
stop("the no. of rows has to be equal to merge them by zip feeding")
|
| 33 |
} |
|
| 34 | ||
| 35 |
# zip fastener preparations |
|
| 36 | 6x |
i1 <- 1:d1 # index vector 1 |
| 37 | 6x |
i2 <- 1:d1 + d1 # index vector 2 |
| 38 | ||
| 39 |
# zip fastener operations |
|
| 40 | 6x |
index <- as.vector(matrix(c(i1, i2), ncol = d1, byrow = TRUE)) |
| 41 | 6x |
index <- index[!is.na(index)] # remove NAs |
| 42 | ||
| 43 | 6x |
colnames(df2) <- colnames(df1) # keep 1st colnames |
| 44 | 6x |
res <- rbind(df1, df2)[index, ] # reorder data frame |
| 45 | ||
| 46 | 6x |
return(res) |
| 47 |
} |
|
| 48 | ||
| 49 | ||
| 50 |
#' Last observation carried forward: for binned |
|
| 51 |
#' |
|
| 52 |
#' Function assumes columns in data frame are: percentile, estimate, method |
|
| 53 |
#' |
|
| 54 |
#' @param df dataframe containing predcurve estimates |
|
| 55 |
#' |
|
| 56 |
#' @return dataframe with adjusted points |
|
| 57 |
#' |
|
| 58 |
#' @keywords internal |
|
| 59 |
#' @noRd |
|
| 60 |
#' |
|
| 61 |
locf.binned <- function(df) {
|
|
| 62 | ||
| 63 | 4x |
df.copy <- df |
| 64 |
# keep same x, slide y back one spot |
|
| 65 | 4x |
df.copy$estimate <- df.copy$estimate[c(2:length(df.copy$estimate), NA)] |
| 66 | 4x |
combined.df <- zipFastener(df, df.copy) |
| 67 | 4x |
combined.df <- combined.df[-nrow(combined.df), ] # removes a redundant point |
| 68 | ||
| 69 | 4x |
return(combined.df) |
| 70 |
} |
|
| 71 | ||
| 72 |
#' Last observation carried forward: for pava |
|
| 73 |
#' |
|
| 74 |
#' Function assumes columns in data frame are: percentile, estimate, method |
|
| 75 |
#' |
|
| 76 |
#' @param df dataframe containing predcurve estimates |
|
| 77 |
#' |
|
| 78 |
#' @return dataframe with adjusted points |
|
| 79 |
#' |
|
| 80 |
#' @keywords internal |
|
| 81 |
#' @noRd |
|
| 82 |
#' |
|
| 83 |
locf.pava <- function(df) {
|
|
| 84 | ||
| 85 |
# save first and last points |
|
| 86 | 2x |
first <- df[1, ] |
| 87 | 2x |
last <- df[nrow(df) - 1, ] |
| 88 | ||
| 89 |
# adjust data to remove extra points |
|
| 90 | 2x |
diff_y <- diff(c(0, df$estimate)) |
| 91 | 2x |
pava.int <- df[diff_y != 0, ] |
| 92 | ||
| 93 |
# re-add first and last points |
|
| 94 | 2x |
df <- rbind(first, pava.int, last) |
| 95 | 2x |
df <- df[order(df$percentile), ] # order on percentile |
| 96 | ||
| 97 | 2x |
df.copy <- df |
| 98 |
# keep same x, slide y back one spot |
|
| 99 | 2x |
df.copy$percentile <- df.copy$percentile[ |
| 100 | 2x |
c(2:(length(df.copy$percentile)), NA) |
| 101 |
] |
|
| 102 | 2x |
combined.df <- zipFastener(df, df.copy) |
| 103 | ||
| 104 |
# Combine smooth estimate to fixed df |
|
| 105 | 2x |
combined.df <- na.omit(combined.df) |
| 106 | 2x |
combined.df <- combined.df[-nrow(combined.df), ] # removes a redundant point |
| 107 | ||
| 108 | 2x |
return(combined.df) |
| 109 |
} |
|
| 110 | ||
| 111 |
#' Last observation carried forward for specific estimation methods |
|
| 112 |
#' |
|
| 113 |
#' Iterates locf for multiple methods: calls locf.binned and locf.pava |
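#' |
#' As read from locf.binned()/locf.pava(), the point of the duplication is to represent |
#' step-like estimates (binned, PAVA) as explicit step segments before predictive |
#' values are integrated in parametricPV(). |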
|
| 114 |
#' |
|
| 115 |
#' @param df dataframe containing predcurve estimates |
|
| 116 |
#' @param method.binned vector of "binned" methods |
|
| 117 |
#' @param method.pava vector of "pava" methods |
|
| 118 |
#' |
|
| 119 |
#' @return dataframe with adjusted points |
|
| 120 |
#' |
|
| 121 |
#' @keywords internal |
|
| 122 |
#' @noRd |
|
| 123 |
#' |
|
| 124 |
locf <- function(df, method.binned, method.pava) {
|
|
| 125 | ||
| 126 | 4x |
complete.data <- df[!df$method %in% c(method.binned, method.pava), ] |
| 127 | 4x |
binned.data <- df[df$method %in% method.binned, ] |
| 128 | 4x |
pava.data <- df[df$method %in% method.pava, ] |
| 129 | ||
| 130 |
# pava locf |
|
| 131 | 4x |
if (nrow(pava.data) >= 1) {
|
| 132 | 2x |
pava.plotdata <- locf.pava(pava.data) |
| 133 | 2x |
complete.data <- bind_rows(complete.data, pava.plotdata) |
| 134 |
} |
|
| 135 | ||
| 136 |
# binned locf |
|
| 137 | 4x |
if (nrow(binned.data) >= 1) {
|
| 138 | 4x |
binned.plotdata <- locf.binned(binned.data) |
| 139 | 4x |
complete.data <- bind_rows(complete.data, binned.plotdata) |
| 140 |
} |
|
| 141 | 4x |
rownames(complete.data) <- NULL |
| 142 | ||
| 143 | 4x |
return(complete.data) |
| 144 |
} |
| 1 | ||
| 2 |
# Calculate Predictive Value (PV) Estimates |
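# (PC estimates are reused when supplied via `pc.ests`; step-type methods are first |
# expanded with locf() so that parametricPV() integrates them as step functions.) |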
|
| 3 |
getPVdata <- function(methods, outcome, score, pc.ests = NULL) {
|
|
| 4 | ||
| 5 |
# Calculate estimates if not provided |
|
| 6 | 16x |
if (is.null(pc.ests)) {
|
| 7 | 5x |
pc.ests <- getEsts(outcome = outcome, score = score, methods = methods) |
| 8 | 5x |
pc.data <- pc.ests$plotdata |
| 9 |
} else {
|
|
| 10 | 11x |
pc.data <- pc.ests$plotdata |
| 11 |
} |
|
| 12 | ||
| 13 |
# locf and zip for pava and/or binned data |
|
| 14 | 16x |
if (any(pc.ests$idx.step)) {
|
| 15 | 4x |
name.binned <- names(methods)[pc.ests$idx.binned] |
| 16 | 4x |
name.pava <- names(methods)[pc.ests$idx.pava] |
| 17 | 4x |
pc.data <- locf(pc.data, method.binned = name.binned, method.pava = name.pava) |
| 18 |
} |
|
| 19 | ||
| 20 |
# Calculate PV |
|
| 21 | 16x |
PV.data <- parametricPV(pc.data = pc.data) |
| 22 | ||
| 23 | 16x |
return(as.data.frame(PV.data)) |
| 24 |
} |
|
| 25 | ||
| 26 | ||
| 27 |
# Calculates PV values from cutoff thresholds |
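# Each observed score is used in turn as a cutoff (prediction = score > cutoff), |
# PPV / NPV are computed with yardstick, and 1-NPV is derived as MNPV = 1 - NPV |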
|
| 28 |
nonParametricPV <- function(outcome, score) {
|
|
| 29 | ||
| 30 | 11x |
prev <- mean(outcome, na.rm = TRUE) |
| 31 | ||
| 32 | 11x |
thresh.predictions <- lapply(score, function(x) as.numeric(score > x)) |
| 33 | ||
| 34 | 11x |
ppv <- vapply( |
| 35 | 11x |
thresh.predictions, |
| 36 | 11x |
function(x) {
|
| 37 | 440x |
yardstick::ppv_vec( |
| 38 | 440x |
truth = factor(outcome, levels = c("1", "0")),
|
| 39 | 440x |
estimate = factor(x, levels = c("1", "0")),
|
| 40 | 440x |
event_level = "first" |
| 41 |
) |
|
| 42 |
}, |
|
| 43 | 11x |
numeric(1) |
| 44 |
) |
|
| 45 | ||
| 46 | 11x |
npv <- vapply( |
| 47 | 11x |
thresh.predictions, |
| 48 | 11x |
function(x) {
|
| 49 | 440x |
yardstick::npv_vec( |
| 50 | 440x |
truth = factor(outcome, levels = c("1", "0")),
|
| 51 | 440x |
estimate = factor(x, levels = c("1", "0")),
|
| 52 | 440x |
event_level = "first" |
| 53 |
) |
|
| 54 |
}, |
|
| 55 | 11x |
numeric(1) |
| 56 |
) |
|
| 57 | ||
| 58 | 11x |
threshold.data <- data.frame( |
| 59 | 11x |
score = score, |
| 60 | 11x |
percentile = ecdf(score)(score), |
| 61 | 11x |
PPV = ppv, |
| 62 | 11x |
NPV = npv |
| 63 |
) %>% |
|
| 64 | 11x |
mutate(MNPV = 1 - .data$NPV) %>% |
| 65 | 11x |
tidyr::fill( |
| 66 | 11x |
all_of(c("PPV", "NPV", "MNPV")),
|
| 67 | 11x |
.direction = "downup" |
| 68 |
) |
|
| 69 | ||
| 70 | 11x |
return(threshold.data) |
| 71 |
} |
|
| 72 | ||
| 73 | ||
| 74 |
# Calculates PV values based on pracma::cumtrapz |
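# With risk(p) denoting the predictiveness curve over percentiles p, the code below uses |
#   1-NPV(q) = (1/q) * integral_0^q risk(p) dp      (mean risk below the cutoff) |
#   PPV(q)   = integral_q^1 risk(p) dp / (1 - q)    (mean risk above the cutoff) |
# where pracma::cumtrapz() supplies the cumulative trapezoidal integral on the |
# observed percentile grid |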
|
| 75 |
parametricPV <- function(pc.data) {
|
|
| 76 | ||
| 77 |
# Calculate PVs |
|
| 78 | 16x |
PV.data <- pc.data %>% |
| 79 | 16x |
group_by(.data$method) %>% |
| 80 | 16x |
mutate( |
| 81 | 16x |
MNPV = pracma::cumtrapz(.data$percentile, .data$estimate)[, 1] / .data$percentile, |
| 82 | 16x |
NPV = 1 - .data$MNPV, |
| 83 | 16x |
PPV = |
| 84 |
( |
|
| 85 | 16x |
max(pracma::cumtrapz(.data$percentile, .data$estimate), na.rm = TRUE) - |
| 86 | 16x |
pracma::cumtrapz(.data$percentile, .data$estimate)[, 1] |
| 87 | 16x |
) / (max(.data$percentile) - .data$percentile) |
| 88 |
) %>% |
|
| 89 | 16x |
ungroup() %>% |
| 90 | 16x |
arrange(.data$method, .data$percentile) |
| 91 | ||
| 92 |
# Fill NAs (starting points) |
|
| 93 | 16x |
PV.data <- PV.data %>% |
| 94 | 16x |
group_by(.data$method) %>% |
| 95 | 16x |
mutate( |
| 96 | 16x |
MNPV = ifelse(.data$percentile == min(.data$percentile) & is.na(.data$MNPV), 0, .data$MNPV), |
| 97 | 16x |
NPV = ifelse(.data$percentile == min(.data$percentile) & is.na(.data$NPV), 1, .data$NPV) |
| 98 |
) %>% |
|
| 99 | 16x |
as.data.frame() |
| 100 | ||
| 101 |
# Fill NAs (ending points) |
|
| 102 | 16x |
PV.data <- tidyr::fill( |
| 103 | 16x |
PV.data, |
| 104 | 16x |
all_of(c("estimate", "MNPV", "NPV", "PPV")),
|
| 105 | 16x |
.direction = "down" |
| 106 |
) |
|
| 107 | ||
| 108 | 16x |
return(as.data.frame(PV.data)) |
| 109 |
} |
|
| 110 | ||
| 111 | ||
| 112 |
# consistent colors for PVs |
|
| 113 |
predictionColours <- function(x, show.best) {
|
|
| 114 | 13x |
clrs <- c( |
| 115 | 13x |
"Best NPV" = "gray80", |
| 116 | 13x |
"NPV" = "#53B400", |
| 117 | 13x |
"Best PPV" = "gray65", |
| 118 | 13x |
"PPV" = "red", |
| 119 | 13x |
"Best PC" = "black", |
| 120 | 13x |
"PC" = "plum", |
| 121 | 13x |
"1-NPV" = "royalblue2", |
| 122 | 13x |
"Best 1-NPV" = "gray50" |
| 123 |
) |
|
| 124 | 13x |
if (show.best) {
|
| 125 | 10x |
x <- c(x, paste("Best", x))
|
| 126 |
} |
|
| 127 | 13x |
return(clrs[names(clrs) %in% x]) # keep the ordering above |
| 128 |
} |
|
| 129 | ||
| 130 | ||
| 131 |
# Calculates sensitivity and specificity (true positive / negative rate) |
|
| 132 |
nonParametricTR <- function(outcome, score) {
|
|
| 133 | ||
| 134 |
# thresh.predictions is a list of 0/1 vectors, one for each score value used as a cutpoint |
|
| 135 | 9x |
thresh.predictions <- lapply(score, function(x) as.numeric(score > x)) |
| 136 | ||
| 137 |
# Calc sensitivities and specificities at each risk percentile threshold |
|
| 138 | 9x |
senses <- vapply( |
| 139 | 9x |
thresh.predictions[1:(length(score) - 1)], |
| 140 | 9x |
function(x) {
|
| 141 | 351x |
yardstick::sens_vec( |
| 142 | 351x |
truth = factor(outcome, levels = c("1", "0")),
|
| 143 | 351x |
estimate = factor(x, levels = c("1", "0")),
|
| 144 | 351x |
event_level = "first" |
| 145 |
) |
|
| 146 |
}, |
|
| 147 | 9x |
numeric(1) |
| 148 |
) |
|
| 149 | ||
| 150 | 9x |
specs <- vapply( |
| 151 | 9x |
thresh.predictions[1:(length(score) - 1)], |
| 152 | 9x |
function(x) {
|
| 153 | 351x |
yardstick::spec_vec( |
| 154 | 351x |
truth = factor(outcome, levels = c("1", "0")),
|
| 155 | 351x |
estimate = factor(x, levels = c("1", "0")),
|
| 156 | 351x |
event_level = "first" |
| 157 |
) |
|
| 158 |
}, |
|
| 159 | 9x |
numeric(1) |
| 160 |
) |
|
| 161 | ||
| 162 |
# Create a data.frame |
|
| 163 | 9x |
dat <- data.frame( |
| 164 | 9x |
score = c(min(score), score), |
| 165 | 9x |
percentile = c(0, ecdf(score)(score)), |
| 166 | 9x |
Sensitivity = c(1, senses, 0), |
| 167 | 9x |
Specificity = c(0, specs, 1) |
| 168 |
) %>% |
|
| 169 | 9x |
tidyr::pivot_longer( |
| 170 | 9x |
cols = c("Sensitivity", "Specificity"),
|
| 171 | 9x |
names_to = "pf", |
| 172 | 9x |
values_to = "value" |
| 173 |
) |
|
| 174 | ||
| 175 | 9x |
return(as.data.frame(dat)) |
| 176 |
} |
|
| 177 | ||
| 178 | ||
| 179 | ||
| 180 |
adjPrevPerc <- function(perc, prev.new, cdf.case, cdf.control) {
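# Percentile of a value under the new prevalence: a prev.new-weighted mixture of the |
# case and control score CDFs (the call sites pass raw scores as `perc`) |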
|
|
| 181 | 4x |
prev.new * cdf.case(perc) + (1 - prev.new) * cdf.control(perc) |
| 182 |
} |
|
| 183 | ||
| 184 |
adjPrevEst <- function(x, prev, prev.new) {
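# Rescale a risk / PV estimate x to a new prevalence on the odds scale, i.e. |
#   1 / (1 + (1/x - 1) * (prev / (1 - prev)) * ((1 - prev.new) / prev.new)) |
# which multiplies the odds of x by odds(prev.new) / odds(prev) |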
|
|
| 185 | 5x |
f2 <- prev / (1 - prev) |
| 186 | 5x |
f3 <- (1 - prev.new) / prev.new |
| 187 | 5x |
1 / (((1 / x) - 1) * f2 * f3 + 1) |
| 188 |
} |
|
| 189 | ||
| 190 |
adjPrevPC <- function(dat, prev, prev.new, cdf.case, cdf.control) {
|
|
| 191 | 1x |
mutate( |
| 192 | 1x |
dat, |
| 193 | 1x |
percentile = adjPrevPerc( |
| 194 | 1x |
perc = .data$score, prev.new = prev.new, |
| 195 | 1x |
cdf.case = cdf.case, cdf.control = cdf.control |
| 196 |
), |
|
| 197 | 1x |
estimate = adjPrevEst(x = .data$estimate, prev = prev, prev.new = prev.new) |
| 198 |
) |
|
| 199 |
} |
|
| 200 | ||
| 201 |
adjPrevPV <- function(dat, prev, prev.new, cdf.case, cdf.control) {
|
|
| 202 | 1x |
mutate( |
| 203 | 1x |
dat, |
| 204 | 1x |
percentile = adjPrevPerc( |
| 205 | 1x |
perc = .data$score, prev.new = prev.new, |
| 206 | 1x |
cdf.case = cdf.case, cdf.control = cdf.control |
| 207 |
), |
|
| 208 | 1x |
MNPV = adjPrevEst(x = .data$MNPV, prev = prev, prev.new = prev.new), |
| 209 | 1x |
NPV = 1 - .data$MNPV, |
| 210 | 1x |
PPV = adjPrevEst(x = .data$PPV, prev = prev, prev.new = prev.new) |
| 211 |
) |
|
| 212 |
} |
|
| 213 | ||
| 214 | ||
| 215 | ||
| 216 |
bestPPV <- function(perc, prev) {
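# Best attainable PPV at a percentile cutoff, presumably under a perfectly ranking |
# score: min(prev / (1 - perc), 1); bestMNPV/bestSens/bestSpec below are the |
# analogous envelopes |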
|
|
| 217 | 12x |
ifelse(perc > 1 - prev, 1, prev / (1 - perc)) |
| 218 |
} |
|
| 219 | ||
| 220 |
bestMNPV <- function(perc, prev) {
|
|
| 221 | 12x |
ifelse(perc <= 1 - prev, 0, 1 - ((1 - prev) / perc)) |
| 222 |
} |
|
| 223 | ||
| 224 |
bestSens <- function(perc, prev) {
|
|
| 225 | 8x |
ifelse(perc <= 1 - prev, 1, (1 - perc) / prev) |
| 226 |
} |
|
| 227 | ||
| 228 |
bestSpec <- function(perc, prev) {
|
|
| 229 | 8x |
ifelse(perc > 1 - prev, 1, perc / (1 - prev)) |
| 230 |
} |