| 1 | #' Check Input Arguments | |
| 2 | #' | |
| 3 | #' Given outcomes and prediction scores, checks for obvious issues | |
| 4 | #' and raises errors or warnings as needed. | |
| 5 | #' Possible issues include unequal lengths, missing values, non-binary outcomes, | |
| 6 | #' and non-numeric scores. | |
| 7 | #' | |
| 8 | #' @inheritParams riskProfile | |
| 9 | #' | |
| 10 | #' @return A list containing standardized outcomes and predicted scores. | |
| 11 | #' | |
| 12 | #' @keywords internal | |
| 13 | #' @noRd | |
| 14 | #' | |
| 15 | #' @examples | |
| 16 | #' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc")) | |
| 17 | #' rscore <- auroc$predicted | |
| 18 | #' truth <- as.numeric(auroc$actual) | |
| 19 | #' inputCheck(truth, rscore) | |
| 20 | #' | |
| 21 | inputCheck <- function(outcome, score) { | |
| 22 | ||
| 23 | # Predscore - check numeric | |
| 24 | 41x |   if (!is.numeric(score)) { | 
| 25 | 1x |     stop("'score' vector must be represented as a numeric.") | 
| 26 | } | |
| 27 | ||
| 28 | # Outcome - logical to numeric | |
| 29 | 40x |   if (is.logical(outcome)) { | 
| 30 | 33x | outcome <- as.numeric(outcome) | 
| 31 | } | |
| 32 | ||
| 33 | # Outcome - check numeric | |
| 34 | 40x |   if (!is.numeric(outcome)) { | 
| 35 | 1x |     stop("'outcome' vector needs to be a numeric vector.") | 
| 36 | } | |
| 37 | ||
| 38 | # Check matching lengths | |
| 39 | 39x |   if (length(outcome) != length(score)) { | 
| 40 | 1x |     stop("'outcome' and 'score' must have the same lengths.") | 
| 41 | } | |
| 42 | ||
| 43 | # Need at least 3 observations | |
| 44 | 38x |   if (length(outcome) < 3) { | 
| 45 | 1x |     stop("Need at least 3 observations.") | 
| 46 | } | |
| 47 | ||
| 48 | # DROP NA's for data pairs | |
| 49 | 37x |   if (any(is.na(outcome)) || any(is.na(score))) { | 
| 50 | 2x |     warning("Observations with NA's are dropped") | 
| 51 | 2x | ind <- intersect( | 
| 52 | 2x | which(complete.cases(outcome)), | 
| 53 | 2x | which(complete.cases(score)) | 
| 54 | ) | |
| 55 | 2x | outcome <- outcome[ind] | 
| 56 | 2x | score <- score[ind] | 
| 57 | } | |
| 58 | ||
| 59 | # Outcome - check incorrect values | |
| 60 | 37x |   if (!all(outcome %in% 0:1)) { | 
| 61 | 1x |     stop("'outcome' vector must be represented as binary 1 or 0.") | 
| 62 | } | |
| 63 | ||
| 64 | # Predscore - check low frequency of score values | |
| 65 | 36x |   if (length(unique(score)) <= 5) { | 
| 66 | 2x | tbl <- table(score) | 
| 67 | 2x |     if (any(tbl <= 3)) { | 
| 68 | 2x | warning( | 
| 69 | 2x | paste( | 
| 70 | 2x |           "There is a low-occurrence value in `score` (", names(tbl)[tbl <= 3][1], ").", | 
| 71 | 2x | "The results may be unreliable." | 
| 72 | ) | |
| 73 | ) | |
| 74 | } | |
| 75 | } | |
| 76 | ||
| 77 | # Outcome - check low frequency | |
| 78 | 36x |   if (any(prop.table(table(outcome)) <= 0.03)) { | 
| 79 | 1x | warning( | 
| 80 | 1x | paste( | 
| 81 | 1x | "There is a low frequency of one of the outcome classes (`prop.table(table(outcome))`).", | 
| 82 | 1x | "The results may be unreliable." | 
| 83 | ) | |
| 84 | ) | |
| 85 | } | |
| 86 | ||
| 87 | 36x | return(list(outcome = outcome, score = score)) | 
| 88 | } | |
| 89 | ||
| 90 | ||
| 91 | #' Order Inputs | |
| 92 | #' | |
| 93 | #' Given vectors of prediction scores and outcomes, the function orders them by score. | |
| 94 | #' An option exists to reverse the ordering if lower scores correspond to a higher rate of outcomes. | |
| 95 | #' | |
| 96 | #' @inheritParams riskProfile | |
| 97 | #' | |
| 98 | #' @return Returns a data frame of outcomes and scores ordered by score. | |
| 99 | #' | |
| 100 | #' @keywords internal | |
| 101 | #' @noRd | |
| 102 | #' | |
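| | #' @examples | |
| | #' # Internal sketch: rows are returned in ascending score order, e.g. | |
| | #' # orderInputs(outcome = c(1, 0, 1), score = c(0.9, 0.2, 0.5)) | |
| | #' # yields scores 0.2, 0.5, 0.9; rev.order = TRUE negates the scores before sorting. | |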
| 103 | orderInputs <- function(outcome, score, rev.order = FALSE) { | |
| 104 | 33x | tempdf <- data.frame(score = score, outcome = outcome) | 
| 105 | 33x |   if (rev.order) { | 
| 106 | 6x | tempdf$score <- -tempdf$score | 
| 107 | } | |
| 108 | 33x | tempdf <- tempdf[order(tempdf$score, tempdf$outcome), ] | 
| 109 | 33x | return(tempdf) | 
| 110 | } | |
| 111 | ||
| 112 | #' Method Check | |
| 113 | #' | |
| 114 | #' Checks and standardizes the `methods` argument. Intended usage: | |
| 115 | #' 1. methods <- methodCheck(methods) | |
| 116 | #' 2. getEstMethod(methods[[1]]) or getEstMethods(methods) | |
| 117 | #' 3. getEst(methods, ...) | |
| 118 | #' | |
| 119 | #' @inheritParams riskProfile | |
| 120 | #' | |
| 121 | #' @return A named list of method specifications (lists of method arguments or user-defined functions). | |
| 122 | #' | |
| 123 | #' @keywords internal | |
| 124 | #' @noRd | |
| 125 | #' | |
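| | #' @examples | |
| | #' # A minimal internal usage sketch; method names assume est.funs() offers "gam" and "pava": | |
| | #' # methodCheck(c("gam", "pava")) | |
| | #' # methodCheck(list(gam1 = list(method = "gam", k = 3), pv = list(method = "pava"))) | |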
| 126 | methodCheck <- function(methods) { | |
| 127 | ||
| 128 | # Drop empty strings | |
| 129 | 54x |   if (any(methods == "")) { | 
| 130 | 1x | methods <- methods[methods != ""] | 
| 131 | } | |
| 132 | ||
| 133 | # If character is supplied | |
| 134 | 54x |   if (is.character(methods)) { | 
| 135 | ||
| 136 | # Convert to lower-case | |
| 137 | 32x | orig.names <- methods | 
| 138 | 32x | methods <- tolower(methods) | 
| 139 | ||
| 140 | # Check uniqueness | |
| 141 | 32x |     if (length(unique(methods)) != length(methods)) { | 
| 142 | 1x |       stop("`methods` should be unique when specified as character.") | 
| 143 | } | |
| 144 | ||
| 145 | # Check available method | |
| 146 | 31x |     if (!all(methods %in% names(est.funs()))) { | 
| 147 | 2x | stop( | 
| 148 | 2x | paste( | 
| 149 | 2x | "Supplied method is not yet available. Try selecting from (case insensitive): ", | 
| 150 | 2x | paste(shQuote(names(est.funs())), collapse = ", ") | 
| 151 | ) | |
| 152 | ) | |
| 153 | } | |
| 154 | ||
| 155 | # Convert to list | |
| 156 | 29x | methods <- structure(as.list(methods), names = orig.names) | 
| 157 | ||
| 158 | # Otherwise, if a list is provided: | |
| 159 | # list(gam1 = list(method = "gam", k = 3), pv = list(method = "pava", ...), etc) | |
| 160 |     # or list(my_estimate = function(outcome, score) {...}) | |
| 161 | 22x |   } else if (is.list(methods)) { | 
| 162 | ||
| 163 | # Check named list | |
| 164 | 20x |     if (!checkmate::test_named(methods, type = "unique")) { | 
| 165 | 1x |       stop("'methods' should be a uniquely named list.") | 
| 166 | } | |
| 167 | ||
| 168 | # Check and unify estimation method names | |
| 169 | 19x | methods <- lapply(methods, listMethodCheck) | 
| 170 | ||
| 171 |   } else { | |
| 172 | 2x | stop( | 
| 173 | 2x | paste( | 
| 174 | 2x | "`methods` must be character", | 
| 175 | 2x | "or named list of estimation methods / user defined functions." | 
| 176 | ) | |
| 177 | ) | |
| 178 | } | |
| 179 | ||
| 180 | 42x | return(methods) | 
| 181 | } | |
| 182 | ||
| 183 | # x is a single method (element of an outer list) | |
| 184 | listMethodCheck <- function(x) { | |
| 185 | 25x |   UseMethod("listMethodCheck") | 
| 186 | } | |
| 187 | ||
| 188 | #' @export | |
| 189 | listMethodCheck.list <- function(x) { | |
| 190 | ||
| 191 | # Check named list | |
| 192 | 20x |   if (!checkmate::test_named(x, type = "unique")) { | 
| 193 | 1x |     stop("Inner lists in the 'methods' argument must be named.") | 
| 194 | } | |
| 195 | ||
| 196 | # Check "method" element existing in the list | |
| 197 | 19x |   if (is.null(x[["method"]]) || !is.character(x[["method"]])) { | 
| 198 | 2x | stop( | 
| 199 | 2x | paste( | 
| 200 | 2x | 'All lists must have a "method" element specifying one of the predefined', | 
| 201 | 2x | "estimation functions as a string. Please select from:", | 
| 202 | 2x | paste(shQuote(names(est.funs())), collapse = ", ") | 
| 203 | ) | |
| 204 | ) | |
| 205 | } | |
| 206 | ||
| 207 | # Update method name | |
| 208 | 17x | x[["method"]] <- tolower(x[["method"]]) | 
| 209 | ||
| 210 | # Check available method | |
| 211 | 17x | chck_available <- checkmate::test_subset( | 
| 212 | 17x | x[["method"]], | 
| 213 | 17x | choices = names(est.funs()), empty.ok = FALSE | 
| 214 | ) | |
| 215 | 17x |   if (!chck_available) { | 
| 216 | 2x | stop( | 
| 217 | 2x | paste0( | 
| 218 | 2x | "Supplied method '", | 
| 219 | 2x | x[["method"]], | 
| 220 | 2x | "' is not yet available. Try selecting from: ", | 
| 221 | 2x | paste(shQuote(names(est.funs())), collapse = ", ") | 
| 222 | ) | |
| 223 | ) | |
| 224 | } | |
| 225 | ||
| 226 | 15x | return(x) | 
| 227 | } | |
| 228 | ||
| 229 | #' @export | |
| 230 | listMethodCheck.function <- function(x) { | |
| 231 | ||
| 232 | # Check input arguments | |
| 233 | 4x |   if (!checkmate::test_function(x, args = c("outcome", "score"))) { | 
| 234 | 2x | stop( | 
| 235 | 2x | paste( | 
| 236 | 2x | "All user defined estimation functions need to take exactly two arguments:", | 
| 237 | 2x | "'outcome' and 'score'." | 
| 238 | ) | |
| 239 | ) | |
| 240 | } | |
| 241 | ||
| 242 | 2x | return(x) | 
| 243 | } | |
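| | | |
| | # A minimal sketch of a user-defined estimation function that passes the check above; | |
| | # the column layout mirrors the predefined estimators and the glm fit is illustrative only: | |
| | # my_est <- function(outcome, score) { | |
| | #   data.frame( | |
| | #     score = score, | |
| | #     percentile = ecdf(score)(score), | |
| | #     outcome = outcome, | |
| | #     estimate = glm(outcome ~ score, family = "binomial")$fitted.values | |
| | #   ) | |
| | # } | |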
| 244 | ||
| 245 | #' @export | |
| 246 | listMethodCheck.default <- function(x) { | |
| 247 | 1x | stop( | 
| 248 | 1x | paste( | 
| 249 | 1x | "The estimation method must be specified as a list or a function." | 
| 250 | ) | |
| 251 | ) | |
| 252 | } | |
| 253 | ||
| 254 | # compared to match.arg(..., several.ok = TRUE), | |
| 255 | # this does not allow NULL and matches every element, returning the unique matched values | |
| 256 | matchArgSubset <- function(x, choices) { | |
| 257 | 24x | checkmate::assert_character(x, any.missing = FALSE, min.len = 1) | 
| 258 | 24x | out <- c() | 
| 259 | 24x |   for (ag in x) { | 
| 260 | 60x | matched <- tryCatch( | 
| 261 | 60x | match.arg(ag, choices = choices), | 
| 262 | 60x |       error = function(e) stop(sub("'arg'", paste0("'", ag, "'"), e)) | 
| 263 | ) | |
| 264 | 60x | out <- c(out, matched) | 
| 265 | } | |
| 266 | 24x | return(unique(out)) | 
| 267 | } | |
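| | | |
| | # Illustrative sketch (choices chosen for illustration): | |
| | # matchArgSubset(c("PC", "PP"), choices = c("PC", "PPV", "NPV", "1-NPV")) | |
| | # #> [1] "PC"  "PPV"   (partial matches are expanded) | |
| | # matchArgSubset(NULL, choices = c("PC", "PPV"))  # errors, unlike match.arg(several.ok = TRUE) | |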
| 268 | ||
| 269 | ||
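| | # Prepend a 0th-percentile row per method to the predictiveness-curve (PC) data, | |
| | # carrying the estimate observed at the lowest percentile so curves start at percentile 0. | |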
| 270 | add0thPercPC <- function(x) { | |
| 271 | 11x | bind_rows( | 
| 272 | 11x | x, | 
| 273 | 11x | x %>% | 
| 274 | 11x | group_by(.data$method) %>% | 
| 275 | 11x | summarise( | 
| 276 | 11x | score = NA, | 
| 277 | 11x | percentile = 0, | 
| 278 | 11x | outcome = NA, | 
| 279 | 11x | estimate = .data$estimate[.data$percentile == min(.data$percentile)][1], | 
| 280 | 11x | pv = "PC", | 
| 281 | 11x | .groups = "drop" | 
| 282 | ) | |
| 283 | ) %>% | |
| 284 | 11x | arrange(.data$method, .data$percentile) | 
| 285 | } | |
| 286 | ||
| 287 | ||
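| | # Prepend a 0th-percentile row per method and PV type to the predictive-value data, | |
| | # carrying the first pvValue in each group so PV curves start at percentile 0. | |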
| 288 | add0thPercPV <- function(x) { | |
| 289 | 13x | bind_rows( | 
| 290 | 13x | x, | 
| 291 | 13x | x %>% | 
| 292 | 13x | group_by(.data$method, .data$pv) %>% | 
| 293 | 13x | summarise( | 
| 294 | 13x | score = NA, | 
| 295 | 13x | percentile = 0, | 
| 296 | 13x | estimate = NA, | 
| 297 | 13x | pvValue = dplyr::first(.data$pvValue), | 
| 298 | 13x | .groups = "drop" | 
| 299 | ) %>% | |
| 300 | 13x |       dplyr::relocate("pv", .before = "pvValue") | 
| 301 | ) %>% | |
| 302 | 13x | arrange(.data$method, .data$pv, .data$percentile) | 
| 303 | } | |
| 304 | ||
| 305 | ||
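| | # Prepend 0th-percentile rows per method to the sensitivity/specificity data: | |
| | # at percentile 0, sensitivity is set to 1 and specificity to 0. | |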
| 306 | add0thPercTR <- function(x) { | |
| 307 | 5x | bind_rows( | 
| 308 | 5x | x, | 
| 309 | 5x | x %>% | 
| 310 | 5x | group_by(.data$method) %>% | 
| 311 | 5x | summarise( | 
| 312 | 5x | score = NA, | 
| 313 | 5x | percentile = 0, | 
| 314 | 5x | pf = "Sensitivity", | 
| 315 | 5x | value = 1 | 
| 316 | ), | |
| 317 | 5x | x %>% | 
| 318 | 5x | group_by(.data$method) %>% | 
| 319 | 5x | summarise( | 
| 320 | 5x | score = NA, | 
| 321 | 5x | percentile = 0, | 
| 322 | 5x | pf = "Specificity", | 
| 323 | 5x | value = 0 | 
| 324 | ) | |
| 325 | ) %>% | |
| 326 | 5x | arrange(.data$method, .data$pf, .data$percentile) | 
| 327 | } | |
| 328 | ||
| 329 | ||
| 330 | #' For snapshot testing of graphs | |
| 331 | #' | |
| 332 | #' @param code Code to create a graph | |
| 333 | #' @param width Width of the plot. | |
| 334 | #' @param height Height of the plot. | |
| 335 | #' | |
| 336 | #' @return Filepath | |
| 337 | #' | |
| 338 | #' @keywords internal | |
| 339 | #' @noRd | |
| 340 | #' | |
| 341 | #' @examples | |
| 342 | #' expect_snapshot_file(save_png(ggplot(mtcars) + | |
| 343 | #' geom_point(aes(hp, mpg))), "riskProfile.png") | |
| 344 | #' | |
| 345 | save_png <- function(code, width = 400, height = 400) { # nocov start | |
| 346 | path <- tempfile(fileext = ".png") | |
| 347 | png(path, width = width, height = height) | |
| 348 | on.exit(dev.off()) | |
| 349 | print(code) | |
| 350 | return(path) | |
| 351 | } # nocov end | 
| 1 | #' Risk profile plot | |
| 2 | #' | |
| 3 | #' Predictiveness curve, PPV, NPV and 1-NPV risk estimates | |
| 4 | #' | |
| 5 | #' @param outcome Vector of binary outcome for each observation. | |
| 6 | #' @param score Numeric vector of continuous predicted risk score. | |
| 7 | #' @param methods Character vector of method names (case-insensitive) for plotting curves, or | |
| 8 | #' a named list whose elements specify an estimation method and its arguments. | |
| 9 | #' Default is set to `"asis"`. | |
| 10 | #' | |
| 11 | #' Full options are: `c("asis", "binned", "pava", "mspline", "gam", "cgam")`. | |
| 12 | #' | |
| 13 | #' To specify arguments per method, use lists. For example: | |
| 14 | #' ``` | |
| 15 | #' list( | |
| 16 | #' pava = list(method = "pava", ties = "primary"), | |
| 17 | #' mspline = list(method = "mspline", fitonPerc = TRUE), | |
| 18 | #' gam = list(method = "gam", bs = "tp", logscores = FALSE), | |
| 19 | #' bin = list(method = "binned", bins = 10), | |
| 20 | #' risk = list(method = "asis") | |
| 21 | #' ) | |
| 22 | #' ``` | |
| 23 | #' See section "Estimation" for more details. | |
| 24 | #' @param prev.adj `NULL` (default) or scalar numeric between 0 and 1 for prevalence adjustment. | |
| 25 | #' @param show.prev Logical, show prevalence value in the graph. Defaults to `TRUE`. | |
| 26 | #' @param show.nonparam.pv Logical, show non-parametric calculation of PVs. Defaults to `TRUE`. | |
| 27 | #' @param show.best.pv Logical, show best possible PVs. Defaults to `TRUE`. | |
| 28 | #' @param include Character vector (case-insensitive, partial matching) specifying what quantities | |
| 29 | #' to include in the plot. | |
| 30 | #' | |
| 31 | #' Default is: `c("PC", "PPV", "1-NPV")`. | |
| 32 | #' | |
| 33 | #' Full options are: `c("NPV", "PC", "PPV", "1-NPV")`. | |
| 34 | #' @param plot.raw Logical; if `TRUE`, plot raw score values instead of percentiles. | |
| 35 | #' Defaults to `FALSE` (i.e. percentiles). | |
| 36 | #' @param rev.order Logical, reverse ordering of scores. Defaults to `FALSE`. | |
| 37 | #' | |
| 38 | #' @section Estimation: | |
| 39 | #' The `methods` argument specifies the estimation method. | |
| 40 | #' You can provide either a vector of strings, any of | |
| 41 | #' ``` | |
| 42 | #' c("asis", "binned", "pava", "mspline", "gam", "cgam") | |
| 43 | #' ``` | |
| 44 | #' (`"asis"` is not available for `calibrationProfile`), | |
| 45 | #' or a named list of lists. | |
| 46 | #' In the latter case, the inner list must have an element "method", | |
| 47 | #' which specifies the estimation function (one of those above), | |
| 48 | #' and optionally other elements, which are passed to the estimation function. | |
| 49 | #' For example: | |
| 50 | #' ``` | |
| 51 | #' list( | |
| 52 | #' gam = list(method = "gam", k = 3), | |
| 53 | #' c_gam = list(method = "cgam", numknots = 3) | |
| 54 | #' ) | |
| 55 | #' ``` | |
| 56 | #' | |
| 57 | #' To see what arguments are available for each estimation method, | |
| 58 | #' see the documentation of that function. | |
| 59 | #' The naming convention is `getXest`, | |
| 60 | #' where `X` stands for the estimation method, for example [getGAMest()]. | |
| 61 | #' | |
| 62 | #' "gam", "cgam", and "mspline" always fit on percentiles by default. | |
| 63 | #' To change this, use `fitonPerc = FALSE`, for example | |
| 64 | #' ``` | |
| 65 | #' list(gam = list(method = "gam", fitonPerc = FALSE)) | |
| 66 | #' ``` | |
| 67 | #' | |
| 68 | #' "gam" and "cgam" methods are wrappers of [mgcv::gam()] and [cgam::cgam()], respectively. | |
| 69 | #' The default values of function arguments (like `k`, the number of knots in [mgcv::s()]) | |
| 70 | #' mirror the package defaults. | |
| 71 | #' | |
| 72 | #' @return A list containing the plot and data, plus `errorbar` data if requested | |
| 73 | #' (through the `"binned"` estimation method with the `errorbar.sem` parameter). | |
| 74 | #' | |
| 75 | #' @export | |
| 76 | #' | |
| 77 | #' @seealso [calibrationProfile()] [sensSpec()] | |
| 78 | #' | |
| 79 | #' [getPAVAest()] [getBINNEDest()] [getGAMest()] [getCGAMest()] [getMSPLINEest()] | |
| 80 | #' [getASISest()] | |
| 81 | #' | |
| 82 | #' @examples | |
| 83 | #' # Read in example data | |
| 84 | #' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc")) | |
| 85 | #' rscore <- auroc$predicted_calibrated | |
| 86 | #' truth <- as.numeric(auroc$actual) | |
| 87 | #' | |
| 88 | #' # Default plot includes 1-NPV, PPV, and a predictiveness curve (PC) based on risk-cutoff | |
| 89 | #' p1 <- riskProfile(outcome = truth, score = rscore) | |
| 90 | #' p1$plot | |
| 91 | #' p1$data | |
| 92 | #' | |
| 93 | #' # Show also NPV | |
| 94 | #' p2 <- riskProfile( | |
| 95 | #' outcome = truth, | |
| 96 | #' score = rscore, | |
| 97 | #'   include = c("PC", "NPV", "PPV", "1-NPV") | |
| 98 | #'   # or use partial matching: include = c("PC", "N", "PPV", "1") | |
| 99 | #' ) | |
| 100 | #' p2$plot | |
| 101 | #' p2$data | |
| 102 | #' | |
| 103 | #' # All estimates of prediction curve | |
| 104 | #' p3 <- riskProfile( | |
| 105 | #' outcome = truth, | |
| 106 | #' score = rscore, | |
| 107 | #'   methods = c("mspline", "gam", "cgam", "binned", "pava", "asis"), | |
| 108 | #'   include = c("PC", "PPV", "1-NPV") | |
| 109 | #' ) | |
| 110 | #' p3$plot | |
| 111 | #' | |
| 112 | #' # Specifying method arguments (note each list has a "method" element) | |
| 113 | #' p4 <- riskProfile( | |
| 114 | #' outcome = truth, | |
| 115 | #' score = rscore, | |
| 116 | #' methods = list( | |
| 117 | #' "gam" = list(method = "gam", bs = "tp", logscores = FALSE, fitonPerc = TRUE), | |
| 118 | #' "risk" = list(method = "asis"), # no available arguments for this method | |
| 119 | #' "bin" = list(method = "binned", quantiles = 10, errorbar.sem = 1.2) | |
| 120 | #' ) | |
| 121 | #' ) | |
| 122 | #' p4$plot | |
| 123 | #' | |
| 124 | #' # Compare multiple GAMs in terms of Predictiveness Curves | |
| 125 | #' p5 <- riskProfile( | |
| 126 | #' outcome = truth, | |
| 127 | #' score = rscore, | |
| 128 | #' methods = list( | |
| 129 | #' "gam_3" = list(method = "gam", k = 3), | |
| 130 | #' "gam_4" = list(method = "gam", k = 4), | |
| 131 | #' "gam_7" = list(method = "gam", k = 7) | |
| 132 | #' ), | |
| 133 | #' include = "PC" | |
| 134 | #' ) | |
| 135 | #' p5$plot | |
| 136 | #' | |
| 137 | #' # Using logistic regression as user-defined estimation function, fitting on percentiles | |
| 138 | #' # Function needs to take exactly these two arguments | |
| 139 | #' my_est <- function(outcome, score) { | |
| 140 | #' # Calculate percentiles | |
| 141 | #' perc <- ecdf(score)(score) | |
| 142 | #' # Fit | |
| 143 | #' m <- glm(outcome ~ perc, family = "binomial") | |
| 144 | #' # Generate predictions | |
| 145 | #' preds <- predict(m, type = "response") | |
| 146 | #' # Return a data.frame with exactly these columns | |
| 147 | #' return( | |
| 148 | #' data.frame( | |
| 149 | #' score = score, | |
| 150 | #' percentile = perc, | |
| 151 | #' outcome = outcome, | |
| 152 | #' estimate = preds | |
| 153 | #' ) | |
| 154 | #' ) | |
| 155 | #' } | |
| 156 | #' p6 <- riskProfile( | |
| 157 | #' outcome = truth, | |
| 158 | #' score = rscore, | |
| 159 | #' methods = list(my_lr = my_est) | |
| 160 | #' ) | |
| 161 | #' p6$plot | |
| 162 | #' | |
| 163 | #' # Using cgam as user-defined estimation function | |
| 164 | #' # Note that you can also use the predefined cgam using methods = "cgam" | |
| 165 | #' # Attach needed library | |
| 166 | #' # Watch out for masking of mgcv::s and cgam::s if both are attached | |
| 167 | #' library(cgam, quietly = TRUE) | |
| 168 | #' # Function needs to take exactly these two arguments | |
| 169 | #' my_est <- function(outcome, score) { | |
| 170 | #' # Fit on raw predictions with space = "E" | |
| 171 | #' m <- cgam( | |
| 172 | #' outcome ~ s.incr(score, numknots = 5, space = "E"), | |
| 173 | #' family = "binomial" | |
| 174 | #' ) | |
| 175 | #' # Generate predictions and convert to vector | |
| 176 | #' preds <- predict(m, type = "response")$fit | |
| 177 | #' # Return a data.frame with exactly these columns | |
| 178 | #' out <- data.frame( | |
| 179 | #' score = score, | |
| 180 | #' percentile = ecdf(score)(score), | |
| 181 | #' outcome = outcome, | |
| 182 | #' estimate = preds | |
| 183 | #' ) | |
| 184 | #' return(out) | |
| 185 | #' } | |
| 186 | #' | |
| 187 | #' p7 <- riskProfile( | |
| 188 | #' outcome = truth, | |
| 189 | #' score = rscore, | |
| 190 | #' methods = list(my_cgam = my_est) | |
| 191 | #' ) | |
| 192 | #' p7$plot | |
| 193 | #' | |
| 194 | #' # Prevalence adjustment to 0.1 | |
| 195 | #' p8 <- riskProfile(outcome = truth, score = rscore, prev.adj = 0.1) | |
| 196 | #' p8$plot | |
| 197 | #' | |
| 198 | riskProfile <- function(outcome, | |
| 199 | score, | |
| 200 | methods = "asis", | |
| 201 | prev.adj = NULL, | |
| 202 | show.prev = TRUE, | |
| 203 | show.nonparam.pv = TRUE, | |
| 204 | show.best.pv = TRUE, | |
| 205 |                         include = c("PC", "PPV", "1-NPV"), | |
| 206 | plot.raw = FALSE, | |
| 207 |                         rev.order = FALSE) { | |
| 208 | ||
| 209 | # Argument checks (except outcome, score, methods - below) | |
| 210 | 15x | checkmate::assert_number(prev.adj, lower = 0, upper = 1, null.ok = TRUE) | 
| 211 | 15x | checkmate::assert_flag(show.nonparam.pv) | 
| 212 | 15x | checkmate::assert_flag(show.best.pv) | 
| 213 | 15x | checkmate::assert_flag(show.prev) | 
| 214 | 15x |   include <- matchArgSubset(toupper(include), choices = c("PC", "PPV", "NPV", "1-NPV")) | 
| 215 | 15x | checkmate::assert_flag(plot.raw) | 
| 216 | 15x | checkmate::assert_flag(rev.order) | 
| 217 | ||
| 218 | # Standardize/Check outcome, scores | |
| 219 | 15x | op <- inputCheck(outcome = outcome, score = score) | 
| 220 | ||
| 221 | # Order Data by scores | |
| 222 | 15x | tempdf <- orderInputs(outcome = op$outcome, score = op$score, rev.order = rev.order) | 
| 223 | 15x | score <- tempdf$score | 
| 224 | 15x | outcome <- tempdf$outcome | 
| 225 | ||
| 226 | # Check methods | |
| 227 | 15x | methods <- methodCheck(methods = methods) | 
| 228 | 15x | method.names <- names(methods) | 
| 229 | ||
| 230 | # Calculate prevalence and percentiles | |
| 231 | 15x | prev <- mean(outcome, na.rm = TRUE) | 
| 232 | ||
| 233 | # Get the plot settings | |
| 234 | 15x | show.pc <- "PC" %in% include | 
| 235 | 15x |   show.pv <- any(c("PPV", "NPV", "1-NPV") %in% include) | 
| 236 | 15x |   show.one.only <- sum(c("PC", "PPV", "NPV", "1-NPV") %in% include) == 1 | 
| 237 | ||
| 238 | 15x |   if (plot.raw) { | 
| 239 | 2x | xvar <- "score" | 
| 240 |   } else { | |
| 241 | 13x | xvar <- "percentile" | 
| 242 | } | |
| 243 | ||
| 244 | # Prediction Curve Data | |
| 245 | 15x |   if (show.pc) { | 
| 246 | 12x | pc.ests <- getEsts(methods = methods, outcome = outcome, score = score) | 
| 247 | 12x | PC.data <- mutate(pc.ests$plotdata, pv = "PC") | 
| 248 | 12x | errorbar.data <- pc.ests$errorbardata | 
| 249 | 12x | step.methods <- method.names[pc.ests$idx.step] | 
| 250 |   } else { | |
| 251 | 3x | PC.data <- data.frame( | 
| 252 | 3x | method = character(0), pv = character(0), | 
| 253 | 3x | percentile = numeric(0), score = numeric(0), estimate = numeric(0) | 
| 254 | ) | |
| 255 | 3x | pc.ests <- errorbar.data <- NULL | 
| 256 | 3x | step.methods <- character(0) | 
| 257 | } | |
| 258 | ||
| 259 | # Predictive Value Data | |
| 260 | 15x |   if (show.pv) { | 
| 261 | 14x | PV.data <- getPVdata(outcome = outcome, score = score, methods = methods, pc.ests = pc.ests) | 
| 262 |   } else { | |
| 263 | 1x | PV.data <- data.frame( | 
| 264 | 1x | method = character(0), score = numeric(0), percentile = numeric(0), | 
| 265 | 1x | outcome = numeric(0), estimate = numeric(0), | 
| 266 | 1x | MNPV = numeric(0), NPV = numeric(0), PPV = numeric(0) | 
| 267 | ) | |
| 268 | } | |
| 269 | ||
| 270 | # show.nonparam.pv | |
| 271 | 15x |   if (show.nonparam.pv) { | 
| 272 | 10x | tmp <- nonParametricPV(outcome = outcome, score = score) %>% | 
| 273 | 10x | mutate(method = "non-parametric", estimate = NA) | 
| 274 | 10x | PV.data <- bind_rows(PV.data, tmp) | 
| 275 | } | |
| 276 | ||
| 277 | # Dataset of inputs | |
| 278 | 15x | df.in <- data.frame(outcome, score, percentile = ecdf(score)(score)) | 
| 279 | ||
| 280 | # Adjust based on user defined prevalence | |
| 281 | 15x |   if (!is.null(prev.adj)) { | 
| 282 | ||
| 283 | 1x | cdf.cases <- ecdf(score[outcome == 1]) | 
| 284 | 1x | cdf.controls <- ecdf(score[outcome == 0]) | 
| 285 | ||
| 286 | 1x | df.in$percentile <- adjPrevPerc( | 
| 287 | 1x | perc = df.in$score, prev.new = prev.adj, | 
| 288 | 1x | cdf.case = cdf.cases, cdf.control = cdf.controls | 
| 289 | ) | |
| 290 | ||
| 291 | 1x |     if (show.pc) { | 
| 292 | 1x | PC.data <- adjPrevPC( | 
| 293 | 1x | dat = PC.data, prev = prev, prev.new = prev.adj, | 
| 294 | 1x | cdf.case = cdf.cases, cdf.control = cdf.controls | 
| 295 | ) | |
| 296 | } | |
| 297 | ||
| 298 | 1x |     if (show.pv) { | 
| 299 | 1x | PV.data <- adjPrevPV( | 
| 300 | 1x | dat = PV.data, prev = prev, prev.new = prev.adj, | 
| 301 | 1x | cdf.case = cdf.cases, cdf.control = cdf.controls | 
| 302 | ) | |
| 303 | } | |
| 304 | ||
| 305 | 1x | prev <- prev.adj | 
| 306 | } | |
| 307 | ||
| 308 | # pivot PV data | |
| 309 | 15x |   if (show.pv) { | 
| 310 | 14x | PV.data <- PV.data %>% | 
| 311 | 14x | rename(`1-NPV` = "MNPV") %>% | 
| 312 | 14x | tidyr::pivot_longer( | 
| 313 | 14x |         cols = all_of(c("1-NPV", "NPV", "PPV")), names_to = "pv", values_to = "pvValue" | 
| 314 | ) %>% | |
| 315 | 14x |       mutate(pv = factor(.data$pv, levels = c("NPV", "1-NPV", "PPV"))) %>% | 
| 316 | 14x | arrange(.data$method, .data$pv, .data$percentile) | 
| 317 |   } else { | |
| 318 | 1x | PV.data <- data.frame( | 
| 319 | 1x | method = character(0), score = numeric(0), percentile = numeric(0), | 
| 320 | 1x | outcome = numeric(0), estimate = numeric(0), | 
| 321 | 1x | pv = character(0), pvValue = numeric(0) | 
| 322 | ) | |
| 323 | } | |
| 324 | ||
| 325 | # If showing percentiles, add row with 0th percentile | |
| 326 | 15x |   if (!plot.raw) { | 
| 327 | 13x |     if (show.pc) { | 
| 328 | 10x | PC.data <- add0thPercPC(PC.data) | 
| 329 | } | |
| 330 | 13x |     if (show.pv) { | 
| 331 | 12x | PV.data <- add0thPercPV(PV.data) | 
| 332 | } | |
| 333 | } | |
| 334 | ||
| 335 | # Subset PC data | |
| 336 | 15x | smoothPC <- PC.data[!PC.data$method %in% step.methods, , drop = FALSE] | 
| 337 | 15x | stepPC <- PC.data[PC.data$method %in% step.methods, , drop = FALSE] | 
| 338 | ||
| 339 | # Subset PV data | |
| 340 | 15x | smoothPV <- PV.data[ | 
| 341 | 15x | PV.data$pv %in% include & !PV.data$method %in% step.methods, , | 
| 342 | 15x | drop = FALSE | 
| 343 | ] | |
| 344 | 15x | stepPV <- PV.data[ | 
| 345 | 15x | PV.data$pv %in% include & PV.data$method %in% step.methods, , | 
| 346 | 15x | drop = FALSE | 
| 347 | ] | |
| 348 | ||
| 349 | # Different aes based on what is to be shown | |
| 350 | # (if one kind of PV value, use both coloring and linetype for distinguishing estimation methods) | |
| 351 | 15x |   if (show.one.only) { | 
| 352 | 2x | aes.pc <- aes( | 
| 353 | 2x | x = .data[[xvar]], y = .data$estimate, | 
| 354 | 2x | colour = .data$method, linetype = .data$method | 
| 355 | ) | |
| 356 | 2x | aes.pv <- aes( | 
| 357 | 2x | x = .data[[xvar]], y = .data$pvValue, | 
| 358 | 2x | colour = .data$method, linetype = .data$method | 
| 359 | ) | |
| 360 |   } else { | |
| 361 | 13x | aes.pc <- aes( | 
| 362 | 13x | x = .data[[xvar]], y = .data$estimate, | 
| 363 | 13x | colour = .data$pv, linetype = .data$method | 
| 364 | ) | |
| 365 | 13x | aes.pv <- aes( | 
| 366 | 13x | x = .data[[xvar]], y = .data$pvValue, | 
| 367 | 13x | colour = .data$pv, linetype = .data$method | 
| 368 | ) | |
| 369 | } | |
| 370 | ||
| 371 | # Build plot | |
| 372 | 15x | p <- ggplot() + | 
| 373 | 15x | geom_line(aes.pc, data = smoothPC, alpha = 0.8) + | 
| 374 | 15x | geom_step(aes.pc, data = stepPC, alpha = 0.8, direction = "vh") + | 
| 375 | 15x | geom_line(aes.pv, data = smoothPV, alpha = 0.8) + | 
| 376 | 15x | geom_step(aes.pv, data = stepPV, alpha = 0.8, direction = "vh") + | 
| 377 | 15x | geom_hline(yintercept = prev, alpha = 0.8, col = "black", linetype = "dashed") | 
| 378 | ||
| 379 | # Best PVs | |
| 380 | 15x |   if (show.best.pv) { | 
| 381 | 11x |     if (!plot.raw) { | 
| 382 | 9x | df.in <- bind_rows(dplyr::tibble(percentile = 0, score = NA), df.in) | 
| 383 | } | |
| 384 | 11x | best <- df.in %>% | 
| 385 | 11x |       select(all_of(c("percentile", "score"))) %>% | 
| 386 | 11x | distinct() %>% | 
| 387 | 11x | arrange(.data$percentile, .data$score) %>% | 
| 388 | 11x | mutate( | 
| 389 | 11x | PC = ifelse(.data$percentile <= 1 - prev, 0, 1), | 
| 390 | 11x | PPV = bestPPV(perc = .data$percentile, prev = prev), | 
| 391 | 11x | `1-NPV` = bestMNPV(perc = .data$percentile, prev = prev), | 
| 392 | 11x | NPV = 1 - .data$`1-NPV` | 
| 393 | ) %>% | |
| 394 | 11x | tidyr::pivot_longer( | 
| 395 | 11x |         cols = c("PC", "PPV", "1-NPV", "NPV"), names_to = "pv", values_to = "pvValue" | 
| 396 | ) %>% | |
| 397 | 11x | filter(.data$pv %in% include) %>% | 
| 398 | 11x | mutate( | 
| 399 | 11x | method = "Best PVs", | 
| 400 | 11x |         pv = paste("Best", .data$pv) | 
| 401 | ) | |
| 402 | ||
| 403 | 11x |     if (show.one.only) { | 
| 404 | 1x | p <- p + | 
| 405 | 1x | geom_line( | 
| 406 | 1x | data = best, | 
| 407 | 1x | aes(x = .data[[xvar]], y = .data$pvValue, linewidth = .data$pv), | 
| 408 | 1x | colour = "gray60" | 
| 409 | ) + | |
| 410 | 1x | scale_linewidth_manual( | 
| 411 | 1x |           values = c("Best 1-NPV" = 0.3, "Best PPV" = 0.3, "Best PC" = 0.3, "Best NPV" = 0.3), | 
| 412 | 1x | name = "Best PVs" | 
| 413 | ) | |
| 414 |     } else { | |
| 415 | 10x | p <- p + | 
| 416 | 10x | geom_line(data = best, aes(x = .data[[xvar]], y = .data$pvValue, colour = .data$pv)) | 
| 417 | } | |
| 418 | } | |
| 419 | ||
| 420 | # Add errorbars | |
| 421 | 15x |   if (show.pc && !is.null(errorbar.data)) { | 
| 422 | 1x | p <- p + | 
| 423 | 1x | geom_point( | 
| 424 | 1x | data = errorbar.data, | 
| 425 | 1x | aes(x = .data$midquantile, y = .data$bin.mid), | 
| 426 | 1x | alpha = 0.8, | 
| 427 | 1x | size = 0.2 | 
| 428 | ) + | |
| 429 | 1x | geom_errorbar( | 
| 430 | 1x | data = errorbar.data, | 
| 431 | 1x | aes( | 
| 432 | 1x | x = .data$midquantile, | 
| 433 | 1x | ymin = .data$bin.low, | 
| 434 | 1x | ymax = .data$bin.high, | 
| 435 | 1x | width = .02 | 
| 436 | ), | |
| 437 | 1x | alpha = 0.7, | 
| 438 | 1x | linewidth = 0.2, | 
| 439 | 1x | inherit.aes = FALSE | 
| 440 | ) | |
| 441 | } | |
| 442 | ||
| 443 | # Set always the same colours for PVs | |
| 444 | 15x |   if (!show.one.only) { | 
| 445 | 13x | clrs <- predictionColours(include, show.best = show.best.pv) | 
| 446 | 13x | p <- p + scale_colour_manual(values = clrs, breaks = names(clrs)) | 
| 447 |   } else { | |
| 448 | 2x | p <- p + scale_colour_hue(l = 45) | 
| 449 | } | |
| 450 | ||
| 451 | # Finalize plot | |
| 452 | 15x | p <- p + | 
| 453 | 15x | labs( | 
| 454 | 15x | title = "Predictiveness Plot", | 
| 455 | 15x | x = ifelse(plot.raw, "Prediction Score", "Risk Percentile"), | 
| 456 | 15x | y = "Predicted Risk / Predictive Value", | 
| 457 | 15x | linetype = "Estimation Method", | 
| 458 | 15x | colour = ifelse(show.one.only, "Estimation Method", "Predictive Quantity") | 
| 459 | ) + | |
| 460 | 15x | scale_x_continuous(n.breaks = 6) + | 
| 461 | 15x | scale_y_continuous(n.breaks = 6) + | 
| 462 | 15x | theme_bw() + | 
| 463 | 15x | theme(legend.key.width = unit(2, "line")) | 
| 464 |  | |
| 465 | # Add prevalence annotation if requested | |
| 466 | 15x |   if (show.prev) { | 
| 467 | # x-value for plotting prevalence label | |
| 468 | 15x | prev_x <- ifelse(plot.raw, min(score), 0) | 
| 469 | 15x | prev_nudge_x <- ifelse(plot.raw, (max(score) - min(score)) / 10, 0.1) | 
| 470 | 15x | prev_nudge_y <- ggplot2::layer_scales(p)$y$get_limits()[2] / 10 | 
| 471 |  | |
| 472 | 15x | p <- p + annotate( | 
| 473 | 15x | geom = "text", | 
| 474 | 15x | x = prev_x + prev_nudge_x, | 
| 475 | 15x | y = ifelse(prev > 0.8, prev - prev_nudge_y, prev + prev_nudge_y), | 
| 476 | 15x |       label = paste0("Prevalence: ", "\n", round(prev, 3)), | 
| 477 | 15x | colour = "black", | 
| 478 | 15x | alpha = 0.8, | 
| 479 | 15x | size = 3.5 | 
| 480 | ) | |
| 481 | } | |
| 482 | ||
| 483 | 15x |   if (show.one.only) { | 
| 484 | 2x | p <- p + guides(colour = guide_legend(order = 1), linetype = guide_legend(order = 1)) | 
| 485 |   } else { | |
| 486 | 13x | p <- p + guides(colour = guide_legend(order = 1), linetype = guide_legend(order = 2)) | 
| 487 | } | |
| 488 | ||
| 489 | 15x |   if (show.best.pv) { | 
| 490 | 11x |     PV.data <- bind_rows(PV.data, mutate(best, pv = gsub("Best ", "", .data$pv))) | 
| 491 | } | |
| 492 | ||
| 493 | 15x | return( | 
| 494 | 15x | list( | 
| 495 | 15x | plot = p, | 
| 496 | 15x | data = bind_rows( | 
| 497 | 15x | dplyr::as_tibble(PC.data), | 
| 498 | 15x | dplyr::as_tibble(PV.data) | 
| 499 | ), | |
| 500 | 15x | errorbar = errorbar.data | 
| 501 | ) | |
| 502 | ) | |
| 503 | } | 
| 1 | ||
| 2 | # Helper function to check the user input - bins and quantiles arguments | |
| 3 | checkBinsQuantiles <- function(bins, quantiles, score) { | |
| 4 | ||
| 5 | 15x | checkmate::assert( | 
| 6 | 15x | checkmate::check_numeric(bins, any.missing = FALSE, sorted = TRUE), | 
| 7 | 15x | checkmate::check_null(bins) | 
| 8 | ) | |
| 9 | 15x | checkmate::assert( | 
| 10 | 15x | checkmate::check_integerish(quantiles, lower = 1), | 
| 11 | 15x | checkmate::check_null(quantiles) | 
| 12 | ) | |
| 13 | ||
| 14 | # Check combination of quantiles and bins | |
| 15 | 15x |   if (!is.null(quantiles) && !is.null(bins)) { | 
| 16 | 1x |     stop("bins and quantiles cannot be specified together, choose one and set the other to NULL") | 
| 17 | } | |
| 18 | ||
| 19 | 14x |   if (is.null(quantiles) && is.null(bins)) { | 
| 20 | 5x | quantiles <- 10 | 
| 21 | } | |
| 22 | ||
| 23 | # Further check for bins | |
| 24 | 14x |   if (length(bins) > 1) { | 
| 25 | 4x |     if (bins[1] > min(score)) { | 
| 26 | 1x |       stop(paste("The first element of bins must be <= min(score), not", bins[1])) | 
| 27 | } | |
| 28 | 3x |     if (bins[length(bins)] < max(score)) { | 
| 29 | 1x |       stop(paste("The last element of bins must be >= max(score), not", bins[length(bins)])) | 
| 30 | } | |
| 31 | } | |
| 32 | ||
| 33 | # Scalar bins -> number of intervals | |
| 34 | 12x |   if (length(bins) == 1 && bins <= 1) { | 
| 35 | 1x |     stop("bins must be > 1 when provided as a scalar (i.e. number of bins)") | 
| 36 | } | |
| 37 | ||
| 38 | # Check discrete score | |
| 39 | 11x | lu <- length(unique(score)) | 
| 40 | 11x |   if (length(bins) == 1 && bins > lu) { | 
| 41 | 1x | warning( | 
| 42 | 1x | paste0( | 
| 43 | 1x |         "The number of `bins` (", bins, ") ", | 
| 44 | 1x |         "is > the number of unique score values (", lu, "). ", | 
| 45 | 1x | "The results may be unreliable." | 
| 46 | ) | |
| 47 | ) | |
| 48 | } | |
| 49 | 11x |   if (length(bins) > 1 && length(bins) - 1 > lu) { | 
| 50 | 1x | warning( | 
| 51 | 1x | paste0( | 
| 52 | 1x |         "The number of `bins` (", length(bins) - 1, ") ", | 
| 53 | 1x |         "is > the number of unique score values (", lu, "). ", | 
| 54 | 1x | "The results may be unreliable." | 
| 55 | ) | |
| 56 | ) | |
| 57 | } | |
| 58 | 11x |   if (!is.null(quantiles) && lu <= 10) { | 
| 59 | 1x | warning( | 
| 60 | 1x | "Using the quantile method for non-continuous score. ", | 
| 61 | 1x | "The results may be unreliable." | 
| 62 | ) | |
| 63 | } | |
| 64 | ||
| 65 | 11x | return(list(bins = bins, quantiles = quantiles)) | 
| 66 | } | |
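| | | |
| | # Illustrative sketch (inputs chosen for illustration): | |
| | # checkBinsQuantiles(bins = NULL, quantiles = NULL, score = runif(50))  # defaults to 10 quantiles | |
| | # checkBinsQuantiles(bins = 5, quantiles = NULL, score = runif(50))     # 5 evenly spaced bins | |
| | # checkBinsQuantiles(bins = 5, quantiles = 10, score = runif(50))       # error: specify only one | |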
| 67 | ||
| 68 | ||
| 69 | # Helper function to check the user input - errorbar.sem argument | |
| 70 | checkErrorbarSem <- function(errorbar.sem) { | |
| 71 | 7x | checkmate::assert( | 
| 72 | 7x | checkmate::check_number(errorbar.sem, lower = 0), # this checks >= 0 | 
| 73 | 7x | checkmate::check_null(errorbar.sem) | 
| 74 | ) | |
| 75 | 7x |   if (!is.null(errorbar.sem)) { | 
| 76 | 2x |     stopifnot("`errorbar.sem` must be > 0" = errorbar.sem > 0) | 
| 77 | } | |
| 78 | 7x | return(errorbar.sem) | 
| 79 | } | |
| 80 | ||
| 81 | ||
| 82 | #' Binned Risk Estimates | |
| 83 | #' | |
| 84 | #' Calculates bins based on a given number of evenly spaced bins or n-tiles, | |
| 85 | #' then determines the average risk within each bin, which is used as the risk estimate. | |
| 86 | #' | |
| 87 | #' @inheritParams riskProfile | |
| 88 | #' @param quantiles Numeric; number of quantile-based bins (n-tiles). | |
| 89 | #' @param bins Numeric; a scalar giving the number of evenly spaced bins, or a vector of bin boundaries. | |
| 90 | #' @param right Logical indicating right-closed intervals. Defaults to `TRUE`. | |
| 91 | #' @param errorbar.sem Scalar numeric; the number of standard errors of the mean | |
| 92 | #' (SEM) used to calculate the risk error bars. | |
| 93 | #' | |
| 94 | #' @return A data frame with 4 columns | |
| 95 | #' (score, score percentile, outcome, estimate). | |
| 96 | #' Additionally, there is an attribute "errorbar" holding the error-bar data if | |
| 97 | #' `errorbar.sem` was specified. | |
| 98 | #' | |
| 99 | #' @seealso [getASISest()] [getCGAMest()] [getGAMest()] [getMSPLINEest()] [getPAVAest()] | |
| 100 | #' | |
| 101 | #' @export | |
| 102 | #' | |
| 103 | #' @examples | |
| 104 | #' # Read in example data | |
| 105 | #' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc")) | |
| 106 | #' rscore <- auroc$predicted | |
| 107 | #' truth <- as.numeric(auroc$actual) | |
| 108 | #' | |
| 109 | #' getBINNEDest(outcome = truth, score = rscore) | |
| 110 | #' | |
| 111 | getBINNEDest <- function(outcome, | |
| 112 | score, | |
| 113 | quantiles = NULL, | |
| 114 | bins = NULL, | |
| 115 | right = TRUE, | |
| 116 |                          errorbar.sem = NULL) { | |
| 117 | ||
| 118 | # Argument checks | |
| 119 | 7x | checkmate::assert_numeric(outcome) | 
| 120 | 7x | checkmate::assert_numeric(score, len = length(outcome)) | 
| 121 | 7x | checkmate::assert_flag(right) | 
| 122 | 7x | errorbar.sem <- checkErrorbarSem(errorbar.sem) | 
| 123 | ||
| 124 | # Check bins and quantiles | |
| 125 | 7x | bqp <- checkBinsQuantiles(bins = bins, quantiles = quantiles, score = score) | 
| 126 | ||
| 127 | # Retrieve data summaries | |
| 128 | 7x | df <- getSummaries( | 
| 129 | 7x | outcome = outcome, score = score, | 
| 130 | 7x | quantiles = bqp$quantiles, bins = bqp$bins, | 
| 131 | 7x | right = right | 
| 132 | ) | |
| 133 | ||
| 134 | 7x |   if (!is.null(errorbar.sem)) { | 
| 135 | 2x | errorbar <- getERRORest(binlvl = df[["binlvl"]], z = errorbar.sem) %>% | 
| 136 | 2x | mutate( | 
| 137 | 2x | percentile = df[["binlvl"]][["riskpercentile"]], | 
| 138 | 2x | bin.mid = df[["binlvl"]][["avg.outcome"]] | 
| 139 | ) | |
| 140 |   } else { | |
| 141 | 5x | errorbar <- NULL | 
| 142 | } | |
| 143 | ||
| 144 | # Create binned.data | |
| 145 | 7x | binned.data <- data.frame( | 
| 146 | 7x | score = df[["binlvl"]][["avg.risk"]], | 
| 147 | 7x | percentile = df[["binlvl"]][["riskpercentile"]], | 
| 148 | 7x | outcome = NA, | 
| 149 | 7x | estimate = df[["binlvl"]][["avg.outcome"]] | 
| 150 | ) | |
| 151 | 7x | attr(binned.data, "errorbar") <- errorbar | 
| 152 | ||
| 153 | 7x | return(binned.data) | 
| 154 | } | |
| 155 | ||
| 156 | ||
| 157 | ||
| 158 | #' Get Summaries: observation level & bin level dataframes | |
| 159 | #' | |
| 160 | #' Given vectors of outcomes and risk scores, the function returns a list of | |
| 161 | #' observation-level and bin-level summary data frames. | |
| 162 | #' | |
| 163 | #' Observation level dataframe has columns for outcome, riskscore, risk percentile, bin number, | |
| 164 | #' and corresponding minimum and maximum score for that bin. | |
| 165 | #' | |
| 166 | #' Bin level dataframe has columns indicating bin number and the observation count, | |
| 167 | #' number of events, average outcome, average risk, and standard deviation of risk, | |
| 168 | #' within each of the bins. Risk percentile and bin intervals are also provided. | |
| 169 | #' | |
| 170 | #' @return List of observation level and bin level dataframes. | |
| 171 | #' | |
| 172 | #' @keywords internal | |
| 173 | #' @noRd | |
| 174 | #' | |
| 175 | #' @examples | |
| 176 | #' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc")) | |
| 177 | #' truth <- as.numeric(auroc$actual) | |
| 178 | #' rscore <- auroc$predicted | |
| 179 | #' | |
| 180 | #' # Bin by quantiles | |
| 181 | #' getSummaries( | |
| 182 | #' outcome = truth, | |
| 183 | #' score = rscore, | |
| 184 | #' quantiles = 10 | |
| 185 | #' ) | |
| 186 | #' | |
| 187 | #' # Bin by specific percentiles | |
| 188 | #' getSummaries( | |
| 189 | #' outcome = truth, | |
| 190 | #' score = rscore, | |
| 191 | #'   quantiles = NULL, | |
| 192 | #'   bins = c(0, 0.25, 0.5, 0.8, 1) | |
| 193 | #' ) | |
| 194 | #' | |
| 195 | getSummaries <- function(outcome, | |
| 196 | score, | |
| 197 | quantiles = NULL, | |
| 198 | bins = NULL, | |
| 199 |                          right = TRUE) { | |
| 200 | ||
| 201 | 16x | stopifnot( | 
| 202 | 16x | !is.null(quantiles) | !is.null(bins), | 
| 203 | 16x | quantiles != 0 | bins != 0, | 
| 204 | 16x | is.logical(right) | 
| 205 | ) | |
| 206 | ||
| 207 | 15x | cdf.fit <- ecdf(score) | 
| 208 | ||
| 209 | 15x |   if (!is.null(quantiles)) { | 
| 210 | 12x | bin.int <- Hmisc::cut2(score, g = quantiles) | 
| 211 | ||
| 212 | 3x |   } else if (length(bins) > 1) { | 
| 213 | ! | bin.int <- cut(cdf.fit(score), breaks = bins, include.lowest = TRUE, right = right) | 
| 214 | ||
| 215 | 3x |   } else if (length(bins) == 1) { | 
| 216 | 2x | bin.int <- cut(score, breaks = bins, include.lowest = TRUE, right = right) | 
| 217 | ||
| 218 |   } else { | |
| 219 | 1x |     stop("Unrecognized option") | 
| 220 | } | |
| 221 |  | |
| 222 | # get numeric label of interval | |
| 223 | 14x | bin.num <- as.numeric(bin.int) | 
| 224 | ||
| 225 | # get interval borders (min, max) | |
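| | # (strip punctuation other than "-", "," and "." from labels like "[0.12,0.35)", split on ",", | |
| | #  and duplicate single-value labels so both a min and a max are always available) | |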
| 226 | 14x | min_max <- strsplit( | 
| 227 | 14x |     gsub("(?![-,.])[[:punct:]]", "", trimws(as.character(bin.int)), perl = TRUE), | 
| 228 | "," | |
| 229 | ) | |
| 230 | 14x | min_max <- lapply( | 
| 231 | 14x | min_max, \(x) `if`(length(x) < 2, rep(x, 2), x) | 
| 232 | ) | |
| 233 | ||
| 234 | # Observation level data: outcome, rscore, observation percentile, and interval | |
| 235 | 14x | obslvl <- data.frame( | 
| 236 | 14x | outcome = outcome, | 
| 237 | 14x | score = score, | 
| 238 | 14x | riskpercentile = cdf.fit(score), | 
| 239 | 14x | bin = bin.num, | 
| 240 | 14x | interval = as.character(bin.int), | 
| 241 | 14x | min = vapply(min_max, "[[", character(1), 1), | 
| 242 | 14x | max = vapply(min_max, "[[", character(1), 2) | 
| 243 | ) %>% | |
| 244 | 14x | arrange(.data$score) | 
| 245 | ||
| 246 | # Bin level data: within each bin: n, avg risk, sd risk, quantile, error bar | |
| 247 | 14x | binlvl <- obslvl %>% | 
| 248 | 14x | group_by(.data$bin, .data$interval) %>% | 
| 249 | 14x | summarise( | 
| 250 | 14x | n = dplyr::n(), | 
| 251 | 14x | events = sum(.data$outcome), | 
| 252 | 14x | avg.outcome = mean(.data$outcome), | 
| 253 | 14x | sd.outcome = sd(.data$outcome, na.rm = TRUE), | 
| 254 | 14x | avg.risk = mean(.data$score, na.rm = TRUE), | 
| 255 | 14x | sd.risk = sd(.data$score), | 
| 256 | 14x | riskpercentile = max(.data$riskpercentile), | 
| 257 | 14x | .groups = "drop" | 
| 258 | ) | |
| 259 | ||
| 260 | # Replace NAs with 0 | |
| 261 | 14x | binlvl[is.na(binlvl)] <- 0 | 
| 262 | ||
| 263 | 14x | return(list(obslvl = obslvl, binlvl = binlvl)) | 
| 264 | } | |
| 265 | ||
| 266 | ||
| 267 | # ERROR BAR Estimates | |
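| | # For each bin: bin.low = avg.outcome - z * sd.outcome / sqrt(n) (floored at 0) and | |
| | # bin.high = avg.outcome + z * sd.outcome / sqrt(n), | |
| | # with midquantile placed halfway through the bin on the risk-percentile scale. | |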
| 268 | getERRORest <- function(binlvl, z) { | |
| 269 | ||
| 270 | 2x | stopifnot( | 
| 271 | 2x | is.data.frame(binlvl), | 
| 272 | 2x | is.numeric(z) | 
| 273 | ) | |
| 274 | ||
| 275 | 2x | binlvl %>% | 
| 276 | 2x | mutate( | 
| 277 | 2x | bin.low = .data$avg.outcome - (z * (.data$sd.outcome / sqrt(.data$n))), | 
| 278 | 2x | bin.low = ifelse(.data$bin.low < 0, 0, .data$bin.low), | 
| 279 | 2x | bin.high = .data$avg.outcome + (z * (.data$sd.outcome / sqrt(.data$n))), | 
| 280 | 2x | midquantile = .data$riskpercentile - (diff(c(0, .data$riskpercentile)) / 2) | 
| 281 | ) %>% | |
| 282 | 2x |     select(all_of(c("midquantile", "bin.high", "bin.low"))) %>% | 
| 283 | 2x | tidyr::replace_na(list(bin.high = 0, bin.low = 0)) | 
| 284 | } | |
| 285 | ||
| 286 | ||
| 287 | #' PAVA Risk Estimates | |
| 288 | #' | |
| 289 | #' Determines isotonic regression estimates via PAVA, given a vector of binary outcomes | |
| 290 | #' and a vector of scores. | |
| 291 | #' | |
| 292 | #' @inheritParams riskProfile | |
| 293 | #' @param weights Vector of numerics to specify PAVA observation weighting. | |
| 294 | #' @param ties String to specify how ties should be handled for PAVA. | |
| 295 | #' @param low_events Numeric, specifying number of events in the lowest bin. | |
| 296 | #' @param low_nonevents Numeric, specifying number of nonevents in the lowest bin. | |
| 297 | #' @param high_events Numeric, specifying number of events in the highest bin. | |
| 298 | #' @param high_nonevents Numeric, specifying number of nonevents in the highest bin. | |
| 299 | #' @param hilo_obs Numeric, specifying number of observations in the highest and lowest bins. | |
| 300 | #' | |
| 301 | #' @return A data frame with 4 columns | |
| 302 | #' (score, score percentile, outcome, estimate). | |
| 303 | #' | |
| 304 | #' @seealso [getASISest()] [getBINNEDest()] [getCGAMest()] [getGAMest()] [getMSPLINEest()] | |
| 305 | #' | |
| 306 | #' @export | |
| 307 | #' | |
| 308 | #' @examples | |
| 309 | #' # Read in example data | |
| 310 | #' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc")) | |
| 311 | #' rscore <- auroc$predicted | |
| 312 | #' truth <- as.numeric(auroc$actual) | |
| 313 | #' | |
| 314 | #' tail(getPAVAest(outcome = truth, score = rscore), 10) | |
| 315 | #' | |
| 316 | getPAVAest <- function(outcome, | |
| 317 | score, | |
| 318 | weights = rep(1, length(outcome)), | |
| 319 | ties = "primary", | |
| 320 | low_events = NULL, | |
| 321 | low_nonevents = NULL, | |
| 322 | high_events = NULL, | |
| 323 | high_nonevents = NULL, | |
| 324 |                        hilo_obs = NULL) { | |
| 325 | ||
| 326 | 5x | checkmate::assert_numeric(outcome) | 
| 327 | 5x | checkmate::assert_numeric(score, len = length(outcome)) | 
| 328 | 5x | checkmate::assert_numeric(weights, any.missing = FALSE, len = length(outcome)) | 
| 329 | 5x | checkmate::assert_character(ties, any.missing = FALSE, len = 1) | 
| 330 | 5x | checkmate::assert( | 
| 331 | 5x | checkmate::check_integerish(low_events, lower = 1, any.missing = FALSE, len = 1), | 
| 332 | 5x | checkmate::check_null(low_events) | 
| 333 | ) | |
| 334 | 5x | checkmate::assert( | 
| 335 | 5x | checkmate::check_integerish(low_nonevents, lower = 1, any.missing = FALSE, len = 1), | 
| 336 | 5x | checkmate::check_null(low_nonevents) | 
| 337 | ) | |
| 338 | 5x | checkmate::assert( | 
| 339 | 5x | checkmate::check_integerish(high_events, lower = 1, any.missing = FALSE, len = 1), | 
| 340 | 5x | checkmate::check_null(high_events) | 
| 341 | ) | |
| 342 | 5x | checkmate::assert( | 
| 343 | 5x | checkmate::check_integerish(high_nonevents, lower = 1, any.missing = FALSE, len = 1), | 
| 344 | 5x | checkmate::check_null(high_nonevents) | 
| 345 | ) | |
| 346 | 5x | checkmate::assert( | 
| 347 | 5x | checkmate::check_integerish(hilo_obs, lower = 1, any.missing = FALSE, len = 1), | 
| 348 | 5x | checkmate::check_null(hilo_obs) | 
| 349 | ) | |
| 350 | ||
| 351 | 5x | pava.est <- isotone::gpava(z = score, y = outcome, weights = weights, ties = ties)$x | 
| 352 | ||
| 353 | # if constrained, then replace percentiles.... | |
| 354 | 5x | check <- any( | 
| 355 | 5x | is.numeric(low_events), is.numeric(low_nonevents), is.numeric(high_events), | 
| 356 | 5x | is.numeric(high_nonevents), is.numeric(hilo_obs) | 
| 357 | ) | |
| 358 | 5x |   if (check) { | 
| 359 | ! | percentile <- getConstraints( | 
| 360 | ! | outcome = outcome, rscore = score, | 
| 361 | ! | low_events = low_events, low_nonevents = low_nonevents, | 
| 362 | ! | high_events = high_events, high_nonevents = high_nonevents, | 
| 363 | ! | hilo_obs = hilo_obs | 
| 364 | ) | |
| 365 |   } else { | |
| 366 | 5x | percentile <- ecdf(score)(score) | 
| 367 | } | |
| 368 | ||
| 369 | 5x | return(data.frame(score, percentile, outcome, estimate = pava.est)) | 
| 370 | } | |
| 371 | ||
| 372 | ||
| 373 | #' Constrained Risk Percentile Estimates | |
| 374 | #' | |
| 375 | #' Adjusts PAVA risk percentile estimates for the first and last bin, to meet criteria | |
| 376 | #' for events, non-events, or total observation count. | |
| 377 | #' | |
| 378 | #' @inheritParams riskProfile | |
| 379 | #' | |
| 380 | #' @return A vector of constrained risk percentiles. | |
| 381 | #' | |
| 382 | #' @keywords internal | |
| 383 | #' @noRd | |
| 384 | #' | |
| 385 | #' @examples | |
| 386 | #' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc")) | |
| 387 | #' truth <- as.numeric(auroc$actual) | |
| 388 | #' rscore <- auroc$predicted | |
| 389 | #' | |
| 390 | #' getConstraints(outcome = truth, rscore = rscore, low_events = 3, high_nonevents = 3) | |
| 391 | #' | |
| 392 | getConstraints <- function(outcome, | |
| 393 | rscore, | |
| 394 | low_events = NULL, # min events in lower bin (useful for a PC) | |
| 395 | low_nonevents = NULL, # min non-events in lower bin | |
| 396 | high_events = NULL, # min events in upper bin | |
| 397 | high_nonevents = NULL, # min non-events in upper bin (useful for a PC) | |
| 398 |                            hilo_obs = NULL) { # min total obs in upper AND lower bin | |
| 399 | ||
| 400 | 1x | n <- length(rscore) | 
| 401 | 1x | rscore <- seq_along(rscore) / length(rscore) | 
| 402 | # rscore_cons: scores with binning constraint (same as rscore if no constraint specified) | |
| 403 | 1x | rscore_cons <- rscore | 
| 404 | ||
| 405 | ! | if (!is.null(low_events) && low_events == 0) low_events <- NULL | 
| 406 | ! | if (!is.null(low_nonevents) && low_nonevents == 0) low_nonevents <- NULL | 
| 407 | ! | if (!is.null(high_events) && high_events == 0) high_events <- NULL | 
| 408 | ! | if (!is.null(high_nonevents) && high_nonevents == 0) high_nonevents <- NULL | 
| 409 | ! | if (!is.null(hilo_obs) && hilo_obs == 0) hilo_obs <- NULL | 
| 410 | ||
| 411 | 1x |   if (is.numeric(low_events) && is.numeric(low_nonevents)) { | 
| 412 | ! | warning( | 
| 413 | ! | paste( | 
| 414 | ! | "Specified both a minimum number of events and non-events for the lower bin.", | 
| 415 | ! | "Combining for total observations instead." | 
| 416 | ) | |
| 417 | ) | |
| 418 | ! | hilo_obs <- round(low_events + low_nonevents) | 
| 419 | ! | rscore_cons[1:hilo_obs] <- min(rscore) | 
| 420 | ||
| 421 | 1x |   } else if (is.numeric(high_events) && is.numeric(high_nonevents)) { | 
| 422 | ! | warning( | 
| 423 | ! | paste( | 
| 424 | ! | "Specified both a minimum number of events and non-events for the upper bin.", | 
| 425 | ! | "Combining for total observations instead." | 
| 426 | ) | |
| 427 | ) | |
| 428 | ! | hilo_obs <- round(high_events + high_nonevents) | 
| 429 | ! | rscore_cons[(n + 1 - hilo_obs):n] <- max(rscore) | 
| 430 | ||
| 431 |   } else { | |
| 432 | # Apply upper / lower constraints | |
| 433 | 1x |     if (is.numeric(low_events)) { | 
| 434 | 1x | low_events <- round(low_events) | 
| 435 | 1x | indlo <- match(TRUE, cumsum(outcome) == low_events) | 
| 436 | 1x | rscore_cons[1:indlo] <- min(rscore) | 
| 437 | } | |
| 438 | 1x |     if (is.numeric(low_nonevents)) { | 
| 439 | ! | low_nonevents <- round(low_nonevents) | 
| 440 | ! | indlo <- match(TRUE, cumsum(1 - outcome) == low_nonevents) | 
| 441 | ! | rscore_cons[1:indlo] <- min(rscore) | 
| 442 | } | |
| 443 | 1x |     if (is.numeric(high_nonevents)) { | 
| 444 | 1x | high_nonevents <- round(high_nonevents) | 
| 445 | 1x | indhi <- match(TRUE, cumsum(1 - outcome[n:1]) == high_nonevents) | 
| 446 | 1x | rscore_cons[(n + 1 - indhi):n] <- max(rscore) | 
| 447 | } | |
| 448 | ||
| 449 | 1x |     if (is.numeric(high_events)) { | 
| 450 | ! | high_events <- round(high_events) | 
| 451 | ! | indhi <- match(TRUE, cumsum(outcome[n:1]) == high_events) | 
| 452 | ! | rscore_cons[(n + 1 - indhi):n] <- max(rscore) | 
| 453 | } | |
| 454 | ||
| 455 | 1x |     if (is.numeric(hilo_obs)) { | 
| 456 | ! | hilo_obs <- round(hilo_obs) | 
| 457 | ! | rscore_cons[1:hilo_obs] <- min(rscore) | 
| 458 | ! | rscore_cons[(n + 1 - hilo_obs):n] <- max(rscore) | 
| 459 | } | |
| 460 | } | |
| 461 | ||
| 462 | 1x | return(as.vector(rscore_cons)) | 
| 463 | } | |
| 464 | ||
| 465 | ||
| 466 | #' GAM Risk Estimates | |
| 467 | #' | |
| 468 | #' Fits a Generalized Additive Model to estimate risk, given a vector of binary outcomes | |
| 469 | #' and a vector of scores. | |
| 470 | #' | |
| 471 | #' @inheritParams riskProfile | |
| 472 | #' @param k Numeric to specify the upper limit of basis functions to fit for GAM. | |
| 473 | #' See [mgcv::s()] for more details. Defaults to -1. | |
| 474 | #' @param bs Character string to specify spline type. | |
| 475 | #' See [mgcv::s()] for more details. Defaults to `"tp"`. | |
| 476 | #' @param method Character string to specify the smoothing parameter estimation method. | |
| 477 | #' See [mgcv::gam()] for more details. Defaults to `"REML"`. | |
| 478 | #' @param logscores Logical; if `TRUE`, fit gam on log scores. Defaults to `FALSE`. | |
| 479 | #' @param fitonPerc Logical; if `TRUE`, fit gam on risk percentiles. Defaults to `TRUE`. | |
| 480 | #' | |
| 481 | #' @return A data frame with 4 columns | |
| 482 | #' (score, score percentile, outcome, estimate). | |
| 483 | #' | |
| 484 | #' @seealso [getASISest()] [getBINNEDest()] [getCGAMest()] [getMSPLINEest()] [getPAVAest()] | |
| 485 | #' | |
| 486 | #' @export | |
| 487 | #' | |
| 488 | #' @examples | |
| 489 | #' # Read in example data | |
| 490 | #' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc")) | |
| 491 | #' rscore <- auroc$predicted | |
| 492 | #' truth <- as.numeric(auroc$actual) | |
| 493 | #' | |
| 494 | #' tail(getGAMest(outcome = truth, score = rscore), 10) | |
| 495 | #' | |
| 496 | getGAMest <- function(outcome, | |
| 497 | score, | |
| 498 | k = -1, | |
| 499 | bs = "tp", | |
| 500 | method = "REML", | |
| 501 | logscores = FALSE, | |
| 502 |                       fitonPerc = TRUE) { | |
| 503 | ||
| 504 | 23x | checkmate::assert_numeric(outcome) | 
| 505 | 23x | checkmate::assert_numeric(score, len = length(outcome)) | 
| 506 | 23x | checkmate::assert_number(k) | 
| 507 | 23x | checkmate::assert_character(bs, any.missing = FALSE, len = 1) | 
| 508 | 23x | checkmate::assert_character(method, any.missing = FALSE, len = 1) | 
| 509 | 23x | checkmate::assert_flag(logscores) | 
| 510 | 23x | checkmate::assert_flag(fitonPerc) | 
| 511 | ||
| 512 | 23x | mygrid <- ecdf(score)(score) | 
| 513 | 23x | df <- data.frame(outcome = outcome, score = score, perc = mygrid) | 
| 514 | ||
| 515 | # mgcv::s does not work in formula, need to define it here | |
| 516 | 23x | s <- mgcv::s | 
| 517 | ||
| 518 | 23x |   if (fitonPerc && logscores) { | 
| 519 | 1x | formula <- outcome ~ s(log(perc), k = k, bs = bs) | 
| 520 | 22x |   } else if (fitonPerc && !logscores) { | 
| 521 | 15x | formula <- outcome ~ s(perc, k = k, bs = bs) | 
| 522 | 7x |   } else if (!fitonPerc && logscores) { | 
| 523 | 1x | formula <- outcome ~ s(log(score), k = k, bs = bs) | 
| 524 |   } else { | |
| 525 | 6x | formula <- outcome ~ s(score, k = k, bs = bs) | 
| 526 | } | |
| 527 | ||
| 528 | 23x | gam.fit <- mgcv::gam(formula, data = df, family = "binomial", method = method) | 
| 529 | ||
| 530 | 23x | gam.est <- mgcv::predict.gam(gam.fit, type = "response") | 
| 531 | ||
| 532 | 23x | return(data.frame(score, percentile = mygrid, outcome, estimate = gam.est)) | 
| 533 | } | |
| 534 | ||
| 535 | ||
| 536 | ||
| 537 | #' Constrained GAM (cgam) Risk Estimates | |
| 538 | #' | |
| 539 | #' Fits a Constrained Generalized Additive Model to estimate risk, | |
| 540 | #' given a vector of binary outcomes and a vector of scores. | |
| 541 | #' | |
| 542 | #' @inheritParams riskProfile | |
| 543 | #' @param numknots Numeric to specify the number of knots. | |
| 544 | #' Passed to the `smoother` function. Defaults to 0. | |
| 545 | #' @param smoother Character string to specify the smoother (from cgam package). | |
| 546 | #' Defaults to "s.incr". | |
| 547 | #' @param logscores Logical; if `TRUE`, fit gam on log scores. Defaults to `FALSE`. | |
| 548 | #' @param fitonPerc Logical; if `TRUE`, fit gam on risk percentiles. Defaults to `TRUE`. | |
| 549 | #' | |
| 550 | #' @return A data frame with 4 columns | |
| 551 | #' (score, score percentile, outcome, estimate). | |
| 552 | #' | |
| 553 | #' @seealso [getASISest()] [getBINNEDest()] [getGAMest()] [getMSPLINEest()] [getPAVAest()] | |
| 554 | #' | |
| 555 | #' @export | |
| 556 | #' | |
| 557 | #' @examples | |
| 558 | #' # Read in example data | |
| 559 | #' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc")) | |
| 560 | #' rscore <- auroc$predicted | |
| 561 | #' truth <- as.numeric(auroc$actual) | |
| 562 | #' | |
| 563 | #' tail(getCGAMest(outcome = truth, score = rscore), 10) | |
| 564 | #' | |
| 565 | getCGAMest <- function(outcome, | |
| 566 | score, | |
| 567 | numknots = 0, | |
| 568 | smoother = "s.incr", | |
| 569 | logscores = FALSE, | |
| 570 |                        fitonPerc = TRUE) { | |
| 571 | ||
| 572 | 3x | checkmate::assert_numeric(outcome) | 
| 573 | 3x | checkmate::assert_numeric(score, len = length(outcome)) | 
| 574 | 3x | checkmate::assert_number(numknots) | 
| 575 | 3x | checkmate::assert_character(smoother, any.missing = FALSE, len = 1) | 
| 576 | 3x | checkmate::assert_flag(logscores) | 
| 577 | 3x | checkmate::assert_flag(fitonPerc) | 
| 578 | ||
| 579 | 3x | mygrid <- ecdf(score)(score) | 
| 580 | 3x | df <- data.frame(outcome = outcome, score = score, perc = mygrid) | 
| 581 | ||
| 582 | # cgam smoothers (e.g. cgam::s.incr) cannot be referenced via :: inside the formula, need to assign them here | |
| 583 | 3x | assign(smoother, do.call(`::`, list(pkg = "cgam", name = smoother))) | 
| 584 | ||
| 585 | 3x | formula <- as.formula( | 
| 586 | 3x | paste0( | 
| 587 | 3x | "outcome ~ ", | 
| 588 | 3x |       smoother, "(", | 
| 589 | 3x |       `if`(logscores, "log("), | 
| 590 | 3x | `if`(fitonPerc, "perc", "score"), | 
| 591 | 3x | `if`(logscores, ")"), | 
| 592 | 3x | ", numknots = ", numknots, ")" | 
| 593 | ) | |
| 594 | ) | |
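| | # For illustration: with the defaults (smoother = "s.incr", fitonPerc = TRUE, | |
| | # logscores = FALSE, numknots = 0), the paste0() above yields the formula | |
| | #   outcome ~ s.incr(perc, numknots = 0) | |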
| 595 | ||
| 596 | 3x | cgam.fit <- cgam::cgam(formula, data = df, family = "binomial") | 
| 597 | ||
| 598 | 3x | cgam.est <- cgam::predict.cgam(cgam.fit, type = "response")$fit | 
| 599 | ||
| 600 | 3x | return(data.frame(score, percentile = mygrid, outcome, estimate = cgam.est)) | 
| 601 | } | |
| 602 | ||
| 603 | ||
| 604 | ### MSPLINE ESTIMATES ### | |
| 605 | ||
| 606 | # Function written by willtownes, joseph paulson. | |
| 607 | mspline <- function(x, y, k = 10, lower = NA, upper = NA) { | |
| 608 | # fits a monotonic spline to data | |
| 609 | # small values of k= more smoothing (flatter curves) | |
| 610 | # large values of k= more flexible (wiggly curves) | |
| 611 | # k is related to effective degrees of freedom and number of knots | |
| 612 | # use unconstrained gam to get rough parameter estimates | |
| 613 | # lower, upper optional bounds on the function | |
| 614 | # basically a slight modification of an example in the mgcv::pcls documentation | |
| 615 | 6x | dat <- data.frame(x = x, y = y) | 
| 616 | 6x | s <- mgcv::s # mgcv::s does not work in formula, need to define it here | 
| 617 | 6x | init_gam <- mgcv::gam(y ~ s(x, k = k, bs = "cr")) | 
| 618 | # Create Design matrix, constraints etc. for monotonic spline.... | |
| 619 | 6x | sm <- mgcv::smoothCon(s(x, k = k, bs = "cr"), dat, knots = NULL)[[1]] | 
| 620 | 6x | mc <- mgcv::mono.con(sm$xp, lower = lower, upper = upper) # monotonicity constraints | 
| 621 | 6x | M <- list( | 
| 622 | 6x | X = sm$X, y = y, # design matrix, outcome | 
| 623 | 6x | C = matrix(0, 0, 0), # equality constraints (none) | 
| 624 | 6x | Ain = mc$A, bin = mc$b, # inequality constraints | 
| 625 | 6x | sp = init_gam$sp, p = sm$xp, # initial guesses for param estimates | 
| 626 | 6x | S = sm$S, # smoothness penalty matrix | 
| 627 | 6x | w = y * 0 + 1, off = 0 # weights, offset | 
| 628 | ) | |
| 629 | # fit spline using penalized constrained least squares | |
| 630 | 6x | p <- mgcv::pcls(M) | 
| 631 | 6x | return(list(sm = sm, p = p)) | 
| 632 | } | |
| 633 | ||
| 634 | # Function written by joseph paulson | |
| 635 | predict.mspline <- function(msp, x) { | |
| 636 | # using the monotone spline msp, predict values for the vector x | |
| 637 | 6x | as.vector(mgcv::Predict.matrix(msp$sm, data.frame(x = x)) %*% msp$p) | 
| 638 | } | |
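||
| | # Minimal usage sketch of the two helpers above (illustrative only; the objects | |
| | # `x`, `y`, and `fit` below are hypothetical and not created by the package): | |
| | #   x <- seq(0, 1, length.out = 50) | |
| | #   y <- as.numeric(runif(50) < x) | |
| | #   fit <- mspline(x = x, y = y, k = 10) | |
| | #   head(predict.mspline(fit, x)) | |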
| 639 | ||
| 640 | ||
| 641 | ||
| 642 | #' Monotone Spline Risk Estimates | |
| 643 | #' | |
| 644 | #' Fits a Monotone constrained Generalized Additive Model (GAM) to estimate risk, | |
| 645 | #' given a vector of binary outcomes and a vector of scores. | |
| 646 | #' | |
| 647 | #' @inheritParams getGAMest | |
| 648 | #' | |
| 649 | #' @return A data frame with 4 columns | |
| 650 | #' (score, score percentile, outcome, estimate). | |
| 651 | #' | |
| 652 | #' @seealso [getASISest()] [getBINNEDest()] [getCGAMest()] [getGAMest()] [getPAVAest()] | |
| 653 | #' | |
| 654 | #' @export | |
| 655 | #' | |
| 656 | #' @examples | |
| 657 | #' # Read in example data | |
| 658 | #' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc")) | |
| 659 | #' rscore <- auroc$predicted | |
| 660 | #' truth <- as.numeric(auroc$actual) | |
| 661 | #' | |
| 662 | #' tail(getMSPLINEest(outcome = truth, score = rscore), 10) | |
| 663 | #' | |
| 664 | getMSPLINEest <- function(outcome, | |
| 665 | score, | |
| 666 | k = 10, | |
| 667 |                           fitonPerc = TRUE) { | |
| 668 | ||
| 669 | 6x | checkmate::assert_numeric(outcome) | 
| 670 | 6x | checkmate::assert_numeric(score, len = length(outcome)) | 
| 671 | 6x | checkmate::assert_integerish(k, len = 1, any.missing = FALSE) | 
| 672 | 6x | checkmate::assert_flag(fitonPerc) | 
| 673 | ||
| 674 | 6x | stopifnot(!is.null(k), is.logical(fitonPerc)) | 
| 675 | ||
| 676 | 6x | scorefit <- ecdf(score) | 
| 677 | 6x | mygrid <- scorefit(score) | 
| 678 | ||
| 679 | 6x |   if (!fitonPerc) { | 
| 680 | 1x | fitspl <- mspline(x = score, y = outcome, k = k) | 
| 681 | 1x | mspline.est <- predict.mspline(fitspl, score) | 
| 682 |   } else { | |
| 683 | 5x | fitspl <- mspline(x = mygrid, y = outcome, k = k) | 
| 684 | 5x | mspline.est <- predict.mspline(fitspl, mygrid) | 
| 685 | } | |
| 686 | ||
| 687 | 6x | mspline.est[mspline.est < 0] <- 0 | 
| 688 | 6x | mspline.est[mspline.est > 1] <- 1 | 
| 689 | ||
| 690 | 6x | return(data.frame(score, percentile = mygrid, outcome, estimate = mspline.est)) | 
| 691 | } | |
| 692 | ||
| 693 | ||
| 694 | #' "As is" estimates | |
| 695 | #' | |
| 696 | #' This function does no estimation, but uses the score as it is | |
| 697 | #' (it works like an identity function). | |
| 698 | #' | |
| 699 | #' @inheritParams getGAMest | |
| 700 | #' | |
| 701 | #' @return A data frame with 4 columns | |
| 702 | #' (score, score percentile, outcome, estimate). | |
| 703 | #' | |
| 704 | #' @seealso [getBINNEDest()] [getCGAMest()] [getGAMest()] [getMSPLINEest()] [getPAVAest()] | |
| 705 | #' | |
| 706 | #' @export | |
| 707 | #' | |
| 708 | #' @examples | |
| 709 | #' # Read in example data | |
| 710 | #' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc")) | |
| 711 | #' rscore <- auroc$predicted | |
| 712 | #' truth <- as.numeric(auroc$actual) | |
| 713 | #' | |
| 714 | #' tail(getASISest(outcome = truth, score = rscore), 10) | |
| 715 | #' | |
| 716 | getASISest <- function(outcome, score) { | |
| 717 | # Argument checks | |
| 718 | 17x | checkmate::assert_numeric(outcome) | 
| 719 | 17x | checkmate::assert_numeric(score, len = length(outcome)) | 
| 720 | 17x | return( | 
| 721 | 17x | data.frame( | 
| 722 | 17x | score = score, | 
| 723 | 17x | percentile = ecdf(score)(score), | 
| 724 | 17x | outcome, | 
| 725 | 17x | estimate = score | 
| 726 | ) | |
| 727 | ) | |
| 728 | } | |
| 729 | ||
| 730 | ||
| 731 | # Returns Risk Estimates. | |
| 732 | # This function calls all the other getXXXest functions | |
| 733 | # Used for Predictiveness Curve Data | |
| 734 | getEsts <- function(methods, outcome, score) { | |
| 735 | ||
| 736 | # check methods | |
| 737 | 41x | stopifnot(is.list(methods)) | 
| 738 | ||
| 739 | # Get function names (and perform checks) | |
| 740 | 38x | fun.names <- getEstMethods(methods, with.names = TRUE) | 
| 741 | ||
| 742 | # Run estimations | |
| 743 | 38x | m.est <- lapply(methods, \(x) getEst(x, outcome = outcome, score = score)) | 
| 744 | ||
| 745 | # Special case: get errorbar data if existing | |
| 746 | 36x | idx.binned <- fun.names == "binned" | 
| 747 | 36x | m.er <- lapply( | 
| 748 | 36x | which(idx.binned), | 
| 749 | 36x | \(i) attr(m.est[[i]], "errorbar") | 
| 750 | ) | |
| 751 | ||
| 752 | # Bind together and convert to long data frame | |
| 753 | 36x | m.est <- bind_rows(m.est, .id = "method") | 
| 754 | 36x | rownames(m.est) <- NULL | 
| 755 | ||
| 756 | # Bind error bar data together and convert to a long data frame; if empty, return NULL | |
| 757 | 36x | m.error <- bind_rows(m.er, .id = "method") | 
| 758 | 36x |   if (nrow(m.error) == 0) { | 
| 759 | 35x | m.error <- NULL | 
| 760 | } | |
| 761 | ||
| 762 | # Return indexes of step methods and asis methods | |
| 763 | 36x |   idx.step <- fun.names %in% c("binned", "pava") | 
| 764 | 36x | names(idx.step) <- names(fun.names) | 
| 765 | 36x | idx.asis <- fun.names == "asis" | 
| 766 | ||
| 767 | # Check if asis was called multiple times | |
| 768 | 36x |   if (sum(idx.asis) > 1) { | 
| 769 | 1x |     stop("Please use 'asis' just once (as it does not have any further arguments).") | 
| 770 | } | |
| 771 | ||
| 772 | 35x | return( | 
| 773 | 35x | list( | 
| 774 | 35x | plotdata = m.est, errorbardata = m.error, | 
| 775 | 35x | idx.step = idx.step, idx.asis = idx.asis, | 
| 776 | 35x | idx.binned = idx.binned, idx.pava = fun.names == "pava" | 
| 777 | ) | |
| 778 | ) | |
| 779 | } | |
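||
| | # Illustrative call (hypothetical `outcome` / `score` vectors), mixing a plain | |
| | # method name with a list specification: | |
| | #   getEsts( | |
| | #     methods = list(gam = "gam", pava = list(method = "pava")), | |
| | #     outcome = outcome, score = score | |
| | #   ) | |
| | # returns `plotdata` in long format, optional `errorbardata`, and the method-type indexes. | |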
| 780 | ||
| 781 | # Define estimation functions | |
| 782 | est.funs <- function() { | |
| 783 | 104x | list( | 
| 784 | 104x | gam = getGAMest, | 
| 785 | 104x | cgam = getCGAMest, | 
| 786 | 104x | mspline = getMSPLINEest, | 
| 787 | 104x | binned = getBINNEDest, | 
| 788 | 104x | pava = getPAVAest, | 
| 789 | 104x | asis = getASISest | 
| 790 | ) | |
| 791 | } | |
| 792 | ||
| 793 | # Generic and S3 methods for estimations | |
| 794 | getEst <- function(x, outcome, score) { | |
| 795 | 54x |   UseMethod("getEst") | 
| 796 | } | |
| 797 | ||
| 798 | ||
| 799 | getEst.list <- function(x, outcome, score) { | |
| 800 | 14x | do.call( | 
| 801 | 14x | est.funs()[[x[["method"]]]], | 
| 802 | 14x | append(list(outcome = outcome, score = score), x[names(x) != "method"]) | 
| 803 | ) | |
| 804 | } | |
| 805 | ||
| 806 | ||
| 807 | getEst.character <- function(x, outcome, score) { | |
| 808 | 36x | do.call( | 
| 809 | 36x | est.funs()[[x]], | 
| 810 | 36x | list(outcome = outcome, score = score) | 
| 811 | ) | |
| 812 | } | |
| 813 | ||
| 814 | ||
| 815 | getEst.function <- function(x, outcome, score) { | |
| 816 | ||
| 817 | # Run estimation | |
| 818 | 4x | out <- x(outcome = outcome, score = score) | 
| 819 | ||
| 820 | # Check output | |
| 821 | 4x | check1 <- is.data.frame(out) && | 
| 822 | 4x |     identical(c("estimate", "outcome", "percentile", "score"), sort(colnames(out))) | 
| 823 | 4x |   if (!check1) { | 
| 824 | 1x | stop( | 
| 825 | 1x | paste( | 
| 826 | 1x | "User defined estimation functions must return a data.frame of 4 columns:", | 
| 827 | 1x | "score - the predictions,", | 
| 828 | 1x | "percentile - the percentile of score,", | 
| 829 | 1x | "outcome - the original outcome,", | 
| 830 | 1x | "and estimate - the estimated value" | 
| 831 | ) | |
| 832 | ) | |
| 833 | } | |
| 834 |  | |
| 835 | 3x | check2 <- vapply(out, is.numeric, FUN.VALUE = logical(1), USE.NAMES = TRUE) | 
| 836 | 3x |   if (!all(check2)) { | 
| 837 | 1x | stop( | 
| 838 | 1x | paste( | 
| 839 | 1x | "All columns of the returned data.frame in user defined estimation function", | 
| 840 | 1x | "must be numeric.", | 
| 841 | 1x |         paste0("`", names(which(!check2)), "`", collapse = ", "), "is/are not numeric." | 
| 842 | ) | |
| 843 | ) | |
| 844 | } | |
| 845 | ||
| 846 | 2x | return(out) | 
| 847 | } | |
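||
| | # Sketch of a user-defined estimation function that satisfies the contract | |
| | # checked above (hypothetical example, not part of the package): | |
| | #   myEst <- function(outcome, score) { | |
| | #     data.frame( | |
| | #       score = score, | |
| | #       percentile = ecdf(score)(score), | |
| | #       outcome = outcome, | |
| | #       estimate = fitted(glm(outcome ~ score, family = "binomial")) | |
| | #     ) | |
| | #   } | |
| | # Such a function can then be supplied in `methods`, e.g. methods = list(lr = myEst). | |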
| 848 | ||
| 849 | # Generic and S3 methods for estimation method as string (actual estimation function used) | |
| 850 | getEstMethod <- function(x) { | |
| 851 | 67x |   UseMethod("getEstMethod") | 
| 852 | } | |
| 853 | ||
| 854 | #' @export | |
| 855 | getEstMethod.character <- function(x) { | |
| 856 | 43x | x | 
| 857 | } | |
| 858 | ||
| 859 | #' @export | |
| 860 | getEstMethod.list <- function(x) { | |
| 861 | 19x | x[["method"]] | 
| 862 | } | |
| 863 | ||
| 864 | #' @export | |
| 865 | getEstMethod.function <- function(x) { | |
| 866 | 5x | "udf" | 
| 867 | } | |
| 868 | ||
| 869 | getEstMethods <- function(x, with.names) { | |
| 870 | 48x | vapply(x, getEstMethod, FUN.VALUE = character(1), USE.NAMES = with.names) | 
| 871 | } | 
| 1 | #' Calibration plot | |
| 2 | #' | |
| 3 | #' Calibration curve risk estimates | |
| 4 | #' | |
| 5 | #' @inheritParams riskProfile | |
| 6 | #' @param methods Character vector of method names (case-insensitive) for plotting curves or | |
| 7 | #' a named list where elements are method function and its arguments. | |
| 8 | #' Default is set to `list(gam = list(method = "gam", fitonPerc = FALSE))`. | |
| 9 | #' | |
| 10 | #' Full options are: `c("binned", "pava", "mspline", "gam", "cgam")`. | |
| 11 | #' | |
| 12 | #' To specify arguments per method, use lists. For example: | |
| 13 | #' ``` | |
| 14 | #' list( | |
| 15 | #' pava = list(method = "pava", ties = "primary"), | |
| 16 | #' mspline = list(method = "mspline", fitonPerc = TRUE), | |
| 17 | #' gam = list(method = "gam", bs = "tp", logscores = FALSE), | |
| 18 | #' bin = list(method = "binned", bins = 10) | |
| 19 | #' ) | |
| 20 | #' ``` | |
| 21 | #' See section "Estimation" for more details. | |
| 22 | #' @param include Character vector (case-insensitive, partial matching) or `NULL` specifying | |
| 23 | #' what quantities to include in the plot. | |
| 24 | #' | |
| 25 | #' Default is: `c("loess", "citl")`. | |
| 26 | #' | |
| 27 | #' Full options are: `c("loess", "citl", "rug", "datapoints")` or `NULL`. | |
| 28 | #' "loess" adds a Loess fit, "citl" stands for "Calibration in the large", | |
| 29 | #' "rug" adds rug ticks of `score` by `outcome` (top x-axis: `score` for `outcome == 1`, | |
| 30 | #' bottom x-axis: `score` for `outcome == 0`), | |
| 31 | #' "datapoints" adds jittered `score` by `outcome` (slightly shifted away from 0 / 1 y-values), | |
| 32 | #' "`NULL`" stands for no extra information. | |
| 33 | #' @param plot.raw Logical; show raw `score` values (`TRUE`) or score percentiles (`FALSE`). | |
| 34 | #' Defaults to `TRUE` (i.e. raw `score`). | |
| 35 | #' @param rev.order Logical to reverse ordering of scores. Defaults to `FALSE`. | |
| 36 | #' @param margin.type Type of additional margin plot, can be one of | |
| 37 | #' `c("density", "histogram", "boxplot", "violin", "densigram")`. | |
| 38 | #' See [ggExtra::ggMarginal()] for more details. | |
| 39 | #' @param ... Additional arguments passed to [ggExtra::ggMarginal()]. | |
| 40 | #' | |
| 41 | #' @inheritSection riskProfile Estimation | |
| 42 | #' | |
| 43 | #' @return A list containing the plot, the plotting data, and the calibration-in-the-large (`citl`) data. | |
| 44 | #' | |
| 45 | #' @export | |
| 46 | #' | |
| 47 | #' @seealso [riskProfile()] [sensSpec()] | |
| 48 | #' | |
| 49 | #' [getPAVAest()] [getBINNEDest()] [getGAMest()] [getCGAMest()] [getMSPLINEest()] | |
| 50 | #' [getASISest()] | |
| 51 | #' | |
| 52 | #' @examples | |
| 53 | #' # Read in example data | |
| 54 | #' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc")) | |
| 55 | #' rscore <- auroc$predicted_calibrated | |
| 56 | #' truth <- as.numeric(auroc$actual) | |
| 57 | #' | |
| 58 | #' # Default calibration plot | |
| 59 | #' p1 <- calibrationProfile(outcome = truth, score = rscore) | |
| 60 | #' p1$plot | |
| 61 | #' | |
| 62 | #' # Specifying multiple estimation methods | |
| 63 | #' # By default, all the methods fit on percentiles | |
| 64 | #' calibrationProfile( | |
| 65 | #' outcome = truth, | |
| 66 | #' score = rscore, | |
| 67 | #'   methods = c("gam", "mspline", "binned") | |
| 68 | #' )$plot | |
| 69 | #' | |
| 70 | #' # Specifying multiple estimation methods with parameters | |
| 71 | #' calibrationProfile( | |
| 72 | #' outcome = truth, | |
| 73 | #' score = rscore, | |
| 74 | #' methods = list( | |
| 75 | #' gam = list(method = "gam", fitonPerc = FALSE, k = 3), | |
| 76 | #' mspline = list(method = "mspline"), | |
| 77 | #' bin = list(method = "binned", quantiles = 5) | |
| 78 | #' ) | |
| 79 | #' )$plot | |
| 80 | #' | |
| 81 | #' # Additional quantities and marginal histogram with specified number of bins | |
| 82 | #' calibrationProfile( | |
| 83 | #' outcome = truth, | |
| 84 | #' score = rscore, | |
| 85 | #'   include = c("rug", "datapoints", "citl"), | |
| 86 | #'   # or use partial matching: include = c("r", "d", "c"), | |
| 87 | #' margin.type = "histogram", | |
| 88 | #' bins = 100 # passed to ggExtra::ggMarginal | |
| 89 | #' )$plot | |
| 90 | #' | |
| 91 | calibrationProfile <- function(outcome, | |
| 92 | score, | |
| 93 | methods = list(gam = list(method = "gam", fitonPerc = FALSE)), | |
| 94 |                                include = c("loess", "citl"), | |
| 95 | plot.raw = TRUE, | |
| 96 | rev.order = FALSE, | |
| 97 | margin.type = NULL, | |
| 98 |                                ...) { | |
| 99 | ||
| 100 | # Argument checks (except outcome, score, methods - below) | |
| 101 | 10x | checkmate::assert( | 
| 102 | 10x | checkmate::check_character(include), | 
| 103 | 10x | checkmate::check_null(include) | 
| 104 | ) | |
| 105 | 10x |   if (is.character(include)) { | 
| 106 | 9x |     include <- matchArgSubset(tolower(include), choices = c("loess", "citl", "rug", "datapoints")) | 
| 107 | } | |
| 108 | 10x | checkmate::assert_flag(plot.raw) | 
| 109 | 10x | checkmate::assert_flag(rev.order) | 
| 110 | 10x | checkmate::assert( | 
| 111 | 10x | checkmate::check_character(margin.type, len = 1, any.missing = FALSE), | 
| 112 | 10x | checkmate::check_null(margin.type) | 
| 113 | ) | |
| 114 | ||
| 115 | 10x |   if (plot.raw) { | 
| 116 | 9x | xvar <- "score" | 
| 117 |   } else { | |
| 118 | 1x | xvar <- "percentile" | 
| 119 | } | |
| 120 | ||
| 121 | # Standardize/Check outcome, scores | |
| 122 | 10x | op <- inputCheck(outcome = outcome, score = score) | 
| 123 | ||
| 124 | # Order Data by scores | |
| 125 | 10x | tempdf <- orderInputs(outcome = op$outcome, score = op$score, rev.order = rev.order) | 
| 126 | 10x | score <- tempdf$score | 
| 127 | 10x | outcome <- tempdf$outcome | 
| 128 | ||
| 129 | # Check methods | |
| 130 | 10x | methods <- methodCheck(methods = methods) | 
| 131 | 10x | method.names <- names(methods) | 
| 132 | 10x |   if ("asis" %in% getEstMethods(methods, with.names = FALSE)) { | 
| 133 | 1x |     stop('"asis" method is not suitable for this plot. Please remove it.') | 
| 134 | } | |
| 135 | ||
| 136 | # Get estimates | |
| 137 | 9x | pc.ests <- getEsts(methods = methods, outcome = outcome, score = score) | 
| 138 | 9x | PC.data <- pc.ests$plotdata | 
| 139 | 9x | step.methods <- method.names[pc.ests$idx.step] | 
| 140 | ||
| 141 | # Calculate percentiles | |
| 142 | 9x | ecdf.score <- ecdf(score) | 
| 143 | 9x | percentile <- ecdf.score(score) | 
| 144 | ||
| 145 | # Calculate Calibration in the large | |
| 146 | 9x | citl.data <- data.frame( | 
| 147 | 9x | outcome = mean(outcome), | 
| 148 | 9x | score = mean(score), | 
| 149 | 9x | percentile = ecdf.score(mean(score)), | 
| 150 | 9x | method = "Calibration In The Large" | 
| 151 | ) | |
| 152 | ||
| 153 | # Subset PC data for plotting | |
| 154 | 9x | smoothPC <- PC.data[!PC.data$method %in% step.methods, , drop = FALSE] | 
| 155 | 9x | stepPC <- PC.data[PC.data$method %in% step.methods, , drop = FALSE] | 
| 156 | ||
| 157 | # data.frame with user inputs | |
| 158 | 9x | ddf <- data.frame(score, percentile, outcome) | 
| 159 | ||
| 160 | # Shape type storage | |
| 161 | 9x | shapes <- c() | 
| 162 | ||
| 163 | # Add empty scatterplot layer for ggMarginal | |
| 164 | 9x |   if (!is.null(margin.type)) { | 
| 165 | 1x | p <- ggplot() + | 
| 166 | 1x | geom_point( | 
| 167 | 1x | data = ddf, | 
| 168 | 1x | aes(x = .data[[xvar]], y = .data$outcome), shape = NA, na.rm = TRUE | 
| 169 | ) | |
| 170 |   } else { | |
| 171 | 8x | p <- ggplot() | 
| 172 | } | |
| 173 | ||
| 174 | # Build plot | |
| 175 | 9x | p <- p + | 
| 176 | 9x | geom_line( | 
| 177 | 9x | data = smoothPC, | 
| 178 | 9x | aes(x = .data[[xvar]], y = .data$estimate, linetype = .data$method, colour = .data$method), | 
| 179 | 9x | alpha = 0.8, | 
| 180 | 9x | linewidth = 0.5 | 
| 181 | ) + | |
| 182 | 9x | geom_step( | 
| 183 | 9x | data = stepPC, | 
| 184 | 9x | aes(x = .data[[xvar]], y = .data$estimate, linetype = .data$method, colour = .data$method), | 
| 185 | 9x | direction = "vh", | 
| 186 | 9x | alpha = 0.8, | 
| 187 | 9x | linewidth = 0.5 | 
| 188 | ) + | |
| 189 | 9x | geom_abline( | 
| 190 | 9x | aes(slope = 1, intercept = 0, linewidth = "Identity line"), | 
| 191 | 9x | colour = "gray50", | 
| 192 | 9x | linetype = "solid" | 
| 193 | ) | |
| 194 | ||
| 195 | # Add loess if requested | |
| 196 | 9x |   if ("loess" %in% include) { | 
| 197 | 8x | p <- p + geom_smooth( | 
| 198 | 8x | data = ddf, | 
| 199 | 8x | aes(x = .data[[xvar]], y = .data$outcome, linetype = "loess", colour = "loess"), | 
| 200 | 8x | method = "loess", formula = y ~ x, se = FALSE, | 
| 201 | 8x | linewidth = 0.5 | 
| 202 | ) | |
| 203 | } | |
| 204 | ||
| 205 | # Add calibration in the large if requested | |
| 206 | 9x |   if ("citl" %in% include) { | 
| 207 | 8x | p <- p + geom_point( | 
| 208 | 8x | data = citl.data, | 
| 209 | 8x | aes(x = .data[[xvar]], y = .data$outcome, shape = .data$method), | 
| 210 | 8x | colour = "red", | 
| 211 | 8x | size = 3, | 
| 212 | 8x | stroke = 1 | 
| 213 | ) | |
| 214 | 8x |     shapes <- c(shapes, c("Calibration In The Large" = 4)) | 
| 215 | } | |
| 216 | ||
| 217 | # Add datapoints if requested | |
| 218 | 9x |   if ("datapoints" %in% include) { | 
| 219 | 1x | p <- p + geom_jitter( | 
| 220 | 1x | data = data.frame( | 
| 221 | 1x | score, | 
| 222 | 1x | percentile, | 
| 223 | 1x | outcome = ifelse(outcome == 0, -0.1, 1.1), | 
| 224 | 1x | method = "Data points" | 
| 225 | ), | |
| 226 | 1x | aes(x = .data[[xvar]], y = .data$outcome, shape = .data$method), | 
| 227 | 1x | colour = "black", | 
| 228 | 1x | size = 1.5, | 
| 229 | 1x | alpha = 0.4, | 
| 230 | 1x | position = position_jitter(seed = 5, height = 0.03) | 
| 231 | ) | |
| 232 | 1x |     shapes <- c(shapes, c("Data points" = 16)) | 
| 233 | } | |
| 234 | ||
| 235 | # Add rug if requested | |
| 236 | 9x |   if ("rug" %in% include) { | 
| 237 | 1x | p <- p + geom_rug( | 
| 238 | 1x | data = ddf[ddf$outcome == 0, ], | 
| 239 | 1x | aes(x = .data[[xvar]]), | 
| 240 | 1x | sides = "b", | 
| 241 | 1x | show.legend = FALSE | 
| 242 | 1x | ) + geom_rug( | 
| 243 | 1x | data = ddf[ddf$outcome == 1, ], | 
| 244 | 1x | aes(x = .data[[xvar]]), | 
| 245 | 1x | sides = "t", | 
| 246 | 1x | show.legend = FALSE | 
| 247 | ) | |
| 248 | } | |
| 249 | ||
| 250 | # Fix shape legend ... | |
| 251 | 9x |   if (all(c("citl", "datapoints") %in% include)) { | 
| 252 | 1x | shape_guide <- guide_legend( | 
| 253 | 1x | override.aes = list( | 
| 254 | 1x |         colour = c("Calibration In The Large" = "red", "Data points" = "black"), | 
| 255 | 1x | alpha = 1, size = 2.5 | 
| 256 | ), | |
| 257 | 1x | order = 3 | 
| 258 | ) | |
| 259 | 8x |   } else if (any(c("citl", "datapoints") %in% include)) { | 
| 260 | 7x | shape_guide <- guide_legend( | 
| 261 | 7x | override.aes = list(alpha = 1, size = 2.5), | 
| 262 | 7x | order = 3 | 
| 263 | ) | |
| 264 |   } else { | |
| 265 | 1x | shape_guide <- NULL | 
| 266 | } | |
| 267 | ||
| 268 | # Finalize graph | |
| 269 | 9x | p <- p + | 
| 270 | 9x |     scale_linewidth_manual(values = c("Identity line" = 0.5)) + | 
| 271 | 9x | scale_shape_manual(values = shapes) + | 
| 272 | 9x | scale_colour_hue(l = 45) + | 
| 273 | 9x | scale_x_continuous(n.breaks = 6) + | 
| 274 | 9x | scale_y_continuous(n.breaks = 6) + | 
| 275 | 9x | labs( | 
| 276 | 9x | title = "Calibration Plot", | 
| 277 | 9x | x = "Predicted Probability", | 
| 278 | 9x | y = "Observed", | 
| 279 | 9x | linetype = "Estimation Method", | 
| 280 | 9x | linewidth = NULL, | 
| 281 | 9x | colour = "Estimation Method", | 
| 282 | 9x |       shape = `if`(all(c("citl", "datapoints") %in% include), "Points", NULL) | 
| 283 | ) + | |
| 284 | 9x | theme_bw() + | 
| 285 | 9x | theme(legend.key.width = unit(2, "line")) + | 
| 286 | 9x | guides( | 
| 287 | 9x | linetype = guide_legend(order = 1), | 
| 288 | 9x | colour = guide_legend(order = 1), | 
| 289 | 9x | linewidth = guide_legend(order = 2), | 
| 290 | 9x | shape = shape_guide | 
| 291 | ) | |
| 292 | ||
| 293 | # Add margin plot | |
| 294 | 9x |   if (!is.null(margin.type)) { | 
| 295 | 1x | p <- ggExtra::ggMarginal(p, type = margin.type, margins = "x", ...) | 
| 296 | } | |
| 297 | ||
| 298 | 9x | return(list(plot = p, data = dplyr::as_tibble(PC.data), citl = citl.data)) | 
| 299 | } | 
| 1 | #' Sensitivity and specificity plot | |
| 2 | #' | |
| 3 | #' Sensitivity and specificity risk estimates | |
| 4 | #' | |
| 5 | #' Given individual binary outcomes and scores, this function plots sensitivity and specificity | |
| 6 | #' (using each score value as a cutoff) against the corresponding score percentiles. | |
| 7 | #' | |
| 8 | #' @inheritParams riskProfile | |
| 9 | #' @param show.best Logical; Include best possible sensitivity and specificity? Defaults to `TRUE`. | |
| 10 | #' | |
| 11 | #' @inheritSection riskProfile Estimation | |
| 12 | #' | |
| 13 | #' @return A list containing the plot and data. | |
| 14 | #' | |
| 15 | #' @export | |
| 16 | #' | |
| 17 | #' @seealso [riskProfile()] [calibrationProfile()] | |
| 18 | #' | |
| 19 | #' [getPAVAest()] [getBINNEDest()] [getGAMest()] [getCGAMest()] [getMSPLINEest()] | |
| 20 | #' [getASISest()] | |
| 21 | #' | |
| 22 | #' @examples | |
| 23 | #' # Read in example data | |
| 24 | #' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc")) | |
| 25 | #' rscore <- auroc$predicted_calibrated | |
| 26 | #' truth <- as.numeric(auroc$actual) | |
| 27 | #' | |
| 28 | #' # Plot sensitivity and specificity | |
| 29 | #' p1 <- sensSpec(outcome = truth, score = rscore) | |
| 30 | #' p1$plot | |
| 31 | #' | |
| 32 | #' # Same with smoothed estimates | |
| 33 | #' p2 <- sensSpec(outcome = truth, score = rscore, methods = c("asis", "gam")) | |
| 34 | #' p2$plot | |
| 35 | #' | |
| 36 | sensSpec <- function(outcome, | |
| 37 | score, | |
| 38 | methods = "asis", | |
| 39 | show.best = TRUE, | |
| 40 | plot.raw = FALSE, | |
| 41 |                      rev.order = FALSE) { | |
| 42 | ||
| 43 | # Argument checks | |
| 44 | 7x | checkmate::assert_flag(show.best) | 
| 45 | 7x | checkmate::assert_flag(plot.raw) | 
| 46 | 7x | checkmate::assert_flag(rev.order) | 
| 47 | ||
| 48 | # Check methods | |
| 49 | 7x | methods <- methodCheck(methods = methods) | 
| 50 | ||
| 51 | 7x |   if (plot.raw) { | 
| 52 | 2x | xvar <- "score" | 
| 53 |   } else { | |
| 54 | 5x | xvar <- "percentile" | 
| 55 | } | |
| 56 | ||
| 57 | # Standardize/Check outcome, scores | |
| 58 | 7x | op <- inputCheck(outcome = outcome, score = score) | 
| 59 | ||
| 60 | # Order Data by scores | |
| 61 | 7x | tempdf <- orderInputs(outcome = op$outcome, score = op$score, rev.order = rev.order) | 
| 62 | 7x | score <- tempdf$score | 
| 63 | 7x | outcome <- tempdf$outcome | 
| 64 | ||
| 65 | # Calculate spec and sens | |
| 66 | 7x | dat <- getEsts(outcome = outcome, score = score, methods = methods)$plotdata | 
| 67 | 7x | dat <- split(dat, dat$method) %>% | 
| 68 | 7x | lapply(\(d) nonParametricTR(outcome = d$outcome, score = d$estimate)) %>% | 
| 69 | 7x | bind_rows(.id = "method") | 
| 70 |  | |
| 71 | 7x |   if (!plot.raw) { | 
| 72 | 5x | dat <- add0thPercTR(dat) | 
| 73 | } | |
| 74 | ||
| 75 | # Plot | |
| 76 | 7x | p <- ggplot(dat) + | 
| 77 | 7x | geom_step( | 
| 78 | 7x | aes(x = .data[[xvar]], y = .data$value, colour = .data$method, linetype = .data$pf), | 
| 79 | 7x | direction = "hv" | 
| 80 | ) | |
| 81 | ||
| 82 | # Add best possible sensitivity and specificity | |
| 83 | 7x |   if (show.best) { | 
| 84 | 7x | prev <- mean(outcome) | 
| 85 | 7x |     if (plot.raw) { | 
| 86 | 2x | best <- data.frame(percentile = ecdf(score)(score), score = score) | 
| 87 |     } else { | |
| 88 | 5x | best <- data.frame(percentile = c(0, ecdf(score)(score)), score = c(NA, score)) | 
| 89 | } | |
| 90 | 7x | best <- best %>% | 
| 91 | 7x | distinct() %>% | 
| 92 | 7x | mutate( | 
| 93 | 7x | Sensitivity = bestSens(perc = .data$percentile, prev = prev), | 
| 94 | 7x | Specificity = bestSpec(perc = .data$percentile, prev = prev) | 
| 95 | ) %>% | |
| 96 | 7x | tidyr::pivot_longer( | 
| 97 | 7x |         cols = c("Sensitivity", "Specificity"), names_to = "pf", values_to = "value" | 
| 98 | ) %>% | |
| 99 | 7x | mutate(method = "best") | 
| 100 | 7x | p <- p + | 
| 101 | 7x | geom_line( | 
| 102 | 7x | aes(x = .data[[xvar]], y = .data$value, linetype = .data$pf, linewidth = "Best Possible"), | 
| 103 | 7x | data = best, | 
| 104 | 7x | colour = "gray70" | 
| 105 | ) + | |
| 106 | 7x |       scale_linewidth_manual(values = c("Best Possible" = 0.5), name = NULL) | 
| 107 | } | |
| 108 | ||
| 109 | # Finalize plot | |
| 110 | 7x | p <- p + | 
| 111 | 7x | labs( | 
| 112 | 7x | title = "Sensitivity and Specificity Plot", | 
| 113 | 7x | x = ifelse(plot.raw, "Prediction Score", "Risk Percentile"), | 
| 114 | 7x | y = "True Positive / Negative Rate", | 
| 115 | 7x | colour = "Estimation Method", | 
| 116 | 7x | linetype = "Predictive Quantity" | 
| 117 | ) + | |
| 118 | 7x | scale_x_continuous(n.breaks = 6) + | 
| 119 | 7x | scale_y_continuous(n.breaks = 6) + | 
| 120 | 7x |     scale_linetype_manual(values = c("Sensitivity" = "solid", "Specificity" = "dashed")) + | 
| 121 | 7x | scale_colour_hue(l = 45) + | 
| 122 | 7x | theme_bw() + | 
| 123 | 7x | theme(legend.key.width = unit(2, "line")) + | 
| 124 | 7x | guides( | 
| 125 | 7x | linetype = guide_legend(order = 1, override.aes = list(colour = "black")), | 
| 126 | 7x | colour = guide_legend(order = 2), | 
| 127 | 7x | linewidth = guide_legend(order = 3) | 
| 128 | ) | |
| 129 | ||
| 130 | 7x |   if (show.best) { | 
| 131 | 7x | dat <- bind_rows(dat, best) | 
| 132 | } | |
| 133 | ||
| 134 | 7x | return(list(plot = p, data = dplyr::as_tibble(dat))) | 
| 135 | } | 
| 1 | ||
| 2 | # Calculate Predictive Value (PV) Estimates | |
| 3 | getPVdata <- function(methods, outcome, score, pc.ests = NULL) { | |
| 4 | ||
| 5 | # Calculate estimates if not provided | |
| 6 | 16x |   if (is.null(pc.ests)) { | 
| 7 | 5x | pc.ests <- getEsts(outcome = outcome, score = score, methods = methods) | 
| 8 | 5x | pc.data <- pc.ests$plotdata | 
| 9 |   } else { | |
| 10 | 11x | pc.data <- pc.ests$plotdata | 
| 11 | } | |
| 12 | ||
| 13 | # locf and zip for pava and/or binned data | |
| 14 | 16x |   if (any(pc.ests$idx.step)) { | 
| 15 | 4x | name.binned <- names(methods)[pc.ests$idx.binned] | 
| 16 | 4x | name.pava <- names(methods)[pc.ests$idx.pava] | 
| 17 | 4x | pc.data <- locf(pc.data, method.binned = name.binned, method.pava = name.pava) | 
| 18 | } | |
| 19 | ||
| 20 | # Calculate PV | |
| 21 | 16x | PV.data <- parametricPV(pc.data = pc.data) | 
| 22 | ||
| 23 | 16x | return(as.data.frame(PV.data)) | 
| 24 | } | |
| 25 | ||
| 26 | ||
| 27 | # Calculates PV values from cutoff thresholds | |
| 28 | nonParametricPV <- function(outcome, score) { | |
| 29 | ||
| 30 | 11x | prev <- mean(outcome, na.rm = TRUE) | 
| 31 | ||
| 32 | 11x | thresh.predictions <- lapply(score, function(x) as.numeric(score > x)) | 
| 33 | ||
| 34 | 11x | ppv <- vapply( | 
| 35 | 11x | thresh.predictions, | 
| 36 | 11x |     function(x) { | 
| 37 | 440x | tp <- sum(outcome == 1 & x == 1) | 
| 38 | 440x | fp <- sum(outcome == 0 & x == 1) | 
| 39 | 440x | tp / (tp + fp) | 
| 40 | }, | |
| 41 | 11x | numeric(1) | 
| 42 | ) | |
| 43 | ||
| 44 | 11x | npv <- vapply( | 
| 45 | 11x | thresh.predictions, | 
| 46 | 11x |     function(x) { | 
| 47 | 440x | tn <- sum(outcome == 0 & x == 0) | 
| 48 | 440x | fn <- sum(outcome == 1 & x == 0) | 
| 49 | 440x | tn / (tn + fn) | 
| 50 | }, | |
| 51 | 11x | numeric(1) | 
| 52 | ) | |
| 53 | ||
| 54 | 11x | threshold.data <- data.frame( | 
| 55 | 11x | score = score, | 
| 56 | 11x | percentile = ecdf(score)(score), | 
| 57 | 11x | PPV = ppv, | 
| 58 | 11x | NPV = npv | 
| 59 | ) %>% | |
| 60 | 11x | mutate(MNPV = 1 - .data$NPV) %>% | 
| 61 | 11x | tidyr::fill( | 
| 62 | 11x |       all_of(c("PPV", "NPV", "MNPV")), | 
| 63 | 11x | .direction = "downup" | 
| 64 | ) | |
| 65 | ||
| 66 | 11x | return(threshold.data) | 
| 67 | } | |
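||
| | # Quick illustrative check (hypothetical values): with outcome = c(0, 0, 1, 1) and | |
| | # score = c(.1, .2, .8, .9), the cutoff at .2 labels the two highest scores as | |
| | # positive, giving PPV = 1 and NPV = 1 at that threshold. | |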
| 68 | ||
| 69 | ||
| 70 | # Calculates PV values based on pracma::cumtrapz | |
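| | # For a risk curve r(p) over score percentiles p, the code below uses the identities | |
| | #   MNPV(p) = (1/p) * integral_0^p r(u) du         (mean risk at or below the cutoff) | |
| | #   NPV(p)  = 1 - MNPV(p) | |
| | #   PPV(p)  = (integral_p^1 r(u) du) / (1 - p)     (mean risk above the cutoff) | |
| | # with the integrals approximated on the observed grid via pracma::cumtrapz. | |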
| 71 | parametricPV <- function(pc.data) { | |
| 72 | ||
| 73 | # Calculate PVs | |
| 74 | 16x | PV.data <- pc.data %>% | 
| 75 | 16x | group_by(.data$method) %>% | 
| 76 | 16x | mutate( | 
| 77 | 16x | MNPV = pracma::cumtrapz(.data$percentile, .data$estimate)[, 1] / .data$percentile, | 
| 78 | 16x | NPV = 1 - .data$MNPV, | 
| 79 | 16x | PPV = | 
| 80 | ( | |
| 81 | 16x | max(pracma::cumtrapz(.data$percentile, .data$estimate), na.rm = TRUE) - | 
| 82 | 16x | pracma::cumtrapz(.data$percentile, .data$estimate)[, 1] | 
| 83 | 16x | ) / (max(.data$percentile) - .data$percentile) | 
| 84 | ) %>% | |
| 85 | 16x | ungroup() %>% | 
| 86 | 16x | arrange(.data$method, .data$percentile) | 
| 87 | ||
| 88 | # Fill NAs (starting points) | |
| 89 | 16x | PV.data <- PV.data %>% | 
| 90 | 16x | group_by(.data$method) %>% | 
| 91 | 16x | mutate( | 
| 92 | 16x | MNPV = ifelse(.data$percentile == min(.data$percentile) & is.na(.data$MNPV), 0, .data$MNPV), | 
| 93 | 16x | NPV = ifelse(.data$percentile == min(.data$percentile) & is.na(.data$NPV), 1, .data$NPV) | 
| 94 | ) %>% | |
| 95 | 16x | as.data.frame() | 
| 96 | ||
| 97 | # Fill NAs (ending points) | |
| 98 | 16x | PV.data <- tidyr::fill( | 
| 99 | 16x | PV.data, | 
| 100 | 16x |     all_of(c("estimate", "MNPV", "NPV", "PPV")), | 
| 101 | 16x | .direction = "down" | 
| 102 | ) | |
| 103 | ||
| 104 | 16x | return(as.data.frame(PV.data)) | 
| 105 | } | |
| 106 | ||
| 107 | ||
| 108 | # consistent colors for PVs | |
| 109 | predictionColours <- function(x, show.best) { | |
| 110 | 13x | clrs <- c( | 
| 111 | 13x | "Best NPV" = "gray80", | 
| 112 | 13x | "NPV" = "#53B400", | 
| 113 | 13x | "Best PPV" = "gray65", | 
| 114 | 13x | "PPV" = "red", | 
| 115 | 13x | "Best PC" = "black", | 
| 116 | 13x | "PC" = "plum", | 
| 117 | 13x | "1-NPV" = "royalblue2", | 
| 118 | 13x | "Best 1-NPV" = "gray50" | 
| 119 | ) | |
| 120 | 13x |   if (show.best) { | 
| 121 | 10x |     x <- c(x, paste("Best", x)) | 
| 122 | } | |
| 123 | 13x | return(clrs[names(clrs) %in% x]) # keep the ordering above | 
| 124 | } | |
| 125 | ||
| 126 | ||
| 127 | # Calculates sensitivity and specificity (true positive / negative rate) | |
| 128 | nonParametricTR <- function(outcome, score) { | |
| 129 | ||
| 130 | # thresh.predictions is a list of 0/1 vectors, one per score value used as a cutpoint | |
| 131 | 9x | thresh.predictions <- lapply(score, function(x) as.numeric(score > x)) | 
| 132 | ||
| 133 | # Calc sensitivities and specificities at each risk percentile threshold | |
| 134 | 9x | senses <- vapply( | 
| 135 | 9x | thresh.predictions, | 
| 136 | 9x |     function(x) { | 
| 137 | 360x | sum(outcome == 1 & x == 1) / sum(outcome == 1) | 
| 138 | }, | |
| 139 | 9x | numeric(1) | 
| 140 | ) | |
| 141 | ||
| 142 | 9x | specs <- vapply( | 
| 143 | 9x | thresh.predictions, | 
| 144 | 9x |     function(x) { | 
| 145 | 360x | sum(outcome == 0 & x == 0) / sum(outcome == 0) | 
| 146 | }, | |
| 147 | 9x | numeric(1) | 
| 148 | ) | |
| 149 | ||
| 150 | # Create a data.frame | |
| 151 | 9x | dat <- data.frame( | 
| 152 | 9x | score = score, | 
| 153 | 9x | percentile = ecdf(score)(score), | 
| 154 | 9x | Sensitivity = senses, | 
| 155 | 9x | Specificity = specs | 
| 156 | ) %>% | |
| 157 | 9x | tidyr::pivot_longer( | 
| 158 | 9x |       cols = c("Sensitivity", "Specificity"), | 
| 159 | 9x | names_to = "pf", | 
| 160 | 9x | values_to = "value" | 
| 161 | ) | |
| 162 | ||
| 163 | 9x | return(as.data.frame(dat)) | 
| 164 | } | |
| 165 | ||
| 166 | ||
| 167 | ||
| 168 | adjPrevPerc <- function(perc, prev.new, cdf.case, cdf.control) { | |
| 169 | 4x | prev.new * cdf.case(perc) + (1 - prev.new) * cdf.control(perc) | 
| 170 | } | |
| 171 | ||
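| | # adjPrevEst rescales a risk estimate from the observed prevalence `prev` to a | |
| | # target prevalence `prev.new` via Bayes' rule on the odds scale: | |
| | #   odds.new = odds(x) * (prev.new / (1 - prev.new)) / (prev / (1 - prev)) | |
| | # and then maps odds.new back to a probability, which is what the expression below does. | |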
| 172 | adjPrevEst <- function(x, prev, prev.new) { | |
| 173 | 5x | f2 <- prev / (1 - prev) | 
| 174 | 5x | f3 <- (1 - prev.new) / prev.new | 
| 175 | 5x | 1 / (((1 / x) - 1) * f2 * f3 + 1) | 
| 176 | } | |
| 177 | ||
| 178 | adjPrevPC <- function(dat, prev, prev.new, cdf.case, cdf.control) { | |
| 179 | 1x | mutate( | 
| 180 | 1x | dat, | 
| 181 | 1x | percentile = adjPrevPerc( | 
| 182 | 1x | perc = .data$score, prev.new = prev.new, | 
| 183 | 1x | cdf.case = cdf.case, cdf.control = cdf.control | 
| 184 | ), | |
| 185 | 1x | estimate = adjPrevEst(x = .data$estimate, prev = prev, prev.new = prev.new) | 
| 186 | ) | |
| 187 | } | |
| 188 | ||
| 189 | adjPrevPV <- function(dat, prev, prev.new, cdf.case, cdf.control) { | |
| 190 | 1x | mutate( | 
| 191 | 1x | dat, | 
| 192 | 1x | percentile = adjPrevPerc( | 
| 193 | 1x | perc = .data$score, prev.new = prev.new, | 
| 194 | 1x | cdf.case = cdf.case, cdf.control = cdf.control | 
| 195 | ), | |
| 196 | 1x | MNPV = adjPrevEst(x = .data$MNPV, prev = prev, prev.new = prev.new), | 
| 197 | 1x | NPV = 1 - .data$MNPV, | 
| 198 | 1x | PPV = adjPrevEst(x = .data$PPV, prev = prev, prev.new = prev.new) | 
| 199 | ) | |
| 200 | } | |
| 201 | ||
| 202 | ||
| 203 | ||
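| | # The best-possible values below assume a perfect risk score: all controls occupy the | |
| | # lowest 1 - prev percentiles and all cases the highest prev percentiles, so the | |
| | # ideal PPV, 1-NPV, sensitivity and specificity at a percentile cutoff `perc` | |
| | # follow directly from that ordering. | |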
| 204 | bestPPV <- function(perc, prev) { | |
| 205 | 12x | ifelse(perc > 1 - prev, 1, prev / (1 - perc)) | 
| 206 | } | |
| 207 | ||
| 208 | bestMNPV <- function(perc, prev) { | |
| 209 | 12x | ifelse(perc <= 1 - prev, 0, 1 - ((1 - prev) / perc)) | 
| 210 | } | |
| 211 | ||
| 212 | bestSens <- function(perc, prev) { | |
| 213 | 8x | ifelse(perc <= 1 - prev, 1, (1 - perc) / prev) | 
| 214 | } | |
| 215 | ||
| 216 | bestSpec <- function(perc, prev) { | |
| 217 | 8x | ifelse(perc > 1 - prev, 1, perc / (1 - prev)) | 
| 218 | } | 
| 1 | ||
| 2 | #' zipFastener for two dataframes of equal size | |
| 3 | #' | |
| 4 | #' This function acts like a “zip fastener” for combining two dataframes. | |
| 5 | #' It takes the first row of the first data frame and places it above | |
| 6 | #' the first row of the second data frame, and so on. | |
| 7 | #' | |
| 8 | #' @param df1 dataframe 1 | |
| 9 | #' @param df2 dataframe 2 | |
| 10 | #' | |
| 11 | #' @return Zipped data.frame | |
| 12 | #' | |
| 13 | #' @keywords internal | |
| 14 | #' @noRd | |
| 15 | #' | |
| 16 | #' @examples | |
| 17 | #' df1 <- data.frame(a = 1:3, b = 1:3, c = 1:3) | |
| 18 | #' df2 <- data.frame(a = letters[1:3], b = letters[1:3], c = letters[1:3]) | |
| 19 | #' | |
| 20 | #' zipFastener(df1, df2) | |
| 21 | #' | |
| 22 | zipFastener <- function(df1, df2) { | |
| 23 | ||
| 24 | 8x |   if (ncol(df1) != ncol(df2)) { | 
| 25 | 1x |     stop("the no. of columns has to be equal to merge them by zip feeding") | 
| 26 | } | |
| 27 | ||
| 28 | 7x | d1 <- nrow(df1) | 
| 29 | 7x | d2 <- nrow(df2) | 
| 30 | ||
| 31 | 7x |   if (d1 != d2) { | 
| 32 | 1x |     stop("the no. of rows has to be equal to merge them by zip feeding") | 
| 33 | } | |
| 34 | ||
| 35 | # zip fastener preparations | |
| 36 | 6x | i1 <- 1:d1 # index vector 1 | 
| 37 | 6x | i2 <- 1:d1 + d1 # index vector 2 | 
| 38 | ||
| 39 | # zip fastener operations | |
| 40 | 6x | index <- as.vector(matrix(c(i1, i2), ncol = d1, byrow = TRUE)) | 
| 41 | 6x | index <- index[!is.na(index)] # remove NAs | 
| 42 | ||
| 43 | 6x | colnames(df2) <- colnames(df1) # keep 1st colnames | 
| 44 | 6x | res <- rbind(df1, df2)[index, ] # reorder data frame | 
| 45 | ||
| 46 | 6x | return(res) | 
| 47 | } | |
| 48 | ||
| 49 | ||
| 50 | #' last observation carried forward: for binned | |
| 51 | #' | |
| 52 | #' Function assumes columns in data frame are: percentile, estimate, method | |
| 53 | #' | |
| 54 | #' @param df dataframe containing predcurve estimates | |
| 55 | #' | |
| 56 | #' @return dataframe with adjusted points | |
| 57 | #' | |
| 58 | #' @keywords internal | |
| 59 | #' @noRd | |
| 60 | #' | |
| 61 | locf.binned <- function(df) { | |
| 62 | ||
| 63 | 4x | df.copy <- df | 
| 64 | # keep same x, slide y back one spot | |
| 65 | 4x | df.copy$estimate <- df.copy$estimate[c(2:length(df.copy$estimate), NA)] | 
| 66 | 4x | combined.df <- zipFastener(df, df.copy) | 
| 67 | 4x | combined.df <- combined.df[-nrow(combined.df), ] # removes a redundant point | 
| 68 | ||
| 69 | 4x | return(combined.df) | 
| 70 | } | |
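||
| | # Illustrative effect (hypothetical values): percentiles c(.2, .5, 1) with | |
| | # estimates c(.1, .3, .6) become the step coordinates | |
| | # (.2, .1), (.2, .3), (.5, .3), (.5, .6), (1, .6). | |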
| 71 | ||
| 72 | #' last observation carried forward: for pava | |
| 73 | #' | |
| 74 | #' Function assumes columns in data frame are: percentile, estimate, method | |
| 75 | #' | |
| 76 | #' @param df dataframe containing predcurve estimates | |
| 77 | #' | |
| 78 | #' @return dataframe with adjusted points | |
| 79 | #' | |
| 80 | #' @keywords internal | |
| 81 | #' @noRd | |
| 82 | #' | |
| 83 | locf.pava <- function(df) { | |
| 84 | ||
| 85 | # save first and last points | |
| 86 | 2x | first <- df[1, ] | 
| 87 | 2x | last <- df[nrow(df) - 1, ] | 
| 88 | ||
| 89 | # adjust data to remove extra points | |
| 90 | 2x | diff_y <- diff(c(0, df$estimate)) | 
| 91 | 2x | pava.int <- df[diff_y != 0, ] | 
| 92 | ||
| 93 | # re-add first and last points | |
| 94 | 2x | df <- rbind(first, pava.int, last) | 
| 95 | 2x | df <- df[order(df$percentile), ] # order on percentile | 
| 96 | ||
| 97 | 2x | df.copy <- df | 
| 98 | # keep same x, slide y back one spot | |
| 99 | 2x | df.copy$percentile <- df.copy$percentile[ | 
| 100 | 2x | c(2:(length(df.copy$percentile)), NA) | 
| 101 | ] | |
| 102 | 2x | combined.df <- zipFastener(df, df.copy) | 
| 103 | ||
| 104 | # Drop rows with NA introduced by the shift | |
| 105 | 2x | combined.df <- na.omit(combined.df) | 
| 106 | 2x | combined.df <- combined.df[-nrow(combined.df), ] # removes a redundant point | 
| 107 | ||
| 108 | 2x | return(combined.df) | 
| 109 | } | |
| 110 | ||
| 111 | #' Last observation carried forward for specific estimation methods | |
| 112 | #' | |
| 113 | #' Iterates locf for multiple methods: calls locf.binned and locf.pava | |
| 114 | #' | |
| 115 | #' @param df dataframe containing predcurve estimates | |
| 116 | #' @param method.binned vector of "binned" methods | |
| 117 | #' @param method.pava vector of "pava" methods | |
| 118 | #' | |
| 119 | #' @return dataframe with adjusted points | |
| 120 | #' | |
| 121 | #' @keywords internal | |
| 122 | #' @noRd | |
| 123 | #' | |
| 124 | locf <- function(df, method.binned, method.pava) { | |
| 125 | ||
| 126 | 4x | complete.data <- df[!df$method %in% c(method.binned, method.pava), ] | 
| 127 | 4x | binned.data <- df[df$method %in% method.binned, ] | 
| 128 | 4x | pava.data <- df[df$method %in% method.pava, ] | 
| 129 | ||
| 130 | # pava locf | |
| 131 | 4x |   if (nrow(pava.data) >= 1) { | 
| 132 | 2x | pava.plotdata <- locf.pava(pava.data) | 
| 133 | 2x | complete.data <- bind_rows(complete.data, pava.plotdata) | 
| 134 | } | |
| 135 | ||
| 136 | # binned locf | |
| 137 | 4x |   if (nrow(binned.data) >= 1) { | 
| 138 | 4x | binned.plotdata <- locf.binned(binned.data) | 
| 139 | 4x | complete.data <- bind_rows(complete.data, binned.plotdata) | 
| 140 | } | |
| 141 | 4x | rownames(complete.data) <- NULL | 
| 142 | ||
| 143 | 4x | return(complete.data) | 
| 144 | } |