1 |
#' Calibration plot |
|
2 |
#' |
|
3 |
#' Calibration curve risk estimates |
|
4 |
#' |
|
5 |
#' @inheritParams riskProfile |
|
6 |
#' @param methods Character vector of method names (case-insensitive) for plotting curves or |
|
7 |
#' a named list where elements are method function and its arguments. |
|
8 |
#' Default is set to `list(gam = list(method = "gam", fitonPerc = FALSE))`. |
|
9 |
#' |
|
10 |
#' Full options are: `c("binned", "pava", "mspline", "gam", "cgam")`. |
|
11 |
#' |
|
12 |
#' To specify arguments per method, use lists. For example: |
|
13 |
#' ``` |
|
14 |
#' list( |
|
15 |
#' pava = list(method = "pava", ties = "primary"), |
|
16 |
#' mspline = list(method = "mspline", fitonPerc = TRUE), |
|
17 |
#' gam = list(method = "gam", bs = "tp", logscores = FALSE), |
|
18 |
#' bin = list(method = "binned", bins = 10), |
|
19 |
#' ) |
|
20 |
#' ``` |
|
21 |
#' See section "Estimation" for more details. |
|
22 |
#' @param include Character vector (case-insensitive, partial matching) or `NULL` specifying |
|
23 |
#' what quantities to include in the plot. |
|
24 |
#' |
|
25 |
#' Default is: `c("loess", "citl")`. |
|
26 |
#' |
|
27 |
#' Full options are: `c("loess", "citl", "rug", "datapoints")` or `NULL`. |
|
28 |
#' "loess" adds a Loess fit, "citl" stands for "Calibration in the large", |
|
29 |
#' "rug" adds rug ticks of `score` by `outcome` (top x-axis: `score` for `outcome == 1`, |
|
30 |
#' bottom x-axis: `score` for `outcome == 0`), |
|
31 |
#' "datapoints" adds jittered `score` by `outcome` (slightly shifted away from 0 / 1 y-values), |
|
32 |
#' "`NULL`" stands for no extra information. |
|
33 |
#' @param plot.raw Logical to show percentiles or raw values. |
|
34 |
#' Defaults to `TRUE` (i.e. raw `score`). |
|
35 |
#' @param rev.order Logical to reverse ordering of scores. Defaults to `FALSE`. |
|
36 |
#' @param margin.type Type of additional margin plot, can be one of |
|
37 |
#' `c("density", "histogram", "boxplot", "violin", "densigram")`. |
|
38 |
#' See [ggExtra::ggMarginal()] for more details. |
|
39 |
#' @param ... Additional arguments passed to [ggExtra::ggMarginal()]. |
|
40 |
#' |
|
41 |
#' @inheritSection riskProfile Estimation |
|
42 |
#' |
|
43 |
#' @return A list containing the plot and data, plus `citl` data if they were requested. |
|
44 |
#' |
|
45 |
#' @export |
|
46 |
#' |
|
47 |
#' @seealso [riskProfile()] [sensSpec()] |
|
48 |
#' |
|
49 |
#' [getPAVAest()] [getBINNEDest()] [getGAMest()] [getCGAMest()] [getMSPLINEest()] |
|
50 |
#' [getASISest()] |
|
51 |
#' |
|
52 |
#' @examples |
|
53 |
#' # Read in example data |
|
54 |
#' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc")) |
|
55 |
#' rscore <- auroc$predicted_calibrated |
|
56 |
#' truth <- as.numeric(auroc$actual) |
|
57 |
#' |
|
58 |
#' # Default calibration plot |
|
59 |
#' p1 <- calibrationProfile(outcome = truth, score = rscore) |
|
60 |
#' p1$plot |
|
61 |
#' |
|
62 |
#' # Specifying multiple estimation methods |
|
63 |
#' # By default, all the methods fit on percentiles |
|
64 |
#' calibrationProfile( |
|
65 |
#' outcome = truth, |
|
66 |
#' score = rscore, |
|
67 |
#' methods = c("gam", "mspline", "binned") |
|
68 |
#' )$plot |
|
69 |
#' |
|
70 |
#' # Specifying multiple estimation methods with parameters |
|
71 |
#' calibrationProfile( |
|
72 |
#' outcome = truth, |
|
73 |
#' score = rscore, |
|
74 |
#' methods = list( |
|
75 |
#' gam = list(method = "gam", fitonPerc = FALSE, k = 3), |
|
76 |
#' mspline = list(method = "mspline"), |
|
77 |
#' bin = list(method = "binned", quantiles = 5) |
|
78 |
#' ) |
|
79 |
#' )$plot |
|
80 |
#' |
|
81 |
#' # Additional quantities and marginal histogram with specified number of bins |
|
82 |
#' calibrationProfile( |
|
83 |
#' outcome = truth, |
|
84 |
#' score = rscore, |
|
85 |
#' include = c("rug", "datapoints", "citl"), |
|
86 |
#' # or use partial matching: include = c("r", "d", "c"), |
|
87 |
#' margin.type = "histogram", |
|
88 |
#' bins = 100 # passed to ggExtra::ggMarginal |
|
89 |
#' )$plot |
|
90 |
#' |
|
91 |
calibrationProfile <- function(outcome, |
|
92 |
score, |
|
93 |
methods = list(gam = list(method = "gam", fitonPerc = FALSE)), |
|
94 |
include = c("loess", "citl"), |
|
95 |
plot.raw = TRUE, |
|
96 |
rev.order = FALSE, |
|
97 |
margin.type = NULL, |
|
98 |
...) { |
|
99 | ||
100 |
# Argument checks (except outcome, score, methods - below) |
|
101 | 10x |
checkmate::assert( |
102 | 10x |
checkmate::check_character(include), |
103 | 10x |
checkmate::check_null(include) |
104 |
) |
|
105 | 10x |
if (is.character(include)) { |
106 | 9x |
include <- matchArgSubset(tolower(include), choices = c("loess", "citl", "rug", "datapoints")) |
107 |
} |
|
108 | 10x |
checkmate::assert_flag(plot.raw) |
109 | 10x |
checkmate::assert_flag(rev.order) |
110 | 10x |
checkmate::assert( |
111 | 10x |
checkmate::check_character(margin.type, len = 1, any.missing = FALSE), |
112 | 10x |
checkmate::check_null(margin.type) |
113 |
) |
|
114 | ||
115 | 10x |
if (plot.raw) { |
116 | 9x |
xvar <- "score" |
117 |
} else { |
|
118 | 1x |
xvar <- "percentile" |
119 |
} |
|
120 | ||
121 |
# Standardize/Check outcome, scores |
|
122 | 10x |
op <- inputCheck(outcome = outcome, score = score) |
123 | ||
124 |
# Order Data by scores |
|
125 | 10x |
tempdf <- orderInputs(outcome = op$outcome, score = op$score, rev.order = rev.order) |
126 | 10x |
score <- tempdf$score |
127 | 10x |
outcome <- tempdf$outcome |
128 | ||
129 |
# Check methods |
|
130 | 10x |
methods <- methodCheck(methods = methods) |
131 | 10x |
method.names <- names(methods) |
132 | 10x |
if ("asis" %in% getEstMethods(methods, with.names = FALSE)) { |
133 | 1x |
stop('"asis" method is not suitable for this plot. Please remove it.') |
134 |
} |
|
135 | ||
136 |
# Get estimates |
|
137 | 9x |
pc.ests <- getEsts(methods = methods, outcome = outcome, score = score) |
138 | 9x |
PC.data <- pc.ests$plotdata |
139 | 9x |
step.methods <- method.names[pc.ests$idx.step] |
140 | ||
141 |
# Calculate percentiles |
|
142 | 9x |
ecdf.score <- ecdf(score) |
143 | 9x |
percentile <- ecdf.score(score) |
144 | ||
145 |
# Calculate Calibration in the large |
|
146 | 9x |
citl.data <- data.frame( |
147 | 9x |
outcome = mean(outcome), |
148 | 9x |
score = mean(score), |
149 | 9x |
percentile = ecdf.score(mean(score)), |
150 | 9x |
method = "Calibration In The Large" |
151 |
) |
|
152 | ||
153 |
# Subset PC data for plotting |
|
154 | 9x |
smoothPC <- PC.data[!PC.data$method %in% step.methods, , drop = FALSE] |
155 | 9x |
stepPC <- PC.data[PC.data$method %in% step.methods, , drop = FALSE] |
156 | ||
157 |
# data.frame with user inputs |
|
158 | 9x |
ddf <- data.frame(score, percentile, outcome) |
159 | ||
160 |
# Shape type storage |
|
161 | 9x |
shapes <- c() |
162 | ||
163 |
# Add empty scatterplot layer for ggMarginal |
|
164 | 9x |
if (!is.null(margin.type)) { |
165 | 1x |
p <- ggplot() + |
166 | 1x |
geom_point( |
167 | 1x |
data = ddf, |
168 | 1x |
aes(x = .data[[xvar]], y = .data$outcome), shape = NA, na.rm = TRUE |
169 |
) |
|
170 |
} else { |
|
171 | 8x |
p <- ggplot() |
172 |
} |
|
173 | ||
174 |
# Build plot |
|
175 | 9x |
p <- p + |
176 | 9x |
geom_line( |
177 | 9x |
data = smoothPC, |
178 | 9x |
aes(x = .data[[xvar]], y = .data$estimate, linetype = .data$method, colour = .data$method), |
179 | 9x |
alpha = 0.8, |
180 | 9x |
linewidth = 0.5 |
181 |
) + |
|
182 | 9x |
geom_step( |
183 | 9x |
data = stepPC, |
184 | 9x |
aes(x = .data[[xvar]], y = .data$estimate, linetype = .data$method, colour = .data$method), |
185 | 9x |
direction = "vh", |
186 | 9x |
alpha = 0.8, |
187 | 9x |
linewidth = 0.5 |
188 |
) + |
|
189 | 9x |
geom_abline( |
190 | 9x |
aes(slope = 1, intercept = 0, linewidth = "Identity line"), |
191 | 9x |
colour = "gray50", |
192 | 9x |
linetype = "solid" |
193 |
) |
|
194 | ||
195 |
# Add loess if requested |
|
196 | 9x |
if ("loess" %in% include) { |
197 | 8x |
p <- p + geom_smooth( |
198 | 8x |
data = ddf, |
199 | 8x |
aes(x = .data[[xvar]], y = .data$outcome, linetype = "loess", colour = "loess"), |
200 | 8x |
method = "loess", formula = y ~ x, se = FALSE, |
201 | 8x |
linewidth = 0.5 |
202 |
) |
|
203 |
} |
|
204 | ||
205 |
# Add calibration in the large if requested |
|
206 | 9x |
if ("citl" %in% include) { |
207 | 8x |
p <- p + geom_point( |
208 | 8x |
data = citl.data, |
209 | 8x |
aes(x = .data[[xvar]], y = .data$outcome, shape = .data$method), |
210 | 8x |
colour = "red", |
211 | 8x |
size = 3, |
212 | 8x |
stroke = 1 |
213 |
) |
|
214 | 8x |
shapes <- c(shapes, c("Calibration In The Large" = 4)) |
215 |
} |
|
216 | ||
217 |
# Add datapoints if requested |
|
218 | 9x |
if ("datapoints" %in% include) { |
219 | 1x |
p <- p + geom_jitter( |
220 | 1x |
data = data.frame( |
221 | 1x |
score, |
222 | 1x |
percentile, |
223 | 1x |
outcome = ifelse(outcome == 0, -0.1, 1.1), |
224 | 1x |
method = "Data points" |
225 |
), |
|
226 | 1x |
aes(x = .data[[xvar]], y = .data$outcome, shape = .data$method), |
227 | 1x |
colour = "black", |
228 | 1x |
size = 1.5, |
229 | 1x |
alpha = 0.4, |
230 | 1x |
position = position_jitter(seed = 5, height = 0.03) |
231 |
) |
|
232 | 1x |
shapes <- c(shapes, c("Data points" = 16)) |
233 |
} |
|
234 | ||
235 |
# Add rug if requested |
|
236 | 9x |
if ("rug" %in% include) { |
237 | 1x |
p <- p + geom_rug( |
238 | 1x |
data = ddf[ddf$outcome == 0, ], |
239 | 1x |
aes(x = .data[[xvar]]), |
240 | 1x |
sides = "b", |
241 | 1x |
show.legend = FALSE |
242 | 1x |
) + geom_rug( |
243 | 1x |
data = ddf[ddf$outcome == 1, ], |
244 | 1x |
aes(x = .data[[xvar]]), |
245 | 1x |
sides = "t", |
246 | 1x |
show.legend = FALSE |
247 |
) |
|
248 |
} |
|
249 | ||
250 |
# Fix shape legend ... |
|
251 | 9x |
if (all(c("citl", "datapoints") %in% include)) { |
252 | 1x |
shape_guide <- guide_legend( |
253 | 1x |
override.aes = list( |
254 | 1x |
colour = c("Calibration In The Large" = "red", "Data points" = "black"), |
255 | 1x |
alpha = 1, size = 2.5 |
256 |
), |
|
257 | 1x |
order = 3 |
258 |
) |
|
259 | 8x |
} else if (any(c("citl", "datapoints") %in% include)) { |
260 | 7x |
shape_guide <- guide_legend( |
261 | 7x |
override.aes = list(alpha = 1, size = 2.5), |
262 | 7x |
order = 3 |
263 |
) |
|
264 |
} else { |
|
265 | 1x |
shape_guide <- NULL |
266 |
} |
|
267 | ||
268 |
# Finalize graph |
|
269 | 9x |
p <- p + |
270 | 9x |
scale_linewidth_manual(values = c("Identity line" = 0.5)) + |
271 | 9x |
scale_shape_manual(values = shapes) + |
272 | 9x |
scale_colour_hue(l = 45) + |
273 | 9x |
scale_x_continuous(n.breaks = 6) + |
274 | 9x |
scale_y_continuous(n.breaks = 6) + |
275 | 9x |
labs( |
276 | 9x |
title = "Calibration Plot", |
277 | 9x |
x = "Predicted Probability", |
278 | 9x |
y = "Observed", |
279 | 9x |
linetype = "Estimation Method", |
280 | 9x |
linewidth = NULL, |
281 | 9x |
colour = "Estimation Method", |
282 | 9x |
shape = `if`(all(c("citl", "datapoints") %in% include), "Points", NULL) |
283 |
) + |
|
284 | 9x |
theme_bw() + |
285 | 9x |
theme(legend.key.width = unit(2, "line")) + |
286 | 9x |
guides( |
287 | 9x |
linetype = guide_legend(order = 1), |
288 | 9x |
colour = guide_legend(order = 1), |
289 | 9x |
linewidth = guide_legend(order = 2), |
290 | 9x |
shape = shape_guide |
291 |
) |
|
292 | ||
293 |
# Add margin plot |
|
294 | 9x |
if (!is.null(margin.type)) { |
295 | 1x |
p <- ggExtra::ggMarginal(p, type = margin.type, margins = "x", ...) |
296 |
} |
|
297 | ||
298 | 9x |
return(list(plot = p, data = dplyr::as_tibble(PC.data), citl = citl.data)) |
299 |
} |
1 | ||
2 |
# Helper function to check the user input - bins and quantiles arguments |
|
3 |
checkBinsQuantiles <- function(bins, quantiles, score) { |
|
4 | ||
5 | 15x |
checkmate::assert( |
6 | 15x |
checkmate::check_numeric(bins, any.missing = FALSE, sorted = TRUE), |
7 | 15x |
checkmate::check_null(bins) |
8 |
) |
|
9 | 15x |
checkmate::assert( |
10 | 15x |
checkmate::check_integerish(quantiles, lower = 1), |
11 | 15x |
checkmate::check_null(quantiles) |
12 |
) |
|
13 | ||
14 |
# Check combination of quantiles and bins |
|
15 | 15x |
if (!is.null(quantiles) && !is.null(bins)) { |
16 | 1x |
stop("bins and quantiles cannot be specified together, choose one and set the other to NULL") |
17 |
} |
|
18 | ||
19 | 14x |
if (is.null(quantiles) && is.null(bins)) { |
20 | 5x |
quantiles <- 10 |
21 |
} |
|
22 | ||
23 |
# Further check for bins |
|
24 | 14x |
if (length(bins) > 1) { |
25 | 4x |
if (bins[1] > min(score)) { |
26 | 1x |
stop(paste("The first element of bins must be <= min(score), not", bins[1])) |
27 |
} |
|
28 | 3x |
if (bins[length(bins)] < max(score)) { |
29 | 1x |
stop(paste("The last element of bins must be >= max(score), not", bins[length(bins)])) |
30 |
} |
|
31 |
} |
|
32 | ||
33 |
# Scalar bins -> number of intervals |
|
34 | 12x |
if (length(bins) == 1 && bins <= 1) { |
35 | 1x |
stop("bins must be > 1 when provided as a scalar (i.e. number of bins)") |
36 |
} |
|
37 | ||
38 |
# Check discrete score |
|
39 | 11x |
lu <- length(unique(score)) |
40 | 11x |
if (length(bins) == 1 && bins > lu) { |
41 | 1x |
warning( |
42 | 1x |
paste0( |
43 | 1x |
"The number of `bins` (", bins, ") ", |
44 | 1x |
"is > the number of unique score values (", lu, "). ", |
45 | 1x |
"The results may be unreliable." |
46 |
) |
|
47 |
) |
|
48 |
} |
|
49 | 11x |
if (length(bins) > 1 && length(bins) - 1 > lu) { |
50 | 1x |
warning( |
51 | 1x |
paste0( |
52 | 1x |
"The number of `bins` (", length(bins) - 1, ") ", |
53 | 1x |
"is > the number of unique score values (", lu, "). ", |
54 | 1x |
"The results may be unreliable." |
55 |
) |
|
56 |
) |
|
57 |
} |
|
58 | 11x |
if (!is.null(quantiles) && lu <= 10) { |
59 | 1x |
warning( |
60 | 1x |
"Using the quantile method for non-continuous score. ", |
61 | 1x |
"The results may be unreliable." |
62 |
) |
|
63 |
} |
|
64 | ||
65 | 11x |
return(list(bins = bins, quantiles = quantiles)) |
66 |
} |
|
67 | ||
68 | ||
69 |
# Helper function to check the user input - errorbar.sem argument |
|
70 |
checkErrorbarSem <- function(errorbar.sem) { |
|
71 | 7x |
checkmate::assert( |
72 | 7x |
checkmate::check_number(errorbar.sem, lower = 0), # this checks >= 0 |
73 | 7x |
checkmate::check_null(errorbar.sem) |
74 |
) |
|
75 | 7x |
if (!is.null(errorbar.sem)) { |
76 | 2x |
stopifnot("`errorbar.sem` must be > 0" = errorbar.sem > 0) |
77 |
} |
|
78 | 7x |
return(errorbar.sem) |
79 |
} |
|
80 | ||
81 | ||
82 |
#' Binned Risk Estimates |
|
83 |
#' |
|
84 |
#' Calculates bins based on number of evenly spaced bins or n-tiles. |
|
85 |
#' Determines average risk within bins, used for risk estimates. |
|
86 |
#' |
|
87 |
#' @inheritParams riskProfile |
|
88 |
#' @param quantiles Numeric; quantiles to split bins. |
|
89 |
#' @param bins Numeric; number of evenly spaced bins or bin locations. |
|
90 |
#' @param right Logical indicating right closed interval. Defaults to `TRUE`. |
|
91 |
#' @param errorbar.sem Scalar numeric representing the number of standard error from the means |
|
92 |
#' (SEM) used to calculate risk error bar. |
|
93 |
#' |
|
94 |
#' @return A data frame with 4 columns |
|
95 |
#' (score, score percentile, outcome, estimate). |
|
96 |
#' Additionally, there is an attribute "errorbar" holding the error-bar data if |
|
97 |
#' `errorbar.sem` was specified. |
|
98 |
#' |
|
99 |
#' @seealso [getASISest()] [getCGAMest()] [getGAMest()] [getMSPLINEest()] [getPAVAest()] |
|
100 |
#' |
|
101 |
#' @export |
|
102 |
#' |
|
103 |
#' @examples |
|
104 |
#' # Read in example data |
|
105 |
#' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc")) |
|
106 |
#' rscore <- auroc$predicted |
|
107 |
#' truth <- as.numeric(auroc$actual) |
|
108 |
#' |
|
109 |
#' getBINNEDest(outcome = truth, score = rscore) |
|
110 |
#' |
|
111 |
getBINNEDest <- function(outcome, |
|
112 |
score, |
|
113 |
quantiles = NULL, |
|
114 |
bins = NULL, |
|
115 |
right = TRUE, |
|
116 |
errorbar.sem = NULL) { |
|
117 | ||
118 |
# Argument checks |
|
119 | 7x |
checkmate::assert_numeric(outcome) |
120 | 7x |
checkmate::assert_numeric(score, len = length(outcome)) |
121 | 7x |
checkmate::assert_flag(right) |
122 | 7x |
errorbar.sem <- checkErrorbarSem(errorbar.sem) |
123 | ||
124 |
# Check bins and quantiles |
|
125 | 7x |
bqp <- checkBinsQuantiles(bins = bins, quantiles = quantiles, score = score) |
126 | ||
127 |
# Retrieve data summaries |
|
128 | 7x |
df <- getSummaries( |
129 | 7x |
outcome = outcome, score = score, |
130 | 7x |
quantiles = bqp$quantiles, bins = bqp$bins, |
131 | 7x |
right = right |
132 |
) |
|
133 | ||
134 | 7x |
if (!is.null(errorbar.sem)) { |
135 | 2x |
errorbar <- getERRORest(binlvl = df[["binlvl"]], z = errorbar.sem) %>% |
136 | 2x |
mutate( |
137 | 2x |
percentile = df[["binlvl"]][["riskpercentile"]], |
138 | 2x |
bin.mid = df[["binlvl"]][["avg.outcome"]] |
139 |
) |
|
140 |
} else { |
|
141 | 5x |
errorbar <- NULL |
142 |
} |
|
143 | ||
144 |
# Create binned.data |
|
145 | 7x |
binned.data <- data.frame( |
146 | 7x |
score = df[["binlvl"]][["avg.risk"]], |
147 | 7x |
percentile = df[["binlvl"]][["riskpercentile"]], |
148 | 7x |
outcome = NA, |
149 | 7x |
estimate = df[["binlvl"]][["avg.outcome"]] |
150 |
) |
|
151 | 7x |
attr(binned.data, "errorbar") <- errorbar |
152 | ||
153 | 7x |
return(binned.data) |
154 |
} |
|
155 | ||
156 | ||
157 | ||
158 |
#' Get Summaries: observation level & bin level dataframes |
|
159 |
#' |
|
160 |
#' Given vectors of outcomes and risk scores, the function will return a list of |
|
161 |
#' observation level and bin level summary dataframes for the data. |
|
162 |
#' |
|
163 |
#' Observation level dataframe has columns for outcome, riskscore, risk percentile, bin number, |
|
164 |
#' and corresponding minimum and maximum score for that bin. |
|
165 |
#' |
|
166 |
#' Bin level dataframe has columns indicating bin number and the observation count, |
|
167 |
#' number of events, average outcome, average risk, and standard deviation of risk, |
|
168 |
#' within each of the bins. Risk percentile and bin intervals are also provided. |
|
169 |
#' |
|
170 |
#' @return List of observation level and bin level dataframes. |
|
171 |
#' |
|
172 |
#' @keywords internal |
|
173 |
#' @noRd |
|
174 |
#' |
|
175 |
#' @examples |
|
176 |
#' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc")) |
|
177 |
#' truth <- as.numeric(auroc$actual) |
|
178 |
#' rscore <- auroc$predicted |
|
179 |
#' |
|
180 |
#' # Bin by quantiles |
|
181 |
#' getSummaries( |
|
182 |
#' outcome = truth, |
|
183 |
#' score = rscore, |
|
184 |
#' quantiles = 10 |
|
185 |
#' ) |
|
186 |
#' |
|
187 |
#' # Bin by specific percentiles |
|
188 |
#' getSummaries( |
|
189 |
#' outcome = truth, |
|
190 |
#' score = rscore, |
|
191 |
#' quantiles = 0, |
|
192 |
#' bin = c(0, 0.25, 0.5, 0.8, 1) |
|
193 |
#' ) |
|
194 |
#' |
|
195 |
getSummaries <- function(outcome, |
|
196 |
score, |
|
197 |
quantiles = NULL, |
|
198 |
bins = NULL, |
|
199 |
right = TRUE) { |
|
200 | ||
201 | 16x |
stopifnot( |
202 | 16x |
!is.null(quantiles) | !is.null(bins), |
203 | 16x |
quantiles != 0 | bins != 0, |
204 | 16x |
is.logical(right) |
205 |
) |
|
206 | ||
207 | 15x |
cdf.fit <- ecdf(score) |
208 | ||
209 | 15x |
if (!is.null(quantiles)) { |
210 | 12x |
bin.int <- Hmisc::cut2(score, g = quantiles) |
211 | ||
212 | 3x |
} else if (length(bins) > 1) { |
213 | ! |
bin.int <- cut(cdf.fit(score), breaks = bins, include.lowest = TRUE, right = right) |
214 | ||
215 | 3x |
} else if (length(bins) == 1) { |
216 | 2x |
bin.int <- cut(score, breaks = bins, include.lowest = TRUE, right = right) |
217 | ||
218 |
} else { |
|
219 | 1x |
stop("Unrecognized option") |
220 |
} |
|
221 |
|
|
222 |
# get numeric label of interval |
|
223 | 14x |
bin.num <- as.numeric(bin.int) |
224 | ||
225 |
# get interval borders (min, max) |
|
226 | 14x |
min_max <- strsplit( |
227 | 14x |
gsub("(?![-,.])[[:punct:]]", "", trimws(as.character(bin.int)), perl = TRUE), |
228 |
"," |
|
229 |
) |
|
230 | 14x |
min_max <- lapply( |
231 | 14x |
min_max, \(x) `if`(length(x) < 2, rep(x, 2), x) |
232 |
) |
|
233 | ||
234 |
# Observation level data: outcome, rscore, observation percentile, and interval |
|
235 | 14x |
obslvl <- data.frame( |
236 | 14x |
outcome = outcome, |
237 | 14x |
score = score, |
238 | 14x |
riskpercentile = cdf.fit(score), |
239 | 14x |
bin = bin.num, |
240 | 14x |
interval = as.character(bin.int), |
241 | 14x |
min = vapply(min_max, "[[", character(1), 1), |
242 | 14x |
max = vapply(min_max, "[[", character(1), 2) |
243 |
) %>% |
|
244 | 14x |
arrange(.data$score) |
245 | ||
246 |
# Bin level data: within each bin: n, avg risk, sd risk, quantile, error bar |
|
247 | 14x |
binlvl <- obslvl %>% |
248 | 14x |
group_by(.data$bin, .data$interval) %>% |
249 | 14x |
summarise( |
250 | 14x |
n = dplyr::n(), |
251 | 14x |
events = sum(.data$outcome), |
252 | 14x |
avg.outcome = mean(.data$outcome), |
253 | 14x |
sd.outcome = sd(.data$outcome, na.rm = TRUE), |
254 | 14x |
avg.risk = mean(.data$score, na.rm = TRUE), |
255 | 14x |
sd.risk = sd(.data$score), |
256 | 14x |
riskpercentile = max(.data$riskpercentile), |
257 | 14x |
.groups = "drop" |
258 |
) |
|
259 | ||
260 |
# Replace NAs with 0 |
|
261 | 14x |
binlvl[is.na(binlvl)] <- 0 |
262 | ||
263 | 14x |
return(list(obslvl = obslvl, binlvl = binlvl)) |
264 |
} |
|
265 | ||
266 | ||
267 |
# ERROR BAR Estimates |
|
268 |
getERRORest <- function(binlvl, z) { |
|
269 | ||
270 | 2x |
stopifnot( |
271 | 2x |
is.data.frame(binlvl), |
272 | 2x |
is.numeric(z) |
273 |
) |
|
274 | ||
275 | 2x |
binlvl %>% |
276 | 2x |
mutate( |
277 | 2x |
bin.low = .data$avg.outcome - (z * (.data$sd.outcome / sqrt(.data$n))), |
278 | 2x |
bin.low = ifelse(.data$bin.low < 0, 0, .data$bin.low), |
279 | 2x |
bin.high = .data$avg.outcome + (z * (.data$sd.outcome / sqrt(.data$n))), |
280 | 2x |
midquantile = .data$riskpercentile - (diff(c(0, .data$riskpercentile)) / 2) |
281 |
) %>% |
|
282 | 2x |
select(all_of(c("midquantile", "bin.high", "bin.low"))) %>% |
283 | 2x |
tidyr::replace_na(list(bin.high = 0, bin.low = 0)) |
284 |
} |
|
285 | ||
286 | ||
287 |
#' PAVA Risk Estimates |
|
288 |
#' |
|
289 |
#' Determines isotonic regression estimates via pava, given a vector of binary outcomes, |
|
290 |
#' and a vector of scores. |
|
291 |
#' |
|
292 |
#' @inheritParams riskProfile |
|
293 |
#' @param weights Vector of numerics to specify PAVA observation weighting. |
|
294 |
#' @param ties String to specify how ties should be handled for PAVA. |
|
295 |
#' @param low_events Numeric, specifying number of events in the lowest bin. |
|
296 |
#' @param low_nonevents Numeric, specifying number of nonevents in the lowest bin. |
|
297 |
#' @param high_events Numeric, specifying number of events in the highest bin. |
|
298 |
#' @param high_nonevents Numeric, specifying number of nonevents in the highest bin. |
|
299 |
#' @param hilo_obs Numeric, specifying number of observations in the highest and lowest bins. |
|
300 |
#' |
|
301 |
#' @return A data frame with 4 columns |
|
302 |
#' (score, score percentile, outcome, estimate). |
|
303 |
#' |
|
304 |
#' @seealso [getASISest()] [getBINNEDest()] [getCGAMest()] [getGAMest()] [getMSPLINEest()] |
|
305 |
#' |
|
306 |
#' @export |
|
307 |
#' |
|
308 |
#' @examples |
|
309 |
#' # Read in example data |
|
310 |
#' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc")) |
|
311 |
#' rscore <- auroc$predicted |
|
312 |
#' truth <- as.numeric(auroc$actual) |
|
313 |
#' |
|
314 |
#' tail(getPAVAest(outcome = truth, score = rscore), 10) |
|
315 |
#' |
|
316 |
getPAVAest <- function(outcome, |
|
317 |
score, |
|
318 |
weights = rep(1, length(outcome)), |
|
319 |
ties = "primary", |
|
320 |
low_events = NULL, |
|
321 |
low_nonevents = NULL, |
|
322 |
high_events = NULL, |
|
323 |
high_nonevents = NULL, |
|
324 |
hilo_obs = NULL) { |
|
325 | ||
326 | 5x |
checkmate::assert_numeric(outcome) |
327 | 5x |
checkmate::assert_numeric(score, len = length(outcome)) |
328 | 5x |
checkmate::assert_numeric(weights, any.missing = FALSE, len = length(outcome)) |
329 | 5x |
checkmate::assert_character(ties, any.missing = FALSE, len = 1) |
330 | 5x |
checkmate::assert( |
331 | 5x |
checkmate::check_integerish(low_events, lower = 1, any.missing = FALSE, len = 1), |
332 | 5x |
checkmate::check_null(low_events) |
333 |
) |
|
334 | 5x |
checkmate::assert( |
335 | 5x |
checkmate::check_integerish(low_nonevents, lower = 1, any.missing = FALSE, len = 1), |
336 | 5x |
checkmate::check_null(low_nonevents) |
337 |
) |
|
338 | 5x |
checkmate::assert( |
339 | 5x |
checkmate::check_integerish(high_events, lower = 1, any.missing = FALSE, len = 1), |
340 | 5x |
checkmate::check_null(high_events) |
341 |
) |
|
342 | 5x |
checkmate::assert( |
343 | 5x |
checkmate::check_integerish(high_nonevents, lower = 1, any.missing = FALSE, len = 1), |
344 | 5x |
checkmate::check_null(high_nonevents) |
345 |
) |
|
346 | 5x |
checkmate::assert( |
347 | 5x |
checkmate::check_integerish(hilo_obs, lower = 1, any.missing = FALSE, len = 1), |
348 | 5x |
checkmate::check_null(hilo_obs) |
349 |
) |
|
350 | ||
351 | 5x |
pava.est <- isotone::gpava(z = score, y = outcome, weights = weights, ties = ties)$x |
352 | ||
353 |
# if constrained, then replace percentiles.... |
|
354 | 5x |
check <- any( |
355 | 5x |
is.numeric(low_events), is.numeric(low_nonevents), is.numeric(high_events), |
356 | 5x |
is.numeric(high_nonevents), is.numeric(hilo_obs) |
357 |
) |
|
358 | 5x |
if (check) { |
359 | ! |
percentile <- getConstraints( |
360 | ! |
outcome = outcome, rscore = score, |
361 | ! |
low_events = low_events, low_nonevents = low_nonevents, |
362 | ! |
high_events = high_events, high_nonevents = high_nonevents, |
363 | ! |
hilo_obs = hilo_obs |
364 |
) |
|
365 |
} else { |
|
366 | 5x |
percentile <- ecdf(score)(score) |
367 |
} |
|
368 | ||
369 | 5x |
return(data.frame(score, percentile, outcome, estimate = pava.est)) |
370 |
} |
|
371 | ||
372 | ||
373 |
#' Constrained Risk Percentile Estimates |
|
374 |
#' |
|
375 |
#' Adjusts PAVA risk percentile estimates for the first and last bin, to meet criteria |
|
376 |
#' for events, non-events, or total observation count. |
|
377 |
#' |
|
378 |
#' @inheritParams riskProfile |
|
379 |
#' |
|
380 |
#' @return A vector of constrained risk percentiles. |
|
381 |
#' |
|
382 |
#' @keywords internal |
|
383 |
#' @noRd |
|
384 |
#' |
|
385 |
#' @examples |
|
386 |
#' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc")) |
|
387 |
#' truth <- as.numeric(auroc$actual) |
|
388 |
#' rscore <- auroc$predicted |
|
389 |
#' |
|
390 |
#' getConstraints(outcome = truth, rscore = rscore, low_events = 3, high_nonevents = 3) |
|
391 |
#' |
|
392 |
getConstraints <- function(outcome, |
|
393 |
rscore, |
|
394 |
low_events = NULL, # min events in lower bin (useful for a PC) |
|
395 |
low_nonevents = NULL, # min non-events in lower bin |
|
396 |
high_events = NULL, # min events in upper bin |
|
397 |
high_nonevents = NULL, # min non-events in upper bin (useful for a PC) |
|
398 |
hilo_obs = NULL) { # min total obs in upper AND lower bin |
|
399 | ||
400 | 1x |
n <- length(rscore) |
401 | 1x |
rscore <- seq_along(rscore) / length(rscore) |
402 |
# rscore_cons: scores with binning constraint (same as rscore if no constraint specified) |
|
403 | 1x |
rscore_cons <- rscore |
404 | ||
405 | ! |
if (!is.null(low_events) && low_events == 0) low_events <- NULL |
406 | ! |
if (!is.null(low_nonevents) && low_nonevents == 0) low_nonevents <- NULL |
407 | ! |
if (!is.null(high_events) && high_events == 0) high_events <- NULL |
408 | ! |
if (!is.null(high_nonevents) && high_nonevents == 0) high_nonevents <- NULL |
409 | ! |
if (!is.null(hilo_obs) && hilo_obs == 0) hilo_obs <- NULL |
410 | ||
411 | 1x |
if (is.numeric(low_events) && is.numeric(low_nonevents)) { |
412 | ! |
warning( |
413 | ! |
paste( |
414 | ! |
"Specified both a minimum number of events and non-events for the lower bin.", |
415 | ! |
"Combining for total observations instead." |
416 |
) |
|
417 |
) |
|
418 | ! |
hilo_obs <- round(low_events + low_nonevents) |
419 | ! |
rscore_cons[1:hilo_obs] <- min(rscore) |
420 | ||
421 | 1x |
} else if (is.numeric(high_events) && is.numeric(high_nonevents)) { |
422 | ! |
warning( |
423 | ! |
paste( |
424 | ! |
"Specified both a minimum number of events and non-events for the upper bin.", |
425 | ! |
"Combining for total observations instead." |
426 |
) |
|
427 |
) |
|
428 | ! |
hilo_obs <- round(high_events + high_nonevents) |
429 | ! |
rscore_cons[(n + 1 - hilo_obs):n] <- max(rscore) |
430 | ||
431 |
} else { |
|
432 |
# Apply upper / lower constraints |
|
433 | 1x |
if (is.numeric(low_events)) { |
434 | 1x |
low_events <- round(low_events) |
435 | 1x |
indlo <- match(TRUE, cumsum(outcome) == low_events) |
436 | 1x |
rscore_cons[1:indlo] <- min(rscore) |
437 |
} |
|
438 | 1x |
if (is.numeric(low_nonevents)) { |
439 | ! |
low_nonevents <- round(low_nonevents) |
440 | ! |
indlo <- match(TRUE, cumsum(1 - outcome) == low_nonevents) |
441 | ! |
rscore_cons[1:indlo] <- min(rscore) |
442 |
} |
|
443 | 1x |
if (is.numeric(high_nonevents)) { |
444 | 1x |
high_nonevents <- round(high_nonevents) |
445 | 1x |
indhi <- match(TRUE, cumsum(1 - outcome[n:1]) == high_nonevents) |
446 | 1x |
rscore_cons[(n + 1 - indhi):n] <- max(rscore) |
447 |
} |
|
448 | ||
449 | 1x |
if (is.numeric(high_events)) { |
450 | ! |
high_events <- round(high_events) |
451 | ! |
indhi <- match(TRUE, cumsum(outcome[n:1]) == high_events) |
452 | ! |
rscore_cons[(n + 1 - indhi):n] <- max(rscore) |
453 |
} |
|
454 | ||
455 | 1x |
if (is.numeric(hilo_obs)) { |
456 | ! |
hilo_obs <- round(hilo_obs) |
457 | ! |
rscore_cons[1:hilo_obs] <- min(rscore) |
458 | ! |
rscore_cons[(n + 1 - hilo_obs):n] <- max(rscore) |
459 |
} |
|
460 |
} |
|
461 | ||
462 | 1x |
return(as.vector(rscore_cons)) |
463 |
} |
|
464 | ||
465 | ||
466 |
#' GAM Risk Estimates |
|
467 |
#' |
|
468 |
#' Fits a Generalized Additive Model to estimate risk, given a vector of binary outcome, |
|
469 |
#' and a vector of scores. |
|
470 |
#' |
|
471 |
#' @inheritParams riskProfile |
|
472 |
#' @param k Numeric to specify the upper limit of basis functions to fit for GAM. |
|
473 |
#' See [mgcv::s()] for more details. Defaults to -1. |
|
474 |
#' @param bs Character string to specify spline type. |
|
475 |
#' See [mgcv::s()] for more details. Defaults to `"tp"`. |
|
476 |
#' @param method Character string to specify method type. |
|
477 |
#' See [mgcv::s()] for more details. Defaults to "REML". |
|
478 |
#' @param logscores Logical; if `TRUE`, fit gam on log scores. Defaults to `FALSE`. |
|
479 |
#' @param fitonPerc Logical; if `TRUE`, fit gam on risk percentiles. Defaults to `TRUE`. |
|
480 |
#' |
|
481 |
#' @return A data frame with 4 columns |
|
482 |
#' (score, score percentile, outcome, estimate). |
|
483 |
#' |
|
484 |
#' @seealso [getASISest()] [getBINNEDest()] [getCGAMest()] [getMSPLINEest()] [getPAVAest()] |
|
485 |
#' |
|
486 |
#' @export |
|
487 |
#' |
|
488 |
#' @examples |
|
489 |
#' # Read in example data |
|
490 |
#' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc")) |
|
491 |
#' rscore <- auroc$predicted |
|
492 |
#' truth <- as.numeric(auroc$actual) |
|
493 |
#' |
|
494 |
#' tail(getGAMest(outcome = truth, score = rscore), 10) |
|
495 |
#' |
|
496 |
getGAMest <- function(outcome, |
|
497 |
score, |
|
498 |
k = -1, |
|
499 |
bs = "tp", |
|
500 |
method = "REML", |
|
501 |
logscores = FALSE, |
|
502 |
fitonPerc = TRUE) { |
|
503 | ||
504 | 23x |
checkmate::assert_numeric(outcome) |
505 | 23x |
checkmate::assert_numeric(score, len = length(outcome)) |
506 | 23x |
checkmate::assert_number(k) |
507 | 23x |
checkmate::assert_character(bs, any.missing = FALSE, len = 1) |
508 | 23x |
checkmate::assert_character(method, any.missing = FALSE, len = 1) |
509 | 23x |
checkmate::assert_flag(logscores) |
510 | 23x |
checkmate::assert_flag(fitonPerc) |
511 | ||
512 | 23x |
mygrid <- ecdf(score)(score) |
513 | 23x |
df <- data.frame(outcome = outcome, score = score, perc = mygrid) |
514 | ||
515 |
# mgcv::s does not work in formula, need to define it here |
|
516 | 23x |
s <- mgcv::s |
517 | ||
518 | 23x |
if (fitonPerc && logscores) { |
519 | 1x |
formula <- outcome ~ s(log(perc), k = k, bs = bs) |
520 | 22x |
} else if (fitonPerc && !logscores) { |
521 | 15x |
formula <- outcome ~ s(perc, k = k, bs = bs) |
522 | 7x |
} else if (!fitonPerc && logscores) { |
523 | 1x |
formula <- outcome ~ s(log(score), k = k, bs = bs) |
524 |
} else { |
|
525 | 6x |
formula <- outcome ~ s(score, k = k, bs = bs) |
526 |
} |
|
527 | ||
528 | 23x |
gam.fit <- mgcv::gam(formula, data = df, family = "binomial", method = method) |
529 | ||
530 | 23x |
gam.est <- mgcv::predict.gam(gam.fit, type = "response") |
531 | ||
532 | 23x |
return(data.frame(score, percentile = mygrid, outcome, estimate = gam.est)) |
533 |
} |
|
534 | ||
535 | ||
536 | ||
537 |
#' Constrained GAM (cgam) Risk Estimates |
|
538 |
#' |
|
539 |
#' Fits a Constrained Generalized Additive Model to estimate risk, |
|
540 |
#' given a vector of binary outcomes and a vector of scores. |
|
541 |
#' |
|
542 |
#' @inheritParams riskProfile |
|
543 |
#' @param numknots Numeric to specify the number of knots. |
|
544 |
#' Passed to the `smoother` function. Defaults to 3. |
|
545 |
#' @param smoother Character string to specify the smoother (from cgam package). |
|
546 |
#' Defaults to "s.incr". |
|
547 |
#' @param logscores Logical; if `TRUE`, fit gam on log scores. Defaults to `FALSE`. |
|
548 |
#' @param fitonPerc Logical; if `TRUE`, fit gam on risk percentiles. Defaults to `TRUE`. |
|
549 |
#' |
|
550 |
#' @return A data frame with 4 columns |
|
551 |
#' (score, score percentile, outcome, estimate). |
|
552 |
#' |
|
553 |
#' @seealso [getASISest()] [getBINNEDest()] [getGAMest()] [getMSPLINEest()] [getPAVAest()] |
|
554 |
#' |
|
555 |
#' @export |
|
556 |
#' |
|
557 |
#' @examples |
|
558 |
#' # Read in example data |
|
559 |
#' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc")) |
|
560 |
#' rscore <- auroc$predicted |
|
561 |
#' truth <- as.numeric(auroc$actual) |
|
562 |
#' |
|
563 |
#' tail(getCGAMest(outcome = truth, score = rscore), 10) |
|
564 |
#' |
|
565 |
getCGAMest <- function(outcome, |
|
566 |
score, |
|
567 |
numknots = 0, |
|
568 |
smoother = "s.incr", |
|
569 |
logscores = FALSE, |
|
570 |
fitonPerc = TRUE) { |
|
571 | ||
572 | 3x |
checkmate::assert_numeric(outcome) |
573 | 3x |
checkmate::assert_numeric(score, len = length(outcome)) |
574 | 3x |
checkmate::assert_number(numknots) |
575 | 3x |
checkmate::assert_character(smoother, any.missing = FALSE, len = 1) |
576 | 3x |
checkmate::assert_flag(logscores) |
577 | 3x |
checkmate::assert_flag(fitonPerc) |
578 | ||
579 | 3x |
mygrid <- ecdf(score)(score) |
580 | 3x |
df <- data.frame(outcome = outcome, score = score, perc = mygrid) |
581 | ||
582 |
# cgam::s does not work in formula, need to define it here |
|
583 | 3x |
assign(smoother, do.call(`::`, list(pkg = "cgam", name = smoother))) |
584 | ||
585 | 3x |
formula <- as.formula( |
586 | 3x |
paste0( |
587 | 3x |
"outcome ~ ", |
588 | 3x |
smoother, "(", |
589 | 3x |
`if`(logscores, "log("), |
590 | 3x |
`if`(fitonPerc, "perc", "score"), |
591 | 3x |
`if`(logscores, ")"), |
592 | 3x |
", numknots = ", numknots, ")" |
593 |
) |
|
594 |
) |
|
595 | ||
596 | 3x |
cgam.fit <- cgam::cgam(formula, data = df, family = "binomial") |
597 | ||
598 | 3x |
cgam.est <- cgam::predict.cgam(cgam.fit, type = "response")$fit |
599 | ||
600 | 3x |
return(data.frame(score, percentile = mygrid, outcome, estimate = cgam.est)) |
601 |
} |
|
602 | ||
603 | ||
604 |
### MSPLINE ESTIMATES ### |
|
605 | ||
606 |
# Function written by willtownes, joseph paulson. |
|
607 |
mspline <- function(x, y, k = 10, lower = NA, upper = NA) { |
|
608 |
# fits a monotonic spline to data |
|
609 |
# small values of k= more smoothing (flatter curves) |
|
610 |
# large values of k= more flexible (wiggly curves) |
|
611 |
# k is related to effective degrees of freedom and number of knots |
|
612 |
# use unconstrained gam to get rough parameter estimates |
|
613 |
# lower, upper optional bounds on the function |
|
614 |
# basically a slight modifimessageion of an example in the mgcv::pcls documentation |
|
615 | 6x |
dat <- data.frame(x = x, y = y) |
616 | 6x |
s <- mgcv::s # mgcv::s does not work in formula, need to define it here |
617 | 6x |
init_gam <- mgcv::gam(y ~ s(x, k = k, bs = "cr")) |
618 |
# Create Design matrix, constraints etc. for monotonic spline.... |
|
619 | 6x |
sm <- mgcv::smoothCon(s(x, k = k, bs = "cr"), dat, knots = NULL)[[1]] |
620 | 6x |
mc <- mgcv::mono.con(sm$xp, lower = lower, upper = upper) # monotonicity constraints |
621 | 6x |
M <- list( |
622 | 6x |
X = sm$X, y = y, # design matrix, outcome |
623 | 6x |
C = matrix(0, 0, 0), # equality constraints (none) |
624 | 6x |
Ain = mc$A, bin = mc$b, # inequality constraints |
625 | 6x |
sp = init_gam$sp, p = sm$xp, # initial guesses for param estimates |
626 | 6x |
S = sm$S, # smoothness penalty matrix |
627 | 6x |
w = y * 0 + 1, off = 0 # weights, offset |
628 |
) |
|
629 |
# fit spine using penalized constrained least squares |
|
630 | 6x |
p <- mgcv::pcls(M) |
631 | 6x |
return(list(sm = sm, p = p)) |
632 |
} |
|
633 | ||
634 |
# Function written by joseph paulson |
|
635 |
predict.mspline <- function(msp, x) { |
|
636 |
# using the monotone spline msp, predict values for the vector x |
|
637 | 6x |
as.vector(mgcv::Predict.matrix(msp$sm, data.frame(x = x)) %*% msp$p) |
638 |
} |
|
639 | ||
640 | ||
641 | ||
642 |
#' Monotone Spline Risk Estimates |
|
643 |
#' |
|
644 |
#' Fits a Monotone constrained Generalized Additive Model (GAM) to estimate risk, |
|
645 |
#' given a vector of binary outcomes and a vector of scores. |
|
646 |
#' |
|
647 |
#' @inheritParams getGAMest |
|
648 |
#' |
|
649 |
#' @return A data frame with 4 columns |
|
650 |
#' (score, score percentile, outcome, estimate). |
|
651 |
#' |
|
652 |
#' @seealso [getASISest()] [getBINNEDest()] [getCGAMest()] [getGAMest()] [getPAVAest()] |
|
653 |
#' |
|
654 |
#' @export |
|
655 |
#' |
|
656 |
#' @examples |
|
657 |
#' # Read in example data |
|
658 |
#' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc")) |
|
659 |
#' rscore <- auroc$predicted |
|
660 |
#' truth <- as.numeric(auroc$actual) |
|
661 |
#' |
|
662 |
#' tail(getMSPLINEest(outcome = truth, score = rscore), 10) |
|
663 |
#' |
|
664 |
getMSPLINEest <- function(outcome, |
|
665 |
score, |
|
666 |
k = 10, |
|
667 |
fitonPerc = TRUE) { |
|
668 | ||
669 | 6x |
checkmate::assert_numeric(outcome) |
670 | 6x |
checkmate::assert_numeric(score, len = length(outcome)) |
671 | 6x |
checkmate::assert_integerish(k, len = 1, any.missing = FALSE) |
672 | 6x |
checkmate::assert_flag(fitonPerc) |
673 | ||
674 | 6x |
stopifnot(!is.null(k), is.logical(fitonPerc)) |
675 | ||
676 | 6x |
scorefit <- ecdf(score) |
677 | 6x |
mygrid <- scorefit(score) |
678 | ||
679 | 6x |
if (!fitonPerc) { |
680 | 1x |
fitspl <- mspline(x = score, y = outcome, k = k) |
681 | 1x |
mspline.est <- predict.mspline(fitspl, score) |
682 |
} else { |
|
683 | 5x |
fitspl <- mspline(x = mygrid, y = outcome, k = k) |
684 | 5x |
mspline.est <- predict.mspline(fitspl, mygrid) |
685 |
} |
|
686 | ||
687 | 6x |
mspline.est[mspline.est < 0] <- 0 |
688 | 6x |
mspline.est[mspline.est > 1] <- 1 |
689 | ||
690 | 6x |
return(data.frame(score, percentile = mygrid, outcome, estimate = mspline.est)) |
691 |
} |
|
692 | ||
693 | ||
694 |
#' "As is" estimates |
|
695 |
#' |
|
696 |
#' This function does no estimation, but uses the score as it is |
|
697 |
#' (it works like an identity function). |
|
698 |
#' |
|
699 |
#' @inheritParams getGAMest |
|
700 |
#' |
|
701 |
#' @return A data frame with 4 columns |
|
702 |
#' (score, score percentile, outcome, estimate). |
|
703 |
#' |
|
704 |
#' @seealso [getBINNEDest()] [getCGAMest()] [getGAMest()] [getMSPLINEest()] [getPAVAest()] |
|
705 |
#' |
|
706 |
#' @export |
|
707 |
#' |
|
708 |
#' @examples |
|
709 |
#' # Read in example data |
|
710 |
#' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc")) |
|
711 |
#' rscore <- auroc$predicted |
|
712 |
#' truth <- as.numeric(auroc$actual) |
|
713 |
#' |
|
714 |
#' tail(getASISest(outcome = truth, score = rscore), 10) |
|
715 |
#' |
|
716 |
getASISest <- function(outcome, score) { |
|
717 |
# Argument checks |
|
718 | 17x |
checkmate::assert_numeric(outcome) |
719 | 17x |
checkmate::assert_numeric(score, len = length(outcome)) |
720 | 17x |
return( |
721 | 17x |
data.frame( |
722 | 17x |
score = score, |
723 | 17x |
percentile = ecdf(score)(score), |
724 | 17x |
outcome, |
725 | 17x |
estimate = score |
726 |
) |
|
727 |
) |
|
728 |
} |
|
729 | ||
730 | ||
731 |
# Returns Risk Estimates. |
|
732 |
# This function calls all the other getXXXest functions |
|
733 |
# Used for Predictiveness Curve Data |
|
734 |
getEsts <- function(methods, outcome, score) { |
|
735 | ||
736 |
# check methods |
|
737 | 41x |
stopifnot(is.list(methods)) |
738 | ||
739 |
# Get function names (and perform checks) |
|
740 | 38x |
fun.names <- getEstMethods(methods, with.names = TRUE) |
741 | ||
742 |
# Run estimations |
|
743 | 38x |
m.est <- lapply(methods, \(x) getEst(x, outcome = outcome, score = score)) |
744 | ||
745 |
# Special case: get errorbar data if existing |
|
746 | 36x |
idx.binned <- fun.names == "binned" |
747 | 36x |
m.er <- lapply( |
748 | 36x |
which(idx.binned), |
749 | 36x |
\(i) attr(m.est[[i]], "errorbar") |
750 |
) |
|
751 | ||
752 |
# Bind together and convert to long data frame |
|
753 | 36x |
m.est <- bind_rows(m.est, .id = "method") |
754 | 36x |
rownames(m.est) <- NULL |
755 | ||
756 |
# Bind together and convert to long data frame; otherwise return NULL |
|
757 | 36x |
m.error <- bind_rows(m.er, .id = "method") |
758 | 36x |
if (nrow(m.error) == 0) { |
759 | 35x |
m.error <- NULL |
760 |
} |
|
761 | ||
762 |
# Return indexes of step methods and risk methods |
|
763 | 36x |
idx.step <- fun.names %in% c("binned", "pava") |
764 | 36x |
names(idx.step) <- names(fun.names) |
765 | 36x |
idx.asis <- fun.names == "asis" |
766 | ||
767 |
# Check if asis was called multiple times |
|
768 | 36x |
if (sum(idx.asis) > 1) { |
769 | 1x |
stop("Please use 'asis' just once (as it does not have any further arguments).") |
770 |
} |
|
771 | ||
772 | 35x |
return( |
773 | 35x |
list( |
774 | 35x |
plotdata = m.est, errorbardata = m.error, |
775 | 35x |
idx.step = idx.step, idx.asis = idx.asis, |
776 | 35x |
idx.binned = idx.binned, idx.pava = fun.names == "pava" |
777 |
) |
|
778 |
) |
|
779 |
} |
|
780 | ||
781 |
# Define estimation functions |
|
782 |
est.funs <- function() { |
|
783 | 104x |
list( |
784 | 104x |
gam = getGAMest, |
785 | 104x |
cgam = getCGAMest, |
786 | 104x |
mspline = getMSPLINEest, |
787 | 104x |
binned = getBINNEDest, |
788 | 104x |
pava = getPAVAest, |
789 | 104x |
asis = getASISest |
790 |
) |
|
791 |
} |
|
792 | ||
793 |
# Generic and S3 methods for estimations |
|
794 |
getEst <- function(x, outcome, score) { |
|
795 | 54x |
UseMethod("getEst") |
796 |
} |
|
797 | ||
798 | ||
799 |
getEst.list <- function(x, outcome, score) { |
|
800 | 14x |
do.call( |
801 | 14x |
est.funs()[[x[["method"]]]], |
802 | 14x |
append(list(outcome = outcome, score = score), x[names(x) != "method"]) |
803 |
) |
|
804 |
} |
|
805 | ||
806 | ||
807 |
getEst.character <- function(x, outcome, score) { |
|
808 | 36x |
do.call( |
809 | 36x |
est.funs()[[x]], |
810 | 36x |
list(outcome = outcome, score = score) |
811 |
) |
|
812 |
} |
|
813 | ||
814 | ||
815 |
getEst.function <- function(x, outcome, score) { |
|
816 | ||
817 |
# Run estimation |
|
818 | 4x |
out <- x(outcome = outcome, score = score) |
819 | ||
820 |
# Check output |
|
821 | 4x |
check1 <- is.data.frame(out) && |
822 | 4x |
identical(c("estimate", "outcome", "percentile", "score"), sort(colnames(out))) |
823 | 4x |
if (!check1) { |
824 | 1x |
stop( |
825 | 1x |
paste( |
826 | 1x |
"User defined estimation functions must return a data.frame of 4 columns:", |
827 | 1x |
"score - the predictions,", |
828 | 1x |
"percentile - the percentile of score,", |
829 | 1x |
"outcome - the original outcome,", |
830 | 1x |
"and estimate - the estimated value" |
831 |
) |
|
832 |
) |
|
833 |
} |
|
834 |
|
|
835 | 3x |
check2 <- vapply(out, is.numeric, FUN.VALUE = logical(1), USE.NAMES = TRUE) |
836 | 3x |
if (!all(check2)) { |
837 | 1x |
stop( |
838 | 1x |
paste( |
839 | 1x |
"All columns of the returned data.frame in user defined estimation function", |
840 | 1x |
"must be numeric.", |
841 | 1x |
paste0("`", names(which(!check2)), "`", collapse = ", "), "is/are not numeric." |
842 |
) |
|
843 |
) |
|
844 |
} |
|
845 | ||
846 | 2x |
return(out) |
847 |
} |
|
848 | ||
849 |
# Generic and S3 methods for estimation method as string (actual estimation function used) |
|
850 |
getEstMethod <- function(x) { |
|
851 | 67x |
UseMethod("getEstMethod") |
852 |
} |
|
853 | ||
854 |
#' @export |
|
855 |
getEstMethod.character <- function(x) { |
|
856 | 43x |
x |
857 |
} |
|
858 | ||
859 |
#' @export |
|
860 |
getEstMethod.list <- function(x) { |
|
861 | 19x |
x[["method"]] |
862 |
} |
|
863 | ||
864 |
#' @export |
|
865 |
getEstMethod.function <- function(x) { |
|
866 | 5x |
"udf" |
867 |
} |
|
868 | ||
869 |
getEstMethods <- function(x, with.names) { |
|
870 | 48x |
vapply(x, getEstMethod, FUN.VALUE = character(1), USE.NAMES = with.names) |
871 |
} |
1 |
#' Risk profile plot |
|
2 |
#' |
|
3 |
#' Predictiveness curve, PPV, NPV and 1-NPV risk estimates |
|
4 |
#' |
|
5 |
#' @param outcome Vector of binary outcome for each observation. |
|
6 |
#' @param score Numeric vector of continuous predicted risk score. |
|
7 |
#' @param methods Character vector of method names (case-insensitive) for plotting curves or |
|
8 |
#' a named list where elements are method function and its arguments. |
|
9 |
#' Default is set to `"asis"`. |
|
10 |
#' |
|
11 |
#' Full options are: `c("asis", "binned", "pava", "mspline", "gam", "cgam")`. |
|
12 |
#' |
|
13 |
#' To specify arguments per method, use lists. For example: |
|
14 |
#' ``` |
|
15 |
#' list( |
|
16 |
#' pava = list(method = "pava", ties = "primary"), |
|
17 |
#' mspline = list(method = "mspline", fitonPerc = TRUE), |
|
18 |
#' gam = list(method = "gam", bs = "tp", logscores = FALSE), |
|
19 |
#' bin = list(method = "binned", bins = 10), |
|
20 |
#' risk = list(method = "asis") |
|
21 |
#' ) |
|
22 |
#' ``` |
|
23 |
#' See section "Estimation" for more details. |
|
24 |
#' @param prev.adj `NULL` (default) or scalar numeric between 0 and 1 for prevalence adjustment. |
|
25 |
#' @param show.prev Logical, show prevalence value in the graph. Defaults to `TRUE`. |
|
26 |
#' @param show.nonparam.pv Logical, show non-parametric calculation of PVs. Defaults to `TRUE`. |
|
27 |
#' @param show.best.pv Logical, show best possible PVs. Defaults to `TRUE`. |
|
28 |
#' @param include Character vector (case-insensitive, partial matching) specifying what quantities |
|
29 |
#' to include in the plot. |
|
30 |
#' |
|
31 |
#' Default is: `c("PC", "PPV", "1-NPV")`. |
|
32 |
#' |
|
33 |
#' Full options are: `c("NPV", "PC", "PPV", "1-NPV")`. |
|
34 |
#' @param plot.raw Logical to show percentiles or raw values. |
|
35 |
#' Defaults to `FALSE` (i.e. percentiles). |
|
36 |
#' @param rev.order Logical, reverse ordering of scores. Defaults to `FALSE`. |
|
37 |
#' |
|
38 |
#' @section Estimation: |
|
39 |
#' The `methods` argument specifies the estimation method. |
|
40 |
#' You can provide either a vector of strings, any of |
|
41 |
#' ``` |
|
42 |
#' c("asis", "binned", "pava", "mspline", "gam", "cgam") |
|
43 |
#' ``` |
|
44 |
#' (`"asis"` is not available for `calibrationProfile`), |
|
45 |
#' or a named list of lists. |
|
46 |
#' In the latter case, the inner list must have an element "method", |
|
47 |
#' which specifies the estimation function (one of those above), |
|
48 |
#' and optionally other elements, which are passed to the estimation function. |
|
49 |
#' For example: |
|
50 |
#' ``` |
|
51 |
#' list( |
|
52 |
#' gam = list(method = "gam", k = 3), |
|
53 |
#' c_gam = list(method = "cgam", numknots = 3) |
|
54 |
#' ) |
|
55 |
#' ``` |
|
56 |
#' |
|
57 |
#' To see what arguments are available for each estimation method, |
|
58 |
#' see the documentation of that function. |
|
59 |
#' The naming convention is `getXest`, |
|
60 |
#' where `X` stands for the estimation method, for example [getGAMest()]. |
|
61 |
#' |
|
62 |
#' "gam", "cgam", and "mspline" always fit on percentiles by default. |
|
63 |
#' To change this, use `fitonPerc = FALSE`, for example |
|
64 |
#' ``` |
|
65 |
#' list(gam = list(method = "gam", fitonPerc = FALSE)) |
|
66 |
#' ``` |
|
67 |
#' |
|
68 |
#' "gam" and "cgam" methods are wrappers of [mgcv::gam()] and [cgam::cgam()], respectively. |
|
69 |
#' The default values of function arguments (like `k`, the number of knots in [mgcv::s()]) |
|
70 |
#' mirror the package defaults. |
|
71 |
#' |
|
72 |
#' @return A list containing the plot and data, plus `errorbar` data if they were requested |
|
73 |
#' (through `"binned"` estimation method with a parameter `errorbar.sem`). |
|
74 |
#' |
|
75 |
#' @export |
|
76 |
#' |
|
77 |
#' @seealso [calibrationProfile()] [sensSpec()] |
|
78 |
#' |
|
79 |
#' [getPAVAest()] [getBINNEDest()] [getGAMest()] [getCGAMest()] [getMSPLINEest()] |
|
80 |
#' [getASISest()] |
|
81 |
#' |
|
82 |
#' @examples |
|
83 |
#' # Read in example data |
|
84 |
#' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc")) |
|
85 |
#' rscore <- auroc$predicted_calibrated |
|
86 |
#' truth <- as.numeric(auroc$actual) |
|
87 |
#' |
|
88 |
#' # Default plot includes 1-NPV, PPV, and a predictiveness curve (PC) based on risk-cutoff |
|
89 |
#' p1 <- riskProfile(outcome = truth, score = rscore) |
|
90 |
#' p1$plot |
|
91 |
#' p1$data |
|
92 |
#' |
|
93 |
#' # Show also NPV |
|
94 |
#' p2 <- riskProfile( |
|
95 |
#' outcome = truth, |
|
96 |
#' score = rscore, |
|
97 |
#' include = c("PC", "NPV", "PPV", "1-NPV") |
|
98 |
#' # or use partial matching: include = c("PC", "N", "PPV", "1") |
|
99 |
#' ) |
|
100 |
#' p2$plot |
|
101 |
#' p2$data |
|
102 |
#' |
|
103 |
#' # All estimates of prediction curve |
|
104 |
#' p3 <- riskProfile( |
|
105 |
#' outcome = truth, |
|
106 |
#' score = rscore, |
|
107 |
#' methods = c("mspline", "gam", "cgam", "binned", "pava", "asis"), |
|
108 |
#' include = c("PC", "PPV", "1-NPV") |
|
109 |
#' ) |
|
110 |
#' p3$plot |
|
111 |
#' |
|
112 |
#' # Specifying method arguments (note each list has a "method" element) |
|
113 |
#' p4 <- riskProfile( |
|
114 |
#' outcome = truth, |
|
115 |
#' score = rscore, |
|
116 |
#' methods = list( |
|
117 |
#' "gam" = list(method = "gam", bs = "tp", logscores = FALSE, fitonPerc = TRUE), |
|
118 |
#' "risk" = list(method = "asis"), # no available arguments for this method |
|
119 |
#' "bin" = list(method = "binned", quantiles = 10, errorbar.sem = 1.2) |
|
120 |
#' ) |
|
121 |
#' ) |
|
122 |
#' p4$plot |
|
123 |
#' |
|
124 |
#' # Compare multiple GAMs in terms of Predictiveness Curves |
|
125 |
#' p5 <- riskProfile( |
|
126 |
#' outcome = truth, |
|
127 |
#' score = rscore, |
|
128 |
#' methods = list( |
|
129 |
#' "gam_3" = list(method = "gam", k = 3), |
|
130 |
#' "gam_4" = list(method = "gam", k = 4), |
|
131 |
#' "gam_7" = list(method = "gam", k = 7) |
|
132 |
#' ), |
|
133 |
#' include = "PC" |
|
134 |
#' ) |
|
135 |
#' p5$plot |
|
136 |
#' |
|
137 |
#' # Using logistic regression as user-defined estimation function, fitting on percentiles |
|
138 |
#' # Function needs to take exactly these two arguments |
|
139 |
#' my_est <- function(outcome, score) { |
|
140 |
#' # Calculate percentiles |
|
141 |
#' perc <- ecdf(score)(score) |
|
142 |
#' # Fit |
|
143 |
#' m <- glm(outcome ~ perc, family = "binomial") |
|
144 |
#' # Generate predictions |
|
145 |
#' preds <- predict(m, type = "response") |
|
146 |
#' # Return a data.frame with exactly these columns |
|
147 |
#' return( |
|
148 |
#' data.frame( |
|
149 |
#' score = score, |
|
150 |
#' percentile = perc, |
|
151 |
#' outcome = outcome, |
|
152 |
#' estimate = preds |
|
153 |
#' ) |
|
154 |
#' ) |
|
155 |
#' } |
|
156 |
#' p6 <- riskProfile( |
|
157 |
#' outcome = truth, |
|
158 |
#' score = rscore, |
|
159 |
#' methods = list(my_lr = my_est) |
|
160 |
#' ) |
|
161 |
#' p6$plot |
|
162 |
#' |
|
163 |
#' # Using cgam as user-defined estimation function |
|
164 |
#' # Note that you can also use the predefined cgam using methods = "cgam" |
|
165 |
#' # Attach needed library |
|
166 |
#' # Watch out for masking of mgcv::s and cgam::s if both are attached |
|
167 |
#' library(cgam, quietly = TRUE) |
|
168 |
#' # Function needs to take exactly these two arguments |
|
169 |
#' my_est <- function(outcome, score) { |
|
170 |
#' # Fit on raw predictions with space = "E" |
|
171 |
#' m <- cgam( |
|
172 |
#' outcome ~ s.incr(score, numknots = 5, space = "E"), |
|
173 |
#' family = "binomial" |
|
174 |
#' ) |
|
175 |
#' # Generate predictions and convert to vector |
|
176 |
#' preds <- predict(m, type = "response")$fit |
|
177 |
#' # Return a data.frame with exactly these columns |
|
178 |
#' out <- data.frame( |
|
179 |
#' score = score, |
|
180 |
#' percentile = ecdf(score)(score), |
|
181 |
#' outcome = outcome, |
|
182 |
#' estimate = preds |
|
183 |
#' ) |
|
184 |
#' return(out) |
|
185 |
#' } |
|
186 |
#' |
|
187 |
#' p7 <- riskProfile( |
|
188 |
#' outcome = truth, |
|
189 |
#' score = rscore, |
|
190 |
#' methods = list(my_cgam = my_est) |
|
191 |
#' ) |
|
192 |
#' p7$plot |
|
193 |
#' |
|
194 |
#' # Prevalence adjustment to 0.1 |
|
195 |
#' p8 <- riskProfile(outcome = truth, score = rscore, prev.adj = 0.1) |
|
196 |
#' p8$plot |
|
197 |
#' |
|
198 |
riskProfile <- function(outcome, |
|
199 |
score, |
|
200 |
methods = "asis", |
|
201 |
prev.adj = NULL, |
|
202 |
show.prev = TRUE, |
|
203 |
show.nonparam.pv = TRUE, |
|
204 |
show.best.pv = TRUE, |
|
205 |
include = c("PC", "PPV", "1-NPV"), |
|
206 |
plot.raw = FALSE, |
|
207 |
rev.order = FALSE) { |
|
208 | ||
209 |
# Argument checks (except outcome, score, methods - below) |
|
210 | 15x |
checkmate::assert_number(prev.adj, lower = 0, upper = 1, null.ok = TRUE) |
211 | 15x |
checkmate::assert_flag(show.nonparam.pv) |
212 | 15x |
checkmate::assert_flag(show.best.pv) |
213 | 15x |
checkmate::assert_flag(show.prev) |
214 | 15x |
include <- matchArgSubset(toupper(include), choices = c("PC", "PPV", "NPV", "1-NPV")) |
215 | 15x |
checkmate::assert_flag(plot.raw) |
216 | 15x |
checkmate::assert_flag(rev.order) |
217 | ||
218 |
# Standardize/Check outcome, scores |
|
219 | 15x |
op <- inputCheck(outcome = outcome, score = score) |
220 | ||
221 |
# Order Data by scores |
|
222 | 15x |
tempdf <- orderInputs(outcome = op$outcome, score = op$score, rev.order = rev.order) |
223 | 15x |
score <- tempdf$score |
224 | 15x |
outcome <- tempdf$outcome |
225 | ||
226 |
# Check methods |
|
227 | 15x |
methods <- methodCheck(methods = methods) |
228 | 15x |
method.names <- names(methods) |
229 | ||
230 |
# Calculate prevalence and percentiles |
|
231 | 15x |
prev <- mean(outcome, na.rm = TRUE) |
232 | ||
233 |
# Get the plot settings |
|
234 | 15x |
show.pc <- "PC" %in% include |
235 | 15x |
show.pv <- any(c("PPV", "NPV", "1-NPV") %in% include) |
236 | 15x |
show.one.only <- sum(c("PC", "PPV", "NPV", "1-NPV") %in% include) == 1 |
237 | ||
238 | 15x |
if (plot.raw) { |
239 | 2x |
xvar <- "score" |
240 |
} else { |
|
241 | 13x |
xvar <- "percentile" |
242 |
} |
|
243 | ||
244 |
# Prediction Curve Data |
|
245 | 15x |
if (show.pc) { |
246 | 12x |
pc.ests <- getEsts(methods = methods, outcome = outcome, score = score) |
247 | 12x |
PC.data <- mutate(pc.ests$plotdata, pv = "PC") |
248 | 12x |
errorbar.data <- pc.ests$errorbardata |
249 | 12x |
step.methods <- method.names[pc.ests$idx.step] |
250 |
} else { |
|
251 | 3x |
PC.data <- data.frame( |
252 | 3x |
method = character(0), pv = character(0), |
253 | 3x |
percentile = numeric(0), score = numeric(0), estimate = numeric(0) |
254 |
) |
|
255 | 3x |
pc.ests <- errorbar.data <- NULL |
256 | 3x |
step.methods <- character(0) |
257 |
} |
|
258 | ||
259 |
# Predictive Value Data |
|
260 | 15x |
if (show.pv) { |
261 | 14x |
PV.data <- getPVdata(outcome = outcome, score = score, methods = methods, pc.ests = pc.ests) |
262 |
} else { |
|
263 | 1x |
PV.data <- data.frame( |
264 | 1x |
method = character(0), score = numeric(0), percentile = numeric(0), |
265 | 1x |
outcome = numeric(0), estimate = numeric(0), |
266 | 1x |
MNPV = numeric(0), NPV = numeric(0), PPV = numeric(0) |
267 |
) |
|
268 |
} |
|
269 | ||
270 |
# show.nonparam.pv |
|
271 | 15x |
if (show.nonparam.pv) { |
272 | 10x |
tmp <- nonParametricPV(outcome = outcome, score = score) %>% |
273 | 10x |
mutate(method = "non-parametric", estimate = NA) |
274 | 10x |
PV.data <- bind_rows(PV.data, tmp) |
275 |
} |
|
276 | ||
277 |
# Dataset of inputs |
|
278 | 15x |
df.in <- data.frame(outcome, score, percentile = ecdf(score)(score)) |
279 | ||
280 |
# Adjust based on user defined prevalence |
|
281 | 15x |
if (!is.null(prev.adj)) { |
282 | ||
283 | 1x |
cdf.cases <- ecdf(score[outcome == 1]) |
284 | 1x |
cdf.controls <- ecdf(score[outcome == 0]) |
285 | ||
286 | 1x |
df.in$percentile <- adjPrevPerc( |
287 | 1x |
perc = df.in$score, prev.new = prev.adj, |
288 | 1x |
cdf.case = cdf.cases, cdf.control = cdf.controls |
289 |
) |
|
290 | ||
291 | 1x |
if (show.pc) { |
292 | 1x |
PC.data <- adjPrevPC( |
293 | 1x |
dat = PC.data, prev = prev, prev.new = prev.adj, |
294 | 1x |
cdf.case = cdf.cases, cdf.control = cdf.controls |
295 |
) |
|
296 |
} |
|
297 | ||
298 | 1x |
if (show.pv) { |
299 | 1x |
PV.data <- adjPrevPV( |
300 | 1x |
dat = PV.data, prev = prev, prev.new = prev.adj, |
301 | 1x |
cdf.case = cdf.cases, cdf.control = cdf.controls |
302 |
) |
|
303 |
} |
|
304 | ||
305 | 1x |
prev <- prev.adj |
306 |
} |
|
307 | ||
308 |
# pivot PV data |
|
309 | 15x |
if (show.pv) { |
310 | 14x |
PV.data <- PV.data %>% |
311 | 14x |
rename(`1-NPV` = "MNPV") %>% |
312 | 14x |
tidyr::pivot_longer( |
313 | 14x |
cols = all_of(c("1-NPV", "NPV", "PPV")), names_to = "pv", values_to = "pvValue" |
314 |
) %>% |
|
315 | 14x |
mutate(pv = factor(.data$pv, levels = c("NPV", "1-NPV", "PPV"))) %>% |
316 | 14x |
arrange(.data$method, .data$pv, .data$percentile) |
317 |
} else { |
|
318 | 1x |
PV.data <- data.frame( |
319 | 1x |
method = character(0), score = numeric(0), percentile = numeric(0), |
320 | 1x |
outcome = numeric(0), estimate = numeric(0), |
321 | 1x |
pv = character(0), pvValue = numeric(0) |
322 |
) |
|
323 |
} |
|
324 | ||
325 |
# If showing percentiles, add row with 0th percentile |
|
326 | 15x |
if (!plot.raw) { |
327 | 13x |
if (show.pc) { |
328 | 10x |
PC.data <- add0thPercPC(PC.data) |
329 |
} |
|
330 | 13x |
if (show.pv) { |
331 | 12x |
PV.data <- add0thPercPV(PV.data) |
332 |
} |
|
333 |
} |
|
334 | ||
335 |
# Subset PC data |
|
336 | 15x |
smoothPC <- PC.data[!PC.data$method %in% step.methods, , drop = FALSE] |
337 | 15x |
stepPC <- PC.data[PC.data$method %in% step.methods, , drop = FALSE] |
338 | ||
339 |
# Subset PV data |
|
340 | 15x |
smoothPV <- PV.data[ |
341 | 15x |
PV.data$pv %in% include & !PV.data$method %in% step.methods, , |
342 | 15x |
drop = FALSE |
343 |
] |
|
344 | 15x |
stepPV <- PV.data[ |
345 | 15x |
PV.data$pv %in% include & PV.data$method %in% step.methods, , |
346 | 15x |
drop = FALSE |
347 |
] |
|
348 | ||
349 |
# Different aes based on what is to be shown |
|
350 |
# (if one kind of PV value, use both coloring and linetype for distinguishing estimation methods) |
|
351 | 15x |
if (show.one.only) { |
352 | 2x |
aes.pc <- aes( |
353 | 2x |
x = .data[[xvar]], y = .data$estimate, |
354 | 2x |
colour = .data$method, linetype = .data$method |
355 |
) |
|
356 | 2x |
aes.pv <- aes( |
357 | 2x |
x = .data[[xvar]], y = .data$pvValue, |
358 | 2x |
colour = .data$method, linetype = .data$method |
359 |
) |
|
360 |
} else { |
|
361 | 13x |
aes.pc <- aes( |
362 | 13x |
x = .data[[xvar]], y = .data$estimate, |
363 | 13x |
colour = .data$pv, linetype = .data$method |
364 |
) |
|
365 | 13x |
aes.pv <- aes( |
366 | 13x |
x = .data[[xvar]], y = .data$pvValue, |
367 | 13x |
colour = .data$pv, linetype = .data$method |
368 |
) |
|
369 |
} |
|
370 | ||
371 |
# Build plot |
|
372 | 15x |
p <- ggplot() + |
373 | 15x |
geom_line(aes.pc, data = smoothPC, alpha = 0.8) + |
374 | 15x |
geom_step(aes.pc, data = stepPC, alpha = 0.8, direction = "vh") + |
375 | 15x |
geom_line(aes.pv, data = smoothPV, alpha = 0.8) + |
376 | 15x |
geom_step(aes.pv, data = stepPV, alpha = 0.8, direction = "vh") + |
377 | 15x |
geom_hline(yintercept = prev, alpha = 0.8, col = "black", linetype = "dashed") |
378 | ||
379 |
# Best PVs |
|
380 | 15x |
if (show.best.pv) { |
381 | 11x |
if (!plot.raw) { |
382 | 9x |
df.in <- bind_rows(dplyr::tibble(percentile = 0, score = NA), df.in) |
383 |
} |
|
384 | 11x |
best <- df.in %>% |
385 | 11x |
select(all_of(c("percentile", "score"))) %>% |
386 | 11x |
distinct() %>% |
387 | 11x |
arrange(.data$percentile, .data$score) %>% |
388 | 11x |
mutate( |
389 | 11x |
PC = ifelse(.data$percentile <= 1 - prev, 0, 1), |
390 | 11x |
PPV = bestPPV(perc = .data$percentile, prev = prev), |
391 | 11x |
`1-NPV` = bestMNPV(perc = .data$percentile, prev = prev), |
392 | 11x |
NPV = 1 - .data$`1-NPV` |
393 |
) %>% |
|
394 | 11x |
tidyr::pivot_longer( |
395 | 11x |
cols = c("PC", "PPV", "1-NPV", "NPV"), names_to = "pv", values_to = "pvValue" |
396 |
) %>% |
|
397 | 11x |
filter(.data$pv %in% include) %>% |
398 | 11x |
mutate( |
399 | 11x |
method = "Best PVs", |
400 | 11x |
pv = paste("Best", .data$pv) |
401 |
) |
|
402 | ||
403 | 11x |
if (show.one.only) { |
404 | 1x |
p <- p + |
405 | 1x |
geom_line( |
406 | 1x |
data = best, |
407 | 1x |
aes(x = .data[[xvar]], y = .data$pvValue, linewidth = .data$pv), |
408 | 1x |
colour = "gray60" |
409 |
) + |
|
410 | 1x |
scale_linewidth_manual( |
411 | 1x |
values = c("Best 1-NPV" = 0.3, "Best PPV" = 0.3, "Best PC" = 0.3, "Best NPV" = 0.3), |
412 | 1x |
name = "Best PVs" |
413 |
) |
|
414 |
} else { |
|
415 | 10x |
p <- p + |
416 | 10x |
geom_line(data = best, aes(x = .data[[xvar]], y = .data$pvValue, colour = .data$pv)) |
417 |
} |
|
418 |
} |
|
419 | ||
420 |
# Add errorbars |
|
421 | 15x |
if (show.pc && !is.null(errorbar.data)) { |
422 | 1x |
p <- p + |
423 | 1x |
geom_point( |
424 | 1x |
data = errorbar.data, |
425 | 1x |
aes(x = .data$midquantile, y = .data$bin.mid), |
426 | 1x |
alpha = 0.8, |
427 | 1x |
size = 0.2 |
428 |
) + |
|
429 | 1x |
geom_errorbar( |
430 | 1x |
data = errorbar.data, |
431 | 1x |
aes( |
432 | 1x |
x = .data$midquantile, |
433 | 1x |
ymin = .data$bin.low, |
434 | 1x |
ymax = .data$bin.high, |
435 | 1x |
width = .02 |
436 |
), |
|
437 | 1x |
alpha = 0.7, |
438 | 1x |
linewidth = 0.2, |
439 | 1x |
inherit.aes = FALSE |
440 |
) |
|
441 |
} |
|
442 | ||
443 |
# Set always the same colours for PVs |
|
444 | 15x |
if (!show.one.only) { |
445 | 13x |
clrs <- predictionColours(include, show.best = show.best.pv) |
446 | 13x |
p <- p + scale_colour_manual(values = clrs, breaks = names(clrs)) |
447 |
} else { |
|
448 | 2x |
p <- p + scale_colour_hue(l = 45) |
449 |
} |
|
450 | ||
451 |
# Finalize plot |
|
452 | 15x |
p <- p + |
453 | 15x |
labs( |
454 | 15x |
title = "Predictiveness Plot", |
455 | 15x |
x = ifelse(plot.raw, "Prediction Score", "Risk Percentile"), |
456 | 15x |
y = "Predicted Risk / Predictive Value", |
457 | 15x |
linetype = "Estimation Method", |
458 | 15x |
colour = ifelse(show.one.only, "Estimation Method", "Predictive Quantity") |
459 |
) + |
|
460 | 15x |
scale_x_continuous(n.breaks = 6) + |
461 | 15x |
scale_y_continuous(n.breaks = 6) + |
462 | 15x |
theme_bw() + |
463 | 15x |
theme(legend.key.width = unit(2, "line")) |
464 |
|
|
465 |
# Add prevalence annotation if requested |
|
466 | 15x |
if (show.prev) { |
467 |
# x-value for plotting prevalence label |
|
468 | 15x |
prev_x <- ifelse(plot.raw, min(score), 0) |
469 | 15x |
prev_nudge_x <- ifelse(plot.raw, (max(score) - min(score)) / 10, 0.1) |
470 | 15x |
prev_nudge_y <- ggplot2::layer_scales(p)$y$get_limits()[2] / 10 |
471 |
|
|
472 | 15x |
p <- p + annotate( |
473 | 15x |
geom = "text", |
474 | 15x |
x = prev_x + prev_nudge_x, |
475 | 15x |
y = ifelse(prev > 0.8, prev - prev_nudge_y, prev + prev_nudge_y), |
476 | 15x |
label = paste0("Prevalence: ", "\n", round(prev, 3)), |
477 | 15x |
colour = "black", |
478 | 15x |
alpha = 0.8, |
479 | 15x |
size = 3.5 |
480 |
) |
|
481 |
} |
|
482 | ||
483 | 15x |
if (show.one.only) { |
484 | 2x |
p <- p + guides(colour = guide_legend(order = 1), linetype = guide_legend(order = 1)) |
485 |
} else { |
|
486 | 13x |
p <- p + guides(colour = guide_legend(order = 1), linetype = guide_legend(order = 2)) |
487 |
} |
|
488 | ||
489 | 15x |
if (show.best.pv) { |
490 | 11x |
PV.data <- bind_rows(PV.data, mutate(best, pv = gsub("Best ", "", .data$pv))) |
491 |
} |
|
492 | ||
493 | 15x |
return( |
494 | 15x |
list( |
495 | 15x |
plot = p, |
496 | 15x |
data = bind_rows( |
497 | 15x |
dplyr::as_tibble(PC.data), |
498 | 15x |
dplyr::as_tibble(PV.data) |
499 |
), |
|
500 | 15x |
errorbar = errorbar.data |
501 |
) |
|
502 |
) |
|
503 |
} |
1 |
#' Sensitivity and specificity plot |
|
2 |
#' |
|
3 |
#' Sensitivity and specificity risk estimates |
|
4 |
#' |
|
5 |
#' Given individual binary outcomes and scores, this function plots sensitivity and specificity |
|
6 |
#' (using each score as a cutoff) on their respective score percentiles. |
|
7 |
#' |
|
8 |
#' @inheritParams riskProfile |
|
9 |
#' @param show.best Logical; Include best possible sensitivity and specificity? Defaults to `TRUE`. |
|
10 |
#' |
|
11 |
#' @inheritSection riskProfile Estimation |
|
12 |
#' |
|
13 |
#' @return A list containing the plot and data. |
|
14 |
#' |
|
15 |
#' @export |
|
16 |
#' |
|
17 |
#' @seealso [riskProfile()] [calibrationProfile()] |
|
18 |
#' |
|
19 |
#' [getPAVAest()] [getBINNEDest()] [getGAMest()] [getCGAMest()] [getMSPLINEest()] |
|
20 |
#' [getASISest()] |
|
21 |
#' |
|
22 |
#' @examples |
|
23 |
#' # Read in example data |
|
24 |
#' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc")) |
|
25 |
#' rscore <- auroc$predicted_calibrated |
|
26 |
#' truth <- as.numeric(auroc$actual) |
|
27 |
#' |
|
28 |
#' # Plot sensitivity and specificity |
|
29 |
#' p1 <- sensSpec(outcome = truth, score = rscore) |
|
30 |
#' p1$plot |
|
31 |
#' |
|
32 |
#' # Same with smoothed estimates |
|
33 |
#' p2 <- sensSpec(outcome = truth, score = rscore, methods = c("asis", "gam")) |
|
34 |
#' p2$plot |
|
35 |
#' |
|
36 |
sensSpec <- function(outcome, |
|
37 |
score, |
|
38 |
methods = "asis", |
|
39 |
show.best = TRUE, |
|
40 |
plot.raw = FALSE, |
|
41 |
rev.order = FALSE) { |
|
42 | ||
43 |
# Argument checks |
|
44 | 7x |
checkmate::assert_flag(show.best) |
45 | 7x |
checkmate::assert_flag(plot.raw) |
46 | 7x |
checkmate::assert_flag(rev.order) |
47 | ||
48 |
# Check methods |
|
49 | 7x |
methods <- methodCheck(methods = methods) |
50 | ||
51 | 7x |
if (plot.raw) { |
52 | 2x |
xvar <- "score" |
53 |
} else { |
|
54 | 5x |
xvar <- "percentile" |
55 |
} |
|
56 | ||
57 |
# Standardize/Check outcome, scores |
|
58 | 7x |
op <- inputCheck(outcome = outcome, score = score) |
59 | ||
60 |
# Order Data by scores |
|
61 | 7x |
tempdf <- orderInputs(outcome = op$outcome, score = op$score, rev.order = rev.order) |
62 | 7x |
score <- tempdf$score |
63 | 7x |
outcome <- tempdf$outcome |
64 | ||
65 |
# Calculate spec and sens |
|
66 | 7x |
dat <- getEsts(outcome = outcome, score = score, methods = methods)$plotdata |
67 | 7x |
dat <- split(dat, dat$method) %>% |
68 | 7x |
lapply(\(d) nonParametricTR(outcome = d$outcome, score = d$estimate)) %>% |
69 | 7x |
bind_rows(.id = "method") |
70 | ||
71 |
# Plot |
|
72 | 7x |
p <- ggplot(dat) + |
73 | 7x |
geom_step( |
74 | 7x |
aes(x = .data[[xvar]], y = .data$value, colour = .data$method, linetype = .data$pf), |
75 | 7x |
direction = "hv" |
76 |
) |
|
77 | ||
78 |
# Add best possible sensitivity and specificity |
|
79 | 7x |
if (show.best) { |
80 | 7x |
prev <- mean(outcome) |
81 | 7x |
if (plot.raw) { |
82 | 2x |
best <- data.frame(percentile = ecdf(score)(score), score = score) |
83 |
} else { |
|
84 | 5x |
best <- data.frame(percentile = c(0, ecdf(score)(score)), score = c(NA, score)) |
85 |
} |
|
86 | 7x |
best <- best %>% |
87 | 7x |
distinct() %>% |
88 | 7x |
mutate( |
89 | 7x |
Sensitivity = bestSens(perc = .data$percentile, prev = prev), |
90 | 7x |
Specificity = bestSpec(perc = .data$percentile, prev = prev) |
91 |
) %>% |
|
92 | 7x |
tidyr::pivot_longer( |
93 | 7x |
cols = c("Sensitivity", "Specificity"), names_to = "pf", values_to = "value" |
94 |
) %>% |
|
95 | 7x |
mutate(method = "best") |
96 | 7x |
p <- p + |
97 | 7x |
geom_line( |
98 | 7x |
aes(x = .data[[xvar]], y = .data$value, linetype = .data$pf, linewidth = "Best Possible"), |
99 | 7x |
data = best, |
100 | 7x |
colour = "gray70" |
101 |
) + |
|
102 | 7x |
scale_linewidth_manual(values = c("Best Possible" = 0.5), name = NULL) |
103 |
} |
|
104 | ||
105 |
# Finalize plot |
|
106 | 7x |
p <- p + |
107 | 7x |
labs( |
108 | 7x |
title = "Sensitivity and Specificity Plot", |
109 | 7x |
x = ifelse(plot.raw, "Prediction Score", "Risk Percentile"), |
110 | 7x |
y = "True Positive / Negative Rate", |
111 | 7x |
colour = "Estimation Method", |
112 | 7x |
linetype = "Predictive Quantity" |
113 |
) + |
|
114 | 7x |
scale_x_continuous(n.breaks = 6) + |
115 | 7x |
scale_y_continuous(n.breaks = 6) + |
116 | 7x |
scale_linetype_manual(values = c("Sensitivity" = "solid", "Specificity" = "dashed")) + |
117 | 7x |
scale_colour_hue(l = 45) + |
118 | 7x |
theme_bw() + |
119 | 7x |
theme(legend.key.width = unit(2, "line")) + |
120 | 7x |
guides( |
121 | 7x |
linetype = guide_legend(order = 1, override.aes = list(colour = "black")), |
122 | 7x |
colour = guide_legend(order = 2), |
123 | 7x |
linewidth = guide_legend(order = 3) |
124 |
) |
|
125 | ||
126 | 7x |
if (show.best) { |
127 | 7x |
dat <- bind_rows(dat, best) |
128 |
} |
|
129 | ||
130 | 7x |
return(list(plot = p, data = dplyr::as_tibble(dat))) |
131 |
} |
1 |
#' Check Input Arguments |
|
2 |
#' |
|
3 |
#' Given outcomes, prediction scores, methods, checks for obvious issues, |
|
4 |
#' and returns warnings if needed. |
|
5 |
#' These may include unequal lengths, missing values, ensuring binary outcomes, |
|
6 |
#' and numeric scores, etc. |
|
7 |
#' |
|
8 |
#' @inheritParams riskProfile |
|
9 |
#' |
|
10 |
#' @return A list of containing standardized outcomes and predicted scores. |
|
11 |
#' |
|
12 |
#' @keywords internal |
|
13 |
#' @noRd |
|
14 |
#' |
|
15 |
#' @examples |
|
16 |
#' auroc <- read.csv(system.file("extdata", "sample.csv", package = "stats4phc")) |
|
17 |
#' rscore <- auroc$predicted |
|
18 |
#' truth <- as.numeric(auroc$actual) |
|
19 |
#' inputCheck(truth, rscore) |
|
20 |
#' |
|
21 |
inputCheck <- function(outcome, score) { |
|
22 | ||
23 |
# Predscore - check numeric |
|
24 | 41x |
if (!is.numeric(score)) { |
25 | 1x |
stop("'score' vector must be represented as a numeric.") |
26 |
} |
|
27 | ||
28 |
# Outcome - logical to numeric |
|
29 | 40x |
if (is.logical(outcome)) { |
30 | 33x |
outcome <- as.numeric(outcome) |
31 |
} |
|
32 | ||
33 |
# Outcome - check numeric |
|
34 | 40x |
if (!is.numeric(outcome)) { |
35 | 1x |
stop("'outcome' vector needs to be a numeric vector.") |
36 |
} |
|
37 | ||
38 |
# Check matching lengths |
|
39 | 39x |
if (length(outcome) != length(score)) { |
40 | 1x |
stop("'outcome' and 'score' must have the same lengths.") |
41 |
} |
|
42 | ||
43 |
# Need at least 3 observations |
|
44 | 38x |
if (length(outcome) < 3) { |
45 | 1x |
stop("Need at least 3 observations.") |
46 |
} |
|
47 | ||
48 |
# DROP NA's for data pairs |
|
49 | 37x |
if (any(is.na(outcome)) || any(is.na(score))) { |
50 | 2x |
warning("Observations with NA's are dropped") |
51 | 2x |
ind <- intersect( |
52 | 2x |
which(complete.cases(outcome)), |
53 | 2x |
which(complete.cases(score)) |
54 |
) |
|
55 | 2x |
outcome <- outcome[ind] |
56 | 2x |
score <- score[ind] |
57 |
} |
|
58 | ||
59 |
# Outcome - check incorrect values |
|
60 | 37x |
if (!all(outcome %in% 0:1)) { |
61 | 1x |
stop("'outcome' vector must be represented as binary 1 or 0.") |
62 |
} |
|
63 | ||
64 |
# Predscore - check low frequency of score values |
|
65 | 36x |
if (length(unique(score)) <= 5) { |
66 | 2x |
tbl <- table(score) |
67 | 2x |
if (any(tbl <= 3)) { |
68 | 2x |
warning( |
69 | 2x |
paste( |
70 | 2x |
"There is a low-occurrence value in `score` (", names(tbl)[tbl <= 3][1], ").", |
71 | 2x |
"The results may be unreliable." |
72 |
) |
|
73 |
) |
|
74 |
} |
|
75 |
} |
|
76 | ||
77 |
# Outcome - check low frequency |
|
78 | 36x |
if (any(prop.table(table(outcome)) <= 0.03)) { |
79 | 1x |
warning( |
80 | 1x |
paste( |
81 | 1x |
"There is a low frequency of one of the outcome classes (`prop.table(table(outcome))`).", |
82 | 1x |
"The results may be unreliable." |
83 |
) |
|
84 |
) |
|
85 |
} |
|
86 | ||
87 | 36x |
return(list(outcome = outcome, score = score)) |
88 |
} |
|
89 | ||
90 | ||
91 |
#' Order Inputs |
|
92 |
#' |
|
93 |
#' Given a vector of prediction scores, and outcome, the function orders by score. |
|
94 |
#' Option exists to reverse ordering, if lower scores correspond to higher rate of outcomes. |
|
95 |
#' |
|
96 |
#' @inheritParams riskProfile |
|
97 |
#' |
|
98 |
#' @return Returns an ordered and complete case dataframe of outcomes and score. |
|
99 |
#' |
|
100 |
#' @keywords internal |
|
101 |
#' @noRd |
|
102 |
#' |
|
103 |
orderInputs <- function(outcome, score, rev.order = FALSE) { |
|
104 | 33x |
tempdf <- data.frame(score = score, outcome = outcome) |
105 | 33x |
if (rev.order) { |
106 | 6x |
tempdf$score <- -tempdf$score |
107 |
} |
|
108 | 33x |
tempdf <- tempdf[order(tempdf$score, tempdf$outcome), ] |
109 | 33x |
return(tempdf) |
110 |
} |
|
111 | ||
112 |
#' Method Check |
|
113 |
#' |
|
114 |
#' Usage: |
|
115 |
#' 1. methods <- methodCheck(methods) |
|
116 |
#' 2. getEstMethod(methods[[1]]) or getEstMethods(methods) |
|
117 |
#' 3. getEst(methods, ...) |
|
118 |
#' |
|
119 |
#' @inheritParams riskProfile |
|
120 |
#' |
|
121 |
#' @return list of named lists of method arguments |
|
122 |
#' |
|
123 |
#' @keywords internal |
|
124 |
#' @noRd |
|
125 |
#' |
|
126 |
methodCheck <- function(methods) { |
|
127 | ||
128 |
# Drop empty strings |
|
129 | 54x |
if (any(methods == "")) { |
130 | 1x |
methods <- methods[methods != ""] |
131 |
} |
|
132 | ||
133 |
# If character is supplied |
|
134 | 54x |
if (is.character(methods)) { |
135 | ||
136 |
# Convert to lower-case |
|
137 | 32x |
orig.names <- methods |
138 | 32x |
methods <- tolower(methods) |
139 | ||
140 |
# Check uniqueness |
|
141 | 32x |
if (length(unique(methods)) != length(methods)) { |
142 | 1x |
stop("`methods` should be unique when specified as character.") |
143 |
} |
|
144 | ||
145 |
# Check available method |
|
146 | 31x |
if (!all(methods %in% names(est.funs()))) { |
147 | 2x |
stop( |
148 | 2x |
paste( |
149 | 2x |
"Supplied method is not yet available. Try selecting from (case insensitive): ", |
150 | 2x |
paste(shQuote(names(est.funs())), collapse = ", ") |
151 |
) |
|
152 |
) |
|
153 |
} |
|
154 | ||
155 |
# Convert to list |
|
156 | 29x |
methods <- structure(as.list(methods), names = orig.names) |
157 | ||
158 |
# Otherwise, if a list is provided: |
|
159 |
# list(gam1 = list(method = "gam", k = 3), pv = list(method = "pava", ...), etc) |
|
160 |
# or list(my_estimate = my_fun(outcome, score) {...}) |
|
161 | 22x |
} else if (is.list(methods)) { |
162 | ||
163 |
# Check named list |
|
164 | 20x |
if (!checkmate::test_named(methods, type = "unique")) { |
165 | 1x |
stop("'methods' should be a uniquely named list.") |
166 |
} |
|
167 | ||
168 |
# Check and unify estimation method names |
|
169 | 19x |
methods <- lapply(methods, listMethodCheck) |
170 | ||
171 |
} else { |
|
172 | 2x |
stop( |
173 | 2x |
paste( |
174 | 2x |
"`methods` must be character", |
175 | 2x |
"or named list of estimation methods / user defined functions." |
176 |
) |
|
177 |
) |
|
178 |
} |
|
179 | ||
180 | 42x |
return(methods) |
181 |
} |
|
182 | ||
183 |
# x is a single method (element of an outer list) |
|
184 |
listMethodCheck <- function(x) { |
|
185 | 25x |
UseMethod("listMethodCheck") |
186 |
} |
|
187 | ||
188 |
#' @export |
|
189 |
listMethodCheck.list <- function(x) { |
|
190 | ||
191 |
# Check named list |
|
192 | 20x |
if (!checkmate::test_named(x, type = "unique")) { |
193 | 1x |
stop("Inner lists in the 'methods' argument must be named.") |
194 |
} |
|
195 | ||
196 |
# Check "method" element existing in the list |
|
197 | 19x |
if (is.null(x[["method"]]) || !is.character(x[["method"]])) { |
198 | 2x |
stop( |
199 | 2x |
paste( |
200 | 2x |
'All lists must have a "method" element specifying one of the predefined', |
201 | 2x |
"estimation functions as a string. Please select from:", |
202 | 2x |
paste(shQuote(names(est.funs())), collapse = ", ") |
203 |
) |
|
204 |
) |
|
205 |
} |
|
206 | ||
207 |
# Update method name |
|
208 | 17x |
x[["method"]] <- tolower(x[["method"]]) |
209 | ||
210 |
# Check available method |
|
211 | 17x |
chck_available <- checkmate::test_subset( |
212 | 17x |
x[["method"]], |
213 | 17x |
choices = names(est.funs()), empty.ok = FALSE |
214 |
) |
|
215 | 17x |
if (!chck_available) { |
216 | 2x |
stop( |
217 | 2x |
paste0( |
218 | 2x |
"Supplied method '", |
219 | 2x |
x[["method"]], |
220 | 2x |
"' is not yet available. Try selecting from: ", |
221 | 2x |
paste(shQuote(names(est.funs())), collapse = ", ") |
222 |
) |
|
223 |
) |
|
224 |
} |
|
225 | ||
226 | 15x |
return(x) |
227 |
} |
|
228 | ||
229 |
#' @export |
|
230 |
listMethodCheck.function <- function(x) { |
|
231 | ||
232 |
# Check input arguments |
|
233 | 4x |
if (!checkmate::test_function(x, args = c("outcome", "score"))) { |
234 | 2x |
stop( |
235 | 2x |
paste( |
236 | 2x |
"All user defined estimation functions need to take exactly two arguments:", |
237 | 2x |
"'outcome' and 'score'." |
238 |
) |
|
239 |
) |
|
240 |
} |
|
241 | ||
242 | 2x |
return(x) |
243 |
} |
|
244 | ||
245 |
#' @export |
|
246 |
listMethodCheck.default <- function(x) { |
|
247 | 1x |
stop( |
248 | 1x |
paste( |
249 | 1x |
"The estimation method must be specified as a list or a function." |
250 |
) |
|
251 |
) |
|
252 |
} |
|
253 | ||
254 |
# compared to match.arg(..., several.ok = T), |
|
255 |
# this does not allow NULL and returns a vector of the same length |
|
256 |
matchArgSubset <- function(x, choices) { |
|
257 | 24x |
checkmate::assert_character(x, any.missing = FALSE, min.len = 1) |
258 | 24x |
out <- c() |
259 | 24x |
for (ag in x) { |
260 | 60x |
matched <- tryCatch( |
261 | 60x |
match.arg(ag, choices = choices), |
262 | 60x |
error = function(e) stop(sub("'arg'", paste0("'", ag, "'"), e)) |
263 |
) |
|
264 | 60x |
out <- c(out, matched) |
265 |
} |
|
266 | 24x |
return(unique(out)) |
267 |
} |
|
268 | ||
269 | ||
270 |
add0thPercPC <- function(x) { |
|
271 | 11x |
bind_rows( |
272 | 11x |
x, |
273 | 11x |
x %>% |
274 | 11x |
group_by(.data$method) %>% |
275 | 11x |
summarise( |
276 | 11x |
score = NA, |
277 | 11x |
percentile = 0, |
278 | 11x |
outcome = NA, |
279 | 11x |
estimate = .data$estimate[.data$percentile == min(.data$percentile)][1], |
280 | 11x |
pv = "PC", |
281 | 11x |
.groups = "drop" |
282 |
) |
|
283 |
) %>% |
|
284 | 11x |
arrange(.data$method, .data$percentile) |
285 |
} |
|
286 | ||
287 | ||
288 |
add0thPercPV <- function(x) { |
|
289 | 13x |
bind_rows( |
290 | 13x |
x, |
291 | 13x |
x %>% |
292 | 13x |
group_by(.data$method, .data$pv) %>% |
293 | 13x |
summarise( |
294 | 13x |
score = NA, |
295 | 13x |
percentile = 0, |
296 | 13x |
estimate = NA, |
297 | 13x |
pvValue = dplyr::first(.data$pvValue), |
298 | 13x |
.groups = "drop" |
299 |
) %>% |
|
300 | 13x |
dplyr::relocate("pv", .before = "pvValue") |
301 |
) %>% |
|
302 | 13x |
arrange(.data$method, .data$pv, .data$percentile) |
303 |
} |
|
304 | ||
305 | ||
306 |
#' For snapshot testing of graphs |
|
307 |
#' |
|
308 |
#' @param code Code to create a graph |
|
309 |
#' @param width Width of the plot. |
|
310 |
#' @param height Height of the plot. |
|
311 |
#' |
|
312 |
#' @return Filepath |
|
313 |
#' |
|
314 |
#' @keywords internal |
|
315 |
#' @noRd |
|
316 |
#' |
|
317 |
#' @examples |
|
318 |
#' expect_snapshot_file(save_png(ggplot(mtcars) + |
|
319 |
#' geom_point(aes(hp, mpg))), "riskProfile.png") |
|
320 |
#' |
|
321 |
save_png <- function(code, width = 400, height = 400) { # nocov start |
|
322 |
path <- tempfile(fileext = ".png") |
|
323 |
png(path, width = width, height = height) |
|
324 |
on.exit(dev.off()) |
|
325 |
print(code) |
|
326 |
return(path) |
|
327 |
} # nocov end |
1 | ||
2 |
#' zipFastener for two dataframes of unequal length |
|
3 |
#' |
|
4 |
#' The following function acts like a “zip fastener” for combining two dataframes. |
|
5 |
#' It takes the first row of the first data frame and places it above of |
|
6 |
#' the first row of the second data frame and so on. |
|
7 |
#' |
|
8 |
#' @param df1 dataframe 1 |
|
9 |
#' @param df2 dataframe 2 |
|
10 |
#' |
|
11 |
#' @return Zipped data.frame |
|
12 |
#' |
|
13 |
#' @keywords internal |
|
14 |
#' @noRd |
|
15 |
#' |
|
16 |
#' @examples |
|
17 |
#' df1 <- data.frame(a = 1:3, b = 1:3, c = 1:3) |
|
18 |
#' df2 <- data.frame(a = letters[1:3], b = letters[1:3], c = letters[1:3]) |
|
19 |
#' |
|
20 |
#' zipFastener(df1, df2) |
|
21 |
#' |
|
22 |
zipFastener <- function(df1, df2) { |
|
23 | ||
24 | 8x |
if (ncol(df1) != ncol(df2)) { |
25 | 1x |
stop("the no. of columns has to be equal to merge them by zip feeding") |
26 |
} |
|
27 | ||
28 | 7x |
d1 <- nrow(df1) |
29 | 7x |
d2 <- nrow(df2) |
30 | ||
31 | 7x |
if (d1 != d2) { |
32 | 1x |
stop("the no. of rows has to be equal to merge them by zip feeding") |
33 |
} |
|
34 | ||
35 |
# zip fastener preperations |
|
36 | 6x |
i1 <- 1:d1 # index vector 1 |
37 | 6x |
i2 <- 1:d1 + d1 # index vector 2 |
38 | ||
39 |
# zip fastener operations |
|
40 | 6x |
index <- as.vector(matrix(c(i1, i2), ncol = d1, byrow = TRUE)) |
41 | 6x |
index <- index[!is.na(index)] # remove NAs |
42 | ||
43 | 6x |
colnames(df2) <- colnames(df1) # keep 1st colnames |
44 | 6x |
res <- rbind(df1, df2)[index, ] # reorder data frame |
45 | ||
46 | 6x |
return(res) |
47 |
} |
|
48 | ||
49 | ||
50 |
#' last observation carried forward: for binned |
|
51 |
#' |
|
52 |
#' Function assumes columns in data frame are: percentile, estimate, method |
|
53 |
#' |
|
54 |
#' @param df dataframe containing predcurve estimates |
|
55 |
#' |
|
56 |
#' @return dataframe with adjusted points |
|
57 |
#' |
|
58 |
#' @keywords internal |
|
59 |
#' @noRd |
|
60 |
#' |
|
61 |
locf.binned <- function(df) { |
|
62 | ||
63 | 4x |
df.copy <- df |
64 |
# keep same x, slide y back one spot |
|
65 | 4x |
df.copy$estimate <- df.copy$estimate[c(2:length(df.copy$estimate), NA)] |
66 | 4x |
combined.df <- zipFastener(df, df.copy) |
67 | 4x |
combined.df <- combined.df[-nrow(combined.df), ] # removes a redundant point |
68 | ||
69 | 4x |
return(combined.df) |
70 |
} |
|
71 | ||
72 |
#' last observation carried forward: for pava |
|
73 |
#' |
|
74 |
#' Function assumes columns in data frame are: percentile, estimate, method |
|
75 |
#' |
|
76 |
#' @param df dataframe containing predcurve estimates |
|
77 |
#' |
|
78 |
#' @return dataframe with adjusted points |
|
79 |
#' |
|
80 |
#' @keywords internal |
|
81 |
#' @noRd |
|
82 |
#' |
|
83 |
locf.pava <- function(df) { |
|
84 | ||
85 |
# save first and last points |
|
86 | 2x |
first <- df[1, ] |
87 | 2x |
last <- df[nrow(df) - 1, ] |
88 | ||
89 |
# adjust data to remove extra points |
|
90 | 2x |
diff_y <- diff(c(0, df$estimate)) |
91 | 2x |
pava.int <- df[diff_y != 0, ] |
92 | ||
93 |
# re-add first and last points |
|
94 | 2x |
df <- rbind(first, pava.int, last) |
95 | 2x |
df <- df[order(df$percentile), ] # order on percentile |
96 | ||
97 | 2x |
df.copy <- df |
98 |
# keep same x, slide y back one spot |
|
99 | 2x |
df.copy$percentile <- df.copy$percentile[ |
100 | 2x |
c(2:(length(df.copy$percentile)), NA) |
101 |
] |
|
102 | 2x |
combined.df <- zipFastener(df, df.copy) |
103 | ||
104 |
# Combine smooth estimate to fixed df |
|
105 | 2x |
combined.df <- na.omit(combined.df) |
106 | 2x |
combined.df <- combined.df[-nrow(combined.df), ] # removes a redundant point |
107 | ||
108 | 2x |
return(combined.df) |
109 |
} |
|
110 | ||
111 |
#' Last observation carried forward for specific estimation methods |
|
112 |
#' |
|
113 |
#' Iterates locf for multiple methods: calls locf.binned and locf.pava |
|
114 |
#' |
|
115 |
#' @param df dataframe containing predcurve estimates |
|
116 |
#' @param method.binned vector of "binned" methods |
|
117 |
#' @param method.pava vector of "pava" methods |
|
118 |
#' |
|
119 |
#' @return dataframe with adjusted points |
|
120 |
#' |
|
121 |
#' @keywords internal |
|
122 |
#' @noRd |
|
123 |
#' |
|
124 |
locf <- function(df, method.binned, method.pava) { |
|
125 | ||
126 | 4x |
complete.data <- df[!df$method %in% c(method.binned, method.pava), ] |
127 | 4x |
binned.data <- df[df$method %in% method.binned, ] |
128 | 4x |
pava.data <- df[df$method %in% method.pava, ] |
129 | ||
130 |
# pava locf |
|
131 | 4x |
if (nrow(pava.data) >= 1) { |
132 | 2x |
pava.plotdata <- locf.pava(pava.data) |
133 | 2x |
complete.data <- bind_rows(complete.data, pava.plotdata) |
134 |
} |
|
135 | ||
136 |
# binned locf |
|
137 | 4x |
if (nrow(binned.data) >= 1) { |
138 | 4x |
binned.plotdata <- locf.binned(binned.data) |
139 | 4x |
complete.data <- bind_rows(complete.data, binned.plotdata) |
140 |
} |
|
141 | 4x |
rownames(complete.data) <- NULL |
142 | ||
143 | 4x |
return(complete.data) |
144 |
} |
1 | ||
2 |
# Calculate Predictive Value (PV) Estimates |
|
3 |
getPVdata <- function(methods, outcome, score, pc.ests = NULL) { |
|
4 | ||
5 |
# Calculate estimates if not provided |
|
6 | 16x |
if (is.null(pc.ests)) { |
7 | 5x |
pc.ests <- getEsts(outcome = outcome, score = score, methods = methods) |
8 | 5x |
pc.data <- pc.ests$plotdata |
9 |
} else { |
|
10 | 11x |
pc.data <- pc.ests$plotdata |
11 |
} |
|
12 | ||
13 |
# locf and zip for pava and/or binned data |
|
14 | 16x |
if (any(pc.ests$idx.step)) { |
15 | 4x |
name.binned <- names(methods)[pc.ests$idx.binned] |
16 | 4x |
name.pava <- names(methods)[pc.ests$idx.pava] |
17 | 4x |
pc.data <- locf(pc.data, method.binned = name.binned, method.pava = name.pava) |
18 |
} |
|
19 | ||
20 |
# Calculate PV |
|
21 | 16x |
PV.data <- parametricPV(pc.data = pc.data) |
22 | ||
23 | 16x |
return(as.data.frame(PV.data)) |
24 |
} |
|
25 | ||
26 | ||
27 |
# Calculates PV values from cutoff thresholds |
|
28 |
nonParametricPV <- function(outcome, score) { |
|
29 | ||
30 | 11x |
prev <- mean(outcome, na.rm = TRUE) |
31 | ||
32 | 11x |
thresh.predictions <- lapply(score, function(x) as.numeric(score > x)) |
33 | ||
34 | 11x |
ppv <- vapply( |
35 | 11x |
thresh.predictions[1:(length(score) - 1)], |
36 | 11x |
function(x) { |
37 | 429x |
yardstick::ppv_vec( |
38 | 429x |
truth = factor(outcome, levels = c("1", "0")), |
39 | 429x |
estimate = factor(x, levels = c("1", "0")), |
40 | 429x |
event_level = "first" |
41 |
) |
|
42 |
}, |
|
43 | 11x |
numeric(1) |
44 |
) |
|
45 | ||
46 | 11x |
npv <- vapply( |
47 | 11x |
thresh.predictions[1:(length(score) - 1)], |
48 | 11x |
function(x) { |
49 | 429x |
yardstick::npv_vec( |
50 | 429x |
truth = factor(outcome, levels = c("1", "0")), |
51 | 429x |
estimate = factor(x, levels = c("1", "0")), |
52 | 429x |
event_level = "first" |
53 |
) |
|
54 |
}, |
|
55 | 11x |
numeric(1) |
56 |
) |
|
57 | ||
58 | 11x |
threshold.data <- data.frame( |
59 | 11x |
score = score, |
60 | 11x |
percentile = ecdf(score)(score), |
61 | 11x |
PPV = c(prev, ppv), |
62 | 11x |
NPV = c(npv, 1 - prev) |
63 |
) %>% |
|
64 | 11x |
mutate(MNPV = 1 - .data$NPV) %>% |
65 | 11x |
tidyr::fill( |
66 | 11x |
all_of(c("PPV", "NPV", "MNPV")), |
67 | 11x |
.direction = "downup" |
68 |
) |
|
69 | ||
70 | 11x |
return(threshold.data) |
71 |
} |
|
72 | ||
73 | ||
74 |
# Calculates PV values based on pracma::cumtrapz |
|
75 |
parametricPV <- function(pc.data) { |
|
76 | ||
77 |
# Calculate PVs |
|
78 | 16x |
PV.data <- pc.data %>% |
79 | 16x |
group_by(.data$method) %>% |
80 | 16x |
mutate( |
81 | 16x |
MNPV = pracma::cumtrapz(.data$percentile, .data$estimate)[, 1] / .data$percentile, |
82 | 16x |
NPV = 1 - .data$MNPV, |
83 | 16x |
PPV = |
84 |
( |
|
85 | 16x |
max(pracma::cumtrapz(.data$percentile, .data$estimate), na.rm = TRUE) - |
86 | 16x |
pracma::cumtrapz(.data$percentile, .data$estimate)[, 1] |
87 | 16x |
) / (max(.data$percentile) - .data$percentile) |
88 |
) %>% |
|
89 | 16x |
ungroup() %>% |
90 | 16x |
arrange(.data$method, .data$percentile) |
91 | ||
92 |
# Fill NAs (starting points) |
|
93 | 16x |
PV.data <- PV.data %>% |
94 | 16x |
group_by(.data$method) %>% |
95 | 16x |
mutate( |
96 | 16x |
MNPV = ifelse(.data$percentile == min(.data$percentile) & is.na(.data$MNPV), 0, .data$MNPV), |
97 | 16x |
NPV = ifelse(.data$percentile == min(.data$percentile) & is.na(.data$NPV), 1, .data$NPV) |
98 |
) %>% |
|
99 | 16x |
as.data.frame() |
100 | ||
101 |
# Fill NAs (ending points) |
|
102 | 16x |
PV.data <- tidyr::fill( |
103 | 16x |
PV.data, |
104 | 16x |
all_of(c("estimate", "MNPV", "NPV", "PPV")), |
105 | 16x |
.direction = "down" |
106 |
) |
|
107 | ||
108 | 16x |
return(as.data.frame(PV.data)) |
109 |
} |
|
110 | ||
111 | ||
112 |
# consistent colors for PVs |
|
113 |
predictionColours <- function(x, show.best) { |
|
114 | 13x |
clrs <- c( |
115 | 13x |
"Best NPV" = "gray80", |
116 | 13x |
"NPV" = "#53B400", |
117 | 13x |
"Best PPV" = "gray65", |
118 | 13x |
"PPV" = "red", |
119 | 13x |
"Best PC" = "black", |
120 | 13x |
"PC" = "plum", |
121 | 13x |
"1-NPV" = "royalblue2", |
122 | 13x |
"Best 1-NPV" = "gray50" |
123 |
) |
|
124 | 13x |
if (show.best) { |
125 | 10x |
x <- c(x, paste("Best", x)) |
126 |
} |
|
127 | 13x |
return(clrs[names(clrs) %in% x]) # keep the ordering above |
128 |
} |
|
129 | ||
130 | ||
131 |
# Calculates sensitivity and specificity (true positive / negative rate) |
|
132 |
nonParametricTR <- function(outcome, score) { |
|
133 | ||
134 |
# Tresh_predictions are lists of 0,1's based on each finer_rscore as a cutpoint |
|
135 | 9x |
thresh.predictions <- lapply(score, function(x) as.numeric(score > x)) |
136 | ||
137 |
# Calc sensitivities and specificities at each risk percentile threshold |
|
138 | 9x |
senses <- vapply( |
139 | 9x |
thresh.predictions[1:(length(score) - 1)], |
140 | 9x |
function(x) { |
141 | 351x |
yardstick::sens_vec( |
142 | 351x |
truth = factor(outcome, levels = c("1", "0")), |
143 | 351x |
estimate = factor(x, levels = c("1", "0")), |
144 | 351x |
event_level = "first" |
145 |
) |
|
146 |
}, |
|
147 | 9x |
numeric(1) |
148 |
) |
|
149 | ||
150 | 9x |
specs <- vapply( |
151 | 9x |
thresh.predictions[1:(length(score) - 1)], |
152 | 9x |
function(x) { |
153 | 351x |
yardstick::spec_vec( |
154 | 351x |
truth = factor(outcome, levels = c("1", "0")), |
155 | 351x |
estimate = factor(x, levels = c("1", "0")), |
156 | 351x |
event_level = "first" |
157 |
) |
|
158 |
}, |
|
159 | 9x |
numeric(1) |
160 |
) |
|
161 | ||
162 |
# Create a data.frame |
|
163 | 9x |
dat <- data.frame( |
164 | 9x |
score = c(min(score), score), |
165 | 9x |
percentile = c(0, ecdf(score)(score)), |
166 | 9x |
Sensitivity = c(1, senses, 0), |
167 | 9x |
Specificity = c(0, specs, 1) |
168 |
) %>% |
|
169 | 9x |
tidyr::pivot_longer( |
170 | 9x |
cols = c("Sensitivity", "Specificity"), |
171 | 9x |
names_to = "pf", |
172 | 9x |
values_to = "value" |
173 |
) |
|
174 | ||
175 | 9x |
return(as.data.frame(dat)) |
176 |
} |
|
177 | ||
178 | ||
179 | ||
180 |
adjPrevPerc <- function(perc, prev.new, cdf.case, cdf.control) { |
|
181 | 4x |
prev.new * cdf.case(perc) + (1 - prev.new) * cdf.control(perc) |
182 |
} |
|
183 | ||
184 |
adjPrevEst <- function(x, prev, prev.new) { |
|
185 | 5x |
f2 <- prev / (1 - prev) |
186 | 5x |
f3 <- (1 - prev.new) / prev.new |
187 | 5x |
1 / (((1 / x) - 1) * f2 * f3 + 1) |
188 |
} |
|
189 | ||
190 |
adjPrevPC <- function(dat, prev, prev.new, cdf.case, cdf.control) { |
|
191 | 1x |
mutate( |
192 | 1x |
dat, |
193 | 1x |
percentile = adjPrevPerc( |
194 | 1x |
perc = .data$score, prev.new = prev.new, |
195 | 1x |
cdf.case = cdf.case, cdf.control = cdf.control |
196 |
), |
|
197 | 1x |
estimate = adjPrevEst(x = .data$estimate, prev = prev, prev.new = prev.new) |
198 |
) |
|
199 |
} |
|
200 | ||
201 |
adjPrevPV <- function(dat, prev, prev.new, cdf.case, cdf.control) { |
|
202 | 1x |
mutate( |
203 | 1x |
dat, |
204 | 1x |
percentile = adjPrevPerc( |
205 | 1x |
perc = .data$score, prev.new = prev.new, |
206 | 1x |
cdf.case = cdf.case, cdf.control = cdf.control |
207 |
), |
|
208 | 1x |
MNPV = adjPrevEst(x = .data$MNPV, prev = prev, prev.new = prev.new), |
209 | 1x |
NPV = 1 - .data$MNPV, |
210 | 1x |
PPV = adjPrevEst(x = .data$PPV, prev = prev, prev.new = prev.new) |
211 |
) |
|
212 |
} |
|
213 | ||
214 | ||
215 | ||
216 |
bestPPV <- function(perc, prev) { |
|
217 | 12x |
ifelse(perc > 1 - prev, 1, prev / (1 - perc)) |
218 |
} |
|
219 | ||
220 |
bestMNPV <- function(perc, prev) { |
|
221 | 12x |
ifelse(perc <= 1 - prev, 0, 1 - ((1 - prev) / perc)) |
222 |
} |
|
223 | ||
224 |
bestSens <- function(perc, prev) { |
|
225 | 8x |
ifelse(perc <= 1 - prev, 1, (1 - perc) / prev) |
226 |
} |
|
227 | ||
228 |
bestSpec <- function(perc, prev) { |
|
229 | 8x |
ifelse(perc > 1 - prev, 1, perc / (1 - prev)) |
230 |
} |