diff --git a/NAMESPACE b/NAMESPACE index d7028b2..ae1f16b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,6 +2,10 @@ S3method("[",harp_fcst) S3method(arrange,harp_fcst) +S3method(bin_fcst_obs,harp_det_point_df) +S3method(bin_fcst_obs,harp_ens_point_df) +S3method(bin_fcst_obs,harp_ens_point_df_long) +S3method(bin_fcst_obs,harp_list) S3method(bind_point_verif,harp_verif) S3method(bind_point_verif,list) S3method(c,harp_fcst) @@ -56,6 +60,7 @@ S3method(spread_members,default) S3method(spread_members,harp_fcst) S3method(transmute,harp_fcst) export("%>%") +export(bin_fcst_obs) export(bind_bootstrap_score) export(bind_point_verif) export(bind_rows.harp_fcst) diff --git a/R/det_verify.R b/R/det_verify.R index 2e4d227..702c002 100644 --- a/R/det_verify.R +++ b/R/det_verify.R @@ -1,23 +1,6 @@ #' Compute verification scores for deterministic forecasts. #' -#' @param .fcst A \code{harp_fcst} object with tables that have a column for -#' observations, or a single forecast table. -#' @param parameter The name of the column for the observed data. -#' @param thresholds A numeric vector of thresholds for which to compute the -#' threshold based scores. Set to NULL (the default) to only compute summary -#' scores. -#' @param groupings The groups for which to compute the scores. See -#' \link[dplyr]{group_by} for more information of how grouping works. -#' @param circle If set the parameter is assumed to be cyclic for bias -#' calculations. Should be this distance around a circle in the units of the -#' parameter, so would typically have a value of 360 for degrees or `2 * pi` -#' for radians. -#' @param show_progress Logical - whether to show a progress bar. The default is -#' `TRUE` -#' @param ... Reserved for methods -#' `TRUE`. -#' @return A list containting two data frames: \code{det_summary_scores} and -#' \code{det_threshold_scores}. +#' @inheritParams ens_verify #' @export #' #' @examples @@ -27,6 +10,7 @@ det_verify <- function( thresholds = NULL, groupings = "lead_time", circle = NULL, + num_bins = 30, show_progress = TRUE, ... ) { @@ -48,6 +32,7 @@ det_verify.harp_ens_point_df <- function( thresholds = NULL, groupings = "lead_time", circle = NULL, + num_bins = 30, show_progress = TRUE, fcst_model = NULL, ... @@ -62,15 +47,14 @@ det_verify.harp_ens_point_df <- function( groupings <- purrr::map(groupings, ~union(c("sub_model", "member"), .x)) det_verify( - .fcst, {{parameter}}, thresholds, groupings, circle, + .fcst, {{parameter}}, thresholds, groupings, circle, num_bins, show_progress, fcst_model ) } -#' @param fcst_model The name of the forecast model to use in the `fcst_model` -#' column of the output. If the function is dispatched on a `harp_list` -#' object, the names of the `harp_list` are automatically used. +#' @rdname det_verify +#' @inheritParams ens_verify.harp_ens_point_df #' @export det_verify.harp_det_point_df <- function( .fcst, @@ -78,6 +62,7 @@ det_verify.harp_det_point_df <- function( thresholds = NULL, groupings = "lead_time", circle = NULL, + num_bins = 30, show_progress = TRUE, fcst_model = NULL, ... @@ -184,15 +169,25 @@ det_verify.harp_det_point_df <- function( fcst_df } - res <- list() - - res[["det_summary_scores"]] <- purrr::map( + det_summary_scores <- list() + det_summary_scores[["basic"]] <- purrr::map( groupings, compute_summary_scores, .fcst ) %>% purrr::list_rbind() %>% fill_group_na(groupings) %>% dplyr::mutate(fcst_model = fcst_model, .before = dplyr::everything()) + det_summary_scores[["hexbin"]] <- bin_fcst_obs( + .fcst, !!parameter, groupings, num_bins, show_progress + )[["det_summary_scores"]] + + res <- list() + + res[["det_summary_scores"]] <- Reduce( + function(x, y) suppressMessages(dplyr::inner_join(x, y)), + det_summary_scores + ) + if (is.numeric(thresholds)) { meta_cols <- grep( @@ -307,6 +302,7 @@ det_verify.harp_list <- function( thresholds = NULL, groupings = "lead_time", circle = NULL, + num_bins = 30, show_progress = TRUE ) { @@ -322,7 +318,7 @@ det_verify.harp_list <- function( purrr::imap( .fcst, ~det_verify( - .x, {{parameter}}, thresholds, groupings, circle, + .x, {{parameter}}, thresholds, groupings, circle, num_bins, show_progress, fcst_model = .y ) ) diff --git a/R/ens_verify.R b/R/ens_verify.R index 11fa55c..5a1b407 100644 --- a/R/ens_verify.R +++ b/R/ens_verify.R @@ -1,7 +1,7 @@ #' Compute all verification scores for an ensemble. #' -#' @param .fcst A \code{harp_fcst} object with tables that have a column for -#' observations, or a single forecast table. +#' @param .fcst A `harp_df` or `harp_list` object with tables that have a column +#' for observations, or a single forecast table. #' @param parameter The name of the column for the observed data. #' @param verify_members Whether to verify the individual members of the #' ensemble. Even if thresholds are supplied, only summary scores are @@ -35,12 +35,16 @@ #' elements eps_model and member to use a member of an eps model in the #' harp_fcst object for the climatology, or a data frame with columns for #' threshold and climatology and also optionally lead_time. +#' @param hexbin Logical. Whether to compute hexbins for forecast, observation +#' pairs. Defaults to `TRUE`. See \link{bin_fcst_obs} for more details. +#' @param num_bins The number of bins into which to partition observations for +#' the hexbin computation. #' @param rank_hist Logical. Whether to compute the rank histogram. Defaults to #' `TRUE`. Note that the computation of the rank histogram can be slow if #' there is a large number (> 1000) of groups. #' @param crps Logical. Whether to compute the CRPS. Defaults to `TRUE`. -#' @param brier Logical. Whether to compute the Brier score. Defaults to -#' `TRUE`. Will be ignored if no thresholds are set. +#' @param brier Logical. Whether to compute the Brier score. Defaults to `TRUE`. +#' Will be ignored if no thresholds are set. #' @param roc Logical. Whether to compute the Relative Operating Characteristic #' (ROC). Defaults to `TRUE`. Will be ignored if no thresholds are set. #' @param econ_val Logical. Whether to compute the economic value. Defaults to @@ -66,6 +70,8 @@ ens_verify <- function( spread_drop_member = NULL, jitter_fcst = NULL, climatology = "sample", + hexbin = TRUE, + num_bins = 30, rank_hist = TRUE, crps = TRUE, brier = TRUE, @@ -101,6 +107,8 @@ ens_verify.harp_ens_point_df <- function( spread_drop_member = NULL, jitter_fcst = NULL, climatology = "sample", + hexbin = TRUE, + num_bins = 30, rank_hist = TRUE, crps = TRUE, show_progress = TRUE, @@ -170,9 +178,20 @@ ens_verify.harp_ens_point_df <- function( spread_drop_member = spread_drop_member )[["ens_summary_scores"]] + if (hexbin) { + ens_summary_scores[["hexbin"]] <- dplyr::select( + bin_fcst_obs( + .fcst, !!parameter, groupings = groupings, num_bins = num_bins, + show_progress = show_progress + )[["ens_summary_scores"]], + -dplyr::all_of("num_cases") + ) + } + if (rank_hist) { ens_summary_scores[["rh"]] <- ens_rank_histogram( - .fcst, !! parameter, groupings = groupings + .fcst, !! parameter, groupings = groupings, + show_progress = show_progress )[["ens_summary_scores"]] } @@ -284,6 +303,8 @@ ens_verify.harp_list <- function( spread_drop_member = NULL, jitter_fcst = NULL, climatology = "sample", + hexbin = TRUE, + num_bins = 30, rank_hist = TRUE, crps = TRUE, brier = TRUE, @@ -312,7 +333,7 @@ ens_verify.harp_list <- function( function(x, y, z) ens_verify( x, !!parameter, verify_members, thresholds, groupings, circle, rel_probs, num_ref_members, z, jitter_fcst, climatology, - rank_hist, crps, brier, roc, econ_val, + hexbin, num_bins, rank_hist, crps, brier, roc, econ_val, show_progress, fcst_model = y ) ) diff --git a/R/harp_hexbin.R b/R/harp_hexbin.R new file mode 100644 index 0000000..b20b337 --- /dev/null +++ b/R/harp_hexbin.R @@ -0,0 +1,248 @@ +#' Create a bi-variate histogram of forecast, observation pairs +#' +#' Values of forecasts and observations are binned into bands whereby the +#' density of forecast, observation pairs for each bin is calculated. Under the +#' hood, the data are binned into hexagons using \code{\link[hexbin]{hexbin}}. +#' Hexagons are used since they have symmetry with nearest neighbours unlike +#' square bins, and at plot time they are the polygon with the maximum number of +#' sides that tessellate. +#' +#' @param .fcst A `harp_df` data frame or a `harp_list`. +#' @param parameter The column containing the parameter. Can be unquoted, a +#' quoted string, or an embraced variable name (i.e {{var}}). +#' @param groupings The groupings for which to compute the binned densities. +#' Must be a vector of strings, or a list of vectors of strings. +#' @param num_bins The number of bins into which to partition the observations. +#' @param show_progress Logical. Whether to show a progress bar. +#' @param ... Arguments for methods. +#' +#' @return +#' @export +#' +#' @examples +bin_fcst_obs <- function( + .fcst, + parameter, + groupings = "lead_time", + num_bins = 30, + show_progress = TRUE, + ... +) { + UseMethod("bin_fcst_obs") +} + +#' @rdname bin_fcst_obs +#' @param fcst_model The name of the forecast model. If `.fcst` does not +#' contain a `fcst_model`, a new column is created and populated with this +#' value. If a `fcst_model` column exists, the value in the column is replaced +#' with this value. +#' +#' @export +bin_fcst_obs.harp_det_point_df <- function( + .fcst, + parameter, + groupings = "lead_time", + num_bins = 30, + show_progress = TRUE, + fcst_model = NULL, + ... +) { + + if (!is.list(groupings)) { + groupings <- list(groupings) + } + + col_names <- colnames(.fcst) + parameter <- rlang::ensym(parameter) + chr_param <- rlang::as_name(parameter) + + if (length(grep(chr_param, col_names)) < 1) { + cli::cli_abort("No column found for {chr_param}") + } + + fcst_model <- parse_fcst_model(.fcst, fcst_model) + .fcst[["fcst_model"]] <- fcst_model + + res <- list() + res[["det_summary_scores"]] <- purrr::map( + groupings, compute_hexbin, .fcst, chr_param, num_bins, show_progress + ) %>% + purrr::list_rbind() %>% + fill_group_na(groupings) %>% + dplyr::mutate(fcst_model = fcst_model, .before = dplyr::everything()) + + structure( + add_attributes( + res[which(vapply(res, nrow, numeric(1)) > 0)], + harpCore::unique_fcst_dttm(.fcst), + !!parameter, + harpCore::unique_stations(.fcst), + groupings + ), + class = "harp_verif" + ) + +} + +#' @export +bin_fcst_obs.harp_ens_point_df_long <- function( + .fcst, + parameter, + groupings = "lead_time", + num_bins = 30, + show_progress = TRUE, + fcst_model = NULL, + ... +) { + + if (!is.list(groupings)) { + groupings <- list(groupings) + } + + # groupings <- lapply(groupings, function(x) c(x, "member")) + + col_names <- colnames(.fcst) + parameter <- rlang::ensym(parameter) + chr_param <- rlang::as_name(parameter) + + if (length(grep(chr_param, col_names)) < 1) { + cli::cli_abort("No column found for {chr_param}") + } + + fcst_model <- parse_fcst_model(.fcst, fcst_model) + .fcst[["fcst_model"]] <- fcst_model + + res <- list() + res[["ens_summary_scores"]] <- purrr::map( + groupings, compute_hexbin, .fcst, chr_param, num_bins, show_progress + ) %>% + purrr::list_rbind() %>% + fill_group_na(groupings) %>% + dplyr::mutate(fcst_model = fcst_model, .before = dplyr::everything()) + + structure( + add_attributes( + res[which(vapply(res, nrow, numeric(1)) > 0)], + harpCore::unique_fcst_dttm(.fcst), + !!parameter, + harpCore::unique_stations(.fcst), + groupings + ), + class = "harp_verif" + ) + +} + +#' @rdname bin_fcst_obs +#' @export +bin_fcst_obs.harp_ens_point_df <- function( + .fcst, + parameter, + groupings = "lead_time", + num_bins = 30, + show_progress = TRUE, + fcst_model = NULL, + ... +) { + + parameter <- rlang::ensym(parameter) + + bin_fcst_obs( + harpCore::pivot_members(.fcst), + !!parameter, + groupings, + num_bins, + show_progress, + fcst_model, + ... + ) + +} + +#' @export +bin_fcst_obs.harp_list <- function( + .fcst, + parameter, + groupings = "lead_time", + num_bins = 30, + show_progress = TRUE, + ... +) { + + parameter <- rlang::ensym(parameter) + + list_to_harp_verif( + purrr::imap( + .fcst, + ~bin_fcst_obs( + .x, !!parameter, groupings, num_bins, show_progress, fcst_model = .y + ) + ) + ) +} + + +compute_hexbin <- function( + compute_group, fcst_df, parameter, num_bins, show_progress +) { + + fcst_df <- group_without_threshold(fcst_df, compute_group, nest = TRUE) + group_vars <- grep( + "grouped_data", colnames(fcst_df), invert = TRUE, value = TRUE + ) + group_names <- glue::glue_collapse(group_vars, sep = ", ", last = " & ") + + score_text <- cli::col_blue(glue::glue("Hexbin for {group_names}")) + if (show_progress) { + pb_name <- score_text + } else { + pb_name <- FALSE + cat(score_text) + score_text <- "" + } + + res <- dplyr::transmute( + fcst_df, + dplyr::across(group_vars, ~.x), + num_cases = purrr::map_int(.data[["grouped_data"]], nrow), + num_stations = { + if (is.element("SID", group_vars)) { + 1L + } else { + purrr::map_int(.data[["grouped_data"]], ~length(unique(.x[["SID"]]))) + } + }, + hexbin = purrr::map( + .data[["grouped_data"]], + ~hexbin_df(.x[[parameter]], .x[["fcst"]], num_bins), + .progress = pb_name + ) + ) + + cat(score_text, cli::col_green(cli::symbol[["tick"]]), "\n") + + res +} + + +hexbin_df <- function(x, y, nbins) { + xrange <- range(x) + yrange <- range(y) + if (diff(xrange) == 0) { + xrange[1] <- xrange[1] - xrange[1] * 0.01 + xrange[2] <- xrange[2] + xrange[2] * 0.01 + } + if (diff(yrange) == 0) { + yrange[1] <- yrange[1] - yrange[1] * 0.01 + yrange[2] <- yrange[2] + yrange[2] * 0.01 + } + hexes <- hexbin::hexbin(x, y, xbins = nbins, xbnds = xrange, ybnds = yrange) + dplyr::rename( + dplyr::mutate( + tibble::as_tibble(hexbin::hcell2xy(hexes)), + count = hexes@count + ), + obs = .data[["x"]], + fcst = .data[["y"]] + ) +} diff --git a/man/bin_fcst_obs.Rd b/man/bin_fcst_obs.Rd new file mode 100644 index 0000000..cb6ac82 --- /dev/null +++ b/man/bin_fcst_obs.Rd @@ -0,0 +1,65 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/harp_hexbin.R +\name{bin_fcst_obs} +\alias{bin_fcst_obs} +\alias{bin_fcst_obs.harp_det_point_df} +\alias{bin_fcst_obs.harp_ens_point_df} +\title{Create a bi-variate histogram of forecast, observation pairs} +\usage{ +bin_fcst_obs( + .fcst, + parameter, + groupings = "lead_time", + num_bins = 30, + show_progress = TRUE, + ... +) + +\method{bin_fcst_obs}{harp_det_point_df}( + .fcst, + parameter, + groupings = "lead_time", + num_bins = 30, + show_progress = TRUE, + fcst_model = NULL, + ... +) + +\method{bin_fcst_obs}{harp_ens_point_df}( + .fcst, + parameter, + groupings = "lead_time", + num_bins = 30, + show_progress = TRUE, + fcst_model = NULL, + ... +) +} +\arguments{ +\item{.fcst}{A `harp_df` data frame or a `harp_list`.} + +\item{parameter}{The column containing the parameter. Can be unquoted, a +quoted string, or an embraced variable name (i.e {{var}}).} + +\item{groupings}{The groupings for which to compute the binned densities. +Must be a vector of strings, or a list of vectors of strings.} + +\item{num_bins}{The number of bins into which to partition the observations.} + +\item{show_progress}{Logical. Whether to show a progress bar.} + +\item{...}{Arguments for methods.} + +\item{fcst_model}{The name of the forecast model. If `.fcst` does not +contain a `fcst_model`, a new column is created and populated with this +value. If a `fcst_model` column exists, the value in the column is replaced +with this value.} +} +\description{ +Values of forecasts and observations are binned into bands whereby the +density of forecast, observation pairs for each bin is calculated. Under the +hood, the data are binned into hexagons using \code{\link[hexbin]{hexbin}}. +Hexagons are used since they have symmetry with nearest neighbours unlike +square bins, and at plot time they are the polygon with the maximum number of +sides that tessellate. +} diff --git a/man/det_verify.Rd b/man/det_verify.Rd index d36339d..14f0715 100644 --- a/man/det_verify.Rd +++ b/man/det_verify.Rd @@ -2,6 +2,7 @@ % Please edit documentation in R/det_verify.R \name{det_verify} \alias{det_verify} +\alias{det_verify.harp_det_point_df} \title{Compute verification scores for deterministic forecasts.} \usage{ det_verify( @@ -10,13 +11,26 @@ det_verify( thresholds = NULL, groupings = "lead_time", circle = NULL, + num_bins = 30, show_progress = TRUE, ... ) + +\method{det_verify}{harp_det_point_df}( + .fcst, + parameter, + thresholds = NULL, + groupings = "lead_time", + circle = NULL, + num_bins = 30, + show_progress = TRUE, + fcst_model = NULL, + ... +) } \arguments{ -\item{.fcst}{A \code{harp_fcst} object with tables that have a column for -observations, or a single forecast table.} +\item{.fcst}{A `harp_df` or `harp_list` object with tables that have a column +for observations, or a single forecast table.} \item{parameter}{The name of the column for the observed data.} @@ -32,15 +46,17 @@ calculations. Should be this distance around a circle in the units of the parameter, so would typically have a value of 360 for degrees or `2 * pi` for radians.} -\item{show_progress}{Logical - whether to show a progress bar. The default is -`TRUE`} +\item{num_bins}{The number of bins into which to partition observations for +the hexbin computation.} -\item{...}{Reserved for methods +\item{show_progress}{Logical - whether to show progress bars. Defaults to `TRUE`.} -} -\value{ -A list containting two data frames: \code{det_summary_scores} and - \code{det_threshold_scores}. + +\item{...}{Reserved for methods.} + +\item{fcst_model}{The name of the forecast model to use in the `fcst_model` +column of the output. If the function is dispatched on a `harp_list` +object, the names of the `harp_list` are automatically used.} } \description{ Compute verification scores for deterministic forecasts. diff --git a/man/ens_spread_and_skill.Rd b/man/ens_spread_and_skill.Rd index f679514..be1a856 100644 --- a/man/ens_spread_and_skill.Rd +++ b/man/ens_spread_and_skill.Rd @@ -8,7 +8,7 @@ ens_spread_and_skill( .fcst, parameter, groupings = "lead_time", - circle, + circle = NULL, spread_drop_member = NULL, jitter_fcst = NULL, show_progress = TRUE, diff --git a/man/ens_verify.Rd b/man/ens_verify.Rd index 62d8b16..347d2c1 100644 --- a/man/ens_verify.Rd +++ b/man/ens_verify.Rd @@ -16,6 +16,8 @@ ens_verify( spread_drop_member = NULL, jitter_fcst = NULL, climatology = "sample", + hexbin = TRUE, + num_bins = 30, rank_hist = TRUE, crps = TRUE, brier = TRUE, @@ -26,8 +28,8 @@ ens_verify( ) } \arguments{ -\item{.fcst}{A \code{harp_fcst} object with tables that have a column for -observations, or a single forecast table.} +\item{.fcst}{A `harp_df` or `harp_list` object with tables that have a column +for observations, or a single forecast table.} \item{parameter}{The name of the column for the observed data.} @@ -72,14 +74,20 @@ elements eps_model and member to use a member of an eps model in the harp_fcst object for the climatology, or a data frame with columns for threshold and climatology and also optionally lead_time.} +\item{hexbin}{Logical. Whether to compute hexbins for forecast, observation +pairs. Defaults to `TRUE`. See \link{bin_fcst_obs} for more details.} + +\item{num_bins}{The number of bins into which to partition observations for +the hexbin computation.} + \item{rank_hist}{Logical. Whether to compute the rank histogram. Defaults to `TRUE`. Note that the computation of the rank histogram can be slow if there is a large number (> 1000) of groups.} \item{crps}{Logical. Whether to compute the CRPS. Defaults to `TRUE`.} -\item{brier}{Logical. Whether to compute the Brier score. Defaults to -`TRUE`. Will be ignored if no thresholds are set.} +\item{brier}{Logical. Whether to compute the Brier score. Defaults to `TRUE`. +Will be ignored if no thresholds are set.} \item{roc}{Logical. Whether to compute the Relative Operating Characteristic (ROC). Defaults to `TRUE`. Will be ignored if no thresholds are set.}