diff --git a/DESCRIPTION b/DESCRIPTION index abb0cf86c..a746b53eb 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -98,6 +98,8 @@ Collate: 'correlation.R' 'epi_df.R' 'epi_df_forbidden_methods.R' + 'epi_slide_opt_archive.R' + 'epi_slide_opt_edf.R' 'epiprocess-package.R' 'group_by_epi_df_methods.R' 'methods-epi_archive.R' @@ -106,6 +108,7 @@ Collate: 'key_colnames.R' 'methods-epi_df.R' 'outliers.R' + 'patch.R' 'reexports.R' 'revision_analysis.R' 'slide.R' diff --git a/NAMESPACE b/NAMESPACE index 6f3ef6a11..c596f67a6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -27,6 +27,9 @@ S3method(dplyr_col_modify,col_modify_recorder_df) S3method(dplyr_col_modify,epi_df) S3method(dplyr_reconstruct,epi_df) S3method(dplyr_row_slice,epi_df) +S3method(epi_slide_opt,epi_archive) +S3method(epi_slide_opt,epi_df) +S3method(epi_slide_opt,grouped_epi_archive) S3method(epix_slide,epi_archive) S3method(epix_slide,grouped_epi_archive) S3method(epix_truncate_versions_after,epi_archive) @@ -102,6 +105,7 @@ export(time_column_names) export(ungroup) export(unnest) export(validate_epi_archive) +export(vec_approx_equal) export(version_column_names) import(epidatasets) importFrom(checkmate,anyInfinite) @@ -118,13 +122,19 @@ importFrom(checkmate,assert_logical) importFrom(checkmate,assert_number) importFrom(checkmate,assert_numeric) importFrom(checkmate,assert_scalar) +importFrom(checkmate,assert_set_equal) importFrom(checkmate,assert_string) importFrom(checkmate,assert_subset) importFrom(checkmate,assert_tibble) +importFrom(checkmate,assert_true) importFrom(checkmate,checkInt) importFrom(checkmate,check_atomic) +importFrom(checkmate,check_character) importFrom(checkmate,check_data_frame) +importFrom(checkmate,check_logical) importFrom(checkmate,check_names) +importFrom(checkmate,check_null) +importFrom(checkmate,check_numeric) importFrom(checkmate,expect_class) importFrom(checkmate,test_int) importFrom(checkmate,test_set_equal) @@ -144,6 +154,7 @@ importFrom(data.table,address) importFrom(data.table,as.data.table) importFrom(data.table,between) importFrom(data.table,copy) +importFrom(data.table,fifelse) importFrom(data.table,frollapply) importFrom(data.table,frollmean) importFrom(data.table,frollsum) @@ -152,6 +163,8 @@ importFrom(data.table,key) importFrom(data.table,rbindlist) importFrom(data.table,set) importFrom(data.table,setDF) +importFrom(data.table,setDT) +importFrom(data.table,setcolorder) importFrom(data.table,setkeyv) importFrom(dplyr,"%>%") importFrom(dplyr,across) @@ -174,8 +187,8 @@ importFrom(dplyr,if_all) importFrom(dplyr,if_any) importFrom(dplyr,if_else) importFrom(dplyr,is_grouped_df) -importFrom(dplyr,lag) importFrom(dplyr,mutate) +importFrom(dplyr,n_groups) importFrom(dplyr,pick) importFrom(dplyr,pull) importFrom(dplyr,relocate) @@ -201,6 +214,7 @@ importFrom(rlang,"%||%") importFrom(rlang,.data) importFrom(rlang,.env) importFrom(rlang,arg_match) +importFrom(rlang,arg_match0) importFrom(rlang,caller_arg) importFrom(rlang,caller_env) importFrom(rlang,check_dots_empty) @@ -213,6 +227,7 @@ importFrom(rlang,expr_label) importFrom(rlang,f_env) importFrom(rlang,f_rhs) importFrom(rlang,is_bare_integerish) +importFrom(rlang,is_bare_list) importFrom(rlang,is_bare_numeric) importFrom(rlang,is_environment) importFrom(rlang,is_formula) @@ -236,10 +251,12 @@ importFrom(slider,slide_sum) importFrom(stats,cor) importFrom(stats,median) importFrom(tibble,as_tibble) +importFrom(tibble,is_tibble) importFrom(tibble,new_tibble) importFrom(tibble,validate_tibble) importFrom(tidyr,complete) importFrom(tidyr,full_seq) +importFrom(tidyr,nest) importFrom(tidyr,unnest) importFrom(tidyselect,any_of) importFrom(tidyselect,eval_select) @@ -249,15 +266,27 @@ importFrom(tsibble,as_tsibble) importFrom(utils,capture.output) importFrom(utils,tail) importFrom(vctrs,"vec_slice<-") +importFrom(vctrs,obj_is_vector) importFrom(vctrs,vec_cast) +importFrom(vctrs,vec_cast_common) importFrom(vctrs,vec_data) importFrom(vctrs,vec_duplicate_any) +importFrom(vctrs,vec_duplicate_detect) +importFrom(vctrs,vec_duplicate_id) importFrom(vctrs,vec_equal) importFrom(vctrs,vec_in) +importFrom(vctrs,vec_match) importFrom(vctrs,vec_order) +importFrom(vctrs,vec_ptype) importFrom(vctrs,vec_rbind) +importFrom(vctrs,vec_recycle) importFrom(vctrs,vec_recycle_common) importFrom(vctrs,vec_rep) +importFrom(vctrs,vec_rep_each) +importFrom(vctrs,vec_seq_along) +importFrom(vctrs,vec_set_intersect) +importFrom(vctrs,vec_set_names) importFrom(vctrs,vec_size) +importFrom(vctrs,vec_size_common) importFrom(vctrs,vec_slice) importFrom(vctrs,vec_sort) diff --git a/NEWS.md b/NEWS.md index 39274ff4a..89435c9df 100644 --- a/NEWS.md +++ b/NEWS.md @@ -7,6 +7,17 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat ## New features - `is_epi_archive` function has been reintroduced. +## Breaking changes + +- The low-level `new_epi_archive()` function's `x` argument has been replaced + with a `data_table` argument, which now has extra requirements; see + `?new_epi_archive`. Users should still be using `as_epi_archive()` unless they + have a need for something lower-level. + +## New features + +- `epi_slide_{mean,sum,opt}` now work on `epi_archive`s, preparing version + histories for 7-day-averages of signals, etc. # epiprocess 0.11 diff --git a/R/archive.R b/R/archive.R index 922371f1c..543cb4516 100644 --- a/R/archive.R +++ b/R/archive.R @@ -186,17 +186,21 @@ next_after.Date <- function(x) x + 1L #' archive. Unexpected behavior may result from modifying the metadata #' directly. #' -#' @param x A data.frame, data.table, or tibble, with columns `geo_value`, -#' `time_value`, `version`, and then any additional number of columns. +#' @param data_table a `data.table` with [`data.table::key()`] equal to +#' `c("geo_value", other_keys, "time_value", "version")`. For `data.table` +#' users: this sets up an alias of `data_table`; if you plan to keep on +#' working with `data_table` or working directly with the archive's `$DT` +#' using mutating operations, you should `copy()` if appropriate. We will not +#' mutate the `DT` with any exported `{epiprocess}` functions, though. #' @param geo_type DEPRECATED Has no effect. Geo value type is inferred from the -#' location column and set to "custom" if not recognized. -#' @param time_type DEPRECATED Has no effect. Time value type inferred from the time -#' column and set to "custom" if not recognized. Unpredictable behavior may result -#' if the time type is not recognized. +#' location column and set to "custom" if not recognized. +#' @param time_type DEPRECATED Has no effect. Time value type inferred from the +#' time column and set to "custom" if not recognized. Unpredictable behavior +#' may result if the time type is not recognized. #' @param other_keys Character vector specifying the names of variables in `x` #' that should be considered key variables (in the language of `data.table`) -#' apart from "geo_value", "time_value", and "version". Typical examples -#' are "age" or more granular geographies. +#' apart from "geo_value", "time_value", and "version". Typical examples are +#' "age" or more granular geographies. #' @param compactify Optional; `TRUE`, `FALSE`, or `"message"`. `TRUE` will #' remove some redundant rows, `FALSE` will not. `"message"` is like `TRUE` #' but will emit a message if anything was changed. Default is `TRUE`. See @@ -278,41 +282,22 @@ next_after.Date <- function(x) x + 1L #' @order 3 #' @export new_epi_archive <- function( - x, + data_table, geo_type, time_type, other_keys, clobberable_versions_start, versions_end) { - assert_data_frame(x) + assert_class(data_table, "data.table") assert_string(geo_type) assert_string(time_type) assert_character(other_keys, any.missing = FALSE) if (any(c("geo_value", "time_value", "version") %in% other_keys)) { cli_abort("`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\".") } - validate_version_bound(clobberable_versions_start, x, na_ok = TRUE) - validate_version_bound(versions_end, x, na_ok = FALSE) - - key_vars <- c("geo_value", "time_value", other_keys, "version") - if (!all(key_vars %in% names(x))) { - # Give a more tailored error message than as.data.table would: - cli_abort(c( - "`x` is missing the following expected columns: - {format_varnames(setdiff(key_vars, names(x)))}.", - ">" = "You might need to `dplyr::rename()` beforehand - or use `as_epi_archive()`'s renaming feature.", - ">" = if (!all(other_keys %in% names(x))) { - "Check also for typos in `other_keys`." - } - )) - } - - # Create the data table; if x was an un-keyed data.table itself, - # then the call to as.data.table() will fail to set keys, so we - # need to check this, then do it manually if needed - data_table <- as.data.table(x, key = key_vars) - if (!identical(key_vars, key(data_table))) setkeyv(data_table, cols = key_vars) + assert_true(identical(key(data_table), c("geo_value", other_keys, "time_value", "version"))) + validate_version_bound(clobberable_versions_start, data_table, na_ok = TRUE) + validate_version_bound(versions_end, data_table, na_ok = FALSE) structure( list( @@ -338,7 +323,7 @@ new_epi_archive <- function( validate_epi_archive <- function(x) { assert_class(x, "epi_archive") - ukey_vars1 <- c("geo_value", "time_value", x$other_keys, "version") + ukey_vars1 <- c("geo_value", x$other_keys, "time_value", "version") ukey_vars2 <- key(x$DT) if (!identical(ukey_vars1, ukey_vars2)) { cli_abort(c("`data.table::key(x$DT)` not as expected", @@ -405,7 +390,7 @@ validate_epi_archive <- function(x) { #' would be `key(DT)`. #' @param abs_tol numeric, >=0; absolute tolerance to use on numeric measurement #' columns when determining whether something can be compactified away; see -#' [`is_locf`] +#' [`vec_approx_equal`] #' #' @importFrom data.table is.data.table key #' @importFrom dplyr arrange filter @@ -424,10 +409,23 @@ apply_compactify <- function(updates_df, ukey_names, abs_tol = 0) { } assert_numeric(abs_tol, len = 1, lower = 0) - if (!is.data.table(updates_df) || !identical(key(updates_df), ukey_names)) { + if (is.data.table(updates_df)) { + if (!identical(key(updates_df), ukey_names)) { + cli_abort(c("`ukey_names` should match `key(updates_df)`", + "i" = "`ukey_names` was {format_chr_deparse(ukey_names)}", + "i" = "`key(updates_df)` was {format_chr_deparse(key(updates_df))}" + )) + } + } else { updates_df <- updates_df %>% arrange(pick(all_of(ukey_names))) } - updates_df[!update_is_locf(updates_df, ukey_names, abs_tol), ] + + # In case updates_df is a data.table, store keep flags in a local: "When the + # first argument inside DT[...] is a single symbol (e.g. DT[var]), data.table + # looks for var in calling scope". In case it's not a data.table, make sure to + # use df[i,] not just df[i]. + to_keep <- !update_is_locf(updates_df, ukey_names, abs_tol) + updates_df[to_keep, ] } #' get the entries that `compactify` would remove @@ -440,7 +438,8 @@ removed_by_compactify <- function(updates_df, ukey_names, abs_tol) { updates_df[update_is_locf(updates_df, ukey_names, abs_tol), ] } -#' Internal helper; lgl; which updates are LOCF +#' Internal helper; lgl; which updates are LOCF and should thus be dropped when +#' compactifying #' #' (Not validated:) Must be called inside certain dplyr data masking verbs (e.g., #' `filter` or `mutate`) being run on an `epi_archive`'s `DT` or a data frame @@ -464,56 +463,45 @@ update_is_locf <- function(arranged_updates_df, ukey_names, abs_tol) { ekt_names <- ukey_names[ukey_names != "version"] val_names <- all_names[!all_names %in% ukey_names] - Reduce(`&`, lapply(updates_col_refs[ekt_names], is_locf, abs_tol, TRUE)) & - Reduce(`&`, lapply(updates_col_refs[val_names], is_locf, abs_tol, FALSE)) -} - -#' Checks to see if a value in a vector is LOCF -#' @description LOCF meaning last observation carried forward (to later -#' versions). Lags the vector by 1, then compares with itself. If `is_key` is -#' `TRUE`, only values that are exactly the same between the lagged and -#' original are considered LOCF. If `is_key` is `FALSE` and `vec` is a vector -#' of numbers ([`base::is.numeric`]), then approximate equality will be used, -#' checking whether the absolute difference between each pair of entries is -#' `<= abs_tol`; if `vec` is something else, then exact equality is used -#' instead. -#' -#' @details -#' -#' We include epikey-time columns in LOCF comparisons as part of an optimization -#' to avoid slower grouped operations while still ensuring that the first -#' observation for each time series will not be marked as LOCF. We test these -#' key columns for exact equality to prevent chopping off consecutive -#' time_values during flat periods when `abs_tol` is high. -#' -#' We use exact equality for non-`is.numeric` double/integer columns such as -#' dates, datetimes, difftimes, `tsibble::yearmonth`s, etc., as these may be -#' used as part of re-indexing or grouping procedures, and we don't want to -#' change the number of groups for those operations when we remove LOCF data -#' during compactification. -#' -#' @importFrom dplyr lag if_else -#' @importFrom rlang is_bare_numeric -#' @importFrom vctrs vec_equal -#' @keywords internal -is_locf <- function(vec, abs_tol, is_key) { # nolint: object_usage_linter - lag_vec <- lag(vec) - if (is.vector(vec, mode = "numeric") && !is_key) { - # (integer or double vector, no class (& no dims); maybe names, which we'll - # ignore like `vec_equal`); not a key column - unname(if_else( - !is.na(vec) & !is.na(lag_vec), - abs(vec - lag_vec) <= abs_tol, - is.na(vec) & is.na(lag_vec) - )) + n_updates <- nrow(arranged_updates_df) + if (n_updates == 0L) { + logical(0L) + } else if (n_updates == 1L) { + FALSE # sole observation is not LOCF } else { - vec_equal(vec, lag_vec, na_equal = TRUE) + ekts_tbl <- new_tibble(updates_col_refs[ekt_names]) + vals_tbl <- new_tibble(updates_col_refs[val_names]) + # grab the data and a shifted version of the data, and compute the + # entry-wise difference to see if the value has changed + # n_updates >= 2L so we can use `:` naturally (this is the reason for + # separating out n_updates == 1L from this case): + inds1 <- 2L:n_updates + inds2 <- 1L:(n_updates - 1L) + c( + FALSE, # first observation is not LOCF + # for the rest, check that both the keys are exactly the same, and the + # values are within abs_tol + # the key comparison effectively implements a group_by, so that when the + # key changes we're guaranteed the value is correct + vec_approx_equal0(ekts_tbl, + inds1 = inds1, ekts_tbl, inds2 = inds2, + # check ekt (key) cols with 0 tolerance: + na_equal = TRUE, abs_tol = 0 + ) & + vec_approx_equal0(vals_tbl, + inds1 = inds1, vals_tbl, inds2 = inds2, + na_equal = TRUE, abs_tol = abs_tol + ) + ) } } #' `as_epi_archive` converts a data frame, data table, or tibble into an #' `epi_archive` object. #' +#' @param x A data.frame, data.table, or tibble, with columns `geo_value`, +#' `time_value`, `version`, and then any additional number of columns, either +#' keys or values. #' @param ... used for specifying column names, as in [`dplyr::rename`]. For #' example `version = release_date` #' @param .versions_end location based versions_end, used to avoid prefix @@ -536,11 +524,32 @@ as_epi_archive <- function( .versions_end = max_version_with_row_in(x), ..., versions_end = .versions_end) { assert_data_frame(x) + # Convert first to data.frame to guard against data.table#6859 and potentially + # other things epiprocess#618: + x_already_copied <- identical(class(x), c("data.table", "data.frame")) + x <- as.data.frame(x) x <- rename(x, ...) - x <- guess_column_name(x, "time_value", time_column_names()) x <- guess_column_name(x, "geo_value", geo_column_names()) + if (!all(other_keys %in% names(x))) { + # Give a more tailored error message than as.data.table would: + cli_abort(c( + "`x` is missing the following expected columns: + {format_varnames(setdiff(other_keys, names(x)))}.", + ">" = "You might need to `dplyr::rename()` beforehand + or using `as_epi_archive()`'s renaming feature." + )) + } + x <- guess_column_name(x, "time_value", time_column_names()) x <- guess_column_name(x, "version", version_column_names()) + # Convert to data.table: + key_vars <- c("geo_value", other_keys, "time_value", "version") + if (x_already_copied) { + setDT(x, key = key_vars) + } else { + x <- as.data.table(x, key = key_vars) + } + if (lifecycle::is_present(geo_type)) { cli_warn("epi_archive constructor argument `geo_type` is now ignored. Consider removing.") } @@ -561,11 +570,10 @@ as_epi_archive <- function( cli_abort('`compactify` must be `TRUE`, `FALSE`, or `"message"`') } - data_table <- result$DT - key_vars <- key(data_table) + data_table <- result$DT # probably just `x`, but take no chances nrow_before_compactify <- nrow(data_table) - # Runs compactify on data frame + # Runs compactify on data_table if (identical(compactify, TRUE) || identical(compactify, "message")) { compactified <- apply_compactify(data_table, key_vars, compactify_abs_tol) } else { diff --git a/R/epi_slide_opt_archive.R b/R/epi_slide_opt_archive.R new file mode 100644 index 000000000..ab930af61 --- /dev/null +++ b/R/epi_slide_opt_archive.R @@ -0,0 +1,173 @@ +#' Core operation of `epi_slide_opt.epi_archive` for a single epikey's history +#' +#' @param inp_updates tibble with a `version` column and measurement columns for +#' a single epikey, without the epikey labeling columns (e.g., from +#' `group_modify`). Interpretation is analogous to an `epi_archive` `DT`, but +#' a specific row order is not required. +#' @inheritParams epi_slide_opt_edf_one_epikey +#' @return tibble with a `version` column, pre-existing measurement columns, and +#' new measurement columns; (compactified) diff data to put into an +#' `epi_archive`. May not match column ordering; may not ensure any row +#' ordering. +#' +#' @examples +#' +#' library(dplyr) +#' inp_updates <- bind_rows( +#' tibble(version = 30, time_value = 1:20, value = 1:20), +#' tibble(version = 32, time_value = 4:5, value = 5:4), +#' tibble(version = 33, time_value = 8, value = 9), +#' tibble(version = 34, time_value = 11, value = NA), +#' tibble(version = 35, time_value = -10, value = -10), +#' tibble(version = 56, time_value = 50, value = 50) +#' ) %>% +#' mutate(across(c(version, time_value), ~ as.Date("2020-01-01") - 1 + .x)) +#' +#' f <- purrr::partial(data.table::frollmean, algo = "exact") +#' +#' inp_updates %>% +#' epiprocess:::epi_slide_opt_archive_one_epikey( +#' "value", f, "data.table", 2L, 0L, "day", "slide_value" +#' ) +#' +#' @keywords internal +epi_slide_opt_archive_one_epikey <- function( + inp_updates, + in_colnames, + f_dots_baked, f_from_package, + before_n_steps, after_n_steps, time_type, + out_colnames) { + inp_updates_by_version <- inp_updates %>% + nest(.by = version, .key = "subtbl") %>% + arrange(version) + unit_step <- unit_time_delta(time_type, format = "fast") + prev_inp_snapshot <- NULL + prev_out_snapshot <- NULL + result <- lapply(seq_len(nrow(inp_updates_by_version)), function(version_i) { + version <- inp_updates_by_version$version[[version_i]] + inp_update <- inp_updates_by_version$subtbl[[version_i]] + inp_snapshot <- tbl_patch(prev_inp_snapshot, inp_update, "time_value") + inp_update_min_t <- min(inp_update$time_value) + inp_update_max_t <- max(inp_update$time_value) + # Time inp_update_max_t + before should have an output update, since it + # depends on inp_update_max_t + before - before = inp_update_max_t, which + # has an input update. Similarly, we could have updates beginning with + # inp_update_min_t - after, or anything in between these two bounds. If + # before == Inf, we need to update outputs all the way to the end of the + # input *snapshot*. + out_update_min_t <- inp_update_min_t - after_n_steps * unit_step + if (before_n_steps == Inf) { + out_update_max_t <- max(inp_snapshot$time_value) + } else { + out_update_max_t <- inp_update_max_t + before_n_steps * unit_step + } + out_update <- epi_slide_opt_edf_one_epikey(inp_snapshot, in_colnames, f_dots_baked, f_from_package, before_n_steps, after_n_steps, unit_step, time_type, out_colnames, list(out_update_min_t, out_update_max_t), NULL) + out_diff <- tbl_diff2(prev_out_snapshot, out_update, "time_value", "update") + prev_inp_snapshot <<- inp_snapshot + prev_out_snapshot <<- tbl_patch(prev_out_snapshot, out_diff, "time_value") + out_diff$version <- version + out_diff + }) + result <- list_rbind(result) + result +} + +#' @method epi_slide_opt grouped_epi_archive +#' @export +epi_slide_opt.grouped_epi_archive <- function(.x, ...) { + assert_set_equal( + group_vars(.x), + key_colnames(.x$private$ungrouped, exclude = c("time_value", "version")) + ) + orig_group_vars <- group_vars(.x) + orig_drop <- .x$private$drop + .x %>% + ungroup() %>% + epi_slide_opt(...) %>% + group_by(pick(all_of(orig_group_vars)), .drop = orig_drop) +} + +#' @method epi_slide_opt epi_archive +#' @export +epi_slide_opt.epi_archive <- + function(.x, .col_names, .f, ..., + .window_size = NULL, .align = c("right", "center", "left"), + .prefix = NULL, .suffix = NULL, .new_col_names = NULL, + .ref_time_values = NULL, .all_rows = FALSE, + .progress = FALSE) { + # Extract metadata: + time_type <- .x$time_type + epikey_names <- key_colnames(.x, exclude = c("time_value", "version")) + # Validation & pre-processing: + .align <- arg_match(.align) + .f_info <- upstream_slide_f_info(.f, ...) + .f_dots_baked <- + if (rlang::dots_n(...) == 0L) { + # Leaving `.f` unchanged slightly improves computation speed and trims + # debug stack traces: + .f + } else { + purrr::partial(.f, ... = , ...) # `... =` stands in for future args + } + col_names_quo <- enquo(.col_names) + if (is.null(.window_size)) { + cli_abort( + "epi_slide_opt: `.window_size` must be specified.", + class = "epiprocess__epi_slide_opt__window_size_missing" + ) + } + names_info <- across_ish_names_info( + .x$DT, time_type, col_names_quo, .f_info$namer, + .window_size, .align, .prefix, .suffix, .new_col_names + ) + window_args <- get_before_after_from_window(.window_size, .align, time_type) + before_n_steps <- time_delta_to_n_steps(window_args$before, time_type) + after_n_steps <- time_delta_to_n_steps(window_args$after, time_type) + if (!is.null(.ref_time_values)) { + cli_abort("epi_slide.epi_archive does not support the `.ref_time_values` argument", + class = "epiprocess__epi_slide_opt_archive__ref_time_values_unsupported" + ) + } + if (!identical(.all_rows, FALSE)) { + cli_abort("epi_slide.epi_archive does not support the `.all_rows` argument", + class = "epiprocess__epi_slide_opt_archive__all_rows_unsupported" + ) + } + assert( + checkmate::check_logical(.progress, any.missing = FALSE, len = 1L, names = "unnamed"), + checkmate::check_string(.progress) + ) + if (isTRUE(.progress)) { + .progress <- "Time series processed:" + } + use_progress <- !isFALSE(.progress) + # Perform the slide: + updates_grouped <- .x$DT %>% + as.data.frame() %>% + as_tibble(.name_repair = "minimal") %>% + # 0 rows input -> 0 rows output for any drop = FALSE groups with 0 rows, so + # we can just say drop = TRUE: + grouped_df(epikey_names, drop = TRUE) + if (use_progress) progress_bar_id <- cli::cli_progress_bar(.progress, total = n_groups(updates_grouped)) + result <- updates_grouped %>% + group_modify(function(group_values, group_key) { + res <- epi_slide_opt_archive_one_epikey( + group_values, + names_info$input_col_names, + .f_dots_baked, .f_info$from_package, before_n_steps, after_n_steps, time_type, + names_info$output_col_names + ) + if (use_progress) cli::cli_progress_update(id = progress_bar_id) + res + }) %>% + as.data.frame() %>% # data.table#6859 + as.data.table(key = key(.x$DT)) %>% + new_epi_archive( + .x$geo_type, .x$time_type, .x$other_keys, + .x$clobberable_versions_start, .x$versions_end + ) + if (use_progress) cli::cli_progress_done(id = progress_bar_id) + # Keep ordering of old columns, place new columns at end: + setcolorder(result$DT, names(.x$DT)) + result + } diff --git a/R/epi_slide_opt_edf.R b/R/epi_slide_opt_edf.R new file mode 100644 index 000000000..a97fef7a7 --- /dev/null +++ b/R/epi_slide_opt_edf.R @@ -0,0 +1,797 @@ +#' Information about upstream (`{data.table}`/`{slider}`) slide functions +#' +#' Underlies [`upstream_slide_f_info`]. +#' +#' @keywords internal +upstream_slide_f_possibilities <- tibble::tribble( + ~f, ~package, ~namer, + frollmean, "data.table", ~ if (is.logical(.x)) "prop" else "av", + frollsum, "data.table", ~ if (is.logical(.x)) "count" else "sum", + frollapply, "data.table", ~"slide", + slide_sum, "slider", ~ if (is.logical(.x)) "count" else "sum", + slide_prod, "slider", ~"prod", + slide_mean, "slider", ~ if (is.logical(.x)) "prop" else "av", + slide_min, "slider", ~"min", + slide_max, "slider", ~"max", + slide_all, "slider", ~"all", + slide_any, "slider", ~"any", +) + +#' Validate & get information about an upstream slide function +#' +#' @param .f function such as `data.table::frollmean` or `slider::slide_mean`; +#' must appear in [`upstream_slide_f_possibilities`] +#' @param ... additional configuration args to `.f` (besides the data and window +#' size&alignment); used to validate `.f` is used in a supported way +#' @return named list with two elements: `from_package`, a string containing the +#' upstream package name ("data.table" or "slider"), and `namer`, a function +#' that takes a column to call `.f` on and outputs a basic name or +#' abbreviation for what operation `.f` represents on that kind of column +#' (e.g., "sum", "av", "count"). +#' +#' @keywords internal +upstream_slide_f_info <- function(.f, ...) { + assert_function(.f) + + # Check that slide function `.f` is one of those short-listed from + # `data.table` and `slider` (or a function that has the exact same definition, + # e.g. if the function has been reexported or defined locally). Extract some + # metadata. `namer` will be mapped over columns (.x will be a column, not the + # entire edf). + f_info_row <- upstream_slide_f_possibilities %>% + filter(map_lgl(.data$f, ~ identical(.f, .x))) + if (nrow(f_info_row) == 0L) { + # `f` is from somewhere else and not supported + cli_abort( + c( + "problem with {rlang::expr_label(rlang::caller_arg(.f))}", + "i" = "`.f` must be one of `data.table`'s rolling functions (`frollmean`, + `frollsum`, `frollapply`. See `?data.table::roll`) or one of + `slider`'s specialized sliding functions (`slide_mean`, `slide_sum`, + etc. See `?slider::\`summary-slide\`` for more options)." + ), + class = "epiprocess__epi_slide_opt__unsupported_slide_function", + epiprocess__f = .f + ) + } + if (nrow(f_info_row) > 1L) { + cli_abort('epiprocess internal error: looking up `.f` in table of possible + functions yielded multiple matches. Please report it using "New + issue" at https://github.com/cmu-delphi/epiprocess/issues, using + reprex::reprex to provide a minimal reproducible example.') + } + f_from_package <- f_info_row$package + if (f_from_package == "data.table" && "fill" %in% names(rlang::call_match(dots_expand = FALSE)[["..."]])) { + # XXX this doesn't detect with `fill` is passed positionally through dots... + cli_abort("`epi_slide_opt` does not support `data.table::froll*()` with a + custom `fill =` arg", + class = "epiprocess__epi_slide_opt__fill_unsupported" + ) + } + list( + from_package = f_from_package, + namer = unwrap(f_info_row$namer) + ) +} + +#' Calculate input and output column names for an `{epiprocess}` +#' [`dplyr::across`]-like operations +#' +#' @param .x data.frame to perform input column tidyselection on +#' @param time_type as in [`new_epi_df`] +#' @param col_names_quo enquosed input column tidyselect expression +#' @param .f_namer function taking an input column object and outputting a name +#' for a corresponding output column; see [`upstream_slide_f_info`] +#' @param .window_size as in [`epi_slide_opt`] +#' @param .align as in [`epi_slide_opt`] +#' @param .prefix as in [`epi_slide_opt`] +#' @param .suffix as in [`epi_slide_opt`] +#' @param .new_col_names as in [`epi_slide_opt`] +#' @return named list with two elements: `input_col_names`, chr, subset of +#' `names(.x)`; and `output_colnames`, chr, same length as `input_col_names` +#' +#' @keywords internal +across_ish_names_info <- function(.x, time_type, col_names_quo, .f_namer, + .window_size, .align, .prefix, .suffix, .new_col_names) { + # The position of a given column can differ between input `.x` and + # `.data_group` since the grouping step by default drops grouping columns. + # To avoid rerunning `eval_select` for every `.data_group`, convert + # positions of user-provided `col_names` into string column names. We avoid + # using `names(pos)` directly for robustness and in case we later want to + # allow users to rename fields via tidyselection. + pos <- eval_select(col_names_quo, data = .x, allow_rename = FALSE) + input_col_names <- names(.x)[pos] + + # Handle output naming + if ((!is.null(.prefix) || !is.null(.suffix)) && !is.null(.new_col_names)) { + cli_abort( + "Can't use both .prefix/.suffix and .new_col_names at the same time.", + class = "epiprocess__epi_slide_opt_incompatible_naming_args" + ) + } + assert_string(.prefix, null.ok = TRUE) + assert_string(.suffix, null.ok = TRUE) + assert_character(.new_col_names, len = length(input_col_names), null.ok = TRUE) + if (is.null(.prefix) && is.null(.suffix) && is.null(.new_col_names)) { + .suffix <- "_{.n}{.time_unit_abbr}{.align_abbr}{.f_abbr}" + # ^ does not account for any arguments specified to underlying functions via + # `...` such as `na.rm =`, nor does it distinguish between functions from + # different packages accomplishing the same type of computation. Those are + # probably only set one way per task, so this probably produces cleaner + # names without clashes (though maybe some confusion if switching between + # code with different settings). + } + if (!is.null(.prefix) || !is.null(.suffix)) { + .prefix <- .prefix %||% "" + .suffix <- .suffix %||% "" + if (identical(.window_size, Inf)) { + n <- "running_" + time_unit_abbr <- "" + align_abbr <- "" + } else { + n <- time_delta_to_n_steps(.window_size, time_type) + time_unit_abbr <- time_type_unit_abbr(time_type) + align_abbr <- c(right = "", center = "c", left = "l")[[.align]] + } + glue_env <- rlang::env( + .n = n, + .time_unit_abbr = time_unit_abbr, + .align_abbr = align_abbr, + .f_abbr = purrr::map_chr(.x[, c(input_col_names)], .f_namer), # compat between DT and tbl selection + quo_get_env(col_names_quo) + ) + .new_col_names <- unclass( + glue(.prefix, .envir = glue_env) + + input_col_names + + glue(.suffix, .envir = glue_env) + ) + } else { + # `.new_col_names` was provided by user; we don't need to do anything. + } + if (any(.new_col_names %in% names(.x))) { + cli_abort(c( + "Naming conflict between new columns and existing columns", + "x" = "Overlapping names: {format_varnames(intersect(.new_col_names, names(.x)))}" + ), class = "epiprocess__epi_slide_opt_old_new_name_conflict") + } + if (anyDuplicated(.new_col_names)) { + cli_abort(c( + "New column names contain duplicates", + "x" = "Duplicated names: {format_varnames(unique(.new_col_names[duplicated(.new_col_names)]))}" + ), class = "epiprocess__epi_slide_opt_new_name_duplicated") + } + output_col_names <- .new_col_names + + list( + input_col_names = input_col_names, + output_col_names = output_col_names + ) +} + +#' Run a specialized slide computation on a single `epi_df` epikey, with temporary completion +#' +#' @param inp_tbl tibble; should have a `time_value` column and columns named in +#' `in_colnames`; should not contain any columns named in `out_colnames` +#' @param in_colnames chr; names of columns to which to apply `f_dots_baked` +#' @param f_dots_baked supported sliding function from `{data.table}` or +#' `{slider}`, potentially with some arguments baked in with +#' [`purrr::partial`] +#' @param f_from_package string; name of package from which `f_dots_baked` +#' (pre-`partial`) originates +#' @param before_n_steps integerish `>=0` or `Inf`; number of time steps before +#' each `ref_time_value` to include in the sliding window computation; `Inf` +#' to include all times beginning with the min `time_value` +#' @param after_n_steps integerish `>=0`; number of time steps after each +#' `ref_time_value` to include in the sliding window computation +#' @param time_type as in `new_epi_archive` +#' @param out_colnames chr, same length as `in_colnames`; column names to use +#' for results +#' @param out_filter_time_range,out_filter_time_set `time_value` filter; +#' `time_values` in the output should match the result of applying this filter +#' to `inp_tbl$time_value`. Exactly one of the two must be provided +#' (non-`NULL`) and the other must be `NULL`. `out_filter_time_range`, if +#' provided, should be a length-2 vector/list containing the minimum and +#' maximum `time_value` to allow in the output. `out_filter_time_set`, if +#' provided, should be a vector of `time_values` to intersect with the input +#' `time_value`s. +#' @return tibble; like `inp_tbl` with addition of `out_colnames` holding the +#' slide computation results, with times filtered down as specified +#' +#' @examples +#' +#' library(dplyr) +#' tbl <- tibble( +#' time_value = c(11:12, 15:18) + 0, +#' value = c(c(1, 2), c(4, 8, 16, 32)) +#' ) +#' +#' tbl %>% +#' epi_slide_opt_edf_one_epikey( +#' "value", +#' frollmean, "data.table", +#' 1L, 0L, 1L, "integer", +#' "slide_value", +#' c(11L, 16L), NULL +#' ) +#' +#' tbl %>% +#' epi_slide_opt_edf_one_epikey( +#' "value", +#' frollmean, "data.table", +#' 0L, 1L, 1L, "integer", +#' "slide_value", +#' NULL, c(11, 15, 16, 17, 18) +#' ) +#' +#' tbl %>% +#' epi_slide_opt_edf_one_epikey( +#' "value", +#' frollmean, "data.table", +#' Inf, 0L, 1L, "integer", +#' "slide_value", +#' NULL, c(12, 17) +#' ) +#' +#' @keywords internal +epi_slide_opt_edf_one_epikey <- function(inp_tbl, + in_colnames, + f_dots_baked, f_from_package, + before_n_steps, after_n_steps, unit_step, time_type, + out_colnames, + out_filter_time_range, out_filter_time_set) { + # TODO try converting time values to reals, do work on reals, convert back at very end? + # + # TODO loosen restrictions here. each filter optional? + if (!is.null(out_filter_time_range) && is.null(out_filter_time_set)) { + out_filter_time_style <- "range" + out_t_min <- out_filter_time_range[[1L]] + out_t_max <- out_filter_time_range[[2L]] + } else if (is.null(out_filter_time_range) && !is.null(out_filter_time_set)) { + # FIXME min time_value for this epikey vs. entire edf; match existing behavior, or complete changeover + out_filter_time_style <- "set" + out_time_values <- vec_set_intersect(inp_tbl$time_value, out_filter_time_set) + if (vec_size(out_time_values) == 0L) { + out_t_min <- inp_tbl$time_value[[1L]] + out_t_max <- inp_tbl$time_value[[1L]] - 1L * unit_step + } else { + out_t_min <- min(out_time_values) + out_t_max <- max(out_time_values) + } + } else { + cli_abort("Exactly one of `out_filter_time_range` and `out_filter_time_set` must be non-`NULL`.") + } + if (before_n_steps == Inf) { + slide_t_min <- min(inp_tbl$time_value) + slide_start_padding_n <- time_minus_time_in_n_steps(out_t_min, slide_t_min, time_type) + } else { + slide_t_min <- out_t_min - before_n_steps * unit_step + slide_start_padding_n <- before_n_steps # perf: avoid time_minus_time_in_n_steps + } + slide_t_max <- out_t_max + after_n_steps * unit_step + slide_nrow <- time_delta_to_n_steps(slide_t_max - slide_t_min, time_type) + 1L + slide_time_values <- slide_t_min + 0L:(slide_nrow - 1L) * unit_step + slide_inp_backrefs <- vec_match(slide_time_values, inp_tbl$time_value) + # Get values needed from inp_tbl + perform any NA tail-padding needed to make + # slider results a fixed window size rather than adaptive at tails, and + # perform any NA gap-filling needed: + slide <- vec_slice(inp_tbl, slide_inp_backrefs) + # TODO refactor to use a join if not using backrefs later anymore? or perf: + # try removing time_value column before slice? + slide$time_value <- slide_time_values + if (f_from_package == "data.table") { + if (before_n_steps == Inf) { + slide[, out_colnames] <- + f_dots_baked(slide[, in_colnames], seq_len(slide_nrow), adaptive = TRUE) + } else { + out_cols <- f_dots_baked(slide[, in_colnames], before_n_steps + after_n_steps + 1L) + if (after_n_steps != 0L) { + # Shift an appropriate amount of NA padding from the start to the end. + # (This padding will later be cut off when we filter down to the + # original time_values.) + out_cols <- lapply(out_cols, function(out_col) { + c(out_col[(after_n_steps + 1L):length(out_col)], rep(NA, after_n_steps)) + }) + } + slide[, out_colnames] <- out_cols + } + } else if (f_from_package == "slider") { + for (col_i in seq_along(in_colnames)) { + slide[[out_colnames[[col_i]]]] <- f_dots_baked(slide[[in_colnames[[col_i]]]], before = before_n_steps, after = after_n_steps) + } + } else { + cli_abort( + "epiprocess internal error: `f_from_package` was {format_chr_deparse(f_from_package)}, which is unsupported", + class = "epiprocess__epi_slide_opt_archive__f_from_package_invalid" + ) + } + # We should filter down the slide time values to ones in the input time values + # when preparing the output: + rows_should_keep1 <- !is.na(slide_inp_backrefs) + # We also need to apply the out_filter. + # + # TODO comments + test vs. just using inequality + rows_should_keep2 <- switch(out_filter_time_style, + range = vec_rep_each( + c(FALSE, TRUE, FALSE), + c(slide_start_padding_n, slide_nrow - slide_start_padding_n - after_n_steps, after_n_steps), + ), + set = vec_in(slide_time_values, out_time_values) + ) + rows_should_keep <- rows_should_keep1 & rows_should_keep2 + out_tbl <- vec_slice(slide, rows_should_keep) + out_tbl +} + +#' Optimized slide functions for common cases +#' +#' @description +#' +#' `epi_slide_opt` calculates n-time-step rolling means&sums, +#' cumulative/"running" means&sums, and other operations supported by +#' [`data.table::froll`] or [`slider::summary-slide`] functions. +#' +#' * On `epi_df`s, it will take care of looping over `geo_value`s, temporarily +#' filling in time gaps with `NA`s and other work needed to ensure there are +#' exactly `n` consecutive time steps per computation, and has some other +#' convenience features. See `vignette("epi_df")` for more examples. +#' +#' * On `epi_archive`s, it will calculate the version history for these slide +#' computations and combine it with the version history for the rest of the +#' columns. +#' +#' This function tends to be much faster than using `epi_slide()` and +#' `epix_slide()` directly. +#' +#' @template basic-slide-params +#' @param .col_names <[`tidy-select`][dplyr_tidy_select]> An unquoted column +#' name (e.g., `cases`), multiple column names (e.g., `c(cases, deaths)`), +#' [other tidy-select expression][tidyselect::language], or a vector of +#' characters (e.g. `c("cases", "deaths")`). Variable names can be used as if +#' they were positions in the data frame, so expressions like `x:y` can be +#' used to select a range of variables. +#' +#' The tidy-selection renaming interface is not supported, and cannot be used +#' to provide output column names; if you want to customize the output column +#' names, use [`dplyr::rename`] after the slide. +#' @param .f Function; together with `...` specifies the computation to slide. +#' `.f` must be one of `data.table`'s rolling functions +#' (`frollmean`, `frollsum`, `frollapply`. See [data.table::roll]) or one +#' of `slider`'s specialized sliding functions (`slide_mean`, `slide_sum`, +#' etc. See [slider::summary-slide]). +#' +#' The optimized `data.table` and `slider` functions can't be directly passed +#' as the computation function in `epi_slide` without careful handling to make +#' sure each computation group is made up of the `.window_size` dates rather +#' than `.window_size` points. `epi_slide_opt` (and wrapper functions +#' `epi_slide_mean` and `epi_slide_sum`) take care of window completion +#' automatically to prevent associated errors. +#' @param ... Additional arguments to pass to the slide computation `.f`, for +#' example, `algo` or `na.rm` in data.table functions. You don't need to +#' specify `.x`, `.window_size`, or `.align` (or `before`/`after` for slider +#' functions). +#' @param .prefix Optional [`glue::glue`] format string; name the slide result +#' column(s) by attaching this prefix to the corresponding input column(s). +#' Some shorthand is supported for basing the output names on `.window_size` +#' or other arguments; see "Prefix and suffix shorthand" below. +#' @param .suffix Optional [`glue::glue`] format string; like `.prefix`. The +#' default naming behavior is equivalent to `.suffix = +#' "_{.n}{.time_unit_abbr}{.align_abbr}{.f_abbr}"`. Can be used in combination +#' with `.prefix`. +#' @param .new_col_names Optional character vector with length matching the +#' number of input columns from `.col_names`; name the slide result column(s) +#' with these names. Cannot be used in combination with `.prefix` and/or +#' `.suffix`. +#' +#' @section Prefix and suffix shorthand: +#' +#' [`glue::glue`] format strings specially interpret content within curly +#' braces. E.g., `glue::glue("ABC{2 + 2}")` evaluates to `"ABC4"`. For `.prefix` +#' and `.suffix`, we provide `glue` with some additional variable bindings: +#' +#' - `{.n}` will be the number of time steps in the computation +#' corresponding to the `.window_size`. +#' - `{.time_unit_abbr}` will be a lower-case letter corresponding to the +#' `time_type` of `.x` +#' - `{.align_abbr}` will be `""` if `.align` is the default of `"right"`; +#' otherwise, it will be the first letter of `.align` +#' - `{.f_abbr}` will be a character vector containing a short abbreviation +#' for `.f` factoring in the input column type(s) for `.col_names` +#' +#' @importFrom dplyr mutate %>% arrange tibble select all_of +#' @importFrom rlang enquo expr_label caller_arg quo_get_env +#' @importFrom tidyselect eval_select +#' @importFrom glue glue +#' @importFrom purrr map map_lgl +#' @importFrom data.table frollmean frollsum frollapply +#' @importFrom lubridate as.period +#' @importFrom slider slide_sum slide_prod slide_mean slide_min slide_max slide_all slide_any +#' @export +#' @seealso [`epi_slide`] for the more general slide function +#' @examples +#' library(dplyr) +#' +#' # Add a column (`cases_7dsum`) containing a 7-day trailing sum on `cases`: +#' cases_deaths_subset %>% +#' select(geo_value, time_value, cases) %>% +#' epi_slide_sum(cases, .window_size = 7) +#' +#' # Add a column (`cases_rate_7dav`) containing a 7-day trailing average on `case_rate`: +#' covid_case_death_rates_extended %>% +#' epi_slide_mean(case_rate, .window_size = 7) +#' +#' # Use a less common specialized slide function: +#' cases_deaths_subset %>% +#' epi_slide_opt(cases, slider::slide_min, .window_size = 7) +#' +#' # Specify output column names and/or a naming scheme: +#' cases_deaths_subset %>% +#' select(geo_value, time_value, cases) %>% +#' group_by(geo_value) %>% +#' epi_slide_sum(cases, .window_size = 7, .new_col_names = "case_sum") %>% +#' ungroup() +#' cases_deaths_subset %>% +#' select(geo_value, time_value, cases) %>% +#' group_by(geo_value) %>% +#' epi_slide_sum(cases, .window_size = 7, .prefix = "sum_") %>% +#' ungroup() +#' +#' # Additional settings can be sent to the {data.table} and {slider} functions +#' # via `...`. This example passes some arguments to `frollmean` settings for +#' # speed, accuracy, and to allow partially-missing windows: +#' covid_case_death_rates_extended %>% +#' epi_slide_mean( +#' case_rate, +#' .window_size = 7, +#' na.rm = TRUE, algo = "exact", hasNA = TRUE +#' ) +#' +#' # If the more specialized possibilities for `.f` don't cover your needs, you +#' # can use `epi_slide_opt` with `.f = data.table::frollapply` to apply a +#' # custom function at the cost of more computation time. See also `epi_slide` +#' # if you need something even more general. +#' cases_deaths_subset %>% +#' select(geo_value, time_value, case_rate_7d_av, death_rate_7d_av) %>% +#' epi_slide_opt(c(case_rate_7d_av, death_rate_7d_av), +#' data.table::frollapply, +#' FUN = median, .window_size = 28, +#' .suffix = "_{.n}{.time_unit_abbr}_median" +#' ) %>% +#' print(n = 40) +#' +#' # You can calculate entire version histories for the derived signals by +#' # calling `epi_slide_opt()` on an `epi_archive`: +#' case_death_rate_archive %>% +#' epi_slide_mean(case_rate, .window_size = 14) +#' +#' @export +epi_slide_opt <- function( + .x, .col_names, .f, ..., + .window_size = NULL, .align = c("right", "center", "left"), + .prefix = NULL, .suffix = NULL, .new_col_names = NULL, + .ref_time_values = NULL, .all_rows = FALSE) { + UseMethod("epi_slide_opt") +} + +#' @export +epi_slide_opt.epi_df <- function(.x, .col_names, .f, ..., + .window_size = NULL, .align = c("right", "center", "left"), + .prefix = NULL, .suffix = NULL, .new_col_names = NULL, + .ref_time_values = NULL, .all_rows = FALSE) { + # Deprecated argument handling + provided_args <- rlang::call_args_names(rlang::call_match()) + if (any(purrr::map_lgl(provided_args, ~ .x %in% c("x", "col_names", "f", "ref_time_values", "all_rows")))) { + cli::cli_abort( + "epi_slide_opt: you are using one of the following old argument names: `x`, `col_names`, `f`, `ref_time_values`, + or `all_rows`. Please use the new dot-prefixed names: `.x`, `.col_names`, `.f`, + `.ref_time_values`, `.all_rows`." + ) + } + if ("as_list_col" %in% provided_args) { + cli::cli_abort( + "epi_slide_opt: the argument `as_list_col` is deprecated. If FALSE, you can just remove it. + If TRUE, have your given computation wrap its result using `list(result)` instead." + ) + } + if ("before" %in% provided_args || "after" %in% provided_args) { + cli::cli_abort( + "epi_slide_opt: `before` and `after` are deprecated for `epi_slide`. Use `.window_size` and `.align` instead. + See the slide documentation for more details." + ) + } + if ("new_col_name" %in% provided_args || ".new_col_name" %in% provided_args) { + cli::cli_abort( + "epi_slide_opt: the argument `new_col_name` is not supported for `epi_slide_opt`. If you want to customize + the output column names, use `.prefix =`, `.suffix =`, or `.new_col_**names** =`.", + class = "epiprocess__epi_slide_opt__new_name_not_supported" + ) + } + if ("names_sep" %in% provided_args || ".names_sep" %in% provided_args) { + cli::cli_abort( + "epi_slide_opt: the argument `names_sep` is not supported for `epi_slide_opt`. If you want to customize + the output column names, use `dplyr::rename` after the slide.", + class = "epiprocess__epi_slide_opt__name_sep_not_supported" + ) + } + + assert_class(.x, "epi_df") + .x_orig_groups <- groups(.x) + if (inherits(.x, "grouped_df")) { + expected_group_keys <- .x %>% + key_colnames(exclude = "time_value") %>% + sort() + if (!identical(.x %>% group_vars() %>% sort(), expected_group_keys)) { + cli_abort( + "`.x` must be either grouped by {expected_group_keys} or ungrouped; if the latter, + we'll temporarily group by {expected_group_keys} for this operation. You may need + to aggregate your data first; see sum_groups_epi_df().", + class = "epiprocess__epi_slide_opt__invalid_grouping" + ) + } + } else { + .x <- group_epi_df(.x, exclude = "time_value") + } + if (nrow(.x) == 0L) { + cli_abort( + c( + "input data `.x` unexpectedly has 0 rows", + "i" = "If this computation is occuring within an `epix_slide` call, + check that `epix_slide` `.versions` argument was set appropriately + so that you don't get any completely-empty snapshots" + ), + class = "epiprocess__epi_slide_opt__0_row_input", + epiprocess__x = .x + ) + } + + # Check for duplicated time values within groups + assert(check_ukey_unique(ungroup(.x), c(group_vars(.x), "time_value"))) + + # Validate/process .col_names, .f: + col_names_quo <- enquo(.col_names) + f_info <- upstream_slide_f_info(.f, ...) + f_from_package <- f_info$from_package + + # Validate/process .ref_time_values: + user_provided_rtvs <- !is.null(.ref_time_values) + if (!user_provided_rtvs) { + .ref_time_values <- unique(.x$time_value) + } else { + assert_numeric(.ref_time_values, min.len = 1L, null.ok = FALSE, any.missing = FALSE) + if (!test_subset(.ref_time_values, unique(.x$time_value))) { + cli_abort( + "`ref_time_values` must be a unique subset of the time values in `x`.", + class = "epiprocess__epi_slide_opt_invalid_ref_time_values" + ) + } + if (anyDuplicated(.ref_time_values) != 0L) { + cli_abort( + "`ref_time_values` must not contain any duplicates; use `unique` if appropriate.", + class = "epiprocess__epi_slide_opt_invalid_ref_time_values" + ) + } + } + ref_time_values <- sort(.ref_time_values) + + # Handle window arguments + .align <- rlang::arg_match(.align) + time_type <- attr(.x, "metadata")$time_type + if (is.null(.window_size)) { + cli_abort( + "epi_slide_opt: `.window_size` must be specified.", + class = "epiprocess__epi_slide_opt__window_size_missing" + ) + } + validate_slide_window_arg(.window_size, time_type) + window_args <- get_before_after_from_window(.window_size, .align, time_type) + before_n_steps <- time_delta_to_n_steps(window_args$before, time_type) + after_n_steps <- time_delta_to_n_steps(window_args$after, time_type) + unit_step <- unit_time_delta(time_type, format = "fast") + + # Handle output naming: + names_info <- across_ish_names_info( + .x, time_type, col_names_quo, f_info$namer, + .window_size, .align, .prefix, .suffix, .new_col_names + ) + input_col_names <- names_info$input_col_names + output_col_names <- names_info$output_col_names + + f_dots_baked <- + if (rlang::dots_n(...) == 0L) { + # Leaving `.f` unchanged slightly improves computation speed and trims + # debug stack traces: + .f + } else { + purrr::partial(.f, ... = , ...) # `... =` stands in for future args + } + + result <- .x %>% + group_modify(function(grp_data, grp_key) { + epi_slide_opt_edf_one_epikey(grp_data, names_info$input_col_names, f_dots_baked, f_from_package, before_n_steps, after_n_steps, unit_step, time_type, names_info$output_col_names, NULL, ref_time_values) + }) %>% + arrange_col_canonical() + + if (.all_rows) { + ekt_names <- key_colnames(.x) + result <- left_join(ungroup(.x), result[c(ekt_names, output_col_names)], by = ekt_names) + } + + result <- group_by(result, !!!.x_orig_groups) + + return(result) +} + +#' @rdname epi_slide_opt +#' @description `epi_slide_mean` is a wrapper around `epi_slide_opt` with `.f = +#' data.table::frollmean`. +#' +#' @export +epi_slide_mean <- function( + .x, .col_names, ..., + .window_size = NULL, .align = c("right", "center", "left"), + .prefix = NULL, .suffix = NULL, .new_col_names = NULL, + .ref_time_values = NULL, .all_rows = FALSE) { + # Deprecated argument handling + provided_args <- rlang::call_args_names(rlang::call_match()) + if (any(purrr::map_lgl(provided_args, ~ .x %in% c("x", "col_names", "f", "ref_time_values", "all_rows")))) { + cli::cli_abort( + "epi_slide_mean: you are using one of the following old argument names: `x`, `col_names`, `f`, `ref_time_values`, + or `all_rows`. Please use the new dot-prefixed names: `.x`, `.col_names`, `.f`, + `.ref_time_values`, `.all_rows`." + ) + } + if ("as_list_col" %in% provided_args) { + cli::cli_abort( + "epi_slide_mean: the argument `as_list_col` is deprecated. If FALSE, you can just remove it. + If TRUE, have your given computation wrap its result using `list(result)` instead." + ) + } + if ("before" %in% provided_args || "after" %in% provided_args) { + cli::cli_abort( + "epi_slide_mean: `before` and `after` are deprecated for `epi_slide`. Use `.window_size` and `.align` instead. + See the slide documentation for more details." + ) + } + if ("new_col_name" %in% provided_args || ".new_col_name" %in% provided_args) { + cli::cli_abort( + "epi_slide_opt: the argument `new_col_name` is not supported for `epi_slide_opt`. If you want to customize + the output column names, use `.prefix =`, `.suffix =`, or `.new_col_**names** =`.", + class = "epiprocess__epi_slide_opt__new_name_not_supported" + ) + } + if ("names_sep" %in% provided_args || ".names_sep" %in% provided_args) { + cli::cli_abort( + "epi_slide_mean: the argument `names_sep` is not supported. If you want to customize + the output column names, use `dplyr::rename` after the slide." + ) + } + + epi_slide_opt( + .x = .x, + .col_names = {{ .col_names }}, + .f = data.table::frollmean, + ..., + .window_size = .window_size, + .align = .align, + .prefix = .prefix, + .suffix = .suffix, + .new_col_names = .new_col_names, + .ref_time_values = .ref_time_values, + .all_rows = .all_rows + ) +} + +#' @rdname epi_slide_opt +#' @description `epi_slide_sum` is a wrapper around `epi_slide_opt` with `.f = +#' data.table::frollsum`. +#' +#' @export +epi_slide_sum <- function( + .x, .col_names, ..., + .window_size = NULL, .align = c("right", "center", "left"), + .prefix = NULL, .suffix = NULL, .new_col_names = NULL, + .ref_time_values = NULL, .all_rows = FALSE) { + # Deprecated argument handling + provided_args <- rlang::call_args_names(rlang::call_match()) + if (any(purrr::map_lgl(provided_args, ~ .x %in% c("x", "col_names", "f", "ref_time_values", "all_rows")))) { + cli::cli_abort( + "epi_slide_sum: you are using one of the following old argument names: `x`, `col_names`, `f`, `ref_time_values`, + or `all_rows`. Please use the new dot-prefixed names: `.x`, `.col_names`, `.f`, + `.ref_time_values`, `.all_rows`." + ) + } + if ("as_list_col" %in% provided_args) { + cli::cli_abort( + "epi_slide_sum: the argument `as_list_col` is deprecated. If FALSE, you can just remove it. + If TRUE, have your given computation wrap its result using `list(result)` instead." + ) + } + if ("before" %in% provided_args || "after" %in% provided_args) { + cli::cli_abort( + "epi_slide_sum: `before` and `after` are deprecated for `epi_slide`. Use `.window_size` and `.align` instead. + See the slide documentation for more details." + ) + } + if ("new_col_name" %in% provided_args || ".new_col_name" %in% provided_args) { + cli::cli_abort( + "epi_slide_opt: the argument `new_col_name` is not supported for `epi_slide_opt`. If you want to customize + the output column names, use `.prefix =`, `.suffix =`, or `.new_col_**names** =`.", + class = "epiprocess__epi_slide_opt__new_name_not_supported" + ) + } + if ("names_sep" %in% provided_args || ".names_sep" %in% provided_args) { + cli::cli_abort( + "epi_slide_sum: the argument `names_sep` is not supported. If you want to customize + the output column names, use `dplyr::rename` after the slide." + ) + } + epi_slide_opt( + .x = .x, + .col_names = {{ .col_names }}, + .f = data.table::frollsum, + ..., + .window_size = .window_size, + .align = .align, + .prefix = .prefix, + .suffix = .suffix, + .new_col_names = .new_col_names, + .ref_time_values = .ref_time_values, + .all_rows = .all_rows + ) +} + +#' Make a complete date sequence between min(x$time_value) and max +#' (x$time_value). Produce lists of dates before min(x$time_value) and after +#' max(x$time_value) for padding initial and final windows to size `n`. +#' +#' `before` and `after` args are assumed to have been validated by the calling +#' function (using `validate_slide_window_arg`). +#' +#' @keywords internal +full_date_seq <- function(x, before, after, time_type) { + if (!time_type %in% c("day", "week", "yearmonth", "integer")) { + cli_abort( + "time_type must be one of 'day', 'week', or 'integer'." + ) + } + + pad_early_dates <- c() + pad_late_dates <- c() + + # `tsibble` time types have their own behavior, where adding 1 corresponds to + # incrementing by a quantum (smallest resolvable unit) of the date class. For + # example, one step = 1 quarter for `yearquarter`. + if (time_type %in% c("yearmonth", "integer")) { + all_dates <- seq(min(x$time_value), max(x$time_value), by = 1L) + + if (before != 0 && before != Inf) { + pad_early_dates <- all_dates[1L] - before:1 + } + if (after != 0) { + pad_late_dates <- all_dates[length(all_dates)] + 1:after + } + } else { + by <- switch(time_type, + day = "days", + week = "weeks", + ) + + all_dates <- seq(min(x$time_value), max(x$time_value), by = by) + if (before != 0 && before != Inf) { + # The behavior is analogous to the branch with tsibble types above. For + # more detail, note that the function `seq.Date(from, ..., length.out = + # n)` returns `from + 0:n`. Since we want `from + 1:n`, we drop the first + # element. Adding "-1" to the `by` arg makes `seq.Date` go backwards in + # time. + pad_early_dates <- sort(seq(all_dates[1L], by = paste("-1", by), length.out = before + 1)[-1]) + } + if (after != 0) { + pad_late_dates <- seq(all_dates[length(all_dates)], by = by, length.out = after + 1)[-1] + } + } + + list( + all_dates = all_dates, + pad_early_dates = pad_early_dates, + pad_late_dates = pad_late_dates + ) +} diff --git a/R/epiprocess-package.R b/R/epiprocess-package.R index 61eccf993..c5a975a5f 100644 --- a/R/epiprocess-package.R +++ b/R/epiprocess-package.R @@ -6,36 +6,64 @@ #' @importFrom checkmate anyInfinite anyMissing assert assert_character #' @importFrom checkmate assert_class assert_data_frame assert_int assert_list #' @importFrom checkmate assert_false +#' @importFrom checkmate assert_function #' @importFrom checkmate assert_logical assert_numeric assert_scalar checkInt +#' @importFrom checkmate assert_set_equal #' @importFrom checkmate assert_string #' @importFrom checkmate assert_subset #' @importFrom checkmate assert_tibble +#' @importFrom checkmate assert_true #' @importFrom checkmate check_atomic check_data_frame expect_class test_int +#' @importFrom checkmate check_character +#' @importFrom checkmate check_logical #' @importFrom checkmate check_names +#' @importFrom checkmate check_null +#' @importFrom checkmate check_numeric #' @importFrom checkmate test_subset test_set_equal vname #' @importFrom cli cli_abort cli_warn #' @importFrom cli pluralize #' @importFrom cli qty #' @importFrom data.table as.data.table +#' @importFrom data.table fifelse #' @importFrom data.table key +#' @importFrom data.table setcolorder +#' @importFrom data.table setDT #' @importFrom data.table setkeyv #' @importFrom dplyr arrange #' @importFrom dplyr grouped_df #' @importFrom dplyr is_grouped_df +#' @importFrom dplyr n_groups #' @importFrom dplyr select #' @importFrom lifecycle deprecated #' @importFrom purrr list_rbind #' @importFrom rlang %||% +#' @importFrom rlang arg_match0 #' @importFrom rlang is_bare_integerish +#' @importFrom rlang is_bare_list +#' @importFrom rlang is_bare_numeric +#' @importFrom tibble is_tibble +#' @importFrom tidyr nest #' @importFrom tools toTitleCase +#' @importFrom vctrs obj_is_vector #' @importFrom vctrs vec_cast +#' @importFrom vctrs vec_cast_common #' @importFrom vctrs vec_data +#' @importFrom vctrs vec_duplicate_detect +#' @importFrom vctrs vec_duplicate_id #' @importFrom vctrs vec_equal #' @importFrom vctrs vec_in +#' @importFrom vctrs vec_match #' @importFrom vctrs vec_order +#' @importFrom vctrs vec_ptype #' @importFrom vctrs vec_rbind +#' @importFrom vctrs vec_recycle #' @importFrom vctrs vec_recycle_common #' @importFrom vctrs vec_rep +#' @importFrom vctrs vec_rep_each +#' @importFrom vctrs vec_seq_along +#' @importFrom vctrs vec_set_intersect +#' @importFrom vctrs vec_set_names +#' @importFrom vctrs vec_size_common #' @importFrom vctrs vec_slice #' @importFrom vctrs vec_slice<- #' @importFrom vctrs vec_sort diff --git a/R/patch.R b/R/patch.R new file mode 100644 index 000000000..44077d2bb --- /dev/null +++ b/R/patch.R @@ -0,0 +1,373 @@ +#' Test two vctrs vectors for equality with some tolerance in some cases +#' +#' Generalizes [`vctrs::vec_equal`]. +#' +#' @param vec1,vec2 vctrs vectors (includes data frames). Take care when using +#' on named vectors or "keyed" data frames; [`vec_names()`] are largely +#' ignored, and key columns are treated as normal value columns (when they +#' should probably generate an error if they are not lined up correctly, or be +#' tested for exact rather than approximate equality). +#' @param na_equal should `NA`s be considered equal to each other? (In +#' epiprocess, we usually want this to be `TRUE`, but that doesn't match the +#' [`vctrs::vec_equal()`] default, so this is mandatory.) +#' @param .ptype as in [`vctrs::vec_equal()`]. +#' @param ... should be empty (it's here to force later arguments to be passed +#' by name) +#' @param abs_tol absolute tolerance; will be used for bare numeric `vec1`, +#' `vec2`, or any such columns within `vec1`, `vec2` if they are data frames. +#' @param inds1,inds2 optional (row) indices into vec1 and vec2 compatible with +#' [`vctrs::vec_slice()`]; output should be consistent with `vec_slice`-ing to +#' these indices beforehand, but can give faster computation if `vec1` and +#' `vec2` are data frames. Currently, any speedup is only by making sure that +#' `vec_slice` is used rather than `[` for data frames. +#' +#' @return logical vector, with length matching the result of recycling `vec1` +#' (at `inds1` if provided) and `vec2` (at `inds2` if provided); entries +#' should all be `TRUE` or `FALSE` if `na_equal = TRUE`. +#' +#' @examples +#' +#' library(dplyr) +#' +#' # On numeric vectors: +#' vec_approx_equal( +#' c(1, 2, 3, NA), +#' c(1, 2 + 1e-10, NA, NA), +#' na_equal = TRUE, +#' abs_tol = 1e-8 +#' ) +#' +#' # On tibbles: +#' tbl1 <- tibble( +#' a = 1:5, +#' b = list(1:5, 1:4, 1:3, 1:2, 1:1) %>% lapply(as.numeric), +#' c = tibble( +#' c1 = 1:5 +#' ), +#' d = matrix(1:10, 5, 2) +#' ) +#' tbl2 <- tbl1 +#' tbl2$a[[2]] <- tbl1$a[[2]] + 1e-10 +#' tbl2$b[[3]][[1]] <- tbl1$b[[3]][[1]] + 1e-10 +#' tbl2$c$c1[[4]] <- tbl1$c$c1[[4]] + 1e-10 +#' tbl2$d[[5, 2]] <- tbl1$d[[5, 2]] + 1e-10 +#' vctrs::vec_equal(tbl1, tbl2, na_equal = TRUE) +#' vec_approx_equal(tbl1, tbl2, na_equal = TRUE, abs_tol = 1e-12) +#' vec_approx_equal(tbl1, tbl2, na_equal = TRUE, abs_tol = 1e-8) +#' +#' +#' +#' +#' +#' # Type comparison within lists is stricter, matching vctrs: +#' vctrs::vec_equal(list(1:2), list(as.numeric(1:2))) +#' vec_approx_equal(list(1:2), list(as.numeric(1:2)), FALSE, abs_tol = 0) +#' +#' @export +vec_approx_equal <- function(vec1, vec2, na_equal, .ptype = NULL, ..., abs_tol, inds1 = NULL, inds2 = NULL) { + if (!obj_is_vector(vec1)) cli_abort("`vec1` must be recognized by vctrs as a vector") + if (!obj_is_vector(vec2)) cli_abort("`vec2` must be recognized by vctrs as a vector") + # Leave vec size checking to vctrs recycling ops. + assert_logical(na_equal, any.missing = FALSE, len = 1L) + # Leave .ptype checks to cast operation. + check_dots_empty() + assert_numeric(abs_tol, lower = 0, len = 1L) + assert( + check_null(inds1), + check_numeric(inds1), + check_logical(inds1), + check_character(inds1) + ) + assert( + check_null(inds2), + check_numeric(inds2), + check_logical(inds2), + check_character(inds2) + ) + # Leave heavier index validation to the vctrs recycling & indexing ops. + + # Recycle inds if provided; vecs if not: + common_size <- vec_size_common( + if (is.null(inds1)) vec1 else inds1, + if (is.null(inds2)) vec2 else inds2 + ) + if (is.null(inds1)) { + vec1 <- vec_recycle(vec1, common_size) + } else { + inds1 <- vec_recycle(inds1, common_size) + } + if (is.null(inds2)) { + vec2 <- vec_recycle(vec2, common_size) + } else { + inds2 <- vec_recycle(inds2, common_size) + } + if (!identical(vec_ptype(vec1), vec_ptype(vec2)) || !is.null(.ptype)) { + # perf: this is slow, so try to avoid it if it's not needed + vecs <- vec_cast_common(vec1, vec2, .to = .ptype) + } else { + vecs <- list(vec1, vec2) + } + vec_approx_equal0(vecs[[1]], vecs[[2]], na_equal, abs_tol, inds1, inds2) +} + +#' Helper for [`vec_approx_equal`] for vecs guaranteed to have the same ptype and size +#' +#' @keywords internal +vec_approx_equal0 <- function(vec1, vec2, na_equal, abs_tol, inds1 = NULL, inds2 = NULL) { + if (is_bare_numeric(vec1) && abs_tol != 0) { + # Matching vec_equal, we ignore names and other attributes. + if (!is.null(inds1)) vec1 <- vec_slice(vec1, inds1) + if (!is.null(inds2)) vec2 <- vec_slice(vec2, inds2) + if (na_equal) { + na_or_nan1 <- is.na(vec1) + na_or_nan2 <- is.na(vec2) + # Since above are bare logical vectors, we can use `fifelse` + res <- fifelse( + !na_or_nan1 & !na_or_nan2, + abs(vec1 - vec2) <= abs_tol, + na_or_nan1 & na_or_nan2 & (is.nan(vec1) == is.nan(vec2)) + ) + } else { + # Like `==` and `vec_equal`, we consider NaN == {NA, NaN, anything else} + # to be NA. That logic is actually baked into the basic formula: + res <- abs(vec1 - vec2) <= abs_tol + } + + if (!is.null(dim(vec1))) { + dim(res) <- dim(vec1) + res <- rowSums(res) == ncol(res) + } + # `fifelse` inherits any unrecognized attributes; drop them instead: + attributes(res) <- NULL + return(res) + } else if (is.data.frame(vec1) && abs_tol != 0) { + # (we only need to manually recurse if we potentially have columns that would + # be specially processed by the above) + if (ncol(vec1) == 0) { + rep(TRUE, nrow(vec1)) + } else { + Reduce(`&`, lapply(seq_len(ncol(vec1)), function(col_i) { + vec_approx_equal0(vec1[[col_i]], vec2[[col_i]], na_equal, abs_tol, inds1, inds2) + })) + } + } else if (is_bare_list(vec1) && abs_tol != 0) { + vapply(seq_along(vec1), function(i) { + entry1 <- vec1[[i]] + entry2 <- vec2[[i]] + vec_size(entry1) == vec_size(entry2) && + # Trying to follow vec_equal: strict on ptypes aside from vec_namedness: + identical( + vec_set_names(vec_ptype(entry1), NULL), + vec_set_names(vec_ptype(entry2), NULL) + ) && + all(vec_approx_equal0(entry1, entry2, na_equal, abs_tol)) + }, logical(1L)) + } else { + # No special handling for any other types/situations. We may want to allow + # S3 extension of this method or of a new appropriate vec_proxy_* variant. + # See Issue #640. + if (!is.null(inds1)) { + vec1 <- vec_slice(vec1, inds1) + } + if (!is.null(inds2)) { + vec2 <- vec_slice(vec2, inds2) + } + # perf: vec1 and vec2 have already been cast to a common ptype; we can't + # disable casts, but can say to cast (again...) to that ptype + res <- vec_equal(vec1, vec2, na_equal = na_equal, vec_ptype(vec1)) + return(res) + } +} + +#' Variation on [`dplyr::anti_join`] for speed + tolerance setting +#' +#' @param x tibble; `x[ukey_names]` must not have any duplicate rows +#' @param y tibble; `y[ukey_names]` must not have any duplicate rows +#' @param ukey_names chr; names of columns that form a unique key, for `x` and +#' for `y` +#' @param val_names chr; names of columns which should be treated as +#' value/measurement columns, and compared with a tolerance +#' @param abs_tol scalar non-negative numeric; absolute tolerance with which to +#' compare value columns; see [`vec_approx_equal`] +#' @return rows from `x` that either (a) don't have a (0-tolerance) matching +#' ukey in `y`, or (b) have a matching ukey in `y`, but don't have +#' approximately equal value column values +#' +#' @keywords internal +tbl_fast_anti_join <- function(x, y, ukey_names, val_names, abs_tol = 0) { + x_keyvals <- x[c(ukey_names, val_names)] + y_keyvals <- y[c(ukey_names, val_names)] + xy_keyvals <- vec_rbind(x, y) + if (abs_tol == 0) { + # perf: 0 tolerance is just like a normal `anti_join` by both ukey_names and + # val_names together. We can do that more quickly than `anti_join` with + # `vctrs` by checking for keyvals of `x` that are not duplicated in `y`. + # (`vec_duplicate_detect` will mark those, unlike `duplicated`.) + x_exclude <- vec_duplicate_detect(xy_keyvals) + x_exclude <- vec_slice(x_exclude, seq_len(nrow(x))) + } else { + xy_ukeys <- xy_keyvals[ukey_names] + # Locate ukeys in `y` that match ukeys in `x`, and where in `x` they map + # back to. It's faster to do this with `vec_duplicate_id` on `xy_ukeys` than + # to perform an `inner_join`. + xy_ukey_dup_ids <- vec_duplicate_id(xy_ukeys) + xy_ukey_dup_inds2 <- which(xy_ukey_dup_ids != seq_along(xy_ukey_dup_ids)) + # ^ these should point to rows from y that had a ukey match in x + xy_ukey_dup_inds1 <- vec_slice(xy_ukey_dup_ids, xy_ukey_dup_inds2) + # ^ these should point to the respectively corresponding rows from x + + # Anything in `x` without a ukey match in `y` should be kept; start off with + # `FALSE` for everything and just fill in `TRUE`/`FALSE` results for the + # ukeys with matches in `y`: + x_exclude <- rep(FALSE, nrow(x)) + xy_vals <- xy[val_names] + x_exclude[xy_ukey_dup_inds1] <- vec_approx_equal( + xy_vals, + inds1 = xy_ukey_dup_inds2, + xy_vals, + inds2 = xy_ukey_dup_inds1, + na_equal = TRUE, abs_tol = abs_tol + ) + } + vec_slice(x, !x_exclude) +} + +#' Calculate compact patch to move from one snapshot/update to another +#' +#' @param earlier_snapshot tibble or `NULL`; `NULL` represents that there was no +#' data before `later_tbl`. +#' @param later_tbl tibble; must have the same column names as +#' `earlier_snapshot` if it is a tibble. +#' @param ukey_names character; column names that together, form a unique key +#' for `earlier_snapshot` and for `later_tbl`. This is unchecked; see +#' [`check_ukey_unique`] if you don't already have this guaranteed. +#' @param later_format "snapshot" or "update"; default is "snapshot". If +#' "snapshot", `later_tbl` will be interpreted as a full snapshot of the data +#' set including all ukeys, and any ukeys that are in `earlier_snapshot` but +#' not in `later_tbl` are interpreted as deletions, which are currently +#' (imprecisely) represented in the output patch as revisions of all +#' non-`ukey_names` columns to NA values (using `{vctrs}`). If "update", then +#' it's assumed that any deletions have already been represented this way in +#' `later_tbl` and any ukeys not in `later_tbl` are simply unchanged; we are +#' just ensuring that the update is fully compact for the given +#' `compactify_abs_tol`. +#' @param compactify_abs_tol compactification tolerance; see `apply_compactify` +#' @return a tibble in compact "update" (diff) format +#' +#' @keywords internal +tbl_diff2 <- function(earlier_snapshot, later_tbl, + ukey_names, + later_format = c("snapshot", "update"), + compactify_abs_tol = 0) { + # Most input validation + handle NULL earlier_snapshot. This is a small function so + # use faster validation variants: + if (!is_tibble(later_tbl)) { + cli_abort( + "`later_tbl` must be a tibble", + class = "epiprocess__tbl_diff2__later_tbl_invalid" + ) + } + if (is.null(earlier_snapshot)) { + return(later_tbl) + } + if (!is_tibble(earlier_snapshot)) { + cli_abort( + "`earlier_snapshot` must be a tibble or `NULL`", + class = "epiprocess__tbl_diff2__earlier_tbl_class_invalid" + ) + } + if (!is.character(ukey_names) || !all(ukey_names %in% names(earlier_snapshot))) { + cli_abort( + "`ukey_names` must be a subset of column names", + class = "epiprocess__tbl_diff2__ukey_names_class_invalid" + ) + } + later_format <- arg_match0(later_format, c("snapshot", "update")) + if (!(is.vector(compactify_abs_tol, mode = "numeric") && + length(compactify_abs_tol) == 1L && # nolint: indentation_linter + compactify_abs_tol >= 0)) { + # Give a specific message: + assert_numeric(compactify_abs_tol, lower = 0, any.missing = FALSE, len = 1L) + # Fallback e.g. for invalid classes not caught by assert_numeric: + cli_abort( + "`compactify_abs_tol` must be a length-1 double/integer >= 0", + class = "epiprocess__tbl_diff2__compactify_abs_tol_invalid" + ) + } + + all_names <- names(later_tbl) + val_names <- all_names[!(all_names %in% ukey_names)] + updates <- tbl_fast_anti_join(later_tbl, earlier_snapshot, ukey_names, val_names, compactify_abs_tol) + if (later_format == "snapshot") { + # Interpret `later_tbl` as a full snapshot, rather than a diff / sparse + # update. That means that any ukeys in `earlier_snapshot` that don't appear + # in `later_tbl` were deleted in the later snapshot. + deletions <- tbl_fast_anti_join(earlier_snapshot[ukey_names], later_tbl[ukey_names], ukey_names, character(), 0) + updates <- vec_rbind(updates, deletions) # fills val cols with NAs + } + # If `later_format == "update"`, we don't need to do anything special about + # the above ukeys. The full snapshot for the later version would include the + # corresponding rows unchanged, and the diff for these unchanged rows would be + # empty. + updates +} + +#' Apply an update (e.g., from `tbl_diff2`) to a snapshot +#' +#' @param snapshot tibble or `NULL`; entire data set as of some version, or +#' `NULL` to treat `update` as the initial version of the data set. +#' @param update tibble; ukeys + initial values for added rows, ukeys + new +#' values for changed rows. Deletions must be imprecisely represented as +#' changing all values to NAs. +#' @param ukey_names character; names of columns that should form a unique key +#' for `snapshot` and for `update`. Uniqueness is unchecked; if you don't have +#' this guaranteed, see [`check_ukey_unique()`]. +#' @return tibble; snapshot of the data set with the update applied. +#' +#' @keywords internal +tbl_patch <- function(snapshot, update, ukey_names) { + # Most input validation. This is a small function so use faster validation + # variants: + if (!is_tibble(update)) { + cli_abort( + "`update` must be a tibble", + class = "epiprocess__tbl_patch__update_class_invalid" + ) + } + if (is.null(snapshot)) { + return(update) + } + if (!is_tibble(snapshot)) { + cli_abort( + "`snapshot` must be a tibble", + class = "epiprocess__tbl_patch__snapshot_class_invalid" + ) + } + if (!is.character(ukey_names) || !all(ukey_names %in% names(snapshot))) { + cli_abort( + "`ukey_names` must be a subset of column names", + class = "epiprocess__tbl_patch__ukey_names_invalid" + ) + } + if (!identical(names(snapshot), names(update))) { + cli_abort(c( + "`snapshot` and `update` should have identical column + names and ordering.", + "*" = "`snapshot` colnames: {format_chr_deparse(tbl_names)}", + "*" = "`update` colnames: {format_chr_deparse(names(update))}" + ), class = "epiprocess__tbl_patch__tbl_names_invalid") + } + + result_tbl <- vec_rbind(update, snapshot) + + dup_ids <- vec_duplicate_id(result_tbl[ukey_names]) + # Find the "first" appearance of each ukey; since `update` is ordered before `snapshot`, + # this means favoring the rows from `update` over those in `snapshot`. + # This is like `!duplicated()` but faster, and like `vec_unique_loc()` but guaranteeing + # that we get the first appearance since `vec_duplicate_id()` guarantees that + # it points to the first appearance. + is_only_or_favored_appearance <- dup_ids == vec_seq_along(result_tbl) + result_tbl <- vec_slice(result_tbl, is_only_or_favored_appearance) + + result_tbl +} diff --git a/R/slide.R b/R/slide.R index abc7c3b77..e4ea80792 100644 --- a/R/slide.R +++ b/R/slide.R @@ -410,7 +410,7 @@ epi_slide_one_group <- function( # Returned values must be data.frame or vector. if ("other" %in% return_types) { cli_abort( - "epi_slide: slide computations must always return either data frames without rownames + "epi_slide: slide computations must always return either data frames or unnamed vectors (as determined by the vctrs package).", class = "epiprocess__invalid_slide_comp_value" ) @@ -548,613 +548,3 @@ get_before_after_from_window <- function(window_size, align, time_type) { } list(before = before, after = after) } - -#' Optimized slide functions for common cases -#' -#' @description `epi_slide_opt` allows sliding an n-timestep [data.table::froll] -#' or [slider::summary-slide] function over variables in an `epi_df` object. -#' These functions tend to be much faster than `epi_slide()`. See -#' `vignette("epi_df")` for more examples. -#' -#' @template basic-slide-params -#' @param .col_names <[`tidy-select`][dplyr_tidy_select]> An unquoted column -#' name (e.g., `cases`), multiple column names (e.g., `c(cases, deaths)`), -#' [other tidy-select expression][tidyselect::language], or a vector of -#' characters (e.g. `c("cases", "deaths")`). Variable names can be used as if -#' they were positions in the data frame, so expressions like `x:y` can be -#' used to select a range of variables. -#' -#' The tidy-selection renaming interface is not supported, and cannot be used -#' to provide output column names; if you want to customize the output column -#' names, use [`dplyr::rename`] after the slide. -#' @param .f Function; together with `...` specifies the computation to slide. -#' `.f` must be one of `data.table`'s rolling functions -#' (`frollmean`, `frollsum`, `frollapply`. See [data.table::roll]) or one -#' of `slider`'s specialized sliding functions (`slide_mean`, `slide_sum`, -#' etc. See [slider::summary-slide]). -#' -#' The optimized `data.table` and `slider` functions can't be directly passed -#' as the computation function in `epi_slide` without careful handling to make -#' sure each computation group is made up of the `.window_size` dates rather -#' than `.window_size` points. `epi_slide_opt` (and wrapper functions -#' `epi_slide_mean` and `epi_slide_sum`) take care of window completion -#' automatically to prevent associated errors. -#' @param ... Additional arguments to pass to the slide computation `.f`, for -#' example, `algo` or `na.rm` in data.table functions. You don't need to -#' specify `.x`, `.window_size`, or `.align` (or `before`/`after` for slider -#' functions). -#' @param .prefix Optional [`glue::glue`] format string; name the slide result -#' column(s) by attaching this prefix to the corresponding input column(s). -#' Some shorthand is supported for basing the output names on `.window_size` -#' or other arguments; see "Prefix and suffix shorthand" below. -#' @param .suffix Optional [`glue::glue`] format string; like `.prefix`. The -#' default naming behavior is equivalent to `.suffix = -#' "_{.n}{.time_unit_abbr}{.align_abbr}{.f_abbr}"`. Can be used in combination -#' with `.prefix`. -#' @param .new_col_names Optional character vector with length matching the -#' number of input columns from `.col_names`; name the slide result column(s) -#' with these names. Cannot be used in combination with `.prefix` and/or -#' `.suffix`. -#' -#' @section Prefix and suffix shorthand: -#' -#' [`glue::glue`] format strings specially interpret content within curly -#' braces. E.g., `glue::glue("ABC{2 + 2}")` evaluates to `"ABC4"`. For `.prefix` -#' and `.suffix`, we provide `glue` with some additional variable bindings: -#' -#' - `{.n}` will be the number of time steps in the computation -#' corresponding to the `.window_size`. -#' - `{.time_unit_abbr}` will be a lower-case letter corresponding to the -#' `time_type` of `.x` -#' - `{.align_abbr}` will be `""` if `.align` is the default of `"right"`; -#' otherwise, it will be the first letter of `.align` -#' - `{.f_abbr}` will be a character vector containing a short abbreviation -#' for `.f` factoring in the input column type(s) for `.col_names` -#' -#' @importFrom dplyr mutate %>% arrange tibble select all_of -#' @importFrom rlang enquo expr_label caller_arg quo_get_env -#' @importFrom tidyselect eval_select -#' @importFrom glue glue -#' @importFrom purrr map map_lgl -#' @importFrom data.table frollmean frollsum frollapply -#' @importFrom lubridate as.period -#' @importFrom checkmate assert_function -#' @importFrom slider slide_sum slide_prod slide_mean slide_min slide_max slide_all slide_any -#' @export -#' @seealso [`epi_slide`] for the more general slide function -#' @examples -#' library(dplyr) -#' -#' # Add a column (`cases_7dsum`) containing a 7-day trailing sum on `cases`: -#' cases_deaths_subset %>% -#' select(geo_value, time_value, cases) %>% -#' epi_slide_sum(cases, .window_size = 7) -#' -#' # Add a column (`cases_rate_7dav`) containing a 7-day trailing average on `case_rate`: -#' covid_case_death_rates_extended %>% -#' epi_slide_mean(case_rate, .window_size = 7) -#' -#' # Use a less common specialized slide function: -#' cases_deaths_subset %>% -#' epi_slide_opt(cases, slider::slide_min, .window_size = 7) -#' -#' # Specify output column names and/or a naming scheme: -#' cases_deaths_subset %>% -#' select(geo_value, time_value, cases) %>% -#' group_by(geo_value) %>% -#' epi_slide_sum(cases, .window_size = 7, .new_col_names = "case_sum") %>% -#' ungroup() -#' cases_deaths_subset %>% -#' select(geo_value, time_value, cases) %>% -#' group_by(geo_value) %>% -#' epi_slide_sum(cases, .window_size = 7, .prefix = "sum_") %>% -#' ungroup() -#' -#' # Additional settings can be sent to the {data.table} and {slider} functions -#' # via `...`. This example passes some arguments to `frollmean` settings for -#' # speed, accuracy, and to allow partially-missing windows: -#' covid_case_death_rates_extended %>% -#' epi_slide_mean( -#' case_rate, -#' .window_size = 7, -#' na.rm = TRUE, algo = "exact", hasNA = TRUE -#' ) -#' -#' # If the more specialized possibilities for `.f` don't cover your needs, you -#' # can use `epi_slide_opt` with `.f = data.table::frollapply` to apply a -#' # custom function at the cost of more computation time. See also `epi_slide` -#' # if you need something even more general. -#' cases_deaths_subset %>% -#' select(geo_value, time_value, case_rate_7d_av, death_rate_7d_av) %>% -#' epi_slide_opt(c(case_rate_7d_av, death_rate_7d_av), -#' data.table::frollapply, -#' FUN = median, .window_size = 28, -#' .suffix = "_{.n}{.time_unit_abbr}_median" -#' ) %>% -#' print(n = 40) -epi_slide_opt <- function( - .x, .col_names, .f, ..., - .window_size = NULL, .align = c("right", "center", "left"), - .prefix = NULL, .suffix = NULL, .new_col_names = NULL, - .ref_time_values = NULL, .all_rows = FALSE) { - assert_class(.x, "epi_df") - - # Deprecated argument handling - provided_args <- rlang::call_args_names(rlang::call_match()) - if (any(purrr::map_lgl(provided_args, ~ .x %in% c("x", "col_names", "f", "ref_time_values", "all_rows")))) { - cli::cli_abort( - "epi_slide_opt: you are using one of the following old argument names: `x`, `col_names`, `f`, `ref_time_values`, - or `all_rows`. Please use the new dot-prefixed names: `.x`, `.col_names`, `.f`, - `.ref_time_values`, `.all_rows`." - ) - } - if ("as_list_col" %in% provided_args) { - cli::cli_abort( - "epi_slide_opt: the argument `as_list_col` is deprecated. If FALSE, you can just remove it. - If TRUE, have your given computation wrap its result using `list(result)` instead." - ) - } - if ("before" %in% provided_args || "after" %in% provided_args) { - cli::cli_abort( - "epi_slide_opt: `before` and `after` are deprecated for `epi_slide`. Use `.window_size` and `.align` instead. - See the slide documentation for more details." - ) - } - if ("new_col_name" %in% provided_args || ".new_col_name" %in% provided_args) { - cli::cli_abort( - "epi_slide_opt: the argument `new_col_name` is not supported for `epi_slide_opt`. If you want to customize - the output column names, use `.prefix =`, `.suffix =`, or `.new_col_**names** =`.", - class = "epiprocess__epi_slide_opt__new_name_not_supported" - ) - } - if ("names_sep" %in% provided_args || ".names_sep" %in% provided_args) { - cli::cli_abort( - "epi_slide_opt: the argument `names_sep` is not supported for `epi_slide_opt`. If you want to customize - the output column names, use `dplyr::rename` after the slide.", - class = "epiprocess__epi_slide_opt__name_sep_not_supported" - ) - } - - assert_class(.x, "epi_df") - .x_orig_groups <- groups(.x) - if (inherits(.x, "grouped_df")) { - expected_group_keys <- .x %>% - key_colnames(exclude = "time_value") %>% - sort() - if (!identical(.x %>% group_vars() %>% sort(), expected_group_keys)) { - cli_abort( - "`.x` must be either grouped by {expected_group_keys} or ungrouped; if the latter, - we'll temporarily group by {expected_group_keys} for this operation. You may need - to aggregate your data first; see sum_groups_epi_df().", - class = "epiprocess__epi_slide_opt__invalid_grouping" - ) - } - } else { - .x <- group_epi_df(.x, exclude = "time_value") - } - if (nrow(.x) == 0L) { - cli_abort( - c( - "input data `.x` unexpectedly has 0 rows", - "i" = "If this computation is occuring within an `epix_slide` call, - check that `epix_slide` `.versions` argument was set appropriately - so that you don't get any completely-empty snapshots" - ), - class = "epiprocess__epi_slide_opt__0_row_input", - epiprocess__x = .x - ) - } - - # Check for duplicated time values within groups - assert(check_ukey_unique(ungroup(.x), c(group_vars(.x), "time_value"))) - - # The position of a given column can be differ between input `.x` and - # `.data_group` since the grouping step by default drops grouping columns. - # To avoid rerunning `eval_select` for every `.data_group`, convert - # positions of user-provided `col_names` into string column names. We avoid - # using `names(pos)` directly for robustness and in case we later want to - # allow users to rename fields via tidyselection. - col_names_quo <- enquo(.col_names) - pos <- eval_select(col_names_quo, data = .x, allow_rename = FALSE) - col_names_chr <- names(.x)[pos] - - # Check that slide function `.f` is one of those short-listed from - # `data.table` and `slider` (or a function that has the exact same definition, - # e.g. if the function has been reexported or defined locally). Extract some - # metadata. `namer` will be mapped over columns (.x will be a column, not the - # entire edf). - f_possibilities <- - tibble::tribble( - ~f, ~package, ~namer, - frollmean, "data.table", ~ if (is.logical(.x)) "prop" else "av", - frollsum, "data.table", ~ if (is.logical(.x)) "count" else "sum", - frollapply, "data.table", ~"slide", - slide_sum, "slider", ~ if (is.logical(.x)) "count" else "sum", - slide_prod, "slider", ~"prod", - slide_mean, "slider", ~ if (is.logical(.x)) "prop" else "av", - slide_min, "slider", ~"min", - slide_max, "slider", ~"max", - slide_all, "slider", ~"all", - slide_any, "slider", ~"any", - ) - f_info <- f_possibilities %>% - filter(map_lgl(.data$f, ~ identical(.f, .x))) - if (nrow(f_info) == 0L) { - # `f` is from somewhere else and not supported - cli_abort( - c( - "problem with {rlang::expr_label(rlang::caller_arg(f))}", - "i" = "`f` must be one of `data.table`'s rolling functions (`frollmean`, - `frollsum`, `frollapply`. See `?data.table::roll`) or one of - `slider`'s specialized sliding functions (`slide_mean`, `slide_sum`, - etc. See `?slider::\`summary-slide\`` for more options)." - ), - class = "epiprocess__epi_slide_opt__unsupported_slide_function", - epiprocess__f = .f - ) - } - if (nrow(f_info) > 1L) { - cli_abort('epiprocess internal error: looking up `.f` in table of possible - functions yielded multiple matches. Please report it using "New - issue" at https://github.com/cmu-delphi/epiprocess/issues, using - reprex::reprex to provide a minimal reproducible example.') - } - f_from_package <- f_info$package - - user_provided_rtvs <- !is.null(.ref_time_values) - if (!user_provided_rtvs) { - .ref_time_values <- unique(.x$time_value) - } else { - assert_numeric(.ref_time_values, min.len = 1L, null.ok = FALSE, any.missing = FALSE) - if (!test_subset(.ref_time_values, unique(.x$time_value))) { - cli_abort( - "`ref_time_values` must be a unique subset of the time values in `x`.", - class = "epiprocess__epi_slide_opt_invalid_ref_time_values" - ) - } - if (anyDuplicated(.ref_time_values) != 0L) { - cli_abort( - "`ref_time_values` must not contain any duplicates; use `unique` if appropriate.", - class = "epiprocess__epi_slide_opt_invalid_ref_time_values" - ) - } - } - ref_time_values <- sort(.ref_time_values) - - # Handle window arguments - .align <- rlang::arg_match(.align) - time_type <- attr(.x, "metadata")$time_type - if (is.null(.window_size)) { - cli_abort("epi_slide_opt: `.window_size` must be specified.") - } - validate_slide_window_arg(.window_size, time_type) - window_args <- get_before_after_from_window(.window_size, .align, time_type) - - # Handle output naming - if ((!is.null(.prefix) || !is.null(.suffix)) && !is.null(.new_col_names)) { - cli_abort( - "Can't use both .prefix/.suffix and .new_col_names at the same time.", - class = "epiprocess__epi_slide_opt_incompatible_naming_args" - ) - } - assert_string(.prefix, null.ok = TRUE) - assert_string(.suffix, null.ok = TRUE) - assert_character(.new_col_names, len = length(col_names_chr), null.ok = TRUE) - if (is.null(.prefix) && is.null(.suffix) && is.null(.new_col_names)) { - .suffix <- "_{.n}{.time_unit_abbr}{.align_abbr}{.f_abbr}" - # ^ does not account for any arguments specified to underlying functions via - # `...` such as `na.rm =`, nor does it distinguish between functions from - # different packages accomplishing the same type of computation. Those are - # probably only set one way per task, so this probably produces cleaner - # names without clashes (though maybe some confusion if switching between - # code with different settings). - } - if (!is.null(.prefix) || !is.null(.suffix)) { - .prefix <- .prefix %||% "" - .suffix <- .suffix %||% "" - if (identical(.window_size, Inf)) { - n <- "running_" - time_unit_abbr <- "" - align_abbr <- "" - } else { - n <- time_delta_to_n_steps(.window_size, time_type) - time_unit_abbr <- time_type_unit_abbr(time_type) - align_abbr <- c(right = "", center = "c", left = "l")[[.align]] - } - glue_env <- rlang::env( - .n = n, - .time_unit_abbr = time_unit_abbr, - .align_abbr = align_abbr, - .f_abbr = purrr::map_chr(.x[col_names_chr], unwrap(f_info$namer)), - quo_get_env(col_names_quo) - ) - .new_col_names <- unclass( - glue(.prefix, .envir = glue_env) + - col_names_chr + - glue(.suffix, .envir = glue_env) - ) - } else { - # `.new_col_names` was provided by user; we don't need to do anything. - } - if (any(.new_col_names %in% names(.x))) { - cli_abort(c( - "Naming conflict between new columns and existing columns", - "x" = "Overlapping names: {format_varnames(intersect(.new_col_names, names(.x)))}" - ), class = "epiprocess__epi_slide_opt_old_new_name_conflict") - } - if (anyDuplicated(.new_col_names)) { - cli_abort(c( - "New column names contain duplicates", - "x" = "Duplicated names: {format_varnames(unique(.new_col_names[duplicated(.new_col_names)]))}" - ), class = "epiprocess__epi_slide_opt_new_name_duplicated") - } - result_col_names <- .new_col_names - - # Make a complete date sequence between min(.x$time_value) and max(.x$time_value). - date_seq_list <- full_date_seq(.x, window_args$before, window_args$after, time_type) - all_dates <- date_seq_list$all_dates - pad_early_dates <- date_seq_list$pad_early_dates - pad_late_dates <- date_seq_list$pad_late_dates - - slide_one_grp <- function(.data_group, .group_key, ...) { - missing_times <- all_dates[!vec_in(all_dates, .data_group$time_value)] - # `frollmean` requires a full window to compute a result. Add NA values - # to beginning and end of the group so that we get results for the - # first `before` and last `after` elements. - .data_group <- vec_rbind( - .data_group, # (tibble; epi_slide_opt uses .keep = FALSE) - new_tibble(vec_recycle_common( - time_value = c(missing_times, pad_early_dates, pad_late_dates), - .real = FALSE - )) - ) %>% - `[`(vec_order(.$time_value), ) - - if (f_from_package == "data.table") { - # Grouping should ensure that we don't have duplicate time values. - # Completion above should ensure we have at least .window_size rows. Check - # that we don't have more than .window_size rows (or fewer somehow): - if (nrow(.data_group) != length(c(all_dates, pad_early_dates, pad_late_dates))) { - cli_abort( - c( - "group contains an unexpected number of rows", - "i" = c("Input data may contain `time_values` closer together than the - expected `time_step` size") - ), - class = "epiprocess__epi_slide_opt__unexpected_row_number", - epiprocess__data_group = .data_group, - epiprocess__group_key = .group_key - ) - } - - # `frollmean` is 1-indexed, so create a new window width based on our - # `before` and `after` params. Right-aligned `frollmean` results' - # `ref_time_value`s will be `after` timesteps ahead of where they should - # be; shift results to the left by `after` timesteps. - if (window_args$before != Inf) { - window_size <- window_args$before + window_args$after + 1L - roll_output <- .f(x = .data_group[, col_names_chr], n = window_size, ...) - } else { - window_size <- list(seq_along(.data_group$time_value)) - roll_output <- .f(x = .data_group[, col_names_chr], n = window_size, adaptive = TRUE, ...) - } - if (window_args$after >= 1) { - .data_group[, result_col_names] <- purrr::map(roll_output, function(.x) { - c(.x[(window_args$after + 1L):length(.x)], rep(NA, window_args$after)) - }) - } else { - .data_group[, result_col_names] <- roll_output - } - } - if (f_from_package == "slider") { - for (i in seq_along(col_names_chr)) { - .data_group[, result_col_names[i]] <- .f( - x = .data_group[[col_names_chr[i]]], - before = as.numeric(window_args$before), - after = as.numeric(window_args$after), - ... - ) - } - } - - .data_group - } - - result <- .x %>% - `[[<-`(".real", value = TRUE) %>% - group_modify(slide_one_grp, ..., .keep = FALSE) %>% - `[`(.$.real, names(.) != ".real") %>% - arrange_col_canonical() %>% - group_by(!!!.x_orig_groups) - - if (.all_rows) { - result[!vec_in(result$time_value, ref_time_values), result_col_names] <- NA - } else if (user_provided_rtvs) { - result <- result[vec_in(result$time_value, ref_time_values), ] - } - - if (!is_epi_df(result)) { - # `.all_rows` handling strips epi_df format and metadata. - # Restore them. - result <- reclass(result, attributes(.x)$metadata) - } - - return(result) -} - -#' @rdname epi_slide_opt -#' @description `epi_slide_mean` is a wrapper around `epi_slide_opt` with `.f = -#' data.table::frollmean`. -#' -#' @export -epi_slide_mean <- function( - .x, .col_names, ..., - .window_size = NULL, .align = c("right", "center", "left"), - .prefix = NULL, .suffix = NULL, .new_col_names = NULL, - .ref_time_values = NULL, .all_rows = FALSE) { - # Deprecated argument handling - provided_args <- rlang::call_args_names(rlang::call_match()) - if (any(purrr::map_lgl(provided_args, ~ .x %in% c("x", "col_names", "f", "ref_time_values", "all_rows")))) { - cli::cli_abort( - "epi_slide_mean: you are using one of the following old argument names: `x`, `col_names`, `f`, `ref_time_values`, - or `all_rows`. Please use the new dot-prefixed names: `.x`, `.col_names`, `.f`, - `.ref_time_values`, `.all_rows`." - ) - } - if ("as_list_col" %in% provided_args) { - cli::cli_abort( - "epi_slide_mean: the argument `as_list_col` is deprecated. If FALSE, you can just remove it. - If TRUE, have your given computation wrap its result using `list(result)` instead." - ) - } - if ("before" %in% provided_args || "after" %in% provided_args) { - cli::cli_abort( - "epi_slide_mean: `before` and `after` are deprecated for `epi_slide`. Use `.window_size` and `.align` instead. - See the slide documentation for more details." - ) - } - if ("new_col_name" %in% provided_args || ".new_col_name" %in% provided_args) { - cli::cli_abort( - "epi_slide_opt: the argument `new_col_name` is not supported for `epi_slide_opt`. If you want to customize - the output column names, use `.prefix =`, `.suffix =`, or `.new_col_**names** =`.", - class = "epiprocess__epi_slide_opt__new_name_not_supported" - ) - } - if ("names_sep" %in% provided_args || ".names_sep" %in% provided_args) { - cli::cli_abort( - "epi_slide_mean: the argument `names_sep` is not supported. If you want to customize - the output column names, use `dplyr::rename` after the slide." - ) - } - - epi_slide_opt( - .x = .x, - .col_names = {{ .col_names }}, - .f = data.table::frollmean, - ..., - .window_size = .window_size, - .align = .align, - .prefix = .prefix, - .suffix = .suffix, - .new_col_names = .new_col_names, - .ref_time_values = .ref_time_values, - .all_rows = .all_rows - ) -} - -#' @rdname epi_slide_opt -#' @description `epi_slide_sum` is a wrapper around `epi_slide_opt` with `.f = -#' data.table::frollsum`. -#' -#' @export -epi_slide_sum <- function( - .x, .col_names, ..., - .window_size = NULL, .align = c("right", "center", "left"), - .prefix = NULL, .suffix = NULL, .new_col_names = NULL, - .ref_time_values = NULL, .all_rows = FALSE) { - # Deprecated argument handling - provided_args <- rlang::call_args_names(rlang::call_match()) - if (any(purrr::map_lgl(provided_args, ~ .x %in% c("x", "col_names", "f", "ref_time_values", "all_rows")))) { - cli::cli_abort( - "epi_slide_sum: you are using one of the following old argument names: `x`, `col_names`, `f`, `ref_time_values`, - or `all_rows`. Please use the new dot-prefixed names: `.x`, `.col_names`, `.f`, - `.ref_time_values`, `.all_rows`." - ) - } - if ("as_list_col" %in% provided_args) { - cli::cli_abort( - "epi_slide_sum: the argument `as_list_col` is deprecated. If FALSE, you can just remove it. - If TRUE, have your given computation wrap its result using `list(result)` instead." - ) - } - if ("before" %in% provided_args || "after" %in% provided_args) { - cli::cli_abort( - "epi_slide_sum: `before` and `after` are deprecated for `epi_slide`. Use `.window_size` and `.align` instead. - See the slide documentation for more details." - ) - } - if ("new_col_name" %in% provided_args || ".new_col_name" %in% provided_args) { - cli::cli_abort( - "epi_slide_opt: the argument `new_col_name` is not supported for `epi_slide_opt`. If you want to customize - the output column names, use `.prefix =`, `.suffix =`, or `.new_col_**names** =`.", - class = "epiprocess__epi_slide_opt__new_name_not_supported" - ) - } - if ("names_sep" %in% provided_args || ".names_sep" %in% provided_args) { - cli::cli_abort( - "epi_slide_sum: the argument `names_sep` is not supported. If you want to customize - the output column names, use `dplyr::rename` after the slide." - ) - } - epi_slide_opt( - .x = .x, - .col_names = {{ .col_names }}, - .f = data.table::frollsum, - ..., - .window_size = .window_size, - .align = .align, - .prefix = .prefix, - .suffix = .suffix, - .new_col_names = .new_col_names, - .ref_time_values = .ref_time_values, - .all_rows = .all_rows - ) -} - -#' Make a complete date sequence between min(x$time_value) and max -#' (x$time_value). Produce lists of dates before min(x$time_value) and after -#' max(x$time_value) for padding initial and final windows to size `n`. -#' -#' `before` and `after` args are assumed to have been validated by the calling -#' function (using `validate_slide_window_arg`). -#' -#' @importFrom checkmate assert_function -#' @keywords internal -full_date_seq <- function(x, before, after, time_type) { - if (!time_type %in% c("day", "week", "yearmonth", "integer")) { - cli_abort( - "time_type must be one of 'day', 'week', or 'integer'." - ) - } - - pad_early_dates <- c() - pad_late_dates <- c() - - # `tsibble` time types have their own behavior, where adding 1 corresponds to - # incrementing by a quantum (smallest resolvable unit) of the date class. For - # example, one step = 1 quarter for `yearquarter`. - if (time_type %in% c("yearmonth", "integer")) { - all_dates <- seq(min(x$time_value), max(x$time_value), by = 1L) - - if (before != 0 && before != Inf) { - pad_early_dates <- all_dates[1L] - before:1 - } - if (after != 0) { - pad_late_dates <- all_dates[length(all_dates)] + 1:after - } - } else { - by <- switch(time_type, - day = "days", - week = "weeks", - ) - - all_dates <- seq(min(x$time_value), max(x$time_value), by = by) - if (before != 0 && before != Inf) { - # The behavior is analogous to the branch with tsibble types above. For - # more detail, note that the function `seq.Date(from, ..., length.out = - # n)` returns `from + 0:n`. Since we want `from + 1:n`, we drop the first - # element. Adding "-1" to the `by` arg makes `seq.Date` go backwards in - # time. - pad_early_dates <- sort(seq(all_dates[1L], by = paste("-1", by), length.out = before + 1)[-1]) - } - if (after != 0) { - pad_late_dates <- seq(all_dates[length(all_dates)], by = by, length.out = after + 1)[-1] - } - } - - list( - all_dates = all_dates, - pad_early_dates = pad_early_dates, - pad_late_dates = pad_late_dates - ) -} diff --git a/R/time-utils.R b/R/time-utils.R index 73fbc8a56..a5740f7f5 100644 --- a/R/time-utils.R +++ b/R/time-utils.R @@ -110,6 +110,8 @@ validate_slide_window_arg <- function(arg, time_type, lower = 1, allow_inf = TRU } } +time_delta_formats <- c("friendly", "fast") + #' Object that, added to time_values of time_type, advances by one time step/interval #' #' @param time_type string; `epi_df`'s or `epi_archive`'s `time_type` @@ -129,7 +131,7 @@ validate_slide_window_arg <- function(arg, time_type, lower = 1, allow_inf = TRU #' #' @keywords internal unit_time_delta <- function(time_type, format = c("friendly", "fast")) { - format <- rlang::arg_match(format) + format <- rlang::arg_match0(format, time_delta_formats) switch(format, friendly = switch(time_type, day = as.difftime(1, units = "days"), @@ -337,7 +339,7 @@ difftime_approx_ceiling_time_delta <- function(difftime, time_type) { ) } -#' Difference between two time value vectors in terms of number of time "steps" +#' Difference between two finite `time_value` vectors in terms of number of time "steps" #' #' @param x a time_value (vector) of time type `time_type` #' @param y a time_value (vector) of time type `time_type` @@ -350,15 +352,18 @@ time_minus_time_in_n_steps <- function(x, y, time_type) { time_delta_to_n_steps(x - y, time_type) } -#' Advance/retreat time_values by specified number of time "steps" +#' Advance/retreat time_value(s) by bare-integerish number(s) of time "steps" #' #' Here, a "step" is based on the `time_type`, not just the class of `x`. #' #' @param x a time_value (vector) of time type `time_type` -#' @param y integerish (vector) +#' @param y bare integerish (vector) #' @param time_type as in [`validate_slide_window_arg()`] #' @return a time_value (vector) of time type `time_type` #' +#' @seealso [`time_plus_slide_window_arg`] if you're working with a `y` that is +#' a slide window arg, which is scalar but otherwise more general (class-wise, +#' Inf-wise) than an integerish vector. #' @keywords internal time_plus_n_steps <- function(x, y, time_type) { x + y * unit_time_delta(time_type, "fast") diff --git a/_pkgdown.yml b/_pkgdown.yml index 3742bd416..8d3a09d5c 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -93,6 +93,10 @@ reference: - epidatasets::covid_incidence_outliers - epidatasets::covid_case_death_rates_extended + - title: Other utilities + - contents: + - vec_approx_equal + - title: internal - contents: - starts_with("internal") diff --git a/man/across_ish_names_info.Rd b/man/across_ish_names_info.Rd new file mode 100644 index 000000000..36b9ed040 --- /dev/null +++ b/man/across_ish_names_info.Rd @@ -0,0 +1,48 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/epi_slide_opt_edf.R +\name{across_ish_names_info} +\alias{across_ish_names_info} +\title{Calculate input and output column names for an \code{{epiprocess}} +\code{\link[dplyr:across]{dplyr::across}}-like operations} +\usage{ +across_ish_names_info( + .x, + time_type, + col_names_quo, + .f_namer, + .window_size, + .align, + .prefix, + .suffix, + .new_col_names +) +} +\arguments{ +\item{.x}{data.frame to perform input column tidyselection on} + +\item{time_type}{as in \code{\link{new_epi_df}}} + +\item{col_names_quo}{enquosed input column tidyselect expression} + +\item{.f_namer}{function taking an input column object and outputting a name +for a corresponding output column; see \code{\link{upstream_slide_f_info}}} + +\item{.window_size}{as in \code{\link{epi_slide_opt}}} + +\item{.align}{as in \code{\link{epi_slide_opt}}} + +\item{.prefix}{as in \code{\link{epi_slide_opt}}} + +\item{.suffix}{as in \code{\link{epi_slide_opt}}} + +\item{.new_col_names}{as in \code{\link{epi_slide_opt}}} +} +\value{ +named list with two elements: \code{input_col_names}, chr, subset of +\code{names(.x)}; and \code{output_colnames}, chr, same length as \code{input_col_names} +} +\description{ +Calculate input and output column names for an \code{{epiprocess}} +\code{\link[dplyr:across]{dplyr::across}}-like operations +} +\keyword{internal} diff --git a/man/apply_compactify.Rd b/man/apply_compactify.Rd index e96108789..6bd6ce770 100644 --- a/man/apply_compactify.Rd +++ b/man/apply_compactify.Rd @@ -16,7 +16,7 @@ would be \code{key(DT)}.} \item{abs_tol}{numeric, >=0; absolute tolerance to use on numeric measurement columns when determining whether something can be compactified away; see -\code{\link{is_locf}}} +\code{\link{vec_approx_equal}}} } \description{ Works by shifting all rows except the version, then comparing values to see diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index b92cd5057..5aa4ce785 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -25,7 +25,7 @@ as_epi_archive( is_epi_archive(x) new_epi_archive( - x, + data_table, geo_type, time_type, other_keys, @@ -36,20 +36,19 @@ new_epi_archive( validate_epi_archive(x) } \arguments{ -\item{x}{A data.frame, data.table, or tibble, with columns \code{geo_value}, -\code{time_value}, \code{version}, and then any additional number of columns.} +\item{x}{An object.} \item{geo_type}{DEPRECATED Has no effect. Geo value type is inferred from the location column and set to "custom" if not recognized.} -\item{time_type}{DEPRECATED Has no effect. Time value type inferred from the time -column and set to "custom" if not recognized. Unpredictable behavior may result -if the time type is not recognized.} +\item{time_type}{DEPRECATED Has no effect. Time value type inferred from the +time column and set to "custom" if not recognized. Unpredictable behavior +may result if the time type is not recognized.} \item{other_keys}{Character vector specifying the names of variables in \code{x} that should be considered key variables (in the language of \code{data.table}) -apart from "geo_value", "time_value", and "version". Typical examples -are "age" or more granular geographies.} +apart from "geo_value", "time_value", and "version". Typical examples are +"age" or more granular geographies.} \item{compactify}{Optional; \code{TRUE}, \code{FALSE}, or \code{"message"}. \code{TRUE} will remove some redundant rows, \code{FALSE} will not. \code{"message"} is like \code{TRUE} @@ -94,6 +93,13 @@ beyond \code{max(x$version)}, but they all contained empty updates. (The default value of \code{clobberable_versions_start} does not fully trust these empty updates, and assumes that any version \verb{>= max(x$version)} could be clobbered.) If \code{nrow(x) == 0}, then this argument is mandatory.} + +\item{data_table}{a \code{data.table} with \code{\link[data.table:setkey]{data.table::key()}} equal to +\code{c("geo_value", other_keys, "time_value", "version")}. For \code{data.table} +users: this sets up an alias of \code{data_table}; if you plan to keep on +working with \code{data_table} or working directly with the archive's \verb{$DT} +using mutating operations, you should \code{copy()} if appropriate. We will not +mutate the \code{DT} with any exported \code{{epiprocess}} functions, though.} } \value{ \itemize{ diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index 4b75e9ffb..8c7d5ad49 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/slide.R +% Please edit documentation in R/epi_slide_opt_edf.R \name{epi_slide_opt} \alias{epi_slide_opt} \alias{epi_slide_mean} @@ -134,10 +134,21 @@ added. It will be ungrouped if \code{.x} was ungrouped, and have the same groups as \code{.x} if \code{.x} was grouped. } \description{ -\code{epi_slide_opt} allows sliding an n-timestep \link[data.table:froll]{data.table::froll} -or \link[slider:summary-slide]{slider::summary-slide} function over variables in an \code{epi_df} object. -These functions tend to be much faster than \code{epi_slide()}. See -\code{vignette("epi_df")} for more examples. +\code{epi_slide_opt} calculates n-time-step rolling means&sums, +cumulative/"running" means&sums, and other operations supported by +\code{\link[data.table:froll]{data.table::froll}} or \code{\link[slider:summary-slide]{slider::summary-slide}} functions. +\itemize{ +\item On \code{epi_df}s, it will take care of looping over \code{geo_value}s, temporarily +filling in time gaps with \code{NA}s and other work needed to ensure there are +exactly \code{n} consecutive time steps per computation, and has some other +convenience features. See \code{vignette("epi_df")} for more examples. +\item On \code{epi_archive}s, it will calculate the version history for these slide +computations and combine it with the version history for the rest of the +columns. +} + +This function tends to be much faster than using \code{epi_slide()} and +\code{epix_slide()} directly. \code{epi_slide_mean} is a wrapper around \code{epi_slide_opt} with \code{.f = data.table::frollmean}. @@ -211,6 +222,12 @@ cases_deaths_subset \%>\% .suffix = "_{.n}{.time_unit_abbr}_median" ) \%>\% print(n = 40) + +# You can calculate entire version histories for the derived signals by +# calling `epi_slide_opt()` on an `epi_archive`: +case_death_rate_archive \%>\% + epi_slide_mean(case_rate, .window_size = 14) + } \seealso{ \code{\link{epi_slide}} for the more general slide function diff --git a/man/epi_slide_opt_archive_one_epikey.Rd b/man/epi_slide_opt_archive_one_epikey.Rd new file mode 100644 index 000000000..847d0d162 --- /dev/null +++ b/man/epi_slide_opt_archive_one_epikey.Rd @@ -0,0 +1,75 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/epi_slide_opt_archive.R +\name{epi_slide_opt_archive_one_epikey} +\alias{epi_slide_opt_archive_one_epikey} +\title{Core operation of \code{epi_slide_opt.epi_archive} for a single epikey's history} +\usage{ +epi_slide_opt_archive_one_epikey( + inp_updates, + in_colnames, + f_dots_baked, + f_from_package, + before_n_steps, + after_n_steps, + time_type, + out_colnames +) +} +\arguments{ +\item{inp_updates}{tibble with a \code{version} column and measurement columns for +a single epikey, without the epikey labeling columns (e.g., from +\code{group_modify}). Interpretation is analogous to an \code{epi_archive} \code{DT}, but +a specific row order is not required.} + +\item{in_colnames}{chr; names of columns to which to apply \code{f_dots_baked}} + +\item{f_dots_baked}{supported sliding function from \code{{data.table}} or +\code{{slider}}, potentially with some arguments baked in with +\code{\link[purrr:partial]{purrr::partial}}} + +\item{f_from_package}{string; name of package from which \code{f_dots_baked} +(pre-\code{partial}) originates} + +\item{before_n_steps}{integerish \verb{>=0} or \code{Inf}; number of time steps before +each \code{ref_time_value} to include in the sliding window computation; \code{Inf} +to include all times beginning with the min \code{time_value}} + +\item{after_n_steps}{integerish \verb{>=0}; number of time steps after each +\code{ref_time_value} to include in the sliding window computation} + +\item{time_type}{as in \code{new_epi_archive}} + +\item{out_colnames}{chr, same length as \code{in_colnames}; column names to use +for results} +} +\value{ +tibble with a \code{version} column, pre-existing measurement columns, and +new measurement columns; (compactified) diff data to put into an +\code{epi_archive}. May not match column ordering; may not ensure any row +ordering. +} +\description{ +Core operation of \code{epi_slide_opt.epi_archive} for a single epikey's history +} +\examples{ + +library(dplyr) +inp_updates <- bind_rows( + tibble(version = 30, time_value = 1:20, value = 1:20), + tibble(version = 32, time_value = 4:5, value = 5:4), + tibble(version = 33, time_value = 8, value = 9), + tibble(version = 34, time_value = 11, value = NA), + tibble(version = 35, time_value = -10, value = -10), + tibble(version = 56, time_value = 50, value = 50) +) \%>\% + mutate(across(c(version, time_value), ~ as.Date("2020-01-01") - 1 + .x)) + +f <- purrr::partial(data.table::frollmean, algo = "exact") + +inp_updates \%>\% + epiprocess:::epi_slide_opt_archive_one_epikey( + "value", f, "data.table", 2L, 0L, "day", "slide_value" + ) + +} +\keyword{internal} diff --git a/man/epi_slide_opt_edf_one_epikey.Rd b/man/epi_slide_opt_edf_one_epikey.Rd new file mode 100644 index 000000000..b3cfad5b3 --- /dev/null +++ b/man/epi_slide_opt_edf_one_epikey.Rd @@ -0,0 +1,98 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/epi_slide_opt_edf.R +\name{epi_slide_opt_edf_one_epikey} +\alias{epi_slide_opt_edf_one_epikey} +\title{Run a specialized slide computation on a single \code{epi_df} epikey, with temporary completion} +\usage{ +epi_slide_opt_edf_one_epikey( + inp_tbl, + in_colnames, + f_dots_baked, + f_from_package, + before_n_steps, + after_n_steps, + unit_step, + time_type, + out_colnames, + out_filter_time_range, + out_filter_time_set +) +} +\arguments{ +\item{inp_tbl}{tibble; should have a \code{time_value} column and columns named in +\code{in_colnames}; should not contain any columns named in \code{out_colnames}} + +\item{in_colnames}{chr; names of columns to which to apply \code{f_dots_baked}} + +\item{f_dots_baked}{supported sliding function from \code{{data.table}} or +\code{{slider}}, potentially with some arguments baked in with +\code{\link[purrr:partial]{purrr::partial}}} + +\item{f_from_package}{string; name of package from which \code{f_dots_baked} +(pre-\code{partial}) originates} + +\item{before_n_steps}{integerish \verb{>=0} or \code{Inf}; number of time steps before +each \code{ref_time_value} to include in the sliding window computation; \code{Inf} +to include all times beginning with the min \code{time_value}} + +\item{after_n_steps}{integerish \verb{>=0}; number of time steps after each +\code{ref_time_value} to include in the sliding window computation} + +\item{time_type}{as in \code{new_epi_archive}} + +\item{out_colnames}{chr, same length as \code{in_colnames}; column names to use +for results} + +\item{out_filter_time_range, out_filter_time_set}{\code{time_value} filter; +\code{time_values} in the output should match the result of applying this filter +to \code{inp_tbl$time_value}. Exactly one of the two must be provided +(non-\code{NULL}) and the other must be \code{NULL}. \code{out_filter_time_range}, if +provided, should be a length-2 vector/list containing the minimum and +maximum \code{time_value} to allow in the output. \code{out_filter_time_set}, if +provided, should be a vector of \code{time_values} to intersect with the input +\code{time_value}s.} +} +\value{ +tibble; like \code{inp_tbl} with addition of \code{out_colnames} holding the +slide computation results, with times filtered down as specified +} +\description{ +Run a specialized slide computation on a single \code{epi_df} epikey, with temporary completion +} +\examples{ + +library(dplyr) +tbl <- tibble( + time_value = c(11:12, 15:18) + 0, + value = c(c(1, 2), c(4, 8, 16, 32)) +) + +tbl \%>\% + epi_slide_opt_edf_one_epikey( + "value", + frollmean, "data.table", + 1L, 0L, 1L, "integer", + "slide_value", + c(11L, 16L), NULL + ) + +tbl \%>\% + epi_slide_opt_edf_one_epikey( + "value", + frollmean, "data.table", + 0L, 1L, 1L, "integer", + "slide_value", + NULL, c(11, 15, 16, 17, 18) + ) + +tbl \%>\% + epi_slide_opt_edf_one_epikey( + "value", + frollmean, "data.table", + Inf, 0L, 1L, "integer", + "slide_value", + NULL, c(12, 17) + ) + +} +\keyword{internal} diff --git a/man/full_date_seq.Rd b/man/full_date_seq.Rd index eb36b2c17..1a29c5e92 100644 --- a/man/full_date_seq.Rd +++ b/man/full_date_seq.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/slide.R +% Please edit documentation in R/epi_slide_opt_edf.R \name{full_date_seq} \alias{full_date_seq} \title{Make a complete date sequence between min(x$time_value) and max diff --git a/man/is_locf.Rd b/man/is_locf.Rd deleted file mode 100644 index f8f8eefcb..000000000 --- a/man/is_locf.Rd +++ /dev/null @@ -1,32 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/archive.R -\name{is_locf} -\alias{is_locf} -\title{Checks to see if a value in a vector is LOCF} -\usage{ -is_locf(vec, abs_tol, is_key) -} -\description{ -LOCF meaning last observation carried forward (to later -versions). Lags the vector by 1, then compares with itself. If \code{is_key} is -\code{TRUE}, only values that are exactly the same between the lagged and -original are considered LOCF. If \code{is_key} is \code{FALSE} and \code{vec} is a vector -of numbers (\code{\link[base:numeric]{base::is.numeric}}), then approximate equality will be used, -checking whether the absolute difference between each pair of entries is -\verb{<= abs_tol}; if \code{vec} is something else, then exact equality is used -instead. -} -\details{ -We include epikey-time columns in LOCF comparisons as part of an optimization -to avoid slower grouped operations while still ensuring that the first -observation for each time series will not be marked as LOCF. We test these -key columns for exact equality to prevent chopping off consecutive -time_values during flat periods when \code{abs_tol} is high. - -We use exact equality for non-\code{is.numeric} double/integer columns such as -dates, datetimes, difftimes, \code{tsibble::yearmonth}s, etc., as these may be -used as part of re-indexing or grouping procedures, and we don't want to -change the number of groups for those operations when we remove LOCF data -during compactification. -} -\keyword{internal} diff --git a/man/tbl_diff2.Rd b/man/tbl_diff2.Rd new file mode 100644 index 000000000..f4e393b95 --- /dev/null +++ b/man/tbl_diff2.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/patch.R +\name{tbl_diff2} +\alias{tbl_diff2} +\title{Calculate compact patch to move from one snapshot/update to another} +\usage{ +tbl_diff2( + earlier_snapshot, + later_tbl, + ukey_names, + later_format = c("snapshot", "update"), + compactify_abs_tol = 0 +) +} +\arguments{ +\item{earlier_snapshot}{tibble or \code{NULL}; \code{NULL} represents that there was no +data before \code{later_tbl}.} + +\item{later_tbl}{tibble; must have the same column names as +\code{earlier_snapshot} if it is a tibble.} + +\item{ukey_names}{character; column names that together, form a unique key +for \code{earlier_snapshot} and for \code{later_tbl}. This is unchecked; see +\code{\link{check_ukey_unique}} if you don't already have this guaranteed.} + +\item{later_format}{"snapshot" or "update"; default is "snapshot". If +"snapshot", \code{later_tbl} will be interpreted as a full snapshot of the data +set including all ukeys, and any ukeys that are in \code{earlier_snapshot} but +not in \code{later_tbl} are interpreted as deletions, which are currently +(imprecisely) represented in the output patch as revisions of all +non-\code{ukey_names} columns to NA values (using \code{{vctrs}}). If "update", then +it's assumed that any deletions have already been represented this way in +\code{later_tbl} and any ukeys not in \code{later_tbl} are simply unchanged; we are +just ensuring that the update is fully compact for the given +\code{compactify_abs_tol}.} + +\item{compactify_abs_tol}{compactification tolerance; see \code{apply_compactify}} +} +\value{ +a tibble in compact "update" (diff) format +} +\description{ +Calculate compact patch to move from one snapshot/update to another +} +\keyword{internal} diff --git a/man/tbl_fast_anti_join.Rd b/man/tbl_fast_anti_join.Rd new file mode 100644 index 000000000..91104b2a1 --- /dev/null +++ b/man/tbl_fast_anti_join.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/patch.R +\name{tbl_fast_anti_join} +\alias{tbl_fast_anti_join} +\title{Variation on \code{\link[dplyr:filter-joins]{dplyr::anti_join}} for speed + tolerance setting} +\usage{ +tbl_fast_anti_join(x, y, ukey_names, val_names, abs_tol = 0) +} +\arguments{ +\item{x}{tibble; \code{x[ukey_names]} must not have any duplicate rows} + +\item{y}{tibble; \code{y[ukey_names]} must not have any duplicate rows} + +\item{ukey_names}{chr; names of columns that form a unique key, for \code{x} and +for \code{y}} + +\item{val_names}{chr; names of columns which should be treated as +value/measurement columns, and compared with a tolerance} + +\item{abs_tol}{scalar non-negative numeric; absolute tolerance with which to +compare value columns; see \code{\link{vec_approx_equal}}} +} +\value{ +rows from \code{x} that either (a) don't have a (0-tolerance) matching +ukey in \code{y}, or (b) have a matching ukey in \code{y}, but don't have +approximately equal value column values +} +\description{ +Variation on \code{\link[dplyr:filter-joins]{dplyr::anti_join}} for speed + tolerance setting +} +\keyword{internal} diff --git a/man/tbl_patch.Rd b/man/tbl_patch.Rd new file mode 100644 index 000000000..6d9eee270 --- /dev/null +++ b/man/tbl_patch.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/patch.R +\name{tbl_patch} +\alias{tbl_patch} +\title{Apply an update (e.g., from \code{tbl_diff2}) to a snapshot} +\usage{ +tbl_patch(snapshot, update, ukey_names) +} +\arguments{ +\item{snapshot}{tibble or \code{NULL}; entire data set as of some version, or +\code{NULL} to treat \code{update} as the initial version of the data set.} + +\item{update}{tibble; ukeys + initial values for added rows, ukeys + new +values for changed rows. Deletions must be imprecisely represented as +changing all values to NAs.} + +\item{ukey_names}{character; names of columns that should form a unique key +for \code{snapshot} and for \code{update}. Uniqueness is unchecked; if you don't have +this guaranteed, see \code{\link[=check_ukey_unique]{check_ukey_unique()}}.} +} +\value{ +tibble; snapshot of the data set with the update applied. +} +\description{ +Apply an update (e.g., from \code{tbl_diff2}) to a snapshot +} +\keyword{internal} diff --git a/man/time_minus_time_in_n_steps.Rd b/man/time_minus_time_in_n_steps.Rd index aab030dea..926b79034 100644 --- a/man/time_minus_time_in_n_steps.Rd +++ b/man/time_minus_time_in_n_steps.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/time-utils.R \name{time_minus_time_in_n_steps} \alias{time_minus_time_in_n_steps} -\title{Difference between two time value vectors in terms of number of time "steps"} +\title{Difference between two finite \code{time_value} vectors in terms of number of time "steps"} \usage{ time_minus_time_in_n_steps(x, y, time_type) } @@ -18,6 +18,6 @@ integerish vector such that \code{x + n_steps_to_time_delta_fast(result)} should equal \code{y}. } \description{ -Difference between two time value vectors in terms of number of time "steps" +Difference between two finite \code{time_value} vectors in terms of number of time "steps" } \keyword{internal} diff --git a/man/time_plus_n_steps.Rd b/man/time_plus_n_steps.Rd index f7071c132..26edf9053 100644 --- a/man/time_plus_n_steps.Rd +++ b/man/time_plus_n_steps.Rd @@ -3,7 +3,7 @@ \name{time_plus_n_steps} \alias{time_plus_n_steps} \alias{time_minus_n_steps} -\title{Advance/retreat time_values by specified number of time "steps"} +\title{Advance/retreat time_value(s) by bare-integerish number(s) of time "steps"} \usage{ time_plus_n_steps(x, y, time_type) @@ -12,7 +12,7 @@ time_minus_n_steps(x, y, time_type) \arguments{ \item{x}{a time_value (vector) of time type \code{time_type}} -\item{y}{integerish (vector)} +\item{y}{bare integerish (vector)} \item{time_type}{as in \code{\link[=validate_slide_window_arg]{validate_slide_window_arg()}}} } @@ -22,4 +22,9 @@ a time_value (vector) of time type \code{time_type} \description{ Here, a "step" is based on the \code{time_type}, not just the class of \code{x}. } +\seealso{ +\code{\link{time_plus_slide_window_arg}} if you're working with a \code{y} that is +a slide window arg, which is scalar but otherwise more general (class-wise, +Inf-wise) than an integerish vector. +} \keyword{internal} diff --git a/man/update_is_locf.Rd b/man/update_is_locf.Rd index 722f3d5c4..aaa70bad2 100644 --- a/man/update_is_locf.Rd +++ b/man/update_is_locf.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/archive.R \name{update_is_locf} \alias{update_is_locf} -\title{Internal helper; lgl; which updates are LOCF} +\title{Internal helper; lgl; which updates are LOCF and should thus be dropped when +compactifying} \usage{ update_is_locf(arranged_updates_df, ukey_names, abs_tol) } diff --git a/man/upstream_slide_f_info.Rd b/man/upstream_slide_f_info.Rd new file mode 100644 index 000000000..6b8ba7683 --- /dev/null +++ b/man/upstream_slide_f_info.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/epi_slide_opt_edf.R +\name{upstream_slide_f_info} +\alias{upstream_slide_f_info} +\title{Validate & get information about an upstream slide function} +\usage{ +upstream_slide_f_info(.f, ...) +} +\arguments{ +\item{.f}{function such as \code{data.table::frollmean} or \code{slider::slide_mean}; +must appear in \code{\link{upstream_slide_f_possibilities}}} + +\item{...}{additional configuration args to \code{.f} (besides the data and window +size&alignment); used to validate \code{.f} is used in a supported way} +} +\value{ +named list with two elements: \code{from_package}, a string containing the +upstream package name ("data.table" or "slider"), and \code{namer}, a function +that takes a column to call \code{.f} on and outputs a basic name or +abbreviation for what operation \code{.f} represents on that kind of column +(e.g., "sum", "av", "count"). +} +\description{ +Validate & get information about an upstream slide function +} +\keyword{internal} diff --git a/man/upstream_slide_f_possibilities.Rd b/man/upstream_slide_f_possibilities.Rd new file mode 100644 index 000000000..0fae2cc4f --- /dev/null +++ b/man/upstream_slide_f_possibilities.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/epi_slide_opt_edf.R +\docType{data} +\name{upstream_slide_f_possibilities} +\alias{upstream_slide_f_possibilities} +\title{Information about upstream (\code{{data.table}}/\code{{slider}}) slide functions} +\format{ +An object of class \code{tbl_df} (inherits from \code{tbl}, \code{data.frame}) with 10 rows and 3 columns. +} +\usage{ +upstream_slide_f_possibilities +} +\description{ +Underlies \code{\link{upstream_slide_f_info}}. +} +\keyword{internal} diff --git a/man/vec_approx_equal.Rd b/man/vec_approx_equal.Rd new file mode 100644 index 000000000..4bab9f77f --- /dev/null +++ b/man/vec_approx_equal.Rd @@ -0,0 +1,89 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/patch.R +\name{vec_approx_equal} +\alias{vec_approx_equal} +\title{Test two vctrs vectors for equality with some tolerance in some cases} +\usage{ +vec_approx_equal( + vec1, + vec2, + na_equal, + .ptype = NULL, + ..., + abs_tol, + inds1 = NULL, + inds2 = NULL +) +} +\arguments{ +\item{vec1, vec2}{vctrs vectors (includes data frames). Take care when using +on named vectors or "keyed" data frames; \code{\link[=vec_names]{vec_names()}} are largely +ignored, and key columns are treated as normal value columns (when they +should probably generate an error if they are not lined up correctly, or be +tested for exact rather than approximate equality).} + +\item{na_equal}{should \code{NA}s be considered equal to each other? (In +epiprocess, we usually want this to be \code{TRUE}, but that doesn't match the +\code{\link[vctrs:vec_equal]{vctrs::vec_equal()}} default, so this is mandatory.)} + +\item{.ptype}{as in \code{\link[vctrs:vec_equal]{vctrs::vec_equal()}}.} + +\item{...}{should be empty (it's here to force later arguments to be passed +by name)} + +\item{abs_tol}{absolute tolerance; will be used for bare numeric \code{vec1}, +\code{vec2}, or any such columns within \code{vec1}, \code{vec2} if they are data frames.} + +\item{inds1, inds2}{optional (row) indices into vec1 and vec2 compatible with +\code{\link[vctrs:vec_slice]{vctrs::vec_slice()}}; output should be consistent with \code{vec_slice}-ing to +these indices beforehand, but can give faster computation if \code{vec1} and +\code{vec2} are data frames. Currently, any speedup is only by making sure that +\code{vec_slice} is used rather than \code{[} for data frames.} +} +\value{ +logical vector, with length matching the result of recycling \code{vec1} +(at \code{inds1} if provided) and \code{vec2} (at \code{inds2} if provided); entries +should all be \code{TRUE} or \code{FALSE} if \code{na_equal = TRUE}. +} +\description{ +Generalizes \code{\link[vctrs:vec_equal]{vctrs::vec_equal}}. +} +\examples{ + +library(dplyr) + +# On numeric vectors: +vec_approx_equal( + c(1, 2, 3, NA), + c(1, 2 + 1e-10, NA, NA), + na_equal = TRUE, + abs_tol = 1e-8 +) + +# On tibbles: +tbl1 <- tibble( + a = 1:5, + b = list(1:5, 1:4, 1:3, 1:2, 1:1) \%>\% lapply(as.numeric), + c = tibble( + c1 = 1:5 + ), + d = matrix(1:10, 5, 2) +) +tbl2 <- tbl1 +tbl2$a[[2]] <- tbl1$a[[2]] + 1e-10 +tbl2$b[[3]][[1]] <- tbl1$b[[3]][[1]] + 1e-10 +tbl2$c$c1[[4]] <- tbl1$c$c1[[4]] + 1e-10 +tbl2$d[[5, 2]] <- tbl1$d[[5, 2]] + 1e-10 +vctrs::vec_equal(tbl1, tbl2, na_equal = TRUE) +vec_approx_equal(tbl1, tbl2, na_equal = TRUE, abs_tol = 1e-12) +vec_approx_equal(tbl1, tbl2, na_equal = TRUE, abs_tol = 1e-8) + + + + + +# Type comparison within lists is stricter, matching vctrs: +vctrs::vec_equal(list(1:2), list(as.numeric(1:2))) +vec_approx_equal(list(1:2), list(as.numeric(1:2)), FALSE, abs_tol = 0) + +} diff --git a/man/vec_approx_equal0.Rd b/man/vec_approx_equal0.Rd new file mode 100644 index 000000000..01a1141ba --- /dev/null +++ b/man/vec_approx_equal0.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/patch.R +\name{vec_approx_equal0} +\alias{vec_approx_equal0} +\title{Helper for \code{\link{vec_approx_equal}} for vecs guaranteed to have the same ptype and size} +\usage{ +vec_approx_equal0(vec1, vec2, na_equal, abs_tol, inds1 = NULL, inds2 = NULL) +} +\description{ +Helper for \code{\link{vec_approx_equal}} for vecs guaranteed to have the same ptype and size +} +\keyword{internal} diff --git a/tests/testthat/test-archive.R b/tests/testthat/test-archive.R index 0e84b03ba..4383272d2 100644 --- a/tests/testthat/test-archive.R +++ b/tests/testthat/test-archive.R @@ -92,7 +92,7 @@ test_that("epi_archives are correctly instantiated with a variety of data types" expect_equal(key(ea1$DT), c("geo_value", "time_value", "version")) ea2 <- as_epi_archive(df, other_keys = "value", compactify = FALSE) - expect_equal(key(ea2$DT), c("geo_value", "time_value", "value", "version")) + expect_equal(key(ea2$DT), c("geo_value", "value", "time_value", "version")) # Tibble tib <- tibble::tibble(df, code = "x") @@ -101,7 +101,7 @@ test_that("epi_archives are correctly instantiated with a variety of data types" expect_equal(key(ea3$DT), c("geo_value", "time_value", "version")) ea4 <- as_epi_archive(tib, other_keys = "code", compactify = FALSE) - expect_equal(key(ea4$DT), c("geo_value", "time_value", "code", "version")) + expect_equal(key(ea4$DT), c("geo_value", "code", "time_value", "version")) # Keyed data.table kdt <- data.table::data.table( @@ -119,7 +119,7 @@ test_that("epi_archives are correctly instantiated with a variety of data types" ea6 <- as_epi_archive(kdt, other_keys = "value", compactify = FALSE) # Mismatched keys, but the one from as_epi_archive overrides - expect_equal(key(ea6$DT), c("geo_value", "time_value", "value", "version")) + expect_equal(key(ea6$DT), c("geo_value", "value", "time_value", "version")) # Unkeyed data.table udt <- data.table::data.table( @@ -134,7 +134,7 @@ test_that("epi_archives are correctly instantiated with a variety of data types" expect_equal(key(ea7$DT), c("geo_value", "time_value", "version")) ea8 <- as_epi_archive(udt, other_keys = "code", compactify = FALSE) - expect_equal(key(ea8$DT), c("geo_value", "time_value", "code", "version")) + expect_equal(key(ea8$DT), c("geo_value", "code", "time_value", "version")) # epi_df edf1 <- cases_deaths_subset %>% @@ -145,7 +145,7 @@ test_that("epi_archives are correctly instantiated with a variety of data types" expect_equal(key(ea9$DT), c("geo_value", "time_value", "version")) ea10 <- as_epi_archive(edf1, other_keys = "code", compactify = FALSE) - expect_equal(key(ea10$DT), c("geo_value", "time_value", "code", "version")) + expect_equal(key(ea10$DT), c("geo_value", "code", "time_value", "version")) # Keyed epi_df edf2 <- data.frame( @@ -164,7 +164,7 @@ test_that("epi_archives are correctly instantiated with a variety of data types" expect_equal(key(ea11$DT), c("geo_value", "time_value", "version")) ea12 <- as_epi_archive(edf2, other_keys = "misc", compactify = FALSE) - expect_equal(key(ea12$DT), c("geo_value", "time_value", "misc", "version")) + expect_equal(key(ea12$DT), c("geo_value", "misc", "time_value", "version")) }) test_that("`epi_archive` rejects nonunique keys", { @@ -216,9 +216,3 @@ test_that("`epi_archive` rejects dataframes where time_value and version columns ) expect_error(as_epi_archive(tbl3), class = "epiprocess__time_value_version_mismatch") }) - -test_that("is_locf works as expected", { - vec <- c(1, 1, 1e-10, 1.1e-10, NA, NA, NaN, NaN) - is_repeated <- c(0, 1, 0, 1, 0, 1, 1, 1) - expect_equal(is_locf(vec, .Machine$double.eps^0.5, FALSE), as.logical(is_repeated)) -}) diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index 229af8453..e314b299f 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -124,7 +124,7 @@ quantile_pred_once <- function(estimates_vec, levels_vec) { hardhat::quantile_pred(t(as.matrix(estimates_vec)), levels_vec) } test_that("compactify works on distributions", { - skip("Until #611 is merged or hardhat/epipredict is patched") + skip("See #631.") forecasts <- tibble( ahead = 2L, geo_value = "ak", diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index 0aa4aca7f..6fab55d44 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -937,3 +937,19 @@ test_that("epi_slide* output grouping matches input grouping", { character(0) ) }) + +test_that('`epi_slide_opt .align != "right"` errors on `fill` arg', { + test_date <- as.Date("2020-01-01") + toy_edf <- tibble( + geo_value = 1, + time_value = test_date - 1 + 1:5, + value = c(1:3, NA, 5) + ) %>% + as_epi_df(as_of = test_date + 10) + + expect_error( + toy_edf %>% + epi_slide_opt(value, frollmean, .window_size = 3, .align = "left", fill = -1000), + class = "epiprocess__epi_slide_opt__fill_unsupported" + ) +}) diff --git a/tests/testthat/test-epi_slide_opt_archive.R b/tests/testthat/test-epi_slide_opt_archive.R new file mode 100644 index 000000000..999234217 --- /dev/null +++ b/tests/testthat/test-epi_slide_opt_archive.R @@ -0,0 +1,220 @@ +library(dplyr) + +test_that("epi_slide_opt_archive_one_epikey works as expected", { + start_date <- as.Date("2020-01-01") + + grp_updates <- bind_rows( + tibble(version = 10, time_value = 0:20, value = 0:20), + tibble(version = 12, time_value = 4:5, value = 5:4), + tibble(version = 13, time_value = 8, value = 9), + tibble(version = 14, time_value = 11, value = NA), + tibble(version = 15, time_value = -10, value = -10), + tibble(version = 16, time_value = 50, value = 50) + ) %>% + mutate(across(c(version, time_value), ~ start_date - 1 + .x)) + + expected <- list( + grp_updates %>% + slice_min(version) %>% + mutate(across(c(version, time_value), ~ as.numeric(.x - start_date) + 1)) %>% + mutate(slide_value = frollmean(value, 3, algo = "exact")), + tibble( + version = 12, + time_value = c(4, 5, 7), # time 6 unchanged, compactified away + # time 7 `value` unchanged, but here because `slide_value` changed: + value = c(5, 4, 7), + slide_value = c( + mean(c(2, 3, 5)), + # time 5 `slide_value` unchanged, but here because `value` changed: + mean(c(3, 5, 4)), + mean(c(4, 6, 7)) + ) + ), + tibble( + version = 13, time_value = 8:10, value = c(9, 9, 10), + slide_value = frollmean(c(6, 7, 9, 9, 10), 3, algo = "exact")[-(1:2)] + ), + tibble( + version = 14, time_value = 11:13, value = c(NA, 12, 13), slide_value = rep(NA_real_, 3L) + ), + tibble( + version = 15, time_value = -10, value = -10, slide_value = NA_real_ + ), + tibble( + version = 16, time_value = 50, value = 50, slide_value = NA_real_ + ) + ) %>% + lapply(function(x) { + x %>% + mutate(across(c(version, time_value), ~ start_date - 1 + .x)) + }) %>% + list_rbind() + + f <- purrr::partial(data.table::frollmean, algo = "exact") + + result <- grp_updates %>% + epiprocess:::epi_slide_opt_archive_one_epikey("value", f, "data.table", 2L, 0L, "day", "slide_value") %>% + arrange(version, time_value) %>% + select(version, time_value, everything()) + + expect_equal(result, expected) +}) + + +test_that("epi_slide_opt.epi_archive is not confused by unique(DT$version) unsorted", { + start_date <- as.Date("2020-01-01") + tibble( + geo_value = 1, + time_value = start_date - 1 + 1:4, + version = start_date - 1 + c(5, 5, 4, 4), + value = c(1, 2, 3, 4) + ) %>% + as_epi_archive() %>% + epi_slide_opt(value, frollmean, .window_size = 2L) %>% + expect_equal( + tibble( + geo_value = 1, + time_value = start_date - 1 + c(1, 2, 3, 3, 4), + version = start_date - 1 + c(5, 5, 4, 5, 4), + value = c(1, 2, 3, 3, 4), + value_2dav = c(NA, 1.5, NA, 2.5, 3.5) + ) %>% + as_epi_archive() + ) +}) + +test_that("epi_slide_opt.epi_archive is not confused by unique(DT$time_value) unsorted", { + start_date <- as.Date("2020-01-01") + tibble( + geo_value = c(1, 1, 2, 2), + time_value = start_date - 1 + c(2, 3, 1, 2), + version = start_date - 1 + c(1, 2, 2, 2), + value = c(1, 2, 3, 4) + ) %>% + as_epi_archive() %>% + epi_slide_opt(value, frollmean, .window_size = 2L) %>% + expect_equal( + tibble( + geo_value = c(1, 1, 2, 2), + time_value = start_date - 1 + c(2, 3, 1, 2), + version = start_date - 1 + c(1, 2, 2, 2), + value = c(1, 2, 3, 4), + value_2dav = c(NA, 1.5, NA, 3.5) + ) %>% + as_epi_archive() + ) +}) + +test_that("epi_slide_opt.epi_archive gives expected results on example data; also grouped behavior", { + # vs. built-in case_rate_7d_av column. + # + # If we were to compare the keyset vs. + # the original, it changes, as the original contains some tiny deviations in + # values that don't seem achievable with available sliding functions. E.g., in + # the recomputed result, geo "ak" version "2020-11-01" changes time 2020-03-13 + # from 0 to 0.138 and time 2020-03-14 from a slightly different value of 0.138 + # to 0, while nearby times remained stable; in the original, this resulted in + # a tiny update to the 7d_av for 2020-03-14 but not following times somehow, + # while in the recomputation there are also minute updates to 2020-03-15 and + # 2020-03-16; 2020-03-17 onward have other case_rate changes factoring in. + # Compactifying and comparing with tolerances would help account for some of + # these differences, but only through writing this was it realized that both + # archives would need the recompactification with tolerance; it's not just + # epi_slide_opt.epi_archive's very rigid compactification that's the cause. + # (Side note: allowing configurable compactification tolerance in + # epi_slide_opt.epi_archive wasn't included due to either feeling strange + # applying the compactification tolerance to all columns rather than just + # computed columns, and a slowdown when using one approach to compactify just + # the new columns + also awkward not matching what's possible with just + # construction functions.) + # + # --> just compare essentially an epix_merge of the original & the recomputation: + case_death_rate_archive_time <- system.time( + case_death_rate_archive_result <- case_death_rate_archive %>% + epi_slide_opt(case_rate, frollmean, algo = "exact", .window_size = 7) + ) + expect_equal( + case_death_rate_archive_result$DT$case_rate_7dav, + case_death_rate_archive_result$DT$case_rate_7d_av + ) + + # vs. computing via epix_slide: + + mini_case_death_rate_archive <- case_death_rate_archive %>% + { + as_tibble(as.data.frame(.$DT)) + } %>% + filter(geo_value %in% head(unique(geo_value), 4L)) %>% + as_epi_archive() + + mini_case_death_rate_archive_time_opt <- system.time( + mini_case_death_rate_archive_result <- mini_case_death_rate_archive %>% + epi_slide_opt(case_rate, frollmean, .window_size = 7) + ) + + mini_case_death_rate_archive_time_gen <- system.time( + mini_case_death_rate_archive_expected <- mini_case_death_rate_archive %>% + epix_slide( + ~ .x %>% epi_slide_opt(case_rate, frollmean, .window_size = 7) + ) %>% + select(names(mini_case_death_rate_archive$DT), everything()) %>% + as_epi_archive() + ) + + expect_equal(mini_case_death_rate_archive_result, mini_case_death_rate_archive_expected) + + mini_case_death_rate_archive_result2 <- mini_case_death_rate_archive %>% + group_by(geo_value) %>% + epi_slide_opt(case_rate, frollmean, .window_size = 7) + + expect_equal( + mini_case_death_rate_archive_result2, + mini_case_death_rate_archive_result %>% + group_by(geo_value) + ) + + mini_case_death_rate_archive_b <- + mini_case_death_rate_archive$DT %>% + as.data.frame() %>% + as_tibble() %>% + mutate(age_group = "overall") %>% + as_epi_archive(other_keys = "age_group") + + # grouping shouldn't change the outcome + expect_equal( + mini_case_death_rate_archive_b %>% + group_by(geo_value, age_group) %>% + epi_slide_opt(case_rate, frollmean, .window_size = 7), + mini_case_death_rate_archive_b %>% + epi_slide_opt(case_rate, frollmean, .window_size = 7) %>% + group_by(geo_value, age_group) + ) + + expect_error( + mini_case_death_rate_archive_b %>% + group_by(age_group) %>% + epi_slide_opt(case_rate, frollmean, .window_size = 7) + ) + + archive_cases_dv_subset_time_opt <- system.time( + archive_cases_dv_subset_result <- archive_cases_dv_subset %>% + epi_slide_opt(percent_cli, frollmean, .window_size = 7) + ) + + archive_cases_dv_subset_time_gen <- system.time( + archive_cases_dv_subset_expected <- archive_cases_dv_subset %>% + epix_slide( + ~ .x %>% epi_slide_opt(percent_cli, frollmean, .window_size = 7) + ) %>% + select(geo_value, time_value, version, everything()) %>% + as_epi_archive() + ) + + expect_equal(archive_cases_dv_subset_result, archive_cases_dv_subset_expected) + + expect_error( + archive_cases_dv_subset %>% + epi_slide_opt(percent_cli, frollmean, .window_size = 7, .align = "left", fill = -1000), + class = "epiprocess__epi_slide_opt__fill_unsupported" + ) +}) diff --git a/tests/testthat/test-vec_approx_equal.R b/tests/testthat/test-vec_approx_equal.R new file mode 100644 index 000000000..18235224c --- /dev/null +++ b/tests/testthat/test-vec_approx_equal.R @@ -0,0 +1,57 @@ +test_that("is_locf replacement works as expected", { + vec <- c(1, 1, 1e-10, 1.1e-10, NA, NA, NaN, NaN) + is_repeated <- c(0, 1, 0, 1, 0, 1, 0, 1) + expect_equal( + c( + FALSE, + vec_approx_equal( + head(vec, -1L), tail(vec, -1L), + na_equal = TRUE, abs_tol = .Machine$double.eps^0.5 + ) + ), + as.logical(is_repeated) + ) +}) + +test_that("vec_approx_equal is compatible with vec_equal on some edge cases", { + # Match (`==` and) `vec_equal` on NaN behavior: + tbl <- tibble::tribble( + ~x, ~y, + NaN, 5, + NaN, NA, + NA, NaN, + NaN, NaN, + ) + expect_identical( + vec_approx_equal(tbl$x, tbl$y, na_equal = FALSE, abs_tol = 1e-8), + vctrs::vec_equal(tbl$x, tbl$y, na_equal = FALSE) + ) + expect_identical( + vec_approx_equal(tbl$x, tbl$y, na_equal = TRUE, abs_tol = 1e-8), + vctrs::vec_equal(tbl$x, tbl$y, na_equal = TRUE) + ) + + # Match `vec_equal` behavior on namedness, including within elements: + unnamed_list <- list(5) + named_list <- list(a = 5) + expect_identical( + vec_approx_equal(unnamed_list, named_list, na_equal = TRUE, abs_tol = 1e-8), + vec_equal(unnamed_list, named_list, na_equal = TRUE) + ) + expect_identical( + vec_approx_equal(list(unnamed_list), list(named_list), na_equal = TRUE, abs_tol = 1e-8), + vec_equal(list(unnamed_list), list(named_list), na_equal = TRUE) + ) + + # Match `vec_equal` behavior on (p)types, including within elements: + dbl <- 5.0 + int <- 5L + expect_identical( + vec_approx_equal(dbl, int, na_equal = TRUE, abs_tol = 1e-8), + vec_equal(dbl, int, na_equal = TRUE) + ) + expect_identical( + vec_approx_equal(list(dbl), list(int), na_equal = TRUE, abs_tol = 1e-8), + vec_equal(list(dbl), list(int), na_equal = TRUE) + ) +})