NorskRegnesentral · martinju · Jan 16, 2025 · Dec 29, 2024 · Dec 29, 2024 · Dec 29, 2024
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -27,11 +27,12 @@ inst/compare_lundberg\.xgb\.obj
 ^vignettes/vaeac\.Rmd\.orig$
 ^vignettes/regression\.Rmd\.orig$
 ^vignettes/asymmetric_causal\.Rmd\.orig$
-^vignettes/figure_general_usage/*$
 ^vignettes/cache_main/*$
-^vignettes/figure_vaeac/*$
 ^vignettes/cache_vaeac/*$
-^vignettes/figure_regression/*$
 ^vignettes/cache_regression/*$
-^vignettes/figure_asymmetric_causal/*$
 ^vignettes/cache_asymmetric_causal/*$
+#^tests/testthat/_snaps/*$ # Uncomment when submitting to CRAN to reduce filesize
+^inst/code_paper/*$
+^inst/joss_paper/*$
+
+
diff --git a/CRAN-SUBMISSION b/CRAN-SUBMISSION
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,21 +1,21 @@
 Package: shapr
-Version: 1.0.1
+Version: 1.0.1.9000
 Title: Prediction Explanation with Dependence-Aware Shapley Values
 Description: Complex machine learning models are often hard to interpret. However, in 
   many situations it is crucial to understand and explain why a model made a specific 
   prediction. Shapley values is the only method for such prediction explanation framework 
   with a solid theoretical foundation. Previously known methods for estimating the Shapley 
   values do, however, assume feature independence. This package implements methods which accounts for any feature 
   dependence, and thereby produces more accurate estimates of the true Shapley values.
-  An accompanying Python wrapper (shaprpy) is available on GitHub.
+  An accompanying 'Python' wrapper ('shaprpy') is available through the GitHub repository.
 Authors@R: c(
     person("Martin", "Jullum", email = "Martin.Jullum@nr.no", role = c("cre", "aut"), comment = c(ORCID = "0000-0003-3908-5155")),
-    person("Lars Henry Berge", "Olsen", email = "lholsen@math.uio.no", role = "aut", comment = c(ORCID = "0009-0006-9360-6993")),
-    person("Annabelle", "Redelmeier", email = "Annabelle.Redelmeier@nr.no", role = "aut"),
+    person("Lars Henry Berge", "Olsen", email = "lhbolsen@nr.no", role = "aut", comment = c(ORCID = "0009-0006-9360-6993")),
+    person("Annabelle", "Redelmeier", email = "ardelmeier@gmail.com", role = "aut"),
     person("Jon", "Lachmann", email = "Jon@lachmann.nu", role = "aut", comment = c(ORCID = "0000-0001-8396-5673")),
     person("Nikolai", "Sellereite", email = "nikolaisellereite@gmail.com", role = "aut", comment = c(ORCID = "0000-0002-4671-0337")),
     person("Anders", "Løland", email = "Anders.Loland@nr.no", role = "ctb"), 
-    person("Jens Christian", "Wahl", email = "Jens.Christian.Wahl@nr.no", role = "ctb"), 
+    person("Jens Christian", "Wahl", email = "jens.c.wahl@gmail.com", role = "ctb"), 
     person("Camilla", "Lingjærde", role = "ctb"),
     person("Norsk Regnesentral", role =  c("cph", "fnd"))
     )

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,8 @@
+# shapr (development version)
+
 # shapr 1.0.1 
 
+* Spell checking and other minor cleanup [#431](https://github.com/NorskRegnesentral/shapr/pull/431)
 * Add extra_computation_args and output_args to explain_forecast() [#428](https://github.com/NorskRegnesentral/shapr/pull/428))
 * Rename vaeac plotting functions [#428](https://github.com/NorskRegnesentral/shapr/pull/428))
 * Move explain() arguments `paired_shap_sampling` and `kernelSHAP_reweighting` into `extra_computation_args` [#428](https://github.com/NorskRegnesentral/shapr/pull/428))
@@ -8,15 +11,15 @@
 * Renamed various internal functions to be consistent with names in the rest of the package [#427](https://github.com/NorskRegnesentral/shapr/pull/427))
 * Remove MSEv from explain_forecast (as it was only supported for horizon=1). Should return in a more general manner in the future [#427](https://github.com/NorskRegnesentral/shapr/pull/427))
 * Improve efficiency of coalition sampling code and move to string sampling [#426](https://github.com/NorskRegnesentral/shapr/pull/426))
-* Bugfix `iterative = TRUE` for `explain_forecast()` which was not using coaltions from previous iterations [#426](https://github.com/NorskRegnesentral/shapr/pull/426))
+* Bugfix `iterative = TRUE` for `explain_forecast()` which was not using coalitions from previous iterations [#426](https://github.com/NorskRegnesentral/shapr/pull/426))
 * Bugfix the handling and output with the `verbose` argument for `explain_forecast()` [#425](https://github.com/NorskRegnesentral/shapr/pull/425))
 * Improved flexibility of the beeswarm plot functionality [#424](https://github.com/NorskRegnesentral/shapr/pull/424))
 * Bugfix edge case where the `party` package returns a `constparty` object [#423](https://github.com/NorskRegnesentral/shapr/pull/423))
 * Bugfix error due to extra comma in rarely used warning [#422](https://github.com/NorskRegnesentral/shapr/pull/422))
 * Shined up the vignettes a bit [#421](https://github.com/NorskRegnesentral/shapr/pull/421))
 * Bugfix `keep_samp_for_vS` with iterative approach [#417](https://github.com/NorskRegnesentral/shapr/pull/417))
 * [Python] Brought the python code base up to speed with essentially all functionality in `explain()` in R [#416](https://github.com/NorskRegnesentral/shapr/pull/416))
-* 
+* Satisfy CRAN checks by wrapping long-running examples in `\dontrun{}` and using `skip_on_cran()` for the parallelized tests.
 
 # shapr 1.0.0 (GitHub only)
 
@@ -88,7 +91,7 @@ Previously, this was not possible with the prediction functions defined internal
 
 # shapr 0.2.2
 
-* Patch to fix failing CRAN-tests on R-devel due to changed behavior of `attach()`: Fixed by changing how we simluate adding a function to .GlobalEnv in the failing test. Actual package not affected.
+* Patch to fix failing CRAN-tests on R-devel due to changed behavior of `attach()`: Fixed by changing how we simulate adding a function to .GlobalEnv in the failing test. Actual package not affected.
 
 # shapr 0.2.1
 

diff --git a/R/approach_empirical.R b/R/approach_empirical.R
@@ -15,7 +15,7 @@
 #' accounts for 80\% of the total weight.
 #' `eta` is the \eqn{\eta} parameter in equation (15) of
 # nolint start
-#' \href{https://www.sciencedirect.com/science/article/pii/S0004370221000539/pdfft?md5=c3e4b95131ed944dc23b05a6170dbaee&pid=1-s2.0-S0004370221000539-main.pdf}{Aas et al. (2021)}.
+#' \href{https://martinjullum.com/publication/aas-2021-explaining/aas-2021-explaining.pdf}{Aas et al. (2021)}.
 # nolint end
 #'
 #' @param empirical.fixed_sigma Positive numeric scalar.
@@ -48,7 +48,7 @@
 #' @export
 # nolint start
 #' @references
-#'   - \href{https://www.sciencedirect.com/science/article/pii/S0004370221000539/pdfft?md5=c3e4b95131ed944dc23b05a6170dbaee&pid=1-s2.0-S0004370221000539-main.pdf}{
+#'   - \href{https://martinjullum.com/publication/aas-2021-explaining/aas-2021-explaining.pdf}{
 #'   Aas, K., Jullum, M., & Løland, A. (2021). Explaining individual predictions when features are dependent:
 #'   More accurate approximations to Shapley values. Artificial Intelligence, 298, 103502}
 # nolint end

diff --git a/R/approach_regression_separate.R b/R/approach_regression_separate.R
@@ -9,7 +9,7 @@
 #' is also a valid input. It is essential to include the package prefix if the package is not loaded.
 #' @param regression.tune_values Either `NULL` (default), a data.frame/data.table/tibble, or a function.
 #' The data.frame must contain the possible hyperparameter value combinations to try.
-#' The column names must match the names of the tuneable parameters specified in `regression.model`.
+#' The column names must match the names of the tunable parameters specified in `regression.model`.
 #' If `regression.tune_values` is a function, then it should take one argument `x` which is the training data
 #' for the current coalition and returns a data.frame/data.table/tibble with the properties described above.
 #' Using a function allows the hyperparameter values to change based on the size of the coalition See the regression

diff --git a/R/approach_vaeac.R b/R/approach_vaeac.R
@@ -2018,13 +2018,13 @@ vaeac_get_full_state_list <- function(environment) {
 #' @keywords internal
 #' @author Lars Henry Berge Olsen
 vaeac_get_x_explain_extended <- function(x_explain, S, index_features) {
-  n_coaltions <- length(index_features) # Get the number of active coalitions
+  n_coalitions <- length(index_features) # Get the number of active coalitions
   n_explain <- nrow(x_explain) # Get the number of explicands
   mask <- S[index_features, , drop = FALSE] # Get the masks/coalitions we are to generate MC samples for
   mask[mask == 0] <- NaN # Set zeros to `NaN` to indicate that they are missing and to be imputed by `vaeac`
   x_explain_extended <-
-    x_explain[rep(seq_len(nrow(x_explain)), each = n_coaltions), ] # Extend the explicands `n_coalitions` times
-  mask_extended <- mask[rep(seq(n_coaltions), times = n_explain), ] # Extend the masks `n_expliand` times
+    x_explain[rep(seq_len(nrow(x_explain)), each = n_coalitions), ] # Extend the explicands `n_coalitions` times
+  mask_extended <- mask[rep(seq(n_coalitions), times = n_explain), ] # Extend the masks `n_explain` times
   x_explain_extended[is.na(mask_extended)] <- NaN # Apply the mask. The NaNs are features outside coalition S.
   return(x_explain_extended)
 }

diff --git a/R/approach_vaeac_torch_modules.R b/R/approach_vaeac_torch_modules.R
@@ -1042,7 +1042,7 @@ vaeac_postprocess_data <- function(data, vaeac_model_state_list) {
 #' This function creates a [torch::dataset()] object that represent a map from keys to data samples.
 #' It is used by the [torch::dataloader()] to load data which should be used to extract the
 #' batches for all epochs in the training phase of the neural network. Note that a dataset object
-#' is an R6 instanc, see \url{https://r6.r-lib.org/articles/Introduction.html}, which is classical
+#' is an R6 instance, see \url{https://r6.r-lib.org/articles/Introduction.html}, which is classical
 #' object-oriented programming, with self reference. I.e, [shapr::vaeac_dataset()] is a subclass
 #' of type [torch::dataset()].
 #'
@@ -1752,7 +1752,7 @@ gauss_cat_loss <- function(one_hot_max_sizes, min_sigma = 1e-4, min_prob = 1e-4)
 #' reconstructed or not.
 #'
 #' @details Note that the module works with mixed data represented as 2-dimensional inputs and it
-#' works correctly with missing values in `groundtruth` as long as they are repsented by NaNs.
+#' works correctly with missing values in `groundtruth` as long as they are represented by NaNs.
 #'
 #' @author Lars Henry Berge Olsen
 #' @keywords internal

diff --git a/R/compute_vS.R b/R/compute_vS.R
@@ -3,7 +3,7 @@
 #' @inheritParams default_doc_export
 #'
 #' @param method Character
-#' Indicates whether the lappy method (default) or loop method should be used.
+#' Indicates whether the lapply method (default) or loop method should be used.
 #' Options other than "future" is only used for testing/debugging.
 #'
 #' @export

diff --git a/R/explain.R b/R/explain.R
@@ -2,10 +2,7 @@
 #'
 #' @description Computes dependence-aware Shapley values for observations in `x_explain` from the specified
 #' `model` by using the method specified in `approach` to estimate the conditional expectation.
-#' See
-# nolint start
-#' \href{https://www.sciencedirect.com/science/article/pii/S0004370221000539/pdfft?md5=c3e4b95131ed944dc23b05a6170dbaee&pid=1-s2.0-S0004370221000539-main.pdf}{Aas, et. al (2021)}
-# nolint end
+#' See \href{https://martinjullum.com/publication/aas-2021-explaining/aas-2021-explaining.pdf}{Aas et al. (2021)}
 #' for a thorough introduction to dependence-aware prediction explanation with Shapley values.
 #'
 #' @param x_train Matrix or data.frame/data.table.
@@ -107,24 +104,24 @@
 #' This provides sufficiently accurate Shapley value estimates faster.
 #' First an initial number of coalitions is sampled, then bootsrapping is used to estimate the variance of the Shapley
 #' values.
-#' A convergence criterion is used to determine if the variances of the Shapley values are sufficently small.
+#' A convergence criterion is used to determine if the variances of the Shapley values are sufficiently small.
 #' If the variances are too high, we estimate the number of required samples to reach convergence, and thereby add more
 #' coalitions.
 #' The process is repeated until the variances are below the threshold.
 #' Specifics related to the iterative process and convergence criterion are set through `iterative_args`.
 #'
 #' @param iterative_args Named list.
-#' Specifices the arguments for the iterative procedure.
+#' Specifies the arguments for the iterative procedure.
 #' See [get_iterative_args_default()] for description of the arguments and their default values.
 #' @param output_args Named list.
-#' Specifices certain arguments related to the output of the function.
+#' Specifies certain arguments related to the output of the function.
 #' See [get_output_args_default()] for description of the arguments and their default values.
 #' @param extra_computation_args Named list.
-#' Specifices extra arguments related to the computation of the Shapley values.
+#' Specifies extra arguments related to the computation of the Shapley values.
 #' See [get_extra_comp_args_default()] for description of the arguments and their default values.
 #'
 #' @param prev_shapr_object `shapr` object or string.
-#' If an object of class `shapr` is provided, or string with a path to where intermediate results are strored,
+#' If an object of class `shapr` is provided, or string with a path to where intermediate results are stored,
 #' then the function will use the previous object to continue the computation.
 #' This is useful if the computation is interrupted or you want higher accuracy than already obtained, and therefore
 #' want to continue the iterative estimation. See the
@@ -189,9 +186,7 @@
 #' \href{https://norskregnesentral.github.io/shapr/articles/general_usage.html}{general usage}.
 #' (From R: `vignette("general_usage", package = "shapr")`).
 #' Moreover,
-# nolint start
-#'  \href{https://www.sciencedirect.com/science/article/pii/S0004370221000539/pdfft?md5=c3e4b95131ed944dc23b05a6170dbaee&pid=1-s2.0-S0004370221000539-main.pdf}{Aas et al. (2021)}
-# nolint end
+#'  \href{https://martinjullum.com/publication/aas-2021-explaining/aas-2021-explaining.pdf}{Aas et al. (2021)}
 #' gives a general introduction to dependence-aware Shapley values, and the three approaches `"empirical"`,
 #' `"gaussian"`, `"copula"`, and also discusses `"independence"`.
 #' \href{https://martinjullum.com/publication/redelmeier-2020-explaining/redelmeier-2020-explaining.pdf}{
@@ -253,6 +248,7 @@
 #' }
 #'
 #' @examples
+#' \dontrun{
 #'
 #' # Load example data
 #' data("airquality")
@@ -274,12 +270,11 @@
 #' # Explain predictions
 #' p <- mean(data_train[, y_var])
 #'
-#' \dontrun{
 #' # (Optionally) enable parallelization via the future package
 #' if (requireNamespace("future", quietly = TRUE)) {
 #'   future::plan("multisession", workers = 2)
 #' }
-#' }
+#'
 #'
 #' # (Optionally) enable progress updates within every iteration via the progressr package
 #' if (requireNamespace("progressr", quietly = TRUE)) {
@@ -393,14 +388,15 @@
 #'   iterative = TRUE,
 #'   iterative_args = list(initial_n_coalitions = 10)
 #' )
+#' }
 #'
 #' @export
 #'
 #' @author Martin Jullum, Lars Henry Berge Olsen
 #'
 # nolint start
 #' @references
-#'   - \href{https://www.sciencedirect.com/science/article/pii/S0004370221000539/pdfft?md5=c3e4b95131ed944dc23b05a6170dbaee&pid=1-s2.0-S0004370221000539-main.pdf}{
+#'   - \href{https://martinjullum.com/publication/aas-2021-explaining/aas-2021-explaining.pdf}{
 #'   Aas, K., Jullum, M., & Løland, A. (2021). Explaining individual predictions when features are dependent:
 #'   More accurate approximations to Shapley values. Artificial Intelligence, 298, 103502}
 #'   - \href{https://proceedings.neurips.cc/paper_files/paper/2020/file/0d770c496aa3da6d2c3f2bd19e7b9d6b-Paper.pdf}{
@@ -419,7 +415,7 @@
 #'   values and conditional inference trees. In Machine Learning and Knowledge Extraction:
 #'   International Cross-Domain Conference, CD-MAKE 2020, Dublin, Ireland, August 25–28, 2020, Proceedings 4
 #'   (pp. 117-137). Springer International Publishing.}
-#'   - \href{https://doi.org/10.21105/joss.02027}{
+#'   - \href{https://www.theoj.org/joss-papers/joss.02027/10.21105.joss.02027.pdf}{
 #'   Sellereite N., & Jullum, M. (2019). shapr: An R-package for explaining machine learning models with
 #'   dependence-aware Shapley values. Journal of Open Source Software, 5(46), 2027}
 #'   - \href{https://www.jmlr.org/papers/volume23/21-1413/21-1413.pdf}{
@@ -582,7 +578,7 @@ explain <- function(model,
   return(output)
 }
 
-#' Cleans out certain output arguments to allow perfect reproducability of the output
+#' Cleans out certain output arguments to allow perfect reproducibility of the output
 #'
 #' @inheritParams default_doc_export
 #'

diff --git a/R/explain_forecast.R b/R/explain_forecast.R
@@ -4,7 +4,7 @@
 #' `model` by using the method specified in `approach` to estimate the conditional expectation.
 #' See
 # nolint start
-#' \href{https://www.sciencedirect.com/science/article/pii/S0004370221000539/pdfft?md5=c3e4b95131ed944dc23b05a6170dbaee&pid=1-s2.0-S0004370221000539-main.pdf}{Aas, et. al (2021)}
+#' \href{https://martinjullum.com/publication/aas-2021-explaining/aas-2021-explaining.pdf}{Aas et al. (2021)}
 # nolint end
 #' for a thorough introduction to dependence-aware prediction explanation with Shapley values.
 #'
@@ -68,6 +68,7 @@
 #' @author Jon Lachmann, Martin Jullum
 #' @examples
 #'
+#' \dontrun{
 #' # Load example data
 #' data("airquality")
 #' data <- data.table::as.data.table(airquality)
@@ -90,6 +91,7 @@
 #'   phi0 = p0_ar,
 #'   group_lags = FALSE
 #' )
+#' }
 #'
 #' @export
 explain_forecast <- function(model,

diff --git a/R/plot.R b/R/plot.R
@@ -11,7 +11,7 @@
 #'  contribution (their Shapley values).
 #'  `"scatter"` plots the feature values on the x-axis and Shapley values on the y-axis, as well as
 #'  (optionally) a background scatter_hist showing the distribution of the feature data.
-#'  `"beeswarm"` summarises the distribution of the Shapley values along the x-axis for all the features.
+#'  `"beeswarm"` summarizes the distribution of the Shapley values along the x-axis for all the features.
 #'  Each point gives the shapley value of a given instance, where the points are colored by the feature value
 #'  of that instance.
 #' @param digits Integer.
@@ -79,6 +79,7 @@
 #' @export
 #' @examples
 #'
+#' \dontrun{
 #' data("airquality")
 #' airquality <- airquality[complete.cases(airquality), ]
 #' x_var <- c("Solar.R", "Wind", "Temp", "Month")
@@ -168,6 +169,7 @@
 #'   plot(x, plot_type = "scatter")
 #'   plot(x, plot_type = "beeswarm")
 #' }
+#' }
 #'
 #' @author Martin Jullum, Vilde Ung, Lars Henry Berge Olsen
 plot.shapr <- function(x,
@@ -1443,7 +1445,7 @@ make_MSEv_coalition_plots <- function(MSEv_coalition_dt,
 #' y-axis or not. If `FALSE` (default), then no value is shown for the groups. If `TRUE`, then `shapr` includes
 #' the mean of the features in each group.
 #' @param index_explicands_sort Boolean. If `FALSE` (default), then `shapr` plots the explicands in the order
-#' specified in `index_explicands`. If `TRUE`, then `shapr` sort the indices in incressing oreder based on their id.
+#' specified in `index_explicands`. If `TRUE`, then `shapr` sorts the indices in increasing order based on their id.
 #'
 #' @return A [ggplot2::ggplot()] object.
 #' @export

diff --git a/R/save_results.R b/R/save_results.R
@@ -1,4 +1,4 @@
-#' Saves the itermediate results to disk
+#' Saves the intermediate results to disk
 #'
 #' @inheritParams default_doc_export
 #'

diff --git a/R/shapley_setup.R b/R/shapley_setup.R
@@ -142,14 +142,14 @@ shapley_setup <- function(internal) {
 #' number of coalitions by a factor of `n_samps_scale` and determine when we have `n_coalitions` unique
 #' coalitions and only use the coalitions up to this point and throw away the remaining coalitions.
 #' @param approach0 Character vector.
-#' Contains the approach to be used for eastimation of each coalition size. Same as `approach` in [explain()].
+#' Contains the approach to be used for estimation of each coalition size. Same as `approach` in [explain()].
 #' @param coal_feature_list List.
 #' A list mapping each coalition to the features it contains.
 #' @param dt_valid_causal_coalitions data.table. Only applicable for asymmetric Shapley
 #' values explanations, and is `NULL` for symmetric Shapley values.
 #' The data.table contains information about the coalitions that respects the causal ordering.
 #' @inheritParams explain
-#' @return A data.table with info about the coaltions to use
+#' @return A data.table with info about the coalitions to use
 #'
 #' @keywords internal
 #'