From 9a44ab85d1a8e2ebb75fa6cdec5f3c59bbc76773 Mon Sep 17 00:00:00 2001 From: Roozbeh Valavi Date: Sun, 3 Aug 2025 23:34:08 +1000 Subject: [PATCH 1/9] update 3.2; new similarity --- DESCRIPTION | 4 +- NEWS.md | 4 + R/RcppExports.R | 4 + R/checks.R | 3 +- R/cv_block_size.R | 5 +- R/cv_cluster.R | 2 +- R/cv_plot.R | 12 +-- R/cv_similarity.R | 113 +++++++++++++++++++++++----- R/cv_spatial.R | 4 +- man/cv_similarity.Rd | 36 ++++++++- src/Lightweight_matrix.hpp | 80 ++++++++++++++++++++ src/RcppExports.cpp | 15 ++++ src/nndm.cpp | 43 +---------- src/similarity.cpp | 104 +++++++++++++++++++++++++ tests/testthat/test-cv_similarity.R | 71 +++++++++++------ 15 files changed, 398 insertions(+), 102 deletions(-) create mode 100644 src/Lightweight_matrix.hpp create mode 100644 src/similarity.cpp diff --git a/DESCRIPTION b/DESCRIPTION index 0aba4c3..da7deb5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: blockCV Type: Package Title: Spatial and Environmental Blocking for K-Fold and LOO Cross-Validation -Version: 3.1-7 -Date: 2025-08-01 +Version: 3.2-0 +Date: 2025-08-03 Authors@R: c(person("Roozbeh", "Valavi", role = c("aut", "cre"), email = "valavi.r@gmail.com", comment = c(ORCID = "0000-0003-2495-5277")), person("Jane", "Elith", role = "aut", diff --git a/NEWS.md b/NEWS.md index fb479aa..31dad23 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +# version 3.2.0 +* Fix a warning in `cv_similarity` for colour aesthetics. +* Adding L1 and L2 distances to `cv_similarity` + # version 3.1.7 * Temporarily added `sp` package dependency to avoid CRAN check as required by `automap` package [#55]. diff --git a/R/RcppExports.R b/R/RcppExports.R index ec6a423..e42598a 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -5,3 +5,7 @@ nndm_cpp <- function(X, Gij, phi, min_train) { .Call(`_blockCV_nndm_cpp`, X, Gij, phi, min_train) } +similarity_cpp <- function(train_mat, test_mat, rand_mat, L1 = TRUE) { + .Call(`_blockCV_similarity_cpp`, train_mat, test_mat, rand_mat, L1) +} + diff --git a/R/checks.R b/R/checks.R index 16c141f..8112bc7 100644 --- a/R/checks.R +++ b/R/checks.R @@ -1,8 +1,7 @@ # Author: Roozbeh Valavi # contact: valavi.r@gmail.com # Date : May 2023 -# Version 0.2 -# Licence GPL v3 +# Version 0.3 # check points fall within the raster layer .check_within <- function(x, r) { diff --git a/R/cv_block_size.R b/R/cv_block_size.R index 927a753..65135f8 100644 --- a/R/cv_block_size.R +++ b/R/cv_block_size.R @@ -108,8 +108,7 @@ cv_block_size <- function( if(!is.null(x)){ geom_x <- ggplot2::geom_sf( data = x, - switch(!is.null(column), ggplot2::aes(colour = {{ column }}), NULL), - # ggplot2::aes(colour = {{ switch(!is.null(column), column, NULL) }}), + switch(!is.null(column), ggplot2::aes(colour = {{ column }})), inherit.aes = FALSE, alpha = 0.5 ) @@ -158,7 +157,7 @@ cv_block_size <- function( fill = "orangered4", alpha = 0.04, size = 0.2) + - switch(!is.null(x), geom_x, NULL) + + switch(!is.null(x), geom_x) + ggplot2::ggtitle("Spatial blocks", subtitle=paste("Using", input$num, diff --git a/R/cv_cluster.R b/R/cv_cluster.R index ac55278..9be898d 100644 --- a/R/cv_cluster.R +++ b/R/cv_cluster.R @@ -205,7 +205,7 @@ cv_cluster <- function( final_objs <- list( folds_list = fold_list, folds_ids = fold_ids, - biomod_table = switch(biomod2, as.matrix(biomod_table), NULL), + biomod_table = switch(biomod2, as.matrix(biomod_table)), k = k, column = column, type = ifelse(is.null(r), "Spatial Cluster", "Environmental Cluster"), diff --git a/R/cv_plot.R b/R/cv_plot.R index e2361b3..7438313 100644 --- a/R/cv_plot.R +++ b/R/cv_plot.R @@ -121,8 +121,8 @@ cv_plot <- function( if(methods::is(cv, "cv_spatial")){ p1 <- ggplot2::ggplot(data = blocks) + - switch(!is.null(r), geom_rast, NULL) + - switch(!is.null(r), geom_rast_col, NULL) + + switch(!is.null(r), geom_rast) + + switch(!is.null(r), geom_rast_col) + ggplot2::geom_sf(colour = "red", fill = "orangered4", alpha = 0.04, @@ -140,9 +140,9 @@ cv_plot <- function( } else{ p1 <- ggplot2::ggplot(data = x_long) + - switch(!is.null(r), geom_rast, NULL) + - switch(!is.null(r), geom_rast_col, NULL) + - switch(methods::is(cv, "cv_spatial"), geom_poly, NULL) + + switch(!is.null(r), geom_rast) + + switch(!is.null(r), geom_rast_col) + + switch(methods::is(cv, "cv_spatial"), geom_poly) + # ggplot2::geom_sf(ggplot2::aes(col = rlang::.data$value), ggplot2::geom_sf(ggplot2::aes(col = get("value")), alpha = points_alpha) + @@ -161,7 +161,7 @@ cv_plot <- function( # is it a LOO CV object? .is_loo <- function(x){ - methods::is(x, "cv_buffer") || methods::is(x, "cv_nndm") + inherits(x, c("cv_buffer", "cv_nndm")) } # transform x and fold numbers for plotting diff --git a/R/cv_similarity.R b/R/cv_similarity.R index 0c9a38c..9692d68 100644 --- a/R/cv_similarity.R +++ b/R/cv_similarity.R @@ -1,16 +1,42 @@ #' Compute similarity measures to evaluate possible extrapolation in testing folds #' -#' This function computes multivariate environmental similarity surface (MESS) as described -#' in Elith et al. (2010). MESS represents how similar a point in a testing fold is to a training -#' fold (as a reference set of points), with respect to a set of predictor variables in \code{r}. +#' This function supports three similarity measurement methods: +#' Multivariate Environmental Similarity Surface (MESS), Manhattan distance (L1), +#' and Euclidean distance (L2). It is primarily designed for evaluating the similarity +#' between training and test datasets in environmental or spatial modeling applications. +#' +#' +#' The function support three method of similarity measures, the multivariate environmental +#' similarity surface (MESS), Manhattan distance (L1), and Euclidean distance (L2). +#' +#' The function computes MESS as described in Elith et al. (2010). MESS represents +#' how similar a point in a testing fold is to a training fold (as a reference +#' set of points), with respect to a set of predictor variables in \code{r}. #' The negative values are the sites where at least one variable has a value that is outside #' the range of environments over the reference set, so these are novel environments. #' +#' When using the L1 (Manhattan) or L2 (Euclidean) distance options, the function performs +#' the following steps for each test sample: +#' +#' 1. Calculates the minimum distance between each test sample and all training samples +#' in the same fold using the selected metric (L1 or L2). +#' 2. Calculates a baseline distance: the average of the minimum distances between a set +#' of random background samples (defined by \code{num_sample}) and all training/test +#' samples combined. +#' 3. Computes a similarity score by subtracting the test sample’s minimum distance from +#' the baseline average. A higher score indicates the test sample is more similar to +#' the training data, while lower or negative scores indicate novelty. +#' +#' This provides a simple, distance-based novelty metric, useful for assessing +#' extrapolation or dissimilarity in prediction scenarios. +#' #' @inheritParams cv_plot #' @param x a simple features (sf) or SpatialPoints object of the spatial sample data used for creating #' the \code{cv} object. #' @param r a terra SpatRaster object of environmental predictor that are going to be used for modelling. This #' is used to calculate similarity between the training and testing points. +#' @param method the similarity method including: MESS, L1 and L2. +#' @param num_sample number of random samples from raster to calculate similarity distances (only for L1 and L2). #' @param num_plot a vector of indices of folds. #' @param jitter_width numeric; the width of jitter points. #' @param points_size numeric; the size of points. @@ -51,6 +77,8 @@ cv_similarity <- function( x, r, num_plot = seq_along(cv$folds_list), + method = "MESS", + num_sample = 10000L, jitter_width = 0.1, points_size = 2, points_alpha = 0.7, @@ -69,12 +97,13 @@ cv_similarity <- function( .check_within(x, r) # check cv obj - if(!class(cv) %in% c("cv_spatial", "cv_cluster", "cv_buffer", "cv_nndm")){ + if (!inherits(cv, c("cv_spatial", "cv_cluster", "cv_buffer", "cv_nndm"))) { stop("'cv' must be a blockCV cv_* object.") } - # if (!any(inherits(cv, c("cv_spatial", "cv_cluster", "cv_buffer", "cv_nndm")))) { - # stop("'cv' must be a blockCV cv_* object.") - # } + + method <- match.arg(tolower(method), choices = c("mess", "l1", "l2")) + MESS <- method == "mess" + L1 <- method == "l1" # The iteration must be a natural number tryCatch( @@ -106,16 +135,49 @@ cv_similarity <- function( df <- data.frame(id = seq_len(n)) # add progress bar if(progress) pb <- utils::txtProgressBar(min = 0, max = length(num_plot), style = 3) + + if (!MESS) { + # scale a dataset based on params of another scaled dataset + f <- function(x, s) { + mns <- attr(s,"scaled:center") + sds <- attr(s,"scaled:scale") + for (i in names(mns)) { + x[[i]] <- (x[[i]] - mns[i]) / sds[i] + } + attr(x, "scaled:center") <- NULL + attr(x, "scaled:scale") <- NULL + return(x) + } + rand_points <- terra::spatSample(r, size = num_sample, na.rm = TRUE) + rand_points <- rand_points[stats::complete.cases(rand_points), ] # just make sure.. + # scale and centre the random points + rand_points <- as.matrix(scale(rand_points)) + # scale and centre the samples points based on the same stats + points <- as.matrix(f(x = points, s = rand_points)) + # remove the attributes + attr(rand_points, "scaled:center") <- NULL + attr(rand_points, "scaled:scale") <- NULL + } + # calculate MESS for testing data for(i in num_plot){ df[, paste("Fold", i, sep = "")] <- NA train <- folds_list[[i]][[1]] test <- folds_list[[i]][[2]] - mes <- sapply(1:m, function(j) .messi3(points[test, j], points[train, j])) - if(.is_loo(cv)){ - mmes <- min(mes) - } else{ - mmes <- apply(mes, 1, min, na.rm = TRUE) + if (MESS) { + mes <- sapply(1:m, function(j) .messi3(points[test, j], points[train, j])) + if(.is_loo(cv)){ + mmes <- min(mes) + } else{ + mmes <- apply(mes, 1, min, na.rm = TRUE) + } + } else { + mmes <- similarity_cpp( + train_mat = points[train, ], + test_mat = points[test,, drop=FALSE], + rand_mat = rand_points, + L1 = L1 + ) } df[1:length(mmes), paste("Fold", i, sep = "")] <- mmes if(progress) utils::setTxtProgressBar(pb, i) @@ -144,21 +206,38 @@ cv_similarity <- function( # provide alternatives for class(cv) goem_buffer <- ggplot2::geom_point(size = points_size, alpha = points_alpha) geom_other <- ggplot2::geom_jitter(width = jitter_width, size = points_size, alpha = points_alpha) - geom_vio <- ggplot2::geom_violin(fill = NA) + geom_vio <- ggplot2::geom_violin( + ggplot2::aes(x = get("folds"), y = get("value"), group = get("folds")), + inherit.aes = FALSE, + fill = NA + ) # which geom to choose geom_exta <- if(.is_loo(cv)) goem_buffer else geom_other + mes_reshp$folds <- factor(mes_reshp$folds) + + col_name <- switch( + method, + mess = "MESS", + l1 = "L1 distance", + l2 = "L2 distance" + ) + y_name <- switch( + method, + mess = "MESS Values", + l1 = "Distance differnce with random samples", + l2 = "Distance differnce with random samples" + ) p1 <- ggplot2::ggplot( data = mes_reshp, - # ggplot2::aes(x = rlang::.data$folds, y = rlang::.data$value, col = rlang::.data$value)) + - ggplot2::aes(x = get("folds"), y = get("value"), col = get("value"))) + + ggplot2::aes(x = get("folds"), y = get("value"), colour = get("value"))) + ggplot2::geom_hline(yintercept = 0, color = "grey50", linetype = 2) + geom_exta + - switch(!.is_loo(cv), geom_vio, NULL) + + switch(!.is_loo(cv), geom_vio) + ggplot2::scale_color_gradientn(colours = points_colors, limits = c(-maxabs, maxabs), na.value = "#BEBEBE03") + - ggplot2::labs(x = "Folds", y = "MESS Values", col = "MESS") + + ggplot2::labs(x = "Folds", y = y_name, col = col_name) + ggplot2::theme_bw() return(p1) diff --git a/R/cv_spatial.R b/R/cv_spatial.R index 28c9168..5f6212c 100644 --- a/R/cv_spatial.R +++ b/R/cv_spatial.R @@ -432,7 +432,7 @@ cv_spatial <- function( final_objs <- list( folds_list = fold_list, folds_ids = fold_vect, - biomod_table = switch(biomod2, as.matrix(biomod_table), NULL), + biomod_table = switch(biomod2, as.matrix(biomod_table)), k = k, size = size, column = column, @@ -446,7 +446,7 @@ cv_spatial <- function( if(plot){ p1 <- cv_plot( cv = final_objs, - r = switch(!is.null(r), r, NULL), + r = switch(!is.null(r), r), ... ) plot(p1) diff --git a/man/cv_similarity.Rd b/man/cv_similarity.Rd index 00e7b19..56d42f2 100644 --- a/man/cv_similarity.Rd +++ b/man/cv_similarity.Rd @@ -9,6 +9,8 @@ cv_similarity( x, r, num_plot = seq_along(cv$folds_list), + method = "MESS", + num_sample = 10000L, jitter_width = 0.1, points_size = 2, points_alpha = 0.7, @@ -28,6 +30,10 @@ is used to calculate similarity between the training and testing points.} \item{num_plot}{a vector of indices of folds.} +\item{method}{the similarity method including: MESS, L1 and L2.} + +\item{num_sample}{number of random samples from raster to calculate similarity distances (only for L1 and L2).} + \item{jitter_width}{numeric; the width of jitter points.} \item{points_size}{numeric; the size of points.} @@ -42,11 +48,35 @@ is used to calculate similarity between the training and testing points.} a ggplot object } \description{ -This function computes multivariate environmental similarity surface (MESS) as described -in Elith et al. (2010). MESS represents how similar a point in a testing fold is to a training -fold (as a reference set of points), with respect to a set of predictor variables in \code{r}. +This function supports three similarity measurement methods: +Multivariate Environmental Similarity Surface (MESS), Manhattan distance (L1), +and Euclidean distance (L2). It is primarily designed for evaluating the similarity +between training and test datasets in environmental or spatial modeling applications. +} +\details{ +The function support three method of similarity measures, the multivariate environmental +similarity surface (MESS), Manhattan distance (L1), and Euclidean distance (L2). + +The function computes MESS as described in Elith et al. (2010). MESS represents +how similar a point in a testing fold is to a training fold (as a reference +set of points), with respect to a set of predictor variables in \code{r}. The negative values are the sites where at least one variable has a value that is outside the range of environments over the reference set, so these are novel environments. + +When using the L1 (Manhattan) or L2 (Euclidean) distance options, the function performs +the following steps for each test sample: + +1. Calculates the minimum distance between each test sample and all training samples + in the same fold using the selected metric (L1 or L2). +2. Calculates a baseline distance: the average of the minimum distances between a set + of random background samples (defined by \code{num_sample}) and all training/test + samples combined. +3. Computes a similarity score by subtracting the test sample’s minimum distance from + the baseline average. A higher score indicates the test sample is more similar to + the training data, while lower or negative scores indicate novelty. + +This provides a simple, distance-based novelty metric, useful for assessing +extrapolation or dissimilarity in prediction scenarios. } \examples{ \donttest{ diff --git a/src/Lightweight_matrix.hpp b/src/Lightweight_matrix.hpp new file mode 100644 index 0000000..4e62bf8 --- /dev/null +++ b/src/Lightweight_matrix.hpp @@ -0,0 +1,80 @@ +#pragma once + +#ifndef IGHTWEIGHT_MATRIX_H +#define IGHTWEIGHT_MATRIX_H + +#include + +// a light weight matrix class for faster calculation with row-major opertaions +template +class Lightweight_matrix { +private: + using MatrixType = std::vector; +public: + + explicit Lightweight_matrix(int rows, int columns): + rows_(rows), columns_(columns), matrix_(rows * columns) {} + + template + explicit Lightweight_matrix(Matrix matrix): + rows_(matrix.nrow()), columns_(matrix.ncol()), matrix_(matrix.nrow() * matrix.ncol()) { + for(int i = 0; i < rows_; ++i) { + for(int j = 0; j < columns_; ++j) { + operator()(i, j) = static_cast(matrix(i, j)); + } + } + } + + T operator()(int i, int j) const { + return matrix_[i * columns_ + j]; + } + + T& operator()(int i, int j) { + return matrix_[i * columns_ + j]; + } + + int ncol() const { + return columns_; + } + + int nrow() const { + return rows_; + } + + T* data() { + return matrix_.data(); + } + + const T* data() const { + return matrix_.data(); + } + +private: + int rows_; + int columns_; + MatrixType matrix_; +}; + + +template +Lightweight_matrix rbind(const Lightweight_matrix& A, const Lightweight_matrix& B) +{ + const int ncol = A.ncol(); + const int nrow_A = A.nrow(); + const int nrow_B = B.nrow(); + + if (ncol != B.ncol()) + { + throw std::invalid_argument("Matrices must have the same number of columns"); + } + + const int total_rows = nrow_A + nrow_B; + Lightweight_matrix result(total_rows, ncol); + + std::copy(A.data(), A.data() + nrow_A * ncol, result.data()); + std::copy(B.data(), B.data() + nrow_B * ncol, result.data() + nrow_A * ncol); + + return result; +} + +#endif /* IGHTWEIGHT_MATRIX_H */ diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index bc58e07..1efdc4b 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -24,9 +24,24 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// similarity_cpp +Rcpp::NumericVector similarity_cpp(const Rcpp::NumericMatrix& train_mat, const Rcpp::NumericMatrix& test_mat, const Rcpp::NumericMatrix& rand_mat, bool L1); +RcppExport SEXP _blockCV_similarity_cpp(SEXP train_matSEXP, SEXP test_matSEXP, SEXP rand_matSEXP, SEXP L1SEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const Rcpp::NumericMatrix& >::type train_mat(train_matSEXP); + Rcpp::traits::input_parameter< const Rcpp::NumericMatrix& >::type test_mat(test_matSEXP); + Rcpp::traits::input_parameter< const Rcpp::NumericMatrix& >::type rand_mat(rand_matSEXP); + Rcpp::traits::input_parameter< bool >::type L1(L1SEXP); + rcpp_result_gen = Rcpp::wrap(similarity_cpp(train_mat, test_mat, rand_mat, L1)); + return rcpp_result_gen; +END_RCPP +} static const R_CallMethodDef CallEntries[] = { {"_blockCV_nndm_cpp", (DL_FUNC) &_blockCV_nndm_cpp, 4}, + {"_blockCV_similarity_cpp", (DL_FUNC) &_blockCV_similarity_cpp, 4}, {NULL, NULL, 0} }; diff --git a/src/nndm.cpp b/src/nndm.cpp index 618d92e..94934c0 100644 --- a/src/nndm.cpp +++ b/src/nndm.cpp @@ -2,48 +2,7 @@ #include #include - -template -class Lightweight_matrix { -private: - using MatrixType = std::vector; -public: - - explicit Lightweight_matrix(int rows, int columns): - rows_(rows), columns_(columns), matrix_(rows * columns) {} - - template - explicit Lightweight_matrix(Matrix matrix): - rows_(matrix.nrow()), columns_(matrix.ncol()), matrix_(matrix.nrow() * matrix.ncol()) { - for(int i(0); i < rows_; ++i) { - for(int j(0); j < columns_; ++j) { - operator()(i, j) = static_cast(matrix(i, j)); - } - } - } - - T operator()(int i, int j) const { - return matrix_[i * columns_ + j]; - } - - T& operator()(int i, int j) { - return matrix_[i * columns_ + j]; - } - - int ncol() const { - return columns_; - } - - int nrow() const { - return rows_; - } - -private: - int rows_; - int columns_; - MatrixType matrix_; -}; - +#include "Lightweight_matrix.hpp" // [[Rcpp::export]] Rcpp::NumericMatrix nndm_cpp(Rcpp::NumericMatrix X, diff --git a/src/similarity.cpp b/src/similarity.cpp new file mode 100644 index 0000000..b1e5b20 --- /dev/null +++ b/src/similarity.cpp @@ -0,0 +1,104 @@ +#include +#include +#include +#include +#include "Lightweight_matrix.hpp" + +// a function to calculate L1 or L2 distances +inline double distance(Lightweight_matrix &x, Lightweight_matrix &y, int i, int ii, bool L1) +{ + int nc = x.ncol(); + double dist = 0.0; + + if (L1) + { + for (int j = 0; j < nc; j++) + { + dist += std::abs(x(i, j) - y(ii, j)); + } + } + else + { + for (int j = 0; j < nc; j++) + { + double diff = x(i, j) - y(ii, j); + dist += diff * diff; + } + dist = std::sqrt(dist); // Euclidean distance + } + + return dist; +} + + +// [[Rcpp::export]] +Rcpp::NumericVector similarity_cpp( + const Rcpp::NumericMatrix &train_mat, + const Rcpp::NumericMatrix &test_mat, + const Rcpp::NumericMatrix &rand_mat, + bool L1 = true +) { + + Lightweight_matrix train(train_mat); + Lightweight_matrix test(test_mat); + Lightweight_matrix rand(rand_mat); + // rbind train and test matrices for baseline calc + Lightweight_matrix full_mat = rbind(train, test); + + // check for possible errors. + if (train.ncol() != test.ncol()) Rcpp::stop("Number of columns in train and test are different!"); + if (train.ncol() != rand.ncol()) Rcpp::stop("Number of columns in train and raster layers are different!"); + + int nr_test = test.nrow(); + int nr_train = train.nrow(); + int nr_rand = rand.nrow(); + + // output vector + std::vector output(nr_test, 0.0); + + // loop over the random samples + double base_dist = 0.0; + for (int i = 0; i < nr_rand; i++) + { + double point_dist(std::numeric_limits::infinity()); + // loop over the training samples + for (int row = 0; row < full_mat.nrow(); row++) + { + // calcualte L1/L2 distance + double dist_rnd = distance(full_mat, rand, row, i, L1); + // get the min distance + if (dist_rnd < point_dist) + { + point_dist = dist_rnd; + } + } + base_dist += point_dist; + } + // average distance of a random sample to training samples + double ave_base = base_dist / static_cast(nr_rand); + + + // loop over the test samples + for (int i = 0; i < nr_test; i++) + { + double test_dist(std::numeric_limits::infinity()); + // loop over the training samples to get the min distance + for (int row = 0; row < nr_train; row++) + { + // calcualte L1/L2 distance + double dist = distance(train, test, row, i, L1); + // get the min distance + if (dist < test_dist) + { + test_dist = dist; + } + } + // expected distance - test distance + double exp_dist = ave_base - test_dist; + + output[i] = exp_dist; + } + + return Rcpp::wrap(output); +} + diff --git a/tests/testthat/test-cv_similarity.R b/tests/testthat/test-cv_similarity.R index a634fac..4fdf3d5 100644 --- a/tests/testthat/test-cv_similarity.R +++ b/tests/testthat/test-cv_similarity.R @@ -1,39 +1,62 @@ aus <- system.file("extdata/au/", package = "blockCV") |> - list.files(full.names = TRUE) |> - terra::rast() + list.files(full.names = TRUE) |> + terra::rast() pa_data <- read.csv(system.file("extdata/", "species.csv", package = "blockCV")) |> - sf::st_as_sf(coords = c("x", "y"), crs = 7845) + sf::st_as_sf(coords = c("x", "y"), crs = 7845) pa_data <- pa_data[1:200, ] -test_that("test that cv_similarity function works with cv_spatil", - { - scv <- cv_spatial(x = pa_data, - selection = "random", - iteration = 1, - biomod2 = FALSE, - plot = FALSE, - report = FALSE, - progress = FALSE) +scv <- cv_spatial( + x = pa_data, + selection = "random", + iteration = 1, + biomod2 = FALSE, + plot = FALSE, + report = FALSE, + progress = FALSE +) - plt <- cv_similarity(cv = scv, x = pa_data, r = aus) - expect_true(exists("plt")) - expect_true(ggplot2::is_ggplot(plt)) +test_that("test that cv_similarity function works with cv_spatil and MESS", { + + plt <- cv_similarity(cv = scv, x = pa_data, r = aus, method = "MESS") + + expect_true(exists("plt")) + expect_true(ggplot2::is_ggplot(plt)) }) +test_that("test that cv_similarity function works with cv_spatil and L1", { -test_that("test that cv_similarity function works with cv_buffer", - { - bloo <- cv_buffer(x = pa_data, - size = 250000, - progress = FALSE, - report = FALSE) + plt <- cv_similarity(cv = scv, x = pa_data, r = aus, method = "L1") - plt <- cv_similarity(cv = bloo, x = pa_data, r = aus) + expect_true(exists("plt")) + expect_true(ggplot2::is_ggplot(plt)) + +}) - expect_true(exists("plt")) - expect_true(ggplot2::is_ggplot(plt)) +test_that("test that cv_similarity function works with cv_spatil and L2", { + + plt <- cv_similarity(cv = scv, x = pa_data, r = aus, , method = "L2") + + expect_true(exists("plt")) + expect_true(ggplot2::is_ggplot(plt)) }) + + +test_that("test that cv_similarity function works with cv_buffer", { + bloo <- cv_buffer( + x = pa_data, + size = 250000, + progress = FALSE, + report = FALSE + ) + + plt <- cv_similarity(cv = bloo, x = pa_data, r = aus) + + expect_true(exists("plt")) + expect_true(ggplot2::is_ggplot(plt)) + +}) + From 6bd11cc2805633be51efdae6cfe02dcd8e7c93de Mon Sep 17 00:00:00 2001 From: Roozbeh Valavi Date: Tue, 5 Aug 2025 09:57:47 +1000 Subject: [PATCH 2/9] small changes --- R/cv_similarity.R | 4 ++-- src/Lightweight_matrix.hpp | 6 +++--- src/similarity.cpp | 29 ++++++++++++++--------------- 3 files changed, 19 insertions(+), 20 deletions(-) diff --git a/R/cv_similarity.R b/R/cv_similarity.R index 9692d68..c34d584 100644 --- a/R/cv_similarity.R +++ b/R/cv_similarity.R @@ -15,8 +15,8 @@ #' The negative values are the sites where at least one variable has a value that is outside #' the range of environments over the reference set, so these are novel environments. #' -#' When using the L1 (Manhattan) or L2 (Euclidean) distance options, the function performs -#' the following steps for each test sample: +#' When using the L1 (Manhattan) or L2 (Euclidean) distance options (experimental), the +#' function performs the following steps for each test sample: #' #' 1. Calculates the minimum distance between each test sample and all training samples #' in the same fold using the selected metric (L1 or L2). diff --git a/src/Lightweight_matrix.hpp b/src/Lightweight_matrix.hpp index 4e62bf8..eda7103 100644 --- a/src/Lightweight_matrix.hpp +++ b/src/Lightweight_matrix.hpp @@ -1,11 +1,10 @@ -#pragma once - #ifndef IGHTWEIGHT_MATRIX_H #define IGHTWEIGHT_MATRIX_H #include +#include -// a light weight matrix class for faster calculation with row-major opertaions +// A light weight matrix class for faster calculation with row-major opertaions template class Lightweight_matrix { private: @@ -56,6 +55,7 @@ class Lightweight_matrix { }; +// rbind implementation for Lightweight_matrix template Lightweight_matrix rbind(const Lightweight_matrix& A, const Lightweight_matrix& B) { diff --git a/src/similarity.cpp b/src/similarity.cpp index b1e5b20..20228ca 100644 --- a/src/similarity.cpp +++ b/src/similarity.cpp @@ -14,7 +14,7 @@ inline double distance(Lightweight_matrix &x, Lightweight_matrix { for (int j = 0; j < nc; j++) { - dist += std::abs(x(i, j) - y(ii, j)); + dist += std::abs(x(i, j) - y(ii, j)); // Manhattan distance } } else @@ -57,46 +57,45 @@ Rcpp::NumericVector similarity_cpp( std::vector output(nr_test, 0.0); // loop over the random samples - double base_dist = 0.0; + double sum_dist = 0.0; for (int i = 0; i < nr_rand; i++) { - double point_dist(std::numeric_limits::infinity()); + double min_dist(std::numeric_limits::infinity()); // loop over the training samples for (int row = 0; row < full_mat.nrow(); row++) { // calcualte L1/L2 distance - double dist_rnd = distance(full_mat, rand, row, i, L1); + double dist_rand = distance(full_mat, rand, row, i, L1); // get the min distance - if (dist_rnd < point_dist) + if (dist_rand < min_dist) { - point_dist = dist_rnd; + min_dist = dist_rand; } } - base_dist += point_dist; + sum_dist += min_dist; } - // average distance of a random sample to training samples - double ave_base = base_dist / static_cast(nr_rand); - + // average min distance of a random sample to training samples + const double base_dist = sum_dist / static_cast(nr_rand); // loop over the test samples for (int i = 0; i < nr_test; i++) { - double test_dist(std::numeric_limits::infinity()); + double min_test(std::numeric_limits::infinity()); // loop over the training samples to get the min distance for (int row = 0; row < nr_train; row++) { // calcualte L1/L2 distance double dist = distance(train, test, row, i, L1); // get the min distance - if (dist < test_dist) + if (dist < min_test) { - test_dist = dist; + min_test = dist; } } // expected distance - test distance - double exp_dist = ave_base - test_dist; + double test_dist = base_dist - min_test; - output[i] = exp_dist; + output[i] = test_dist; } return Rcpp::wrap(output); From 5405e6d7fd3806ad6eb5f6ac2411635c70469b4b Mon Sep 17 00:00:00 2001 From: Roozbeh Valavi Date: Tue, 5 Aug 2025 10:25:58 +1000 Subject: [PATCH 3/9] small doc update --- R/cv_similarity.R | 28 +++++++++++++++++----------- man/cv_similarity.Rd | 32 ++++++++++++++++++++------------ 2 files changed, 37 insertions(+), 23 deletions(-) diff --git a/R/cv_similarity.R b/R/cv_similarity.R index c34d584..4abcab0 100644 --- a/R/cv_similarity.R +++ b/R/cv_similarity.R @@ -6,26 +6,28 @@ #' between training and test datasets in environmental or spatial modeling applications. #' #' -#' The function support three method of similarity measures, the multivariate environmental -#' similarity surface (MESS), Manhattan distance (L1), and Euclidean distance (L2). +#' The function support three method of similarity measures, the Multivariate Environmental +#' Similarity Surface (MESS), Manhattan distance (L1), and Euclidean distance (L2). #' -#' The function computes MESS as described in Elith et al. (2010). MESS represents +#' The MESS is calculated as described in Elith et al. (2010). MESS represents #' how similar a point in a testing fold is to a training fold (as a reference #' set of points), with respect to a set of predictor variables in \code{r}. #' The negative values are the sites where at least one variable has a value that is outside #' the range of environments over the reference set, so these are novel environments. #' -#' When using the L1 (Manhattan) or L2 (Euclidean) distance options (experimental), the +#' When using the L1 (Manhattan) or L2 (Euclidean) distance options (experimental), the #' function performs the following steps for each test sample: #' -#' 1. Calculates the minimum distance between each test sample and all training samples -#' in the same fold using the selected metric (L1 or L2). -#' 2. Calculates a baseline distance: the average of the minimum distances between a set +#' \itemize{ +#' \item{1. Calculates the minimum distance between each test sample and all training samples +#' in the same fold using the selected metric (L1 or L2).} +#' \item{2. Calculates a baseline distance: the average of the minimum distances between a set #' of random background samples (defined by \code{num_sample}) and all training/test -#' samples combined. -#' 3. Computes a similarity score by subtracting the test sample’s minimum distance from +#' samples combined.} +#' \item{3. Computes a similarity score by subtracting the test sample’s minimum distance from #' the baseline average. A higher score indicates the test sample is more similar to -#' the training data, while lower or negative scores indicate novelty. +#' the training data, while lower or negative scores indicate novelty.} +#' } #' #' This provides a simple, distance-based novelty metric, useful for assessing #' extrapolation or dissimilarity in prediction scenarios. @@ -35,7 +37,7 @@ #' the \code{cv} object. #' @param r a terra SpatRaster object of environmental predictor that are going to be used for modelling. This #' is used to calculate similarity between the training and testing points. -#' @param method the similarity method including: MESS, L1 and L2. +#' @param method the similarity method including: MESS, L1 and L2. Read the details section. #' @param num_sample number of random samples from raster to calculate similarity distances (only for L1 and L2). #' @param num_plot a vector of indices of folds. #' @param jitter_width numeric; the width of jitter points. @@ -44,6 +46,10 @@ #' @param points_colors character; a character vector of colours for points #' @inheritParams cv_spatial #' +#' @seealso \code{\link{cv_spatial}}, \code{\link{cv_cluster}}, \code{\link{cv_buffer}}, and \code{\link{cv_nndm}} +#' +#' @references Elith, J., Kearney, M., & Phillips, S. (2010). The art of modelling range-shifting species: The art of modelling range-shifting species. Methods in Ecology and Evolution, 1(4), 330–342. +#' #' @return a ggplot object #' @export #' diff --git a/man/cv_similarity.Rd b/man/cv_similarity.Rd index 56d42f2..e749d4b 100644 --- a/man/cv_similarity.Rd +++ b/man/cv_similarity.Rd @@ -30,7 +30,7 @@ is used to calculate similarity between the training and testing points.} \item{num_plot}{a vector of indices of folds.} -\item{method}{the similarity method including: MESS, L1 and L2.} +\item{method}{the similarity method including: MESS, L1 and L2. Read the details section.} \item{num_sample}{number of random samples from raster to calculate similarity distances (only for L1 and L2).} @@ -54,26 +54,28 @@ and Euclidean distance (L2). It is primarily designed for evaluating the similar between training and test datasets in environmental or spatial modeling applications. } \details{ -The function support three method of similarity measures, the multivariate environmental -similarity surface (MESS), Manhattan distance (L1), and Euclidean distance (L2). +The function support three method of similarity measures, the Multivariate Environmental +Similarity Surface (MESS), Manhattan distance (L1), and Euclidean distance (L2). -The function computes MESS as described in Elith et al. (2010). MESS represents +The MESS is calculated as described in Elith et al. (2010). MESS represents how similar a point in a testing fold is to a training fold (as a reference set of points), with respect to a set of predictor variables in \code{r}. The negative values are the sites where at least one variable has a value that is outside the range of environments over the reference set, so these are novel environments. -When using the L1 (Manhattan) or L2 (Euclidean) distance options, the function performs -the following steps for each test sample: +When using the L1 (Manhattan) or L2 (Euclidean) distance options (experimental), the +function performs the following steps for each test sample: -1. Calculates the minimum distance between each test sample and all training samples - in the same fold using the selected metric (L1 or L2). -2. Calculates a baseline distance: the average of the minimum distances between a set +\itemize{ +\item{1. Calculates the minimum distance between each test sample and all training samples + in the same fold using the selected metric (L1 or L2).} +\item{2. Calculates a baseline distance: the average of the minimum distances between a set of random background samples (defined by \code{num_sample}) and all training/test - samples combined. -3. Computes a similarity score by subtracting the test sample’s minimum distance from + samples combined.} +\item{3. Computes a similarity score by subtracting the test sample’s minimum distance from the baseline average. A higher score indicates the test sample is more similar to - the training data, while lower or negative scores indicate novelty. + the training data, while lower or negative scores indicate novelty.} +} This provides a simple, distance-based novelty metric, useful for assessing extrapolation or dissimilarity in prediction scenarios. @@ -104,3 +106,9 @@ cv_similarity(cv = sb, r = covars, x = pa_data) } } +\references{ +Elith, J., Kearney, M., & Phillips, S. (2010). The art of modelling range-shifting species: The art of modelling range-shifting species. Methods in Ecology and Evolution, 1(4), 330–342. +} +\seealso{ +\code{\link{cv_spatial}}, \code{\link{cv_cluster}}, \code{\link{cv_buffer}}, and \code{\link{cv_nndm}} +} From b25eb6f55a71133a69961912b7e5077cde935bda Mon Sep 17 00:00:00 2001 From: Roozbeh Valavi Date: Sat, 9 Aug 2025 21:37:30 +1000 Subject: [PATCH 4/9] fix autocor summary, some fixes --- .gitignore | 2 + NEWS.md | 5 +- R/checks.R | 7 ++ R/cv_buffer.R | 38 +------ R/cv_plot.R | 46 ++++---- R/cv_similarity.R | 4 +- R/cv_spatial.R | 52 +++++---- R/cv_spatial_autocor.R | 25 ++--- R/utils.R | 36 ++++++ inst/doc/tutorial_1.html | 233 +++++++++++++++++++-------------------- inst/doc/tutorial_2.html | 4 +- 11 files changed, 228 insertions(+), 224 deletions(-) create mode 100644 R/utils.R diff --git a/.gitignore b/.gitignore index 2540826..d26c686 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,5 @@ vignettes/tutorial_1.R vignettes/tutorial_1.html vignettes/tutorial_2.R vignettes/tutorial_2.html +/doc/ +/Meta/ diff --git a/NEWS.md b/NEWS.md index 31dad23..3a91592 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,7 @@ # version 3.2.0 -* Fix a warning in `cv_similarity` for colour aesthetics. +* Fixed a warning in `cv_similarity` for colour aesthetics. * Adding L1 and L2 distances to `cv_similarity` +* Fixed the summary method for `cv_spatial_autocor` # version 3.1.7 * Temporarily added `sp` package dependency to avoid CRAN check as required by `automap` package [#55]. @@ -44,7 +45,7 @@ * Massive performance improvement in the C++ code of `cv_nndm` function for large datasets # version 3.0 -* Dependency to rgdal and rgeos are removed, and overall less dependency +* Dependency to `rgdal` and `rgeos` are removed, and overall less dependency * Function names have been changed, with all functions now starting with `cv_` * The old functions (v2.x) still work to allow appropriate time for adapting the new code * The CV blocking functions are now: `cv_spatial`, `cv_cluster`, `cv_buffer`, and `cv_nndm` diff --git a/R/checks.R b/R/checks.R index 8112bc7..c03cf9f 100644 --- a/R/checks.R +++ b/R/checks.R @@ -3,6 +3,13 @@ # Date : May 2023 # Version 0.3 +# check the object is a blockCV object +.check_cv <- function(x) { + is_cv <- inherits(x, c("cv_spatial", "cv_cluster", "cv_buffer", "cv_nndm")) + if (!is_cv) stop("'cv' must be a blockCV cv_* object.") +} + + # check points fall within the raster layer .check_within <- function(x, r) { bbox <- sf::st_bbox(x) diff --git a/R/cv_buffer.R b/R/cv_buffer.R index b3d0b1f..be5b798 100644 --- a/R/cv_buffer.R +++ b/R/cv_buffer.R @@ -117,7 +117,7 @@ cv_buffer <- function( add_bg <- (presence_bg && add_bg) if(progress) pb <- utils::txtProgressBar(min = 0, max = n, style = 3) - fold_list <- lapply(x_1s, function(i, pbag = add_bg){ + fold_list <- lapply(x_1s, function(i, pbag = add_bg) { if(pbag){ test_ids <- which(dmatrix[i, ] <= size) inside <- x[test_ids, column, drop = TRUE] @@ -127,10 +127,9 @@ cv_buffer <- function( test_set <- i } if(progress) utils::setTxtProgressBar(pb, i) - list(as.numeric(which(dmatrix[i, ] > size)), - as.numeric(test_set)) - } - ) + + list(as.numeric(which(dmatrix[i, ] > size)), as.numeric(test_set)) + }) # calculate train test table summary if(report){ @@ -172,32 +171,3 @@ summary.cv_buffer <- function(object, ...){ } } - -# count the train and test records -.ttt <- function(fold_list, x, column, n){ - if(is.null(column)){ - tt_count <- base::data.frame(train = rep(0, n), test = 0) - for(i in seq_len(n)){ - train_set <- fold_list[[i]][[1]] - test_set <- fold_list[[i]][[2]] - tt_count$train[i] <- length(train_set) - tt_count$test[i] <- length(test_set) - } - } else{ - cl <- sort(unique(x[, column, drop = TRUE])) - clen <- length(cl) - .check_classes(clen, column) # column should be binary or categorical - tt_count <- as.data.frame(matrix(0, nrow = n, ncol = clen * 2)) - names(tt_count) <- c(paste("train", cl, sep = "_"), paste("test", cl, sep = "_")) - for(i in seq_len(n)){ - train_set <- fold_list[[i]][[1]] - test_set <- fold_list[[i]][[2]] - countrain <- table(x[train_set, column, drop = TRUE]) - countest <- table(x[test_set, column, drop = TRUE]) - tt_count[i, which(cl %in% names(countrain))] <- countrain - tt_count[i, clen + which(cl %in% names(countest))] <- countest - } - } - - return(tt_count) -} diff --git a/R/cv_plot.R b/R/cv_plot.R index 7438313..c488d17 100644 --- a/R/cv_plot.R +++ b/R/cv_plot.R @@ -58,10 +58,8 @@ cv_plot <- function( ){ # check for availability of ggplot2 .check_pkgs(c("ggplot2")) - - if(!class(cv) %in% c("cv_spatial", "cv_cluster", "cv_buffer", "cv_nndm")){ - stop("'cv' must be a blockCV cv_* object.") - } + # check it's a cv obj + .check_cv(cv) # check x is an sf object if(!missing(x)){ @@ -73,6 +71,8 @@ cv_plot <- function( r <- .check_r(r) r <- r[[1]] } + # is it a cv_spatial object? + is_spatial <- inherits(cv, "cv_spatial") # make geom_tile for raster plots if(!is.null(r)){ @@ -91,10 +91,11 @@ cv_plot <- function( geom_rast <- ggplot2::geom_tile( data = map_df, ggplot2::aes(x = get("x"), y = get("y"), fill = get("value"))) + geom_rast_col <- ggplot2::scale_fill_gradientn(colours = raster_colors) } # make geom_sf for spatial blocks - if(methods::is(cv, "cv_spatial")){ + if(is_spatial){ blocks <- cv$blocks geom_poly <- ggplot2::geom_sf( data = sf::st_geometry(blocks), @@ -114,56 +115,49 @@ cv_plot <- function( } } else{ # stop if x is missing for buffer and cluster - if(!methods::is(cv, "cv_spatial")) stop("'x' is required for plotting cv_cluster and cv_buffer.") + if(!is_spatial) stop("'x' is required for plotting cv_cluster, cv_buffer and cv_nndm.") } - if(missing(x)){ - if(methods::is(cv, "cv_spatial")){ + geom_sftext <- if (label_size > 0) { + ggplot2::geom_sf_text( + ggplot2::aes(label = get("folds")), + size = label_size, fun.geometry = sf::st_centroid + ) + } else NULL + if(missing(x)){ + if(is_spatial){ p1 <- ggplot2::ggplot(data = blocks) + - switch(!is.null(r), geom_rast) + + switch(!is.null(r), geom_rast) + # only switch works with ggplot switch(!is.null(r), geom_rast_col) + ggplot2::geom_sf(colour = "red", fill = "orangered4", alpha = 0.04, linewidth = 0.2) + - ggplot2::geom_sf_text( - ggplot2::aes(label = get("folds")), - size = label_size, fun.geometry = sf::st_centroid) + + switch(!is.null(geom_sftext), geom_sftext) + ggplot2::labs(x = "", y = "") + # or set the axes labes to NULL ggplot2::scale_x_continuous(guide = ggplot2::guide_axis(check.overlap = TRUE)) + ggplot2::theme_minimal() + ggplot2::guides(fill = "none") - } } else{ - p1 <- ggplot2::ggplot(data = x_long) + - switch(!is.null(r), geom_rast) + + switch(!is.null(r), geom_rast) + # only switch works with ggplot switch(!is.null(r), geom_rast_col) + - switch(methods::is(cv, "cv_spatial"), geom_poly) + - # ggplot2::geom_sf(ggplot2::aes(col = rlang::.data$value), - ggplot2::geom_sf(ggplot2::aes(col = get("value")), - alpha = points_alpha) + + switch(is_spatial, geom_poly) + + ggplot2::geom_sf(ggplot2::aes(col = get("value")), alpha = points_alpha) + ggplot2::scale_color_manual(values = points_colors, na.value = "#BEBEBE03") + - # ggplot2::facet_wrap(~ rlang::.data$folds, nrow = nrow, ncol = ncol) + ggplot2::facet_wrap(~get("folds"), nrow = nrow, ncol = ncol) + ggplot2::labs(x = "", y = "", col = "") + # set the axes labes to NULL ggplot2::theme_bw() + ggplot2::guides(fill = "none") - } return(p1) } -# is it a LOO CV object? -.is_loo <- function(x){ - inherits(x, c("cv_buffer", "cv_nndm")) -} - # transform x and fold numbers for plotting .x_to_long <- function(x, cv, num_plot=1:10){ # get the folds list diff --git a/R/cv_similarity.R b/R/cv_similarity.R index 4abcab0..0cc6f5c 100644 --- a/R/cv_similarity.R +++ b/R/cv_similarity.R @@ -103,9 +103,7 @@ cv_similarity <- function( .check_within(x, r) # check cv obj - if (!inherits(cv, c("cv_spatial", "cv_cluster", "cv_buffer", "cv_nndm"))) { - stop("'cv' must be a blockCV cv_* object.") - } + .check_cv(cv) method <- match.arg(tolower(method), choices = c("mess", "l1", "l2")) MESS <- method == "mess" diff --git a/R/cv_spatial.R b/R/cv_spatial.R index 5f6212c..0076b13 100644 --- a/R/cv_spatial.R +++ b/R/cv_spatial.R @@ -315,7 +315,8 @@ cv_spatial <- function( } # iteration if random selection, otherwise only 1 round for(i in seq_len(iteration)){ - if(selection=='systematic'){ + + if(selection=="systematic"){ if(hexagon){ sub_blocks <- .fold_assign(sf::st_geometry(sub_blocks), n = k) } else{ @@ -323,17 +324,15 @@ cv_spatial <- function( sub_blocks$folds <- rep(1:k, length.out = blocks_len) } fold_df <- sf::st_drop_geometry(sub_blocks) - blocks_df <- merge(x = blocks_df, y = fold_df, by = "block_id", all.x = TRUE) } - if(selection=='checkerboard'){ + if(selection=="checkerboard"){ sub_blocks$folds <- sub_blocks$id sub_blocks$block_id <- seq_len(blocks_len) fold_df <- sf::st_drop_geometry(sub_blocks) - blocks_df <- merge(x = blocks_df, y = fold_df, by = "block_id", all.x = TRUE) } - if(selection=='random'){ + if(selection=="random"){ blocks_df <- blocks_df[, c("records", "block_id")] # to avoid repetition in iterations fold_df <- data.frame(block_id = seq_len(blocks_len), folds = 0) # create random folds with equal proportion @@ -344,14 +343,15 @@ cv_spatial <- function( unfold <- which(fold_df$folds == 0) fold_df$folds[unfold] <- sample(seq_len(k), rest, replace = FALSE) } - blocks_df <- merge(x = blocks_df, y = fold_df, by = "block_id", all.x = TRUE) } - if(selection=='predefined'){ + if(selection=="predefined"){ fold_df <- data.frame(block_id = seq_len(blocks_len), folds = sub_blocks[, folds_column, drop = TRUE]) - blocks_df <- merge(x = blocks_df, y = fold_df, by = "block_id", all.x = TRUE) } + # merge block-df once after the selection + blocks_df <- merge(x = blocks_df, y = fold_df, by = "block_id", all.x = TRUE) + # reset the table to 0s for each iteration train_test_table[] <- 0 # count the number of points in each fold @@ -432,7 +432,7 @@ cv_spatial <- function( final_objs <- list( folds_list = fold_list, folds_ids = fold_vect, - biomod_table = switch(biomod2, as.matrix(biomod_table)), + biomod_table = if (biomod2) as.matrix(biomod_table) else NULL, k = k, size = size, column = column, @@ -444,17 +444,19 @@ cv_spatial <- function( # plot with the cv_plot function if(plot){ - p1 <- cv_plot( - cv = final_objs, - r = switch(!is.null(r), r), - ... + plot( + cv_plot( + cv = final_objs, + r = r, + ... + ) ) - plot(p1) } return(final_objs) } + #' @export #' @method print cv_spatial print.cv_spatial <- function(x, ...){ @@ -483,16 +485,17 @@ summary.cv_spatial <- function(object, ...){ # create rectangular and hexagonal spatial blocks using terra and sf packages -.make_blocks <- function(x_obj, - blocksize = NULL, - blockcols = NULL, - blockrows = NULL, - hexagonal = FALSE, - flat_top = FALSE, - extend_perc = 0.005, - degree = 111325, - xy_offset = c(0, 0), - checkerboard = FALSE){ +.make_blocks <- function( + x_obj, + blocksize = NULL, + blockcols = NULL, + blockrows = NULL, + hexagonal = FALSE, + flat_top = FALSE, + extend_perc = 0.005, + degree = 111325, + xy_offset = c(0, 0), + checkerboard = FALSE){ # xpoints and rasters inputs are checked in the parent function (cv_spatial); # so no need to check them here; @@ -683,3 +686,4 @@ summary.cv_spatial <- function(object, ...){ return(blocks) } + diff --git a/R/cv_spatial_autocor.R b/R/cv_spatial_autocor.R index b227248..48677d9 100644 --- a/R/cv_spatial_autocor.R +++ b/R/cv_spatial_autocor.R @@ -168,8 +168,8 @@ cv_spatial_autocor <- function( vario_list <- lapply( seq_len(nlayer), .fit_variogram, - rr = switch(missing(x), r, NULL), - xx = switch(!missing(x), x, NULL), + rr = if (missing(x)) r else NULL, + xx = if (!missing(x)) x else NULL, column = column, num_sample = num_sample, progress = progress, @@ -214,8 +214,8 @@ cv_spatial_autocor <- function( # plot the spatial blocks p2 <- cv_plot( cv = plot_data, - r = switch(!missing(r), r, NULL), - label_size = 0, + r = if (!missing(r)) r else NULL, + label_size = -1, ... ) p2 <- p2 + ggplot2::ggtitle( @@ -263,8 +263,8 @@ plot.cv_spatial_autocor <- function(x, y, ...){ #' @method summary cv_spatial_autocor summary.cv_spatial_autocor <- function(object, ...){ cat("Summary statistics of spatial autocorrelation ranges of all input layers:\n") - print(summary(object$rangeTable$range)) - print(object$rangeTable[,1:2]) + print(summary(object$range_table$range)) + print(object$range_table[,1:2]) } @@ -292,15 +292,12 @@ summary.cv_spatial_autocor <- function(object, ...){ points <- xx[column[i]] # [i] in case there are more column names(points) <- c("target", "geometry") } - # NOTE: apparently the gstat package returns different units for range with sp and sf objects! - # So, need to keep this a SpatialPointDataFrame for now and have to fake using it!!! - if (FALSE) { - sp::SpatialPoints - } + fit_vario <- automap::autofitVariogram( formula = target ~ 1, input_data = .as_sp(points) ) + if(progress) utils::setTxtProgressBar(pb, i) return(fit_vario) @@ -309,7 +306,7 @@ summary.cv_spatial_autocor <- function(object, ...){ # Annoying step to import sp so there'll be no CRAN errors # Sp is only require because automap produces different output with latlong sf objects -# Don't use sf::as_Spatial because this somehow depends on sp and requires sp dependency anyway +# Don't use sf::as_Spatial because this depends on sp and requires sp dependency anyway .as_sp <- function(x) { out <- if (sf::st_is_longlat(x)) { coords <- sf::st_coordinates(x) @@ -344,10 +341,6 @@ summary.cv_spatial_autocor <- function(object, ...){ decreasing = FALSE), color = get("range")) ) + - # ggplot2::geom_bar( - # ggplot2::aes(x = stats::reorder(factor(get("layers")), get("range")), - # y = get("range"), fill = get("range")), - # stat = "identity", data = vario_data,) + ggplot2::geom_point(size = 4) + ggplot2::geom_segment( ggplot2::aes(x = get("layers"), xend = get("layers"), y = 0, yend = get("range")), diff --git a/R/utils.R b/R/utils.R new file mode 100644 index 0000000..be39168 --- /dev/null +++ b/R/utils.R @@ -0,0 +1,36 @@ +# A collection of function to facilitate block generaion and reporting + +# is it a LOO CV object? +.is_loo <- function(x){ + inherits(x, c("cv_buffer", "cv_nndm")) +} + +# count the train and test records +.ttt <- function(fold_list, x, column, n){ + if(is.null(column)){ + tt_count <- base::data.frame(train = rep(0, n), test = 0) + for(i in seq_len(n)){ + train_set <- fold_list[[i]][[1]] + test_set <- fold_list[[i]][[2]] + tt_count$train[i] <- length(train_set) + tt_count$test[i] <- length(test_set) + } + } else{ + cl <- sort(unique(x[, column, drop = TRUE])) + clen <- length(cl) + .check_classes(clen, column) # column should be binary or categorical + tt_count <- as.data.frame(matrix(0, nrow = n, ncol = clen * 2)) + names(tt_count) <- c(paste("train", cl, sep = "_"), paste("test", cl, sep = "_")) + for(i in seq_len(n)){ + train_set <- fold_list[[i]][[1]] + test_set <- fold_list[[i]][[2]] + countrain <- table(x[train_set, column, drop = TRUE]) + countest <- table(x[test_set, column, drop = TRUE]) + tt_count[i, which(cl %in% names(countrain))] <- countrain + tt_count[i, clen + which(cl %in% names(countest))] <- countest + } + } + + return(tt_count) +} + diff --git a/inst/doc/tutorial_1.html b/inst/doc/tutorial_1.html index bec4d2a..4571bda 100644 --- a/inst/doc/tutorial_1.html +++ b/inst/doc/tutorial_1.html @@ -12,7 +12,7 @@ - + 1. blockCV introduction: how to create block cross-validation folds @@ -342,7 +342,7 @@

1. blockCV introduction: how to create block cross-validation folds

Roozbeh Valavi, Jane Elith, José Lahoz-Monfort and Gurutzeta Guillera-Arroita

-

2025-06-23

+

2025-08-09

@@ -419,6 +419,7 @@

Installation

remotes::install_github("rvalavi/blockCV", dependencies = TRUE)
# loading the package
 library(blockCV)
+
## blockCV 3.2.0

Package data

@@ -432,20 +433,20 @@

Package data

data include presence-absence records (binary) of a simulated species.

To load the package raster data use:

-
library(sf) # working with spatial vector data
-library(terra) # working with spatial raster data
-library(tmap) # plotting maps
-
-# load raster data
-# the pipe operator |> is available for R version 4.1 or higher
-rasters <- system.file("extdata/au/", package = "blockCV") |>
-  list.files(full.names = TRUE) |>
-  terra::rast()
+
library(sf) # working with spatial vector data
+library(terra) # working with spatial raster data
+library(tmap) # plotting maps
+
+# load raster data
+# the pipe operator |> is available for R version 4.1 or higher
+rasters <- system.file("extdata/au/", package = "blockCV") |>
+  list.files(full.names = TRUE) |>
+  terra::rast()

The presence-absence species data include 243 presence points and 257 absence points.

-
# load species presence-absence data and convert to sf
-points <- read.csv(system.file("extdata/", "species.csv", package = "blockCV"))
-head(points)
+
# load species presence-absence data and convert to sf
+points <- read.csv(system.file("extdata/", "species.csv", package = "blockCV"))
+head(points)
##            x        y occ
 ## 1  1313728.4 -2275453   0
 ## 2  1176795.0 -1916003   0
@@ -459,21 +460,21 @@ 

Package data

coordinate reference system with "EPSG:7845" as defined by crs = 7845. We convert the data.frame to sf as follows:

-
pa_data <- sf::st_as_sf(points, coords = c("x", "y"), crs = 7845)
+
pa_data <- sf::st_as_sf(points, coords = c("x", "y"), crs = 7845)

Let’s plot the species data using tmap package:

-
tm_shape(rasters[[1]]) +
-  tm_raster(
-    col.scale = tm_scale_continuous(values = gray.colors(10)),
-    col.legend = tm_legend_hide()
-  ) +
-  tm_shape(pa_data) +
-  tm_dots(
-    fill = "occ",
-    fill.scale = tm_scale_categorical(),
-    size = 0.5,
-    fill_alpha = 0.5
-  )
+
tm_shape(rasters[[1]]) +
+  tm_raster(
+    col.scale = tm_scale_continuous(values = gray.colors(10)),
+    col.legend = tm_legend_hide()
+  ) +
+  tm_shape(pa_data) +
+  tm_dots(
+    fill = "occ",
+    fill.scale = tm_scale_categorical(),
+    size = 0.5,
+    fill_alpha = 0.5
+  )

@@ -516,14 +517,14 @@

Spatial blocks

allows users to define an external spatial polygon as blocking layer.

Here are some spatial block settings:

-
sb1 <- cv_spatial(x = pa_data,
-                  column = "occ", # the response column (binary or multi-class)
-                  k = 5, # number of folds
-                  size = 350000, # size of the blocks in metres
-                  selection = "random", # random blocks-to-fold
-                  iteration = 50, # find evenly dispersed folds
-                  biomod2 = TRUE) # also create folds for biomod2
-

+
sb1 <- cv_spatial(x = pa_data,
+                  column = "occ", # the response column (binary or multi-class)
+                  k = 5, # number of folds
+                  size = 350000, # size of the blocks in metres
+                  selection = "random", # random blocks-to-fold
+                  iteration = 50, # find evenly dispersed folds
+                  biomod2 = TRUE) # also create folds for biomod2
+

The output object is an R S3 object and you can get its elements by a $. Explore sb1$folds_ids, sb1$folds_list, and sb1$biomod_table for the @@ -536,37 +537,37 @@

Spatial blocks

raster layer (using r argument) for to be used for creating blocks and be used in the background of the plot (raster can also be added later only for visualising blocks using cv_plot).

-
sb2 <- cv_spatial(x = pa_data,
-                  column = "occ",
-                  r = rasters, # optionally add a raster layer
-                  k = 5, 
-                  size = 350000, 
-                  hexagon = FALSE, # use square blocks
-                  selection = "random",
-                  progress = FALSE, # turn off progress bar for vignette
-                  iteration = 50, 
-                  biomod2 = TRUE)
+
sb2 <- cv_spatial(x = pa_data,
+                  column = "occ",
+                  r = rasters, # optionally add a raster layer
+                  k = 5, 
+                  size = 350000, 
+                  hexagon = FALSE, # use square blocks
+                  selection = "random",
+                  progress = FALSE, # turn off progress bar for vignette
+                  iteration = 50, 
+                  biomod2 = TRUE)
## 
 ##   train_0 train_1 test_0 test_1
-## 1     217     190     40     53
-## 2     197     210     60     33
-## 3     211     166     46     77
-## 4     200     209     57     34
-## 5     203     197     54     46
-

+## 1 207 201 50 42 +## 2 197 208 60 35 +## 3 198 213 59 30 +## 4 209 198 48 45 +## 5 217 152 40 91
+

The assignment of folds to each block can also be done in a systematic manner using selection = "systematic", or a checkerboard pattern using selection = "checkerboard". The blocks can also be created by number of rows and columns when no size is supplied by e.g. rows_cols = c(12, 10).

-
# systematic fold assignment 
-# and also use row/column for creating blocks instead of size
-sb3 <- cv_spatial(x = pa_data,
-                  column = "occ",
-                  rows_cols = c(12, 10),
-                  hexagon = FALSE,
-                  selection = "systematic")
+
# systematic fold assignment 
+# and also use row/column for creating blocks instead of size
+sb3 <- cv_spatial(x = pa_data,
+                  column = "occ",
+                  rows_cols = c(12, 10),
+                  hexagon = FALSE,
+                  selection = "systematic")
## 
 ##   train_0 train_1 test_0 test_1
 ## 1     227     203     30     40
@@ -581,23 +582,23 @@ 

Spatial blocks

This is because the random assignment process is repeated multiple times, controlled by the iteration parameter, to ensure that the folds are evenly distributed.

-
# checkerboard block to CV fold assignment
-sb4 <- cv_spatial(x = pa_data,
-                  column = "occ",
-                  size = 350000,
-                  hexagon = FALSE,
-                  selection = "checkerboard")
+
# checkerboard block to CV fold assignment
+sb4 <- cv_spatial(x = pa_data,
+                  column = "occ",
+                  size = 350000,
+                  hexagon = FALSE,
+                  selection = "checkerboard")
## 
 ##   train_0 train_1 test_0 test_1
 ## 1     125     143    132    100
 ## 2     132     100    125    143

Let’s visualise the checkerboard blocks with tmap:

-
tm_shape(sb4$blocks) +
-  tm_fill(
-    fill = "folds",
-    fill.scale = tm_scale_categorical()
-  )
+
tm_shape(sb4$blocks) +
+  tm_fill(
+    fill = "folds",
+    fill.scale = tm_scale_categorical()
+  )

@@ -609,11 +610,11 @@

Spatial and environemntal clustering

based on spatial coordinates of sample points (the x argument).

Using spatial coordinate values for clustering:

-
# spatial clustering
-set.seed(6)
-scv <- cv_cluster(x = pa_data,
-                  column = "occ", # optional: counting number of train/test records
-                  k = 5)
+
# spatial clustering
+set.seed(6)
+scv <- cv_cluster(x = pa_data,
+                  column = "occ", # optional: counting number of train/test records
+                  k = 5)
##   train_0 train_1 test_0 test_1
 ## 1     230     240     27      3
 ## 2     171     142     86    101
@@ -628,13 +629,13 @@ 

Spatial and environemntal clustering

layer should cover all the species points, otherwise an error will rise. The records with no raster value should be deleted prior to the analysis or a different raster be used.

-
# environmental clustering
-set.seed(6)
-ecv <- cv_cluster(x = pa_data,
-                  column = "occ",
-                  r = rasters,
-                  k = 5, 
-                  scale = TRUE)
+
# environmental clustering
+set.seed(6)
+ecv <- cv_cluster(x = pa_data,
+                  column = "occ",
+                  r = rasters,
+                  k = 5, 
+                  scale = TRUE)
##   train_0 train_1 test_0 test_1
 ## 1     249     172      8     71
 ## 2     215     234     42      9
@@ -664,9 +665,9 @@ 

Buffering LOO (also known as Spatial LOO)

(ideally the range of spatial autocorrelation). In this method the test set never directly abuts a training set.

Using buffering to create CV folds:

-
bloo <- cv_buffer(x = pa_data,
-                  column = "occ",
-                  size = 350000)
+
bloo <- cv_buffer(x = pa_data,
+                  column = "occ",
+                  size = 350000)

When using species presence-background data (or presence and pseudo-absence), you need to supply the column and set presence_bg = TRUE. In this case, only presence @@ -690,14 +691,14 @@

Nearest Neighbour Distance Matching (NNDM) LOO

distribution function between the test and training data to the nearest neighbour distance distribution function between the target prediction and training points (Milà et al., 2022).

-
nncv <- cv_nndm(x = pa_data,
-                column = "occ",
-                r = rasters,
-                size = 350000,
-                num_sample = 5000, 
-                sampling = "regular",
-                min_train = 0.1,
-                plot = TRUE)
+
nncv <- cv_nndm(x = pa_data,
+                column = "occ",
+                r = rasters,
+                size = 350000,
+                num_sample = 5000, 
+                sampling = "regular",
+                min_train = 0.1,
+                plot = TRUE)
##     train_0         train_1          test_0          test_1     
 ##  Min.   :212.0   Min.   :145.0   Min.   :0.000   Min.   :0.000  
 ##  Mean   :244.4   Mean   :221.7   Mean   :0.514   Mean   :0.486  
@@ -712,24 +713,24 @@ 

Visualising the folds

might be slightly slower.

Let’s plot spatial clustering folds created in previous section (using cv_cluster):

-
cv_plot(cv = scv, 
-        x = pa_data)
+
cv_plot(cv = scv, 
+        x = pa_data)

When cv_buffer is used for plotting, only first 10 folds are shown. You can choose any set of CV folds for plotting. If remove_na = FALSE (default is TRUE), the NA in the legend shows the excluded points.

-
cv_plot(cv = bloo,
-        x = pa_data,
-        num_plots = c(1, 50, 100)) # only show folds 1, 50 and 100
+
cv_plot(cv = bloo,
+        x = pa_data,
+        num_plots = c(1, 50, 100)) # only show folds 1, 50 and 100

If you do not supply x when plotting a cv_spatial object, only the spatial blocks are plotted.

-
cv_plot(cv = sb1,
-        r = rasters,
-        raster_colors = terrain.colors(10, alpha = 0.5),
-        label_size = 4) 
-

+
cv_plot(cv = sb1,
+        r = rasters,
+        raster_colors = terrain.colors(10, alpha = 0.5),
+        label_size = 4) 
+

Check similarity

@@ -743,16 +744,10 @@

Check similarity

where at least one variable has a value that is outside the range of environments over the reference set, so these are novel environments.

-
cv_similarity(cv = ecv, # the environmental clustering
-              x = pa_data, 
-              r = rasters, 
-              progress = FALSE)
-
## Warning: The following aesthetics were dropped during statistical transformation:
-## colour.
-## ℹ This can happen when ggplot fails to infer the correct grouping structure in
-##   the data.
-## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
-##   variable into a factor?
+
cv_similarity(cv = ecv, # the environmental clustering
+              x = pa_data, 
+              r = rasters, 
+              progress = FALSE)

@@ -772,7 +767,7 @@

Estimating size: the effective range of spatial autocorrelation

for measuring spatial autocorrelation (O’Sullivan and Unwin, 2010).

sac1 <- cv_spatial_autocor(r = rasters, 
                            num_sample = 5000)
-

+

The plotted block size is based on the median of the spatial autocorrelation ranges. This could be as the minimum block size for creating spatially separated folds. Variograms are @@ -791,16 +786,20 @@

Estimating size: the effective range of spatial autocorrelation

# summary statistics of the output
 summary(sac1)
## Summary statistics of spatial autocorrelation ranges of all input layers:
-## Length  Class   Mode 
-##      0   NULL   NULL 
-## NULL
+## Min. 1st Qu. Median Mean 3rd Qu. Max. +## 1015264 1352302 1637589 5337128 5622415 17058071 +## layers range +## 3 bio_4 1015264 +## 2 bio_15 1464648 +## 4 bio_5 1810530 +## 1 bio_12 17058071

Alternatively, only use the response data using x and column. This could be a binary or continuous variable provided in as a column in the sample points sf object. This could be the response or the residuals of a fitted model.

sac2 <- cv_spatial_autocor(x = pa_data, 
                            column =  "occ")
-

+

To visualise them (this needs the automap package to be loaded):

library(automap)
diff --git a/inst/doc/tutorial_2.html b/inst/doc/tutorial_2.html
index 10f4921..1b457ce 100644
--- a/inst/doc/tutorial_2.html
+++ b/inst/doc/tutorial_2.html
@@ -12,7 +12,7 @@
 
 
 
-
+
 
 2. Block cross-validation for species distribution modelling
 
@@ -342,7 +342,7 @@ 

2. Block cross-validation for species distribution modelling

Roozbeh Valavi, Jane Elith, José Lahoz-Monfort and Gurutzeta Guillera-Arroita

-

2025-06-23

+

2025-08-09

From c356dae855cf8b7e72b01dafa049425fd570f8cd Mon Sep 17 00:00:00 2001 From: Roozbeh Valavi Date: Mon, 11 Aug 2025 18:54:46 +1000 Subject: [PATCH 5/9] some small changes --- R/cv_spatial.R | 35 +++++++++++++++++------------------ R/cv_spatial_autocor.R | 2 +- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/R/cv_spatial.R b/R/cv_spatial.R index 0076b13..0b88192 100644 --- a/R/cv_spatial.R +++ b/R/cv_spatial.R @@ -316,7 +316,21 @@ cv_spatial <- function( # iteration if random selection, otherwise only 1 round for(i in seq_len(iteration)){ - if(selection=="systematic"){ + if(selection=="random"){ + blocks_df <- blocks_df[, c("records", "block_id")] # avoid repetition + fold_df <- data.frame(block_id = seq_len(blocks_len), folds = 0) + + # create random folds with equal proportion + num <- floor(blocks_len / k) + fold_df$folds[seq_len(num * k)] <- sample(rep(seq_len(k), num), num * k) + + if(blocks_len %% k != 0){ + rest <- blocks_len %% k + unfold <- which(fold_df$folds == 0) + fold_df$folds[unfold] <- sample(seq_len(k), rest, replace = FALSE) + } + + } else if(selection=="systematic"){ if(hexagon){ sub_blocks <- .fold_assign(sf::st_geometry(sub_blocks), n = k) } else{ @@ -324,28 +338,13 @@ cv_spatial <- function( sub_blocks$folds <- rep(1:k, length.out = blocks_len) } fold_df <- sf::st_drop_geometry(sub_blocks) - } - if(selection=="checkerboard"){ + } else if(selection=="checkerboard"){ sub_blocks$folds <- sub_blocks$id sub_blocks$block_id <- seq_len(blocks_len) fold_df <- sf::st_drop_geometry(sub_blocks) - } - - if(selection=="random"){ - blocks_df <- blocks_df[, c("records", "block_id")] # to avoid repetition in iterations - fold_df <- data.frame(block_id = seq_len(blocks_len), folds = 0) - # create random folds with equal proportion - num <- floor(blocks_len / k) - fold_df$folds[seq_len(num * k)] <- sample(rep(seq_len(k), num), num * k) - if(blocks_len %% k != 0){ - rest <- blocks_len %% k - unfold <- which(fold_df$folds == 0) - fold_df$folds[unfold] <- sample(seq_len(k), rest, replace = FALSE) - } - } - if(selection=="predefined"){ + } else if(selection=="predefined"){ fold_df <- data.frame(block_id = seq_len(blocks_len), folds = sub_blocks[, folds_column, drop = TRUE]) } diff --git a/R/cv_spatial_autocor.R b/R/cv_spatial_autocor.R index 48677d9..78a49eb 100644 --- a/R/cv_spatial_autocor.R +++ b/R/cv_spatial_autocor.R @@ -215,7 +215,7 @@ cv_spatial_autocor <- function( p2 <- cv_plot( cv = plot_data, r = if (!missing(r)) r else NULL, - label_size = -1, + label_size = -1, # no label ... ) p2 <- p2 + ggplot2::ggtitle( From 700f50415ef92efae7aabe18204909b0a09934fb Mon Sep 17 00:00:00 2001 From: Roozbeh Valavi Date: Thu, 14 Aug 2025 15:27:43 +1000 Subject: [PATCH 6/9] added covr to github acitons --- .github/workflows/R-CMD-check.yml | 57 +++++++++++++++++-------------- DESCRIPTION | 2 +- NEWS.md | 4 +-- R/cv_buffer.R | 2 +- R/cv_similarity.R | 5 +-- R/utils.R | 2 +- man/cv_similarity.Rd | 5 +-- 7 files changed, 43 insertions(+), 34 deletions(-) diff --git a/.github/workflows/R-CMD-check.yml b/.github/workflows/R-CMD-check.yml index 50458bc..06a0b9f 100644 --- a/.github/workflows/R-CMD-check.yml +++ b/.github/workflows/R-CMD-check.yml @@ -1,25 +1,32 @@ -on: - push: - branches: [ master ] - pull_request: - branches: [ master ] - -name: R-CMD-check - -jobs: - R-CMD-check: - runs-on: macOS-latest - steps: - - uses: actions/checkout@v3 - - uses: r-lib/actions/setup-r@v2 - - name: Setting up pandoc for Rmd docs - uses: r-lib/actions/setup-pandoc@v2 - - uses: r-lib/actions/setup-tinytex@v2 - - name: Install dependencies - run: | - install.packages(c("remotes", "rcmdcheck")) - remotes::install_deps(dependencies = TRUE) - shell: Rscript {0} - - name: Check - run: rcmdcheck::rcmdcheck(args = "--no-manual", error_on = "error") - shell: Rscript {0} +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +name: R-CMD-check + +jobs: + R-CMD-check: + runs-on: macOS-latest + steps: + - uses: actions/checkout@v3 + - uses: r-lib/actions/setup-r@v2 + - name: Setting up pandoc for Rmd docs + uses: r-lib/actions/setup-pandoc@v2 + - uses: r-lib/actions/setup-tinytex@v2 + - name: Install dependencies + run: | + install.packages(c("remotes", "rcmdcheck", "covr")) + remotes::install_deps(dependencies = TRUE) + shell: Rscript {0} + + - name: Check + run: rcmdcheck::rcmdcheck(args = "--no-manual", error_on = "error") + shell: Rscript {0} + + - name: Test coverage + run: covr::codecov() + shell: Rscript {0} + env: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} diff --git a/DESCRIPTION b/DESCRIPTION index da7deb5..3a82d70 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,7 +2,7 @@ Package: blockCV Type: Package Title: Spatial and Environmental Blocking for K-Fold and LOO Cross-Validation Version: 3.2-0 -Date: 2025-08-03 +Date: 2025-08-14 Authors@R: c(person("Roozbeh", "Valavi", role = c("aut", "cre"), email = "valavi.r@gmail.com", comment = c(ORCID = "0000-0003-2495-5277")), person("Jane", "Elith", role = "aut", diff --git a/NEWS.md b/NEWS.md index 3a91592..c4355ab 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,6 @@ # version 3.2.0 -* Fixed a warning in `cv_similarity` for colour aesthetics. -* Adding L1 and L2 distances to `cv_similarity` +* Added two new methods, L1 and L2 distances, to the `cv_similarity` function +* Fixed a warning in `cv_similarity` for colour aesthetics with ggplot * Fixed the summary method for `cv_spatial_autocor` # version 3.1.7 diff --git a/R/cv_buffer.R b/R/cv_buffer.R index be5b798..c8cbf7a 100644 --- a/R/cv_buffer.R +++ b/R/cv_buffer.R @@ -133,7 +133,7 @@ cv_buffer <- function( # calculate train test table summary if(report){ - train_test_table <- .ttt(fold_list, x, column, n) + train_test_table <- .table_summary(fold_list, x, column, n) print(summary(train_test_table)[c(1,4,6), ]) } diff --git a/R/cv_similarity.R b/R/cv_similarity.R index 0cc6f5c..237748a 100644 --- a/R/cv_similarity.R +++ b/R/cv_similarity.R @@ -22,7 +22,7 @@ #' \item{1. Calculates the minimum distance between each test sample and all training samples #' in the same fold using the selected metric (L1 or L2).} #' \item{2. Calculates a baseline distance: the average of the minimum distances between a set -#' of random background samples (defined by \code{num_sample}) and all training/test +#' of random background samples (defined by \code{num_sample}) from the raster and all training/test #' samples combined.} #' \item{3. Computes a similarity score by subtracting the test sample’s minimum distance from #' the baseline average. A higher score indicates the test sample is more similar to @@ -30,7 +30,8 @@ #' } #' #' This provides a simple, distance-based novelty metric, useful for assessing -#' extrapolation or dissimilarity in prediction scenarios. +#' extrapolation or dissimilarity in prediction scenarios. Note that this approach is +#' experimental. #' #' @inheritParams cv_plot #' @param x a simple features (sf) or SpatialPoints object of the spatial sample data used for creating diff --git a/R/utils.R b/R/utils.R index be39168..0fed7ac 100644 --- a/R/utils.R +++ b/R/utils.R @@ -6,7 +6,7 @@ } # count the train and test records -.ttt <- function(fold_list, x, column, n){ +.table_summary <- function(fold_list, x, column, n){ if(is.null(column)){ tt_count <- base::data.frame(train = rep(0, n), test = 0) for(i in seq_len(n)){ diff --git a/man/cv_similarity.Rd b/man/cv_similarity.Rd index e749d4b..537455e 100644 --- a/man/cv_similarity.Rd +++ b/man/cv_similarity.Rd @@ -70,7 +70,7 @@ function performs the following steps for each test sample: \item{1. Calculates the minimum distance between each test sample and all training samples in the same fold using the selected metric (L1 or L2).} \item{2. Calculates a baseline distance: the average of the minimum distances between a set - of random background samples (defined by \code{num_sample}) and all training/test + of random background samples (defined by \code{num_sample}) from the raster and all training/test samples combined.} \item{3. Computes a similarity score by subtracting the test sample’s minimum distance from the baseline average. A higher score indicates the test sample is more similar to @@ -78,7 +78,8 @@ function performs the following steps for each test sample: } This provides a simple, distance-based novelty metric, useful for assessing -extrapolation or dissimilarity in prediction scenarios. +extrapolation or dissimilarity in prediction scenarios. Note that this approach is +experimental. } \examples{ \donttest{ From 675e547784f1f0dc2336de5bc49114ff26e8951d Mon Sep 17 00:00:00 2001 From: Roozbeh Valavi Date: Thu, 14 Aug 2025 15:39:21 +1000 Subject: [PATCH 7/9] some formatting --- R/cv_nndm.R | 2 +- tests/testthat/test-cv_buffer.R | 174 +++++++------- tests/testthat/test-cv_cluster.R | 172 +++++++------- tests/testthat/test-cv_nndm.R | 184 +++++++-------- tests/testthat/test-cv_plot.R | 29 +-- tests/testthat/test-cv_spatial.R | 363 +++++++++++++++-------------- tests/testthat/test-v2-functions.R | 66 +++--- 7 files changed, 505 insertions(+), 485 deletions(-) diff --git a/R/cv_nndm.R b/R/cv_nndm.R index b6c7c34..13bcc2b 100644 --- a/R/cv_nndm.R +++ b/R/cv_nndm.R @@ -186,7 +186,7 @@ cv_nndm <- function( ) # calculate train test table summary if(report){ - train_test_table <- .ttt(fold_list, x, column, n) + train_test_table <- .table_summary(fold_list, x, column, n) print(summary(train_test_table)[c(1,4,6), ]) } diff --git a/tests/testthat/test-cv_buffer.R b/tests/testthat/test-cv_buffer.R index bd9a53f..e412099 100644 --- a/tests/testthat/test-cv_buffer.R +++ b/tests/testthat/test-cv_buffer.R @@ -9,120 +9,120 @@ expect_names <- c( pa_data <- read.csv(system.file("extdata/", "species.csv", package = "blockCV")) |> - sf::st_as_sf(coords = c("x", "y"), crs = 7845) + sf::st_as_sf(coords = c("x", "y"), crs = 7845) pa_data <- pa_data[1:200, ] test_that("test that cv_buffer function works properly with presence-absence data", { - bloo <- cv_buffer( - x = pa_data, - column = "occ", - size = 250000, - presence_bg = FALSE, - progress = TRUE - ) - - expect_true(exists("bloo")) - expect_s3_class(bloo, "cv_buffer") - expect_equal(names(bloo), expect_names) - expect_equal(length(bloo$folds), nrow(pa_data)) - expect_type(bloo$folds, "list") - expect_type(bloo$k, "integer") - expect_type(bloo$column, "character") - expect_type(bloo$size, "double") - expect_equal(bloo$presence_bg, FALSE) - expect_equal(dim(bloo$records), c(nrow(pa_data), 4)) - expect_true(!all(bloo$records == 0)) - -}) + bloo <- cv_buffer( + x = pa_data, + column = "occ", + size = 250000, + presence_bg = FALSE, + progress = TRUE + ) + + expect_true(exists("bloo")) + expect_s3_class(bloo, "cv_buffer") + expect_equal(names(bloo), expect_names) + expect_equal(length(bloo$folds), nrow(pa_data)) + expect_type(bloo$folds, "list") + expect_type(bloo$k, "integer") + expect_type(bloo$column, "character") + expect_type(bloo$size, "double") + expect_equal(bloo$presence_bg, FALSE) + expect_equal(dim(bloo$records), c(nrow(pa_data), 4)) + expect_true(!all(bloo$records == 0)) + + }) test_that("test that cv_buffer function works properly with presence-background data", { - bloo <- cv_buffer( - x = pa_data, - column = "occ", - size = 250000, - presence_bg = TRUE, - progress = FALSE - ) - - expect_true(exists("bloo")) - expect_s3_class(bloo, "cv_buffer") - expect_equal(names(bloo), expect_names) - expect_equal(length(bloo$folds), sum(pa_data$occ)) - expect_type(bloo$folds, "list") - expect_type(bloo$k, "integer") - expect_type(bloo$column, "character") - expect_type(bloo$size, "double") - expect_equal(bloo$presence_bg, TRUE) - expect_equal(dim(bloo$records), c(sum(pa_data$occ), 4)) - expect_true(!all(bloo$records == 0)) - -}) + bloo <- cv_buffer( + x = pa_data, + column = "occ", + size = 250000, + presence_bg = TRUE, + progress = FALSE + ) + + expect_true(exists("bloo")) + expect_s3_class(bloo, "cv_buffer") + expect_equal(names(bloo), expect_names) + expect_equal(length(bloo$folds), sum(pa_data$occ)) + expect_type(bloo$folds, "list") + expect_type(bloo$k, "integer") + expect_type(bloo$column, "character") + expect_type(bloo$size, "double") + expect_equal(bloo$presence_bg, TRUE) + expect_equal(dim(bloo$records), c(sum(pa_data$occ), 4)) + expect_true(!all(bloo$records == 0)) + + }) test_that("test that cv_buffer function works properly with no species specified", { - bloo <- cv_buffer( - x = pa_data, - size = 250000 - ) - - expect_true(exists("bloo")) - expect_s3_class(bloo, "cv_buffer") - expect_equal(names(bloo), expect_names) - expect_equal(length(bloo$folds), nrow(pa_data)) - expect_type(bloo$folds, "list") - expect_type(bloo$k, "integer") - expect_null(bloo$species) - expect_type(bloo$size, "double") - expect_equal(bloo$presence_bg, FALSE) - expect_equal(dim(bloo$records), c(nrow(pa_data), 2)) - expect_true(!all(bloo$records == 0)) - - expect_equal(print(bloo), "cv_buffer") - expect_output(summary(bloo)) + bloo <- cv_buffer( + x = pa_data, + size = 250000 + ) -}) + expect_true(exists("bloo")) + expect_s3_class(bloo, "cv_buffer") + expect_equal(names(bloo), expect_names) + expect_equal(length(bloo$folds), nrow(pa_data)) + expect_type(bloo$folds, "list") + expect_type(bloo$k, "integer") + expect_null(bloo$species) + expect_type(bloo$size, "double") + expect_equal(bloo$presence_bg, FALSE) + expect_equal(dim(bloo$records), c(nrow(pa_data), 2)) + expect_true(!all(bloo$records == 0)) + + expect_equal(print(bloo), "cv_buffer") + expect_output(summary(bloo)) + + }) test_that("test cv_buffer function with no matching species column", { - expect_warning( - bloo <- cv_buffer( - x = pa_data, - column = "response", - size = 250000, - presence_bg = FALSE, - progress = TRUE + expect_warning( + bloo <- cv_buffer( + x = pa_data, + column = "response", + size = 250000, + presence_bg = FALSE, + progress = TRUE + ) ) - ) - expect_true(exists("bloo")) - expect_equal(dim(bloo$records), c(nrow(pa_data), 2)) - expect_true(!all(bloo$records == 0)) + expect_true(exists("bloo")) + expect_equal(dim(bloo$records), c(nrow(pa_data), 2)) + expect_true(!all(bloo$records == 0)) }) test_that("test cv_buffer function with no spatial column data", { - expect_error( - cv_buffer( - x = "pa_data", - size = 250000 + expect_error( + cv_buffer( + x = "pa_data", + size = 250000 + ) ) - ) }) test_that("test cv_buffer function to have sptial points with no CRS", { - sf::st_crs(pa_data) <- NA - - expect_error( - cv_buffer( - x = pa_data, - column = "occ", - size = 250000 + sf::st_crs(pa_data) <- NA + + expect_error( + cv_buffer( + x = pa_data, + column = "occ", + size = 250000 + ) ) - ) }) diff --git a/tests/testthat/test-cv_cluster.R b/tests/testthat/test-cv_cluster.R index 99091af..f969dce 100644 --- a/tests/testthat/test-cv_cluster.R +++ b/tests/testthat/test-cv_cluster.R @@ -1,119 +1,109 @@ -expect_names <- c( - "folds_list", - "folds_ids", - "biomod_table", - "k", - "column", - "type", - "records" -) +expect_names <- c("folds_list", + "folds_ids", + "biomod_table", + "k", + "column", + "type", + "records") aus <- system.file("extdata/au/", package = "blockCV") |> - list.files(full.names = TRUE) |> - terra::rast() + list.files(full.names = TRUE) |> + terra::rast() pa_data <- read.csv(system.file("extdata/", "species.csv", package = "blockCV")) |> - sf::st_as_sf(coords = c("x", "y"), crs = 7845) + sf::st_as_sf(coords = c("x", "y"), crs = 7845) test_that("test that environmental cluster function with raster_cluster", { - - # environmental clustering - set.seed(42) - eb <- cv_cluster(x = pa_data, - column = "occ", - r = aus, - k = 3, - scale = TRUE, - raster_cluster = TRUE) - - expect_true(exists("eb")) - expect_s3_class(eb, "cv_cluster") - expect_equal(names(eb), expect_names) - expect_equal(length(eb$folds_list), 3) - expect_type(eb$folds_list, "list") - expect_equal(eb$k, 3) - expect_type(eb$column, "character") - expect_equal(dim(eb$records), c(3, 4)) - expect_true( - !all(eb$records == 0) - ) + # environmental clustering + set.seed(42) + eb <- cv_cluster( + x = pa_data, + column = "occ", + r = aus, + k = 3, + scale = TRUE, + raster_cluster = TRUE + ) + + expect_true(exists("eb")) + expect_s3_class(eb, "cv_cluster") + expect_equal(names(eb), expect_names) + expect_equal(length(eb$folds_list), 3) + expect_type(eb$folds_list, "list") + expect_equal(eb$k, 3) + expect_type(eb$column, "character") + expect_equal(dim(eb$records), c(3, 4)) + expect_true(!all(eb$records == 0)) }) test_that("test that spacial cluster function with no column", { - - # spatial clustering - set.seed(42) - eb <- cv_cluster(x = pa_data, - k = 5, - biomod2 = FALSE) - - expect_true(exists("eb")) - expect_s3_class(eb, "cv_cluster") - expect_equal(names(eb), expect_names) - expect_equal(length(eb$folds_list), 5) - expect_type(eb$folds_list, "list") - expect_equal(eb$k, 5) - expect_null(eb$column) - expect_equal(dim(eb$records), c(5, 2)) - expect_true( - !all(eb$records == 0) - ) - - expect_equal(print(eb), "cv_cluster") - expect_output(summary(eb)) + # spatial clustering + set.seed(42) + eb <- cv_cluster(x = pa_data, k = 5, biomod2 = FALSE) + + expect_true(exists("eb")) + expect_s3_class(eb, "cv_cluster") + expect_equal(names(eb), expect_names) + expect_equal(length(eb$folds_list), 5) + expect_type(eb$folds_list, "list") + expect_equal(eb$k, 5) + expect_null(eb$column) + expect_equal(dim(eb$records), c(5, 2)) + expect_true(!all(eb$records == 0)) + + expect_equal(print(eb), "cv_cluster") + expect_output(summary(eb)) }) test_that("test that environmental cluster with no scale and wrong column", { - - set.seed(42) - expect_warning( - eb <- cv_cluster(x = pa_data, - column = "response", # wrong column name - r = aus, - k = 5, - scale = FALSE, - raster_cluster = TRUE, - algorithm = "MacQueen") - ) - - expect_true(exists("eb")) - expect_s3_class(eb, "cv_cluster") - expect_equal(names(eb), expect_names) - expect_equal(length(eb$folds_list), 5) - expect_type(eb$folds_list, "list") - expect_equal(eb$k, 5) - expect_equal(dim(eb$records), c(5, 2)) - expect_true( - !all(eb$records == 0) - ) + set.seed(42) + expect_warning( + eb <- cv_cluster( + x = pa_data, + column = "response", + # wrong column name + r = aus, + k = 5, + scale = FALSE, + raster_cluster = TRUE, + algorithm = "MacQueen" + ) + ) + + expect_true(exists("eb")) + expect_s3_class(eb, "cv_cluster") + expect_equal(names(eb), expect_names) + expect_equal(length(eb$folds_list), 5) + expect_type(eb$folds_list, "list") + expect_equal(eb$k, 5) + expect_equal(dim(eb$records), c(5, 2)) + expect_true(!all(eb$records == 0)) }) test_that("test environmental cluster with no spatial or sf object", { - - expect_error( - cv_cluster(x = aus, # no spatial or sf object - r = aus, - k = 5, - scale = TRUE, - raster_cluster = FALSE) - - ) + expect_error(cv_cluster( + x = aus, + # no spatial or sf object + r = aus, + k = 5, + scale = TRUE, + raster_cluster = FALSE + )) }) test_that("test environmental cluster with no raster data", { - expect_error( - cv_cluster(r = data.frame(x = 1), # no raster data - x = pa_data) - - ) + expect_error(cv_cluster( + r = data.frame(x = 1), + # no raster data + x = pa_data + )) }) - diff --git a/tests/testthat/test-cv_nndm.R b/tests/testthat/test-cv_nndm.R index 35d9058..dd33f57 100644 --- a/tests/testthat/test-cv_nndm.R +++ b/tests/testthat/test-cv_nndm.R @@ -10,131 +10,131 @@ expect_names <- c( aus <- system.file("extdata/au/", package = "blockCV") |> - list.files(full.names = TRUE) |> - terra::rast() + list.files(full.names = TRUE) |> + terra::rast() pa_data <- read.csv(system.file("extdata/", "species.csv", package = "blockCV")) |> - sf::st_as_sf(coords = c("x", "y"), crs = 7845) + sf::st_as_sf(coords = c("x", "y"), crs = 7845) pa_data <- pa_data[1:100, ] test_that("test that cv_nndm function works properly with presence-absence data", { - bloo <- cv_nndm( - x = pa_data, - column = "occ", - r = aus, - size = 250000, - num_sample = 3000, - presence_bg = FALSE - ) - - expect_true(exists("bloo")) - expect_s3_class(bloo, "cv_nndm") - expect_equal(names(bloo), expect_names) - expect_equal(length(bloo$folds), nrow(pa_data)) - expect_type(bloo$folds, "list") - expect_type(bloo$k, "integer") - expect_type(bloo$column, "character") - expect_type(bloo$size, "double") - expect_equal(bloo$presence_bg, FALSE) - expect_equal(dim(bloo$records), c(nrow(pa_data), 4)) - expect_true(!all(bloo$records == 0)) + bloo <- cv_nndm( + x = pa_data, + column = "occ", + r = aus, + size = 250000, + num_sample = 3000, + presence_bg = FALSE + ) + + expect_true(exists("bloo")) + expect_s3_class(bloo, "cv_nndm") + expect_equal(names(bloo), expect_names) + expect_equal(length(bloo$folds), nrow(pa_data)) + expect_type(bloo$folds, "list") + expect_type(bloo$k, "integer") + expect_type(bloo$column, "character") + expect_type(bloo$size, "double") + expect_equal(bloo$presence_bg, FALSE) + expect_equal(dim(bloo$records), c(nrow(pa_data), 4)) + expect_true(!all(bloo$records == 0)) }) test_that("test that cv_nndm function works properly with presence-background data", { - bloo <- cv_nndm( - x = pa_data, - column = "occ", - r = aus, - size = 250000, - sampling = "regular", - presence_bg = TRUE - ) - - expect_true(exists("bloo")) - expect_s3_class(bloo, "cv_nndm") - expect_equal(names(bloo), expect_names) - expect_equal(length(bloo$folds), sum(pa_data$occ)) - expect_type(bloo$folds, "list") - expect_type(bloo$k, "integer") - expect_type(bloo$column, "character") - expect_type(bloo$size, "double") - expect_equal(bloo$presence_bg, TRUE) - expect_equal(dim(bloo$records), c(sum(pa_data$occ), 4)) - expect_true(!all(bloo$records == 0)) + bloo <- cv_nndm( + x = pa_data, + column = "occ", + r = aus, + size = 250000, + sampling = "regular", + presence_bg = TRUE + ) + + expect_true(exists("bloo")) + expect_s3_class(bloo, "cv_nndm") + expect_equal(names(bloo), expect_names) + expect_equal(length(bloo$folds), sum(pa_data$occ)) + expect_type(bloo$folds, "list") + expect_type(bloo$k, "integer") + expect_type(bloo$column, "character") + expect_type(bloo$size, "double") + expect_equal(bloo$presence_bg, TRUE) + expect_equal(dim(bloo$records), c(sum(pa_data$occ), 4)) + expect_true(!all(bloo$records == 0)) }) test_that("test that cv_nndm function works properly with no species specified", { - bloo <- cv_nndm( - x = sf::as_Spatial(pa_data), - r = aus, - size = 250000, - num_sample = 3000, - plot = FALSE - ) - - expect_true(exists("bloo")) - expect_s3_class(bloo, "cv_nndm") - expect_equal(names(bloo), expect_names) - expect_equal(length(bloo$folds), nrow(pa_data)) - expect_type(bloo$folds, "list") - expect_type(bloo$k, "integer") - expect_null(bloo$species) - expect_type(bloo$size, "double") - expect_equal(bloo$presence_bg, FALSE) - expect_equal(dim(bloo$records), c(nrow(pa_data), 2)) - expect_true(!all(bloo$records == 0)) - - expect_equal(print(bloo), "cv_nndm") - expect_output(summary(bloo)) + bloo <- cv_nndm( + x = sf::as_Spatial(pa_data), + r = aus, + size = 250000, + num_sample = 3000, + plot = FALSE + ) + + expect_true(exists("bloo")) + expect_s3_class(bloo, "cv_nndm") + expect_equal(names(bloo), expect_names) + expect_equal(length(bloo$folds), nrow(pa_data)) + expect_type(bloo$folds, "list") + expect_type(bloo$k, "integer") + expect_null(bloo$species) + expect_type(bloo$size, "double") + expect_equal(bloo$presence_bg, FALSE) + expect_equal(dim(bloo$records), c(nrow(pa_data), 2)) + expect_true(!all(bloo$records == 0)) + + expect_equal(print(bloo), "cv_nndm") + expect_output(summary(bloo)) }) test_that("test cv_nndm function with no matching species column", { - expect_warning( - bloo <- cv_nndm( - x = pa_data, - column = "response", - r = aus, - size = 250000, - num_sample = 3000, - presence_bg = FALSE + expect_warning( + bloo <- cv_nndm( + x = pa_data, + column = "response", + r = aus, + size = 250000, + num_sample = 3000, + presence_bg = FALSE + ) ) - ) - expect_true(exists("bloo")) - expect_equal(dim(bloo$records), c(nrow(pa_data), 2)) - expect_true(!all(bloo$records == 0)) + expect_true(exists("bloo")) + expect_equal(dim(bloo$records), c(nrow(pa_data), 2)) + expect_true(!all(bloo$records == 0)) }) test_that("test cv_nndm function with no spatial column data", { - expect_error( - cv_nndm( - x = "pa_data", - r = aus, - size = 250000 + expect_error( + cv_nndm( + x = "pa_data", + r = aus, + size = 250000 + ) ) - ) }) test_that("test cv_nndm function to have sptial points with no CRS", { - sf::st_crs(pa_data) <- NA - - expect_error( - cv_nndm( - x = pa_data, - column = "occ", - r = aus, - size = 250000 + sf::st_crs(pa_data) <- NA + + expect_error( + cv_nndm( + x = pa_data, + column = "occ", + r = aus, + size = 250000 + ) ) - ) }) diff --git a/tests/testthat/test-cv_plot.R b/tests/testthat/test-cv_plot.R index addcf45..e2c300d 100644 --- a/tests/testthat/test-cv_plot.R +++ b/tests/testthat/test-cv_plot.R @@ -1,21 +1,22 @@ pa_data <- read.csv(system.file("extdata/", "species.csv", package = "blockCV")) |> - sf::st_as_sf(coords = c("x", "y"), crs = 7845) + sf::st_as_sf(coords = c("x", "y"), crs = 7845) -test_that("test that cv_plot function works", - { - scv <- cv_spatial(x = pa_data, - size = 450000, - k = 5, - selection = "random", - iteration = 1, - biomod2 = FALSE, - plot = FALSE, - progress = FALSE) +test_that("test that cv_plot function works", { + scv <- cv_spatial( + x = pa_data, + size = 450000, + k = 5, + selection = "random", + iteration = 1, + biomod2 = FALSE, + plot = FALSE, + progress = FALSE + ) - plt <- cv_plot(cv = scv, x = pa_data) + plt <- cv_plot(cv = scv, x = pa_data) - expect_true(exists("plt")) - expect_true(ggplot2::is_ggplot(plt)) + expect_true(exists("plt")) + expect_true(ggplot2::is_ggplot(plt)) }) diff --git a/tests/testthat/test-cv_spatial.R b/tests/testthat/test-cv_spatial.R index a0f314f..d55a27a 100644 --- a/tests/testthat/test-cv_spatial.R +++ b/tests/testthat/test-cv_spatial.R @@ -8,226 +8,245 @@ expect_names <- c("folds_list", "records") aus <- system.file("extdata/au/", package = "blockCV") |> - list.files(full.names = TRUE) |> - terra::rast() + list.files(full.names = TRUE) |> + terra::rast() pa_data <- read.csv(system.file("extdata/", "species.csv", package = "blockCV")) |> - sf::st_as_sf(coords = c("x", "y"), crs = 7845) + sf::st_as_sf(coords = c("x", "y"), crs = 7845) pa_data <- pa_data[1:200, ] test_that("test cv_spatial function with random assingment and raster file", { - set.seed(1000) - scv <- cv_spatial(x = pa_data, - column = "occ", - r = aus, - size = 450000, - k = 5, - selection = "random", - iteration = 5, - biomod2 = TRUE, - offset = c(0.3, 0.2), - plot = FALSE, - progress = TRUE) - - expect_true(exists("scv")) - expect_s3_class(scv, "cv_spatial") - expect_equal(names(scv), expect_names) - expect_equal(length(scv$folds_list), 5) - expect_type(scv$folds_list, "list") - expect_type(scv$biomod_table, "logical") - expect_equal(dim(scv$biomod_table), c(nrow(pa_data), 5)) - expect_equal(scv$k, 5) - expect_s3_class(scv$blocks, "sf") - expect_type(scv$column, "character") - expect_type(scv$size, "double") - expect_equal(dim(scv$records), c(5, 4)) - expect_true( - !all(scv$records == 0) - ) + set.seed(1000) + scv <- cv_spatial( + x = pa_data, + column = "occ", + r = aus, + size = 450000, + k = 5, + selection = "random", + iteration = 5, + biomod2 = TRUE, + offset = c(0.3, 0.2), + plot = FALSE, + progress = TRUE + ) + + expect_true(exists("scv")) + expect_s3_class(scv, "cv_spatial") + expect_equal(names(scv), expect_names) + expect_equal(length(scv$folds_list), 5) + expect_type(scv$folds_list, "list") + expect_type(scv$biomod_table, "logical") + expect_equal(dim(scv$biomod_table), c(nrow(pa_data), 5)) + expect_equal(scv$k, 5) + expect_s3_class(scv$blocks, "sf") + expect_type(scv$column, "character") + expect_type(scv$size, "double") + expect_equal(dim(scv$records), c(5, 4)) + expect_true( + !all(scv$records == 0) + ) }) test_that("test cv_spatial function with systematic assingment and no raster file", { - scv <- cv_spatial(x = sf::as_Spatial(pa_data), - rows_cols = c(10, 10), - k = 5, - selection = "systematic", - biomod2 = FALSE, - plot = TRUE) - - expect_true(exists("scv")) - expect_s3_class(scv, "cv_spatial") - expect_equal(names(scv), expect_names) - expect_equal(length(scv$folds_list), 5) - expect_type(scv$folds_list, "list") - expect_null(scv$biomod_table, "matrix") - expect_equal(scv$k, 5) - expect_s3_class(scv$blocks, "sf") - expect_null(scv$column) - expect_null(scv$size) - expect_equal(dim(scv$records), c(5, 2)) - expect_true( - !all(scv$records == 0) - ) - - expect_equal(print(scv), "cv_spatial") - expect_message(plot(scv)) - expect_output(summary(scv)) + scv <- cv_spatial( + x = sf::as_Spatial(pa_data), + rows_cols = c(10, 10), + k = 5, + selection = "systematic", + biomod2 = FALSE, + plot = TRUE + ) + + expect_true(exists("scv")) + expect_s3_class(scv, "cv_spatial") + expect_equal(names(scv), expect_names) + expect_equal(length(scv$folds_list), 5) + expect_type(scv$folds_list, "list") + expect_null(scv$biomod_table, "matrix") + expect_equal(scv$k, 5) + expect_s3_class(scv$blocks, "sf") + expect_null(scv$column) + expect_null(scv$size) + expect_equal(dim(scv$records), c(5, 2)) + expect_true( + !all(scv$records == 0) + ) + + expect_equal(print(scv), "cv_spatial") + expect_message(plot(scv)) + expect_output(summary(scv)) }) test_that("test cv_spatial function with non-numeric iteration", { - scv <- cv_spatial(x = pa_data, - hexagon = FALSE, - k = 5, - selection = "random", - iteration = "2", # not numeric - biomod2 = FALSE, - plot = FALSE) - - expect_true(exists("scv")) - expect_s3_class(scv, "cv_spatial") - expect_equal(names(scv), expect_names) - expect_equal(length(scv$folds_list), 5) - expect_type(scv$folds_list, "list") - expect_null(scv$biomod_table, "matrix") - expect_equal(scv$k, 5) - expect_s3_class(scv$blocks, "sf") - expect_null(scv$column) - expect_null(scv$size) - expect_equal(dim(scv$records), c(5, 2)) - expect_true( - !all(scv$records == 0) - ) - - expect_equal(print(scv), "cv_spatial") - expect_message(plot(scv)) - expect_output(summary(scv)) + scv <- cv_spatial( + x = pa_data, + hexagon = FALSE, + k = 5, + selection = "random", + iteration = "2", # not numeric + biomod2 = FALSE, + plot = FALSE + ) + + expect_true(exists("scv")) + expect_s3_class(scv, "cv_spatial") + expect_equal(names(scv), expect_names) + expect_equal(length(scv$folds_list), 5) + expect_type(scv$folds_list, "list") + expect_null(scv$biomod_table, "matrix") + expect_equal(scv$k, 5) + expect_s3_class(scv$blocks, "sf") + expect_null(scv$column) + expect_null(scv$size) + expect_equal(dim(scv$records), c(5, 2)) + expect_true( + !all(scv$records == 0) + ) + + expect_equal(print(scv), "cv_spatial") + expect_message(plot(scv)) + expect_output(summary(scv)) }) test_that("test cv_spatial with checkerboard assingment and only row blocks", { - scv <- cv_spatial(x = sf::as_Spatial(pa_data), - hexagon = FALSE, - rows_cols = c(5, 1), - selection = "checkerboard", - biomod2 = FALSE, - plot = TRUE) - - expect_true(exists("scv")) - expect_s3_class(scv, "cv_spatial") - expect_equal(names(scv), expect_names) - expect_equal(length(scv$folds_list), 2) - expect_type(scv$folds_list, "list") - expect_null(scv$biomod_table) - expect_equal(scv$k, 2) - expect_s3_class(scv$blocks, "sf") - expect_null(scv$column) - expect_null(scv$size) - expect_equal(dim(scv$records), c(2, 2)) - expect_true( - !all(scv$records == 0) - ) - - expect_equal(print(scv), "cv_spatial") - expect_message(plot(scv)) - expect_output(summary(scv)) + scv <- cv_spatial( + x = sf::as_Spatial(pa_data), + hexagon = FALSE, + rows_cols = c(5, 1), + selection = "checkerboard", + biomod2 = FALSE, + plot = TRUE + ) + + expect_true(exists("scv")) + expect_s3_class(scv, "cv_spatial") + expect_equal(names(scv), expect_names) + expect_equal(length(scv$folds_list), 2) + expect_type(scv$folds_list, "list") + expect_null(scv$biomod_table) + expect_equal(scv$k, 2) + expect_s3_class(scv$blocks, "sf") + expect_null(scv$column) + expect_null(scv$size) + expect_equal(dim(scv$records), c(2, 2)) + expect_true( + !all(scv$records == 0) + ) + + expect_equal(print(scv), "cv_spatial") + expect_message(plot(scv)) + expect_output(summary(scv)) }) test_that("test cv_spatial with user-defined blocks", { - # make a spatial block polygon - user_poly <- .make_blocks(x_obj = pa_data, blocksize = 450000) - - scv <- cv_spatial(x = pa_data, - user_blocks = user_poly, - selection = "random", - iteration = 5, - biomod2 = FALSE, - progress = FALSE, - plot = FALSE) - - expect_true(exists("scv")) - expect_s3_class(scv, "cv_spatial") - expect_equal(names(scv), expect_names) - expect_equal(length(scv$folds_list), 5) - expect_type(scv$folds_list, "list") - expect_null(scv$biomod_table) - expect_equal(scv$k, 5) - expect_s3_class(scv$blocks, "sf") - expect_null(scv$column) - expect_null(scv$size) - expect_equal(dim(scv$records), c(5, 2)) - expect_true( - !all(scv$records == 0) - ) - - expect_equal(print(scv), "cv_spatial") - expect_message(plot(scv)) - expect_output(summary(scv)) + # make a spatial block polygon + user_poly <- .make_blocks(x_obj = pa_data, blocksize = 450000) + + scv <- cv_spatial( + x = pa_data, + user_blocks = user_poly, + selection = "random", + iteration = 5, + biomod2 = FALSE, + progress = FALSE, + plot = FALSE + ) + + expect_true(exists("scv")) + expect_s3_class(scv, "cv_spatial") + expect_equal(names(scv), expect_names) + expect_equal(length(scv$folds_list), 5) + expect_type(scv$folds_list, "list") + expect_null(scv$biomod_table) + expect_equal(scv$k, 5) + expect_s3_class(scv$blocks, "sf") + expect_null(scv$column) + expect_null(scv$size) + expect_equal(dim(scv$records), c(5, 2)) + expect_true( + !all(scv$records == 0) + ) + + expect_equal(print(scv), "cv_spatial") + expect_message(plot(scv)) + expect_output(summary(scv)) }) test_that("test cv_spatial failur: number of blocks, wrong selection", { - expect_error(cv_spatial(x = pa_data, - rows_cols = c(2, 2), - hexagon = FALSE, - k = 15, # very high k - selection = "random")) + expect_error(cv_spatial( + x = pa_data, + rows_cols = c(2, 2), + hexagon = FALSE, + k = 15, + # very high k + selection = "random" + )) - expect_error(cv_spatial(x = pa_data, - k = 1, # very low k - selection = "random")) + expect_error(cv_spatial( + x = pa_data, + k = 1, + # very low k + selection = "random" + )) - expect_error(cv_spatial(x = pa_data, - k = 5, - selection = "chance")) # wrong selection + expect_error(cv_spatial(x = pa_data, k = 5, selection = "chance")) # wrong selection }) test_that("test cv_spatial failur: wrong user-defined blocks", { - expect_error( - cv_spatial(x = pa_data, # wrong user-defined blocks - column = "occ", - r = aus, - user_blocks = data.frame(a = 1), - size = 450000, - k = 5) - ) + expect_error( + cv_spatial( + x = pa_data, + # wrong user-defined blocks + column = "occ", + r = aus, + user_blocks = data.frame(a = 1), + size = 450000, + k = 5 + ) + ) }) test_that("test cv_spatial with no speceis column match", { - expect_warning( - scv <- cv_spatial(x = pa_data, - column = "response", # wrong response - r = aus, - size = 450000, - k = 5) - ) - - expect_true(exists("scv")) - expect_s3_class(scv, "cv_spatial") - expect_equal(names(scv), expect_names) - expect_equal(length(scv$folds_list), 5) - expect_equal(scv$k, 5) - expect_equal(dim(scv$records), c(5, 2)) - expect_true( - !all(scv$records == 0) - ) + + expect_warning(scv <- cv_spatial( + x = pa_data, + column = "response", + # wrong response + r = aus, + size = 450000, + k = 5 + )) + + expect_true(exists("scv")) + expect_s3_class(scv, "cv_spatial") + expect_equal(names(scv), expect_names) + expect_equal(length(scv$folds_list), 5) + expect_equal(scv$k, 5) + expect_equal(dim(scv$records), c(5, 2)) + expect_true( + !all(scv$records == 0) + ) }) diff --git a/tests/testthat/test-v2-functions.R b/tests/testthat/test-v2-functions.R index 1310efb..843f8c7 100644 --- a/tests/testthat/test-v2-functions.R +++ b/tests/testthat/test-v2-functions.R @@ -20,9 +20,11 @@ expect_names <- c( test_that("test spatialAutoRange function works", { # skip_on_cran() - range1 <- spatialAutoRange(rasterLayer = aus, - sampleNumber = 1000, - showPlots = TRUE) + range1 <- spatialAutoRange( + rasterLayer = aus, + sampleNumber = 1000, + showPlots = TRUE + ) expect_true(exists("range1")) expect_s3_class(range1, "SpatialAutoRange") @@ -52,10 +54,12 @@ test_that("test spatialAutoRange function works", { test_that("test spatialAutoRange function with x", { # skip_on_cran() - range3 <- spatialAutoRange(rasterLayer = aus, - speciesData = pa_data, - progress = FALSE, - showPlots = TRUE) + range3 <- spatialAutoRange( + rasterLayer = aus, + speciesData = pa_data, + progress = FALSE, + showPlots = TRUE + ) expect_true(exists("range3")) expect_s3_class(range3, "SpatialAutoRange") @@ -89,16 +93,18 @@ test_that("test spatiaBlock function works", { # skip_on_cran() set.seed(1000) - sb1 <- spatialBlock(speciesData = pa_data, - species = "occ", - rasterLayer = aus, - theRange = 450000, - k = 5, - selection = "random", - iteration = 5, - biomod2Format = TRUE, - showBlocks = TRUE, - progress = TRUE) + sb1 <- spatialBlock( + speciesData = pa_data, + species = "occ", + rasterLayer = aus, + theRange = 450000, + k = 5, + selection = "random", + iteration = 5, + biomod2Format = TRUE, + showBlocks = TRUE, + progress = TRUE + ) expect_true(exists("sb1")) expect_s3_class(sb1, "SpatialBlock") @@ -137,11 +143,13 @@ expect_names <- c( test_that("test buffering function works", { # skip_on_cran() - bf1 <- buffering(speciesData= pa_data, - species= "occ", - theRange= 450000, - spDataType = "PA", - progress = TRUE) + bf1 <- buffering( + speciesData= pa_data, + species= "occ", + theRange= 450000, + spDataType = "PA", + progress = TRUE + ) expect_true(exists("bf1")) expect_s3_class(bf1, "BufferedBlock") @@ -176,12 +184,14 @@ expect_names <- c( test_that("test that environmental blocking function works", { # skip_on_cran() - eb <- envBlock(rasterLayer = aus, - speciesData = pa_data, - species = "occ", - k = 5, - standardization = "standard", - rasterBlock = FALSE) + eb <- envBlock( + rasterLayer = aus, + speciesData = pa_data, + species = "occ", + k = 5, + standardization = "standard", + rasterBlock = FALSE + ) expect_true(exists("eb")) expect_s3_class(eb, "EnvironmentalBlock") From d653027fddb9e22071bc9268aec748fd4276c49a Mon Sep 17 00:00:00 2001 From: Roozbeh Valavi Date: Thu, 14 Aug 2025 17:11:57 +1000 Subject: [PATCH 8/9] some updates --- .github/workflows/R-CMD-check.yml | 4 ++-- src/similarity.cpp | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/R-CMD-check.yml b/.github/workflows/R-CMD-check.yml index 06a0b9f..0a8e044 100644 --- a/.github/workflows/R-CMD-check.yml +++ b/.github/workflows/R-CMD-check.yml @@ -8,9 +8,9 @@ name: R-CMD-check jobs: R-CMD-check: - runs-on: macOS-latest + runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: r-lib/actions/setup-r@v2 - name: Setting up pandoc for Rmd docs uses: r-lib/actions/setup-pandoc@v2 diff --git a/src/similarity.cpp b/src/similarity.cpp index 20228ca..a01d06b 100644 --- a/src/similarity.cpp +++ b/src/similarity.cpp @@ -4,6 +4,7 @@ #include #include "Lightweight_matrix.hpp" + // a function to calculate L1 or L2 distances inline double distance(Lightweight_matrix &x, Lightweight_matrix &y, int i, int ii, bool L1) { From 4afe59d743ef69f65b616a183f0b0c48639d87a5 Mon Sep 17 00:00:00 2001 From: Roozbeh Valavi Date: Fri, 15 Aug 2025 17:17:05 +1000 Subject: [PATCH 9/9] update vignettes --- DESCRIPTION | 2 +- NEWS.md | 6 +- R/cv_similarity.R | 12 +- R/utils.R | 3 +- inst/doc/tutorial_1.R | 4 +- inst/doc/tutorial_1.Rmd | 7 +- inst/doc/tutorial_1.html | 42 ++-- inst/doc/tutorial_2.html | 4 +- man/cv_similarity.Rd | 11 +- ...weight_matrix.hpp => Lightweight_matrix.h} | 0 src/nndm.cpp | 200 +++++++++--------- src/similarity.cpp | 4 +- vignettes/tutorial_1.Rmd | 7 +- 13 files changed, 153 insertions(+), 149 deletions(-) rename src/{Lightweight_matrix.hpp => Lightweight_matrix.h} (100%) diff --git a/DESCRIPTION b/DESCRIPTION index 3a82d70..b67b315 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,7 +2,7 @@ Package: blockCV Type: Package Title: Spatial and Environmental Blocking for K-Fold and LOO Cross-Validation Version: 3.2-0 -Date: 2025-08-14 +Date: 2025-08-15 Authors@R: c(person("Roozbeh", "Valavi", role = c("aut", "cre"), email = "valavi.r@gmail.com", comment = c(ORCID = "0000-0003-2495-5277")), person("Jane", "Elith", role = "aut", diff --git a/NEWS.md b/NEWS.md index c4355ab..6b2373d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,17 +1,17 @@ # version 3.2.0 * Added two new methods, L1 and L2 distances, to the `cv_similarity` function * Fixed a warning in `cv_similarity` for colour aesthetics with ggplot -* Fixed the summary method for `cv_spatial_autocor` +* Fixed the summary method and plotting for `cv_spatial_autocor` # version 3.1.7 -* Temporarily added `sp` package dependency to avoid CRAN check as required by `automap` package [#55]. +* Temporarily added `sp` package dependency to avoid CRAN error as required by `automap` package [#55]. # version 3.1.6 * Resolved unclear error messages; issue [#52](https://github.com/rvalavi/blockCV/issues/52) by A. Márcia Barbosa * Resolved ggplot testing failure; issue [#54](https://github.com/rvalavi/blockCV/issues/54) by Teun van den Brand # version 3.1.5 -* Resolved background pattern artifacts in raster plotting; issue [#50](https://github.com/rvalavi/blockCV/issues/50) by Camila Neder. +* Resolved background pattern artefacts in raster plotting; issue [#50](https://github.com/rvalavi/blockCV/issues/50) by Camila Neder. # version 3.1.4 * Just the `biomod2` example is updated in vignettes; and the link in help file diff --git a/R/cv_similarity.R b/R/cv_similarity.R index 237748a..432e69d 100644 --- a/R/cv_similarity.R +++ b/R/cv_similarity.R @@ -1,13 +1,9 @@ #' Compute similarity measures to evaluate possible extrapolation in testing folds #' -#' This function supports three similarity measurement methods: -#' Multivariate Environmental Similarity Surface (MESS), Manhattan distance (L1), -#' and Euclidean distance (L2). It is primarily designed for evaluating the similarity -#' between training and test datasets in environmental or spatial modeling applications. -#' -#' -#' The function support three method of similarity measures, the Multivariate Environmental -#' Similarity Surface (MESS), Manhattan distance (L1), and Euclidean distance (L2). +#' This function evaluates environmental similarity between training and testing folds, +#' helping to detect potential extrapolation in the testing data. It supports three +#' similarity measures: Multivariate Environmental Similarity Surface (MESS), Manhattan +#' distance (L1), and Euclidean distance (L2). #' #' The MESS is calculated as described in Elith et al. (2010). MESS represents #' how similar a point in a testing fold is to a training fold (as a reference diff --git a/R/utils.R b/R/utils.R index 0fed7ac..7e2e14e 100644 --- a/R/utils.R +++ b/R/utils.R @@ -5,10 +5,11 @@ inherits(x, c("cv_buffer", "cv_nndm")) } + # count the train and test records .table_summary <- function(fold_list, x, column, n){ if(is.null(column)){ - tt_count <- base::data.frame(train = rep(0, n), test = 0) + tt_count <- data.frame(train = rep(0, n), test = 0) for(i in seq_len(n)){ train_set <- fold_list[[i]][[1]] test_set <- fold_list[[i]][[2]] diff --git a/inst/doc/tutorial_1.R b/inst/doc/tutorial_1.R index b0b07c7..1aee5aa 100644 --- a/inst/doc/tutorial_1.R +++ b/inst/doc/tutorial_1.R @@ -153,12 +153,12 @@ cv_plot(cv = sb1, cv_similarity(cv = ecv, # the environmental clustering x = pa_data, r = rasters, + method = "MESS", progress = FALSE) ## ----results='hide', fig.keep='all', warning=FALSE, message=FALSE, fig.height=5, fig.width=7.2---- -sac1 <- cv_spatial_autocor(r = rasters, - num_sample = 5000) +sac1 <- cv_spatial_autocor(r = rasters, num_sample = 5000) ## ----------------------------------------------------------------------------- diff --git a/inst/doc/tutorial_1.Rmd b/inst/doc/tutorial_1.Rmd index 6037169..46e1ac1 100644 --- a/inst/doc/tutorial_1.Rmd +++ b/inst/doc/tutorial_1.Rmd @@ -296,12 +296,14 @@ cv_plot(cv = sb1, ## Check similarity -The `cv_similarity` function can check for environmental similarity between the training and testing folds and thus possible extrapolation in the testing folds. It computes multivariate environmental similarity surface (MESS) as described in Elith et al. (2010). MESS represents how similar a point in a testing fold is to a training fold (as a reference set of points), with respect to a set of predictor variables in `r`. The negative values are the sites where at least one variable has a value that is outside the range of environments over the reference set, so these are novel environments. +The `cv_similarity` function evaluates environmental similarity between training and testing folds, helping to detect potential extrapolation in the testing data. It supports three similarity measures: Multivariate Environmental Similarity Surface (MESS), Manhattan distance (L1), and Euclidean distance (L2). Negative values indicate potential environmental novelty. + ```{r fig.height=4, fig.width=6} cv_similarity(cv = ecv, # the environmental clustering x = pa_data, r = rasters, + method = "MESS", progress = FALSE) ``` @@ -313,8 +315,7 @@ To support a first choice of block size, prior to any model fitting, package `bl When only `r` is supplied, the `cv_spatial_autocor` function works by automatically fitting variograms to each continuous raster and finding the effective range of spatial autocorrelation. Variogram is a fundamental geostatistical tool for measuring spatial autocorrelation (O’Sullivan and Unwin, 2010). ```{r, results='hide', fig.keep='all', warning=FALSE, message=FALSE, fig.height=5, fig.width=7.2} -sac1 <- cv_spatial_autocor(r = rasters, - num_sample = 5000) +sac1 <- cv_spatial_autocor(r = rasters, num_sample = 5000) ``` diff --git a/inst/doc/tutorial_1.html b/inst/doc/tutorial_1.html index 4571bda..999cc3b 100644 --- a/inst/doc/tutorial_1.html +++ b/inst/doc/tutorial_1.html @@ -12,7 +12,7 @@ - + 1. blockCV introduction: how to create block cross-validation folds @@ -342,7 +342,7 @@

1. blockCV introduction: how to create block cross-validation folds

Roozbeh Valavi, Jane Elith, José Lahoz-Monfort and Gurutzeta Guillera-Arroita

-

2025-08-09

+

2025-08-15

@@ -524,7 +524,7 @@

Spatial blocks

selection = "random", # random blocks-to-fold iteration = 50, # find evenly dispersed folds biomod2 = TRUE) # also create folds for biomod2
-

+

The output object is an R S3 object and you can get its elements by a $. Explore sb1$folds_ids, sb1$folds_list, and sb1$biomod_table for the @@ -549,12 +549,12 @@

Spatial blocks

biomod2 = TRUE)
## 
 ##   train_0 train_1 test_0 test_1
-## 1     207     201     50     42
-## 2     197     208     60     35
-## 3     198     213     59     30
-## 4     209     198     48     45
-## 5     217     152     40     91
-

+## 1 189 187 68 56 +## 2 197 172 60 71 +## 3 230 210 27 33 +## 4 216 202 41 41 +## 5 196 201 61 42
+

The assignment of folds to each block can also be done in a systematic manner using selection = "systematic", or a checkerboard pattern using selection = "checkerboard". The @@ -730,24 +730,21 @@

Visualising the folds

r = rasters, raster_colors = terrain.colors(10, alpha = 0.5), label_size = 4) -

+

Check similarity

-

The cv_similarity function can check for environmental -similarity between the training and testing folds and thus possible -extrapolation in the testing folds. It computes multivariate -environmental similarity surface (MESS) as described in Elith et -al. (2010). MESS represents how similar a point in a testing fold is to -a training fold (as a reference set of points), with respect to a set of -predictor variables in r. The negative values are the sites -where at least one variable has a value that is outside the range of -environments over the reference set, so these are novel -environments.

+

The cv_similarity function evaluates environmental +similarity between training and testing folds, helping to detect +potential extrapolation in the testing data. It supports three +similarity measures: Multivariate Environmental Similarity Surface +(MESS), Manhattan distance (L1), and Euclidean distance (L2). Negative +values indicate potential environmental novelty.

cv_similarity(cv = ecv, # the environmental clustering
               x = pa_data, 
               r = rasters, 
-              progress = FALSE)
+ method = "MESS", + progress = FALSE)

@@ -765,8 +762,7 @@

Estimating size: the effective range of spatial autocorrelation

variograms to each continuous raster and finding the effective range of spatial autocorrelation. Variogram is a fundamental geostatistical tool for measuring spatial autocorrelation (O’Sullivan and Unwin, 2010).

-
sac1 <- cv_spatial_autocor(r = rasters, 
-                           num_sample = 5000)
+
sac1 <- cv_spatial_autocor(r = rasters, num_sample = 5000)

The plotted block size is based on the median of the spatial autocorrelation ranges. This could be as the minimum block diff --git a/inst/doc/tutorial_2.html b/inst/doc/tutorial_2.html index 1b457ce..d74e576 100644 --- a/inst/doc/tutorial_2.html +++ b/inst/doc/tutorial_2.html @@ -12,7 +12,7 @@ - + 2. Block cross-validation for species distribution modelling @@ -342,7 +342,7 @@

2. Block cross-validation for species distribution modelling

Roozbeh Valavi, Jane Elith, José Lahoz-Monfort and Gurutzeta Guillera-Arroita

-

2025-08-09

+

2025-08-15

diff --git a/man/cv_similarity.Rd b/man/cv_similarity.Rd index 537455e..2b00e5a 100644 --- a/man/cv_similarity.Rd +++ b/man/cv_similarity.Rd @@ -48,15 +48,12 @@ is used to calculate similarity between the training and testing points.} a ggplot object } \description{ -This function supports three similarity measurement methods: -Multivariate Environmental Similarity Surface (MESS), Manhattan distance (L1), -and Euclidean distance (L2). It is primarily designed for evaluating the similarity -between training and test datasets in environmental or spatial modeling applications. +This function evaluates environmental similarity between training and testing folds, +helping to detect potential extrapolation in the testing data. It supports three +similarity measures: Multivariate Environmental Similarity Surface (MESS), Manhattan +distance (L1), and Euclidean distance (L2). } \details{ -The function support three method of similarity measures, the Multivariate Environmental -Similarity Surface (MESS), Manhattan distance (L1), and Euclidean distance (L2). - The MESS is calculated as described in Elith et al. (2010). MESS represents how similar a point in a testing fold is to a training fold (as a reference set of points), with respect to a set of predictor variables in \code{r}. diff --git a/src/Lightweight_matrix.hpp b/src/Lightweight_matrix.h similarity index 100% rename from src/Lightweight_matrix.hpp rename to src/Lightweight_matrix.h diff --git a/src/nndm.cpp b/src/nndm.cpp index 94934c0..36f460b 100644 --- a/src/nndm.cpp +++ b/src/nndm.cpp @@ -1,8 +1,7 @@ #include - #include #include -#include "Lightweight_matrix.hpp" +#include "Lightweight_matrix.h" // [[Rcpp::export]] Rcpp::NumericMatrix nndm_cpp(Rcpp::NumericMatrix X, @@ -10,111 +9,122 @@ Rcpp::NumericMatrix nndm_cpp(Rcpp::NumericMatrix X, double phi, double min_train){ - // Value representing NAs (NA_REAL does not work well with custom structures) - const double na_value(-1.); - - // C++ representation of Gij - const std::vector Gij_vec(Rcpp::as >(Gij)); - - // C++ representation of X - Lightweight_matrix X_cpp(X); - for(int i=0; i Gjstar_indices(X.ncol()); - double rmin = std::numeric_limits::infinity(); - int imin = 0; - for(int i=0; i::infinity(); - for(int j=0; j Gij_vec(Rcpp::as >(Gij)); - // Number of entries of Gjstar that are less than rmin - int Gjstar_less_than_rmin(0); - for(std::size_t i = 0; i < Gjstar_indices.size(); i++){ - if(X_cpp(i, Gjstar_indices[i]) <= rmin){ - ++Gjstar_less_than_rmin; - } + // C++ representation of X + Lightweight_matrix X_cpp(X); + for (int i=0; i(Gij_less_than_rmin) / static_cast(Gij_vec.size()); - const double tcdf = static_cast(Gjstar_less_than_rmin - 1) / static_cast(Gjstar_indices.size()); - - // Number of non-NA values in X[imin, ] - int X_non_na(0); - for(int j=0; j Gjstar_indices(X.ncol()); + double rmin = std::numeric_limits::infinity(); + int imin = 0; + for (int i=0; i::infinity(); + for (int j=0; j(X_non_na) / static_cast(X_cpp.ncol()) > min_train; - - // Will end up containing the updated rmin value - double new_rmin = std::numeric_limits::infinity(); - - if( (tcdf >= pcdf) && in_prop ) { - // Signal that the current element is NA - X_cpp(imin, Gjstar_indices[imin]) = na_value; - - double running_minimum = std::numeric_limits::infinity(); - for(int j=0; j= rmin && X_cpp(i, Gjstar_indices[i]) < new_rmin) { - new_rmin = X_cpp(i, Gjstar_indices[i]); - imin = i; + // Number of entries of Gij that are less than rmin + int Gij_less_than_rmin(0); + for (std::size_t i = 0; i < Gij_vec.size(); i++) + { + if(Gij_vec[i] <= rmin){ + ++Gij_less_than_rmin; + } } - } - } else if( static_cast(Gjstar_indices.size()) == Gjstar_less_than_rmin ){ - break; - } else { - for(std::size_t i = 0; i rmin && X_cpp(i, Gjstar_indices[i]) < new_rmin) { - new_rmin = X_cpp(i, Gjstar_indices[i]); - imin = i; + const double pcdf = static_cast(Gij_less_than_rmin) / static_cast(Gij_vec.size()); + const double tcdf = static_cast(Gjstar_less_than_rmin - 1) / static_cast(Gjstar_indices.size()); + + // Number of non-NA values in X[imin, ] + int X_non_na(0); + for (int j=0; j(X_non_na) / static_cast(X_cpp.ncol()) > min_train; + + // Will end up containing the updated rmin value + double new_rmin = std::numeric_limits::infinity(); + + if ((tcdf >= pcdf) && in_prop ) { + // Signal that the current element is NA + X_cpp(imin, Gjstar_indices[imin]) = na_value; + + double running_minimum = std::numeric_limits::infinity(); + for (int j=0; j= rmin && X_cpp(i, Gjstar_indices[i]) < new_rmin) { + new_rmin = X_cpp(i, Gjstar_indices[i]); + imin = i; + } + } + } else if( static_cast(Gjstar_indices.size()) == Gjstar_less_than_rmin ){ + break; + } else { + for (std::size_t i=0; i rmin && X_cpp(i, Gjstar_indices[i]) < new_rmin) { + new_rmin = X_cpp(i, Gjstar_indices[i]); + imin = i; + } + } + } - // Update X before returning, since we were working on C++ object - for(int i=0; i #include #include -#include "Lightweight_matrix.hpp" +#include "Lightweight_matrix.h" // a function to calculate L1 or L2 distances @@ -13,6 +13,8 @@ inline double distance(Lightweight_matrix &x, Lightweight_matrix if (L1) { + // could be faster by avoiding () operator and accessing data directly, and + // vectorised operations but potentially won't be necessary here.. for (int j = 0; j < nc; j++) { dist += std::abs(x(i, j) - y(ii, j)); // Manhattan distance diff --git a/vignettes/tutorial_1.Rmd b/vignettes/tutorial_1.Rmd index 6037169..46e1ac1 100644 --- a/vignettes/tutorial_1.Rmd +++ b/vignettes/tutorial_1.Rmd @@ -296,12 +296,14 @@ cv_plot(cv = sb1, ## Check similarity -The `cv_similarity` function can check for environmental similarity between the training and testing folds and thus possible extrapolation in the testing folds. It computes multivariate environmental similarity surface (MESS) as described in Elith et al. (2010). MESS represents how similar a point in a testing fold is to a training fold (as a reference set of points), with respect to a set of predictor variables in `r`. The negative values are the sites where at least one variable has a value that is outside the range of environments over the reference set, so these are novel environments. +The `cv_similarity` function evaluates environmental similarity between training and testing folds, helping to detect potential extrapolation in the testing data. It supports three similarity measures: Multivariate Environmental Similarity Surface (MESS), Manhattan distance (L1), and Euclidean distance (L2). Negative values indicate potential environmental novelty. + ```{r fig.height=4, fig.width=6} cv_similarity(cv = ecv, # the environmental clustering x = pa_data, r = rasters, + method = "MESS", progress = FALSE) ``` @@ -313,8 +315,7 @@ To support a first choice of block size, prior to any model fitting, package `bl When only `r` is supplied, the `cv_spatial_autocor` function works by automatically fitting variograms to each continuous raster and finding the effective range of spatial autocorrelation. Variogram is a fundamental geostatistical tool for measuring spatial autocorrelation (O’Sullivan and Unwin, 2010). ```{r, results='hide', fig.keep='all', warning=FALSE, message=FALSE, fig.height=5, fig.width=7.2} -sac1 <- cv_spatial_autocor(r = rasters, - num_sample = 5000) +sac1 <- cv_spatial_autocor(r = rasters, num_sample = 5000) ```