diff --git a/.github/workflows/R-CMD-check.yml b/.github/workflows/R-CMD-check.yml index 50458bc..0a8e044 100644 --- a/.github/workflows/R-CMD-check.yml +++ b/.github/workflows/R-CMD-check.yml @@ -1,25 +1,32 @@ -on: - push: - branches: [ master ] - pull_request: - branches: [ master ] - -name: R-CMD-check - -jobs: - R-CMD-check: - runs-on: macOS-latest - steps: - - uses: actions/checkout@v3 - - uses: r-lib/actions/setup-r@v2 - - name: Setting up pandoc for Rmd docs - uses: r-lib/actions/setup-pandoc@v2 - - uses: r-lib/actions/setup-tinytex@v2 - - name: Install dependencies - run: | - install.packages(c("remotes", "rcmdcheck")) - remotes::install_deps(dependencies = TRUE) - shell: Rscript {0} - - name: Check - run: rcmdcheck::rcmdcheck(args = "--no-manual", error_on = "error") - shell: Rscript {0} +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +name: R-CMD-check + +jobs: + R-CMD-check: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: r-lib/actions/setup-r@v2 + - name: Setting up pandoc for Rmd docs + uses: r-lib/actions/setup-pandoc@v2 + - uses: r-lib/actions/setup-tinytex@v2 + - name: Install dependencies + run: | + install.packages(c("remotes", "rcmdcheck", "covr")) + remotes::install_deps(dependencies = TRUE) + shell: Rscript {0} + + - name: Check + run: rcmdcheck::rcmdcheck(args = "--no-manual", error_on = "error") + shell: Rscript {0} + + - name: Test coverage + run: covr::codecov() + shell: Rscript {0} + env: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} diff --git a/.gitignore b/.gitignore index 2540826..d26c686 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,5 @@ vignettes/tutorial_1.R vignettes/tutorial_1.html vignettes/tutorial_2.R vignettes/tutorial_2.html +/doc/ +/Meta/ diff --git a/DESCRIPTION b/DESCRIPTION index 0aba4c3..b67b315 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: blockCV Type: Package Title: Spatial and Environmental Blocking for K-Fold and LOO Cross-Validation -Version: 3.1-7 -Date: 2025-08-01 +Version: 3.2-0 +Date: 2025-08-15 Authors@R: c(person("Roozbeh", "Valavi", role = c("aut", "cre"), email = "valavi.r@gmail.com", comment = c(ORCID = "0000-0003-2495-5277")), person("Jane", "Elith", role = "aut", diff --git a/NEWS.md b/NEWS.md index fb479aa..6b2373d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,12 +1,17 @@ +# version 3.2.0 +* Added two new methods, L1 and L2 distances, to the `cv_similarity` function +* Fixed a warning in `cv_similarity` for colour aesthetics with ggplot +* Fixed the summary method and plotting for `cv_spatial_autocor` + # version 3.1.7 -* Temporarily added `sp` package dependency to avoid CRAN check as required by `automap` package [#55]. +* Temporarily added `sp` package dependency to avoid CRAN error as required by `automap` package [#55]. # version 3.1.6 * Resolved unclear error messages; issue [#52](https://github.com/rvalavi/blockCV/issues/52) by A. Márcia Barbosa * Resolved ggplot testing failure; issue [#54](https://github.com/rvalavi/blockCV/issues/54) by Teun van den Brand # version 3.1.5 -* Resolved background pattern artifacts in raster plotting; issue [#50](https://github.com/rvalavi/blockCV/issues/50) by Camila Neder. +* Resolved background pattern artefacts in raster plotting; issue [#50](https://github.com/rvalavi/blockCV/issues/50) by Camila Neder. # version 3.1.4 * Just the `biomod2` example is updated in vignettes; and the link in help file @@ -40,7 +45,7 @@ * Massive performance improvement in the C++ code of `cv_nndm` function for large datasets # version 3.0 -* Dependency to rgdal and rgeos are removed, and overall less dependency +* Dependency to `rgdal` and `rgeos` are removed, and overall less dependency * Function names have been changed, with all functions now starting with `cv_` * The old functions (v2.x) still work to allow appropriate time for adapting the new code * The CV blocking functions are now: `cv_spatial`, `cv_cluster`, `cv_buffer`, and `cv_nndm` diff --git a/R/RcppExports.R b/R/RcppExports.R index ec6a423..e42598a 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -5,3 +5,7 @@ nndm_cpp <- function(X, Gij, phi, min_train) { .Call(`_blockCV_nndm_cpp`, X, Gij, phi, min_train) } +similarity_cpp <- function(train_mat, test_mat, rand_mat, L1 = TRUE) { + .Call(`_blockCV_similarity_cpp`, train_mat, test_mat, rand_mat, L1) +} + diff --git a/R/checks.R b/R/checks.R index 16c141f..c03cf9f 100644 --- a/R/checks.R +++ b/R/checks.R @@ -1,8 +1,14 @@ # Author: Roozbeh Valavi # contact: valavi.r@gmail.com # Date : May 2023 -# Version 0.2 -# Licence GPL v3 +# Version 0.3 + +# check the object is a blockCV object +.check_cv <- function(x) { + is_cv <- inherits(x, c("cv_spatial", "cv_cluster", "cv_buffer", "cv_nndm")) + if (!is_cv) stop("'cv' must be a blockCV cv_* object.") +} + # check points fall within the raster layer .check_within <- function(x, r) { diff --git a/R/cv_block_size.R b/R/cv_block_size.R index 927a753..65135f8 100644 --- a/R/cv_block_size.R +++ b/R/cv_block_size.R @@ -108,8 +108,7 @@ cv_block_size <- function( if(!is.null(x)){ geom_x <- ggplot2::geom_sf( data = x, - switch(!is.null(column), ggplot2::aes(colour = {{ column }}), NULL), - # ggplot2::aes(colour = {{ switch(!is.null(column), column, NULL) }}), + switch(!is.null(column), ggplot2::aes(colour = {{ column }})), inherit.aes = FALSE, alpha = 0.5 ) @@ -158,7 +157,7 @@ cv_block_size <- function( fill = "orangered4", alpha = 0.04, size = 0.2) + - switch(!is.null(x), geom_x, NULL) + + switch(!is.null(x), geom_x) + ggplot2::ggtitle("Spatial blocks", subtitle=paste("Using", input$num, diff --git a/R/cv_buffer.R b/R/cv_buffer.R index b3d0b1f..c8cbf7a 100644 --- a/R/cv_buffer.R +++ b/R/cv_buffer.R @@ -117,7 +117,7 @@ cv_buffer <- function( add_bg <- (presence_bg && add_bg) if(progress) pb <- utils::txtProgressBar(min = 0, max = n, style = 3) - fold_list <- lapply(x_1s, function(i, pbag = add_bg){ + fold_list <- lapply(x_1s, function(i, pbag = add_bg) { if(pbag){ test_ids <- which(dmatrix[i, ] <= size) inside <- x[test_ids, column, drop = TRUE] @@ -127,14 +127,13 @@ cv_buffer <- function( test_set <- i } if(progress) utils::setTxtProgressBar(pb, i) - list(as.numeric(which(dmatrix[i, ] > size)), - as.numeric(test_set)) - } - ) + + list(as.numeric(which(dmatrix[i, ] > size)), as.numeric(test_set)) + }) # calculate train test table summary if(report){ - train_test_table <- .ttt(fold_list, x, column, n) + train_test_table <- .table_summary(fold_list, x, column, n) print(summary(train_test_table)[c(1,4,6), ]) } @@ -172,32 +171,3 @@ summary.cv_buffer <- function(object, ...){ } } - -# count the train and test records -.ttt <- function(fold_list, x, column, n){ - if(is.null(column)){ - tt_count <- base::data.frame(train = rep(0, n), test = 0) - for(i in seq_len(n)){ - train_set <- fold_list[[i]][[1]] - test_set <- fold_list[[i]][[2]] - tt_count$train[i] <- length(train_set) - tt_count$test[i] <- length(test_set) - } - } else{ - cl <- sort(unique(x[, column, drop = TRUE])) - clen <- length(cl) - .check_classes(clen, column) # column should be binary or categorical - tt_count <- as.data.frame(matrix(0, nrow = n, ncol = clen * 2)) - names(tt_count) <- c(paste("train", cl, sep = "_"), paste("test", cl, sep = "_")) - for(i in seq_len(n)){ - train_set <- fold_list[[i]][[1]] - test_set <- fold_list[[i]][[2]] - countrain <- table(x[train_set, column, drop = TRUE]) - countest <- table(x[test_set, column, drop = TRUE]) - tt_count[i, which(cl %in% names(countrain))] <- countrain - tt_count[i, clen + which(cl %in% names(countest))] <- countest - } - } - - return(tt_count) -} diff --git a/R/cv_cluster.R b/R/cv_cluster.R index ac55278..9be898d 100644 --- a/R/cv_cluster.R +++ b/R/cv_cluster.R @@ -205,7 +205,7 @@ cv_cluster <- function( final_objs <- list( folds_list = fold_list, folds_ids = fold_ids, - biomod_table = switch(biomod2, as.matrix(biomod_table), NULL), + biomod_table = switch(biomod2, as.matrix(biomod_table)), k = k, column = column, type = ifelse(is.null(r), "Spatial Cluster", "Environmental Cluster"), diff --git a/R/cv_nndm.R b/R/cv_nndm.R index b6c7c34..13bcc2b 100644 --- a/R/cv_nndm.R +++ b/R/cv_nndm.R @@ -186,7 +186,7 @@ cv_nndm <- function( ) # calculate train test table summary if(report){ - train_test_table <- .ttt(fold_list, x, column, n) + train_test_table <- .table_summary(fold_list, x, column, n) print(summary(train_test_table)[c(1,4,6), ]) } diff --git a/R/cv_plot.R b/R/cv_plot.R index e2361b3..c488d17 100644 --- a/R/cv_plot.R +++ b/R/cv_plot.R @@ -58,10 +58,8 @@ cv_plot <- function( ){ # check for availability of ggplot2 .check_pkgs(c("ggplot2")) - - if(!class(cv) %in% c("cv_spatial", "cv_cluster", "cv_buffer", "cv_nndm")){ - stop("'cv' must be a blockCV cv_* object.") - } + # check it's a cv obj + .check_cv(cv) # check x is an sf object if(!missing(x)){ @@ -73,6 +71,8 @@ cv_plot <- function( r <- .check_r(r) r <- r[[1]] } + # is it a cv_spatial object? + is_spatial <- inherits(cv, "cv_spatial") # make geom_tile for raster plots if(!is.null(r)){ @@ -91,10 +91,11 @@ cv_plot <- function( geom_rast <- ggplot2::geom_tile( data = map_df, ggplot2::aes(x = get("x"), y = get("y"), fill = get("value"))) + geom_rast_col <- ggplot2::scale_fill_gradientn(colours = raster_colors) } # make geom_sf for spatial blocks - if(methods::is(cv, "cv_spatial")){ + if(is_spatial){ blocks <- cv$blocks geom_poly <- ggplot2::geom_sf( data = sf::st_geometry(blocks), @@ -114,56 +115,49 @@ cv_plot <- function( } } else{ # stop if x is missing for buffer and cluster - if(!methods::is(cv, "cv_spatial")) stop("'x' is required for plotting cv_cluster and cv_buffer.") + if(!is_spatial) stop("'x' is required for plotting cv_cluster, cv_buffer and cv_nndm.") } - if(missing(x)){ - if(methods::is(cv, "cv_spatial")){ + geom_sftext <- if (label_size > 0) { + ggplot2::geom_sf_text( + ggplot2::aes(label = get("folds")), + size = label_size, fun.geometry = sf::st_centroid + ) + } else NULL + if(missing(x)){ + if(is_spatial){ p1 <- ggplot2::ggplot(data = blocks) + - switch(!is.null(r), geom_rast, NULL) + - switch(!is.null(r), geom_rast_col, NULL) + + switch(!is.null(r), geom_rast) + # only switch works with ggplot + switch(!is.null(r), geom_rast_col) + ggplot2::geom_sf(colour = "red", fill = "orangered4", alpha = 0.04, linewidth = 0.2) + - ggplot2::geom_sf_text( - ggplot2::aes(label = get("folds")), - size = label_size, fun.geometry = sf::st_centroid) + + switch(!is.null(geom_sftext), geom_sftext) + ggplot2::labs(x = "", y = "") + # or set the axes labes to NULL ggplot2::scale_x_continuous(guide = ggplot2::guide_axis(check.overlap = TRUE)) + ggplot2::theme_minimal() + ggplot2::guides(fill = "none") - } } else{ - p1 <- ggplot2::ggplot(data = x_long) + - switch(!is.null(r), geom_rast, NULL) + - switch(!is.null(r), geom_rast_col, NULL) + - switch(methods::is(cv, "cv_spatial"), geom_poly, NULL) + - # ggplot2::geom_sf(ggplot2::aes(col = rlang::.data$value), - ggplot2::geom_sf(ggplot2::aes(col = get("value")), - alpha = points_alpha) + + switch(!is.null(r), geom_rast) + # only switch works with ggplot + switch(!is.null(r), geom_rast_col) + + switch(is_spatial, geom_poly) + + ggplot2::geom_sf(ggplot2::aes(col = get("value")), alpha = points_alpha) + ggplot2::scale_color_manual(values = points_colors, na.value = "#BEBEBE03") + - # ggplot2::facet_wrap(~ rlang::.data$folds, nrow = nrow, ncol = ncol) + ggplot2::facet_wrap(~get("folds"), nrow = nrow, ncol = ncol) + ggplot2::labs(x = "", y = "", col = "") + # set the axes labes to NULL ggplot2::theme_bw() + ggplot2::guides(fill = "none") - } return(p1) } -# is it a LOO CV object? -.is_loo <- function(x){ - methods::is(x, "cv_buffer") || methods::is(x, "cv_nndm") -} - # transform x and fold numbers for plotting .x_to_long <- function(x, cv, num_plot=1:10){ # get the folds list diff --git a/R/cv_similarity.R b/R/cv_similarity.R index 0c9a38c..432e69d 100644 --- a/R/cv_similarity.R +++ b/R/cv_similarity.R @@ -1,16 +1,41 @@ #' Compute similarity measures to evaluate possible extrapolation in testing folds #' -#' This function computes multivariate environmental similarity surface (MESS) as described -#' in Elith et al. (2010). MESS represents how similar a point in a testing fold is to a training -#' fold (as a reference set of points), with respect to a set of predictor variables in \code{r}. +#' This function evaluates environmental similarity between training and testing folds, +#' helping to detect potential extrapolation in the testing data. It supports three +#' similarity measures: Multivariate Environmental Similarity Surface (MESS), Manhattan +#' distance (L1), and Euclidean distance (L2). +#' +#' The MESS is calculated as described in Elith et al. (2010). MESS represents +#' how similar a point in a testing fold is to a training fold (as a reference +#' set of points), with respect to a set of predictor variables in \code{r}. #' The negative values are the sites where at least one variable has a value that is outside #' the range of environments over the reference set, so these are novel environments. #' +#' When using the L1 (Manhattan) or L2 (Euclidean) distance options (experimental), the +#' function performs the following steps for each test sample: +#' +#' \itemize{ +#' \item{1. Calculates the minimum distance between each test sample and all training samples +#' in the same fold using the selected metric (L1 or L2).} +#' \item{2. Calculates a baseline distance: the average of the minimum distances between a set +#' of random background samples (defined by \code{num_sample}) from the raster and all training/test +#' samples combined.} +#' \item{3. Computes a similarity score by subtracting the test sample’s minimum distance from +#' the baseline average. A higher score indicates the test sample is more similar to +#' the training data, while lower or negative scores indicate novelty.} +#' } +#' +#' This provides a simple, distance-based novelty metric, useful for assessing +#' extrapolation or dissimilarity in prediction scenarios. Note that this approach is +#' experimental. +#' #' @inheritParams cv_plot #' @param x a simple features (sf) or SpatialPoints object of the spatial sample data used for creating #' the \code{cv} object. #' @param r a terra SpatRaster object of environmental predictor that are going to be used for modelling. This #' is used to calculate similarity between the training and testing points. +#' @param method the similarity method including: MESS, L1 and L2. Read the details section. +#' @param num_sample number of random samples from raster to calculate similarity distances (only for L1 and L2). #' @param num_plot a vector of indices of folds. #' @param jitter_width numeric; the width of jitter points. #' @param points_size numeric; the size of points. @@ -18,6 +43,10 @@ #' @param points_colors character; a character vector of colours for points #' @inheritParams cv_spatial #' +#' @seealso \code{\link{cv_spatial}}, \code{\link{cv_cluster}}, \code{\link{cv_buffer}}, and \code{\link{cv_nndm}} +#' +#' @references Elith, J., Kearney, M., & Phillips, S. (2010). The art of modelling range-shifting species: The art of modelling range-shifting species. Methods in Ecology and Evolution, 1(4), 330–342. +#' #' @return a ggplot object #' @export #' @@ -51,6 +80,8 @@ cv_similarity <- function( x, r, num_plot = seq_along(cv$folds_list), + method = "MESS", + num_sample = 10000L, jitter_width = 0.1, points_size = 2, points_alpha = 0.7, @@ -69,12 +100,11 @@ cv_similarity <- function( .check_within(x, r) # check cv obj - if(!class(cv) %in% c("cv_spatial", "cv_cluster", "cv_buffer", "cv_nndm")){ - stop("'cv' must be a blockCV cv_* object.") - } - # if (!any(inherits(cv, c("cv_spatial", "cv_cluster", "cv_buffer", "cv_nndm")))) { - # stop("'cv' must be a blockCV cv_* object.") - # } + .check_cv(cv) + + method <- match.arg(tolower(method), choices = c("mess", "l1", "l2")) + MESS <- method == "mess" + L1 <- method == "l1" # The iteration must be a natural number tryCatch( @@ -106,16 +136,49 @@ cv_similarity <- function( df <- data.frame(id = seq_len(n)) # add progress bar if(progress) pb <- utils::txtProgressBar(min = 0, max = length(num_plot), style = 3) + + if (!MESS) { + # scale a dataset based on params of another scaled dataset + f <- function(x, s) { + mns <- attr(s,"scaled:center") + sds <- attr(s,"scaled:scale") + for (i in names(mns)) { + x[[i]] <- (x[[i]] - mns[i]) / sds[i] + } + attr(x, "scaled:center") <- NULL + attr(x, "scaled:scale") <- NULL + return(x) + } + rand_points <- terra::spatSample(r, size = num_sample, na.rm = TRUE) + rand_points <- rand_points[stats::complete.cases(rand_points), ] # just make sure.. + # scale and centre the random points + rand_points <- as.matrix(scale(rand_points)) + # scale and centre the samples points based on the same stats + points <- as.matrix(f(x = points, s = rand_points)) + # remove the attributes + attr(rand_points, "scaled:center") <- NULL + attr(rand_points, "scaled:scale") <- NULL + } + # calculate MESS for testing data for(i in num_plot){ df[, paste("Fold", i, sep = "")] <- NA train <- folds_list[[i]][[1]] test <- folds_list[[i]][[2]] - mes <- sapply(1:m, function(j) .messi3(points[test, j], points[train, j])) - if(.is_loo(cv)){ - mmes <- min(mes) - } else{ - mmes <- apply(mes, 1, min, na.rm = TRUE) + if (MESS) { + mes <- sapply(1:m, function(j) .messi3(points[test, j], points[train, j])) + if(.is_loo(cv)){ + mmes <- min(mes) + } else{ + mmes <- apply(mes, 1, min, na.rm = TRUE) + } + } else { + mmes <- similarity_cpp( + train_mat = points[train, ], + test_mat = points[test,, drop=FALSE], + rand_mat = rand_points, + L1 = L1 + ) } df[1:length(mmes), paste("Fold", i, sep = "")] <- mmes if(progress) utils::setTxtProgressBar(pb, i) @@ -144,21 +207,38 @@ cv_similarity <- function( # provide alternatives for class(cv) goem_buffer <- ggplot2::geom_point(size = points_size, alpha = points_alpha) geom_other <- ggplot2::geom_jitter(width = jitter_width, size = points_size, alpha = points_alpha) - geom_vio <- ggplot2::geom_violin(fill = NA) + geom_vio <- ggplot2::geom_violin( + ggplot2::aes(x = get("folds"), y = get("value"), group = get("folds")), + inherit.aes = FALSE, + fill = NA + ) # which geom to choose geom_exta <- if(.is_loo(cv)) goem_buffer else geom_other + mes_reshp$folds <- factor(mes_reshp$folds) + + col_name <- switch( + method, + mess = "MESS", + l1 = "L1 distance", + l2 = "L2 distance" + ) + y_name <- switch( + method, + mess = "MESS Values", + l1 = "Distance differnce with random samples", + l2 = "Distance differnce with random samples" + ) p1 <- ggplot2::ggplot( data = mes_reshp, - # ggplot2::aes(x = rlang::.data$folds, y = rlang::.data$value, col = rlang::.data$value)) + - ggplot2::aes(x = get("folds"), y = get("value"), col = get("value"))) + + ggplot2::aes(x = get("folds"), y = get("value"), colour = get("value"))) + ggplot2::geom_hline(yintercept = 0, color = "grey50", linetype = 2) + geom_exta + - switch(!.is_loo(cv), geom_vio, NULL) + + switch(!.is_loo(cv), geom_vio) + ggplot2::scale_color_gradientn(colours = points_colors, limits = c(-maxabs, maxabs), na.value = "#BEBEBE03") + - ggplot2::labs(x = "Folds", y = "MESS Values", col = "MESS") + + ggplot2::labs(x = "Folds", y = y_name, col = col_name) + ggplot2::theme_bw() return(p1) diff --git a/R/cv_spatial.R b/R/cv_spatial.R index 28c9168..0b88192 100644 --- a/R/cv_spatial.R +++ b/R/cv_spatial.R @@ -315,7 +315,22 @@ cv_spatial <- function( } # iteration if random selection, otherwise only 1 round for(i in seq_len(iteration)){ - if(selection=='systematic'){ + + if(selection=="random"){ + blocks_df <- blocks_df[, c("records", "block_id")] # avoid repetition + fold_df <- data.frame(block_id = seq_len(blocks_len), folds = 0) + + # create random folds with equal proportion + num <- floor(blocks_len / k) + fold_df$folds[seq_len(num * k)] <- sample(rep(seq_len(k), num), num * k) + + if(blocks_len %% k != 0){ + rest <- blocks_len %% k + unfold <- which(fold_df$folds == 0) + fold_df$folds[unfold] <- sample(seq_len(k), rest, replace = FALSE) + } + + } else if(selection=="systematic"){ if(hexagon){ sub_blocks <- .fold_assign(sf::st_geometry(sub_blocks), n = k) } else{ @@ -323,35 +338,19 @@ cv_spatial <- function( sub_blocks$folds <- rep(1:k, length.out = blocks_len) } fold_df <- sf::st_drop_geometry(sub_blocks) - blocks_df <- merge(x = blocks_df, y = fold_df, by = "block_id", all.x = TRUE) - } - if(selection=='checkerboard'){ + } else if(selection=="checkerboard"){ sub_blocks$folds <- sub_blocks$id sub_blocks$block_id <- seq_len(blocks_len) fold_df <- sf::st_drop_geometry(sub_blocks) - blocks_df <- merge(x = blocks_df, y = fold_df, by = "block_id", all.x = TRUE) - } - if(selection=='random'){ - blocks_df <- blocks_df[, c("records", "block_id")] # to avoid repetition in iterations - fold_df <- data.frame(block_id = seq_len(blocks_len), folds = 0) - # create random folds with equal proportion - num <- floor(blocks_len / k) - fold_df$folds[seq_len(num * k)] <- sample(rep(seq_len(k), num), num * k) - if(blocks_len %% k != 0){ - rest <- blocks_len %% k - unfold <- which(fold_df$folds == 0) - fold_df$folds[unfold] <- sample(seq_len(k), rest, replace = FALSE) - } - blocks_df <- merge(x = blocks_df, y = fold_df, by = "block_id", all.x = TRUE) - } - - if(selection=='predefined'){ + } else if(selection=="predefined"){ fold_df <- data.frame(block_id = seq_len(blocks_len), folds = sub_blocks[, folds_column, drop = TRUE]) - blocks_df <- merge(x = blocks_df, y = fold_df, by = "block_id", all.x = TRUE) } + # merge block-df once after the selection + blocks_df <- merge(x = blocks_df, y = fold_df, by = "block_id", all.x = TRUE) + # reset the table to 0s for each iteration train_test_table[] <- 0 # count the number of points in each fold @@ -432,7 +431,7 @@ cv_spatial <- function( final_objs <- list( folds_list = fold_list, folds_ids = fold_vect, - biomod_table = switch(biomod2, as.matrix(biomod_table), NULL), + biomod_table = if (biomod2) as.matrix(biomod_table) else NULL, k = k, size = size, column = column, @@ -444,17 +443,19 @@ cv_spatial <- function( # plot with the cv_plot function if(plot){ - p1 <- cv_plot( - cv = final_objs, - r = switch(!is.null(r), r, NULL), - ... + plot( + cv_plot( + cv = final_objs, + r = r, + ... + ) ) - plot(p1) } return(final_objs) } + #' @export #' @method print cv_spatial print.cv_spatial <- function(x, ...){ @@ -483,16 +484,17 @@ summary.cv_spatial <- function(object, ...){ # create rectangular and hexagonal spatial blocks using terra and sf packages -.make_blocks <- function(x_obj, - blocksize = NULL, - blockcols = NULL, - blockrows = NULL, - hexagonal = FALSE, - flat_top = FALSE, - extend_perc = 0.005, - degree = 111325, - xy_offset = c(0, 0), - checkerboard = FALSE){ +.make_blocks <- function( + x_obj, + blocksize = NULL, + blockcols = NULL, + blockrows = NULL, + hexagonal = FALSE, + flat_top = FALSE, + extend_perc = 0.005, + degree = 111325, + xy_offset = c(0, 0), + checkerboard = FALSE){ # xpoints and rasters inputs are checked in the parent function (cv_spatial); # so no need to check them here; @@ -683,3 +685,4 @@ summary.cv_spatial <- function(object, ...){ return(blocks) } + diff --git a/R/cv_spatial_autocor.R b/R/cv_spatial_autocor.R index b227248..78a49eb 100644 --- a/R/cv_spatial_autocor.R +++ b/R/cv_spatial_autocor.R @@ -168,8 +168,8 @@ cv_spatial_autocor <- function( vario_list <- lapply( seq_len(nlayer), .fit_variogram, - rr = switch(missing(x), r, NULL), - xx = switch(!missing(x), x, NULL), + rr = if (missing(x)) r else NULL, + xx = if (!missing(x)) x else NULL, column = column, num_sample = num_sample, progress = progress, @@ -214,8 +214,8 @@ cv_spatial_autocor <- function( # plot the spatial blocks p2 <- cv_plot( cv = plot_data, - r = switch(!missing(r), r, NULL), - label_size = 0, + r = if (!missing(r)) r else NULL, + label_size = -1, # no label ... ) p2 <- p2 + ggplot2::ggtitle( @@ -263,8 +263,8 @@ plot.cv_spatial_autocor <- function(x, y, ...){ #' @method summary cv_spatial_autocor summary.cv_spatial_autocor <- function(object, ...){ cat("Summary statistics of spatial autocorrelation ranges of all input layers:\n") - print(summary(object$rangeTable$range)) - print(object$rangeTable[,1:2]) + print(summary(object$range_table$range)) + print(object$range_table[,1:2]) } @@ -292,15 +292,12 @@ summary.cv_spatial_autocor <- function(object, ...){ points <- xx[column[i]] # [i] in case there are more column names(points) <- c("target", "geometry") } - # NOTE: apparently the gstat package returns different units for range with sp and sf objects! - # So, need to keep this a SpatialPointDataFrame for now and have to fake using it!!! - if (FALSE) { - sp::SpatialPoints - } + fit_vario <- automap::autofitVariogram( formula = target ~ 1, input_data = .as_sp(points) ) + if(progress) utils::setTxtProgressBar(pb, i) return(fit_vario) @@ -309,7 +306,7 @@ summary.cv_spatial_autocor <- function(object, ...){ # Annoying step to import sp so there'll be no CRAN errors # Sp is only require because automap produces different output with latlong sf objects -# Don't use sf::as_Spatial because this somehow depends on sp and requires sp dependency anyway +# Don't use sf::as_Spatial because this depends on sp and requires sp dependency anyway .as_sp <- function(x) { out <- if (sf::st_is_longlat(x)) { coords <- sf::st_coordinates(x) @@ -344,10 +341,6 @@ summary.cv_spatial_autocor <- function(object, ...){ decreasing = FALSE), color = get("range")) ) + - # ggplot2::geom_bar( - # ggplot2::aes(x = stats::reorder(factor(get("layers")), get("range")), - # y = get("range"), fill = get("range")), - # stat = "identity", data = vario_data,) + ggplot2::geom_point(size = 4) + ggplot2::geom_segment( ggplot2::aes(x = get("layers"), xend = get("layers"), y = 0, yend = get("range")), diff --git a/R/utils.R b/R/utils.R new file mode 100644 index 0000000..7e2e14e --- /dev/null +++ b/R/utils.R @@ -0,0 +1,37 @@ +# A collection of function to facilitate block generaion and reporting + +# is it a LOO CV object? +.is_loo <- function(x){ + inherits(x, c("cv_buffer", "cv_nndm")) +} + + +# count the train and test records +.table_summary <- function(fold_list, x, column, n){ + if(is.null(column)){ + tt_count <- data.frame(train = rep(0, n), test = 0) + for(i in seq_len(n)){ + train_set <- fold_list[[i]][[1]] + test_set <- fold_list[[i]][[2]] + tt_count$train[i] <- length(train_set) + tt_count$test[i] <- length(test_set) + } + } else{ + cl <- sort(unique(x[, column, drop = TRUE])) + clen <- length(cl) + .check_classes(clen, column) # column should be binary or categorical + tt_count <- as.data.frame(matrix(0, nrow = n, ncol = clen * 2)) + names(tt_count) <- c(paste("train", cl, sep = "_"), paste("test", cl, sep = "_")) + for(i in seq_len(n)){ + train_set <- fold_list[[i]][[1]] + test_set <- fold_list[[i]][[2]] + countrain <- table(x[train_set, column, drop = TRUE]) + countest <- table(x[test_set, column, drop = TRUE]) + tt_count[i, which(cl %in% names(countrain))] <- countrain + tt_count[i, clen + which(cl %in% names(countest))] <- countest + } + } + + return(tt_count) +} + diff --git a/inst/doc/tutorial_1.R b/inst/doc/tutorial_1.R index b0b07c7..1aee5aa 100644 --- a/inst/doc/tutorial_1.R +++ b/inst/doc/tutorial_1.R @@ -153,12 +153,12 @@ cv_plot(cv = sb1, cv_similarity(cv = ecv, # the environmental clustering x = pa_data, r = rasters, + method = "MESS", progress = FALSE) ## ----results='hide', fig.keep='all', warning=FALSE, message=FALSE, fig.height=5, fig.width=7.2---- -sac1 <- cv_spatial_autocor(r = rasters, - num_sample = 5000) +sac1 <- cv_spatial_autocor(r = rasters, num_sample = 5000) ## ----------------------------------------------------------------------------- diff --git a/inst/doc/tutorial_1.Rmd b/inst/doc/tutorial_1.Rmd index 6037169..46e1ac1 100644 --- a/inst/doc/tutorial_1.Rmd +++ b/inst/doc/tutorial_1.Rmd @@ -296,12 +296,14 @@ cv_plot(cv = sb1, ## Check similarity -The `cv_similarity` function can check for environmental similarity between the training and testing folds and thus possible extrapolation in the testing folds. It computes multivariate environmental similarity surface (MESS) as described in Elith et al. (2010). MESS represents how similar a point in a testing fold is to a training fold (as a reference set of points), with respect to a set of predictor variables in `r`. The negative values are the sites where at least one variable has a value that is outside the range of environments over the reference set, so these are novel environments. +The `cv_similarity` function evaluates environmental similarity between training and testing folds, helping to detect potential extrapolation in the testing data. It supports three similarity measures: Multivariate Environmental Similarity Surface (MESS), Manhattan distance (L1), and Euclidean distance (L2). Negative values indicate potential environmental novelty. + ```{r fig.height=4, fig.width=6} cv_similarity(cv = ecv, # the environmental clustering x = pa_data, r = rasters, + method = "MESS", progress = FALSE) ``` @@ -313,8 +315,7 @@ To support a first choice of block size, prior to any model fitting, package `bl When only `r` is supplied, the `cv_spatial_autocor` function works by automatically fitting variograms to each continuous raster and finding the effective range of spatial autocorrelation. Variogram is a fundamental geostatistical tool for measuring spatial autocorrelation (O’Sullivan and Unwin, 2010). ```{r, results='hide', fig.keep='all', warning=FALSE, message=FALSE, fig.height=5, fig.width=7.2} -sac1 <- cv_spatial_autocor(r = rasters, - num_sample = 5000) +sac1 <- cv_spatial_autocor(r = rasters, num_sample = 5000) ``` diff --git a/inst/doc/tutorial_1.html b/inst/doc/tutorial_1.html index bec4d2a..999cc3b 100644 --- a/inst/doc/tutorial_1.html +++ b/inst/doc/tutorial_1.html @@ -12,7 +12,7 @@ - + 1. blockCV introduction: how to create block cross-validation folds @@ -342,7 +342,7 @@

1. blockCV introduction: how to create block cross-validation folds

Roozbeh Valavi, Jane Elith, José Lahoz-Monfort and Gurutzeta Guillera-Arroita

-

2025-06-23

+

2025-08-15

@@ -419,6 +419,7 @@

Installation

remotes::install_github("rvalavi/blockCV", dependencies = TRUE)
# loading the package
 library(blockCV)
+
## blockCV 3.2.0

Package data

@@ -432,20 +433,20 @@

Package data

data include presence-absence records (binary) of a simulated species.

To load the package raster data use:

-
library(sf) # working with spatial vector data
-library(terra) # working with spatial raster data
-library(tmap) # plotting maps
-
-# load raster data
-# the pipe operator |> is available for R version 4.1 or higher
-rasters <- system.file("extdata/au/", package = "blockCV") |>
-  list.files(full.names = TRUE) |>
-  terra::rast()
+
library(sf) # working with spatial vector data
+library(terra) # working with spatial raster data
+library(tmap) # plotting maps
+
+# load raster data
+# the pipe operator |> is available for R version 4.1 or higher
+rasters <- system.file("extdata/au/", package = "blockCV") |>
+  list.files(full.names = TRUE) |>
+  terra::rast()

The presence-absence species data include 243 presence points and 257 absence points.

-
# load species presence-absence data and convert to sf
-points <- read.csv(system.file("extdata/", "species.csv", package = "blockCV"))
-head(points)
+
# load species presence-absence data and convert to sf
+points <- read.csv(system.file("extdata/", "species.csv", package = "blockCV"))
+head(points)
##            x        y occ
 ## 1  1313728.4 -2275453   0
 ## 2  1176795.0 -1916003   0
@@ -459,21 +460,21 @@ 

Package data

coordinate reference system with "EPSG:7845" as defined by crs = 7845. We convert the data.frame to sf as follows:

-
pa_data <- sf::st_as_sf(points, coords = c("x", "y"), crs = 7845)
+
pa_data <- sf::st_as_sf(points, coords = c("x", "y"), crs = 7845)

Let’s plot the species data using tmap package:

-
tm_shape(rasters[[1]]) +
-  tm_raster(
-    col.scale = tm_scale_continuous(values = gray.colors(10)),
-    col.legend = tm_legend_hide()
-  ) +
-  tm_shape(pa_data) +
-  tm_dots(
-    fill = "occ",
-    fill.scale = tm_scale_categorical(),
-    size = 0.5,
-    fill_alpha = 0.5
-  )
+
tm_shape(rasters[[1]]) +
+  tm_raster(
+    col.scale = tm_scale_continuous(values = gray.colors(10)),
+    col.legend = tm_legend_hide()
+  ) +
+  tm_shape(pa_data) +
+  tm_dots(
+    fill = "occ",
+    fill.scale = tm_scale_categorical(),
+    size = 0.5,
+    fill_alpha = 0.5
+  )

@@ -516,14 +517,14 @@

Spatial blocks

allows users to define an external spatial polygon as blocking layer.

Here are some spatial block settings:

-
sb1 <- cv_spatial(x = pa_data,
-                  column = "occ", # the response column (binary or multi-class)
-                  k = 5, # number of folds
-                  size = 350000, # size of the blocks in metres
-                  selection = "random", # random blocks-to-fold
-                  iteration = 50, # find evenly dispersed folds
-                  biomod2 = TRUE) # also create folds for biomod2
-

+
sb1 <- cv_spatial(x = pa_data,
+                  column = "occ", # the response column (binary or multi-class)
+                  k = 5, # number of folds
+                  size = 350000, # size of the blocks in metres
+                  selection = "random", # random blocks-to-fold
+                  iteration = 50, # find evenly dispersed folds
+                  biomod2 = TRUE) # also create folds for biomod2
+

The output object is an R S3 object and you can get its elements by a $. Explore sb1$folds_ids, sb1$folds_list, and sb1$biomod_table for the @@ -536,37 +537,37 @@

Spatial blocks

raster layer (using r argument) for to be used for creating blocks and be used in the background of the plot (raster can also be added later only for visualising blocks using cv_plot).

-
sb2 <- cv_spatial(x = pa_data,
-                  column = "occ",
-                  r = rasters, # optionally add a raster layer
-                  k = 5, 
-                  size = 350000, 
-                  hexagon = FALSE, # use square blocks
-                  selection = "random",
-                  progress = FALSE, # turn off progress bar for vignette
-                  iteration = 50, 
-                  biomod2 = TRUE)
+
sb2 <- cv_spatial(x = pa_data,
+                  column = "occ",
+                  r = rasters, # optionally add a raster layer
+                  k = 5, 
+                  size = 350000, 
+                  hexagon = FALSE, # use square blocks
+                  selection = "random",
+                  progress = FALSE, # turn off progress bar for vignette
+                  iteration = 50, 
+                  biomod2 = TRUE)
## 
 ##   train_0 train_1 test_0 test_1
-## 1     217     190     40     53
-## 2     197     210     60     33
-## 3     211     166     46     77
-## 4     200     209     57     34
-## 5     203     197     54     46
-

+## 1 189 187 68 56 +## 2 197 172 60 71 +## 3 230 210 27 33 +## 4 216 202 41 41 +## 5 196 201 61 42
+

The assignment of folds to each block can also be done in a systematic manner using selection = "systematic", or a checkerboard pattern using selection = "checkerboard". The blocks can also be created by number of rows and columns when no size is supplied by e.g. rows_cols = c(12, 10).

-
# systematic fold assignment 
-# and also use row/column for creating blocks instead of size
-sb3 <- cv_spatial(x = pa_data,
-                  column = "occ",
-                  rows_cols = c(12, 10),
-                  hexagon = FALSE,
-                  selection = "systematic")
+
# systematic fold assignment 
+# and also use row/column for creating blocks instead of size
+sb3 <- cv_spatial(x = pa_data,
+                  column = "occ",
+                  rows_cols = c(12, 10),
+                  hexagon = FALSE,
+                  selection = "systematic")
## 
 ##   train_0 train_1 test_0 test_1
 ## 1     227     203     30     40
@@ -581,23 +582,23 @@ 

Spatial blocks

This is because the random assignment process is repeated multiple times, controlled by the iteration parameter, to ensure that the folds are evenly distributed.

-
# checkerboard block to CV fold assignment
-sb4 <- cv_spatial(x = pa_data,
-                  column = "occ",
-                  size = 350000,
-                  hexagon = FALSE,
-                  selection = "checkerboard")
+
# checkerboard block to CV fold assignment
+sb4 <- cv_spatial(x = pa_data,
+                  column = "occ",
+                  size = 350000,
+                  hexagon = FALSE,
+                  selection = "checkerboard")
## 
 ##   train_0 train_1 test_0 test_1
 ## 1     125     143    132    100
 ## 2     132     100    125    143

Let’s visualise the checkerboard blocks with tmap:

-
tm_shape(sb4$blocks) +
-  tm_fill(
-    fill = "folds",
-    fill.scale = tm_scale_categorical()
-  )
+
tm_shape(sb4$blocks) +
+  tm_fill(
+    fill = "folds",
+    fill.scale = tm_scale_categorical()
+  )

@@ -609,11 +610,11 @@

Spatial and environemntal clustering

based on spatial coordinates of sample points (the x argument).

Using spatial coordinate values for clustering:

-
# spatial clustering
-set.seed(6)
-scv <- cv_cluster(x = pa_data,
-                  column = "occ", # optional: counting number of train/test records
-                  k = 5)
+
# spatial clustering
+set.seed(6)
+scv <- cv_cluster(x = pa_data,
+                  column = "occ", # optional: counting number of train/test records
+                  k = 5)
##   train_0 train_1 test_0 test_1
 ## 1     230     240     27      3
 ## 2     171     142     86    101
@@ -628,13 +629,13 @@ 

Spatial and environemntal clustering

layer should cover all the species points, otherwise an error will rise. The records with no raster value should be deleted prior to the analysis or a different raster be used.

-
# environmental clustering
-set.seed(6)
-ecv <- cv_cluster(x = pa_data,
-                  column = "occ",
-                  r = rasters,
-                  k = 5, 
-                  scale = TRUE)
+
# environmental clustering
+set.seed(6)
+ecv <- cv_cluster(x = pa_data,
+                  column = "occ",
+                  r = rasters,
+                  k = 5, 
+                  scale = TRUE)
##   train_0 train_1 test_0 test_1
 ## 1     249     172      8     71
 ## 2     215     234     42      9
@@ -664,9 +665,9 @@ 

Buffering LOO (also known as Spatial LOO)

(ideally the range of spatial autocorrelation). In this method the test set never directly abuts a training set.

Using buffering to create CV folds:

-
bloo <- cv_buffer(x = pa_data,
-                  column = "occ",
-                  size = 350000)
+
bloo <- cv_buffer(x = pa_data,
+                  column = "occ",
+                  size = 350000)

When using species presence-background data (or presence and pseudo-absence), you need to supply the column and set presence_bg = TRUE. In this case, only presence @@ -690,14 +691,14 @@

Nearest Neighbour Distance Matching (NNDM) LOO

distribution function between the test and training data to the nearest neighbour distance distribution function between the target prediction and training points (Milà et al., 2022).

-
nncv <- cv_nndm(x = pa_data,
-                column = "occ",
-                r = rasters,
-                size = 350000,
-                num_sample = 5000, 
-                sampling = "regular",
-                min_train = 0.1,
-                plot = TRUE)
+
nncv <- cv_nndm(x = pa_data,
+                column = "occ",
+                r = rasters,
+                size = 350000,
+                num_sample = 5000, 
+                sampling = "regular",
+                min_train = 0.1,
+                plot = TRUE)
##     train_0         train_1          test_0          test_1     
 ##  Min.   :212.0   Min.   :145.0   Min.   :0.000   Min.   :0.000  
 ##  Mean   :244.4   Mean   :221.7   Mean   :0.514   Mean   :0.486  
@@ -712,47 +713,38 @@ 

Visualising the folds

might be slightly slower.

Let’s plot spatial clustering folds created in previous section (using cv_cluster):

-
cv_plot(cv = scv, 
-        x = pa_data)
+
cv_plot(cv = scv, 
+        x = pa_data)

When cv_buffer is used for plotting, only first 10 folds are shown. You can choose any set of CV folds for plotting. If remove_na = FALSE (default is TRUE), the NA in the legend shows the excluded points.

-
cv_plot(cv = bloo,
-        x = pa_data,
-        num_plots = c(1, 50, 100)) # only show folds 1, 50 and 100
+
cv_plot(cv = bloo,
+        x = pa_data,
+        num_plots = c(1, 50, 100)) # only show folds 1, 50 and 100

If you do not supply x when plotting a cv_spatial object, only the spatial blocks are plotted.

-
cv_plot(cv = sb1,
-        r = rasters,
-        raster_colors = terrain.colors(10, alpha = 0.5),
-        label_size = 4) 
-

+
cv_plot(cv = sb1,
+        r = rasters,
+        raster_colors = terrain.colors(10, alpha = 0.5),
+        label_size = 4) 
+

Check similarity

-

The cv_similarity function can check for environmental -similarity between the training and testing folds and thus possible -extrapolation in the testing folds. It computes multivariate -environmental similarity surface (MESS) as described in Elith et -al. (2010). MESS represents how similar a point in a testing fold is to -a training fold (as a reference set of points), with respect to a set of -predictor variables in r. The negative values are the sites -where at least one variable has a value that is outside the range of -environments over the reference set, so these are novel -environments.

-
cv_similarity(cv = ecv, # the environmental clustering
-              x = pa_data, 
-              r = rasters, 
-              progress = FALSE)
-
## Warning: The following aesthetics were dropped during statistical transformation:
-## colour.
-## ℹ This can happen when ggplot fails to infer the correct grouping structure in
-##   the data.
-## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
-##   variable into a factor?
+

The cv_similarity function evaluates environmental +similarity between training and testing folds, helping to detect +potential extrapolation in the testing data. It supports three +similarity measures: Multivariate Environmental Similarity Surface +(MESS), Manhattan distance (L1), and Euclidean distance (L2). Negative +values indicate potential environmental novelty.

+
cv_similarity(cv = ecv, # the environmental clustering
+              x = pa_data, 
+              r = rasters, 
+              method = "MESS",
+              progress = FALSE)

@@ -770,9 +762,8 @@

Estimating size: the effective range of spatial autocorrelation

variograms to each continuous raster and finding the effective range of spatial autocorrelation. Variogram is a fundamental geostatistical tool for measuring spatial autocorrelation (O’Sullivan and Unwin, 2010).

-
sac1 <- cv_spatial_autocor(r = rasters, 
-                           num_sample = 5000)
-

+
sac1 <- cv_spatial_autocor(r = rasters, num_sample = 5000)
+

The plotted block size is based on the median of the spatial autocorrelation ranges. This could be as the minimum block size for creating spatially separated folds. Variograms are @@ -791,16 +782,20 @@

Estimating size: the effective range of spatial autocorrelation

# summary statistics of the output
 summary(sac1)
## Summary statistics of spatial autocorrelation ranges of all input layers:
-## Length  Class   Mode 
-##      0   NULL   NULL 
-## NULL
+## Min. 1st Qu. Median Mean 3rd Qu. Max. +## 1015264 1352302 1637589 5337128 5622415 17058071 +## layers range +## 3 bio_4 1015264 +## 2 bio_15 1464648 +## 4 bio_5 1810530 +## 1 bio_12 17058071

Alternatively, only use the response data using x and column. This could be a binary or continuous variable provided in as a column in the sample points sf object. This could be the response or the residuals of a fitted model.

sac2 <- cv_spatial_autocor(x = pa_data, 
                            column =  "occ")
-

+

To visualise them (this needs the automap package to be loaded):

library(automap)
diff --git a/inst/doc/tutorial_2.html b/inst/doc/tutorial_2.html
index 10f4921..d74e576 100644
--- a/inst/doc/tutorial_2.html
+++ b/inst/doc/tutorial_2.html
@@ -12,7 +12,7 @@
 
 
 
-
+
 
 2. Block cross-validation for species distribution modelling
 
@@ -342,7 +342,7 @@ 

2. Block cross-validation for species distribution modelling

Roozbeh Valavi, Jane Elith, José Lahoz-Monfort and Gurutzeta Guillera-Arroita

-

2025-06-23

+

2025-08-15

diff --git a/man/cv_similarity.Rd b/man/cv_similarity.Rd index 00e7b19..2b00e5a 100644 --- a/man/cv_similarity.Rd +++ b/man/cv_similarity.Rd @@ -9,6 +9,8 @@ cv_similarity( x, r, num_plot = seq_along(cv$folds_list), + method = "MESS", + num_sample = 10000L, jitter_width = 0.1, points_size = 2, points_alpha = 0.7, @@ -28,6 +30,10 @@ is used to calculate similarity between the training and testing points.} \item{num_plot}{a vector of indices of folds.} +\item{method}{the similarity method including: MESS, L1 and L2. Read the details section.} + +\item{num_sample}{number of random samples from raster to calculate similarity distances (only for L1 and L2).} + \item{jitter_width}{numeric; the width of jitter points.} \item{points_size}{numeric; the size of points.} @@ -42,11 +48,35 @@ is used to calculate similarity between the training and testing points.} a ggplot object } \description{ -This function computes multivariate environmental similarity surface (MESS) as described -in Elith et al. (2010). MESS represents how similar a point in a testing fold is to a training -fold (as a reference set of points), with respect to a set of predictor variables in \code{r}. +This function evaluates environmental similarity between training and testing folds, +helping to detect potential extrapolation in the testing data. It supports three +similarity measures: Multivariate Environmental Similarity Surface (MESS), Manhattan +distance (L1), and Euclidean distance (L2). +} +\details{ +The MESS is calculated as described in Elith et al. (2010). MESS represents +how similar a point in a testing fold is to a training fold (as a reference +set of points), with respect to a set of predictor variables in \code{r}. The negative values are the sites where at least one variable has a value that is outside the range of environments over the reference set, so these are novel environments. + +When using the L1 (Manhattan) or L2 (Euclidean) distance options (experimental), the +function performs the following steps for each test sample: + +\itemize{ +\item{1. Calculates the minimum distance between each test sample and all training samples + in the same fold using the selected metric (L1 or L2).} +\item{2. Calculates a baseline distance: the average of the minimum distances between a set + of random background samples (defined by \code{num_sample}) from the raster and all training/test + samples combined.} +\item{3. Computes a similarity score by subtracting the test sample’s minimum distance from + the baseline average. A higher score indicates the test sample is more similar to + the training data, while lower or negative scores indicate novelty.} +} + +This provides a simple, distance-based novelty metric, useful for assessing +extrapolation or dissimilarity in prediction scenarios. Note that this approach is +experimental. } \examples{ \donttest{ @@ -74,3 +104,9 @@ cv_similarity(cv = sb, r = covars, x = pa_data) } } +\references{ +Elith, J., Kearney, M., & Phillips, S. (2010). The art of modelling range-shifting species: The art of modelling range-shifting species. Methods in Ecology and Evolution, 1(4), 330–342. +} +\seealso{ +\code{\link{cv_spatial}}, \code{\link{cv_cluster}}, \code{\link{cv_buffer}}, and \code{\link{cv_nndm}} +} diff --git a/src/Lightweight_matrix.h b/src/Lightweight_matrix.h new file mode 100644 index 0000000..eda7103 --- /dev/null +++ b/src/Lightweight_matrix.h @@ -0,0 +1,80 @@ +#ifndef IGHTWEIGHT_MATRIX_H +#define IGHTWEIGHT_MATRIX_H + +#include +#include + +// A light weight matrix class for faster calculation with row-major opertaions +template +class Lightweight_matrix { +private: + using MatrixType = std::vector; +public: + + explicit Lightweight_matrix(int rows, int columns): + rows_(rows), columns_(columns), matrix_(rows * columns) {} + + template + explicit Lightweight_matrix(Matrix matrix): + rows_(matrix.nrow()), columns_(matrix.ncol()), matrix_(matrix.nrow() * matrix.ncol()) { + for(int i = 0; i < rows_; ++i) { + for(int j = 0; j < columns_; ++j) { + operator()(i, j) = static_cast(matrix(i, j)); + } + } + } + + T operator()(int i, int j) const { + return matrix_[i * columns_ + j]; + } + + T& operator()(int i, int j) { + return matrix_[i * columns_ + j]; + } + + int ncol() const { + return columns_; + } + + int nrow() const { + return rows_; + } + + T* data() { + return matrix_.data(); + } + + const T* data() const { + return matrix_.data(); + } + +private: + int rows_; + int columns_; + MatrixType matrix_; +}; + + +// rbind implementation for Lightweight_matrix +template +Lightweight_matrix rbind(const Lightweight_matrix& A, const Lightweight_matrix& B) +{ + const int ncol = A.ncol(); + const int nrow_A = A.nrow(); + const int nrow_B = B.nrow(); + + if (ncol != B.ncol()) + { + throw std::invalid_argument("Matrices must have the same number of columns"); + } + + const int total_rows = nrow_A + nrow_B; + Lightweight_matrix result(total_rows, ncol); + + std::copy(A.data(), A.data() + nrow_A * ncol, result.data()); + std::copy(B.data(), B.data() + nrow_B * ncol, result.data() + nrow_A * ncol); + + return result; +} + +#endif /* IGHTWEIGHT_MATRIX_H */ diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index bc58e07..1efdc4b 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -24,9 +24,24 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// similarity_cpp +Rcpp::NumericVector similarity_cpp(const Rcpp::NumericMatrix& train_mat, const Rcpp::NumericMatrix& test_mat, const Rcpp::NumericMatrix& rand_mat, bool L1); +RcppExport SEXP _blockCV_similarity_cpp(SEXP train_matSEXP, SEXP test_matSEXP, SEXP rand_matSEXP, SEXP L1SEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const Rcpp::NumericMatrix& >::type train_mat(train_matSEXP); + Rcpp::traits::input_parameter< const Rcpp::NumericMatrix& >::type test_mat(test_matSEXP); + Rcpp::traits::input_parameter< const Rcpp::NumericMatrix& >::type rand_mat(rand_matSEXP); + Rcpp::traits::input_parameter< bool >::type L1(L1SEXP); + rcpp_result_gen = Rcpp::wrap(similarity_cpp(train_mat, test_mat, rand_mat, L1)); + return rcpp_result_gen; +END_RCPP +} static const R_CallMethodDef CallEntries[] = { {"_blockCV_nndm_cpp", (DL_FUNC) &_blockCV_nndm_cpp, 4}, + {"_blockCV_similarity_cpp", (DL_FUNC) &_blockCV_similarity_cpp, 4}, {NULL, NULL, 0} }; diff --git a/src/nndm.cpp b/src/nndm.cpp index 618d92e..36f460b 100644 --- a/src/nndm.cpp +++ b/src/nndm.cpp @@ -1,49 +1,7 @@ #include - #include #include - -template -class Lightweight_matrix { -private: - using MatrixType = std::vector; -public: - - explicit Lightweight_matrix(int rows, int columns): - rows_(rows), columns_(columns), matrix_(rows * columns) {} - - template - explicit Lightweight_matrix(Matrix matrix): - rows_(matrix.nrow()), columns_(matrix.ncol()), matrix_(matrix.nrow() * matrix.ncol()) { - for(int i(0); i < rows_; ++i) { - for(int j(0); j < columns_; ++j) { - operator()(i, j) = static_cast(matrix(i, j)); - } - } - } - - T operator()(int i, int j) const { - return matrix_[i * columns_ + j]; - } - - T& operator()(int i, int j) { - return matrix_[i * columns_ + j]; - } - - int ncol() const { - return columns_; - } - - int nrow() const { - return rows_; - } - -private: - int rows_; - int columns_; - MatrixType matrix_; -}; - +#include "Lightweight_matrix.h" // [[Rcpp::export]] Rcpp::NumericMatrix nndm_cpp(Rcpp::NumericMatrix X, @@ -51,111 +9,122 @@ Rcpp::NumericMatrix nndm_cpp(Rcpp::NumericMatrix X, double phi, double min_train){ - // Value representing NAs (NA_REAL does not work well with custom structures) - const double na_value(-1.); - - // C++ representation of Gij - const std::vector Gij_vec(Rcpp::as >(Gij)); - - // C++ representation of X - Lightweight_matrix X_cpp(X); - for(int i=0; i Gjstar_indices(X.ncol()); - double rmin = std::numeric_limits::infinity(); - int imin = 0; - for(int i=0; i::infinity(); - for(int j=0; j Gij_vec(Rcpp::as >(Gij)); - // Number of entries of Gjstar that are less than rmin - int Gjstar_less_than_rmin(0); - for(std::size_t i = 0; i < Gjstar_indices.size(); i++){ - if(X_cpp(i, Gjstar_indices[i]) <= rmin){ - ++Gjstar_less_than_rmin; - } + // C++ representation of X + Lightweight_matrix X_cpp(X); + for (int i=0; i(Gij_less_than_rmin) / static_cast(Gij_vec.size()); - const double tcdf = static_cast(Gjstar_less_than_rmin - 1) / static_cast(Gjstar_indices.size()); - - // Number of non-NA values in X[imin, ] - int X_non_na(0); - for(int j=0; j Gjstar_indices(X.ncol()); + double rmin = std::numeric_limits::infinity(); + int imin = 0; + for (int i=0; i::infinity(); + for (int j=0; j(X_non_na) / static_cast(X_cpp.ncol()) > min_train; - - // Will end up containing the updated rmin value - double new_rmin = std::numeric_limits::infinity(); - - if( (tcdf >= pcdf) && in_prop ) { - // Signal that the current element is NA - X_cpp(imin, Gjstar_indices[imin]) = na_value; - - double running_minimum = std::numeric_limits::infinity(); - for(int j=0; j= rmin && X_cpp(i, Gjstar_indices[i]) < new_rmin) { - new_rmin = X_cpp(i, Gjstar_indices[i]); - imin = i; + // Number of entries of Gij that are less than rmin + int Gij_less_than_rmin(0); + for (std::size_t i = 0; i < Gij_vec.size(); i++) + { + if(Gij_vec[i] <= rmin){ + ++Gij_less_than_rmin; + } } - } - } else if( static_cast(Gjstar_indices.size()) == Gjstar_less_than_rmin ){ - break; - } else { - for(std::size_t i = 0; i rmin && X_cpp(i, Gjstar_indices[i]) < new_rmin) { - new_rmin = X_cpp(i, Gjstar_indices[i]); - imin = i; + const double pcdf = static_cast(Gij_less_than_rmin) / static_cast(Gij_vec.size()); + const double tcdf = static_cast(Gjstar_less_than_rmin - 1) / static_cast(Gjstar_indices.size()); + + // Number of non-NA values in X[imin, ] + int X_non_na(0); + for (int j=0; j(X_non_na) / static_cast(X_cpp.ncol()) > min_train; + + // Will end up containing the updated rmin value + double new_rmin = std::numeric_limits::infinity(); + + if ((tcdf >= pcdf) && in_prop ) { + // Signal that the current element is NA + X_cpp(imin, Gjstar_indices[imin]) = na_value; + + double running_minimum = std::numeric_limits::infinity(); + for (int j=0; j= rmin && X_cpp(i, Gjstar_indices[i]) < new_rmin) { + new_rmin = X_cpp(i, Gjstar_indices[i]); + imin = i; + } + } + } else if( static_cast(Gjstar_indices.size()) == Gjstar_less_than_rmin ){ + break; + } else { + for (std::size_t i=0; i rmin && X_cpp(i, Gjstar_indices[i]) < new_rmin) { + new_rmin = X_cpp(i, Gjstar_indices[i]); + imin = i; + } + } + } - // Update X before returning, since we were working on C++ object - for(int i=0; i +#include +#include +#include +#include "Lightweight_matrix.h" + + +// a function to calculate L1 or L2 distances +inline double distance(Lightweight_matrix &x, Lightweight_matrix &y, int i, int ii, bool L1) +{ + int nc = x.ncol(); + double dist = 0.0; + + if (L1) + { + // could be faster by avoiding () operator and accessing data directly, and + // vectorised operations but potentially won't be necessary here.. + for (int j = 0; j < nc; j++) + { + dist += std::abs(x(i, j) - y(ii, j)); // Manhattan distance + } + } + else + { + for (int j = 0; j < nc; j++) + { + double diff = x(i, j) - y(ii, j); + dist += diff * diff; + } + dist = std::sqrt(dist); // Euclidean distance + } + + return dist; +} + + +// [[Rcpp::export]] +Rcpp::NumericVector similarity_cpp( + const Rcpp::NumericMatrix &train_mat, + const Rcpp::NumericMatrix &test_mat, + const Rcpp::NumericMatrix &rand_mat, + bool L1 = true +) { + + Lightweight_matrix train(train_mat); + Lightweight_matrix test(test_mat); + Lightweight_matrix rand(rand_mat); + // rbind train and test matrices for baseline calc + Lightweight_matrix full_mat = rbind(train, test); + + // check for possible errors. + if (train.ncol() != test.ncol()) Rcpp::stop("Number of columns in train and test are different!"); + if (train.ncol() != rand.ncol()) Rcpp::stop("Number of columns in train and raster layers are different!"); + + int nr_test = test.nrow(); + int nr_train = train.nrow(); + int nr_rand = rand.nrow(); + + // output vector + std::vector output(nr_test, 0.0); + + // loop over the random samples + double sum_dist = 0.0; + for (int i = 0; i < nr_rand; i++) + { + double min_dist(std::numeric_limits::infinity()); + // loop over the training samples + for (int row = 0; row < full_mat.nrow(); row++) + { + // calcualte L1/L2 distance + double dist_rand = distance(full_mat, rand, row, i, L1); + // get the min distance + if (dist_rand < min_dist) + { + min_dist = dist_rand; + } + } + sum_dist += min_dist; + } + // average min distance of a random sample to training samples + const double base_dist = sum_dist / static_cast(nr_rand); + + // loop over the test samples + for (int i = 0; i < nr_test; i++) + { + double min_test(std::numeric_limits::infinity()); + // loop over the training samples to get the min distance + for (int row = 0; row < nr_train; row++) + { + // calcualte L1/L2 distance + double dist = distance(train, test, row, i, L1); + // get the min distance + if (dist < min_test) + { + min_test = dist; + } + } + // expected distance - test distance + double test_dist = base_dist - min_test; + + output[i] = test_dist; + } + + return Rcpp::wrap(output); +} + diff --git a/tests/testthat/test-cv_buffer.R b/tests/testthat/test-cv_buffer.R index bd9a53f..e412099 100644 --- a/tests/testthat/test-cv_buffer.R +++ b/tests/testthat/test-cv_buffer.R @@ -9,120 +9,120 @@ expect_names <- c( pa_data <- read.csv(system.file("extdata/", "species.csv", package = "blockCV")) |> - sf::st_as_sf(coords = c("x", "y"), crs = 7845) + sf::st_as_sf(coords = c("x", "y"), crs = 7845) pa_data <- pa_data[1:200, ] test_that("test that cv_buffer function works properly with presence-absence data", { - bloo <- cv_buffer( - x = pa_data, - column = "occ", - size = 250000, - presence_bg = FALSE, - progress = TRUE - ) - - expect_true(exists("bloo")) - expect_s3_class(bloo, "cv_buffer") - expect_equal(names(bloo), expect_names) - expect_equal(length(bloo$folds), nrow(pa_data)) - expect_type(bloo$folds, "list") - expect_type(bloo$k, "integer") - expect_type(bloo$column, "character") - expect_type(bloo$size, "double") - expect_equal(bloo$presence_bg, FALSE) - expect_equal(dim(bloo$records), c(nrow(pa_data), 4)) - expect_true(!all(bloo$records == 0)) - -}) + bloo <- cv_buffer( + x = pa_data, + column = "occ", + size = 250000, + presence_bg = FALSE, + progress = TRUE + ) + + expect_true(exists("bloo")) + expect_s3_class(bloo, "cv_buffer") + expect_equal(names(bloo), expect_names) + expect_equal(length(bloo$folds), nrow(pa_data)) + expect_type(bloo$folds, "list") + expect_type(bloo$k, "integer") + expect_type(bloo$column, "character") + expect_type(bloo$size, "double") + expect_equal(bloo$presence_bg, FALSE) + expect_equal(dim(bloo$records), c(nrow(pa_data), 4)) + expect_true(!all(bloo$records == 0)) + + }) test_that("test that cv_buffer function works properly with presence-background data", { - bloo <- cv_buffer( - x = pa_data, - column = "occ", - size = 250000, - presence_bg = TRUE, - progress = FALSE - ) - - expect_true(exists("bloo")) - expect_s3_class(bloo, "cv_buffer") - expect_equal(names(bloo), expect_names) - expect_equal(length(bloo$folds), sum(pa_data$occ)) - expect_type(bloo$folds, "list") - expect_type(bloo$k, "integer") - expect_type(bloo$column, "character") - expect_type(bloo$size, "double") - expect_equal(bloo$presence_bg, TRUE) - expect_equal(dim(bloo$records), c(sum(pa_data$occ), 4)) - expect_true(!all(bloo$records == 0)) - -}) + bloo <- cv_buffer( + x = pa_data, + column = "occ", + size = 250000, + presence_bg = TRUE, + progress = FALSE + ) + + expect_true(exists("bloo")) + expect_s3_class(bloo, "cv_buffer") + expect_equal(names(bloo), expect_names) + expect_equal(length(bloo$folds), sum(pa_data$occ)) + expect_type(bloo$folds, "list") + expect_type(bloo$k, "integer") + expect_type(bloo$column, "character") + expect_type(bloo$size, "double") + expect_equal(bloo$presence_bg, TRUE) + expect_equal(dim(bloo$records), c(sum(pa_data$occ), 4)) + expect_true(!all(bloo$records == 0)) + + }) test_that("test that cv_buffer function works properly with no species specified", { - bloo <- cv_buffer( - x = pa_data, - size = 250000 - ) - - expect_true(exists("bloo")) - expect_s3_class(bloo, "cv_buffer") - expect_equal(names(bloo), expect_names) - expect_equal(length(bloo$folds), nrow(pa_data)) - expect_type(bloo$folds, "list") - expect_type(bloo$k, "integer") - expect_null(bloo$species) - expect_type(bloo$size, "double") - expect_equal(bloo$presence_bg, FALSE) - expect_equal(dim(bloo$records), c(nrow(pa_data), 2)) - expect_true(!all(bloo$records == 0)) - - expect_equal(print(bloo), "cv_buffer") - expect_output(summary(bloo)) + bloo <- cv_buffer( + x = pa_data, + size = 250000 + ) -}) + expect_true(exists("bloo")) + expect_s3_class(bloo, "cv_buffer") + expect_equal(names(bloo), expect_names) + expect_equal(length(bloo$folds), nrow(pa_data)) + expect_type(bloo$folds, "list") + expect_type(bloo$k, "integer") + expect_null(bloo$species) + expect_type(bloo$size, "double") + expect_equal(bloo$presence_bg, FALSE) + expect_equal(dim(bloo$records), c(nrow(pa_data), 2)) + expect_true(!all(bloo$records == 0)) + + expect_equal(print(bloo), "cv_buffer") + expect_output(summary(bloo)) + + }) test_that("test cv_buffer function with no matching species column", { - expect_warning( - bloo <- cv_buffer( - x = pa_data, - column = "response", - size = 250000, - presence_bg = FALSE, - progress = TRUE + expect_warning( + bloo <- cv_buffer( + x = pa_data, + column = "response", + size = 250000, + presence_bg = FALSE, + progress = TRUE + ) ) - ) - expect_true(exists("bloo")) - expect_equal(dim(bloo$records), c(nrow(pa_data), 2)) - expect_true(!all(bloo$records == 0)) + expect_true(exists("bloo")) + expect_equal(dim(bloo$records), c(nrow(pa_data), 2)) + expect_true(!all(bloo$records == 0)) }) test_that("test cv_buffer function with no spatial column data", { - expect_error( - cv_buffer( - x = "pa_data", - size = 250000 + expect_error( + cv_buffer( + x = "pa_data", + size = 250000 + ) ) - ) }) test_that("test cv_buffer function to have sptial points with no CRS", { - sf::st_crs(pa_data) <- NA - - expect_error( - cv_buffer( - x = pa_data, - column = "occ", - size = 250000 + sf::st_crs(pa_data) <- NA + + expect_error( + cv_buffer( + x = pa_data, + column = "occ", + size = 250000 + ) ) - ) }) diff --git a/tests/testthat/test-cv_cluster.R b/tests/testthat/test-cv_cluster.R index 99091af..f969dce 100644 --- a/tests/testthat/test-cv_cluster.R +++ b/tests/testthat/test-cv_cluster.R @@ -1,119 +1,109 @@ -expect_names <- c( - "folds_list", - "folds_ids", - "biomod_table", - "k", - "column", - "type", - "records" -) +expect_names <- c("folds_list", + "folds_ids", + "biomod_table", + "k", + "column", + "type", + "records") aus <- system.file("extdata/au/", package = "blockCV") |> - list.files(full.names = TRUE) |> - terra::rast() + list.files(full.names = TRUE) |> + terra::rast() pa_data <- read.csv(system.file("extdata/", "species.csv", package = "blockCV")) |> - sf::st_as_sf(coords = c("x", "y"), crs = 7845) + sf::st_as_sf(coords = c("x", "y"), crs = 7845) test_that("test that environmental cluster function with raster_cluster", { - - # environmental clustering - set.seed(42) - eb <- cv_cluster(x = pa_data, - column = "occ", - r = aus, - k = 3, - scale = TRUE, - raster_cluster = TRUE) - - expect_true(exists("eb")) - expect_s3_class(eb, "cv_cluster") - expect_equal(names(eb), expect_names) - expect_equal(length(eb$folds_list), 3) - expect_type(eb$folds_list, "list") - expect_equal(eb$k, 3) - expect_type(eb$column, "character") - expect_equal(dim(eb$records), c(3, 4)) - expect_true( - !all(eb$records == 0) - ) + # environmental clustering + set.seed(42) + eb <- cv_cluster( + x = pa_data, + column = "occ", + r = aus, + k = 3, + scale = TRUE, + raster_cluster = TRUE + ) + + expect_true(exists("eb")) + expect_s3_class(eb, "cv_cluster") + expect_equal(names(eb), expect_names) + expect_equal(length(eb$folds_list), 3) + expect_type(eb$folds_list, "list") + expect_equal(eb$k, 3) + expect_type(eb$column, "character") + expect_equal(dim(eb$records), c(3, 4)) + expect_true(!all(eb$records == 0)) }) test_that("test that spacial cluster function with no column", { - - # spatial clustering - set.seed(42) - eb <- cv_cluster(x = pa_data, - k = 5, - biomod2 = FALSE) - - expect_true(exists("eb")) - expect_s3_class(eb, "cv_cluster") - expect_equal(names(eb), expect_names) - expect_equal(length(eb$folds_list), 5) - expect_type(eb$folds_list, "list") - expect_equal(eb$k, 5) - expect_null(eb$column) - expect_equal(dim(eb$records), c(5, 2)) - expect_true( - !all(eb$records == 0) - ) - - expect_equal(print(eb), "cv_cluster") - expect_output(summary(eb)) + # spatial clustering + set.seed(42) + eb <- cv_cluster(x = pa_data, k = 5, biomod2 = FALSE) + + expect_true(exists("eb")) + expect_s3_class(eb, "cv_cluster") + expect_equal(names(eb), expect_names) + expect_equal(length(eb$folds_list), 5) + expect_type(eb$folds_list, "list") + expect_equal(eb$k, 5) + expect_null(eb$column) + expect_equal(dim(eb$records), c(5, 2)) + expect_true(!all(eb$records == 0)) + + expect_equal(print(eb), "cv_cluster") + expect_output(summary(eb)) }) test_that("test that environmental cluster with no scale and wrong column", { - - set.seed(42) - expect_warning( - eb <- cv_cluster(x = pa_data, - column = "response", # wrong column name - r = aus, - k = 5, - scale = FALSE, - raster_cluster = TRUE, - algorithm = "MacQueen") - ) - - expect_true(exists("eb")) - expect_s3_class(eb, "cv_cluster") - expect_equal(names(eb), expect_names) - expect_equal(length(eb$folds_list), 5) - expect_type(eb$folds_list, "list") - expect_equal(eb$k, 5) - expect_equal(dim(eb$records), c(5, 2)) - expect_true( - !all(eb$records == 0) - ) + set.seed(42) + expect_warning( + eb <- cv_cluster( + x = pa_data, + column = "response", + # wrong column name + r = aus, + k = 5, + scale = FALSE, + raster_cluster = TRUE, + algorithm = "MacQueen" + ) + ) + + expect_true(exists("eb")) + expect_s3_class(eb, "cv_cluster") + expect_equal(names(eb), expect_names) + expect_equal(length(eb$folds_list), 5) + expect_type(eb$folds_list, "list") + expect_equal(eb$k, 5) + expect_equal(dim(eb$records), c(5, 2)) + expect_true(!all(eb$records == 0)) }) test_that("test environmental cluster with no spatial or sf object", { - - expect_error( - cv_cluster(x = aus, # no spatial or sf object - r = aus, - k = 5, - scale = TRUE, - raster_cluster = FALSE) - - ) + expect_error(cv_cluster( + x = aus, + # no spatial or sf object + r = aus, + k = 5, + scale = TRUE, + raster_cluster = FALSE + )) }) test_that("test environmental cluster with no raster data", { - expect_error( - cv_cluster(r = data.frame(x = 1), # no raster data - x = pa_data) - - ) + expect_error(cv_cluster( + r = data.frame(x = 1), + # no raster data + x = pa_data + )) }) - diff --git a/tests/testthat/test-cv_nndm.R b/tests/testthat/test-cv_nndm.R index 35d9058..dd33f57 100644 --- a/tests/testthat/test-cv_nndm.R +++ b/tests/testthat/test-cv_nndm.R @@ -10,131 +10,131 @@ expect_names <- c( aus <- system.file("extdata/au/", package = "blockCV") |> - list.files(full.names = TRUE) |> - terra::rast() + list.files(full.names = TRUE) |> + terra::rast() pa_data <- read.csv(system.file("extdata/", "species.csv", package = "blockCV")) |> - sf::st_as_sf(coords = c("x", "y"), crs = 7845) + sf::st_as_sf(coords = c("x", "y"), crs = 7845) pa_data <- pa_data[1:100, ] test_that("test that cv_nndm function works properly with presence-absence data", { - bloo <- cv_nndm( - x = pa_data, - column = "occ", - r = aus, - size = 250000, - num_sample = 3000, - presence_bg = FALSE - ) - - expect_true(exists("bloo")) - expect_s3_class(bloo, "cv_nndm") - expect_equal(names(bloo), expect_names) - expect_equal(length(bloo$folds), nrow(pa_data)) - expect_type(bloo$folds, "list") - expect_type(bloo$k, "integer") - expect_type(bloo$column, "character") - expect_type(bloo$size, "double") - expect_equal(bloo$presence_bg, FALSE) - expect_equal(dim(bloo$records), c(nrow(pa_data), 4)) - expect_true(!all(bloo$records == 0)) + bloo <- cv_nndm( + x = pa_data, + column = "occ", + r = aus, + size = 250000, + num_sample = 3000, + presence_bg = FALSE + ) + + expect_true(exists("bloo")) + expect_s3_class(bloo, "cv_nndm") + expect_equal(names(bloo), expect_names) + expect_equal(length(bloo$folds), nrow(pa_data)) + expect_type(bloo$folds, "list") + expect_type(bloo$k, "integer") + expect_type(bloo$column, "character") + expect_type(bloo$size, "double") + expect_equal(bloo$presence_bg, FALSE) + expect_equal(dim(bloo$records), c(nrow(pa_data), 4)) + expect_true(!all(bloo$records == 0)) }) test_that("test that cv_nndm function works properly with presence-background data", { - bloo <- cv_nndm( - x = pa_data, - column = "occ", - r = aus, - size = 250000, - sampling = "regular", - presence_bg = TRUE - ) - - expect_true(exists("bloo")) - expect_s3_class(bloo, "cv_nndm") - expect_equal(names(bloo), expect_names) - expect_equal(length(bloo$folds), sum(pa_data$occ)) - expect_type(bloo$folds, "list") - expect_type(bloo$k, "integer") - expect_type(bloo$column, "character") - expect_type(bloo$size, "double") - expect_equal(bloo$presence_bg, TRUE) - expect_equal(dim(bloo$records), c(sum(pa_data$occ), 4)) - expect_true(!all(bloo$records == 0)) + bloo <- cv_nndm( + x = pa_data, + column = "occ", + r = aus, + size = 250000, + sampling = "regular", + presence_bg = TRUE + ) + + expect_true(exists("bloo")) + expect_s3_class(bloo, "cv_nndm") + expect_equal(names(bloo), expect_names) + expect_equal(length(bloo$folds), sum(pa_data$occ)) + expect_type(bloo$folds, "list") + expect_type(bloo$k, "integer") + expect_type(bloo$column, "character") + expect_type(bloo$size, "double") + expect_equal(bloo$presence_bg, TRUE) + expect_equal(dim(bloo$records), c(sum(pa_data$occ), 4)) + expect_true(!all(bloo$records == 0)) }) test_that("test that cv_nndm function works properly with no species specified", { - bloo <- cv_nndm( - x = sf::as_Spatial(pa_data), - r = aus, - size = 250000, - num_sample = 3000, - plot = FALSE - ) - - expect_true(exists("bloo")) - expect_s3_class(bloo, "cv_nndm") - expect_equal(names(bloo), expect_names) - expect_equal(length(bloo$folds), nrow(pa_data)) - expect_type(bloo$folds, "list") - expect_type(bloo$k, "integer") - expect_null(bloo$species) - expect_type(bloo$size, "double") - expect_equal(bloo$presence_bg, FALSE) - expect_equal(dim(bloo$records), c(nrow(pa_data), 2)) - expect_true(!all(bloo$records == 0)) - - expect_equal(print(bloo), "cv_nndm") - expect_output(summary(bloo)) + bloo <- cv_nndm( + x = sf::as_Spatial(pa_data), + r = aus, + size = 250000, + num_sample = 3000, + plot = FALSE + ) + + expect_true(exists("bloo")) + expect_s3_class(bloo, "cv_nndm") + expect_equal(names(bloo), expect_names) + expect_equal(length(bloo$folds), nrow(pa_data)) + expect_type(bloo$folds, "list") + expect_type(bloo$k, "integer") + expect_null(bloo$species) + expect_type(bloo$size, "double") + expect_equal(bloo$presence_bg, FALSE) + expect_equal(dim(bloo$records), c(nrow(pa_data), 2)) + expect_true(!all(bloo$records == 0)) + + expect_equal(print(bloo), "cv_nndm") + expect_output(summary(bloo)) }) test_that("test cv_nndm function with no matching species column", { - expect_warning( - bloo <- cv_nndm( - x = pa_data, - column = "response", - r = aus, - size = 250000, - num_sample = 3000, - presence_bg = FALSE + expect_warning( + bloo <- cv_nndm( + x = pa_data, + column = "response", + r = aus, + size = 250000, + num_sample = 3000, + presence_bg = FALSE + ) ) - ) - expect_true(exists("bloo")) - expect_equal(dim(bloo$records), c(nrow(pa_data), 2)) - expect_true(!all(bloo$records == 0)) + expect_true(exists("bloo")) + expect_equal(dim(bloo$records), c(nrow(pa_data), 2)) + expect_true(!all(bloo$records == 0)) }) test_that("test cv_nndm function with no spatial column data", { - expect_error( - cv_nndm( - x = "pa_data", - r = aus, - size = 250000 + expect_error( + cv_nndm( + x = "pa_data", + r = aus, + size = 250000 + ) ) - ) }) test_that("test cv_nndm function to have sptial points with no CRS", { - sf::st_crs(pa_data) <- NA - - expect_error( - cv_nndm( - x = pa_data, - column = "occ", - r = aus, - size = 250000 + sf::st_crs(pa_data) <- NA + + expect_error( + cv_nndm( + x = pa_data, + column = "occ", + r = aus, + size = 250000 + ) ) - ) }) diff --git a/tests/testthat/test-cv_plot.R b/tests/testthat/test-cv_plot.R index addcf45..e2c300d 100644 --- a/tests/testthat/test-cv_plot.R +++ b/tests/testthat/test-cv_plot.R @@ -1,21 +1,22 @@ pa_data <- read.csv(system.file("extdata/", "species.csv", package = "blockCV")) |> - sf::st_as_sf(coords = c("x", "y"), crs = 7845) + sf::st_as_sf(coords = c("x", "y"), crs = 7845) -test_that("test that cv_plot function works", - { - scv <- cv_spatial(x = pa_data, - size = 450000, - k = 5, - selection = "random", - iteration = 1, - biomod2 = FALSE, - plot = FALSE, - progress = FALSE) +test_that("test that cv_plot function works", { + scv <- cv_spatial( + x = pa_data, + size = 450000, + k = 5, + selection = "random", + iteration = 1, + biomod2 = FALSE, + plot = FALSE, + progress = FALSE + ) - plt <- cv_plot(cv = scv, x = pa_data) + plt <- cv_plot(cv = scv, x = pa_data) - expect_true(exists("plt")) - expect_true(ggplot2::is_ggplot(plt)) + expect_true(exists("plt")) + expect_true(ggplot2::is_ggplot(plt)) }) diff --git a/tests/testthat/test-cv_similarity.R b/tests/testthat/test-cv_similarity.R index a634fac..4fdf3d5 100644 --- a/tests/testthat/test-cv_similarity.R +++ b/tests/testthat/test-cv_similarity.R @@ -1,39 +1,62 @@ aus <- system.file("extdata/au/", package = "blockCV") |> - list.files(full.names = TRUE) |> - terra::rast() + list.files(full.names = TRUE) |> + terra::rast() pa_data <- read.csv(system.file("extdata/", "species.csv", package = "blockCV")) |> - sf::st_as_sf(coords = c("x", "y"), crs = 7845) + sf::st_as_sf(coords = c("x", "y"), crs = 7845) pa_data <- pa_data[1:200, ] -test_that("test that cv_similarity function works with cv_spatil", - { - scv <- cv_spatial(x = pa_data, - selection = "random", - iteration = 1, - biomod2 = FALSE, - plot = FALSE, - report = FALSE, - progress = FALSE) +scv <- cv_spatial( + x = pa_data, + selection = "random", + iteration = 1, + biomod2 = FALSE, + plot = FALSE, + report = FALSE, + progress = FALSE +) - plt <- cv_similarity(cv = scv, x = pa_data, r = aus) - expect_true(exists("plt")) - expect_true(ggplot2::is_ggplot(plt)) +test_that("test that cv_similarity function works with cv_spatil and MESS", { + + plt <- cv_similarity(cv = scv, x = pa_data, r = aus, method = "MESS") + + expect_true(exists("plt")) + expect_true(ggplot2::is_ggplot(plt)) }) +test_that("test that cv_similarity function works with cv_spatil and L1", { -test_that("test that cv_similarity function works with cv_buffer", - { - bloo <- cv_buffer(x = pa_data, - size = 250000, - progress = FALSE, - report = FALSE) + plt <- cv_similarity(cv = scv, x = pa_data, r = aus, method = "L1") - plt <- cv_similarity(cv = bloo, x = pa_data, r = aus) + expect_true(exists("plt")) + expect_true(ggplot2::is_ggplot(plt)) + +}) - expect_true(exists("plt")) - expect_true(ggplot2::is_ggplot(plt)) +test_that("test that cv_similarity function works with cv_spatil and L2", { + + plt <- cv_similarity(cv = scv, x = pa_data, r = aus, , method = "L2") + + expect_true(exists("plt")) + expect_true(ggplot2::is_ggplot(plt)) }) + + +test_that("test that cv_similarity function works with cv_buffer", { + bloo <- cv_buffer( + x = pa_data, + size = 250000, + progress = FALSE, + report = FALSE + ) + + plt <- cv_similarity(cv = bloo, x = pa_data, r = aus) + + expect_true(exists("plt")) + expect_true(ggplot2::is_ggplot(plt)) + +}) + diff --git a/tests/testthat/test-cv_spatial.R b/tests/testthat/test-cv_spatial.R index a0f314f..d55a27a 100644 --- a/tests/testthat/test-cv_spatial.R +++ b/tests/testthat/test-cv_spatial.R @@ -8,226 +8,245 @@ expect_names <- c("folds_list", "records") aus <- system.file("extdata/au/", package = "blockCV") |> - list.files(full.names = TRUE) |> - terra::rast() + list.files(full.names = TRUE) |> + terra::rast() pa_data <- read.csv(system.file("extdata/", "species.csv", package = "blockCV")) |> - sf::st_as_sf(coords = c("x", "y"), crs = 7845) + sf::st_as_sf(coords = c("x", "y"), crs = 7845) pa_data <- pa_data[1:200, ] test_that("test cv_spatial function with random assingment and raster file", { - set.seed(1000) - scv <- cv_spatial(x = pa_data, - column = "occ", - r = aus, - size = 450000, - k = 5, - selection = "random", - iteration = 5, - biomod2 = TRUE, - offset = c(0.3, 0.2), - plot = FALSE, - progress = TRUE) - - expect_true(exists("scv")) - expect_s3_class(scv, "cv_spatial") - expect_equal(names(scv), expect_names) - expect_equal(length(scv$folds_list), 5) - expect_type(scv$folds_list, "list") - expect_type(scv$biomod_table, "logical") - expect_equal(dim(scv$biomod_table), c(nrow(pa_data), 5)) - expect_equal(scv$k, 5) - expect_s3_class(scv$blocks, "sf") - expect_type(scv$column, "character") - expect_type(scv$size, "double") - expect_equal(dim(scv$records), c(5, 4)) - expect_true( - !all(scv$records == 0) - ) + set.seed(1000) + scv <- cv_spatial( + x = pa_data, + column = "occ", + r = aus, + size = 450000, + k = 5, + selection = "random", + iteration = 5, + biomod2 = TRUE, + offset = c(0.3, 0.2), + plot = FALSE, + progress = TRUE + ) + + expect_true(exists("scv")) + expect_s3_class(scv, "cv_spatial") + expect_equal(names(scv), expect_names) + expect_equal(length(scv$folds_list), 5) + expect_type(scv$folds_list, "list") + expect_type(scv$biomod_table, "logical") + expect_equal(dim(scv$biomod_table), c(nrow(pa_data), 5)) + expect_equal(scv$k, 5) + expect_s3_class(scv$blocks, "sf") + expect_type(scv$column, "character") + expect_type(scv$size, "double") + expect_equal(dim(scv$records), c(5, 4)) + expect_true( + !all(scv$records == 0) + ) }) test_that("test cv_spatial function with systematic assingment and no raster file", { - scv <- cv_spatial(x = sf::as_Spatial(pa_data), - rows_cols = c(10, 10), - k = 5, - selection = "systematic", - biomod2 = FALSE, - plot = TRUE) - - expect_true(exists("scv")) - expect_s3_class(scv, "cv_spatial") - expect_equal(names(scv), expect_names) - expect_equal(length(scv$folds_list), 5) - expect_type(scv$folds_list, "list") - expect_null(scv$biomod_table, "matrix") - expect_equal(scv$k, 5) - expect_s3_class(scv$blocks, "sf") - expect_null(scv$column) - expect_null(scv$size) - expect_equal(dim(scv$records), c(5, 2)) - expect_true( - !all(scv$records == 0) - ) - - expect_equal(print(scv), "cv_spatial") - expect_message(plot(scv)) - expect_output(summary(scv)) + scv <- cv_spatial( + x = sf::as_Spatial(pa_data), + rows_cols = c(10, 10), + k = 5, + selection = "systematic", + biomod2 = FALSE, + plot = TRUE + ) + + expect_true(exists("scv")) + expect_s3_class(scv, "cv_spatial") + expect_equal(names(scv), expect_names) + expect_equal(length(scv$folds_list), 5) + expect_type(scv$folds_list, "list") + expect_null(scv$biomod_table, "matrix") + expect_equal(scv$k, 5) + expect_s3_class(scv$blocks, "sf") + expect_null(scv$column) + expect_null(scv$size) + expect_equal(dim(scv$records), c(5, 2)) + expect_true( + !all(scv$records == 0) + ) + + expect_equal(print(scv), "cv_spatial") + expect_message(plot(scv)) + expect_output(summary(scv)) }) test_that("test cv_spatial function with non-numeric iteration", { - scv <- cv_spatial(x = pa_data, - hexagon = FALSE, - k = 5, - selection = "random", - iteration = "2", # not numeric - biomod2 = FALSE, - plot = FALSE) - - expect_true(exists("scv")) - expect_s3_class(scv, "cv_spatial") - expect_equal(names(scv), expect_names) - expect_equal(length(scv$folds_list), 5) - expect_type(scv$folds_list, "list") - expect_null(scv$biomod_table, "matrix") - expect_equal(scv$k, 5) - expect_s3_class(scv$blocks, "sf") - expect_null(scv$column) - expect_null(scv$size) - expect_equal(dim(scv$records), c(5, 2)) - expect_true( - !all(scv$records == 0) - ) - - expect_equal(print(scv), "cv_spatial") - expect_message(plot(scv)) - expect_output(summary(scv)) + scv <- cv_spatial( + x = pa_data, + hexagon = FALSE, + k = 5, + selection = "random", + iteration = "2", # not numeric + biomod2 = FALSE, + plot = FALSE + ) + + expect_true(exists("scv")) + expect_s3_class(scv, "cv_spatial") + expect_equal(names(scv), expect_names) + expect_equal(length(scv$folds_list), 5) + expect_type(scv$folds_list, "list") + expect_null(scv$biomod_table, "matrix") + expect_equal(scv$k, 5) + expect_s3_class(scv$blocks, "sf") + expect_null(scv$column) + expect_null(scv$size) + expect_equal(dim(scv$records), c(5, 2)) + expect_true( + !all(scv$records == 0) + ) + + expect_equal(print(scv), "cv_spatial") + expect_message(plot(scv)) + expect_output(summary(scv)) }) test_that("test cv_spatial with checkerboard assingment and only row blocks", { - scv <- cv_spatial(x = sf::as_Spatial(pa_data), - hexagon = FALSE, - rows_cols = c(5, 1), - selection = "checkerboard", - biomod2 = FALSE, - plot = TRUE) - - expect_true(exists("scv")) - expect_s3_class(scv, "cv_spatial") - expect_equal(names(scv), expect_names) - expect_equal(length(scv$folds_list), 2) - expect_type(scv$folds_list, "list") - expect_null(scv$biomod_table) - expect_equal(scv$k, 2) - expect_s3_class(scv$blocks, "sf") - expect_null(scv$column) - expect_null(scv$size) - expect_equal(dim(scv$records), c(2, 2)) - expect_true( - !all(scv$records == 0) - ) - - expect_equal(print(scv), "cv_spatial") - expect_message(plot(scv)) - expect_output(summary(scv)) + scv <- cv_spatial( + x = sf::as_Spatial(pa_data), + hexagon = FALSE, + rows_cols = c(5, 1), + selection = "checkerboard", + biomod2 = FALSE, + plot = TRUE + ) + + expect_true(exists("scv")) + expect_s3_class(scv, "cv_spatial") + expect_equal(names(scv), expect_names) + expect_equal(length(scv$folds_list), 2) + expect_type(scv$folds_list, "list") + expect_null(scv$biomod_table) + expect_equal(scv$k, 2) + expect_s3_class(scv$blocks, "sf") + expect_null(scv$column) + expect_null(scv$size) + expect_equal(dim(scv$records), c(2, 2)) + expect_true( + !all(scv$records == 0) + ) + + expect_equal(print(scv), "cv_spatial") + expect_message(plot(scv)) + expect_output(summary(scv)) }) test_that("test cv_spatial with user-defined blocks", { - # make a spatial block polygon - user_poly <- .make_blocks(x_obj = pa_data, blocksize = 450000) - - scv <- cv_spatial(x = pa_data, - user_blocks = user_poly, - selection = "random", - iteration = 5, - biomod2 = FALSE, - progress = FALSE, - plot = FALSE) - - expect_true(exists("scv")) - expect_s3_class(scv, "cv_spatial") - expect_equal(names(scv), expect_names) - expect_equal(length(scv$folds_list), 5) - expect_type(scv$folds_list, "list") - expect_null(scv$biomod_table) - expect_equal(scv$k, 5) - expect_s3_class(scv$blocks, "sf") - expect_null(scv$column) - expect_null(scv$size) - expect_equal(dim(scv$records), c(5, 2)) - expect_true( - !all(scv$records == 0) - ) - - expect_equal(print(scv), "cv_spatial") - expect_message(plot(scv)) - expect_output(summary(scv)) + # make a spatial block polygon + user_poly <- .make_blocks(x_obj = pa_data, blocksize = 450000) + + scv <- cv_spatial( + x = pa_data, + user_blocks = user_poly, + selection = "random", + iteration = 5, + biomod2 = FALSE, + progress = FALSE, + plot = FALSE + ) + + expect_true(exists("scv")) + expect_s3_class(scv, "cv_spatial") + expect_equal(names(scv), expect_names) + expect_equal(length(scv$folds_list), 5) + expect_type(scv$folds_list, "list") + expect_null(scv$biomod_table) + expect_equal(scv$k, 5) + expect_s3_class(scv$blocks, "sf") + expect_null(scv$column) + expect_null(scv$size) + expect_equal(dim(scv$records), c(5, 2)) + expect_true( + !all(scv$records == 0) + ) + + expect_equal(print(scv), "cv_spatial") + expect_message(plot(scv)) + expect_output(summary(scv)) }) test_that("test cv_spatial failur: number of blocks, wrong selection", { - expect_error(cv_spatial(x = pa_data, - rows_cols = c(2, 2), - hexagon = FALSE, - k = 15, # very high k - selection = "random")) + expect_error(cv_spatial( + x = pa_data, + rows_cols = c(2, 2), + hexagon = FALSE, + k = 15, + # very high k + selection = "random" + )) - expect_error(cv_spatial(x = pa_data, - k = 1, # very low k - selection = "random")) + expect_error(cv_spatial( + x = pa_data, + k = 1, + # very low k + selection = "random" + )) - expect_error(cv_spatial(x = pa_data, - k = 5, - selection = "chance")) # wrong selection + expect_error(cv_spatial(x = pa_data, k = 5, selection = "chance")) # wrong selection }) test_that("test cv_spatial failur: wrong user-defined blocks", { - expect_error( - cv_spatial(x = pa_data, # wrong user-defined blocks - column = "occ", - r = aus, - user_blocks = data.frame(a = 1), - size = 450000, - k = 5) - ) + expect_error( + cv_spatial( + x = pa_data, + # wrong user-defined blocks + column = "occ", + r = aus, + user_blocks = data.frame(a = 1), + size = 450000, + k = 5 + ) + ) }) test_that("test cv_spatial with no speceis column match", { - expect_warning( - scv <- cv_spatial(x = pa_data, - column = "response", # wrong response - r = aus, - size = 450000, - k = 5) - ) - - expect_true(exists("scv")) - expect_s3_class(scv, "cv_spatial") - expect_equal(names(scv), expect_names) - expect_equal(length(scv$folds_list), 5) - expect_equal(scv$k, 5) - expect_equal(dim(scv$records), c(5, 2)) - expect_true( - !all(scv$records == 0) - ) + + expect_warning(scv <- cv_spatial( + x = pa_data, + column = "response", + # wrong response + r = aus, + size = 450000, + k = 5 + )) + + expect_true(exists("scv")) + expect_s3_class(scv, "cv_spatial") + expect_equal(names(scv), expect_names) + expect_equal(length(scv$folds_list), 5) + expect_equal(scv$k, 5) + expect_equal(dim(scv$records), c(5, 2)) + expect_true( + !all(scv$records == 0) + ) }) diff --git a/tests/testthat/test-v2-functions.R b/tests/testthat/test-v2-functions.R index 1310efb..843f8c7 100644 --- a/tests/testthat/test-v2-functions.R +++ b/tests/testthat/test-v2-functions.R @@ -20,9 +20,11 @@ expect_names <- c( test_that("test spatialAutoRange function works", { # skip_on_cran() - range1 <- spatialAutoRange(rasterLayer = aus, - sampleNumber = 1000, - showPlots = TRUE) + range1 <- spatialAutoRange( + rasterLayer = aus, + sampleNumber = 1000, + showPlots = TRUE + ) expect_true(exists("range1")) expect_s3_class(range1, "SpatialAutoRange") @@ -52,10 +54,12 @@ test_that("test spatialAutoRange function works", { test_that("test spatialAutoRange function with x", { # skip_on_cran() - range3 <- spatialAutoRange(rasterLayer = aus, - speciesData = pa_data, - progress = FALSE, - showPlots = TRUE) + range3 <- spatialAutoRange( + rasterLayer = aus, + speciesData = pa_data, + progress = FALSE, + showPlots = TRUE + ) expect_true(exists("range3")) expect_s3_class(range3, "SpatialAutoRange") @@ -89,16 +93,18 @@ test_that("test spatiaBlock function works", { # skip_on_cran() set.seed(1000) - sb1 <- spatialBlock(speciesData = pa_data, - species = "occ", - rasterLayer = aus, - theRange = 450000, - k = 5, - selection = "random", - iteration = 5, - biomod2Format = TRUE, - showBlocks = TRUE, - progress = TRUE) + sb1 <- spatialBlock( + speciesData = pa_data, + species = "occ", + rasterLayer = aus, + theRange = 450000, + k = 5, + selection = "random", + iteration = 5, + biomod2Format = TRUE, + showBlocks = TRUE, + progress = TRUE + ) expect_true(exists("sb1")) expect_s3_class(sb1, "SpatialBlock") @@ -137,11 +143,13 @@ expect_names <- c( test_that("test buffering function works", { # skip_on_cran() - bf1 <- buffering(speciesData= pa_data, - species= "occ", - theRange= 450000, - spDataType = "PA", - progress = TRUE) + bf1 <- buffering( + speciesData= pa_data, + species= "occ", + theRange= 450000, + spDataType = "PA", + progress = TRUE + ) expect_true(exists("bf1")) expect_s3_class(bf1, "BufferedBlock") @@ -176,12 +184,14 @@ expect_names <- c( test_that("test that environmental blocking function works", { # skip_on_cran() - eb <- envBlock(rasterLayer = aus, - speciesData = pa_data, - species = "occ", - k = 5, - standardization = "standard", - rasterBlock = FALSE) + eb <- envBlock( + rasterLayer = aus, + speciesData = pa_data, + species = "occ", + k = 5, + standardization = "standard", + rasterBlock = FALSE + ) expect_true(exists("eb")) expect_s3_class(eb, "EnvironmentalBlock") diff --git a/vignettes/tutorial_1.Rmd b/vignettes/tutorial_1.Rmd index 6037169..46e1ac1 100644 --- a/vignettes/tutorial_1.Rmd +++ b/vignettes/tutorial_1.Rmd @@ -296,12 +296,14 @@ cv_plot(cv = sb1, ## Check similarity -The `cv_similarity` function can check for environmental similarity between the training and testing folds and thus possible extrapolation in the testing folds. It computes multivariate environmental similarity surface (MESS) as described in Elith et al. (2010). MESS represents how similar a point in a testing fold is to a training fold (as a reference set of points), with respect to a set of predictor variables in `r`. The negative values are the sites where at least one variable has a value that is outside the range of environments over the reference set, so these are novel environments. +The `cv_similarity` function evaluates environmental similarity between training and testing folds, helping to detect potential extrapolation in the testing data. It supports three similarity measures: Multivariate Environmental Similarity Surface (MESS), Manhattan distance (L1), and Euclidean distance (L2). Negative values indicate potential environmental novelty. + ```{r fig.height=4, fig.width=6} cv_similarity(cv = ecv, # the environmental clustering x = pa_data, r = rasters, + method = "MESS", progress = FALSE) ``` @@ -313,8 +315,7 @@ To support a first choice of block size, prior to any model fitting, package `bl When only `r` is supplied, the `cv_spatial_autocor` function works by automatically fitting variograms to each continuous raster and finding the effective range of spatial autocorrelation. Variogram is a fundamental geostatistical tool for measuring spatial autocorrelation (O’Sullivan and Unwin, 2010). ```{r, results='hide', fig.keep='all', warning=FALSE, message=FALSE, fig.height=5, fig.width=7.2} -sac1 <- cv_spatial_autocor(r = rasters, - num_sample = 5000) +sac1 <- cv_spatial_autocor(r = rasters, num_sample = 5000) ```