Skip to contents

Last class

  • We saw how to add ‘pseudo-absences’ to a presence-only data set to make modelling possible

This class

  • How to do spatial cross validation on presence / pseudo-absence data
  • Talk about the final project

An example SDM

  • Let’s run through a full SDM analysis
  • We will use an example dataset in the ENMTools package
library(ENMTools)
library(tidyverse)
library(tidymodels)
library(spatialsample)
library(tidysdm)
library(sf)
library(mapview)

data("iberolacerta.clade")
data("euro.worldclim")
monticola <- iberolacerta.clade$species$monticola

monticola <- st_as_sf(monticola$presence.points, 
                      coords = c("Longitude", "Latitude"),
                      crs = 4326)
mapview(monticola)
## Warning in cbind(`Feature ID` = fid, mat): number of rows of result is not a
## multiple of vector length (arg 1)

Create Background Area

bg <- create_background(monticola, method = "ecoregion")
mapview(bg)

Sample Pseudo-Absences

monticola_dat <- sdm_data(monticola, bg = bg, n = 10000)
monticola_dat
## Simple feature collection with 10260 features and 2 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -9.252936 ymin: 36.61092 xmax: 2.587413 ymax: 46.23354
## Geodetic CRS:  WGS 84
## First 10 features:
##    present pnt_origin                       pnts
## 1  present       data POINT (-5.171215 43.06957)
## 2  present       data POINT (-6.036635 43.02531)
## 3  present       data POINT (-7.679727 40.38852)
## 4  present       data POINT (-7.790437 40.30959)
## 5  present       data  POINT (-7.47334 43.78935)
## 6  present       data  POINT (-6.575039 42.9107)
## 7  present       data POINT (-5.132756 43.49572)
## 8  present       data POINT (-7.787378 40.39362)
## 9  present       data  POINT (-4.941888 43.3531)
## 10 present       data  POINT (-7.621731 40.3417)
mapview(monticola_dat, zcol = "present")

Add environmental variables

monticola_dat <- add_env_vars(monticola_dat, euro.worldclim)
monticola_dat
## Simple feature collection with 10260 features and 21 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -9.252936 ymin: 36.61092 xmax: 2.587413 ymax: 46.23354
## Geodetic CRS:  WGS 84
## First 10 features:
##    present pnt_origin bio1 bio2 bio3 bio4 bio5 bio6 bio7 bio8 bio9 bio10 bio11
## 1  present       data   78  100   40 5056  223  -26  249   52  145   146    18
## 2  present       data   76  100   40 5052  220  -26  246   48  143   144    16
## 3  present       data  137   94   38 5318  281   34  247   72  208   208    72
## 4  present       data  129   88   36 5333  271   29  242   65  200   200    65
## 5  present       data  140   76   42 3636  237   58  179  101  185   189    95
## 6  present       data   84   99   40 5134  228  -19  247   28  152   152    23
## 7  present       data  133   78   41 3998  235   45  190  115  184   186    84
## 8  present       data  137   94   38 5318  281   34  247   72  208   208    72
## 9  present       data  128   78   40 4073  233   39  194  111  180   183    78
## 10 present       data  101   76   33 5359  236    7  229   39  173   173    39
##    bio12 bio13 bio14 bio15 bio16 bio17 bio18 bio19                       pnts
## 1    917   113    48    24   305   169   171   251 POINT (-5.171215 43.06957)
## 2   1012   128    46    28   345   166   171   302 POINT (-6.036635 43.02531)
## 3   1143   172    11    55   465    72    72   465 POINT (-7.679727 40.38852)
## 4   1231   187    12    56   504    76    76   504 POINT (-7.790437 40.30959)
## 5    931   122    32    34   336   122   142   298  POINT (-7.47334 43.78935)
## 6   1012   129    39    32   358   145   145   328  POINT (-6.575039 42.9107)
## 7    822   110    40    28   284   145   160   213 POINT (-5.132756 43.49572)
## 8   1143   172    11    55   465    72    72   465 POINT (-7.787378 40.39362)
## 9    843   111    41    27   291   149   164   222  POINT (-4.941888 43.3531)
## 10  1514   234    15    57   621    92    92   621  POINT (-7.621731 40.3417)

Remove NAs

monticola_dat <- monticola_dat %>%
  drop_na(bio1:bio19)

mapview(st_sf(monticola_dat), zcol = "bio1")

Spatial Cross Validation

## presence only (po) spatial CV
cv_folds_spat <- po_spatial_buffer_vfold_cv(monticola_dat, presence = "present", n = c(24, 16),
                                             v = 9)

## look at the spatial folds
autoplot(cv_folds_spat)

autoplot(cv_folds_spat$splits[[2]])

autoplot(cv_folds_spat$splits[[8]])

## regular CV for comparison
cv_folds <- vfold_cv(monticola_dat, 9)

Make a recipe

  • Let’s do some transformations
monticola_recipe <- recipe(monticola_dat,
                         vars = c("present",
                                  as_tibble(monticola_dat) %>% select(bio1:bio19) %>% colnames()),
                         roles = c("outcome", rep("predictor", 19))) %>%
  step_YeoJohnson(all_predictors()) %>%
  step_normalize(all_predictors())

monticola_recipe
## Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor         19
## 
## Operations:
## 
## Yeo-Johnson transformation on all_predictors()
## Centering and scaling for all_predictors()

Make Model and Workflow

monticola_mod <- rand_forest() %>%
  set_engine("ranger", importance = "impurity") %>%
  set_mode("classification")

monticola_wf <- workflow() %>%
  add_recipe(monticola_recipe) %>%
  add_model(monticola_mod)

monticola_wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
## 
## • step_YeoJohnson()
## • step_normalize()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (classification)
## 
## Engine-Specific Arguments:
##   importance = impurity
## 
## Computational engine: ranger

Fit Workflow

monticola_fit <- monticola_wf %>%
  fit_resamples(cv_folds,
                control = control_resamples(extract = extract_fit_engine))

monticola_fit %>%
  collect_metrics()
## # A tibble: 2 × 6
##   .metric  .estimator  mean     n std_err .config             
##   <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
## 1 accuracy binary     0.963     9 0.00206 Preprocessor1_Model1
## 2 roc_auc  binary     0.938     9 0.00459 Preprocessor1_Model1

Fit Workflow with Spatial CV

monticola_fit_spat <- monticola_wf %>%
  fit_resamples(cv_folds_spat,
                control = control_resamples(extract = extract_fit_engine))

monticola_fit_spat %>%
  collect_metrics()
## # A tibble: 2 × 6
##   .metric  .estimator  mean     n std_err .config             
##   <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
## 1 accuracy binary     0.968     9 0.00679 Preprocessor1_Model1
## 2 roc_auc  binary     0.755     9 0.0397  Preprocessor1_Model1

Look at folds separately

monticola_fit_spat$.metrics
## [[1]]
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.976 Preprocessor1_Model1
## 2 roc_auc  binary         0.787 Preprocessor1_Model1
## 
## [[2]]
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.983 Preprocessor1_Model1
## 2 roc_auc  binary         0.687 Preprocessor1_Model1
## 
## [[3]]
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.983 Preprocessor1_Model1
## 2 roc_auc  binary         0.681 Preprocessor1_Model1
## 
## [[4]]
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.974 Preprocessor1_Model1
## 2 roc_auc  binary         0.559 Preprocessor1_Model1
## 
## [[5]]
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.922 Preprocessor1_Model1
## 2 roc_auc  binary         0.633 Preprocessor1_Model1
## 
## [[6]]
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.952 Preprocessor1_Model1
## 2 roc_auc  binary         0.885 Preprocessor1_Model1
## 
## [[7]]
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.960 Preprocessor1_Model1
## 2 roc_auc  binary         0.888 Preprocessor1_Model1
## 
## [[8]]
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.985 Preprocessor1_Model1
## 2 roc_auc  binary         0.819 Preprocessor1_Model1
## 
## [[9]]
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.977 Preprocessor1_Model1
## 2 roc_auc  binary         0.857 Preprocessor1_Model1

Importance

Let’s have a look at the importance values determined by the random forest for our variables.

## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
## 
## Attaching package: 'patchwork'
## The following object is masked from 'package:raster':
## 
##     area
monticola_fit %>%
  unnest(.extracts) %>%
  pull(.extracts) %>%
  map(vip) %>%
  wrap_plots(ncol = 3, nrow = 3)

The ordering is reasonably consistent between different folds. Now, the spatial folds:


monticola_fit_spat %>%
  unnest(.extracts) %>%
  pull(.extracts) %>%
  map(vip) %>%
  wrap_plots(ncol = 3, nrow = 3)

Make a prediction dataset

  • We want to make predictions on a landscape
  • Use a grid:
monticola_grid <- sdm_data(monticola, bg,
                         5000, sample_options = list(type = "regular")) %>%
  filter(present == "absent")
## although coordinates are longitude/latitude, st_sample assumes that they are planar
mapview(monticola_grid)

Add environmental variable and predict

monticola_grid_dat <- add_env_vars(monticola_grid, euro.worldclim) %>%
  drop_na(bio1:bio19)

monticola_grid_dat
## Simple feature collection with 3887 features and 21 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -9.230222 ymin: 39.02463 xmax: 2.584783 ymax: 46.13454
## Geodetic CRS:  WGS 84
## First 10 features:
##    present pnt_origin bio1 bio2 bio3 bio4 bio5 bio6 bio7 bio8 bio9 bio10 bio11
## 1   absent     pseudo  167   99   39 5204  315   65  250  112  233   237   104
## 2   absent     pseudo  165  101   39 5367  318   62  256  109  234   238   101
## 3   absent     pseudo  165  101   39 5367  318   62  256  109  234   238   101
## 4   absent     pseudo  164  104   39 5538  321   57  264  105  236   239    97
## 5   absent     pseudo  164  104   39 5538  321   57  264  105  236   239    97
## 6   absent     pseudo  165  109   39 5712  328   53  275  104  239   241    96
## 7   absent     pseudo  166  114   40 5895  334   49  285  102  243   244    94
## 8   absent     pseudo  166  114   40 5895  334   49  285  102  243   244    94
## 9   absent     pseudo  167  115   39 6007  337   46  291  101  245   246    93
## 10  absent     pseudo  165  115   38 6117  339   44  295   98  246   246    90
##    bio12 bio13 bio14 bio15 bio16 bio17 bio18 bio19                       pnts
## 1    670    93     5    56   271    39    40   271 POINT (-7.766416 39.02463)
## 2    674    93     5    55   272    40    41   271 POINT (-7.661859 39.02463)
## 3    674    93     5    55   272    40    41   271 POINT (-7.557301 39.02463)
## 4    669    91     5    55   268    40    41   266 POINT (-7.452744 39.02463)
## 5    669    91     5    55   268    40    41   266 POINT (-7.348186 39.02463)
## 6    636    85     5    53   252    40    41   247 POINT (-7.243628 39.02463)
## 7    594    80     5    52   233    39    39   226 POINT (-7.139071 39.02463)
## 8    594    80     5    52   233    39    39   226 POINT (-7.034513 39.02463)
## 9    552    75     4    51   215    36    37   205 POINT (-6.929956 39.02463)
## 10   534    72     4    51   206    35    36   194 POINT (-6.825398 39.02463)
final_fit <- monticola_wf %>%
  fit(monticola_dat)

monticola_preds <- final_fit %>% 
  augment(monticola_grid_dat)

Plot predictions

coords <- st_coordinates(st_sf(monticola_preds))
ggplot(monticola_preds %>% bind_cols(coords), aes(X, Y)) +
  geom_sf(data = bg, inherit.aes = FALSE) +
  geom_raster(aes(fill = .pred_present + 0.0001)) +
  geom_sf(data = monticola, inherit.aes = FALSE, colour = "red",
          size = 1.2) +
  scale_fill_continuous(name = "Probability of Occurrence",
                        trans = "logit") +
  theme_minimal()
## Warning: Raster pixels are placed at uneven vertical intervals and will be
## shifted. Consider using geom_tile() instead.

Final Projects

  • Next week we will all try and do a full SDM on GBIF data
  • No lectures, both classes will be for working
  • Hand in at the end of next week
  • There will be no assignment this week
  • This assignment can be used as part of the final project

Final Projects

  • Can be done in groups (max 6) or individually, but everyone will hand in their own work
  • Groups can share code / work together on working out how to run the models
  • Three Sub-assignments:
    1. Project Plan
    2. Project Code and Results
    3. Project Presentation (Lightning Talk)
  • Details will be up on the course website by next week’s first class

1. Project Plan (Preliminary Research)

  • A form on Canvas to fill out with:
    • Name of chosen species
    • Group members if applicable
    • Written Sections:
      • Species: Background research on the species (including drawing from academic article if available), their natural history.
      • Variables: In an ideal world, what variables would be the most useful to model this species, based on what you discover about their biology?
      • Models: What kind of model will you run? Research and choose 1 method, briefly explain how it works to the best of your ability. What hyper-parameters does it have?
    • All sections will be short (less than 300 words)

Next Week

  • Do an SDM on your GBIF data