Forests of the Future: Week 10 Lecture Part 2 • fofpack

Last class

We saw how to add ‘pseudo-absences’ to a presence-only data set to make modelling possible

This class

How to do spatial cross validation on presence / pseudo-absence data
Talk about the final project

An example SDM

Let’s run through a full SDM analysis
We will use an example dataset in the ENMTools package

library(ENMTools)
library(tidyverse)
library(tidymodels)
library(spatialsample)
library(tidysdm)
library(sf)
library(mapview)

data("iberolacerta.clade")
data("euro.worldclim")
monticola <- iberolacerta.clade$species$monticola

monticola <- st_as_sf(monticola$presence.points, 
                      coords = c("Longitude", "Latitude"),
                      crs = 4326)
mapview(monticola)

## Warning in cbind(`Feature ID` = fid, mat): number of rows of result is not a
## multiple of vector length (arg 1)

Create Background Area

bg <- create_background(monticola, method = "ecoregion")
mapview(bg)

Sample Pseudo-Absences

monticola_dat <- sdm_data(monticola, bg = bg, n = 10000)
monticola_dat

## Simple feature collection with 10260 features and 2 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -9.252936 ymin: 36.61092 xmax: 2.587413 ymax: 46.23354
## Geodetic CRS:  WGS 84
## First 10 features:
##    present pnt_origin                       pnts
## 1  present       data POINT (-5.171215 43.06957)
## 2  present       data POINT (-6.036635 43.02531)
## 3  present       data POINT (-7.679727 40.38852)
## 4  present       data POINT (-7.790437 40.30959)
## 5  present       data  POINT (-7.47334 43.78935)
## 6  present       data  POINT (-6.575039 42.9107)
## 7  present       data POINT (-5.132756 43.49572)
## 8  present       data POINT (-7.787378 40.39362)
## 9  present       data  POINT (-4.941888 43.3531)
## 10 present       data  POINT (-7.621731 40.3417)

mapview(monticola_dat, zcol = "present")

Add environmental variables

monticola_dat <- add_env_vars(monticola_dat, euro.worldclim)
monticola_dat

## Simple feature collection with 10260 features and 21 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -9.252936 ymin: 36.61092 xmax: 2.587413 ymax: 46.23354
## Geodetic CRS:  WGS 84
## First 10 features:
##    present pnt_origin bio1 bio2 bio3 bio4 bio5 bio6 bio7 bio8 bio9 bio10 bio11
## 1  present       data   78  100   40 5056  223  -26  249   52  145   146    18
## 2  present       data   76  100   40 5052  220  -26  246   48  143   144    16
## 3  present       data  137   94   38 5318  281   34  247   72  208   208    72
## 4  present       data  129   88   36 5333  271   29  242   65  200   200    65
## 5  present       data  140   76   42 3636  237   58  179  101  185   189    95
## 6  present       data   84   99   40 5134  228  -19  247   28  152   152    23
## 7  present       data  133   78   41 3998  235   45  190  115  184   186    84
## 8  present       data  137   94   38 5318  281   34  247   72  208   208    72
## 9  present       data  128   78   40 4073  233   39  194  111  180   183    78
## 10 present       data  101   76   33 5359  236    7  229   39  173   173    39
##    bio12 bio13 bio14 bio15 bio16 bio17 bio18 bio19                       pnts
## 1    917   113    48    24   305   169   171   251 POINT (-5.171215 43.06957)
## 2   1012   128    46    28   345   166   171   302 POINT (-6.036635 43.02531)
## 3   1143   172    11    55   465    72    72   465 POINT (-7.679727 40.38852)
## 4   1231   187    12    56   504    76    76   504 POINT (-7.790437 40.30959)
## 5    931   122    32    34   336   122   142   298  POINT (-7.47334 43.78935)
## 6   1012   129    39    32   358   145   145   328  POINT (-6.575039 42.9107)
## 7    822   110    40    28   284   145   160   213 POINT (-5.132756 43.49572)
## 8   1143   172    11    55   465    72    72   465 POINT (-7.787378 40.39362)
## 9    843   111    41    27   291   149   164   222  POINT (-4.941888 43.3531)
## 10  1514   234    15    57   621    92    92   621  POINT (-7.621731 40.3417)

Remove NAs

monticola_dat <- monticola_dat %>%
  drop_na(bio1:bio19)

mapview(st_sf(monticola_dat), zcol = "bio1")

Spatial Cross Validation

## presence only (po) spatial CV
cv_folds_spat <- po_spatial_buffer_vfold_cv(monticola_dat, presence = "present", n = c(24, 16),
                                             v = 9)

## look at the spatial folds
autoplot(cv_folds_spat)

autoplot(cv_folds_spat$splits[[2]])

autoplot(cv_folds_spat$splits[[8]])

## regular CV for comparison
cv_folds <- vfold_cv(monticola_dat, 9)

Make a recipe

Let’s do some transformations

monticola_recipe <- recipe(monticola_dat,
                         vars = c("present",
                                  as_tibble(monticola_dat) %>% select(bio1:bio19) %>% colnames()),
                         roles = c("outcome", rep("predictor", 19))) %>%
  step_YeoJohnson(all_predictors()) %>%
  step_normalize(all_predictors())

monticola_recipe

## Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor         19
## 
## Operations:
## 
## Yeo-Johnson transformation on all_predictors()
## Centering and scaling for all_predictors()

Make Model and Workflow

monticola_mod <- rand_forest() %>%
  set_engine("ranger", importance = "impurity") %>%
  set_mode("classification")

monticola_wf <- workflow() %>%
  add_recipe(monticola_recipe) %>%
  add_model(monticola_mod)

monticola_wf

## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
## 
## • step_YeoJohnson()
## • step_normalize()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (classification)
## 
## Engine-Specific Arguments:
##   importance = impurity
## 
## Computational engine: ranger

Fit Workflow

monticola_fit <- monticola_wf %>%
  fit_resamples(cv_folds,
                control = control_resamples(extract = extract_fit_engine))

monticola_fit %>%
  collect_metrics()

## # A tibble: 2 × 6
##   .metric  .estimator  mean     n std_err .config             
##   <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
## 1 accuracy binary     0.963     9 0.00206 Preprocessor1_Model1
## 2 roc_auc  binary     0.938     9 0.00459 Preprocessor1_Model1

Fit Workflow with Spatial CV

monticola_fit_spat <- monticola_wf %>%
  fit_resamples(cv_folds_spat,
                control = control_resamples(extract = extract_fit_engine))

monticola_fit_spat %>%
  collect_metrics()

## # A tibble: 2 × 6
##   .metric  .estimator  mean     n std_err .config             
##   <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
## 1 accuracy binary     0.968     9 0.00679 Preprocessor1_Model1
## 2 roc_auc  binary     0.755     9 0.0397  Preprocessor1_Model1

Look at folds separately

monticola_fit_spat$.metrics

## [[1]]
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.976 Preprocessor1_Model1
## 2 roc_auc  binary         0.787 Preprocessor1_Model1
## 
## [[2]]
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.983 Preprocessor1_Model1
## 2 roc_auc  binary         0.687 Preprocessor1_Model1
## 
## [[3]]
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.983 Preprocessor1_Model1
## 2 roc_auc  binary         0.681 Preprocessor1_Model1
## 
## [[4]]
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.974 Preprocessor1_Model1
## 2 roc_auc  binary         0.559 Preprocessor1_Model1
## 
## [[5]]
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.922 Preprocessor1_Model1
## 2 roc_auc  binary         0.633 Preprocessor1_Model1
## 
## [[6]]
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.952 Preprocessor1_Model1
## 2 roc_auc  binary         0.885 Preprocessor1_Model1
## 
## [[7]]
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.960 Preprocessor1_Model1
## 2 roc_auc  binary         0.888 Preprocessor1_Model1
## 
## [[8]]
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.985 Preprocessor1_Model1
## 2 roc_auc  binary         0.819 Preprocessor1_Model1
## 
## [[9]]
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.977 Preprocessor1_Model1
## 2 roc_auc  binary         0.857 Preprocessor1_Model1

Importance

Let’s have a look at the importance values determined by the random forest for our variables.

library(vip)

## 
## Attaching package: 'vip'

## The following object is masked from 'package:utils':
## 
##     vi

library(patchwork)

## 
## Attaching package: 'patchwork'

## The following object is masked from 'package:raster':
## 
##     area

monticola_fit %>%
  unnest(.extracts) %>%
  pull(.extracts) %>%
  map(vip) %>%
  wrap_plots(ncol = 3, nrow = 3)

The ordering is reasonably consistent between different folds. Now, the spatial folds:

monticola_fit_spat %>%
  unnest(.extracts) %>%
  pull(.extracts) %>%
  map(vip) %>%
  wrap_plots(ncol = 3, nrow = 3)

Make a prediction dataset

We want to make predictions on a landscape
Use a grid:

monticola_grid <- sdm_data(monticola, bg,
                         5000, sample_options = list(type = "regular")) %>%
  filter(present == "absent")

## although coordinates are longitude/latitude, st_sample assumes that they are planar

mapview(monticola_grid)

Add environmental variable and predict

monticola_grid_dat <- add_env_vars(monticola_grid, euro.worldclim) %>%
  drop_na(bio1:bio19)

monticola_grid_dat

## Simple feature collection with 3887 features and 21 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -9.230222 ymin: 39.02463 xmax: 2.584783 ymax: 46.13454
## Geodetic CRS:  WGS 84
## First 10 features:
##    present pnt_origin bio1 bio2 bio3 bio4 bio5 bio6 bio7 bio8 bio9 bio10 bio11
## 1   absent     pseudo  167   99   39 5204  315   65  250  112  233   237   104
## 2   absent     pseudo  165  101   39 5367  318   62  256  109  234   238   101
## 3   absent     pseudo  165  101   39 5367  318   62  256  109  234   238   101
## 4   absent     pseudo  164  104   39 5538  321   57  264  105  236   239    97
## 5   absent     pseudo  164  104   39 5538  321   57  264  105  236   239    97
## 6   absent     pseudo  165  109   39 5712  328   53  275  104  239   241    96
## 7   absent     pseudo  166  114   40 5895  334   49  285  102  243   244    94
## 8   absent     pseudo  166  114   40 5895  334   49  285  102  243   244    94
## 9   absent     pseudo  167  115   39 6007  337   46  291  101  245   246    93
## 10  absent     pseudo  165  115   38 6117  339   44  295   98  246   246    90
##    bio12 bio13 bio14 bio15 bio16 bio17 bio18 bio19                       pnts
## 1    670    93     5    56   271    39    40   271 POINT (-7.766416 39.02463)
## 2    674    93     5    55   272    40    41   271 POINT (-7.661859 39.02463)
## 3    674    93     5    55   272    40    41   271 POINT (-7.557301 39.02463)
## 4    669    91     5    55   268    40    41   266 POINT (-7.452744 39.02463)
## 5    669    91     5    55   268    40    41   266 POINT (-7.348186 39.02463)
## 6    636    85     5    53   252    40    41   247 POINT (-7.243628 39.02463)
## 7    594    80     5    52   233    39    39   226 POINT (-7.139071 39.02463)
## 8    594    80     5    52   233    39    39   226 POINT (-7.034513 39.02463)
## 9    552    75     4    51   215    36    37   205 POINT (-6.929956 39.02463)
## 10   534    72     4    51   206    35    36   194 POINT (-6.825398 39.02463)

final_fit <- monticola_wf %>%
  fit(monticola_dat)

monticola_preds <- final_fit %>% 
  augment(monticola_grid_dat)

Plot predictions

coords <- st_coordinates(st_sf(monticola_preds))
ggplot(monticola_preds %>% bind_cols(coords), aes(X, Y)) +
  geom_sf(data = bg, inherit.aes = FALSE) +
  geom_raster(aes(fill = .pred_present + 0.0001)) +
  geom_sf(data = monticola, inherit.aes = FALSE, colour = "red",
          size = 1.2) +
  scale_fill_continuous(name = "Probability of Occurrence",
                        trans = "logit") +
  theme_minimal()

## Warning: Raster pixels are placed at uneven vertical intervals and will be
## shifted. Consider using geom_tile() instead.

Final Projects

Next week we will all try and do a full SDM on GBIF data
No lectures, both classes will be for working
Hand in at the end of next week
There will be no assignment this week
This assignment can be used as part of the final project

Final Projects

Can be done in groups (max 6) or individually, but everyone will hand in their own work
Groups can share code / work together on working out how to run the models
Three Sub-assignments:
1. Project Plan
2. Project Code and Results
3. Project Presentation (Lightning Talk)
Details will be up on the course website by next week’s first class

1. Project Plan (Preliminary Research)

A form on Canvas to fill out with:
- Name of chosen species
- Group members if applicable
- Written Sections:
  - Species: Background research on the species (including drawing from academic article if available), their natural history.
  - Variables: In an ideal world, what variables would be the most useful to model this species, based on what you discover about their biology?
  - Models: What kind of model will you run? Research and choose 1 method, briefly explain how it works to the best of your ability. What hyper-parameters does it have?
- All sections will be short (less than 300 words)

Next Week

Do an SDM on your GBIF data