Use apply() with a simple features (SF) function

Question

I've written a function to calculate the maximum distance between a centroid and the edge of its polygon, but I can't figure out how to run it on each individual polygon of a simple features ("sf) data.frame.

library(sf)

distance.func <- function(polygon){
  max(st_distance(st_cast(polygon, "POINT"), st_centroid(polygon)))
}

If I test the function on a single polygon it works. (The warning messages are irrelevant to the current issue).

nc <- st_read(system.file("shape/nc.shp", package="sf")) # built in w/package
nc.1row <- nc[c(1),] # Just keep the first polygon

>distance.func(nc.1row)
24309.07 m
Warning messages:
1: In st_cast.sf(polygon, "POINT") :
   repeating attributes for all sub-geometries for which they may not be constant
2: In st_centroid.sfc(st_geometry(x), of_largest_polygon = of_largest_polygon) :
   st_centroid does not give correct centroids for longitude/latitude data

The problem is applying this function to the entire data.frame.

nc$distance <- apply(nc, 1, distance.func)
Error in UseMethod("st_cast") :
 no applicable method for 'st_cast' applied to an object of class "list"

What can I do to run this function (or one like it) for each individual polygon in an object of class "sf"?

lbusett · Accepted Answer

The problem here is that using apply-like functions directly on sf object is "problematic" because the geometry column is a list-column, which does not interact well with "apply" constructs.

The simplest workaround could be to just use a for loop:

library(sf)

nc <- st_read(system.file("shape/nc.shp", package="sf")) %>% 
  st_transform(3857)

distance.func <- function(polygon){
  max(st_distance(st_cast(polygon, "POINT"), st_centroid(polygon)))
}
dist <- list()
for (i in seq_along(nc[[1]])) dist[[i]] <- distance.func(nc[i,]) 

head(unlist(dist))
# [1] 30185.34 27001.39 34708.57 52751.61 57273.54 34598.17

, but it is quite slow.

To be able to use apply-like functions, you need to pass to the function only the geometry column of the object. Something like this would work:

library(purrr)

distance.func_lapply <- function(polygon){
  polygon <- st_sfc(polygon)
  max(st_distance(st_cast(polygon, "POINT"), st_centroid(polygon)))
}

dist_lapply <- lapply(st_geometry(nc),  distance.func_lapply)
dist_map    <- purrr::map(st_geometry(nc), distance.func_lapply)

all.equal(dist, dist_lapply)
# [1] TRUE
all.equal(dist, dist_map)
# [1] TRUE

Note however that I had to slighlty modify the distance function, adding an st_sfc call, because otherwise you get a lot of "In st_cast.MULTIPOLYGON(polygon, "POINT") : point from first coordinate only" warnings, and the results are not correct (I did not investigate the reason for this - apparently st_cast behaves differently on sfg objects than on sfc ones).

In terms of speed, both the lapply and the map solutions outperform the for loop by almost an order of magnitude:

microbenchmark::microbenchmark(
  forloop = {for (i in seq_along(nc[[1]])) dist[[i]] <- distance.func(nc[i,])}, 
  map     = {dist_map <- purrr::map(st_geometry(nc), distance.func_lapply)},  
  lapply  = {dist_lapply <- lapply(st_geometry(nc),  distance.func_lapply)}, times = 10)

Unit: milliseconds
    expr      min       lq     mean   median       uq       max neval
 forloop 904.8827 919.5636 936.2214 920.7451 929.7186 1076.9646    10
     map 122.7597 124.9074 126.1796 126.3326 127.6940  128.7551    10
  lapply 122.9131 125.3699 126.9642 126.8100 129.3791  131.2675    10

Jean-François Bourdon · Answer

There is an other way to apply over simple features albeit not really better than using a for loop. You can first create a list of simple features with lapply before applying your distance function.

distance.func <- function(polygon){
  max(st_distance(st_cast(polygon, "POINT"), st_centroid(polygon)))
}

distance.func.ls_sf <- function(sf){
  ls_sf <- lapply(1:nrow(sf), function(x, sf) {sf[x,]}, sf)
  dist <- lapply(ls_sf, distance.func)
}

dist_lapply_ls_sf <- distance.func.ls_sf(nc)

all.equal(dist, dist_lapply_ls_sf)
# [1] TRUE

The performance is almost as poor as a for loop... and it even seems that 4 years later (now R 4.1.1 with sf 1.0-3), it's almost two orders of magnitude worst (instead of one) than lapply or map using st_geometry(nc)...

microbenchmark::microbenchmark(
  forloop = {for (i in seq_along(nc[[1]])) dist[[i]] <- distance.func(nc[i,])}, 
  map     = {dist_map <- purrr::map(st_geometry(nc), distance.func_lapply)},  
  lapply  = {dist_lapply <- lapply(st_geometry(nc),  distance.func_lapply)},
  ls_sf   = {dist_lapply_ls_sf <- distance.func.ls_sf(nc)},
  times = 10)

Unit: milliseconds
    expr       min        lq      mean    median        uq       max neval
 forloop 7726.9337 7744.7534 7837.6937 7781.2301 7850.7447 8221.2092    10
     map  124.1067  126.2212  135.1502  128.4745  130.2372  182.1479    10
  lapply  122.0224  125.6585  130.6488  127.9388  134.1495  147.9301    10
   ls_sf 7722.1066 7733.8204 7785.8104 7775.5011 7814.3849 7911.3466    10

So it's a bad solution unless the function you are applying to the simple feature object is taking much more time to compute than st_distance().

What if you need the attributes ?

If your function needs both geometries and attributes part of the sf object, using mapply is a good way to go. Here is an example computing the Sudden Infant Death density (SID/km²) using three methods:

for
extracting each features before using lapply
mapply

microbenchmark::microbenchmark(
  forLoop =
    {
      sid.density.for <- vector("list", nrow(nc))
      for (i in seq(nrow(nc))) sid.density.for[[i]] <- nc[i,][["SID74"]] / st_area(nc[i,]) / 1000^2
    },
  list_nc =
    {
      list_nc <- lapply(seq(nrow(nc)), function(x, nc) { nc[x,] }, nc)
      sid.density.lapply <- lapply(list_nc, function(x) { x[["SID74"]] / as.numeric(st_area(x)) / 1000^2 })
    },
  mapply =
    {
      sid.density.func <- function(geometry, attribute) { attribute / st_area(geometry) / 1000^2 }
      sid.density.mapply <- mapply(sid.density.func, st_geometry(nc), nc[["SID74"]], SIMPLIFY = FALSE)
    },
  times = 10)

Unit: milliseconds
    expr       min        lq       mean     median        uq       max neval
 forLoop 4511.7203 4515.5997 4557.73503 4542.75200 4560.5508 4707.2877    10
 list_nc 4356.3801 4400.5640 4455.35743 4440.38775 4475.2213 4717.5218    10
  mapply   17.4783   17.6885   18.20704   17.99295   18.3078   20.1121    10

Use apply() with a simple features (SF) function

Tags:

r

gis

r-sf

John J.

2 Answers

lbusett

What if you need the attributes ?

Jean-François Bourdon

Recent Activity

Donate For Us

Use apply() with a simple features (SF) function

Tags:

r

gis

r-sf

John J.

2 Answers

lbusett

What if you need the attributes ?

Jean-François Bourdon

Related questions

Recent Activity

Donate For Us