I have a data frame were each row is a unique observation.
Observations overlap in time if they are located within a specified temporal distance (e.g. 30 days) of one another.
Observations overlap in space if they are located within a specified spatial distance (e.g. 20 kilometers) of one another.
I am working with the collections of observations that overlap in both time and space. I want to make a column (overlaps) that contains vectors with the ids of the observations that overlap with an observation. I have tried the solution below, but the run time is too poor for the solution to be applicable.
library(dplyr)
library(lubridate)
library(purrr)
library(geosphere)
spat_proximity <- function(x, y, z) {
return(which(map_dbl(y, ~ distGeo(., x)) <= z))}
temp_proximity <- function(x, y, z) {
return(which(map_dbl(y, ~ abs(x - .)) <= z))}
test %>%
mutate(overlaps = map2(map(place, ~ spat_proximity(., place, 20000)),
map(time, ~ temp_proximity(., time, 30)),
~ intersect(.x, .y)))
Ideas on how to speed things up would be much appreciated.
Desired output
structure(list(id = 1:42, time = structure(c(1478601762, 1475170279,
1469770219, 1462441336, 1474739469, 1488216507, 1475203721, 1468705558,
1481722718, 1485897197, 1488669576, 1501288618, 1510266595, 1516828588,
1497048175, 1516546144, 1507576242, 1517654363, 1496070298, 1519765220,
1507408104, 1532046710, 1542196446, 1534747170, 1533605231, 1521381844,
1545389880, 1537988628, 1544304998, 1524842149, 1551051077, 1540822870,
1579775599, 1580337175, 1551486497, 1554879837, 1568620434, 1568701543,
1556387550, 1561253396, 1582925482, 1562166384), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), place = list(c(7.59729413351368, 52.6052275122351
), c(9.99728447956781, 53.43773657253), c(10.1114473929533, 53.1295890148866
), c(7.74115218835801, 53.555354690339), c(9.82895066827581,
53.1009319396015), c(10.061107415855, 53.1908752763309), c(10.1134381934544,
53.1450558612239), c(8.59001735546083, 53.1767797285482), c(6.43939168487555,
52.5520931654252), c(8.38811111096636, 53.9043055557574), c(6.20061916537948,
52.462037409576), c(8.66656282486832, 52.8269702466929), c(9.92127490588442,
53.1240045666796), c(9.77810957468704, 53.1445777603789), c(10.0972382106036,
53.1604265989175), c(10.0473952445094, 53.1698097395641), c(9.23773401919961,
53.2120381900218), c(8.29524237837988, 52.822332696399), c(6.63690696797941,
53.4436726627048), c(6.89839325296288, 53.947454203445), c(6.97064542834721,
54.2487197094445), c(9.98865072631714, 53.4088944299342), c(9.94164401569524,
53.1500576073959), c(9.64242996587752, 52.9285470044703), c(10.1026940185685,
53.1635394335485), c(9.94874529044194, 53.2202512735354), c(8.8025526552284,
53.2423093779114), c(7.93352467761445, 52.9129105531343), c(6.6418846001424,
53.2459031608081), c(7.56102465003101, 53.5306444680171), c(7.36619114998468,
53.748869508885), c(7.40993284414052, 54.5367797663042), c(9.90022663895919,
53.3726361099083), c(9.41110555596208, 52.5001044709056), c(10.1151193231519,
53.1539029361817), c(10.1064400828529, 53.1793449776572), c(9.94235711256256,
53.2622041055899), c(9.44215997717822, 53.4799339987572), c(7.03832846889284,
53.1986115213435), c(7.32755360272354, 53.416700338513), c(7.57828611098173,
53.6107027769073), c(7.55411005022882, 54.1905803935834)), overlaps = list(
1L, 2L, 3L, 4L, c(5L, 7L), 6L, c(5L, 7L), 8L, 9L, 10L, 11L,
12L, 13L, c(14L, 16L), 15L, c(14L, 16L), 17L, 18L, 19L, 20L,
21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L,
33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L)), row.names = c(NA,
-42L), class = c("tbl_df", "tbl", "data.frame"))
Data
structure(list(id = 1:42, time = structure(c(1478601762, 1475170279,
1469770219, 1462441336, 1474739469, 1488216507, 1475203721, 1468705558,
1481722718, 1485897197, 1488669576, 1501288618, 1510266595, 1516828588,
1497048175, 1516546144, 1507576242, 1517654363, 1496070298, 1519765220,
1507408104, 1532046710, 1542196446, 1534747170, 1533605231, 1521381844,
1545389880, 1537988628, 1544304998, 1524842149, 1551051077, 1540822870,
1579775599, 1580337175, 1551486497, 1554879837, 1568620434, 1568701543,
1556387550, 1561253396, 1582925482, 1562166384), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), place = list(c(7.59729413351368, 52.6052275122351
), c(9.99728447956781, 53.43773657253), c(10.1114473929533, 53.1295890148866
), c(7.74115218835801, 53.555354690339), c(9.82895066827581,
53.1009319396015), c(10.061107415855, 53.1908752763309), c(10.1134381934544,
53.1450558612239), c(8.59001735546083, 53.1767797285482), c(6.43939168487555,
52.5520931654252), c(8.38811111096636, 53.9043055557574), c(6.20061916537948,
52.462037409576), c(8.66656282486832, 52.8269702466929), c(9.92127490588442,
53.1240045666796), c(9.77810957468704, 53.1445777603789), c(10.0972382106036,
53.1604265989175), c(10.0473952445094, 53.1698097395641), c(9.23773401919961,
53.2120381900218), c(8.29524237837988, 52.822332696399), c(6.63690696797941,
53.4436726627048), c(6.89839325296288, 53.947454203445), c(6.97064542834721,
54.2487197094445), c(9.98865072631714, 53.4088944299342), c(9.94164401569524,
53.1500576073959), c(9.64242996587752, 52.9285470044703), c(10.1026940185685,
53.1635394335485), c(9.94874529044194, 53.2202512735354), c(8.8025526552284,
53.2423093779114), c(7.93352467761445, 52.9129105531343), c(6.6418846001424,
53.2459031608081), c(7.56102465003101, 53.5306444680171), c(7.36619114998468,
53.748869508885), c(7.40993284414052, 54.5367797663042), c(9.90022663895919,
53.3726361099083), c(9.41110555596208, 52.5001044709056), c(10.1151193231519,
53.1539029361817), c(10.1064400828529, 53.1793449776572), c(9.94235711256256,
53.2622041055899), c(9.44215997717822, 53.4799339987572), c(7.03832846889284,
53.1986115213435), c(7.32755360272354, 53.416700338513), c(7.57828611098173,
53.6107027769073), c(7.55411005022882, 54.1905803935834))), row.names = c(NA,
-42L), class = c("tbl_df", "tbl", "data.frame"))
If you truly want speed you can write your own C++ code to calculate the distance (because geosphere
is quite slow) and time comparisons
Save this code in a file, for example "~/Desktop/find_overlaps.cpp"
you need to install Rcpp
- install.packages("Rcpp")
#include "Rcpp.h"
#include <math.h>
static const double earth = 6378137.0; // WSG-84 definition
// haversine formula taken from the geodist library
// - https://github.com/hypertidy/geodist
double distance_haversine(double x1, double y1, double x2, double y2) {
double cosy1 = cos( y1 * M_PI / 180.0 );
double cosy2 = cos( y2 * M_PI / 180.0 );
double sxd = sin ((x2 - x1) * M_PI / 360.0);
double syd = sin ((y2 - y1) * M_PI / 360.0);
double d = syd * syd + cosy1 * cosy2 * sxd * sxd;
d = 2.0 * earth * asin (sqrt (d));
return (d);
}
// returns true if second date is within 30 days of the first
bool within_days( int first_date, int second_date ) {
int days = 30 * 24 * 60 * 60;
int lower_bound = first_date - days;
int upper_bound = first_date + days;
return lower_bound <= second_date && second_date <= upper_bound;
}
bool within_distance( Rcpp::NumericVector start_place, Rcpp::NumericVector end_place, double distance_limit = 20000.0 ) {
double x1 = start_place[0];
double y1 = start_place[1];
double x2 = end_place[0];
double y2 = end_place[1];
return distance_haversine(x1, y1, x2, y2) <= distance_limit;
}
// [[Rcpp::export]]
SEXP find_overlaps( Rcpp::NumericVector ids, Rcpp::IntegerVector dates, Rcpp::List place ) {
R_xlen_t n = dates.length();
R_xlen_t i, j;
Rcpp::List res( n );
R_xlen_t result_counter;
for( i = 0; i < n; ++i ) {
Rcpp::IntegerVector overlaps( n ); // initialise vector to store results
result_counter = 0;
for( j = 0; j < n; ++j ) {
// ignore self-comparisons
if( i != j ) {
int first_date = dates[ i ];
int second_date = dates[ j ];
Rcpp::NumericVector first_place = place[ i ];
Rcpp::NumericVector second_place = place[ j ];
// check the place values exist
if( first_place.length() != 2 || second_place.length() != 2 ) {
continue;
}
if( within_days( first_date, second_date) && within_distance( first_place, second_place ) ) {
overlaps[ result_counter ] = j;
result_counter++;
}
}
if( result_counter > 0 ) {
Rcpp::IntegerVector id_idx = overlaps[ Rcpp::Range( 0, result_counter - 1 ) ];
res[ i ] = ids[ id_idx ];
}
}
}
return res;
}
Then source it in R
library(Rcpp)
Rcpp::sourceCpp(file = "~/Desktop/find_overlaps.cpp")
res <- find_overlaps( df$id, df$time, df$place )
df$overlaps <- res
df
# id time place overlaps
# 1 1 2016-11-08 10:42:42 7.597294, 52.605228 NULL
# 2 2 2016-09-29 17:31:19 9.997284, 53.437737 NULL
# 3 3 2016-07-29 05:30:19 10.11145, 53.12959 NULL
# 4 4 2016-05-05 09:42:16 7.741152, 53.555355 NULL
# 5 5 2016-09-24 17:51:09 9.828951, 53.100932 7
# 6 6 2017-02-27 17:28:27 10.06111, 53.19088 NULL
# 7 7 2016-09-30 02:48:41 10.11344, 53.14506 5
# 8 8 2016-07-16 21:45:58 8.590017, 53.176780 NULL
# 9 9 2016-12-14 13:38:38 6.439392, 52.552093 NULL
# 10 10 2017-01-31 21:13:17 8.388111, 53.904306 NULL
# 11 11 2017-03-04 23:19:36 6.200619, 52.462037 NULL
# 12 12 2017-07-29 00:36:58 8.666563, 52.826970 NULL
# 13 13 2017-11-09 22:29:55 9.921275, 53.124005 NULL
# 14 14 2018-01-24 21:16:28 9.77811, 53.14458 16
# 15 15 2017-06-09 22:42:55 10.09724, 53.16043 NULL
# 16 16 2018-01-21 14:49:04 10.04740, 53.16981 14
# 17 17 2017-10-09 19:10:42 9.237734, 53.212038 NULL
# 18 18 2018-02-03 10:39:23 8.295242, 52.822333 NULL
# 19 19 2017-05-29 15:04:58 6.636907, 53.443673 NULL
# 20 20 2018-02-27 21:00:20 6.898393, 53.947454 NULL
# ...
NULL
values)A possible solution using spatialrisk
and dplyr
. The key functions in spatialrisk
are written in C++ (Rcpp), and are therefore very fast.
spatialrisk::points_in_circle()
calculates the observations within radius from a center point. Note that distances are calculated using the Haversine formula.
library(spatialrisk)
library(dplyr)
# Find rows within time period
overlap_time_fn <- function(df, days, time_ref){
seconds <- days * 24 * 3600
df %>%
filter(time < time_ref + seconds, time > time_ref - seconds)
}
# Define radius in meters
dist <- 20000
# Define number of days
days <- 30
df %>%
mutate(lon = place[[1]][1]) %>% # add lon column
mutate(lat = place[[1]][2]) %>% # add lat column
rowwise() %>%
mutate(overlap_spat = toString(points_in_circle(., lon_center = lon,
lat_center = lat,
radius = dist)$id)) %>%
mutate(overlap_time = toString(overlap_time_fn(., days = days,
time_ref = time)$id))
A possible solution with data.table
:
id
On example dataset, performance comparison shows around x20
improvement, see below.
library(dplyr)
library(purrr)
library(geosphere)
library(data.table)
maxtime <- 30
maxdist <- 20000
setDT(test)
microbenchmark::microbenchmark( datatable = {# Calculate box around each point
test[,c('x','y','boxxmin','boxxmax','boxymin','boxymax','timemin','timemax'):=.(
sapply(place,function(x) x[1]),
sapply(place,function(x) x[2]),
sapply(place,function(x) x[1] - maxdist),
sapply(place,function(x) x[1] + maxdist),
sapply(place,function(x) x[2] - maxdist),
sapply(place,function(x) x[2] + maxdist),
sapply(time,function(t) t - maxtime * 86400),
sapply(time,function(t) t + maxtime * 86400))]
# Select near enough places
result <- test[test,.(place,i.id,i.time,i.place,i.x,i.y,id,x,y,time, place),on=.(x>=boxxmin, y>=boxymin, x <= boxxmax, y<= boxymax , time >= timemin, time <= timemax)]
# Calculate spatio-temporal distances
result[,c('dist','dt'):=.(distGeo(matrix(unlist(i.place),ncol=2,byrow=T),matrix(unlist(place),ncol=2,byrow=T)),
abs(as.numeric(difftime(i.time,time,unit = 'days')))) ]
# Calculate overlaps
overlaps <- result[dt < 30 & dist < 20000 ][,.(overlap = list(i.id)), by = .(id)]
test[overlaps,.(id,time,place,overlap),on=.(id=id)]
id time place overlap
1: 1 2016-11-08 10:42:42 7.597294,52.605228 1
2: 2 2016-09-29 17:31:19 9.997284,53.437737 2
3: 3 2016-07-29 05:30:19 10.11145,53.12959 3
4: 4 2016-05-05 09:42:16 7.741152,53.555355 4
5: 5 2016-09-24 17:51:09 9.828951,53.100932 5,7
6: 7 2016-09-30 02:48:41 10.11344,53.14506 5,7
7: 6 2017-02-27 17:28:27 10.06111,53.19088 6
8: 8 2016-07-16 21:45:58 8.590017,53.176780 8
9: 9 2016-12-14 13:38:38 6.439392,52.552093 9
10: 10 2017-01-31 21:13:17 8.388111,53.904306 10
11: 11 2017-03-04 23:19:36 6.200619,52.462037 11
12: 12 2017-07-29 00:36:58 8.666563,52.826970 12
13: 13 2017-11-09 22:29:55 9.921275,53.124005 13
14: 14 2018-01-24 21:16:28 9.77811,53.14458 14,16
15: 16 2018-01-21 14:49:04 10.04740,53.16981 14,16
16: 15 2017-06-09 22:42:55 10.09724,53.16043 15
17: 17 2017-10-09 19:10:42 9.237734,53.212038 17
...
Performance comparison :
Unit: milliseconds
expr min lq mean median uq max neval cld
datatable 9.0788 10.21605 11.75099 10.7112 11.7880 24.6698 100 a
purrr 176.7696 199.85610 209.79916 209.2215 218.3165 253.2139 100 b
Try lapply
function, most of the time it is more efficient and some times define functions with ~
is not a good idea. Try this:
p <- do.call(rbind,data$place)
t <- data$time
y <- data %>%
mutate(overlaps = map2(lapply(place,function(x)which(distGeo(x,p2=p)<=20000)),
map(time,function(x)which(abs(as.numeric(difftime(x,t,units = "days")))<=30)),
intersect))
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With