Suppose I have the following two data frames:
set.seed(123)
df_1 <- data.frame(
name_1 = c("john", "david", "alex", "kevin", "trevor", "xavier", "tom", "michael", "troy", "kelly", "chris", "henry", "taylor", "ryan", "peter"),
lon = rnorm(15, mean = -74.0060, sd = 0.01),
lat = rnorm(15, mean = 40.7128, sd = 0.01)
)
df_2 <- data.frame(
name_2 = c("matthew", "tyler", "sebastian", "julie", "anna", "tim", "david", "nigel", "sarah", "steph", "sylvia", "boris", "theo", "malcolm"),
lon = rnorm(14, mean = -74.0060, sd = 0.01),
lat = rnorm(14, mean = 40.7128, sd = 0.01)
)
My Problem: For each person in df_1, I am trying to find the 5 closest people in df_2 (by haversine distance) and record various distance statistics (e.g. mean, median, max, min, standard deviation).
Attempt
First, I defined the distance function:
library(geosphere)
haversine_distance <- function(lon1, lat1, lon2, lat2) {
distHaversine(c(lon1, lat1), c(lon2, lat2))
}
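As a side note, the wrapper is only strictly needed for single points: distHaversine() itself accepts two-column (lon, lat) matrices and pairs them row by row, so several distances can be computed in one call (a small illustration, not part of the original attempt):
# three distances at once: df_1 row 1 vs df_2 row 1, row 2 vs row 2, row 3 vs row 3
distHaversine(cbind(df_1$lon[1:3], df_1$lat[1:3]), cbind(df_2$lon[1:3], df_2$lat[1:3]))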
Then, I calculated the distance between each person in df_1 and all people in df_2:
# Create a matrix to store results
distances <- matrix(nrow = nrow(df_1), ncol = nrow(df_2))
# calculate the distances
for (i in 1:nrow(df_1)) {
for (j in 1:nrow(df_2)) {
distances[i, j] <- haversine_distance(df_1$lon[i], df_1$lat[i], df_2$lon[j], df_2$lat[j])
}
}
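The same matrix can also be built in a single call with geosphere::distm(); a sketch (not part of the original attempt):
# one call produces the full nrow(df_1) x nrow(df_2) distance matrix
distances_vec <- geosphere::distm(as.matrix(df_1[, c("lon", "lat")]),
as.matrix(df_2[, c("lon", "lat")]),
fun = geosphere::distHaversine)
all.equal(distances, distances_vec)  # should be TRUE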
# Create final
final <- data.frame(
name_1 = rep(df_1$name_1, each = nrow(df_2)),
lon_1 = rep(df_1$lon, each = nrow(df_2)),
lat_1 = rep(df_1$lat, each = nrow(df_2)),
name_2 = rep(df_2$name_2, nrow(df_1)),
lon_2 = rep(df_2$lon, nrow(df_1)),
lat_2 = rep(df_2$lat, nrow(df_1)),
distance = c(t(distances))  # transpose so the flattened vector runs row by row, matching the rep() ordering above
)
Finally, for each person in df_1, I kept the 5 minimum distances and recorded the distance statistics:
# Keep only first 5 rows for each unique value of final$name_1
final <- final[order(final$name_1, final$distance), ]
final <- final[ave(final$distance, final$name_1, FUN = seq_along) <= 5, ]
# Calculate summary statistics for each unique person in final$name_1
final_summary <- aggregate(distance ~ name_1,
data = final,
FUN = function(x) c(min = min(x),
max = max(x),
mean = mean(x),
median = median(x),
sd = sd(x)))
final_summary <- do.call(data.frame, final_summary)
names(final_summary)[-(1)] <- c("min_distance", "max_distance", "mean_distance", "median_distance", "sd_distance")
final_summary$closest_people <- tapply(final$name_2,
final$name_1,
FUN = function(x) paste(sort(x), collapse = ", "))
# break closest_people column into multiple columns
n <- 5
closest_people_split <- strsplit(final_summary$closest_people, ", ")
final_summary[paste0("closest_", seq_len(n))] <- do.call(rbind, closest_people_split)
The final result looks like this:
name_1 min_distance max_distance mean_distance median_distance sd_distance closest_people closest_1 closest_2 closest_3 closest_4 closest_5
1 alex 342.8375 1158.1408 717.0810 650.9167 358.7439 boris, david, matthew, nigel, sarah boris david matthew nigel sarah
2 chris 195.4891 1504.8199 934.6618 895.8301 489.5175 boris, david, malcolm, nigel, steph boris david malcolm nigel steph
3 david 549.4500 830.2758 716.3839 807.6626 143.9571 matthew, sarah, steph, sylvia, tim matthew sarah steph sylvia tim
4 henry 423.1875 975.1733 639.5657 560.1101 223.2389 anna, boris, matthew, sebastian, tim anna boris matthew sebastian tim
5 john 415.8956 1174.1631 849.4313 965.2928 313.2616 boris, julie, matthew, theo, tyler boris julie matthew theo tyler
6 kelly 489.7949 828.5550 657.5908 658.7015 120.6485 david, julie, matthew, sebastian, steph david julie matthew sebastian steph
My Question: Although this code seems to run without errors, I suspect it will become slow once df_1 and df_2 grow. Hence, I am looking for ways to improve the efficiency of this code. Can someone please suggest routines that scale to large data frames?
A data.table approach to this problem might be as follows:
funcs <- function(d,n) {
c(setNames(lapply(c(min,max,mean,median,sd), \(f) f(d)), c("min", "max", "mean", "median", "sd")),
list("names" = paste0(n, collapse=", "))
)
}
library(data.table)
library(dplyr)  # for cross_join()
setDT(cross_join(df_1, df_2))[
,dist:=distHaversine(c(lon.x, lat.x), c(lon.y, lat.y)), .(name_1, name_2)
][order(dist), .SD[1:5, funcs(dist, name_2)], name_1]
Output:
name_1 min max mean median sd names
1: taylor 170.5171 746.6206 470.0857 439.8022 227.39141 david, tim, nigel, sarah, sebastian
2: peter 195.4891 1455.0204 834.2543 830.2758 539.69009 steph, boris, matthew, anna, david
3: tom 243.6729 530.4778 426.2490 447.8639 110.26649 tim, sebastian, julie, nigel, david
4: ryan 342.8375 1243.7473 970.0721 1052.6759 367.08513 tyler, julie, sebastian, sylvia, nigel
5: henry 394.8684 894.5358 647.1996 670.9220 236.69562 anna, matthew, david, steph, boris
6: john 423.1875 1948.9521 1106.4374 1052.8789 674.69139 boris, steph, matthew, anna, david
7: kelly 491.6430 1130.9239 717.7716 658.7015 248.96974 sylvia, tyler, sarah, nigel, julie
8: trevor 520.1834 650.9167 609.4363 631.9494 52.96026 nigel, sarah, julie, tim, sebastian
9: troy 549.4500 1035.0599 782.8799 828.5550 220.72034 tyler, sylvia, sarah, nigel, theo
10: michael 581.9209 1504.5642 1057.1773 1012.5247 378.81712 theo, tyler, sylvia, sarah, nigel
11: david 602.9369 941.3102 752.1558 715.3872 159.37550 nigel, sarah, david, sylvia, anna
12: kevin 638.9259 834.5504 715.5252 644.2898 102.23793 matthew, anna, david, nigel, steph
13: xavier 972.9730 1767.1953 1369.5604 1396.8569 371.03190 julie, sebastian, tim, tyler, david
14: chris 1389.1659 2106.7084 1644.0448 1455.8430 316.31565 julie, tyler, sebastian, tim, theo
15: alex 1765.7750 2428.5429 2013.7843 1828.6055 294.37805 julie, tyler, sebastian, tim, theo
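A possible tweak, offered as an assumption rather than as part of the answer above: because distHaversine() accepts two-column (lon, lat) matrices, the per-pair grouping used to compute dist can be replaced by a single vectorised call, avoiding one function call per row:
library(data.table)
library(dplyr)      # cross_join()
library(geosphere)
dt <- setDT(cross_join(df_1, df_2))
# one vectorised distHaversine() call over (lon, lat) matrices
dt[, dist := distHaversine(cbind(lon.x, lat.x), cbind(lon.y, lat.y))]
dt[order(dist), .SD[1:5, funcs(dist, name_2)], name_1]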
Another approach using dplyr is to use cross_join() with rowwise() to get the distances, followed by slice_head(n = 5, by = name_1) to get the five smallest distances per name_1, and then reframe() or summarize() the usual way:
library(dplyr)  # cross_join(), slice_head(), reframe()
cross_join(df_1, df_2) %>%
rowwise() %>%
mutate(dist=distHaversine(c(lon.x, lat.x), c(lon.y, lat.y))) %>%
ungroup() %>%
arrange(dist) %>%
slice_head(n = 5, by=name_1) %>%
reframe(
min_distance = min(dist),
max_distance = max(dist),
mean_distance=mean(dist),
median_distance=median(dist),
sd_distance = sd(dist),
names = paste0(name_2, collapse=","),
.by=name_1
)
Output:
# A tibble: 15 × 7
name_1 min_distance max_distance mean_distance median_distance sd_distance names
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 taylor 171. 747. 470. 440. 227. david,tim,nigel,sarah,sebastian
2 peter 195. 1455. 834. 830. 540. steph,boris,matthew,anna,david
3 tom 244. 530. 426. 448. 110. tim,sebastian,julie,nigel,david
4 ryan 343. 1244. 970. 1053. 367. tyler,julie,sebastian,sylvia,nigel
5 henry 395. 895. 647. 671. 237. anna,matthew,david,steph,boris
6 john 423. 1949. 1106. 1053. 675. boris,steph,matthew,anna,david
7 kelly 492. 1131. 718. 659. 249. sylvia,tyler,sarah,nigel,julie
8 trevor 520. 651. 609. 632. 53.0 nigel,sarah,julie,tim,sebastian
9 troy 549. 1035. 783. 829. 221. tyler,sylvia,sarah,nigel,theo
10 michael 582. 1505. 1057. 1013. 379. theo,tyler,sylvia,sarah,nigel
11 david 603. 941. 752. 715. 159. nigel,sarah,david,sylvia,anna
12 kevin 639. 835. 716. 644. 102. matthew,anna,david,nigel,steph
13 xavier 973. 1767. 1370. 1397. 371. julie,sebastian,tim,tyler,david
14 chris 1389. 2107. 1644. 1456. 316. julie,tyler,sebastian,tim,theo
15 alex 1766. 2429. 2014. 1829. 294. julie,tyler,sebastian,tim,theo
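The rowwise() step can likewise be dropped (a hedged variant, not the answer's own code): calling distHaversine() once on matrices rather than once per row should scale better:
library(dplyr)
library(geosphere)
cross_join(df_1, df_2) %>%
mutate(dist = distHaversine(cbind(lon.x, lat.x), cbind(lon.y, lat.y))) %>%  # vectorised, no rowwise()
arrange(dist) %>%
slice_head(n = 5, by = name_1) %>%
reframe(
min_distance = min(dist),
max_distance = max(dist),
mean_distance = mean(dist),
median_distance = median(dist),
sd_distance = sd(dist),
names = paste0(name_2, collapse = ","),
.by = name_1
)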
Work in progress
This solution is neither more concise nor faster than the one given by user @langtang, but it draws attention to geosphere::distm() as well as {Rfast}, {psych}, and {collapse}.
(1) Calculate matrix of distances (MD)
MD = geosphere::distm(df_1[-1], df_2[-1], fun = geosphere::distHaversine)
(2) For each row (points from df_1), find the five nearest points (stored in matrix X)
rowMins = \(D, k) matrix(D[order(row(D), D)], ncol = ncol(D), byrow = TRUE)[, k]
X = rowMins(MD, 1:5)
(3) Calculate summary characteristics (S) row-wise
Namely min, max, mean, median, and sd.
Although transposing is considered relatively slow, the trick is to transpose X first so that well-established functions can be used. Unfortunately, summary(t(X)) lacks sd, while collapse::qsu(t(X)) lacks the median. Please comment if there are better options.
S = cbind(collapse::qsu(t(X))[, -1], Median = Rfast::rowMedians(X))
This already creates overhead, since qsu() and rowMedians() both run over each row. An alternative might be
psych::describe(t(X), skew = FALSE)[3:7] |> # S2
`row.names<-`(df_1$name_1)
giving
mean sd median min max
john 1877.49 526.79 2086.03 965.66 2241.15
david 763.62 160.63 831.82 562.65 910.51
alex 1518.81 192.12 1561.73 1225.09 1721.88
kevin 892.37 290.81 922.51 582.00 1236.57
trevor 623.79 226.16 592.48 359.11 857.30
xavier 741.49 130.98 677.02 621.83 932.36
tom 530.70 189.60 597.44 205.96 663.98
michael 1109.90 146.69 1097.67 893.32 1295.68
troy 861.05 188.89 801.55 616.48 1059.15
kelly 802.43 291.93 800.38 432.64 1118.09
chris 1184.69 233.42 1233.05 840.16 1457.28
henry 963.14 257.45 994.43 649.05 1337.23
taylor 594.71 386.41 757.00 118.70 1003.30
ryan 720.59 217.00 772.56 407.34 957.74
peter 1333.79 552.69 1509.01 374.18 1718.11
There might be an option to specify which summary statistics should be calculated. I did not read the full documentation.
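For completeness, a plain base R sketch (an addition, not from the answer): since X has only five columns, apply() over its rows is cheap and yields exactly the five requested statistics:
# row-wise stats on the n x 5 matrix of nearest distances
S_base = t(apply(X, 1, \(x) c(min = min(x), max = max(x), mean = mean(x), median = median(x), sd = sd(x))))
rownames(S_base) = df_1$name_1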
However, all this does not really help, since you also want the names associated with the five closest points; a lot of overhead happens here.
colnames(MD) = df_2$name_2  # distm() returns an unnamed matrix, so label the columns with the df_2 names first
X2 = t(apply(MD, 1, \(i) names(sort(i)[1:5])))
# collapse::dapply(MD, \(i) names(sort(i)[1:5]), MARGIN = 1) does not work
Side note. It appears to me there is no better base R solution than:
f = \(X, k) t(apply(X, 1, \(i) names(sort(i)[1:k])))
Finally giving
> cbind(data.frame(S), data.frame(X2))
Mean SD Min Max Median X1 X2 X3 X4 X5
john 1879.5928 527.3832 966.7430 2243.6565 2088.3664 steph tyler malcolm boris tim
david 764.4733 160.8068 563.2785 911.5332 832.7494 steph tyler sebastian tim malcolm
alex 1520.5110 192.3321 1226.4584 1723.8043 1563.4727 anna matthew theo sylvia david
kevin 893.3704 291.1345 582.6504 1237.9577 923.5420 tyler steph tim sebastian boris
trevor 624.4905 226.4107 359.5081 858.2593 593.1462 david sebastian tim tyler matthew
xavier 742.3244 131.1247 622.5262 933.4064 677.7796 anna matthew sylvia sarah julie
tom 531.2896 189.8111 206.1902 664.7259 598.1114 tim tyler julie david sebastian
michael 1111.1394 146.8538 894.3224 1297.1271 1098.9009 nigel theo malcolm sebastian david
troy 862.0110 189.0964 617.1660 1060.3376 802.4499 sebastian david malcolm nigel theo
kelly 803.3236 292.2534 433.1232 1119.3422 801.2745 sebastian david malcolm tim theo
chris 1186.0200 233.6779 841.1021 1458.9119 1234.4328 anna matthew theo david sarah
henry 964.2193 257.7372 649.7776 1338.7294 995.5426 tyler steph tim boris sebastian
taylor 595.3773 386.8399 118.8290 1004.4187 757.8502 tyler tim sebastian julie sarah
ryan 721.3972 217.2475 407.7949 958.8076 773.4229 david theo anna nigel matthew
peter 1335.2818 553.3045 374.5971 1720.0334 1510.6984 steph tyler malcolm sebastian tim
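A hedged alternative for the name lookup (an addition, not from the answer): indexing df_2$name_2 with order() avoids sort() and does not rely on MD carrying column names:
# for each df_1 row, indices of the five smallest distances, mapped to df_2 names
X2_alt = t(apply(MD, 1, \(i) df_2$name_2[order(i)[1:5]]))
rownames(X2_alt) = df_1$name_1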
Note
Data in reproducible format.
df_1 = structure(list(
name_1 = c(
"john",
"david",
"alex",
"kevin",
"trevor",
"xavier",
"tom",
"michael",
"troy",
"kelly",
"chris",
"henry",
"taylor",
"ryan",
"peter"
),
lon = c(
-74.0116047564655,
-74.0083017748948,
-73.9904129168585,
-74.0052949160858,
-74.0047071226484,
-73.9888493501312,
-74.0013908379401,
-74.0186506123461,
-74.0128685285189,
-74.010456619701,
-73.9937591820256,
-74.0024018617294,
-74.001992285494,
-74.0048931728406,
-74.0115584113475
),
lat = c(
40.730669131368,
40.7177785047823,
40.6931338284337,
40.7198135590156,
40.7080720859227,
40.7021217629401,
40.7106202508534,
40.7025399555169,
40.7055110877071,
40.7065496073215,
40.6959330668926,
40.721177870445,
40.7143337311784,
40.7014186306299,
40.7253381492107
)
),
class = "data.frame",
row.names = c(NA, -15L))
df_2 = structure(list(
name_2 = c(
"matthew",
"tyler",
"sebastian",
"julie",
"anna",
"tim",
"david",
"nigel",
"sarah",
"steph",
"sylvia",
"boris",
"theo",
"malcolm"
),
lon = c(
-73.9950316098685,
-74.0016481850917,
-74.0092593158553,
-73.9945119238155,
-73.9960649614404,
-74.0005160304049,
-74.0036126826489,
-74.0122790607604,
-73.9923934755147,
-74.0120025958715,
-73.9841266700698,
-73.9906738937381,
-74.008357003591,
-74.0162642090031
),
lat = c(
40.705695934363,
40.7153688370916,
40.7103330812154,
40.709324574006,
40.7032838143273,
40.7123497227519,
40.7049509553054,
40.6961205806341,
40.7089977347971,
40.7219899660906,
40.7070465303739,
40.7188796432223,
40.6966211729171,
40.7122443803448
)
),
class = "data.frame",
row.names = c(NA, -14L))