I'm trying to replicate the work done in the really cool nearest neighbor question but do it for each area in my dataframe rather than the group as a whole.
My data ncbaby (don't ask) looks like this:
id printid areaname latitude longitude
1 7912048 233502729 073 36.06241 -80.44229
2 735253 171241999 Area 12-06 35.54452 -78.75388
3 4325564 85564887 Area 12-04 35.49328 -78.73756
4 4222241 85461255 Area 12-06 35.53621 -78.75553
5 11997754 356053648 Area 12-04 35.49328 -78.73756
6 13444458 536073775 Area 12-06 35.53987 -78.74922
I'd like to run the function for each areaname. I tried calling split but but the distance function won't call on a list.
splitfile <- split(ncbaby, ncbaby$precinctname)
c <- gDistance(splitfile, byid=TRUE)
Error in (function (classes, fdef, mtable) :
unable to find an inherited method for function ‘is.projected’ for signature ‘"list"’
The closest I've gotten is:
library(sp)
library(rgeos)
uniq <- unique(unlist(ncbaby$areaname))
for (i in 1:length(uniq)){
data_1 <- subset(ncbaby, areaname == uniq[i])
sp.mydata <- data_1
coordinates(sp.mydata) <- ~longitude+latitude
d <- gDistance(sp.mydata, byid=TRUE)
min.d <- apply(d, 1, function(x) order(x, decreasing=F)[2])
newdata <- cbind(data_1, data_1[min.d,], apply(d, 1, function(x) sort(x, decreasing=F)[2]))
colnames(newdata) <- c(colnames(data_1), 'n.ID',
'n.printid', '.Areaname', 'n.latitude', 'n.longitude',
'distance')
}
The problem here is that it ends up kicking out only the last returned value. ideas? I'd be interested/open to modifying the function as well, as it seems it might be more efficient.
I checked the linked post and revised the idea a bit. I thought using apply()
may not be a good idea for a large data set. So I rather used a data.table-related approach. First, I converted my sample data to a SpatialPointsDataFrame. Then, I split the data by a group variable (i.e.,group). As Eddie suggested, I utilized lapply()
with data.table functions. When you use gDistance()
, you have a two-dimensional vector as an output. I converted that to a data.table object so that following data process potentially goes faster. I reshaped the dt object with melt()
and remove all data points with distance = 0. Finally, I took the first row for each Var1
. Please note that Var1
here represents each row of the sample data, mydf
. The last job was to add the new distance vector to the original data frame. I hope this will help you.
DATA
group user_id latitude longitude
1 B23 85553 -34.44011 172.6954
2 B23 85553 -34.43929 172.6939
3 B23 85553 -34.43929 172.6939
4 B23 85553 -34.43851 172.6924
5 B23 57357 -34.42747 172.6778
6 B23 57357 -34.42747 172.6778
7 B23 57357 -34.42747 172.6778
8 B23 98418 -34.43119 172.7014
9 B23 98418 -34.43225 172.7023
10 B23 98418 -34.43224 172.7023
11 B23 98418 -34.43224 172.7023
12 B24 57357 -34.43647 172.7141
13 B24 57357 -34.43647 172.7141
14 B24 57357 -34.43647 172.7141
15 B24 98418 -34.43904 172.7172
16 B24 98418 -34.43904 172.7172
17 B24 98418 -34.43904 172.7172
18 B24 98418 -34.43925 172.7168
19 B24 98418 -34.43915 172.7169
20 B24 98418 -34.43915 172.7169
21 B24 98418 -34.43915 172.7169
22 B24 98418 -34.43915 172.7169
CODE
library(sp)
library(rgeos)
library(data.table)
# Copy the original
temp <- mydf
#DF to SPDF
coordinates(temp) <- ~longitude+latitude
# Split the data by a group variable
mylist <- split(temp, f = temp$group)
#For each element in mylist, apply gDistance, reshape the output of
# gDistance and create a data.table. Then, reshape the data, remove
# rows with distance = 0. Finally, choose the first row for each
# variable. levels in variable represents rows in mydf.
out <- rbindlist(
lapply(mylist, function(x){
d <- setDT(melt(gDistance(x, byid = TRUE)))
setorder(d, Var1, value)
d <- d[value > 0]
d <- d[, .SD[1], by = Var1]
d
})
)
out <- cbind(mydf, distance = out$value)
# group user_id latitude longitude distance
#1 B23 85553 -34.44011 172.6954 1.743945e-03
#2 B23 85553 -34.43929 172.6939 1.661118e-03
#3 B23 85553 -34.43929 172.6939 1.661118e-03
#4 B23 85553 -34.43851 172.6924 1.661118e-03
#5 B23 57357 -34.42747 172.6778 1.836642e-02
#6 B23 57357 -34.42747 172.6778 1.836642e-02
#7 B23 57357 -34.42747 172.6778 1.836642e-02
#8 B23 98418 -34.43119 172.7014 1.369100e-03
#9 B23 98418 -34.43225 172.7023 1.456022e-05
#10 B23 98418 -34.43224 172.7023 1.456022e-05
#11 B23 98418 -34.43224 172.7023 1.456022e-05
#12 B24 57357 -34.43647 172.7141 3.862696e-03
#13 B24 57357 -34.43647 172.7141 3.862696e-03
#14 B24 57357 -34.43647 172.7141 3.862696e-03
#15 B24 98418 -34.43904 172.7172 3.245705e-04
#16 B24 98418 -34.43904 172.7172 3.245705e-04
#17 B24 98418 -34.43904 172.7172 3.245705e-04
#18 B24 98418 -34.43925 172.7168 1.393162e-04
#19 B24 98418 -34.43915 172.7169 1.393162e-04
#20 B24 98418 -34.43915 172.7169 1.393162e-04
#21 B24 98418 -34.43915 172.7169 1.393162e-04
#22 B24 98418 -34.43915 172.7169 1.393162e-04
DATA in dput()
mydf <- structure(list(group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("B23",
"B24"), class = "factor"), user_id = c(85553L, 85553L, 85553L,
85553L, 57357L, 57357L, 57357L, 98418L, 98418L, 98418L, 98418L,
57357L, 57357L, 57357L, 98418L, 98418L, 98418L, 98418L, 98418L,
98418L, 98418L, 98418L), latitude = c(-34.440114, -34.43929,
-34.43929, -34.438507, -34.427467, -34.427467, -34.427467, -34.431187,
-34.432254, -34.43224, -34.43224, -34.436472, -34.436472, -34.436472,
-34.439038, -34.439038, -34.439038, -34.439246, -34.439149, -34.439149,
-34.439149, -34.439149), longitude = c(172.695443, 172.693906,
172.693906, 172.692441, 172.677763, 172.677763, 172.677763, 172.701413,
172.702284, 172.702288, 172.702288, 172.71411, 172.71411, 172.71411,
172.717203, 172.717203, 172.717203, 172.716798, 172.716898, 172.716898,
172.716898, 172.716898)), .Names = c("group", "user_id", "latitude",
"longitude"), row.names = c(NA, -22L), class = "data.frame")
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With