Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Get the nearest distance with two geodataframe in pandas

Here is my first geodatframe :

!pip install geopandas
import pandas as pd
import geopandas

city1 = [{'City':"Buenos Aires","Country":"Argentina","Latitude":-34.58,"Longitude":-58.66},
           {'City':"Brasilia","Country":"Brazil","Latitude":-15.78 ,"Longitude":-70.66},
         {'City':"Santiago","Country":"Chile ","Latitude":-33.45 ,"Longitude":-70.66 }]
city2 =  [{'City':"Bogota","Country":"Colombia ","Latitude":4.60 ,"Longitude":-74.08},
           {'City':"Caracas","Country":"Venezuela","Latitude":10.48  ,"Longitude":-66.86}]
city1df = pd.DataFrame(city1)
city2df = pd.DataFrame(city2)
gcity1df = geopandas.GeoDataFrame(
    city1df, geometry=geopandas.points_from_xy(city1df.Longitude, city1df.Latitude))
gcity2df = geopandas.GeoDataFrame(
    city2df, geometry=geopandas.points_from_xy(city2df.Longitude, city2df.Latitude))

City1

           City    Country  Latitude  Longitude                     geometry
0  Buenos Aires  Argentina    -34.58     -58.66  POINT (-58.66000 -34.58000)
1      Brasilia     Brazil    -15.78     -47.91  POINT (-47.91000 -15.78000)
2      Santiago      Chile    -33.45     -70.66  POINT (-70.66000 -33.45000)

and my second geodataframe : City2 :

         City    Country  Latitude  Longitude                     geometry
1        Bogota   Colombia      4.60     -74.08    POINT (-74.08000 4.60000)
2       Caracas  Venezuela     10.48     -66.86   POINT (-66.86000 10.48000)

i would like third dataframe with the nearest city from city1 to city2 with the distance like :

           City    Country  Latitude  Longitude                     geometry    Nearest    Distance
0  Buenos Aires  Argentina    -34.58     -58.66  POINT (-58.66000 -34.58000)    Bogota    111 Km

Here is my actual solution using geodjango and dict (but it's way too long) :

from django.contrib.gis.geos import GEOSGeometry
result = []
dict_result = {}
for city01 in city1 :
  dist = 99999999
  pnt = GEOSGeometry('SRID=4326;POINT( '+str(city01["Latitude"])+' '+str(city01['Longitude'])+')')
  for city02 in city2:
    pnt2 = GEOSGeometry('SRID=4326;POINT('+str(city02['Latitude'])+' '+str(city02['Longitude'])+')')
    distance_test = pnt.distance(pnt2) * 100
    if distance_test < dist :
      dist = distance_test
  result.append(dist)
  dict_result[city01['City']] = city02['City']

Here are my tryings :

from shapely.ops import nearest_points
# unary union of the gpd2 geomtries 
pts3 = gcity2df.geometry.unary_union
def Euclidean_Dist(df1, df2, cols=['x_coord','y_coord']):
    return np.linalg.norm(df1[cols].values - df2[cols].values,
                   axis=1)
def near(point, pts=pts3):
     # find the nearest point and return the corresponding Place value
     nearest = gcity2df.geometry == nearest_points(point, pts)[1]

     return gcity2df[nearest].City
gcity1df['Nearest'] = gcity1df.apply(lambda row: near(row.geometry), axis=1)
gcity1df

here :

    City    Country     Latitude    Longitude   geometry    Nearest
0   Buenos Aires    Argentina   -34.58  -58.66  POINT (-58.66000 -34.58000)     Bogota
1   Brasilia    Brazil  -15.78  -70.66  POINT (-70.66000 -15.78000)     Bogota
2   Santiago    Chile   -33.45  -70.66  POINT (-70.66000 -33.45000)     Bogota

Regards

like image 810
Bussiere Avatar asked Jan 26 '20 17:01

Bussiere


1 Answers

Firstly, I merge two data frames by cross join. And then, I found distance between two points using map in python. I use map, because most of the time it is much faster than apply, itertuples, iterrows etc. (Reference: https://stackoverflow.com/a/52674448/8205554)

Lastly, I group by data frame and fetch minimum values of distance.

Here are libraries,

import pandas as pd
import geopandas
import geopy.distance
from math import radians, cos, sin, asin, sqrt

Here are used functions,

def dist1(p1, p2):
    lon1, lat1, lon2, lat2 = map(radians, [p1.x, p1.y, p2.x, p2.y])

    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 

    return c * 6373

def dist2(p1, p2):
    lon1, lat1, lon2, lat2 = map(radians, [p1[0], p1[1], p2[0], p2[1]])

    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 

    return c * 6373

def dist3(p1, p2):
    x = p1.y, p1.x
    y = p2.y, p2.x

    return geopy.distance.geodesic(x, y).km

def dist4(p1, p2):
    x = p1[1], p1[0]
    y = p2[1], p2[0]

    return geopy.distance.geodesic(x, y).km

And data,

city1 = [
  {
    'City': 'Buenos Aires',
    'Country': 'Argentina',
    'Latitude': -34.58,
    'Longitude': -58.66
  },
  {
    'City': 'Brasilia',
    'Country': 'Brazil',
    'Latitude': -15.78,
    'Longitude': -70.66
  },
  {
    'City': 'Santiago',
    'Country': 'Chile ',
    'Latitude': -33.45,
    'Longitude': -70.66
  }
]

city2 = [
  {
    'City': 'Bogota',
    'Country': 'Colombia ',
    'Latitude': 4.6,
    'Longitude': -74.08
  },
  {
    'City': 'Caracas',
    'Country': 'Venezuela',
    'Latitude': 10.48,
    'Longitude': -66.86
  }
]


city1df = pd.DataFrame(city1)
city2df = pd.DataFrame(city2)

Cross join with geopandas data frames,

gcity1df = geopandas.GeoDataFrame(
    city1df, 
    geometry=geopandas.points_from_xy(city1df.Longitude, city1df.Latitude)
)
gcity2df = geopandas.GeoDataFrame(
    city2df, 
    geometry=geopandas.points_from_xy(city2df.Longitude, city2df.Latitude)
)

# cross join geopandas
gcity1df['key'] = 1
gcity2df['key'] = 1
merged = gcity1df.merge(gcity2df, on='key')

math functions and geopandas,

# 6.64 ms ± 588 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%timeit

# find distance
merged['dist'] = list(map(dist1, merged['geometry_x'], merged['geometry_y']))

mapping = {
    'City_x': 'City',
    'Country_x': 'Country',
    'Latitude_x': 'Latitude',
    'Longitude_x': 'Longitude',
    'geometry_x': 'geometry',
    'City_y': 'Nearest',
    'dist': 'Distance'
}

nearest = merged.loc[merged.groupby(['City_x', 'Country_x'])['dist'].idxmin()]
nearest.rename(columns=mapping)[list(mapping.values())]

           City    Country  Latitude  Longitude                     geometry  \
2      Brasilia     Brazil    -15.78     -70.66  POINT (-70.66000 -15.78000)   
0  Buenos Aires  Argentina    -34.58     -58.66  POINT (-58.66000 -34.58000)   
4      Santiago     Chile     -33.45     -70.66  POINT (-70.66000 -33.45000)   

  Nearest     Distance  
2  Bogota  2297.922808  
0  Bogota  4648.004515  
4  Bogota  4247.586882 

geopy and geopandas,

# 9.99 ms ± 764 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%timeit

# find distance
merged['dist'] = list(map(dist3, merged['geometry_x'], merged['geometry_y']))

mapping = {
    'City_x': 'City',
    'Country_x': 'Country',
    'Latitude_x': 'Latitude',
    'Longitude_x': 'Longitude',
    'geometry_x': 'geometry',
    'City_y': 'Nearest',
    'dist': 'Distance'
}

nearest = merged.loc[merged.groupby(['City_x', 'Country_x'])['dist'].idxmin()]
nearest.rename(columns=mapping)[list(mapping.values())]

           City    Country  Latitude  Longitude                     geometry  \
2      Brasilia     Brazil    -15.78     -70.66  POINT (-70.66000 -15.78000)   
0  Buenos Aires  Argentina    -34.58     -58.66  POINT (-58.66000 -34.58000)   
4      Santiago     Chile     -33.45     -70.66  POINT (-70.66000 -33.45000)   

  Nearest     Distance  
2  Bogota  2285.239605  
0  Bogota  4628.641817  
4  Bogota  4226.710978 

If you want to use pandas instead of geopandas,

# cross join pandas
city1df['key'] = 1
city2df['key'] = 1
merged = city1df.merge(city2df, on='key')

With math functions,

# 8.65 ms ± 2.21 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%timeit

# find distance
merged['dist'] = list(
    map(
        dist2, 
        merged[['Longitude_x', 'Latitude_x']].values, 
        merged[['Longitude_y', 'Latitude_y']].values
    )
)

mapping = {
    'City_x': 'City',
    'Country_x': 'Country',
    'Latitude_x': 'Latitude',
    'Longitude_x': 'Longitude',
    'City_y': 'Nearest',
    'dist': 'Distance'
}

nearest = merged.loc[merged.groupby(['City_x', 'Country_x'])['dist'].idxmin()]
nearest.rename(columns=mapping)[list(mapping.values())]

           City    Country  Latitude  Longitude Nearest     Distance
2      Brasilia     Brazil    -15.78     -70.66  Bogota  2297.922808
0  Buenos Aires  Argentina    -34.58     -58.66  Bogota  4648.004515
4      Santiago     Chile     -33.45     -70.66  Bogota  4247.586882

With geopy,

# 9.8 ms ± 807 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%timeit

# find distance
merged['dist'] = list(
    map(
        dist4, 
        merged[['Longitude_x', 'Latitude_x']].values, 
        merged[['Longitude_y', 'Latitude_y']].values
    )
)

mapping = {
    'City_x': 'City',
    'Country_x': 'Country',
    'Latitude_x': 'Latitude',
    'Longitude_x': 'Longitude',
    'City_y': 'Nearest',
    'dist': 'Distance'
}

nearest = merged.loc[merged.groupby(['City_x', 'Country_x'])['dist'].idxmin()]
nearest.rename(columns=mapping)[list(mapping.values())]

           City    Country  Latitude  Longitude Nearest     Distance
2      Brasilia     Brazil    -15.78     -70.66  Bogota  2285.239605
0  Buenos Aires  Argentina    -34.58     -58.66  Bogota  4628.641817
4      Santiago     Chile     -33.45     -70.66  Bogota  4226.710978
like image 67
E. Zeytinci Avatar answered Oct 19 '22 13:10

E. Zeytinci