I have a dataframe with 30,000 records in the following format:
ID | Name | Latitude | Longitude | Country |
1 | Hull | 53.744 | -0.3456 | GB |
I would like to select one record to be the start location and one record to be the destination and return a path (list) for the shortest path.
I am using Geopy to find the distance between points in km
import geopy.distance
coords_1 = (52.2296756, 21.0122287)
coords_2 = (52.406374, 16.9251681)
print (geopy.distance.vincenty(coords_1, coords_2).km)
I have read how to do A* in python from the following tutorial: https://www.redblobgames.com/pathfinding/a-star/implementation.html
However they create a grid system to navigate through.
This is a visual representation of the records in the dataframe:
This is the code I have so far however it fails to find a path:
def calcH(start, end):
coords_1 = (df['latitude'][start], df['longitude'][start])
coords_2 = (df['latitude'][end], df['longitude'][end])
distance = (geopy.distance.vincenty(coords_1, coords_2)).km
return distance
^Calculates the distance between points
def getneighbors(startlocation):
neighborDF = pd.DataFrame(columns=['ID', 'Distance'])
coords_1 = (df['latitude'][startlocation], df['longitude'][startlocation])
for index, row in df.iterrows():
coords_2 = (df['latitude'][index], df['longitude'][index])
distance = round((geopy.distance.vincenty(coords_1, coords_2)).km,2)
neighborDF.loc[len(neighborDF)] = [index, distance]
neighborDF = neighborDF.sort_values(by=['Distance'])
neighborDF = neighborDF.reset_index(drop=True)
return neighborDF[1:5]
^Returns the 4 closest locations (ignoring itself)
openlist = pd.DataFrame(columns=['ID', 'F', 'G', 'H', 'parentID'])
closedlist = pd.DataFrame(columns=['ID', 'F', 'G', 'H', 'parentID'])
startIndex = 25479 # Hessle
endIndex = 8262 # Leeds
h = calcH(startIndex, endIndex)
openlist.loc[len(openlist)] = [startIndex,h, 0, h, startIndex]
while True:
#sort the open list by F score
openlist = openlist.sort_values(by=['F'])
openlist = openlist.reset_index(drop=True)
currentLocation = openlist.loc[0]
closedlist.loc[len(closedlist)] = currentLocation
openlist = openlist[openlist.ID != currentLocation.ID]
if currentLocation.ID == endIndex:
print("Complete")
break
adjacentLocations = getneighbors(currentLocation.ID)
if(len(adjacentLocations) < 1):
print("No Neighbors: " + str(currentLocation.ID))
else:
print(str(len(adjacentLocations)))
for index, row in adjacentLocations.iterrows():
if adjacentLocations['ID'][index] in closedlist.values:
continue
if (adjacentLocations['ID'][index] in openlist.values) == False:
g = currentLocation.G + calcH(currentLocation.ID, adjacentLocations['ID'][index])
h = calcH(adjacentLocations['ID'][index], endIndex)
f = g + h
openlist.loc[len(openlist)] = [adjacentLocations['ID'][index], f, g, h, currentLocation.ID]
else:
adjacentLocationInDF = openlist.loc[openlist['ID'] == adjacentLocations['ID'][index]] #Get location from openlist
g = currentLocation.G + calcH(currentLocation.ID, adjacentLocations['ID'][index])
f = g + adjacentLocationInDF.H
if float(f) < float(adjacentLocationInDF.F):
openlist = openlist[openlist.ID != currentLocation.ID]
openlist.loc[len(openlist)] = [adjacentLocations['ID'][index], f, g, adjacentLocationInDF.H, currentLocation.ID]
if (len(openlist)< 1):
print("No Path")
break
Finds the path from the closed list:
# return the path
pathdf = pd.DataFrame(columns=['name', 'latitude', 'longitude', 'country'])
def getParent(index):
parentDF = closedlist.loc[closedlist['ID'] == index]
pathdf.loc[len(pathdf)] = [df['name'][parentDF.ID.values[0]],df['latitude'][parentDF.ID.values[0]],df['longitude'][parentDF.ID.values[0]],df['country'][parentDF.ID.values[0]]]
if index != startIndex:
getParent(parentDF.parentID.values[0])
getParent(closedlist['ID'][len(closedlist)-1])
Currently this implementation of A* isn't finding a complete path . Any suggestions?
Edit: I have tried increasing the number of considered neighbors from 4 to 10 and I got a path but not a optimum path:
We are trying to get from Hessle to Leeds.
^ available nodes
Raw Data: Link
I'm still not sure what's the problem with your appraoch, although there certainly are a few, as already mentioned in comments.
x in dataframe.values
will check whether x
is any of the values in the numpy-array returned by values
, not necessarily the ID fieldAnyhow, I found this to be an interesting problem and gave it a try. Turns out, though, that using dataframes as some kind of pseudo-heap was indeed very slow, and also I found the dataframe-indexing to be extremely confusing (and possibly error-prone?), so I changed the code to use namedtuple
for data and a proper heapq
heap for the openlist
and a dict
mapping nodes to their parents for the closedlist
. Also, there are fewer checks than in your code (e.g. whether a node is already in the openlist) and those do not matter really.
import csv, geopy.distance, collections, heapq
Location = collections.namedtuple("Location", "ID name latitude longitude country".split())
data = {}
with open("stations.csv") as f:
r = csv.DictReader(f)
for d in r:
i, n, x, y, c = int(d["id"]), d["name"], d["latitude"], d["longitude"], d["country"]
if c == "GB":
data[i] = Location(i,n,x,y,c)
def calcH(start, end):
coords_1 = (data[start].latitude, data[start].longitude)
coords_2 = (data[end].latitude, data[end].longitude)
distance = (geopy.distance.vincenty(coords_1, coords_2)).km
return distance
def getneighbors(startlocation, n=10):
return sorted(data.values(), key=lambda x: calcH(startlocation, x.ID))[1:n+1]
def getParent(closedlist, index):
path = []
while index is not None:
path.append(index)
index = closedlist.get(index, None)
return [data[i] for i in path[::-1]]
startIndex = 25479 # Hessle
endIndex = 8262 # Leeds
Node = collections.namedtuple("Node", "ID F G H parentID".split())
h = calcH(startIndex, endIndex)
openlist = [(h, Node(startIndex, h, 0, h, None))] # heap
closedlist = {} # map visited nodes to parent
while len(openlist) >= 1:
_, currentLocation = heapq.heappop(openlist)
print(currentLocation)
if currentLocation.ID in closedlist:
continue
closedlist[currentLocation.ID] = currentLocation.parentID
if currentLocation.ID == endIndex:
print("Complete")
for p in getParent(closedlist, currentLocation.ID):
print(p)
break
for other in getneighbors(currentLocation.ID):
g = currentLocation.G + calcH(currentLocation.ID, other.ID)
h = calcH(other.ID, endIndex)
f = g + h
heapq.heappush(openlist, (f, Node(other.ID, f, g, h, currentLocation.ID)))
This gives me this path from Hessle to Leeds, which seems more reasonable:
Location(ID=25479, name='Hessle', latitude='53.717567', longitude='-0.442169', country='GB')
Location(ID=8166, name='Brough', latitude='53.726452', longitude='-0.578255', country='GB')
Location(ID=25208, name='Eastrington', latitude='53.75481', longitude='-0.786612', country='GB')
Location(ID=25525, name='Howden', latitude='53.764526', longitude='-0.86068', country='GB')
Location(ID=7780, name='Selby', latitude='53.78336', longitude='-1.06355', country='GB')
Location(ID=26157, name='Sherburn-In-Elmet', latitude='53.797142', longitude='-1.23176', country='GB')
Location(ID=25308, name='Garforth Station', latitude='53.796211', longitude='-1.382083', country='GB')
Location(ID=8262, name='Leeds', latitude='53.795158', longitude='-1.549089', country='GB')
Even if you can't use this because you have to use Pandas (?), maybe this helps you finally spot your actual error.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With