I am new to Python. I am using DBSCAN code for clustering purposes, with some changes. The code runs fine now, but it is very slow. I found out that I have to remove the 'for' loops from my code. Here is a part of it:
from math import radians, sin, cos, atan2, sqrt

class Point:
    def __init__(self, x=0, y=0, visited=False, isnoise=False):
        self.x = x
        self.y = y
        self.visited = visited
        self.isnoise = isnoise

    def show(self):
        return self.x, self.y

    def dist(self, p1, p2):
        # Calculate the great-circle distance in km between two points
        # on the earth (specified in decimal degrees)
        # convert decimal degrees to radians
        dlat = radians(p2.x - p1.x)
        dlon = radians(p2.y - p1.y)
        a = sin(dlat/2) * sin(dlat/2) + cos(radians(p1.x)) * cos(radians(p2.x)) * sin(dlon/2) * sin(dlon/2)
        c = 2 * atan2(sqrt(a), sqrt(1 - a))
        d = 6371 * c
        return d

    def distanceQuery(self, neighbor_pts):
        # largest pairwise distance among the neighbour points
        dista = []
        for i in range(len(neighbor_pts)):
            for j in range(i + 1, len(neighbor_pts)):
                z = self.dist(neighbor_pts[i], neighbor_pts[j])
                dista.append(z)
        return max(dista)
The distanceQuery function uses a double for loop. Is there any way I can remove it? Can I vectorize this double loop? As this is clustering code, there are some steps that require appending. I have read that numpy arrays work differently from Python lists when it comes to appending: appending to a numpy array is inefficient.
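What I read about appending boils down to something like the sketch below (illustrative only, the function names are made up):

import numpy as np

# np.append copies the whole array on every call, so growing an array
# element by element costs O(n^2) in total
def grow_with_np_append(values):
    out = np.array([])
    for v in values:
        out = np.append(out, v)
    return out

# growing a plain Python list is amortised O(1) per append; convert to an
# array once at the end
def grow_with_list(values):
    out = []
    for v in values:
        out.append(v)
    return np.asarray(out)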
So distanceQuery itself can be vectorized. But here is another part of the code where appending happens, and just after it I check for a certain condition:
def expandCluster(self, P, neighbor_points):
    self.cluster[self.cluster_inx].append(P)
    iterator = iter(neighbor_points)
    while True:
        try:
            npoint_tmp = iterator.next()
        except StopIteration:
            # StopIteration exception is raised after last element
            break
        if (not npoint_tmp.visited):
            # for each point P' in NeighborPts
            npoint_tmp.visited = True
            NeighborPts_ = self.regionQuery(npoint_tmp)
            if (len(NeighborPts_) >= self.MinPts):
                for j in range(len(NeighborPts_)):
                    neighbor_points.append(NeighborPts_[j])
                if self.distanceQuery(neighbor_points) > 0.10:
                    break
Now if I vectorize neighbor_points as well, won't I have to tackle the problem of appending? Each point is appended to neighbor_points, and then a distanceQuery is made. This process is itself part of an iteration, so there are effectively two loops here as well. I just want to make sure that appending to a numpy array won't be inefficient.
import numpy as np

def dist(p1, p2):
    # Initially, p1.shape == (n, 2) and p2.shape == (m, 2)
    # Now, p1.shape == (1, n, 2) and p2.shape == (m, 1, 2)
    p1 = p1[np.newaxis, :, :]
    p2 = p2[:, np.newaxis, :]
    # get all the vectory things
    from numpy import sin, cos, radians, sqrt, arctan2 as atan2
    # do the same math as before, but use `p[..., 0]` instead of `p.x` etc
    dlat = radians(p2[..., 0] - p1[..., 0])
    dlon = radians(p2[..., 1] - p1[..., 1])
    a = sin(dlat/2) * sin(dlat/2) + cos(p1[..., 0])*cos(p2[..., 0]) * sin(dlon/2) * sin(dlon/2)
    c = 2 * atan2(sqrt(a), sqrt(1-a))
    d = 6371 * c
    return d

def distanceQuery(neighbor_pts):
    return np.max(dist(neighbor_pts, neighbor_pts))
e.g.:
>>> points = np.array([[0, 0], [45, 0], [45, 45], [90, 0]], dtype=float)
>>> dist(points, points)
array([[ 0. , 5003.77169901, 6272.52596983, 10007.54339801],
[ 5003.77169901, 0. , 2579.12525679, 5003.77169901],
[ 6272.52596983, 2579.12525679, 0. , 4347.69702221],
[ 10007.54339801, 5003.77169901, 4347.69702221, 0. ]])
>>> np.max(_)
10007.543398010286
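If the rest of your class keeps passing around Point objects, one possible bridge (just a sketch; the list comprehension and dtype are my assumption about how the points are stored) is to convert to an (n, 2) array inside the method and reuse the vectorized dist:

import numpy as np

def distanceQuery(self, neighbor_pts):
    # convert the list of Point objects to an (n, 2) array of (lat, lon) rows,
    # then reuse the vectorized dist() defined above
    coords = np.array([[p.x, p.y] for p in neighbor_pts], dtype=float)
    return dist(coords, coords).max()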
import itertools

def dist_slow(p1, p2):
    """your function, adjusted to take an array instead of a `Point`"""
    from math import radians, sin, cos, sqrt, atan2
    # same haversine math as the original, for a single pair of points
    dlat = radians(p2[0] - p1[0])
    dlon = radians(p2[1] - p1[1])
    a = sin(dlat/2) * sin(dlat/2) + cos(radians(p1[0]))*cos(radians(p2[0])) * sin(dlon/2) * sin(dlon/2)
    c = 2 * atan2(sqrt(a), sqrt(1-a))
    d = 6371 * c
    return d

def query_iter(p):
    return max(dist_slow(p1, p2) for p1, p2 in itertools.combinations(p, 2))

def query_orig(p):
    dista = []
    for i in range(len(p)):
        for j in range(i + 1, len(p)):
            z = dist_slow(p[i], p[j])
            dista.append(z)
    return max(dista)

def query_mine(p):
    return dist(p, p).max()
Then:
>>> points = np.random.rand(1000, 2)
>>> timeit query_orig(points)
1 loops, best of 3: 7.94 s per loop
>>> timeit query_iter(points)
1 loops, best of 3: 7.35 s per loop
>>> timeit query_mine(points)
10 loops, best of 3: 150 ms per loop
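Note that the speed-up comes at a memory cost: dist(p, p) materialises the full n-by-n distance matrix, so memory grows quadratically with the number of points. For the 1000-point example above:

>>> d = dist(points, points)
>>> d.shape
(1000, 1000)
>>> d.nbytes    # 8 bytes per float64 entry
8000000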
You can do everything in "vector" form with numpy ufuncs:
from numpy import radians, sin, cos, sqrt, arctan2
from numpy import random

def max_dist(p1x, p1y, p2x, p2y):
    # give them "orthogonal" shape
    p1x = p1x.reshape(p1x.size, 1)
    p1y = p1y.reshape(p1y.size, 1)
    p2x = p2x.reshape(1, p2x.size)
    p2y = p2y.reshape(1, p2y.size)
    # compute the distance for all possible pairs
    dlat = radians(p2x - p1x)
    dlon = radians(p2y - p1y)
    a = sin(dlat/2) * sin(dlat/2) + cos(radians(p1x))*cos(radians(p2x)) * sin(dlon/2) * sin(dlon/2)
    c = 2 * arctan2(sqrt(a), sqrt(1-a))
    d = 6371 * c
    return d.max()

if __name__ == '__main__':
    # generate random samples
    N = 1000
    p1x, p1y, p2x, p2y = random.rand(4, N)
    print 'max_dist=', max_dist(p1x, p1y, p2x, p2y)
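For what it's worth, this is the same broadcasting trick as in the first approach: reshape(n, 1) and reshape(1, n) play the same role as the np.newaxis indexing used in dist above. A tiny check:

import numpy as np

x = np.arange(5.0)
col_a = x.reshape(x.size, 1)   # explicit reshape, as in max_dist
col_b = x[:, np.newaxis]       # equivalent newaxis indexing, as in dist
assert col_a.shape == col_b.shape == (5, 1)
assert (col_a == col_b).all()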