I want to find (efficiently) all pairs of points that are closer than some distance max_d
. My current method, using cdist
, is:
import numpy as np
from scipy.spatial.distance import cdist
def close_pairs(X,max_d):
d = cdist(X,X)
I,J = (d<max_d).nonzero()
IJ = np.sort(np.vstack((I,J)), axis=0)
# remove diagonal element
IJ = IJ[:,np.diff(IJ,axis=0).ravel()<>0]
# remove duplicate
dt = np.dtype([('i',int),('j',int)])
pairs = np.unique(IJ.T.view(dtype=dt)).view(int).reshape(-1,2)
return pairs
def test():
X = np.random.rand(100,2)*20
p = close_pairs(X,2)
from matplotlib import pyplot as plt
plt.clf()
plt.plot(X[:,0],X[:,1],'.r')
plt.plot(X[p,0].T,X[p,1].T,'-b')
But I think this is overkill (and not very readable), because most of the work is done only to remove distance-to-self and duplicates.
My main question is: is there a better way to do it?
(Note: the type of outputs (array
, set
, ...) is not important at this point)
My current thinking is on using pdist
which returns a condensed distance array which contains only the right pairs. However, once I found the suitable coordinates k
's from the condensed distance array, how do I compute which i,j
pairs it is equivalent to?
So the alternative question is: is there an easy way to get the list of coordinate pairs relative to the entries of pdist
outputs:
f(k)->i,j
cdist(X,X)[i,j] = pdist(X)[k]
In my experience, there are two fastest ways to find neighbor lists in 3D. One is to use a most naive double-for-loop code written in C++ or Cython (in my case, both). It runs in N^2, but is very fast for small systems. The other way is to use a linear time algorithm. Scipy ckdtree is a good choice, but has limitations. Neighbor list finders from molecular dynamics software are most powerful, but are very hard to wrap, and likely have slow initialization time.
Below I compare four methods:
Scipy.spatial.ckdtree
scipy.spatial.distance.pdist
Test setup: n
points scattered in a rectangular box at volume density 0.2. System size ranging from 10 to a 1000000 (a million) particles. Contact radius is taken from 0.5, 1, 2, 4, 7, 10
. Note that because density is 0.2, at contact radius 0.5 we'll have on average about 0.1 contacts per particle, at 1 = 0.8, at 2 = 6.4, and at 10 - about 800! Contact finding was repeated several times for small systems, done once for systems >30k particles. If time per call exceeded 5 seconds, the run was aborted.
Setup: dual xeon 2687Wv3, 128GB RAM, Ubuntu 14.04, python 2.7.11, scipy 0.16.0, numpy 1.10.1. None of the code was using parallel optimizations (except for OpenMM, though parallel part went so quick that it was not even noticeable on a CPU graph, most of the time was spend piping data to-from OpenMM).
Results: Note that plots below are logscale, and spread over 6 orders of magnitude. Even small visual difference may be actually 10-fold.
For systems less than 1000 particles, Cython
code was always faster. However, after 1000 particles results are dependent on the contact radius. pdist
implementation was always slower than cython, and takes much more memory, because it explicitly creates a distance matrix, which is slow because of sqrt.
ckdtree
is a good choice for all system sizes. ckdtree
performs just 3-10 times worseInstalling OpenMM is very tricky; you can read more in http://bitbucket.org/mirnylab/openmm-polymer file "contactmaps.py" or in the readme. However, the results below show that it is only advantageous for 5-50 contacts per particle, for N>100k particles.
Cython code below:
import numpy as np
cimport numpy as np
cimport cython
cdef extern from "<vector>" namespace "std":
cdef cppclass vector[T]:
cppclass iterator:
T operator*()
iterator operator++()
bint operator==(iterator)
bint operator!=(iterator)
vector()
void push_back(T&)
T& operator[](int)
T& at(int)
iterator begin()
iterator end()
np.import_array() # initialize C API to call PyArray_SimpleNewFromData
cdef public api tonumpyarray(int* data, long long size) with gil:
if not (data and size >= 0): raise ValueError
cdef np.npy_intp dims = size
#NOTE: it doesn't take ownership of `data`. You must free `data` yourself
return np.PyArray_SimpleNewFromData(1, &dims, np.NPY_INT, <void*>data)
@cython.boundscheck(False)
@cython.wraparound(False)
def contactsCython(inArray, cutoff):
inArray = np.asarray(inArray, dtype = np.float64, order = "C")
cdef int N = len(inArray)
cdef np.ndarray[np.double_t, ndim = 2] data = inArray
cdef int j,i
cdef double curdist
cdef double cutoff2 = cutoff * cutoff # IMPORTANT to avoid slow sqrt calculation
cdef vector[int] contacts1
cdef vector[int] contacts2
for i in range(N):
for j in range(i+1, N):
curdist = (data[i,0] - data[j,0]) **2 +(data[i,1] - data[j,1]) **2 + (data[i,2] - data[j,2]) **2
if curdist < cutoff2:
contacts1.push_back(i)
contacts2.push_back(j)
cdef int M = len(contacts1)
cdef np.ndarray[np.int32_t, ndim = 2] contacts = np.zeros((M,2), dtype = np.int32)
for i in range(M):
contacts[i,0] = contacts1[i]
contacts[i,1] = contacts2[i]
return contacts
Compilation (or makefile) for Cython code:
cython --cplus fastContacts.pyx
g++ -g -march=native -Ofast -fpic -c fastContacts.cpp -o fastContacts.o `python-config --includes`
g++ -g -march=native -Ofast -shared -o fastContacts.so fastContacts.o `python-config --libs`
Testing code:
from __future__ import print_function, division
import signal
import time
from contextlib import contextmanager
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from scipy.spatial import ckdtree
from scipy.spatial.distance import pdist
from contactmaps import giveContactsOpenMM # remove this unless you have OpenMM and openmm-polymer libraries installed
from fastContacts import contactsCython
class TimeoutException(Exception): pass
@contextmanager
def time_limit(seconds):
def signal_handler(signum, frame):
raise TimeoutException("Timed out!")
signal.signal(signal.SIGALRM, signal_handler)
signal.alarm(seconds)
try:
yield
finally:
signal.alarm(0)
matplotlib.rcParams.update({'font.size': 8})
def close_pairs_ckdtree(X, max_d):
tree = ckdtree.cKDTree(X)
pairs = tree.query_pairs(max_d)
return np.array(list(pairs))
def condensed_to_pair_indices(n, k):
x = n - (4. * n ** 2 - 4 * n - 8 * k + 1) ** .5 / 2 - .5
i = x.astype(int)
j = k + i * (i + 3 - 2 * n) / 2 + 1
return np.array([i, j]).T
def close_pairs_pdist(X, max_d):
d = pdist(X)
k = (d < max_d).nonzero()[0]
return condensed_to_pair_indices(X.shape[0], k)
a = np.random.random((100, 3)) * 3 # test set
methods = {"cython": contactsCython, "ckdtree": close_pairs_ckdtree, "OpenMM": giveContactsOpenMM,
"pdist": close_pairs_pdist}
# checking that each method gives the same value
allUniqueInds = []
for ind, method in methods.items():
contacts = method(a, 1)
uniqueInds = contacts[:, 0] + 100 * contacts[:, 1] # unique index of each contacts
allUniqueInds.append(np.sort(uniqueInds)) # adding sorted unique conatcts
for j in allUniqueInds:
assert np.allclose(j, allUniqueInds[0])
# now actually doing testing
repeats = [30,30,30, 30, 30, 20, 20, 10, 5, 3, 2 , 1, 1, 1]
sizes = [10,30,100, 200, 300, 500, 1000, 2000, 3000, 10000, 30000, 100000, 300000, 1000000]
systems = [[np.random.random((n, 3)) * ((n / 0.2) ** 0.333333) for k in range(repeat)] for n, repeat in
zip(sizes, repeats)]
for j, radius in enumerate([0.5, 1, 2, 4, 7, 10]):
plt.subplot(2, 3, j + 1)
plt.title("Radius = {0}; {1:.2f} cont per particle".format(radius, 0.2 * (4 / 3 * np.pi * radius ** 3)))
times = {i: [] for i in methods}
for name, method in methods.items():
for n, system, repeat in zip(sizes, systems, repeats):
if name == "pdist" and n > 30000:
break # memory issues
st = time.time()
try:
with time_limit(5 * repeat):
for ind in range(repeat):
k = len(method(system[ind], radius))
except:
print("Run aborted")
break
end = time.time()
mytime = (end - st) / repeat
times[name].append((n, mytime))
print("{0} radius={1} n={2} time={3} repeat={4} contPerParticle={5}".format(name, radius, n, mytime,repeat, 2 * k / n))
for name in sorted(times.keys()):
plt.plot(*zip(*times[name]), label=name)
plt.xscale("log")
plt.yscale("log")
plt.xlabel("System size")
plt.ylabel("Time (seconds)")
plt.legend(loc=0)
plt.show()
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With