I have been trying to test the speedup potential of using Cython as compared to the base Python code. For this purpose, I wrote two scripts 'linearAdvec_mat.py' and 'linearAdvec_mat.pyx' as follows:
linearAdvec_mat.py:
import numpy as np
def Adv_mat(N):
A = np.zeros((N, N));
for i in range(N):
if i == 0:
A[i, N - 1] = -1.0;
A[i, i] = 0.0;
A[i, i + 1] = 1.0;
elif i == N - 1:
A[i, i - 1] = -1.0;
A[i, i] = 0.0;
A[i, 0] = 1.0;
else:
A[i, i - 1] = -1.0;
A[i, i] = 0.0;
A[i, i + 1] = 1.0;
return A;
def Diff_mat(N):
D = np.zeros((N, N));
for i in range(N):
if i == 0:
D[i, N - 1] = 1.0;
D[i, i] = -2.0;
D[i, i + 1] = 1.0;
elif i == N - 1:
D[i, i - 1] = 1.0;
D[i, i] = -2.0;
D[i, 0] = 1.0;
else:
D[i, i - 1] = 1.0;
D[i, i] = -2.0;
D[i, i + 1] = 1.0;
return D;
def Compute_eigVals(N, alpha, kdt):
A = Adv_mat(N);
D = Diff_mat(N);
ADt = A*(-alpha/2.0) + D*kdt;
ldt = np.zeros(N, 'complex');
beta = np.zeros(N);
for m in range(N):
beta[m] = 2*np.pi*m/N;
if beta[m] > np.pi:
beta[m] = 2*np.pi - beta[m];
for j in range(N):
ldt[m] += ADt[0, j]*np.exp(1j*2.0*np.pi*j*m/N);
return ldt;
and linearAdvec_mat.pyx:
import numpy as np
cimport numpy as np
DTYPE = np.float64;
DTYPE_c = np.complex128;
ctypedef np.float64_t DTYPE_t;
cdef np.ndarray[DTYPE_t, ndim = 2] Adv_mat(int N):
cdef np.ndarray[DTYPE_t, ndim = 2] A = np.zeros((N, N), dtype = DTYPE);
cdef int i;
for i in range(N):
if i == 0:
A[i, N - 1] = -1.0;
A[i, i] = 0.0;
A[i, i + 1] = 1.0;
elif i == N - 1:
A[i, i - 1] = -1.0;
A[i, i] = 0.0;
A[i, 0] = 1.0;
else:
A[i, i - 1] = -1.0;
A[i, i] = 0.0;
A[i, i + 1] = 1.0;
return A;
cdef np.ndarray[DTYPE_t, ndim = 2] Diff_mat(int N):
cdef np.ndarray[DTYPE_t, ndim = 2] D = np.zeros((N, N), dtype = DTYPE);
cdef int i;
for i in range(N):
if i == 0:
D[i, N - 1] = 1.0;
D[i, i] = -2.0;
D[i, i + 1] = 1.0;
elif i == N - 1:
D[i, i - 1] = 1.0;
D[i, i] = -2.0;
D[i, 0] = 1.0;
else:
D[i, i - 1] = 1.0;
D[i, i] = -2.0;
D[i, i + 1] = 1.0;
return D;
cpdef np.ndarray[np.complex128_t, ndim = 1] Compute_eigVals(int N, double alpha, double kdt):
cdef np.ndarray[DTYPE_t, ndim = 2] A = Adv_mat(N);
cdef np.ndarray[DTYPE_t, ndim = 2] D = Diff_mat(N);
cdef np.ndarray[np.complex128_t, ndim = 2] ADt = A*(-alpha/2.0) + D*kdt + 0j;
cdef np.ndarray[np.complex128_t, ndim = 1] ldt = np.zeros(N, dtype = DTYPE_c);
cdef np.ndarray[DTYPE_t, ndim = 1] beta = np.zeros(N, dtype = DTYPE);
cdef int m, k;
for m in range(N):
beta[m] = 2*np.pi*m/N;
if beta[m] > np.pi:
beta[m] = 2*np.pi - beta[m];
for k in range(N):
ldt[m] = ldt[m] + ADt[0, k]*np.exp(1j*2.0*np.pi*k*m/N);
return ldt;
When I call the 'Compute_eigVals' function from the base python and the compiled .so file like shown below, I don't get any significant speedup from the cython script.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['mathtext.fontset'] = 'stix'
matplotlib.rcParams['font.family'] = 'STIXGeneral'
from libs.linearAdvec_mat import Compute_eigVals as Compute_eigVals_cy
from linearAdvec_mat import Compute_eigVals as Compute_eigVals_py
import time
#%% ------------------ Inputs ---------------------
N = 1000;
alpha = 0.8;
kdt = 0.05;
st = time.time();
eigs = Compute_eigVals_cy(N, alpha, kdt);
t_cy = time.time() - st;
print('Cython time : %0.8fs\n'%(t_cy));
st = time.time();
eigs = Compute_eigVals_py(N, alpha, kdt);
t_py = time.time() - st;
print('Python time : %0.8fs\n'%(t_py));
print('Cython is %0.5f times faster'%(t_py/t_cy));
I tried to check the amount of python interaction by running
cython -a linearAdvec_mat.pyx
in the terminal, but I couldn't work out anything from that. Could someone please provide some insights onto why I don't get a significant amount of speedup when using cython? My first guess was that my base python script is heavily reliant on numpy as such, it is already in an optimised state, but I am fully sure and am eager to figure out what is actually going on.
Cython solution:
Let's time your python function as benchmark reference:
In [3]: %timeit Compute_eigVals(N, alpha, kdt)
3.85 s ± 22.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
By profiling your Python code inside a jupyter notebook
In [4]: %lprun -f Compute_eigVals Compute_eigVals(N, alpha, kdt)
Timer unit: 1e-06 s
Total time: 4.35475 s
File: <ipython-input-1-61dba133ade4>
Function: Compute_eigVals at line 37
Line # Hits Time Per Hit % Time Line Contents
==============================================================
37 def Compute_eigVals(N, alpha, kdt):
38 1 2491.0 2491.0 0.1 A = Adv_mat(N);
39 1 2295.0 2295.0 0.1 D = Diff_mat(N);
40 1 8582.0 8582.0 0.2 ADt = A*(-alpha/2.0) + D*kdt;
41 1 11.0 11.0 0.0 ldt = np.zeros(N, 'complex');
42 1 2.0 2.0 0.0 beta = np.zeros(N);
43 1001 357.0 0.4 0.0 for m in range(N):
44 1000 713.0 0.7 0.0 beta[m] = 2*np.pi*m/N;
45 1000 720.0 0.7 0.0 if beta[m] > np.pi:
46 499 356.0 0.7 0.0 beta[m] = 2*np.pi - beta[m];
47 1001000 390717.0 0.4 9.0 for j in range(N):
48 1000000 3948510.0 3.9 90.7 ldt[m] += ADt[0, j]*np.exp(1j*2.0*np.pi*j*m/N);
49 1 1.0 1.0 0.0 return ldt;
we can observe that the time-critical part is the innermost loop. So let's take a look at your cython code:
Here are a few key points in order to reduce the python overhead:
np.pi
has noticeable python overhead. Instead, you can use the C constant pi
inside libc.math
. In addition, you can cache the results of 2.0*pi
and 1j*2.0*pi
since you use both multiple times.np.exp
has python overhead as well and calling it for a scalar argument doesn't justify the overhead of calling python functions. Instead, you can use the C cexp
function.cdivision
), disable the index checks (boundscheck
) and disable negative indices (wraparound
)In code:
cimport cython
from libc.math cimport pi
cdef extern from "complex.h":
double complex cexp(double complex)
# Adv_mat and Diff_mat are the same as above
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef np.ndarray[np.complex128_t, ndim = 1] Compute_eigVals(int N, double alpha, double kdt):
cdef np.ndarray[DTYPE_t, ndim = 2] A = Adv_mat(N)
cdef np.ndarray[DTYPE_t, ndim = 2] D = Diff_mat(N)
cdef np.ndarray[np.complex128_t, ndim = 2] ADt = A*(-alpha/2.0) + D*kdt + 0j
cdef np.ndarray[np.complex128_t, ndim = 1] ldt = np.zeros(N, dtype = DTYPE_c)
cdef np.ndarray[DTYPE_t, ndim = 1] beta = np.zeros(N, dtype = DTYPE)
cdef int m, k
cdef double two_pi = 2*pi
cdef double complex factor = 1j*2.0*pi+0
for m in range(N):
beta[m] = two_pi*m / N;
if beta[m] > pi:
beta[m] = two_pi - beta[m];
for k in range(N):
ldt[m] = ldt[m] + ADt[0, k]*cexp(factor*k*m / N);
return ldt;
This eliminates all python interaction inside the loops. Timing this on my machine gives:
In [6]: %timeit Compute_eigVals(N, alpha, kdt)
45.8 ms ± 1.23 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Improved Python version:
Note also that there's no real need for Cython, since you can replace your python loops with vectorized numpy operations:
def Compute_eigVals2(N, alpha, kdt):
A = Adv_mat(N);
D = Diff_mat(N);
ADt = A*(-alpha/2.0) + D*kdt;
beta = 2*np.pi*np.arange(N)/N
beta[beta > np.pi] = 2*np.pi - beta[beta > np.pi]
JM = np.arange(N) * np.arange(N)[:, None]
ldt = np.sum(ADt[0, :] * np.exp(1j*2.0*np.pi*JM/N), axis=-1)
return ldt
In [7]: %timeit Compute_eigVals2(N, alpha, kdt)
35.8 ms ± 655 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With