Cython not giving speedup

Question

I have been trying to test the speedup potential of using Cython as compared to the base Python code. For this purpose, I wrote two scripts 'linearAdvec_mat.py' and 'linearAdvec_mat.pyx' as follows:

linearAdvec_mat.py:

import numpy as np

def Adv_mat(N):
    A = np.zeros((N, N));
    for i in range(N):
        if i == 0:
            A[i, N - 1] = -1.0;
            A[i, i] = 0.0;
            A[i, i + 1] = 1.0;
        elif i == N - 1:
            A[i, i - 1] = -1.0;
            A[i, i] = 0.0;
            A[i, 0] = 1.0;
        else:
            A[i, i - 1] = -1.0;
            A[i, i] = 0.0;
            A[i, i + 1] = 1.0;
    return A;

def Diff_mat(N):
    D = np.zeros((N, N));
    for i in range(N):
        if i == 0:
            D[i, N - 1] = 1.0;
            D[i, i] = -2.0;
            D[i, i + 1] = 1.0;
        elif i == N - 1:
            D[i, i - 1] = 1.0;
            D[i, i] = -2.0;
            D[i, 0] = 1.0;
        else:
            D[i, i - 1] = 1.0;
            D[i, i] = -2.0;
            D[i, i + 1] = 1.0;
    return D;

def Compute_eigVals(N, alpha, kdt):
    A = Adv_mat(N);
    D = Diff_mat(N);
    ADt = A*(-alpha/2.0) + D*kdt;
    ldt = np.zeros(N, 'complex');
    beta = np.zeros(N);
    for m in range(N):
        beta[m] = 2*np.pi*m/N;
        if beta[m] > np.pi:
            beta[m] = 2*np.pi - beta[m];
        for j in range(N):
            ldt[m] += ADt[0, j]*np.exp(1j*2.0*np.pi*j*m/N);
    return ldt;

and linearAdvec_mat.pyx:

import numpy as np
cimport numpy as np

DTYPE = np.float64;
DTYPE_c = np.complex128;
ctypedef np.float64_t DTYPE_t;

cdef np.ndarray[DTYPE_t, ndim = 2] Adv_mat(int N):
    cdef np.ndarray[DTYPE_t, ndim = 2] A = np.zeros((N, N), dtype = DTYPE);
    cdef int i;
    for i in range(N):
        if i == 0:
            A[i, N - 1] = -1.0;
            A[i, i] = 0.0;
            A[i, i + 1] = 1.0;
        elif i == N - 1:
            A[i, i - 1] = -1.0;
            A[i, i] = 0.0;
            A[i, 0] = 1.0;
        else:
            A[i, i - 1] = -1.0;
            A[i, i] = 0.0;
            A[i, i + 1] = 1.0;
    return A;

cdef np.ndarray[DTYPE_t, ndim = 2] Diff_mat(int N):
    cdef np.ndarray[DTYPE_t, ndim = 2] D = np.zeros((N, N), dtype = DTYPE);
    cdef int i;
    for i in range(N):
        if i == 0:
            D[i, N - 1] = 1.0;
            D[i, i] = -2.0;
            D[i, i + 1] = 1.0;
        elif i == N - 1:
            D[i, i - 1] = 1.0;
            D[i, i] = -2.0;
            D[i, 0] = 1.0;
        else:
            D[i, i - 1] = 1.0;
            D[i, i] = -2.0;
            D[i, i + 1] = 1.0;
    return D;

cpdef np.ndarray[np.complex128_t, ndim = 1] Compute_eigVals(int N, double alpha, double kdt):
    cdef np.ndarray[DTYPE_t, ndim = 2] A = Adv_mat(N);
    cdef np.ndarray[DTYPE_t, ndim = 2] D = Diff_mat(N);
    cdef np.ndarray[np.complex128_t, ndim = 2] ADt = A*(-alpha/2.0) + D*kdt + 0j;
    cdef np.ndarray[np.complex128_t, ndim = 1] ldt = np.zeros(N, dtype = DTYPE_c);
    cdef np.ndarray[DTYPE_t, ndim = 1] beta = np.zeros(N, dtype = DTYPE);
    cdef int m, k;
    for m in range(N):
        beta[m] = 2*np.pi*m/N;
        if beta[m] > np.pi:
            beta[m] = 2*np.pi - beta[m];
        for k in range(N):
            ldt[m] = ldt[m] + ADt[0, k]*np.exp(1j*2.0*np.pi*k*m/N);
    return ldt;

When I call the 'Compute_eigVals' function from the base python and the compiled .so file like shown below, I don't get any significant speedup from the cython script.

import numpy as np
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['mathtext.fontset'] = 'stix'
matplotlib.rcParams['font.family'] = 'STIXGeneral'
from libs.linearAdvec_mat import Compute_eigVals as Compute_eigVals_cy
from linearAdvec_mat import Compute_eigVals as Compute_eigVals_py
import time
#%% ------------------ Inputs ---------------------
N = 1000;
alpha = 0.8;
kdt = 0.05;

st = time.time();
eigs = Compute_eigVals_cy(N, alpha, kdt);
t_cy = time.time() - st;
print('Cython time : %0.8fs
'%(t_cy));

st = time.time();
eigs = Compute_eigVals_py(N, alpha, kdt);
t_py = time.time() - st;
print('Python time : %0.8fs
'%(t_py));
print('Cython is %0.5f times faster'%(t_py/t_cy));

I tried to check the amount of python interaction by running

cython -a linearAdvec_mat.pyx

in the terminal, but I couldn't work out anything from that. Could someone please provide some insights onto why I don't get a significant amount of speedup when using cython? My first guess was that my base python script is heavily reliant on numpy as such, it is already in an optimised state, but I am fully sure and am eager to figure out what is actually going on.

joni · Accepted Answer

Cython solution:

Let's time your python function as benchmark reference:

In [3]: %timeit Compute_eigVals(N, alpha, kdt)
3.85 s ± 22.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

By profiling your Python code inside a jupyter notebook

In [4]: %lprun -f Compute_eigVals Compute_eigVals(N, alpha, kdt)
Timer unit: 1e-06 s

Total time: 4.35475 s
File: <ipython-input-1-61dba133ade4>
Function: Compute_eigVals at line 37

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
    37                                           def Compute_eigVals(N, alpha, kdt):
    38         1       2491.0   2491.0      0.1      A = Adv_mat(N);
    39         1       2295.0   2295.0      0.1      D = Diff_mat(N);
    40         1       8582.0   8582.0      0.2      ADt = A*(-alpha/2.0) + D*kdt;
    41         1         11.0     11.0      0.0      ldt = np.zeros(N, 'complex');
    42         1          2.0      2.0      0.0      beta = np.zeros(N);
    43      1001        357.0      0.4      0.0      for m in range(N):
    44      1000        713.0      0.7      0.0          beta[m] = 2*np.pi*m/N;
    45      1000        720.0      0.7      0.0          if beta[m] > np.pi:
    46       499        356.0      0.7      0.0              beta[m] = 2*np.pi - beta[m];
    47   1001000     390717.0      0.4      9.0          for j in range(N):
    48   1000000    3948510.0      3.9     90.7              ldt[m] += ADt[0, j]*np.exp(1j*2.0*np.pi*j*m/N);
    49         1          1.0      1.0      0.0      return ldt;

we can observe that the time-critical part is the innermost loop. So let's take a look at your cython code:

enter image description here

Here are a few key points in order to reduce the python overhead:

Accessing the constant np.pi has noticeable python overhead. Instead, you can use the C constant pi inside libc.math. In addition, you can cache the results of 2.0*pi and 1j*2.0*pi since you use both multiple times.
Similarly, the function np.exp has python overhead as well and calling it for a scalar argument doesn't justify the overhead of calling python functions. Instead, you can use the C cexp function.
Finally, you can use Cython Compiler directives to further accelerate your code. Here, we enable the C integer division (cdivision), disable the index checks (boundscheck) and disable negative indices (wraparound)

In code:

cimport cython

from libc.math cimport pi
cdef extern from "complex.h":
    double complex cexp(double complex)

# Adv_mat and Diff_mat are the same as above

@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef np.ndarray[np.complex128_t, ndim = 1] Compute_eigVals(int N, double alpha, double kdt):
    cdef np.ndarray[DTYPE_t, ndim = 2] A = Adv_mat(N)
    cdef np.ndarray[DTYPE_t, ndim = 2] D = Diff_mat(N)
    cdef np.ndarray[np.complex128_t, ndim = 2] ADt = A*(-alpha/2.0) + D*kdt + 0j
    cdef np.ndarray[np.complex128_t, ndim = 1] ldt = np.zeros(N, dtype = DTYPE_c)
    cdef np.ndarray[DTYPE_t, ndim = 1] beta = np.zeros(N, dtype = DTYPE)
    cdef int m, k
    cdef double two_pi = 2*pi
    cdef double complex factor = 1j*2.0*pi+0
    for m in range(N):
        beta[m] = two_pi*m / N;
        if beta[m] > pi:
            beta[m] = two_pi - beta[m];
        for k in range(N):
            ldt[m] = ldt[m] + ADt[0, k]*cexp(factor*k*m / N);
    return ldt;

This eliminates all python interaction inside the loops. Timing this on my machine gives:

In [6]: %timeit Compute_eigVals(N, alpha, kdt)
45.8 ms ± 1.23 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

Improved Python version:

Note also that there's no real need for Cython, since you can replace your python loops with vectorized numpy operations:

def Compute_eigVals2(N, alpha, kdt):
    A = Adv_mat(N);
    D = Diff_mat(N);
    ADt = A*(-alpha/2.0) + D*kdt;
    beta = 2*np.pi*np.arange(N)/N
    beta[beta > np.pi] = 2*np.pi - beta[beta > np.pi]
    JM = np.arange(N) * np.arange(N)[:, None]
    ldt = np.sum(ADt[0, :] * np.exp(1j*2.0*np.pi*JM/N), axis=-1)
    return ldt

In [7]: %timeit Compute_eigVals2(N, alpha, kdt)
35.8 ms ± 655 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

Cython not giving speedup

Tags:

python

numpy

cython

Suyash

1 Answers

joni

Recent Activity

Donate For Us

Cython not giving speedup

Tags:

python

numpy

cython

Suyash

1 Answers

joni

Related questions

Recent Activity

Donate For Us