I am trying to accelerate my code and this part of it is giving me problems,
I tried to use Cython and then followed the advise given here but my pure python function performs better than both the cython and cython_optimized ones
The cython code is the following:
import numpy as np
cimport numpy as np
DTYPE = np.float
ctypedef np.float_t DTYPE_t
cimport cython
@cython.boundscheck(False)
@cython.wraparound(False)
def compute_cython(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6
IceI, IceC, IceD, IceE, IceF, IceG, IceH = 273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2, 6.4650e4, 1.6935e6
delta = u-DustJ
result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);
x= u/IceI;
result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))
return (DensityIceProfile*result_ice+DensityDustProfile*result_dust)/DensityProfile
def compute_cythonOptimized(np.ndarray[DTYPE_t, ndim=1] u, np.ndarray[DTYPE_t, ndim=1] PorosityProfile, np.ndarray[DTYPE_t, ndim=1] DensityIceProfile, np.ndarray[DTYPE_t, ndim=1] DensityDustProfile, np.ndarray DensityProfile):
assert u.dtype == DTYPE
assert PorosityProfile.dtype == DTYPE
assert DensityIceProfile.dtype == DTYPE
assert DensityDustProfile.dtype == DTYPE
assert DensityProfile.dtype == DTYPE
cdef float DustJ = 250.0
cdef float DustF = 633.0
cdef float DustG = 2.513
cdef float DustH = -2.2e-3
cdef float DustI = -2.8e-6
cdef float IceI = 273.16
cdef float IceC = 1.843e5
cdef float IceD = 1.6357e8
cdef float IceE = 3.5519e9
cdef float IceF = 1.6670e2
cdef float IceG = 6.4650e4
cdef float IceH = 1.6935e6
cdef np.ndarray[DTYPE_t, ndim=1] delta = u-DustJ
cdef np.ndarray[DTYPE_t, ndim=1] result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);
cdef np.ndarray[DTYPE_t, ndim=1] x= u/IceI;
cdef np.ndarray[DTYPE_t, ndim=1] result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))
return (DensityIceProfile*result_ice+DensityDustProfile*result_dust)/DensityProfile
I then run the following commands:
def compute_python(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6
IceI, IceC, IceD, IceE, IceF, IceG, IceH = 273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2, 6.4650e4, 1.6935e6
delta = u-DustJ
result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);
x= u/IceI;
result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))
return (DensityIceProfile*result_ice+DensityDustProfile*result_dust)/DensityProfile
import sublimation
import numpy as np
%timeit compute_python(np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100))
%timeit compute_cython(np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100))
%timeit compute_cythonOptimized(np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100))
Which results in the following:
For the pure python: 68.9 µs ± 851 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
For the non optimized cython: 68.2 µs ± 685 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
And for the optimized one: 72.7 µs ± 416 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
What am I doing wrong ?
Thanks for your help,
CodeSurgeon already gave an excellent answer using Cython. In this answer I wan't to show an alternative way using Numba.
I have created three versions. In naive_numba
I only have added an function decorator. In improved_Numba
I have manually combined the loops (every vectorized command is actually a loop). In improved_Numba_p
I have parallelized the function. Please note that there is obviously a Bug not allowing to define constant values when using the pararallel accelerator. It has also be noted that the parallelized version is only beneficial for larger input arrays. But you can also add a small wrapper which calls the single threaded or the parallelized version according to the input array size.
Code dtype=float64
import numba as nb
import numpy as np
import time
@nb.njit(fastmath=True)
def naive_Numba(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6
IceI, IceC, IceD, IceE, IceF, IceG, IceH = 273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2, 6.4650e4, 1.6935e6
delta = u-DustJ
result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);
x= u/IceI;
result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))
return (DensityIceProfile*result_ice+DensityDustProfile*result_dust)/DensityProfile
#error_model='numpy' sets divison by 0 to NaN instead of throwing a exception, this allows vectorization
@nb.njit(fastmath=True,error_model='numpy')
def improved_Numba(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6
IceI, IceC, IceD, IceE, IceF, IceG, IceH = 273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2, 6.4650e4, 1.6935e6
res=np.empty(u.shape[0],dtype=u.dtype)
for i in range(u.shape[0]):
delta = u[i]-DustJ
result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);
x= u[i]/IceI
result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))
res[i]=(DensityIceProfile[i]*result_ice+DensityDustProfile[i]*result_dust)/DensityProfile[i]
return res
#there is obviously a bug in Numba (declaring const values in the function)
@nb.njit(fastmath=True,parallel=True,error_model='numpy')
def improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile,DustJ, DustF, DustG, DustH, DustI,IceI, IceC, IceD, IceE, IceF, IceG, IceH):
res=np.empty((u.shape[0]),dtype=u.dtype)
for i in nb.prange(u.shape[0]):
delta = u[i]-DustJ
result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);
x= u[i]/IceI
result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))
res[i]=(DensityIceProfile[i]*result_ice+DensityDustProfile[i]*result_dust)/DensityProfile[i]
return res
u=np.array(np.random.rand(1000000),dtype=np.float32)
PorosityProfile=np.array(np.random.rand(1000000),dtype=np.float32)
DensityIceProfile=np.array(np.random.rand(1000000),dtype=np.float32)
DensityDustProfile=np.array(np.random.rand(1000000),dtype=np.float32)
DensityProfile=np.array(np.random.rand(1000000),dtype=np.float32)
DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6
IceI, IceC, IceD, IceE, IceF, IceG, IceH = 273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2, 6.4650e4, 1.6935e6
#don't measure compilation overhead on first call
res=improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile,DustJ, DustF, DustG, DustH, DustI,IceI, IceC, IceD, IceE, IceF, IceG, IceH)
for i in range(1000):
res=improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile,DustJ, DustF, DustG, DustH, DustI,IceI, IceC, IceD, IceE, IceF, IceG, IceH)
print(time.time()-t1)
print(time.time()-t1)
Performance
Arraysize np.random.rand(100)
Numpy 46.8µs
naive Numba 3.1µs
improved Numba: 1.62µs
improved_Numba_p: 17.45µs
#Arraysize np.random.rand(1000000)
Numpy 255.8ms
naive Numba 18.6ms
improved Numba: 6.13ms
improved_Numba_p: 3.54ms
Code dtype=np.float32
If np.float32 is sufficient you have to explicitly declare all constant values in the function to float32. Otherwise Numba will use float64.
@nb.njit(fastmath=True,error_model='numpy')
def improved_Numba(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
DustJ, DustF, DustG, DustH, DustI = nb.float32(250.0), nb.float32(633.0), nb.float32(2.513), nb.float32(-2.2e-3), nb.float32(-2.8e-6)
IceI, IceC, IceD, IceE, IceF, IceG, IceH = nb.float32(273.16), nb.float32(1.843e5), nb.float32(1.6357e8), nb.float32(3.5519e9), nb.float32(1.6670e2), nb.float32(6.4650e4), nb.float32(1.6935e6)
res=np.empty(u.shape[0],dtype=u.dtype)
for i in range(u.shape[0]):
delta = u[i]-DustJ
result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3)
x= u[i]/IceI
result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(nb.float32(1)+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))
res[i]=(DensityIceProfile[i]*result_ice+DensityDustProfile[i]*result_dust)/DensityProfile[i]
return res
@nb.njit(fastmath=True,parallel=True,error_model='numpy')
def improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
res=np.empty((u.shape[0]),dtype=u.dtype)
DustJ, DustF, DustG, DustH, DustI = nb.float32(250.0), nb.float32(633.0), nb.float32(2.513), nb.float32(-2.2e-3), nb.float32(-2.8e-6)
IceI, IceC, IceD, IceE, IceF, IceG, IceH = nb.float32(273.16), nb.float32(1.843e5), nb.float32(1.6357e8), nb.float32(3.5519e9), nb.float32(1.6670e2), nb.float32(6.4650e4), nb.float32(1.6935e6)
for i in nb.prange(u.shape[0]):
delta = u[i]-DustJ
result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3)
x= u[i]/IceI
result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(nb.float32(1)+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))
res[i]=(DensityIceProfile[i]*result_ice+DensityDustProfile[i]*result_dust)/DensityProfile[i]
return res
Performance
Arraysize np.random.rand(100).astype(np.float32)
Numpy 29.3µs
improved Numba: 1.33µs
improved_Numba_p: 18µs
Arraysize np.random.rand(1000000).astype(np.float32)
Numpy 117ms
improved Numba: 2.46ms
improved_Numba_p: 1.56ms
The comparison to the Cython version provided by @CodeSurgeon isn't really fair because he didn't compile the function with enabled AVX2 and FMA3 instructions. Numba compiles by default with -march=native which enables AVX2 and FMA3 instructions on my Core i7-4xxx.
But this makes sence if you wan't to distribute a compiled Cython version of your code, because it won't run by default on pre Haswell processors (or all Pentium and Celerons) if that optimizations are enabled. Compiling multiple code paths should be possible, but is compiler dependend and more work.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With