Chapter 19: Code optimization
Robert Johansson
Source code listings for Numerical Python - Scientific Computing and Data Science Applications with Numpy, SciPy and Matplotlib (ISBN 979-8-8688-0412-0).
import numbaimport pyximportimport cythonimport numpy as np%matplotlib inline
import matplotlib.pyplot as pltNumba¶
np.random.seed(0)data = np.random.randn(50000)def py_sum(data):
s = 0
for d in data:
s += d
return sdef py_cumsum(data):
out = np.zeros(len(data), dtype=np.float64)
s = 0
for n in range(len(data)):
s += data[n]
out[n] = s
return out%timeit py_sum(data)3.02 ms ± 108 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
assert abs(py_sum(data) - np.sum(data)) < 1e-10%timeit np.sum(data)11.3 μs ± 1.08 μs per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
%timeit py_cumsum(data)6.86 ms ± 185 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
assert np.allclose(np.cumsum(data), py_cumsum(data))%timeit np.cumsum(data)146 μs ± 3.16 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
@numba.jit
def jit_sum(data):
s = 0
for d in data:
s += d
return sassert abs(jit_sum(data) - np.sum(data)) < 1e-10%timeit jit_sum(data)31.4 μs ± 1.16 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
jit_cumsum = numba.jit()(py_cumsum)assert np.allclose(np.cumsum(data), jit_cumsum(data))%timeit jit_cumsum(data)38 μs ± 2.31 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
Julia fractal¶
def py_julia_fractal(z_re, z_im, j):
for m in range(len(z_re)):
for n in range(len(z_im)):
z = z_re[m] + 1j * z_im[n]
for t in range(256):
z = z**2 - 0.05 + 0.68j
if np.abs(z) > 2.0:
# if (z.real * z.real + z.imag * z.imag) > 4.0: # a bit faster
j[m, n] = t
breakjit_julia_fractal = numba.jit(nopython=True)(py_julia_fractal)N = 1024
j = np.zeros((N, N), np.int64)
z_real = np.linspace(-1.5, 1.5, N)
z_imag = np.linspace(-1.5, 1.5, N)jit_julia_fractal(z_real, z_imag, j)fig, ax = plt.subplots(figsize=(14, 14))
ax.imshow(j, cmap=plt.cm.RdBu_r, extent=[-1.5, 1.5, -1.5, 1.5])
ax.set_xlabel("$\mathrm{Re}(z)$", fontsize=18)
ax.set_ylabel("$\mathrm{Im}(z)$", fontsize=18)
fig.tight_layout()
fig.savefig("ch19-numba-julia-fractal.pdf")
%timeit py_julia_fractal(z_real, z_imag, j)17.3 s ± 717 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit jit_julia_fractal(z_real, z_imag, j)149 ms ± 647 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)
Vectorize¶
def py_Heaviside(x):
if x == 0.0:
return 0.5
if x < 0.0:
return 0.0
else:
return 1.0x = np.linspace(-2, 2, 50001)%timeit [py_Heaviside(xx) for xx in x]4.27 ms ± 113 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
np_vec_Heaviside = np.vectorize(py_Heaviside)np_vec_Heaviside(x)array([0., 0., 0., ..., 1., 1., 1.], shape=(50001,))%timeit np_vec_Heaviside(x)5.15 ms ± 15.7 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
def np_Heaviside(x):
return (x > 0.0) + (x == 0.0) / 2.0%timeit np_Heaviside(x)64.9 μs ± 722 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
@numba.vectorize([numba.float32(numba.float32), numba.float64(numba.float64)])
def jit_Heaviside(x):
if x == 0.0:
return 0.5
if x < 0:
return 0.0
else:
return 1.0%timeit jit_Heaviside(x)10.2 μs ± 113 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
jit_Heaviside([-1, -0.5, 0.0, 0.5, 1.0])array([0. , 0. , 0.5, 1. , 1. ])Cython¶
!rm cy_sum.*%%writefile cy_sum.pyx
def cy_sum(data):
s = 0.0
for d in data:
s += d
return sWriting cy_sum.pyx
!cython cy_sum.pyx# 5 lines of python code -> 1470 lines of C code ...
!wc cy_sum.c 7793 25359 305330 cy_sum.c
%%writefile setup.py
from distutils.core import setup
from Cython.Build import cythonize
import numpy as np
setup(ext_modules=cythonize('cy_sum.pyx'),
include_dirs=[np.get_include()],
requires=['Cython', 'numpy'] )Overwriting setup.py
!python setup.py build_ext --inplace > /dev/nullfrom cy_sum import cy_sumcy_sum(data)np.float64(-189.70046227549025)%timeit cy_sum(data)3.09 ms ± 100 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit py_sum(data)3.32 ms ± 63.7 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%writefile cy_cumsum.pyx
cimport numpy
import numpy
def cy_cumsum(data):
out = numpy.zeros_like(data)
s = 0
for n in range(len(data)):
s += data[n]
out[n] = s
return outOverwriting cy_cumsum.pyx
pyximport.install(setup_args={"include_dirs": np.get_include()});pyximport.install(setup_args=dict(include_dirs=np.get_include()));from cy_cumsum import cy_cumsum%timeit cy_cumsum(data)6.7 ms ± 197 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit py_cumsum(data)6.49 ms ± 156 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Using IPython cython command¶
%load_ext cython%%cython -a
def cy_sum(data):
s = 0.0
for d in data:
s += d
return sLoading...
%timeit cy_sum(data)2.92 ms ± 34.7 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit py_sum(data)3.13 ms ± 36.1 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
assert np.allclose(np.sum(data), cy_sum(data))%%cython -a
cimport numpy
cimport cython
@cython.boundscheck(False)
@cython.wraparound(False)
def cy_sum(numpy.ndarray[numpy.float64_t, ndim=1] data):
cdef numpy.float64_t s = 0.0
#cdef int n, N = data.shape[0]
cdef int n, N = len(data)
for n in range(N):
s += data[n]
return sLoading...
%timeit cy_sum(data)24.9 μs ± 1.77 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
%timeit jit_sum(data)24 μs ± 135 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
%timeit np.sum(data)11.4 μs ± 1.02 μs per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
Cummulative sum¶
%%cython -a
cimport numpy
import numpy
cimport cython
ctypedef numpy.float64_t FTYPE_t
@cython.boundscheck(False)
@cython.wraparound(False)
def cy_cumsum(numpy.ndarray[FTYPE_t, ndim=1] data):
cdef int n, N = data.size
cdef numpy.ndarray[FTYPE_t, ndim=1] out = numpy.zeros(N, dtype=data.dtype)
cdef numpy.float64_t s = 0.0
for n in range(N):
s += data[n]
out[n] = s
return outLoading...
%timeit py_cumsum(data)7.4 ms ± 187 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit cy_cumsum(data)42.1 μs ± 2.41 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
%timeit jit_cumsum(data)42.2 μs ± 1.58 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
%timeit np.cumsum(data)154 μs ± 2.27 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
assert np.allclose(cy_cumsum(data), np.cumsum(data))Fused types¶
py_sum([1.0, 2.0, 3.0, 4.0, 5.0])15.0py_sum([1, 2, 3, 4, 5])15cy_sum(np.array([1.0, 2.0, 3.0, 4.0, 5.0]))15.0cy_sum(np.array([1, 2, 3, 4, 5], dtype=np.float64))15.0%%cython -a
cimport numpy
cimport cython
ctypedef fused I_OR_F_t:
numpy.int64_t
numpy.float64_t
@cython.boundscheck(False)
@cython.wraparound(False)
def cy_fused_sum(numpy.ndarray[I_OR_F_t, ndim=1] data):
cdef I_OR_F_t s = 0
cdef int n, N = data.size
for n in range(N):
s += data[n]
return sLoading...
cy_fused_sum(np.array([1.0, 2.0, 3.0, 4.0, 5.0]))15.0cy_fused_sum(np.array([1, 2, 3, 4, 5]))15Julia fractal¶
%%cython -a
cimport numpy
cimport cython
ctypedef numpy.int64_t ITYPE_t
ctypedef numpy.float64_t FTYPE_t
cpdef inline double abs2(double complex z):
return z.real * z.real + z.imag * z.imag
@cython.boundscheck(False)
@cython.wraparound(False)
def cy_julia_fractal(numpy.ndarray[FTYPE_t, ndim=1] z_re,
numpy.ndarray[FTYPE_t, ndim=1] z_im,
numpy.ndarray[ITYPE_t, ndim=2] j):
cdef int m, n, t, M = z_re.size, N = z_im.size
cdef double complex z
for m in range(M):
for n in range(N):
z = z_re[m] + 1.0j * z_im[n]
for t in range(256):
z = z ** 2 - 0.05 + 0.68j
if abs2(z) > 4.0:
j[m, n] = t
breakLoading...
N = 1024j = np.zeros((N, N), dtype=np.int64)z_real = np.linspace(-1.5, 1.5, N)z_imag = np.linspace(-1.5, 1.5, N)%timeit cy_julia_fractal(z_real, z_imag, j)2.47 s ± 169 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit jit_julia_fractal(z_real, z_imag, j)155 ms ± 2.88 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
j1 = np.zeros((N, N), dtype=np.int64)cy_julia_fractal(z_real, z_imag, j1)j2 = np.zeros((N, N), dtype=np.int64)jit_julia_fractal(z_real, z_imag, j2)# assert np.allclose(j1, j2)fig, axes = plt.subplots(1, 2, figsize=(14, 14))
axes[0].imshow(j1, cmap=plt.cm.RdBu_r, extent=[-1.5, 1.5, -1.5, 1.5])
axes[0].set_xlabel("$\mathrm{Re}(z)$", fontsize=18)
axes[0].set_ylabel("$\mathrm{Im}(z)$", fontsize=18)
axes[1].imshow(j2, cmap=plt.cm.RdBu_r, extent=[-1.5, 1.5, -1.5, 1.5])
axes[1].set_xlabel("$\mathrm{Re}(z)$", fontsize=18)
axes[1].set_ylabel("$\mathrm{Im}(z)$", fontsize=18)
fig.tight_layout()
Calling C function¶
%%cython
cdef extern from "math.h":
double acos(double)
def cy_acos1(double x):
return acos(x)%timeit cy_acos1(0.5)37.2 ns ± 0.401 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)
%%cython
from libc.math cimport acos
def cy_acos2(double x):
return acos(x)%timeit cy_acos2(0.5)35.5 ns ± 0.155 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)
from numpy import arccos%timeit arccos(0.5)80.3 ns ± 1.72 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)
from math import acos%timeit acos(0.5)32.6 ns ± 0.582 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)
assert cy_acos1(0.5) == acos(0.5)assert cy_acos2(0.5) == acos(0.5)- Johansson, R. (2024). Numerical Python: Scientific Computing and Data Science Applications with Numpy, SciPy and Matplotlib. Apress. 10.1007/979-8-8688-0413-7