Skip to article content

Pre-pre-school

Back to Article
Chapter 19: Code optimization
Download Notebook

Chapter 19: Code optimization

import numba
import pyximport
import cython
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

Numba

np.random.seed(0)
data = np.random.randn(50000)
def py_sum(data):
    """Return the sum of the elements of *data* with a plain Python loop.

    Pure-Python baseline that the Numba/Cython variants below are
    benchmarked against.
    """
    total = 0
    for value in data:
        total += value
    return total
def py_cumsum(data):
    """Return the cumulative sum of *data* as a float64 NumPy array.

    Pure-Python baseline for the JIT/Cython cumulative-sum comparisons.
    """
    result = np.zeros(len(data), dtype=np.float64)
    running = 0
    for i, value in enumerate(data):
        running += value
        result[i] = running
    return result
%timeit py_sum(data)
3.02 ms ± 108 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
assert abs(py_sum(data) - np.sum(data)) < 1e-10
%timeit np.sum(data)
11.3 μs ± 1.08 μs per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
%timeit py_cumsum(data)
6.86 ms ± 185 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
assert np.allclose(np.cumsum(data), py_cumsum(data))
%timeit np.cumsum(data)
146 μs ± 3.16 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
@numba.jit
def jit_sum(data):
    """Sum the elements of *data*; same loop as py_sum, JIT-compiled by Numba."""
    total = 0
    for value in data:
        total += value
    return total
assert abs(jit_sum(data) - np.sum(data)) < 1e-10
%timeit jit_sum(data)
31.4 μs ± 1.16 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
jit_cumsum = numba.jit()(py_cumsum)
assert np.allclose(np.cumsum(data), jit_cumsum(data))
%timeit jit_cumsum(data)
38 μs ± 2.31 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)

Julia fractal

def py_julia_fractal(z_re, z_im, j):
    """Fill *j* with Julia-set escape times for c = -0.05 + 0.68j.

    z_re, z_im: 1-D arrays of real/imaginary grid coordinates.
    j: 2-D integer array of shape (len(z_re), len(z_im)); j[m, n] is set
    to the iteration (0..255) at which |z| first exceeds 2; points that
    never escape within 256 iterations leave j[m, n] untouched.
    """
    for m, re_part in enumerate(z_re):
        for n, im_part in enumerate(z_im):
            z = re_part + 1j * im_part
            for t in range(256):
                z = z ** 2 - 0.05 + 0.68j
                # |z| > 2 guarantees divergence; comparing |z|**2 with 4.0
                # would skip the square root and run a bit faster.
                if np.abs(z) > 2.0:
                    j[m, n] = t
                    break
jit_julia_fractal = numba.jit(nopython=True)(py_julia_fractal)
N = 1024
j = np.zeros((N, N), np.int64)
z_real = np.linspace(-1.5, 1.5, N)
z_imag = np.linspace(-1.5, 1.5, N)
jit_julia_fractal(z_real, z_imag, j)
fig, ax = plt.subplots(figsize=(14, 14))
ax.imshow(j, cmap=plt.cm.RdBu_r, extent=[-1.5, 1.5, -1.5, 1.5])
ax.set_xlabel("$\mathrm{Re}(z)$", fontsize=18)
ax.set_ylabel("$\mathrm{Im}(z)$", fontsize=18)
fig.tight_layout()
fig.savefig("ch19-numba-julia-fractal.pdf")
<Figure size 10500x10500 with 1 Axes>
%timeit py_julia_fractal(z_real, z_imag, j)
17.3 s ± 717 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit jit_julia_fractal(z_real, z_imag, j)
149 ms ± 647 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)

Vectorize

def py_Heaviside(x):
    """Scalar Heaviside step: 0.0 for x < 0, 0.5 at x == 0, 1.0 for x > 0."""
    if x == 0.0:
        return 0.5
    # Note: any x that compares neither equal to 0 nor less than 0
    # (including NaN) falls through to 1.0, matching the original logic.
    return 0.0 if x < 0.0 else 1.0
x = np.linspace(-2, 2, 50001)
%timeit [py_Heaviside(xx) for xx in x]
4.27 ms ± 113 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
np_vec_Heaviside = np.vectorize(py_Heaviside)
np_vec_Heaviside(x)
array([0., 0., 0., ..., 1., 1., 1.], shape=(50001,))
%timeit np_vec_Heaviside(x)
5.15 ms ± 15.7 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
def np_Heaviside(x):
    """Vectorized Heaviside step built from boolean arithmetic.

    True/False coerce to 1/0, so positives give 1.0, zeros give 0.5,
    and negatives give 0.0 — elementwise for array input.
    """
    positive = x > 0.0
    at_zero = x == 0.0
    return positive + 0.5 * at_zero
%timeit np_Heaviside(x)
64.9 μs ± 722 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
@numba.vectorize([numba.float32(numba.float32), numba.float64(numba.float64)])
def jit_Heaviside(x):
    """Heaviside step compiled as a Numba ufunc for float32/float64 input."""
    if x == 0.0:
        return 0.5
    # Same fallthrough as the scalar version: anything not equal to 0 and
    # not less than 0 maps to 1.0.
    return 0.0 if x < 0 else 1.0
%timeit jit_Heaviside(x)
10.2 μs ± 113 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
jit_Heaviside([-1, -0.5, 0.0, 0.5, 1.0])
array([0. , 0. , 0.5, 1. , 1. ])

Cython

!rm cy_sum.*
%%writefile cy_sum.pyx

def cy_sum(data):
    """Sum the elements of *data*.

    Untyped Cython: compiled to C but still uses dynamic Python
    semantics, so it runs at roughly pure-Python speed.
    """
    total = 0.0
    for value in data:
        total += value
    return total
Writing cy_sum.pyx
!cython cy_sum.pyx
# 5 lines of Python code -> roughly 7800 lines of C code (see wc output below) ...
!wc cy_sum.c
  7793  25359 305330 cy_sum.c
%%writefile setup.py

from distutils.core import setup
from Cython.Build import cythonize

import numpy as np
setup(ext_modules=cythonize('cy_sum.pyx'),
      include_dirs=[np.get_include()],
      requires=['Cython', 'numpy'] )
Overwriting setup.py
!python setup.py build_ext --inplace > /dev/null
from cy_sum import cy_sum
cy_sum(data)
np.float64(-189.70046227549025)
%timeit cy_sum(data)
3.09 ms ± 100 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit py_sum(data)
3.32 ms ± 63.7 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%writefile cy_cumsum.pyx

cimport numpy
import numpy

def cy_cumsum(data):
    """Return the running sum of *data* in an array shaped/typed like *data*.

    Untyped Cython version; note the output inherits data's dtype via
    zeros_like (unlike py_cumsum, which always returns float64).
    """
    acc = numpy.zeros_like(data)
    total = 0
    for i in range(len(data)):
        total += data[i]
        acc[i] = total

    return acc
Overwriting cy_cumsum.pyx
pyximport.install(setup_args={"include_dirs": np.get_include()});
pyximport.install(setup_args=dict(include_dirs=np.get_include()));
from cy_cumsum import cy_cumsum
%timeit cy_cumsum(data)
6.7 ms ± 197 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit py_cumsum(data)
6.49 ms ± 156 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)

Using IPython cython command

%load_ext cython
%%cython -a
def cy_sum(data):
    """Sum the elements of *data* (untyped %%cython cell — dynamic loop)."""
    accumulator = 0.0
    for item in data:
        accumulator += item
    return accumulator
Loading...
%timeit cy_sum(data)
2.92 ms ± 34.7 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit py_sum(data)
3.13 ms ± 36.1 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
assert np.allclose(np.sum(data), cy_sum(data))
%%cython -a
cimport numpy
cimport cython

@cython.boundscheck(False)
@cython.wraparound(False)
def cy_sum(numpy.ndarray[numpy.float64_t, ndim=1] data):
    """Sum a 1-D float64 array; full static typing lets the loop run as C."""
    cdef numpy.float64_t s = 0.0
    #cdef int n, N = data.shape[0]
    # NOTE(review): a C int index overflows past 2**31-1 elements;
    # Py_ssize_t would be the safer index type — confirm intended limits.
    cdef int n, N = len(data)
    for n in range(N):
        s += data[n]
    return s
Loading...
%timeit cy_sum(data)
24.9 μs ± 1.77 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
%timeit jit_sum(data)
24 μs ± 135 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
%timeit np.sum(data)
11.4 μs ± 1.02 μs per loop (mean ± std. dev. of 7 runs, 100,000 loops each)

Cumulative sum

%%cython -a
cimport numpy
import numpy
cimport cython

ctypedef numpy.float64_t FTYPE_t

@cython.boundscheck(False)
@cython.wraparound(False)
def cy_cumsum(numpy.ndarray[FTYPE_t, ndim=1] data):
    """Return the cumulative sum of a 1-D float64 array as a new float64 array."""
    # NOTE(review): C int index overflows past 2**31-1 elements; Py_ssize_t
    # would be safer for very large arrays.
    cdef int n, N = data.size
    cdef numpy.ndarray[FTYPE_t, ndim=1] out = numpy.zeros(N, dtype=data.dtype)
    cdef numpy.float64_t s = 0.0
    for n in range(N):
        s += data[n]
        out[n] = s
    return out
Loading...
%timeit py_cumsum(data)
7.4 ms ± 187 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit cy_cumsum(data)
42.1 μs ± 2.41 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
%timeit jit_cumsum(data)
42.2 μs ± 1.58 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
%timeit np.cumsum(data)
154 μs ± 2.27 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
assert np.allclose(cy_cumsum(data), np.cumsum(data))

Fused types

py_sum([1.0, 2.0, 3.0, 4.0, 5.0])
15.0
py_sum([1, 2, 3, 4, 5])
15
cy_sum(np.array([1.0, 2.0, 3.0, 4.0, 5.0]))
15.0
cy_sum(np.array([1, 2, 3, 4, 5], dtype=np.float64))
15.0
%%cython -a
cimport numpy
cimport cython

# Fused type: Cython generates one specialization of the function per
# member type, so a single source handles both int64 and float64 arrays.
ctypedef fused I_OR_F_t:
    numpy.int64_t 
    numpy.float64_t 

@cython.boundscheck(False)
@cython.wraparound(False)
def cy_fused_sum(numpy.ndarray[I_OR_F_t, ndim=1] data):
    """Sum a 1-D int64 or float64 array; the accumulator matches the dtype."""
    cdef I_OR_F_t s = 0
    cdef int n, N = data.size
    for n in range(N):
        s += data[n]
    return s
Loading...
cy_fused_sum(np.array([1.0, 2.0, 3.0, 4.0, 5.0]))
15.0
cy_fused_sum(np.array([1, 2, 3, 4, 5]))
15

Julia fractal

%%cython -a
cimport numpy
cimport cython

ctypedef numpy.int64_t ITYPE_t
ctypedef numpy.float64_t FTYPE_t

# Squared magnitude of z — avoids the sqrt that a true abs() would need.
# cpdef inline: callable from both C (fast path below) and Python.
cpdef inline double abs2(double complex z):
    return z.real * z.real + z.imag * z.imag

@cython.boundscheck(False)
@cython.wraparound(False)
def cy_julia_fractal(numpy.ndarray[FTYPE_t, ndim=1] z_re, 
                     numpy.ndarray[FTYPE_t, ndim=1] z_im, 
                     numpy.ndarray[ITYPE_t, ndim=2] j):
    """Fill j[m, n] with the Julia-set escape time for c = -0.05 + 0.68j.

    Escape is tested as |z|^2 > 4 (via abs2), equivalent to |z| > 2 but
    without the square root. Points that never escape within 256
    iterations leave their j entry unchanged.
    """
    # NOTE(review): C int indices overflow past 2**31-1 grid points;
    # fine for the 1024x1024 grids used here.
    cdef int m, n, t, M = z_re.size, N = z_im.size
    cdef double complex z
    for m in range(M):
        for n in range(N):
            z = z_re[m] + 1.0j * z_im[n]
            for t in range(256):
                z = z ** 2 - 0.05 + 0.68j
                if abs2(z) > 4.0:
                    j[m, n] = t
                    break
Loading...
N = 1024
j = np.zeros((N, N), dtype=np.int64)
z_real = np.linspace(-1.5, 1.5, N)
z_imag = np.linspace(-1.5, 1.5, N)
%timeit cy_julia_fractal(z_real, z_imag, j)
2.47 s ± 169 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit jit_julia_fractal(z_real, z_imag, j)
155 ms ± 2.88 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
j1 = np.zeros((N, N), dtype=np.int64)
cy_julia_fractal(z_real, z_imag, j1)
j2 = np.zeros((N, N), dtype=np.int64)
jit_julia_fractal(z_real, z_imag, j2)
# assert np.allclose(j1, j2)
fig, axes = plt.subplots(1, 2, figsize=(14, 14))
axes[0].imshow(j1, cmap=plt.cm.RdBu_r, extent=[-1.5, 1.5, -1.5, 1.5])
axes[0].set_xlabel("$\mathrm{Re}(z)$", fontsize=18)
axes[0].set_ylabel("$\mathrm{Im}(z)$", fontsize=18)
axes[1].imshow(j2, cmap=plt.cm.RdBu_r, extent=[-1.5, 1.5, -1.5, 1.5])
axes[1].set_xlabel("$\mathrm{Re}(z)$", fontsize=18)
axes[1].set_ylabel("$\mathrm{Im}(z)$", fontsize=18)
fig.tight_layout()
<Figure size 10500x10500 with 2 Axes>

Calling C function

%%cython

cdef extern from "math.h":
     double acos(double)

def cy_acos1(double x):
    """Arc cosine via the C library's acos, declared with `cdef extern` above."""
    return acos(x)
%timeit cy_acos1(0.5)
37.2 ns ± 0.401 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)
%%cython

from libc.math cimport acos

def cy_acos2(double x):
    """Arc cosine via the C library's acos, cimported from libc.math."""
    return acos(x)
%timeit cy_acos2(0.5)
35.5 ns ± 0.155 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)
from numpy import arccos
%timeit arccos(0.5)
80.3 ns ± 1.72 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)
from math import acos
%timeit acos(0.5)
32.6 ns ± 0.582 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)
assert cy_acos1(0.5) == acos(0.5)
assert cy_acos2(0.5) == acos(0.5)
References
  1. Johansson, R. (2024). Numerical Python: Scientific Computing and Data Science Applications with Numpy, SciPy and Matplotlib. Apress. 10.1007/979-8-8688-0413-7