Chapter 10: Sparse matrices and graphs

Robert Johansson

Source code listings for Numerical Python - Scientific Computing and Data Science Applications with Numpy, SciPy and Matplotlib (ISBN 979-8-8688-0412-0).

%matplotlib inline
%config InlineBackend.figure_format='retina'

import matplotlib as mpl
import matplotlib.pyplot as plt
# mpl.rcParams['text.usetex'] = True
# mpl.rcParams['mathtext.fontset'] = 'stix'
# mpl.rcParams['font.family'] = 'serif'
# mpl.rcParams['font.sans-serif'] = 'stix'

import scipy.sparse as sp

import scipy.sparse.linalg

import numpy as np

import scipy.linalg as la

import networkx as nx

Coordinate list format¶

values = [1, 2, 3, 4]

rows = [0, 1, 2, 3]

cols = [1, 3, 2, 0]

A = sp.coo_matrix((values, (rows, cols)), shape=[4, 4])

A.todense()

matrix([[0, 1, 0, 0],
        [0, 0, 0, 2],
        [0, 0, 3, 0],
        [4, 0, 0, 0]])

<COOrdinate sparse matrix of dtype 'int64'
	with 4 stored elements and shape (4, 4)>

A.shape, A.size, A.dtype, A.ndim

((4, 4), 4, dtype('int64'), 2)

A.nnz, A.data

(4, array([1, 2, 3, 4]))

A.row

array([0, 1, 2, 3], dtype=int32)

A.col

array([1, 3, 2, 0], dtype=int32)

A.tocsr()

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 4 stored elements and shape (4, 4)>

A.toarray()

array([[0, 1, 0, 0],
       [0, 0, 0, 2],
       [0, 0, 3, 0],
       [4, 0, 0, 0]])

A.todense()

matrix([[0, 1, 0, 0],
        [0, 0, 0, 2],
        [0, 0, 3, 0],
        [4, 0, 0, 0]])

Not all sparse matrix formats support indexing:

# A[1, 2]

# A.tobsr()[1, 2]

But some do:

A.tocsr()[1, 2]

np.int64(0)

A.tolil()[1:3, 3]

<List of Lists sparse matrix of dtype 'int64'
	with 1 stored elements and shape (2, 1)>

CSR¶

A = np.array([[1, 2, 0, 0], [0, 3, 4, 0], [0, 0, 5, 6], [7, 0, 8, 9]])
A

array([[1, 2, 0, 0],
       [0, 3, 4, 0],
       [0, 0, 5, 6],
       [7, 0, 8, 9]])

A = sp.csr_matrix(A)

A.data

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

A.indices

array([0, 1, 1, 2, 2, 3, 0, 2, 3], dtype=int32)

A.indptr

array([0, 2, 4, 6, 9], dtype=int32)

i = 2

A.indptr[i], A.indptr[i + 1] - 1

(np.int32(4), np.int32(5))

A.indices[A.indptr[i] : A.indptr[i + 1]]

array([2, 3], dtype=int32)

A.data[A.indptr[i] : A.indptr[i + 1]]

array([5, 6])

Functions for constructing sparse matrices¶

N = 10

A = -2 * sp.eye(N) + sp.eye(N, k=1) + sp.eye(N, k=-1)

<DIAgonal sparse matrix of dtype 'float64'
	with 28 stored elements (3 diagonals) and shape (10, 10)>

A.todense()

matrix([[-2.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 1., -2.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  1., -2.,  1.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  1., -2.,  1.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  1., -2.,  1.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  1., -2.,  1.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  1., -2.,  1.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  1., -2.,  1.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1., -2.,  1.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1., -2.]])

fig, ax = plt.subplots()
ax.spy(A)
fig.tight_layout()
fig.savefig("ch10-sparse-matrix-1.pdf");

# dtype is pinned explicitly: with integer diagonals and no dtype, SciPy emits a
# FutureWarning and will switch the result from float64 to the input dtype.
A = sp.diags([1, -2, 1], [1, 0, -1], shape=[N, N], format="csc", dtype=float)

/tmp/ipykernel_46607/547814692.py:1: FutureWarning: Input has data type int64, but the output has been cast to float64.  In the future, the output data type will match the input. To avoid this warning, set the `dtype` parameter to `None` to have the output dtype match the input, or set it to the desired output data type.
  A = sp.diags([1, -2, 1], [1, 0, -1], shape=[N, N], format="csc")

<Compressed Sparse Column sparse matrix of dtype 'float64'
	with 28 stored elements and shape (10, 10)>

fig, ax = plt.subplots()
ax.spy(A);

# dtype is pinned explicitly: with integer diagonals and no dtype, SciPy emits a
# FutureWarning and will switch the result from float64 to the input dtype.
B = sp.diags([1, 1], [-1, 1], shape=[3, 3], dtype=float)

/tmp/ipykernel_46607/1445505839.py:1: FutureWarning: Input has data type int64, but the output has been cast to float64.  In the future, the output data type will match the input. To avoid this warning, set the `dtype` parameter to `None` to have the output dtype match the input, or set it to the desired output data type.
  B = sp.diags([1, 1], [-1, 1], shape=[3, 3])

<DIAgonal sparse matrix of dtype 'float64'
	with 4 stored elements (2 diagonals) and shape (3, 3)>

C = sp.kron(A, B, format="csr")
C

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 112 stored elements and shape (30, 30)>

fig, (ax_A, ax_B, ax_C) = plt.subplots(1, 3, figsize=(12, 4))
ax_A.spy(A)
ax_B.spy(B)
ax_C.spy(C)
fig.tight_layout()
fig.savefig("ch10-sparse-matrix-2.pdf");

Sparse linear algebra¶

N = 10

# Tridiagonal (1, -2, 1) matrix in CSC format. dtype is pinned explicitly: with
# integer diagonals and no dtype, SciPy emits a FutureWarning and will switch
# the result from float64 to the input dtype.
A = sp.diags([1, -2, 1], [1, 0, -1], shape=[N, N], format="csc", dtype=float)

/tmp/ipykernel_46607/547814692.py:1: FutureWarning: Input has data type int64, but the output has been cast to float64.  In the future, the output data type will match the input. To avoid this warning, set the `dtype` parameter to `None` to have the output dtype match the input, or set it to the desired output data type.
  A = sp.diags([1, -2, 1], [1, 0, -1], shape=[N, N], format="csc")

b = -np.ones(N)

x = sp.linalg.spsolve(A, b)

array([ 5., 9., 12., 14., 15., 15., 14., 12., 9., 5.])

np.linalg.solve(A.todense(), b)

array([ 5., 9., 12., 14., 15., 15., 14., 12., 9., 5.])

lu = sp.linalg.splu(A)

lu.L

<Compressed Sparse Column sparse array of dtype 'float64'
	with 20 stored elements and shape (10, 10)>

lu.perm_r

array([0, 1, 2, 3, 4, 5, 6, 8, 7, 9], dtype=int32)

lu.U

<Compressed Sparse Column sparse array of dtype 'float64'
	with 20 stored elements and shape (10, 10)>

def sp_permute(A, perm_r, perm_c):
    """Permute the rows and columns of a sparse matrix.

    Parameters
    ----------
    A : sparse matrix of shape (M, N)
        Matrix to permute.
    perm_r : sequence of M ints
        Row permutation; row ``i`` of the result is row ``perm_r[i]`` of ``A``.
    perm_c : sequence of N ints
        Column permutation; column ``j`` of the result is column
        ``perm_c[j]`` of ``A``.

    Returns
    -------
    Sparse matrix ``B`` of shape (M, N) with ``B[i, j] == A[perm_r[i], perm_c[j]]``.
    """
    M, N = A.shape
    # row permutation matrix (M x M): Pr[perm_r[i], i] = 1
    Pr = sp.coo_matrix((np.ones(M), (perm_r, np.arange(M))), shape=(M, M)).tocsr()
    # column permutation matrix (N x N): Pc[j, perm_c[j]] = 1
    Pc = sp.coo_matrix((np.ones(N), (np.arange(N), perm_c)), shape=(N, N)).tocsr()
    # `@` is always the matrix product, for both sparse matrices and sparse arrays
    return Pr.T @ A @ Pc.T

lu.L * lu.U - A

<Compressed Sparse Column sparse array of dtype 'float64'
	with 26 stored elements and shape (10, 10)>

sp_permute(lu.L * lu.U, lu.perm_r, lu.perm_c) - A

<Compressed Sparse Column sparse matrix of dtype 'float64'
	with 26 stored elements and shape (10, 10)>

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(12, 4))
ax1.spy(lu.L)
ax2.spy(lu.U)
ax3.spy(A)

x = lu.solve(b)

array([ 5., 9., 12., 14., 15., 15., 14., 12., 9., 5.])

# use_umfpack=True is only effective if scikit-umfpack is installed
# (in which case UMFPACK is the default solver)
x = sp.linalg.spsolve(A, b, use_umfpack=True)

array([ 5., 9., 12., 14., 15., 15., 14., 12., 9., 5.])

x, info = sp.linalg.cg(A, b)

array([ 5., 9., 12., 14., 15., 15., 14., 12., 9., 5.])

x, info = sp.linalg.bicgstab(A, b)

array([ 5., 9., 12., 14., 15., 15., 14., 12., 9., 5.])

# atol argument is a recent addition
x, info = sp.linalg.lgmres(A, b, atol=1e-5)

array([ 5., 9., 12., 14., 15., 15., 14., 12., 9., 5.])

N = 25

An example of a matrix reordering method: Reverse Cuthill-McKee¶

# dtype is pinned explicitly: with integer diagonals and no dtype, SciPy emits a
# FutureWarning and will switch the result from float64 to the input dtype.
A = sp.diags([1, -2, 1], [8, 0, -8], shape=[N, N], format="csc", dtype=float)

/tmp/ipykernel_46607/3160634348.py:1: FutureWarning: Input has data type int64, but the output has been cast to float64.  In the future, the output data type will match the input. To avoid this warning, set the `dtype` parameter to `None` to have the output dtype match the input, or set it to the desired output data type.
  A = sp.diags([1, -2, 1], [8, 0, -8], shape=[N, N], format="csc")

perm = sp.csgraph.reverse_cuthill_mckee(A)
perm

array([23, 15,  7, 22, 14,  6, 21, 13,  5, 20, 12,  4, 19, 11,  3, 18, 10,
        2, 17,  9,  1, 24, 16,  8,  0], dtype=int32)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4))
ax1.spy(A)
ax2.spy(sp_permute(A, perm, perm))

Performance comparison sparse/dense¶

# compare performance of solving Ax=b vs system size N,
# where A is the sparse matrix for the 1d poisson problem
import time


def setup(N):
    A = sp.diags([1, -2, 1], [1, 0, -1], shape=[N, N], format="csr")
    b = -np.ones(N)
    return A, A.todense(), b


# Average solve time per call, for the dense and the sparse solver, as a
# function of the system size N.
reps = 100
N_vec = np.arange(2, 300, 1)
t_sparse = np.empty(len(N_vec))
t_dense = np.empty(len(N_vec))
for idx, N in enumerate(N_vec):
    A, A_dense, b = setup(N)
    # time.perf_counter() is the monotonic, high-resolution clock meant for
    # measuring short intervals; time.time() is a wall clock that can jump.
    t = time.perf_counter()
    for r in range(reps):
        x = np.linalg.solve(A_dense, b)
    t_dense[idx] = (time.perf_counter() - t) / reps
    t = time.perf_counter()
    for r in range(reps):
        x = sp.linalg.spsolve(A, b, use_umfpack=True)
    t_sparse[idx] = (time.perf_counter() - t) / reps

fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(N_vec, t_dense * 1e3, ".-", label="dense")
ax.plot(N_vec, t_sparse * 1e3, ".-", label="sparse")
ax.set_xlabel(r"$N$", fontsize=16)
ax.set_ylabel("elapsed time (ms)", fontsize=16)
ax.legend(loc=0)
fig.tight_layout()
fig.savefig("ch10-sparse-vs-dense.pdf")

/tmp/ipykernel_46607/537225260.py:7: FutureWarning: Input has data type int64, but the output has been cast to float64.  In the future, the output data type will match the input. To avoid this warning, set the `dtype` parameter to `None` to have the output dtype match the input, or set it to the desired output data type.
  A = sp.diags([1, -2, 1], [1, 0, -1], shape=[N, N], format="csr")

Eigenvalue problems¶

N = 10

# Tridiagonal (1, -2, 1) matrix in CSC format. dtype is pinned explicitly: with
# integer diagonals and no dtype, SciPy emits a FutureWarning and will switch
# the result from float64 to the input dtype, while the eigensolvers used below
# need a floating-point matrix.
A = sp.diags([1, -2, 1], [1, 0, -1], shape=[N, N], format="csc", dtype=float)

/tmp/ipykernel_46607/547814692.py:1: FutureWarning: Input has data type int64, but the output has been cast to float64.  In the future, the output data type will match the input. To avoid this warning, set the `dtype` parameter to `None` to have the output dtype match the input, or set it to the desired output data type.
  A = sp.diags([1, -2, 1], [1, 0, -1], shape=[N, N], format="csc")

evals, evecs = sp.linalg.eigs(A, k=4, which="LM")

evals

array([-3.91898595+0.j, -3.68250707+0.j, -3.30972147+0.j, -2.83083003+0.j])

np.allclose(A.dot(evecs[:, 0]), evals[0] * evecs[:, 0])

True

evals, evecs = sp.linalg.eigsh(A, k=4, which="LM")

evals

array([-3.91898595, -3.68250707, -3.30972147, -2.83083003])

evals, evecs = sp.linalg.eigs(A, k=4, which="SR")

evals

array([-3.91898595+0.j, -3.68250707+0.j, -3.30972147+0.j, -2.83083003+0.j])

np.real(evals).argsort()

array([0, 1, 2, 3])

def sp_eigs_sorted(A, k=6, which="SR"):
    """Compute k eigenpairs of A, sorted by the real part of the eigenvalue.

    Parameters
    ----------
    A : sparse matrix or linear operator accepted by ``sp.linalg.eigs``
    k : int
        Number of eigenvalues/eigenvectors to compute.
    which : str
        Selection rule passed through to ``sp.linalg.eigs`` (e.g. "SR", "LM").

    Returns
    -------
    (evals, evecs) where ``evals`` is in ascending order of real part and
    ``evecs[:, i]`` is the eigenvector belonging to ``evals[i]``.
    """
    evals, evecs = sp.linalg.eigs(A, k=k, which=which)
    idx = np.real(evals).argsort()
    # eigenvectors are stored as columns, so the columns must be reordered
    return evals[idx], evecs[:, idx]

evals, evecs = sp_eigs_sorted(A, k=4, which="SM")

evals

array([-1.16916997+0.j, -0.69027853+0.j, -0.31749293+0.j, -0.08101405+0.j])

Random matrix example¶

N = 100

x_vec = np.linspace(0, 1, 50)

# seed sp.rand with random_state to obtain a reproducible result
M1 = sp.rand(N, N, density=0.2, random_state=112312321)
# M1 = M1 + M1.conj().T
M2 = sp.rand(N, N, density=0.2, random_state=984592134)
# M2 = M2 + M2.conj().T

evals = np.array([sp_eigs_sorted((1 - x) * M1 + x * M2, k=25)[0] for x in x_vec])

fig, ax = plt.subplots(figsize=(8, 4))

for idx in range(evals.shape[1]):
    ax.plot(x_vec, np.real(evals[:, idx]), lw=0.5)

ax.set_xlabel(r"$x$", fontsize=16)
ax.set_ylabel(r"eig.vals. of $(1-x)M_1+xM_2$", fontsize=16)

fig.tight_layout()
fig.savefig("ch10-sparse-eigs.pdf")

Graphs¶

g = nx.Graph()

g.add_node(1)

g.nodes()

NodeView((1,))

g.add_nodes_from([3, 4, 5])

g.nodes()

NodeView((1, 3, 4, 5))

g.add_edge(1, 2)

g.edges()

EdgeView([(1, 2)])

g.add_edges_from([(3, 4), (5, 6)])

g.edges()

EdgeView([(1, 2), (3, 4), (5, 6)])

g.add_weighted_edges_from([(1, 3, 1.5), (3, 5, 2.5)])

g.edges()

EdgeView([(1, 2), (1, 3), (3, 4), (3, 5), (5, 6)])

g.edges(data=True)

EdgeDataView([(1, 2, {}), (1, 3, {'weight': 1.5}), (3, 4, {}), (3, 5, {'weight': 2.5}), (5, 6, {})])

g.add_weighted_edges_from([(6, 7, 1.5)])

g.nodes()

NodeView((1, 3, 4, 5, 2, 6, 7))

g.edges()

EdgeView([(1, 2), (1, 3), (3, 4), (3, 5), (5, 6), (6, 7)])

import numpy as np

import json

# Read the metro description; the encoding is given explicitly so that decoding
# does not depend on the platform's default locale encoding.
with open("tokyo-metro.json", encoding="utf-8") as f:
    data = json.load(f)

data.keys()

dict_keys(['C', 'G', 'F', 'H', 'M', 'N', 'T', 'Y', 'Z'])

data["C"]

{'color': '#149848',
 'transfers': [['C3', 'F15'],
  ['C4', 'Z2'],
  ['C4', 'G2'],
  ['C7', 'M14'],
  ['C7', 'N6'],
  ['C7', 'G6'],
  ['C8', 'M15'],
  ['C8', 'H6'],
  ['C9', 'H7'],
  ['C9', 'Y18'],
  ['C11', 'T9'],
  ['C11', 'M18'],
  ['C11', 'Z8'],
  ['C12', 'M19'],
  ['C18', 'H21']],
 'travel_times': [['C1', 'C2', 2],
  ['C2', 'C3', 2],
  ['C3', 'C4', 1],
  ['C4', 'C5', 2],
  ['C5', 'C6', 2],
  ['C6', 'C7', 2],
  ['C7', 'C8', 1],
  ['C8', 'C9', 3],
  ['C9', 'C10', 1],
  ['C10', 'C11', 2],
  ['C11', 'C12', 2],
  ['C12', 'C13', 2],
  ['C13', 'C14', 2],
  ['C14', 'C15', 2],
  ['C15', 'C16', 2],
  ['C16', 'C17', 3],
  ['C17', 'C18', 3],
  ['C18', 'C19', 3]]}

# data

# Assemble the metro network: every line contributes its station-to-station
# links (weighted by travel time) and its transfer links (no weight).
g = nx.Graph()
for line_info in data.values():
    g.add_weighted_edges_from(line_info["travel_times"])
    g.add_edges_from(line_info["transfers"])

# Flag each edge: an edge without a "weight" attribute is a transfer.
for _, _, attrs in g.edges(data=True):
    attrs["transfer"] = "weight" not in attrs

g.number_of_nodes()

184

list(g.nodes())[:5]

['C1', 'C2', 'C3', 'C4', 'C5']

g.number_of_edges()

243

list(g.edges())[:5]

[('C1', 'C2'), ('C2', 'C3'), ('C3', 'C4'), ('C3', 'F15'), ('C4', 'C5')]

# Split the edges by kind using the boolean "transfer" attribute of each edge.
on_foot = [(u, v) for u, v, attrs in g.edges(data=True) if attrs["transfer"]]
on_train = [(u, v) for u, v, attrs in g.edges(data=True) if not attrs["transfer"]]

colors = [data[n[0].upper()]["color"] for n in g.nodes()]

# from networkx.drawing.nx_agraph import graphviz_layout

fig, ax = plt.subplots(1, 1, figsize=(14, 10))

pos = nx.drawing.nx_agraph.graphviz_layout(g, prog="neato")
nx.draw(g, pos, ax=ax, node_size=300, node_color=colors)
nx.draw_networkx_labels(g, pos=pos, ax=ax, font_size=6)
nx.draw_networkx_edges(g, pos=pos, ax=ax, edgelist=on_train, width=2)
nx.draw_networkx_edges(g, pos=pos, ax=ax, edgelist=on_foot, edge_color="blue")

# removing the default axis on all sides:
for side in ["bottom", "right", "top", "left"]:
    ax.spines[side].set_visible(False)

# removing the axis labels and ticks
ax.set_xticks([])
ax.set_yticks([])
ax.xaxis.set_ticks_position("none")
ax.yaxis.set_ticks_position("none")
fig.tight_layout()
fig.savefig("ch10-metro-graph.pdf")
fig.savefig("ch10-metro-graph.png")

g.degree()

DegreeView({'C1': 1, 'C2': 2, 'C3': 3, 'C4': 4, 'C5': 2, 'C6': 2, 'C7': 5, 'C8': 4, 'C9': 4, 'C10': 2, 'C11': 5, 'C12': 3, 'C13': 2, 'C14': 2, 'C15': 2, 'C16': 2, 'C17': 2, 'C18': 3, 'C19': 1, 'F15': 3, 'Z2': 4, 'G2': 4, 'M14': 5, 'N6': 5, 'G6': 5, 'M15': 4, 'H6': 4, 'H7': 4, 'Y18': 4, 'T9': 5, 'M18': 5, 'Z8': 5, 'M19': 3, 'H21': 2, 'G1': 3, 'G3': 2, 'G4': 3, 'G5': 6, 'G7': 2, 'G8': 2, 'G9': 4, 'G10': 2, 'G11': 3, 'G12': 3, 'G13': 2, 'G14': 2, 'G15': 3, 'G16': 3, 'G17': 2, 'G18': 2, 'G19': 1, 'Z1': 3, 'F16': 3, 'Z3': 3, 'M13': 6, 'Y16': 6, 'Z4': 6, 'N7': 6, 'M16': 4, 'H8': 4, 'T10': 3, 'Z9': 3, 'H16': 3, 'H17': 3, 'F1': 2, 'F2': 3, 'F3': 3, 'F4': 3, 'F5': 3, 'F6': 3, 'F7': 3, 'F8': 3, 'F9': 4, 'F10': 2, 'F11': 2, 'F12': 2, 'F13': 3, 'F14': 2, 'Y1': 2, 'Y2': 3, 'Y3': 3, 'Y4': 3, 'Y5': 3, 'Y6': 3, 'Y7': 3, 'Y8': 3, 'Y9': 4, 'M25': 3, 'M9': 3, 'H1': 1, 'H2': 2, 'H3': 2, 'H4': 2, 'H5': 2, 'H9': 2, 'H10': 2, 'H11': 2, 'H12': 3, 'H13': 2, 'H14': 2, 'H15': 2, 'H18': 2, 'H19': 2, 'H20': 2, 'T11': 3, 'M1': 1, 'M2': 2, 'M3': 2, 'M4': 2, 'M5': 2, 'M6': 3, 'M7': 2, 'M8': 2, 'M10': 2, 'M11': 2, 'M12': 3, 'M17': 2, 'M20': 2, 'M21': 2, 'M22': 3, 'M23': 2, 'M24': 2, 'm3': 1, 'm4': 2, 'm5': 2, 'N8': 3, 'N11': 3, 'N1': 2, 'N2': 3, 'N3': 3, 'N4': 2, 'N5': 2, 'N9': 3, 'N10': 4, 'N12': 2, 'N13': 2, 'N14': 2, 'N15': 2, 'N16': 2, 'N17': 2, 'N18': 2, 'N19': 1, 'T1': 2, 'T2': 3, 'T3': 3, 'Y14': 3, 'Y13': 4, 'T6': 4, 'T4': 2, 'T5': 2, 'T7': 3, 'T8': 2, 'T12': 2, 'T13': 2, 'T14': 2, 'T15': 2, 'T16': 2, 'T17': 2, 'T18': 2, 'T19': 2, 'T20': 2, 'T21': 2, 'T22': 2, 'T23': 2, 'T24': 1, 'Z6': 3, 'Y10': 2, 'Y11': 2, 'Y12': 2, 'Y15': 2, 'Y17': 2, 'Y19': 2, 'Y20': 2, 'Y21': 2, 'Y22': 2, 'Y23': 2, 'Y24': 1, 'Z5': 2, 'Z7': 2, 'Z10': 2, 'Z11': 2, 'Z12': 2, 'Z13': 2, 'Z14': 1})

d_max = max(d for (n, d) in g.degree())

[(n, d) for (n, d) in g.degree() if d == d_max]

[('G5', 6), ('M13', 6), ('Y16', 6), ('Z4', 6), ('N7', 6)]

p = nx.shortest_path(g, "Y24", "C19")

np.array(p)

array(['Y24', 'Y23', 'Y22', 'Y21', 'Y20', 'Y19', 'Y18', 'C9', 'C10',
       'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19'],
      dtype='<U3')

# Sum the "weight" attribute over consecutive station pairs along the path p,
# skipping edges that carry no weight.
np.sum([g[u][v]["weight"] for u, v in zip(p[:-1], p[1:]) if "weight" in g[u][v]])

np.int64(35)

h = g.copy()

# Give every transfer edge a weight of 5. The flag's value must be tested, not
# the key's presence: every edge carries a "transfer" key (True or False), so
# `"transfer" in h[n1][n2]` would also overwrite the travel times of the train
# edges.
for n1, n2 in h.edges():
    if h[n1][n2]["transfer"]:
        h[n1][n2]["weight"] = 5

p = nx.shortest_path(h, "Y24", "C19")

np.array(p)

array(['Y24', 'Y23', 'Y22', 'Y21', 'Y20', 'Y19', 'Y18', 'C9', 'C10',
       'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19'],
      dtype='<U3')

np.sum([h[p[n]][p[n + 1]]["weight"] for n in range(len(p) - 1)])

np.int64(85)

p = nx.shortest_path(h, "Z1", "H16")

np.sum([h[p[n]][p[n + 1]]["weight"] for n in range(len(p) - 1)])

np.int64(65)

# A = nx.to_scipy_sparse_matrix(g)

# A = nx.to_scipy_sparse_array(g)

<Compressed Sparse Column sparse matrix of dtype 'float64'
	with 28 stored elements and shape (10, 10)>

# Adjacency matrix of the metro graph g. Without this assignment A would still
# be the small tridiagonal matrix left over from an earlier section, and the
# figure would not show the graph at all.
A = sp.csr_matrix(nx.to_scipy_sparse_array(g))

# Reverse Cuthill-McKee ordering of the graph's adjacency matrix
perm = sp.csgraph.reverse_cuthill_mckee(A)

# Sparsity pattern before (left) and after (right) the reordering
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4))
ax1.spy(A, markersize=2)
ax2.spy(sp_permute(A, perm, perm), markersize=2)
fig.tight_layout()
fig.savefig("ch12-rcm-graph.pdf")

References¶

Johansson, R. (2024). Numerical Python: Scientific Computing and Data Science Applications with Numpy, SciPy and Matplotlib. Apress. 10.1007/979-8-8688-0413-7