Skip to article content

Pre-pre-school

Back to Article
Chapter 18: Code listing
Download Notebook

Chapter 18: Code listing

Imports

import numpy as np

np.random.seed(0)
import pandas as pd
import csv
import json
import h5py
import tables
import pickle
# import cPickle
import msgpack

CSV

%%writefile playerstats-2013-2014.csv
# 2013-2014 / Regular Season / All Skaters / Summary / Points
Rank,Player,Team,Pos,GP,G,A,P,+/-,PIM,PPG,PPP,SHG,SHP,GW,OT,S,S%,TOI/GP,Shift/GP,FO%
1,Sidney Crosby,PIT,C,80,36,68,104,+18,46,11,38,0,0,5,1,259,13.9,21:58,24.0,52.5
2,Ryan Getzlaf,ANA,C,77,31,56,87,+28,31,5,23,0,0,7,1,204,15.2,21:17,25.2,49.0
3,Claude Giroux,PHI,C,82,28,58,86,+7,46,7,37,0,0,7,1,223,12.6,20:26,25.1,52.9
4,Tyler Seguin,DAL,C,80,37,47,84,+16,18,11,25,0,0,8,0,294,12.6,19:20,23.4,41.5
5,Corey Perry,ANA,R,81,43,39,82,+32,65,8,18,0,0,9,1,280,15.4,19:28,23.2,36.0
Overwriting playerstats-2013-2014.csv
%%writefile playerstats-2013-2014-top30.csv
# 2013-2014 / Regular Season / All Skaters / Summary / Points
Rank,Player,Team,Pos,GP,G,A,P,+/-,PIM,PPG,PPP,SHG,SHP,GW,OT,S,S%,TOI/GP,Shift/GP,FO%
1,Sidney Crosby,PIT,C,80,36,68,104,+18,46,11,38,0,0,5,1,259,13.9,21:58,24.0,52.5
2,Ryan Getzlaf,ANA,C,77,31,56,87,+28,31,5,23,0,0,7,1,204,15.2,21:17,25.2,49.0
3,Claude Giroux,PHI,C,82,28,58,86,+7,46,7,37,0,0,7,1,223,12.6,20:26,25.1,52.9
4,Tyler Seguin,DAL,C,80,37,47,84,+16,18,11,25,0,0,8,0,294,12.6,19:20,23.4,41.5
5,Corey Perry,ANA,R,81,43,39,82,+32,65,8,18,0,0,9,1,280,15.4,19:28,23.2,36.0
6,Phil Kessel,TOR,R,82,37,43,80,-5,27,8,20,0,0,6,0,305,12.1,20:39,24.5,14.3
7,Taylor Hall,EDM,L,75,27,53,80,-15,44,7,17,0,1,1,1,250,10.8,20:00,25.4,45.7
8,Alex Ovechkin,WSH,L,78,51,28,79,-35,48,24,39,0,1,10,3,386,13.2,20:32,21.8,66.7
9,Joe Pavelski,SJS,C,82,41,38,79,+23,32,16,31,1,2,3,0,225,18.2,19:51,27.1,56.0
10,Jamie Benn,DAL,L,81,34,45,79,+21,64,5,19,1,3,3,1,279,12.2,19:09,25.0,52.8
11,Nicklas Backstrom,WSH,C,82,18,61,79,-20,54,6,44,1,1,1,0,196,9.2,19:48,23.3,50.4
12,Patrick Sharp,CHI,L,82,34,44,78,+13,40,10,25,0,0,3,1,313,10.9,18:53,22.7,54.6
13,Joe Thornton,SJS,C,82,11,65,76,+20,32,2,19,0,1,3,1,122,9.0,18:55,26.3,56.1
14,Erik Karlsson,OTT,D,82,20,54,74,-15,36,5,31,0,0,1,0,257,7.8,27:04,28.6,0.0
15,Evgeni Malkin,PIT,C,60,23,49,72,+10,62,7,30,0,0,3,0,191,12.0,20:03,21.4,48.8
16,Patrick Marleau,SJS,L,82,33,37,70,+0,18,11,23,2,2,4,0,285,11.6,20:31,27.3,52.9
17,Anze Kopitar,LAK,C,82,29,41,70,+34,24,10,23,0,0,9,2,200,14.5,20:53,25.4,53.3
18,Matt Duchene,COL,C,71,23,47,70,+8,19,5,17,0,0,6,1,217,10.6,18:29,22.0,50.3
19,Martin St. Louis,"TBL, NYR",R,81,30,39,69,+13,10,9,21,1,2,5,1,204,14.7,20:56,25.7,40.7
20,Patrick Kane,CHI,R,69,29,40,69,+7,22,10,25,0,0,6,0,227,12.8,19:36,22.9,50.0
21,Blake Wheeler,WPG,R,82,28,41,69,+4,63,8,19,0,0,4,2,225,12.4,18:41,24.0,37.5
22,Kyle Okposo,NYI,R,71,27,42,69,-9,51,5,15,0,0,4,1,195,13.8,20:26,22.2,47.5
23,David Krejci,BOS,C,80,19,50,69,+39,28,3,19,0,0,6,1,169,11.2,19:07,21.3,51.2
24,Chris Kunitz,PIT,L,78,35,33,68,+25,66,13,22,0,0,8,0,218,16.1,19:09,22.2,75.0
25,Jonathan Toews,CHI,C,76,28,40,68,+26,34,5,15,3,5,5,0,193,14.5,20:28,25.9,57.2
26,Thomas Vanek,"BUF, NYI, MTL",L,78,27,41,68,+7,46,8,18,0,0,4,0,248,10.9,19:21,21.6,43.5
27,Jaromir Jagr,NJD,R,82,24,43,67,+16,46,5,17,0,0,6,1,231,10.4,19:09,22.8,0.0
28,John Tavares,NYI,C,59,24,42,66,-6,40,8,25,0,0,4,0,188,12.8,21:14,22.3,49.1
29,Jason Spezza,OTT,C,75,23,43,66,-26,46,9,22,0,0,5,0,223,10.3,18:12,23.8,54.0
30,Jordan Eberle,EDM,R,80,28,37,65,-11,18,7,20,1,1,4,1,200,14.0,19:32,25.4,38.1
Overwriting playerstats-2013-2014-top30.csv
!head -n 5 playerstats-2013-2014-top30.csv
# 2013-2014 / Regular Season / All Skaters / Summary / Points
Rank,Player,Team,Pos,GP,G,A,P,+/-,PIM,PPG,PPP,SHG,SHP,GW,OT,S,S%,TOI/GP,Shift/GP,FO%
1,Sidney Crosby,PIT,C,80,36,68,104,+18,46,11,38,0,0,5,1,259,13.9,21:58,24.0,52.5
2,Ryan Getzlaf,ANA,C,77,31,56,87,+28,31,5,23,0,0,7,1,204,15.2,21:17,25.2,49.0
3,Claude Giroux,PHI,C,82,28,58,86,+7,46,7,37,0,0,7,1,223,12.6,20:26,25.1,52.9
rows = []
with open("playerstats-2013-2014.csv") as f:
    csvreader = csv.reader(f)
    rows = [fields for fields in csvreader]
rows[1][1:6]
['Player', 'Team', 'Pos', 'GP', 'G']
rows[2][1:6]
['Sidney Crosby', 'PIT', 'C', '80', '36']
data = np.random.randn(100, 3)
np.savetxt(
    "data.csv",
    data,
    delimiter=",",
    header="x, y, z",
    comments="# Random x, y, z coordinates\n",
)
!head -n 5 data.csv
# Random x, y, z coordinates
x, y, z
1.764052345967664026e+00,4.001572083672232938e-01,9.787379841057392005e-01
2.240893199201457797e+00,1.867557990149967484e+00,-9.772778798764110153e-01
9.500884175255893682e-01,-1.513572082976978872e-01,-1.032188517935578448e-01
data_load = np.loadtxt("data.csv", skiprows=2, delimiter=",")
data_load[1, :]
array([ 2.2408932 , 1.86755799, -0.97727788])
data_load.dtype
dtype('float64')
(data == data_load).all()
np.True_
data = np.loadtxt("playerstats-2013-2014.csv", skiprows=2, delimiter=",", dtype=bytes)
data[0][1:6]
array([b'Sidney Crosby', b'PIT', b'C', b'80', b'36'], dtype='|S13')
np.loadtxt("playerstats-2013-2014.csv", skiprows=2, delimiter=",", usecols=[6, 7, 8])
array([[ 68., 104., 18.], [ 56., 87., 28.], [ 58., 86., 7.], [ 47., 84., 16.], [ 39., 82., 32.]])
df = pd.read_csv("playerstats-2013-2014.csv", skiprows=1)
df = df.set_index("Rank")
df[["Player", "GP", "G", "A", "P"]]
Loading...
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, 1 to 5
Data columns (total 20 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Player    5 non-null      object 
 1   Team      5 non-null      object 
 2   Pos       5 non-null      object 
 3   GP        5 non-null      int64  
 4   G         5 non-null      int64  
 5   A         5 non-null      int64  
 6   P         5 non-null      int64  
 7   +/-       5 non-null      int64  
 8   PIM       5 non-null      int64  
 9   PPG       5 non-null      int64  
 10  PPP       5 non-null      int64  
 11  SHG       5 non-null      int64  
 12  SHP       5 non-null      int64  
 13  GW        5 non-null      int64  
 14  OT        5 non-null      int64  
 15  S         5 non-null      int64  
 16  S%        5 non-null      float64
 17  TOI/GP    5 non-null      object 
 18  Shift/GP  5 non-null      float64
 19  FO%       5 non-null      float64
dtypes: float64(3), int64(13), object(4)
memory usage: 840.0+ bytes
df[["Player", "GP", "G", "A", "P"]].to_csv("playerstats-2013-2014-subset.csv")
!head -n 5 playerstats-2013-2014-subset.csv
Rank,Player,GP,G,A,P
1,Sidney Crosby,80,36,68,104
2,Ryan Getzlaf,77,31,56,87
3,Claude Giroux,82,28,58,86
4,Tyler Seguin,80,37,47,84

HDF5

h5py

import h5py
# mode = "w", "r", "w-", "r+", "a"
f = h5py.File("data.h5", "w")
f.mode
'r+'
f.flush()
f.close()
f = h5py.File("data.h5", "w")
f.name
'/'
grp1 = f.create_group("experiment1")
grp1.name
'/experiment1'
grp2_meas = f.create_group("experiment2/measurement")
grp2_meas.name
'/experiment2/measurement'
grp2_sim = f.create_group("experiment2/simulation")
grp2_sim.name
'/experiment2/simulation'
f["/experiment1"]
<HDF5 group "/experiment1" (0 members)>
f["/experiment2/simulation"]
<HDF5 group "/experiment2/simulation" (0 members)>
grp_expr2 = f["/experiment2"]
grp_expr2["simulation"]
<HDF5 group "/experiment2/simulation" (0 members)>
list(f.keys())
['experiment1', 'experiment2']
list(f.items())
[('experiment1', <HDF5 group "/experiment1" (0 members)>), ('experiment2', <HDF5 group "/experiment2" (2 members)>)]
f.visit(lambda x: print(x))
experiment1
experiment2
experiment2/measurement
experiment2/simulation
f.visititems(lambda name, value: print(name, value))
experiment1 <HDF5 group "/experiment1" (0 members)>
experiment2 <HDF5 group "/experiment2" (2 members)>
experiment2/measurement <HDF5 group "/experiment2/measurement" (0 members)>
experiment2/simulation <HDF5 group "/experiment2/simulation" (0 members)>
"experiment1" in f
True
"simulation" in f["experiment2"]
True
"experiment3" in f
False
f.flush()
!ls -l *.h5
-rw-r--r-- 1 carlosal1015 carlosal1015    4272 Jan 23 15:18 data.h5
-rw-r--r-- 1 carlosal1015 carlosal1015   70847 Jan 23 15:14 playerstats-2013-2014.h5
-rw-r--r-- 1 carlosal1015 carlosal1015 2119192 Jan 23 15:14 store.h5
!h5ls -r data.h5
data.h5: unable to open file
data1 = np.arange(10)
data2 = np.random.randn(100, 100)
f["array1"] = data1
f["/experiment2/measurement/meas1"] = data2
f.visititems(lambda name, value: print(name, value))
array1 <HDF5 dataset "array1": shape (10,), type "<i8">
experiment1 <HDF5 group "/experiment1" (0 members)>
experiment2 <HDF5 group "/experiment2" (2 members)>
experiment2/measurement <HDF5 group "/experiment2/measurement" (1 members)>
experiment2/measurement/meas1 <HDF5 dataset "meas1": shape (100, 100), type "<f8">
experiment2/simulation <HDF5 group "/experiment2/simulation" (0 members)>
ds = f["array1"]
ds
<HDF5 dataset "array1": shape (10,), type "<i8">
ds.name
'/array1'
ds.dtype
dtype('int64')
ds.shape
(10,)
ds.len()
10
# help(ds)
ds
<HDF5 dataset "array1": shape (10,), type "<i8">
h5py.__version__
'3.15.1'
# ds.value
np.array(ds)
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
ds = f["/experiment2/measurement/meas1"]
ds
<HDF5 dataset "meas1": shape (100, 100), type "<f8">
ds.dtype
dtype('<f8')
ds.shape
(100, 100)
data_full = ds[...]
type(data_full)
numpy.ndarray
data_full.shape
(100, 100)
data_col = ds[:, 0]
data_col.shape
(100,)
type(data_col)
numpy.ndarray
ds[10:20:3, 10:20:3]
array([[ 0.60270766, -0.34804638, -0.813596 , -1.29737966], [ 0.91320192, -1.06343294, 0.22734595, 0.52759738], [ 1.25774422, -0.32775492, 1.4849256 , 0.28005786], [-0.84907287, -0.30000358, 1.79691852, -0.19871506]])
ds[[1, 2, 3], :].shape
(3, 100)
ds[[1, 2, 3], :].shape
(3, 100)
mask = ds[:, 0] > 2.0
mask.shape, mask.dtype
((100,), dtype('bool'))
ds[mask, 0]
array([2.04253623, 2.1041854 , 2.05689385])
ds[mask, :5]
array([[ 2.04253623, -0.91946118, 0.11467003, -0.1374237 , 1.36552692], [ 2.1041854 , 0.22725706, -1.1291663 , -0.28133197, -0.7394167 ], [ 2.05689385, 0.18041971, -0.06670925, -0.02835398, 0.48480475]])
# create empty data sets, assign and update datasets
ds = f.create_dataset("array2", data=np.random.randint(10, size=10))
ds
<HDF5 dataset "array2": shape (10,), type "<i8">
# ds.value
np.array(ds)
array([0, 2, 2, 4, 7, 3, 7, 2, 4, 1])
ds = f.create_dataset("/experiment2/simulation/data1", shape=(5, 5), fillvalue=-1)
ds
<HDF5 dataset "data1": shape (5, 5), type "<f4">
# ds.value
np.array(ds)
array([[-1., -1., -1., -1., -1.], [-1., -1., -1., -1., -1.], [-1., -1., -1., -1., -1.], [-1., -1., -1., -1., -1.], [-1., -1., -1., -1., -1.]], dtype=float32)
ds = f.create_dataset(
    "/experiment1/simulation/data1",
    shape=(5000, 5000, 5000),
    fillvalue=0,
    compression="gzip",
)
ds
<HDF5 dataset "data1": shape (5000, 5000, 5000), type "<f4">
ds[:, 0, 0] = np.random.rand(5000)
ds[1, :, 0] += np.random.rand(5000)
ds[:2, :5, 0]
array([[0.6939344 , 0. , 0. , 0. , 0. ], [1.4819994 , 0.01639538, 0.54387355, 0.11130908, 0.9928771 ]], dtype=float32)
ds.fillvalue
np.float32(0.0)
f["experiment1"].visititems(lambda name, value: print(name, value))
simulation <HDF5 group "/experiment1/simulation" (1 members)>
simulation/data1 <HDF5 dataset "data1": shape (5000, 5000, 5000), type "<f4">
float(np.prod(ds.shape) * ds[0, 0, 0].nbytes) / (1024**3)  # Gb
465.66128730773926
f.flush()
f.filename
'data.h5'
!ls -lh data.h5
-rw-r--r-- 1 carlosal1015 carlosal1015 358K Jan 23 15:18 data.h5
del f["/experiment1/simulation/data1"]
f["experiment1"].visititems(lambda name, value: print(name, value))
simulation <HDF5 group "/experiment1/simulation" (0 members)>
f.close()
# attributes
f = h5py.File("data.h5", mode="a")  # ADDED mode="a"
f.attrs
<Attributes of HDF5 object at 136578732379808>
f.attrs["desc"] = "Result sets from experiments and simulations"
f["experiment1"].attrs["date"] = "2015-1-1"
f["experiment2"].attrs["date"] = "2015-1-2"
f["experiment2/simulation/data1"].attrs["k"] = 1.5
f["experiment2/simulation/data1"].attrs["T"] = 1000
list(f["experiment1"].attrs.keys())
['date']
list(f["experiment2/simulation/data1"].attrs.items())
[('T', np.int64(1000)), ('k', np.float64(1.5))]
"T" in f["experiment2/simulation/data1"].attrs
True
del f["experiment2/simulation/data1"].attrs["T"]
"T" in f["experiment2/simulation/data1"].attrs
False
f["experiment2/simulation/data1"].attrs["t"] = np.array([1, 2, 3])
f["experiment2/simulation/data1"].attrs["t"]
array([1, 2, 3])
f.close()

pytables

tables.__version__
'3.10.2'
df = pd.read_csv("playerstats-2013-2014-top30.csv", skiprows=1)
df = df.set_index("Rank")
df[["Player", "Pos", "GP", "P", "G", "A", "S%", "Shift/GP"]].head(5)
Loading...
f = tables.open_file("playerstats-2013-2014.h5", mode="w")
grp = f.create_group(
    "/", "season_2013_2014", title="NHL player statistics for the 2013/2014 season"
)
grp
/season_2013_2014 (Group) np.str_('NHL player statistics for the 2013/2014 season') children := []
f.root
/ (RootGroup) np.str_('') children := ['season_2013_2014' (Group)]
class PlayerStat(tables.IsDescription):
    # PyTables row schema for one NHL skater's season summary.
    # Column order in the HDF5 file is determined by PyTables (alphabetical
    # by name here, since no pos= is given), not by declaration order.
    player = tables.StringCol(20, dflt="")            # player name, max 20 bytes (longer names are truncated)
    position = tables.StringCol(1, dflt="C")          # position code: C/L/R/D
    games_played = tables.UInt8Col(dflt=0)            # GP; UInt8 suffices for an 82-game season
    points = tables.UInt16Col(dflt=0)                 # P = goals + assists
    goals = tables.UInt16Col(dflt=0)                  # G
    assists = tables.UInt16Col(dflt=0)                # A
    shooting_percentage = tables.Float64Col(dflt=0.0) # S% column from the CSV
    shifts_per_game_played = tables.Float64Col(dflt=0.0)  # Shift/GP column from the CSV
top30_table = f.create_table(grp, "top30", PlayerStat, "Top 30 point leaders")
playerstat = top30_table.row
type(playerstat)
tables.tableextension.Row
# Copy each DataFrame row into the PyTables table via the shared Row
# accessor: assign the mapped fields, then append() commits that row and
# resets the accessor for the next iteration.
for index, row_series in df.iterrows():
    playerstat["player"] = row_series["Player"]
    playerstat["position"] = row_series["Pos"]
    playerstat["games_played"] = row_series["GP"]
    playerstat["points"] = row_series["P"]
    playerstat["goals"] = row_series["G"]
    playerstat["assists"] = row_series["A"]
    playerstat["shooting_percentage"] = row_series["S%"]
    playerstat["shifts_per_game_played"] = row_series["Shift/GP"]
    playerstat.append()
# Flush buffered rows to the HDF5 file so they are visible to readers.
top30_table.flush()
top30_table.cols.player[:5]
array([b'Sidney Crosby', b'Ryan Getzlaf', b'Claude Giroux', b'Tyler Seguin', b'Corey Perry'], dtype='|S20')
top30_table.cols.points[:5]
array([104, 87, 86, 84, 82], dtype=uint16)
def print_playerstat(row):
    """Print one player record as a tab-separated line.

    ``row`` is a mapping (e.g. a PyTables Row) with a bytes ``player``
    field and numeric ``points``, ``goals`` and ``assists`` fields; the
    name is decoded from UTF-8 and right-aligned in a 20-column field.
    """
    name = row["player"].decode("UTF-8")
    stats = (row["points"], row["goals"], row["assists"])
    line = "%20s\t%s\t%s\t%s" % ((name,) + stats)
    print(line)
for row in top30_table.iterrows():
    print_playerstat(row)
       Sidney Crosby	104	36	68
        Ryan Getzlaf	87	31	56
       Claude Giroux	86	28	58
        Tyler Seguin	84	37	47
         Corey Perry	82	43	39
         Phil Kessel	80	37	43
         Taylor Hall	80	27	53
       Alex Ovechkin	79	51	28
        Joe Pavelski	79	41	38
          Jamie Benn	79	34	45
   Nicklas Backstrom	79	18	61
       Patrick Sharp	78	34	44
        Joe Thornton	76	11	65
       Erik Karlsson	74	20	54
       Evgeni Malkin	72	23	49
     Patrick Marleau	70	33	37
        Anze Kopitar	70	29	41
        Matt Duchene	70	23	47
    Martin St. Louis	69	30	39
        Patrick Kane	69	29	40
       Blake Wheeler	69	28	41
         Kyle Okposo	69	27	42
        David Krejci	69	19	50
        Chris Kunitz	68	35	33
      Jonathan Toews	68	28	40
        Thomas Vanek	68	27	41
        Jaromir Jagr	67	24	43
        John Tavares	66	24	42
        Jason Spezza	66	23	43
       Jordan Eberle	65	28	37
for row in top30_table.where("(points > 75) & (points <= 80)"):
    print_playerstat(row)
         Phil Kessel	80	37	43
         Taylor Hall	80	27	53
       Alex Ovechkin	79	51	28
        Joe Pavelski	79	41	38
          Jamie Benn	79	34	45
   Nicklas Backstrom	79	18	61
       Patrick Sharp	78	34	44
        Joe Thornton	76	11	65
for row in top30_table.where("(goals > 40) & (points < 80)"):
    print_playerstat(row)
       Alex Ovechkin	79	51	28
        Joe Pavelski	79	41	38
f
File(filename=playerstats-2013-2014.h5, title=np.str_(''), mode='w', root_uep='/', filters=Filters(complevel=0, shuffle=False, bitshuffle=False, fletcher32=False, least_significant_digit=None)) / (RootGroup) np.str_('') /season_2013_2014 (Group) np.str_('NHL player statistics for the 2013/2014 season') /season_2013_2014/top30 (Table(np.int64(30),)) np.str_('Top 30 point leaders') description := { "assists": UInt16Col(shape=(), dflt=np.uint16(0), pos=0), "games_played": UInt8Col(shape=(), dflt=np.uint8(0), pos=1), "goals": UInt16Col(shape=(), dflt=np.uint16(0), pos=2), "player": StringCol(itemsize=20, shape=(), dflt=np.bytes_(b''), pos=3), "points": UInt16Col(shape=(), dflt=np.uint16(0), pos=4), "position": StringCol(itemsize=1, shape=(), dflt=np.bytes_(b'C'), pos=5), "shifts_per_game_played": Float64Col(shape=(), dflt=np.float64(0.0), pos=6), "shooting_percentage": Float64Col(shape=(), dflt=np.float64(0.0), pos=7)} byteorder := 'little' chunkshape := (np.int64(1489),)
f.flush()
f.close()
!h5ls -rv playerstats-2013-2014.h5
Opened "playerstats-2013-2014.h5" with sec2 driver.
/                        Group
    Attribute: CLASS scalar
        Type:      5-byte null-terminated UTF-8 string
    Attribute: PYTABLES_FORMAT_VERSION scalar
        Type:      3-byte null-terminated UTF-8 string
    Attribute: TITLE null
        Type:      1-byte null-terminated UTF-8 string
    Attribute: VERSION scalar
        Type:      3-byte null-terminated UTF-8 string
    Location:  1:48
    Links:     1
    Modified:  2026-01-23 15:18:57 -05
/season_2013_2014        Group
    Attribute: CLASS scalar
        Type:      5-byte null-terminated UTF-8 string
    Attribute: TITLE scalar
        Type:      46-byte null-terminated UTF-8 string
    Attribute: VERSION scalar
        Type:      3-byte null-terminated UTF-8 string
    Location:  1:280
    Links:     1
    Modified:  2026-01-23 15:18:57 -05
/season_2013_2014/top30  Dataset {30/Inf}
    Attribute: CLASS scalar
        Type:      5-byte null-terminated UTF-8 string
    Attribute: FIELD_0_FILL scalar
        Type:      native unsigned short
    Attribute: FIELD_0_NAME scalar
        Type:      7-byte null-terminated UTF-8 string
    Attribute: FIELD_1_FILL scalar
        Type:      native unsigned char
    Attribute: FIELD_1_NAME scalar
        Type:      12-byte null-terminated UTF-8 string
    Attribute: FIELD_2_FILL scalar
        Type:      native unsigned short
    Attribute: FIELD_2_NAME scalar
        Type:      5-byte null-terminated UTF-8 string
    Attribute: FIELD_3_FILL scalar
        Type:      1-byte null-terminated ASCII string
    Attribute: FIELD_3_NAME scalar
        Type:      6-byte null-terminated UTF-8 string
    Attribute: FIELD_4_FILL scalar
        Type:      native unsigned short
    Attribute: FIELD_4_NAME scalar
        Type:      6-byte null-terminated UTF-8 string
    Attribute: FIELD_5_FILL scalar
        Type:      1-byte null-terminated ASCII string
    Attribute: FIELD_5_NAME scalar
        Type:      8-byte null-terminated UTF-8 string
    Attribute: FIELD_6_FILL scalar
        Type:      native double
    Attribute: FIELD_6_NAME scalar
        Type:      22-byte null-terminated UTF-8 string
    Attribute: FIELD_7_FILL scalar
        Type:      native double
    Attribute: FIELD_7_NAME scalar
        Type:      19-byte null-terminated UTF-8 string
    Attribute: NROWS scalar
        Type:      native long
    Attribute: TITLE scalar
        Type:      20-byte null-terminated UTF-8 string
    Attribute: VERSION scalar
        Type:      3-byte null-terminated UTF-8 string
    Location:  1:609
    Links:     1
    Modified:  2026-01-23 15:18:57 -05
    Chunks:    {1489} 65516 bytes
    Storage:   1320 logical bytes, 65516 allocated bytes, 2.01% utilization
    Type:      struct {
                   "assists"          +0    native unsigned short
                   "games_played"     +2    native unsigned char
                   "goals"            +3    native unsigned short
                   "player"           +5    20-byte null-terminated ASCII string
                   "points"           +25   native unsigned short
                   "position"         +27   1-byte null-terminated ASCII string
                   "shifts_per_game_played" +28   native double
                   "shooting_percentage" +36   native double
               } 44 bytes

Pandas hdfstore

import pandas as pd
store = pd.HDFStore("store.h5")
df = pd.DataFrame(np.random.rand(5, 5))
store["df1"] = df
df = pd.read_csv("playerstats-2013-2014-top30.csv", skiprows=1)
store["df2"] = df
store.keys()
['/df1', '/df2']
"df2" in store
True
df = store["df1"]
store.root
/ (RootGroup) np.str_('') children := ['df1' (Group), 'df2' (Group)]
store.close()
f = h5py.File("store.h5")
f.visititems(lambda x, y: print(x, "\t" * int(3 - len(str(x)) // 8), y))
df1 			 <HDF5 group "/df1" (4 members)>
df1/axis0 		 <HDF5 dataset "axis0": shape (5,), type "<i8">
df1/axis1 		 <HDF5 dataset "axis1": shape (5,), type "<i8">
df1/block0_items 	 <HDF5 dataset "block0_items": shape (5,), type "<i8">
df1/block0_values 	 <HDF5 dataset "block0_values": shape (5, 5), type "<f8">
df2 			 <HDF5 group "/df2" (8 members)>
df2/axis0 		 <HDF5 dataset "axis0": shape (21,), type "|S8">
df2/axis1 		 <HDF5 dataset "axis1": shape (30,), type "<i8">
df2/block0_items 	 <HDF5 dataset "block0_items": shape (3,), type "|S8">
df2/block0_values 	 <HDF5 dataset "block0_values": shape (30, 3), type "<f8">
df2/block1_items 	 <HDF5 dataset "block1_items": shape (14,), type "|S4">
df2/block1_values 	 <HDF5 dataset "block1_values": shape (30, 14), type "<i8">
df2/block2_items 	 <HDF5 dataset "block2_items": shape (4,), type "|S6">
df2/block2_values 	 <HDF5 dataset "block2_values": shape (1,), type "|O">
f["/df2/block0_items"][:]
array([b'S%', b'Shift/GP', b'FO%'], dtype='|S8')
f["/df2/block0_values"][:3]
array([[13.9, 24. , 52.5], [15.2, 25.2, 49. ], [12.6, 25.1, 52.9]])
np.array(f["/df2/block1_items"])
array([b'Rank', b'GP', b'G', b'A', b'P', b'+/-', b'PIM', b'PPG', b'PPP', b'SHG', b'SHP', b'GW', b'OT', b'S'], dtype='|S4')
f["/df2/block1_values"][:3, :5]
array([[ 1, 80, 36, 68, 104], [ 2, 77, 31, 56, 87], [ 3, 82, 28, 58, 86]])

Parquet

df = pd.read_csv(
    "temperature_outdoor_2014.tsv", delimiter="\t", names=["time", "temperature"]
)
df.time = (
    pd.to_datetime(df.time.values, unit="s")
    .tz_localize("UTC")
    .tz_convert("Europe/Stockholm")
)
df["dt"] = df.time.dt.strftime("%Y-%m-%d")
df.head()
Loading...
df.to_parquet("temperature_outdoor_2014.parquet", index=None, partition_cols=["dt"])
!ls temperature_outdoor_2014.parquet | head
dt=2014-01-01
dt=2014-01-02
dt=2014-01-03
dt=2014-01-04
dt=2014-01-05
dt=2014-01-06
dt=2014-01-07
dt=2014-01-08
dt=2014-01-09
dt=2014-01-10
df.to_parquet("temperature_outdoor_2014_no_partitions.parquet", index=None)
!file temperature_outdoor_2014_no_partitions.parquet | head
temperature_outdoor_2014_no_partitions.parquet: Apache Parquet
!ls temperature_outdoor_2014.parquet/dt=2014-01-01 | head
d7c32bdc528844a5a8c45dc24659b6e5-0.parquet
df_20140401 = pd.read_parquet("temperature_outdoor_2014.parquet/dt=2014-04-01")
df_20140401.head()
Loading...
import pyarrow.parquet as pq
table = pq.read_table(
    "temperature_outdoor_2014.parquet", columns=["time", "temperature"]
)
type(table)
pyarrow.lib.Table
# help(table)
df2 = table.to_pandas()
df2.head()
Loading...

JSON

data = ["string", 1.0, 2, None]
data_json = json.dumps(data)
data_json
'["string", 1.0, 2, null]'
data2 = json.loads(data_json)
data
['string', 1.0, 2, None]
data[0]
'string'
data = {"one": 1, "two": 2.0, "three": "three"}
data_json = json.dumps(data)
print(data_json)
{"one": 1, "two": 2.0, "three": "three"}
data = json.loads(data_json)
data["two"]
2.0
data["three"]
'three'
data = {"one": [1], "two": [1, 2], "three": [1, 2, 3]}
data_json = json.dumps(data, indent=True)
print(data_json)
{
 "one": [
  1
 ],
 "two": [
  1,
  2
 ],
 "three": [
  1,
  2,
  3
 ]
}
data = {
    "one": [1],
    "two": {"one": 1, "two": 2},
    "three": [(1,), (1, 2), (1, 2, 3)],
    "four": "a text string",
}
with open("data.json", "w") as f:
    json.dump(data, f)
!cat data.json
{"one": [1], "two": {"one": 1, "two": 2}, "three": [[1], [1, 2], [1, 2, 3]], "four": "a text string"}
with open("data.json", "r") as f:
    data_from_file = json.load(f)
data_from_file["two"]
{'one': 1, 'two': 2}
data_from_file["three"]
[[1], [1, 2], [1, 2, 3]]
!head -n 20 tokyo-metro.json
{
    "C": {
        "color": "#149848",
        "transfers": [
            [
                "C3",
                "F15"
            ],
            [
                "C4",
                "Z2"
            ],
            [
                "C4",
                "G2"
            ],
            [
                "C7",
                "M14"
            ],
!wc tokyo-metro.json
 1471  1508 26839 tokyo-metro.json
with open("tokyo-metro.json", "r") as f:
    data = json.load(f)
data.keys()
dict_keys(['C', 'G', 'F', 'H', 'M', 'N', 'T', 'Y', 'Z'])
data["C"].keys()
dict_keys(['color', 'transfers', 'travel_times'])
data["C"]["color"]
'#149848'
data["C"]["transfers"]
[['C3', 'F15'], ['C4', 'Z2'], ['C4', 'G2'], ['C7', 'M14'], ['C7', 'N6'], ['C7', 'G6'], ['C8', 'M15'], ['C8', 'H6'], ['C9', 'H7'], ['C9', 'Y18'], ['C11', 'T9'], ['C11', 'M18'], ['C11', 'Z8'], ['C12', 'M19'], ['C18', 'H21']]
[(s, e, tt) for s, e, tt in data["C"]["travel_times"] if tt == 1]
[('C3', 'C4', 1), ('C7', 'C8', 1), ('C9', 'C10', 1)]
data
{'C': {'color': '#149848', 'transfers': [['C3', 'F15'], ['C4', 'Z2'], ['C4', 'G2'], ['C7', 'M14'], ['C7', 'N6'], ['C7', 'G6'], ['C8', 'M15'], ['C8', 'H6'], ['C9', 'H7'], ['C9', 'Y18'], ['C11', 'T9'], ['C11', 'M18'], ['C11', 'Z8'], ['C12', 'M19'], ['C18', 'H21']], 'travel_times': [['C1', 'C2', 2], ['C2', 'C3', 2], ['C3', 'C4', 1], ['C4', 'C5', 2], ['C5', 'C6', 2], ['C6', 'C7', 2], ['C7', 'C8', 1], ['C8', 'C9', 3], ['C9', 'C10', 1], ['C10', 'C11', 2], ['C11', 'C12', 2], ['C12', 'C13', 2], ['C13', 'C14', 2], ['C14', 'C15', 2], ['C15', 'C16', 2], ['C16', 'C17', 3], ['C17', 'C18', 3], ['C18', 'C19', 3]]}, 'G': {'color': '#f59230', 'transfers': [['G1', 'Z1'], ['G1', 'F16'], ['G2', 'Z2'], ['G2', 'C4'], ['G4', 'Z3'], ['G5', 'M13'], ['G5', 'Y16'], ['G5', 'Z4'], ['G5', 'N7'], ['G6', 'N6'], ['G6', 'M14'], ['G6', 'C7'], ['G9', 'M16'], ['G9', 'H8'], ['G11', 'T10'], ['G12', 'Z9'], ['G15', 'H16'], ['G16', 'H17']], 'travel_times': [['G1', 'G2', 2], ['G2', 'G3', 1], ['G3', 'G4', 2], ['G4', 'G5', 2], ['G5', 'G6', 2], ['G6', 'G7', 2], ['G7', 'G8', 2], ['G8', 'G9', 2], ['G9', 'G10', 1], ['G10', 'G11', 2], ['G11', 'G12', 2], ['G12', 'G13', 1], ['G13', 'G14', 2], ['G14', 'G15', 2], ['G15', 'G16', 1], ['G16', 'G17', 2], ['G17', 'G18', 1], ['G18', 'G19', 2]]}, 'F': {'color': '#b96528', 'transfers': [['F1', 'Y1'], ['F2', 'Y2'], ['F3', 'Y3'], ['F4', 'Y4'], ['F5', 'Y5'], ['F6', 'Y6'], ['F7', 'Y7'], ['F8', 'Y8'], ['F9', 'Y9'], ['F9', 'M25'], ['F13', 'M9'], ['F15', 'C3'], ['F16', 'Z1'], ['F16', 'G1']], 'travel_times': [['F1', 'F2', 3], ['F2', 'F3', 2], ['F3', 'F4', 3], ['F4', 'F5', 2], ['F5', 'F6', 2], ['F6', 'F7', 2], ['F7', 'F8', 2], ['F8', 'F9', 2], ['F9', 'F10', 3], ['F10', 'F11', 2], ['F11', 'F12', 2], ['F12', 'F13', 2], ['F13', 'F14', 3], ['F14', 'F15', 2], ['F15', 'F16', 2]]}, 'H': {'color': '#9cacb5', 'transfers': [['H6', 'M15'], ['H6', 'C8'], ['H7', 'Y18'], ['H7', 'C9'], ['H8', 'M16'], ['H8', 'G9'], ['H12', 'T11'], ['H16', 'G15'], ['H17', 'G16'], ['H21', 'C18']], 'travel_times': 
[['H1', 'H2', 3], ['H2', 'H3', 3], ['H3', 'H4', 3], ['H4', 'H5', 3], ['H5', 'H6', 2], ['H6', 'H7', 3], ['H7', 'H8', 1], ['H8', 'H9', 2], ['H9', 'H10', 2], ['H10', 'H11', 2], ['H11', 'H12', 1], ['H12', 'H13', 3], ['H13', 'H14', 1], ['H14', 'H15', 2], ['H15', 'H16', 2], ['H16', 'H17', 1], ['H17', 'H18', 2], ['H18', 'H19', 2], ['H19', 'H20', 2], ['H20', 'H21', 3]]}, 'M': {'color': '#ff0000', 'transfers': [['M9', 'F13'], ['M12', 'N8'], ['M13', 'G5'], ['M13', 'Y16'], ['M13', 'Z4'], ['M13', 'N7'], ['M14', 'C7'], ['M14', 'G6'], ['M14', 'N6'], ['M15', 'H6'], ['M15', 'C8'], ['M16', 'G9'], ['M16', 'H8'], ['M18', 'T9'], ['M18', 'C11'], ['M18', 'Z8'], ['M19', 'C12'], ['M22', 'N11'], ['M25', 'Y9'], ['M25', 'F9']], 'travel_times': [['M1', 'M2', 2], ['M2', 'M3', 2], ['M3', 'M4', 2], ['M4', 'M5', 2], ['M5', 'M6', 2], ['M6', 'M7', 2], ['M7', 'M8', 2], ['M8', 'M9', 2], ['M9', 'M10', 1], ['M10', 'M11', 2], ['M11', 'M12', 2], ['M12', 'M13', 3], ['M13', 'M14', 2], ['M14', 'M15', 1], ['M15', 'M16', 3], ['M16', 'M17', 2], ['M17', 'M18', 2], ['M18', 'M19', 2], ['M19', 'M20', 1], ['M20', 'M21', 2], ['M21', 'M22', 2], ['M22', 'M23', 3], ['M23', 'M24', 2], ['M24', 'M25', 3], ['m3', 'm4', 2], ['m4', 'm5', 2], ['m5', 'M6', 2]]}, 'N': {'color': '#1aaca9', 'transfers': [['N1', 'T1'], ['N2', 'T2'], ['N3', 'T3'], ['N6', 'G6'], ['N6', 'M14'], ['N6', 'C7'], ['N7', 'Y16'], ['N7', 'Z4'], ['N7', 'G5'], ['N7', 'M13'], ['N8', 'M12'], ['N9', 'Y14'], ['N10', 'Y13'], ['N10', 'T6'], ['N11', 'M22']], 'travel_times': [['N1', 'N2', 2], ['N2', 'N3', 2], ['N3', 'N4', 2], ['N4', 'N5', 2], ['N5', 'N6', 2], ['N6', 'N7', 2], ['N7', 'N8', 2], ['N8', 'N9', 2], ['N9', 'N10', 2], ['N10', 'N11', 2], ['N11', 'N12', 3], ['N12', 'N13', 2], ['N13', 'N14', 2], ['N14', 'N15', 3], ['N15', 'N16', 1], ['N16', 'N17', 3], ['N17', 'N18', 2], ['N18', 'N19', 2]]}, 'T': {'color': '#1aa7d8', 'transfers': [['T6', 'N10'], ['T6', 'Y13'], ['T7', 'Z6'], ['T9', 'M18'], ['T9', 'C11'], ['T9', 'Z8'], ['T10', 'G11'], ['T11', 'H12']], 
'travel_times': [['T1', 'T2', 0], ['T2', 'T3', 3], ['T3', 'T4', 6], ['T4', 'T5', 9], ['T5', 'T6', 11], ['T6', 'T7', 13], ['T7', 'T8', 14], ['T8', 'T9', 16], ['T9', 'T10', 18], ['T10', 'T11', 20], ['T11', 'T12', 21], ['T12', 'T13', 24], ['T13', 'T14', 26], ['T14', 'T15', 27], ['T15', 'T16', 30], ['T16', 'T17', 33], ['T17', 'T18', 35], ['T18', 'T19', 37], ['T19', 'T20', 39], ['T20', 'T21', 41], ['T21', 'T22', 43], ['T22', 'T23', 46], ['T23', 'T24', 49]]}, 'Y': {'color': '#ede7c3', 'transfers': [['Y1', 'F1'], ['Y2', 'F2'], ['Y3', 'F3'], ['Y4', 'F4'], ['Y5', 'F5'], ['Y6', 'F6'], ['Y7', 'F7'], ['Y8', 'F8'], ['Y9', 'F9'], ['Y9', 'M25'], ['Y13', 'T6'], ['Y13', 'N10'], ['Y14', 'N9'], ['Y16', 'Z4'], ['Y16', 'N7'], ['Y16', 'G5'], ['Y16', 'M13'], ['Y18', 'H7'], ['Y18', 'C9']], 'travel_times': [['Y1', 'Y2', 4], ['Y2', 'Y3', 2], ['Y3', 'Y4', 3], ['Y4', 'Y5', 2], ['Y5', 'Y6', 2], ['Y6', 'Y7', 2], ['Y7', 'Y8', 2], ['Y8', 'Y9', 3], ['Y9', 'Y10', 2], ['Y10', 'Y11', 2], ['Y11', 'Y12', 2], ['Y12', 'Y13', 3], ['Y13', 'Y14', 2], ['Y14', 'Y15', 2], ['Y15', 'Y16', 1], ['Y16', 'Y17', 2], ['Y17', 'Y18', 2], ['Y18', 'Y19', 2], ['Y19', 'Y20', 2], ['Y20', 'Y21', 2], ['Y21', 'Y22', 2], ['Y22', 'Y23', 3], ['Y23', 'Y24', 2]]}, 'Z': {'color': '#a384bf', 'transfers': [['Z1', 'F16'], ['Z1', 'G1'], ['Z2', 'C4'], ['Z2', 'G2'], ['Z3', 'G4'], ['Z4', 'Y16'], ['Z4', 'N7'], ['Z4', 'M13'], ['Z4', 'G5'], ['Z6', 'T7'], ['Z8', 'M18'], ['Z8', 'C11'], ['Z8', 'T9'], ['Z9', 'G12']], 'travel_times': [['Z1', 'Z2', 3], ['Z2', 'Z3', 2], ['Z3', 'Z4', 2], ['Z4', 'Z5', 2], ['Z5', 'Z6', 2], ['Z6', 'Z7', 2], ['Z7', 'Z8', 2], ['Z8', 'Z9', 2], ['Z9', 'Z10', 3], ['Z10', 'Z11', 3], ['Z11', 'Z12', 3], ['Z12', 'Z13', 2], ['Z13', 'Z14', 2]]}}
!ls -lh tokyo-metro.json
-rw-r--r-- 1 carlosal1015 carlosal1015 27K Jan 22 15:31 tokyo-metro.json
data_pack = msgpack.packb(data)
# del data
type(data_pack)
bytes
len(data_pack)
3021
with open("tokyo-metro.msgpack", "wb") as f:
    f.write(data_pack)
!ls -lh tokyo-metro.msgpack
-rw-r--r-- 1 carlosal1015 carlosal1015 3.0K Jan 23 15:19 tokyo-metro.msgpack
with open("tokyo-metro.msgpack", "rb") as f:
    data_msgpack = f.read()
    data = msgpack.unpackb(data_msgpack)
list(data.keys())
['C', 'G', 'F', 'H', 'M', 'N', 'T', 'Y', 'Z']
import pickle
with open("tokyo-metro.pickle", "wb") as f:
    pickle.dump(data, f)
del data
!ls -lh tokyo-metro.pickle
-rw-r--r-- 1 carlosal1015 carlosal1015 5.2K Jan 23 15:19 tokyo-metro.pickle
with open("tokyo-metro.pickle", "rb") as f:
    data = pickle.load(f)
data.keys()
dict_keys(['C', 'G', 'F', 'H', 'M', 'N', 'T', 'Y', 'Z'])
References
  1. Johansson, R. (2024). Numerical Python: Scientific Computing and Data Science Applications with Numpy, SciPy and Matplotlib. Apress. https://doi.org/10.1007/979-8-8688-0413-7