Skip to article content

Pre-pre-school

Back to Article
Chapter 18: Code listing
Download Notebook

Chapter 18: Code listing

Imports

import numpy as np

np.random.seed(0)
import pandas as pd
import csv
import json
import h5py
import tables
import pickle
# import cPickle
import msgpack

CSV

%%writefile playerstats-2013-2014.csv
# 2013-2014 / Regular Season / All Skaters / Summary / Points
Rank,Player,Team,Pos,GP,G,A,P,+/-,PIM,PPG,PPP,SHG,SHP,GW,OT,S,S%,TOI/GP,Shift/GP,FO%
1,Sidney Crosby,PIT,C,80,36,68,104,+18,46,11,38,0,0,5,1,259,13.9,21:58,24.0,52.5
2,Ryan Getzlaf,ANA,C,77,31,56,87,+28,31,5,23,0,0,7,1,204,15.2,21:17,25.2,49.0
3,Claude Giroux,PHI,C,82,28,58,86,+7,46,7,37,0,0,7,1,223,12.6,20:26,25.1,52.9
4,Tyler Seguin,DAL,C,80,37,47,84,+16,18,11,25,0,0,8,0,294,12.6,19:20,23.4,41.5
5,Corey Perry,ANA,R,81,43,39,82,+32,65,8,18,0,0,9,1,280,15.4,19:28,23.2,36.0
Overwriting playerstats-2013-2014.csv
%%writefile playerstats-2013-2014-top30.csv
# 2013-2014 / Regular Season / All Skaters / Summary / Points
Rank,Player,Team,Pos,GP,G,A,P,+/-,PIM,PPG,PPP,SHG,SHP,GW,OT,S,S%,TOI/GP,Shift/GP,FO%
1,Sidney Crosby,PIT,C,80,36,68,104,+18,46,11,38,0,0,5,1,259,13.9,21:58,24.0,52.5
2,Ryan Getzlaf,ANA,C,77,31,56,87,+28,31,5,23,0,0,7,1,204,15.2,21:17,25.2,49.0
3,Claude Giroux,PHI,C,82,28,58,86,+7,46,7,37,0,0,7,1,223,12.6,20:26,25.1,52.9
4,Tyler Seguin,DAL,C,80,37,47,84,+16,18,11,25,0,0,8,0,294,12.6,19:20,23.4,41.5
5,Corey Perry,ANA,R,81,43,39,82,+32,65,8,18,0,0,9,1,280,15.4,19:28,23.2,36.0
6,Phil Kessel,TOR,R,82,37,43,80,-5,27,8,20,0,0,6,0,305,12.1,20:39,24.5,14.3
7,Taylor Hall,EDM,L,75,27,53,80,-15,44,7,17,0,1,1,1,250,10.8,20:00,25.4,45.7
8,Alex Ovechkin,WSH,L,78,51,28,79,-35,48,24,39,0,1,10,3,386,13.2,20:32,21.8,66.7
9,Joe Pavelski,SJS,C,82,41,38,79,+23,32,16,31,1,2,3,0,225,18.2,19:51,27.1,56.0
10,Jamie Benn,DAL,L,81,34,45,79,+21,64,5,19,1,3,3,1,279,12.2,19:09,25.0,52.8
11,Nicklas Backstrom,WSH,C,82,18,61,79,-20,54,6,44,1,1,1,0,196,9.2,19:48,23.3,50.4
12,Patrick Sharp,CHI,L,82,34,44,78,+13,40,10,25,0,0,3,1,313,10.9,18:53,22.7,54.6
13,Joe Thornton,SJS,C,82,11,65,76,+20,32,2,19,0,1,3,1,122,9.0,18:55,26.3,56.1
14,Erik Karlsson,OTT,D,82,20,54,74,-15,36,5,31,0,0,1,0,257,7.8,27:04,28.6,0.0
15,Evgeni Malkin,PIT,C,60,23,49,72,+10,62,7,30,0,0,3,0,191,12.0,20:03,21.4,48.8
16,Patrick Marleau,SJS,L,82,33,37,70,+0,18,11,23,2,2,4,0,285,11.6,20:31,27.3,52.9
17,Anze Kopitar,LAK,C,82,29,41,70,+34,24,10,23,0,0,9,2,200,14.5,20:53,25.4,53.3
18,Matt Duchene,COL,C,71,23,47,70,+8,19,5,17,0,0,6,1,217,10.6,18:29,22.0,50.3
19,Martin St. Louis,"TBL, NYR",R,81,30,39,69,+13,10,9,21,1,2,5,1,204,14.7,20:56,25.7,40.7
20,Patrick Kane,CHI,R,69,29,40,69,+7,22,10,25,0,0,6,0,227,12.8,19:36,22.9,50.0
21,Blake Wheeler,WPG,R,82,28,41,69,+4,63,8,19,0,0,4,2,225,12.4,18:41,24.0,37.5
22,Kyle Okposo,NYI,R,71,27,42,69,-9,51,5,15,0,0,4,1,195,13.8,20:26,22.2,47.5
23,David Krejci,BOS,C,80,19,50,69,+39,28,3,19,0,0,6,1,169,11.2,19:07,21.3,51.2
24,Chris Kunitz,PIT,L,78,35,33,68,+25,66,13,22,0,0,8,0,218,16.1,19:09,22.2,75.0
25,Jonathan Toews,CHI,C,76,28,40,68,+26,34,5,15,3,5,5,0,193,14.5,20:28,25.9,57.2
26,Thomas Vanek,"BUF, NYI, MTL",L,78,27,41,68,+7,46,8,18,0,0,4,0,248,10.9,19:21,21.6,43.5
27,Jaromir Jagr,NJD,R,82,24,43,67,+16,46,5,17,0,0,6,1,231,10.4,19:09,22.8,0.0
28,John Tavares,NYI,C,59,24,42,66,-6,40,8,25,0,0,4,0,188,12.8,21:14,22.3,49.1
29,Jason Spezza,OTT,C,75,23,43,66,-26,46,9,22,0,0,5,0,223,10.3,18:12,23.8,54.0
30,Jordan Eberle,EDM,R,80,28,37,65,-11,18,7,20,1,1,4,1,200,14.0,19:32,25.4,38.1
Overwriting playerstats-2013-2014-top30.csv
!head -n 5 playerstats-2013-2014-top30.csv
# 2013-2014 / Regular Season / All Skaters / Summary / Points
Rank,Player,Team,Pos,GP,G,A,P,+/-,PIM,PPG,PPP,SHG,SHP,GW,OT,S,S%,TOI/GP,Shift/GP,FO%
1,Sidney Crosby,PIT,C,80,36,68,104,+18,46,11,38,0,0,5,1,259,13.9,21:58,24.0,52.5
2,Ryan Getzlaf,ANA,C,77,31,56,87,+28,31,5,23,0,0,7,1,204,15.2,21:17,25.2,49.0
3,Claude Giroux,PHI,C,82,28,58,86,+7,46,7,37,0,0,7,1,223,12.6,20:26,25.1,52.9
rows = []
with open("playerstats-2013-2014.csv") as f:
    csvreader = csv.reader(f)
    rows = [fields for fields in csvreader]
rows[1][1:6]
['Player', 'Team', 'Pos', 'GP', 'G']
rows[2][1:6]
['Sidney Crosby', 'PIT', 'C', '80', '36']
data = np.random.randn(100, 3)
np.savetxt(
    "data.csv",
    data,
    delimiter=",",
    header="x, y, z",
    comments="# Random x, y, z coordinates\n",
)
!head -n 5 data.csv
# Random x, y, z coordinates
x, y, z
1.764052345967664026e+00,4.001572083672232938e-01,9.787379841057392005e-01
2.240893199201457797e+00,1.867557990149967484e+00,-9.772778798764110153e-01
9.500884175255893682e-01,-1.513572082976978872e-01,-1.032188517935578448e-01
data_load = np.loadtxt("data.csv", skiprows=2, delimiter=",")
data_load[1, :]
array([ 2.2408932 , 1.86755799, -0.97727788])
data_load.dtype
dtype('float64')
(data == data_load).all()
np.True_
data = np.loadtxt("playerstats-2013-2014.csv", skiprows=2, delimiter=",", dtype=bytes)
data[0][1:6]
array([b'Sidney Crosby', b'PIT', b'C', b'80', b'36'], dtype='|S13')
np.loadtxt("playerstats-2013-2014.csv", skiprows=2, delimiter=",", usecols=[6, 7, 8])
array([[ 68., 104., 18.], [ 56., 87., 28.], [ 58., 86., 7.], [ 47., 84., 16.], [ 39., 82., 32.]])
df = pd.read_csv("playerstats-2013-2014.csv", skiprows=1)
df = df.set_index("Rank")
df[["Player", "GP", "G", "A", "P"]]
Loading...
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, 1 to 5
Data columns (total 20 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Player    5 non-null      object 
 1   Team      5 non-null      object 
 2   Pos       5 non-null      object 
 3   GP        5 non-null      int64  
 4   G         5 non-null      int64  
 5   A         5 non-null      int64  
 6   P         5 non-null      int64  
 7   +/-       5 non-null      int64  
 8   PIM       5 non-null      int64  
 9   PPG       5 non-null      int64  
 10  PPP       5 non-null      int64  
 11  SHG       5 non-null      int64  
 12  SHP       5 non-null      int64  
 13  GW        5 non-null      int64  
 14  OT        5 non-null      int64  
 15  S         5 non-null      int64  
 16  S%        5 non-null      float64
 17  TOI/GP    5 non-null      object 
 18  Shift/GP  5 non-null      float64
 19  FO%       5 non-null      float64
dtypes: float64(3), int64(13), object(4)
memory usage: 840.0+ bytes
df[["Player", "GP", "G", "A", "P"]].to_csv("playerstats-2013-2014-subset.csv")
!head -n 5 playerstats-2013-2014-subset.csv
Rank,Player,GP,G,A,P
1,Sidney Crosby,80,36,68,104
2,Ryan Getzlaf,77,31,56,87
3,Claude Giroux,82,28,58,86
4,Tyler Seguin,80,37,47,84

HDF5

h5py

import h5py
# mode = "w", "r", "w-", "r+", "a"
f = h5py.File("data.h5", "w")
f.mode
'r+'
f.flush()
f.close()
f = h5py.File("data.h5", "w")
f.name
'/'
grp1 = f.create_group("experiment1")
grp1.name
'/experiment1'
grp2_meas = f.create_group("experiment2/measurement")
grp2_meas.name
'/experiment2/measurement'
grp2_sim = f.create_group("experiment2/simulation")
grp2_sim.name
'/experiment2/simulation'
f["/experiment1"]
<HDF5 group "/experiment1" (0 members)>
f["/experiment2/simulation"]
<HDF5 group "/experiment2/simulation" (0 members)>
grp_expr2 = f["/experiment2"]
grp_expr2["simulation"]
<HDF5 group "/experiment2/simulation" (0 members)>
list(f.keys())
['experiment1', 'experiment2']
list(f.items())
[('experiment1', <HDF5 group "/experiment1" (0 members)>), ('experiment2', <HDF5 group "/experiment2" (2 members)>)]
f.visit(lambda x: print(x))
experiment1
experiment2
experiment2/measurement
experiment2/simulation
f.visititems(lambda name, value: print(name, value))
experiment1 <HDF5 group "/experiment1" (0 members)>
experiment2 <HDF5 group "/experiment2" (2 members)>
experiment2/measurement <HDF5 group "/experiment2/measurement" (0 members)>
experiment2/simulation <HDF5 group "/experiment2/simulation" (0 members)>
"experiment1" in f
True
"simulation" in f["experiment2"]
True
"experiment3" in f
False
f.flush()
!ls -l *.h5
-rw-r--r-- 1 carlosal1015 carlosal1015    4272 Jan 23 15:18 data.h5
-rw-r--r-- 1 carlosal1015 carlosal1015   70847 Jan 23 15:14 playerstats-2013-2014.h5
-rw-r--r-- 1 carlosal1015 carlosal1015 2119192 Jan 23 15:14 store.h5
!h5ls -r data.h5
data.h5: unable to open file
data1 = np.arange(10)
data2 = np.random.randn(100, 100)
f["array1"] = data1
f["/experiment2/measurement/meas1"] = data2
f.visititems(lambda name, value: print(name, value))
array1 <HDF5 dataset "array1": shape (10,), type "<i8">
experiment1 <HDF5 group "/experiment1" (0 members)>
experiment2 <HDF5 group "/experiment2" (2 members)>
experiment2/measurement <HDF5 group "/experiment2/measurement" (1 members)>
experiment2/measurement/meas1 <HDF5 dataset "meas1": shape (100, 100), type "<f8">
experiment2/simulation <HDF5 group "/experiment2/simulation" (0 members)>
ds = f["array1"]
ds
<HDF5 dataset "array1": shape (10,), type "<i8">
ds.name
'/array1'
ds.dtype
dtype('int64')
ds.shape
(10,)
ds.len()
10
# help(ds)
ds
<HDF5 dataset "array1": shape (10,), type "<i8">
h5py.__version__
'3.15.1'
# ds.value
np.array(ds)
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
ds = f["/experiment2/measurement/meas1"]
ds
<HDF5 dataset "meas1": shape (100, 100), type "<f8">
ds.dtype
dtype('<f8')
ds.shape
(100, 100)
data_full = ds[...]
type(data_full)
numpy.ndarray
data_full.shape
(100, 100)
data_col = ds[:, 0]
data_col.shape
(100,)
type(data_col)
numpy.ndarray
ds[10:20:3, 10:20:3]
array([[ 0.60270766, -0.34804638, -0.813596 , -1.29737966], [ 0.91320192, -1.06343294, 0.22734595, 0.52759738], [ 1.25774422, -0.32775492, 1.4849256 , 0.28005786], [-0.84907287, -0.30000358, 1.79691852, -0.19871506]])
ds[[1, 2, 3], :].shape
(3, 100)
ds[[1, 2, 3], :].shape
(3, 100)
mask = ds[:, 0] > 2.0
mask.shape, mask.dtype
((100,), dtype('bool'))
ds[mask, 0]
array([2.04253623, 2.1041854 , 2.05689385])
ds[mask, :5]
array([[ 2.04253623, -0.91946118, 0.11467003, -0.1374237 , 1.36552692], [ 2.1041854 , 0.22725706, -1.1291663 , -0.28133197, -0.7394167 ], [ 2.05689385, 0.18041971, -0.06670925, -0.02835398, 0.48480475]])
# create empty data sets, assign and update datasets
ds = f.create_dataset("array2", data=np.random.randint(10, size=10))
ds
<HDF5 dataset "array2": shape (10,), type "<i8">
# ds.value
np.array(ds)
array([0, 2, 2, 4, 7, 3, 7, 2, 4, 1])
ds = f.create_dataset("/experiment2/simulation/data1", shape=(5, 5), fillvalue=-1)
ds
<HDF5 dataset "data1": shape (5, 5), type "<f4">
# ds.value
np.array(ds)
array([[-1., -1., -1., -1., -1.], [-1., -1., -1., -1., -1.], [-1., -1., -1., -1., -1.], [-1., -1., -1., -1., -1.], [-1., -1., -1., -1., -1.]], dtype=float32)
ds = f.create_dataset(
    "/experiment1/simulation/data1",
    shape=(5000, 5000, 5000),
    fillvalue=0,
    compression="gzip",
)
ds
<HDF5 dataset "data1": shape (5000, 5000, 5000), type "<f4">
ds[:, 0, 0] = np.random.rand(5000)
ds[1, :, 0] += np.random.rand(5000)
ds[:2, :5, 0]
array([[0.6939344 , 0. , 0. , 0. , 0. ], [1.4819994 , 0.01639538, 0.54387355, 0.11130908, 0.9928771 ]], dtype=float32)
ds.fillvalue
np.float32(0.0)
f["experiment1"].visititems(lambda name, value: print(name, value))
simulation <HDF5 group "/experiment1/simulation" (1 members)>
simulation/data1 <HDF5 dataset "data1": shape (5000, 5000, 5000), type "<f4">
float(np.prod(ds.shape) * ds[0, 0, 0].nbytes) / (1024**3)  # Gb
465.66128730773926
f.flush()
f.filename
'data.h5'
!ls -lh data.h5
-rw-r--r-- 1 carlosal1015 carlosal1015 358K Jan 23 15:18 data.h5
del f["/experiment1/simulation/data1"]
f["experiment1"].visititems(lambda name, value: print(name, value))
simulation <HDF5 group "/experiment1/simulation" (0 members)>
f.close()
# attributes
f = h5py.File("data.h5", mode="a")  # ADDED mode="a"
f.attrs
<Attributes of HDF5 object at 136578732379808>
f.attrs["desc"] = "Result sets from experiments and simulations"
f["experiment1"].attrs["date"] = "2015-1-1"
f["experiment2"].attrs["date"] = "2015-1-2"
f["experiment2/simulation/data1"].attrs["k"] = 1.5
f["experiment2/simulation/data1"].attrs["T"] = 1000
list(f["experiment1"].attrs.keys())
['date']
list(f["experiment2/simulation/data1"].attrs.items())
[('T', np.int64(1000)), ('k', np.float64(1.5))]
"T" in f["experiment2/simulation/data1"].attrs
True
del f["experiment2/simulation/data1"].attrs["T"]
"T" in f["experiment2/simulation/data1"].attrs
False
f["experiment2/simulation/data1"].attrs["t"] = np.array([1, 2, 3])
f["experiment2/simulation/data1"].attrs["t"]
array([1, 2, 3])
f.close()

pytables

tables.__version__
'3.10.2'
df = pd.read_csv("playerstats-2013-2014-top30.csv", skiprows=1)
df = df.set_index("Rank")
df[["Player", "Pos", "GP", "P", "G", "A", "S%", "Shift/GP"]].head(5)
Loading...
f = tables.open_file("playerstats-2013-2014.h5", mode="w")
grp = f.create_group(
    "/", "season_2013_2014", title="NHL player statistics for the 2013/2014 season"
)
grp
/season_2013_2014 (Group) np.str_('NHL player statistics for the 2013/2014 season') children := []
f.root
/ (RootGroup) np.str_('') children := ['season_2013_2014' (Group)]
class PlayerStat(tables.IsDescription):
    # PyTables row schema for one NHL skater's season summary.
    # Column order in the HDF5 file is determined by PyTables (alphabetical
    # by name here, since no pos= is given), not by declaration order.
    player = tables.StringCol(20, dflt="")            # player name, max 20 bytes (longer names are truncated)
    position = tables.StringCol(1, dflt="C")          # position code: C/L/R/D
    games_played = tables.UInt8Col(dflt=0)            # GP; UInt8 suffices for an 82-game season
    points = tables.UInt16Col(dflt=0)                 # P = goals + assists
    goals = tables.UInt16Col(dflt=0)                  # G
    assists = tables.UInt16Col(dflt=0)                # A
    shooting_percentage = tables.Float64Col(dflt=0.0) # S% column from the CSV
    shifts_per_game_played = tables.Float64Col(dflt=0.0)  # Shift/GP column from the CSV
top30_table = f.create_table(grp, "top30", PlayerStat, "Top 30 point leaders")
playerstat = top30_table.row
type(playerstat)
tables.tableextension.Row
# Copy each DataFrame row into the PyTables table via the shared Row
# accessor: assign the mapped fields, then append() commits that row and
# resets the accessor for the next iteration.
for index, row_series in df.iterrows():
    playerstat["player"] = row_series["Player"]
    playerstat["position"] = row_series["Pos"]
    playerstat["games_played"] = row_series["GP"]
    playerstat["points"] = row_series["P"]
    playerstat["goals"] = row_series["G"]
    playerstat["assists"] = row_series["A"]
    playerstat["shooting_percentage"] = row_series["S%"]
    playerstat["shifts_per_game_played"] = row_series["Shift/GP"]
    playerstat.append()
# Flush buffered rows to the HDF5 file so they are visible to readers.
top30_table.flush()
top30_table.cols.player[:5]
array([b'Sidney Crosby', b'Ryan Getzlaf', b'Claude Giroux', b'Tyler Seguin', b'Corey Perry'], dtype='|S20')
top30_table.cols.points[:5]
array([104, 87, 86, 84, 82], dtype=uint16)
def print_playerstat(row):
    """Print one player record as a tab-separated line.

    ``row`` is a mapping (e.g. a PyTables Row) with a bytes ``player``
    field and numeric ``points``, ``goals`` and ``assists`` fields; the
    name is decoded from UTF-8 and right-aligned in a 20-column field.
    """
    name = row["player"].decode("UTF-8")
    stats = (row["points"], row["goals"], row["assists"])
    line = "%20s\t%s\t%s\t%s" % ((name,) + stats)
    print(line)
for row in top30_table.iterrows():
    print_playerstat(row)
       Sidney Crosby	104	36	68
        Ryan Getzlaf	87	31	56
       Claude Giroux	86	28	58
        Tyler Seguin	84	37	47
         Corey Perry	82	43	39
         Phil Kessel	80	37	43
         Taylor Hall	80	27	53
       Alex Ovechkin	79	51	28
        Joe Pavelski	79	41	38
          Jamie Benn	79	34	45
   Nicklas Backstrom	79	18	61
       Patrick Sharp	78	34	44
        Joe Thornton	76	11	65
       Erik Karlsson	74	20	54
       Evgeni Malkin	72	23	49
     Patrick Marleau	70	33	37
        Anze Kopitar	70	29	41
        Matt Duchene	70	23	47
    Martin St. Louis	69	30	39
        Patrick Kane	69	29	40
       Blake Wheeler	69	28	41
         Kyle Okposo	69	27	42
        David Krejci	69	19	50
        Chris Kunitz	68	35	33
      Jonathan Toews	68	28	40
        Thomas Vanek	68	27	41
        Jaromir Jagr	67	24	43
        John Tavares	66	24	42
        Jason Spezza	66	23	43
       Jordan Eberle	65	28	37
for row in top30_table.where("(points > 75) & (points <= 80)"):
    print_playerstat(row)
         Phil Kessel	80	37	43
         Taylor Hall	80	27	53
       Alex Ovechkin	79	51	28
        Joe Pavelski	79	41	38
          Jamie Benn	79	34	45
   Nicklas Backstrom	79	18	61
       Patrick Sharp	78	34	44
        Joe Thornton	76	11	65
for row in top30_table.where("(goals > 40) & (points < 80)"):
    print_playerstat(row)
       Alex Ovechkin	79	51	28
        Joe Pavelski	79	41	38
f
File(filename=playerstats-2013-2014.h5, title=np.str_(''), mode='w', root_uep='/', filters=Filters(complevel=0, shuffle=False, bitshuffle=False, fletcher32=False, least_significant_digit=None)) / (RootGroup) np.str_('') /season_2013_2014 (Group) np.str_('NHL player statistics for the 2013/2014 season') /season_2013_2014/top30 (Table(np.int64(30),)) np.str_('Top 30 point leaders') description := { "assists": UInt16Col(shape=(), dflt=np.uint16(0), pos=0), "games_played": UInt8Col(shape=(), dflt=np.uint8(0), pos=1), "goals": UInt16Col(shape=(), dflt=np.uint16(0), pos=2), "player": StringCol(itemsize=20, shape=(), dflt=np.bytes_(b''), pos=3), "points": UInt16Col(shape=(), dflt=np.uint16(0), pos=4), "position": StringCol(itemsize=1, shape=(), dflt=np.bytes_(b'C'), pos=5), "shifts_per_game_played": Float64Col(shape=(), dflt=np.float64(0.0), pos=6), "shooting_percentage": Float64Col(shape=(), dflt=np.float64(0.0), pos=7)} byteorder := 'little' chunkshape := (np.int64(1489),)
f.flush()
f.close()
!h5ls -rv playerstats-2013-2014.h5
Opened "playerstats-2013-2014.h5" with sec2 driver.
/                        Group
    Attribute: CLASS scalar
        Type:      5-byte null-terminated UTF-8 string
    Attribute: PYTABLES_FORMAT_VERSION scalar
        Type:      3-byte null-terminated UTF-8 string
    Attribute: TITLE null
        Type:      1-byte null-terminated UTF-8 string
    Attribute: VERSION scalar
        Type:      3-byte null-terminated UTF-8 string
    Location:  1:48
    Links:     1
    Modified:  2026-01-23 15:18:57 -05
/season_2013_2014        Group
    Attribute: CLASS scalar
        Type:      5-byte null-terminated UTF-8 string
    Attribute: TITLE scalar
        Type:      46-byte null-terminated UTF-8 string
    Attribute: VERSION scalar
        Type:      3-byte null-terminated UTF-8 string
    Location:  1:280
    Links:     1
    Modified:  2026-01-23 15:18:57 -05
/season_2013_2014/top30  Dataset {30/Inf}
    Attribute: CLASS scalar
        Type:      5-byte null-terminated UTF-8 string
    Attribute: FIELD_0_FILL scalar
        Type:      native unsigned short
    Attribute: FIELD_0_NAME scalar
        Type:      7-byte null-terminated UTF-8 string
    Attribute: FIELD_1_FILL scalar
        Type:      native unsigned char
    Attribute: FIELD_1_NAME scalar
        Type:      12-byte null-terminated UTF-8 string
    Attribute: FIELD_2_FILL scalar
        Type:      native unsigned short
    Attribute: FIELD_2_NAME scalar
        Type:      5-byte null-terminated UTF-8 string
    Attribute: FIELD_3_FILL scalar
        Type:      1-byte null-terminated ASCII string
    Attribute: FIELD_3_NAME scalar
        Type:      6-byte null-terminated UTF-8 string
    Attribute: FIELD_4_FILL scalar
        Type:      native unsigned short
    Attribute: FIELD_4_NAME scalar
        Type:      6-byte null-terminated UTF-8 string
    Attribute: FIELD_5_FILL scalar
        Type:      1-byte null-terminated ASCII string
    Attribute: FIELD_5_NAME scalar
        Type:      8-byte null-terminated UTF-8 string
    Attribute: FIELD_6_FILL scalar
        Type:      native double
    Attribute: FIELD_6_NAME scalar
        Type:      22-byte null-terminated UTF-8 string
    Attribute: FIELD_7_FILL scalar
        Type:      native double
    Attribute: FIELD_7_NAME scalar
        Type:      19-byte null-terminated UTF-8 string
    Attribute: NROWS scalar
        Type:      native long
    Attribute: TITLE scalar
        Type:      20-byte null-terminated UTF-8 string
    Attribute: VERSION scalar
        Type:      3-byte null-terminated UTF-8 string
    Location:  1:609
    Links:     1
    Modified:  2026-01-23 15:18:57 -05
    Chunks:    {1489} 65516 bytes
    Storage:   1320 logical bytes, 65516 allocated bytes, 2.01% utilization
    Type:      struct {
                   "assists"          +0    native unsigned short
                   "games_played"     +2    native unsigned char
                   "goals"            +3    native unsigned short
                   "player"           +5    20-byte null-terminated ASCII string
                   "points"           +25   native unsigned short
                   "position"         +27   1-byte null-terminated ASCII string
                   "shifts_per_game_played" +28   native double
                   "shooting_percentage" +36   native double
               } 44 bytes

Pandas hdfstore

import pandas as pd
store = pd.HDFStore("store.h5")
df = pd.DataFrame(np.random.rand(5, 5))
store["df1"] = df
df = pd.read_csv("playerstats-2013-2014-top30.csv", skiprows=1)
store["df2"] = df
store.keys()
['/df1', '/df2']
"df2" in store
True
df = store["df1"]
store.root
/ (RootGroup) np.str_('') children := ['df1' (Group), 'df2' (Group)]
store.close()
f = h5py.File("store.h5")
f.visititems(lambda x, y: print(x, "\t" * int(3 - len(str(x)) // 8), y))
df1 			 <HDF5 group "/df1" (4 members)>
df1/axis0 		 <HDF5 dataset "axis0": shape (5,), type "<i8">
df1/axis1 		 <HDF5 dataset "axis1": shape (5,), type "<i8">
df1/block0_items 	 <HDF5 dataset "block0_items": shape (5,), type "<i8">
df1/block0_values 	 <HDF5 dataset "block0_values": shape (5, 5), type "<f8">
df2 			 <HDF5 group "/df2" (8 members)>
df2/axis0 		 <HDF5 dataset "axis0": shape (21,), type "|S8">
df2/axis1 		 <HDF5 dataset "axis1": shape (30,), type "<i8">
df2/block0_items 	 <HDF5 dataset "block0_items": shape (3,), type "|S8">
df2/block0_values 	 <HDF5 dataset "block0_values": shape (30, 3), type "<f8">
df2/block1_items 	 <HDF5 dataset "block1_items": shape (14,), type "|S4">
df2/block1_values 	 <HDF5 dataset "block1_values": shape (30, 14), type "<i8">
df2/block2_items 	 <HDF5 dataset "block2_items": shape (4,), type "|S6">
df2/block2_values 	 <HDF5 dataset "block2_values": shape (1,), type "|O">
f["/df2/block0_items"][:]
array([b'S%', b'Shift/GP', b'FO%'], dtype='|S8')
f["/df2/block0_values"][:3]
array([[13.9, 24. , 52.5], [15.2, 25.2, 49. ], [12.6, 25.1, 52.9]])
np.array(f["/df2/block1_items"])
array([b'Rank', b'GP', b'G', b'A', b'P', b'+/-', b'PIM', b'PPG', b'PPP', b'SHG', b'SHP', b'GW', b'OT', b'S'], dtype='|S4')
f["/df2/block1_values"][:3, :5]
array([[ 1, 80, 36, 68, 104], [ 2, 77, 31, 56, 87], [ 3, 82, 28, 58, 86]])

Parquet

df = pd.read_csv(
    "temperature_outdoor_2014.tsv", delimiter="\t", names=["time", "temperature"]
)
df.time = (
    pd.to_datetime(df.time.values, unit="s")
    .tz_localize("UTC")
    .tz_convert("Europe/Stockholm")
)
df["dt"] = df.time.dt.strftime("%Y-%m-%d")
df.head()
Loading...
df.to_parquet("temperature_outdoor_2014.parquet", index=None, partition_cols=["dt"])
!ls temperature_outdoor_2014.parquet | head
dt=2014-01-01
dt=2014-01-02
dt=2014-01-03
dt=2014-01-04
dt=2014-01-05
dt=2014-01-06
dt=2014-01-07
dt=2014-01-08
dt=2014-01-09
dt=2014-01-10
df.to_parquet("temperature_outdoor_2014_no_partitions.parquet", index=None)
!file temperature_outdoor_2014_no_partitions.parquet | head
temperature_outdoor_2014_no_partitions.parquet: Apache Parquet
!ls temperature_outdoor_2014.parquet/dt=2014-01-01 | head
d7c32bdc528844a5a8c45dc24659b6e5-0.parquet
df_20140401 = pd.read_parquet("temperature_outdoor_2014.parquet/dt=2014-04-01")
df_20140401.head()
Loading...
import pyarrow.parquet as pq
table = pq.read_table(
    "temperature_outdoor_2014.parquet", columns=["time", "temperature"]
)
type(table)
pyarrow.lib.Table
# help(table)
df2 = table.to_pandas()
df2.head()
Loading...

JSON

data = ["string", 1.0, 2, None]
data_json = json.dumps(data)
data_json
'["string", 1.0, 2, null]'
data2 = json.loads(data_json)
data
['string', 1.0, 2, None]
data[0]
'string'
data = {"one": 1, "two": 2.0, "three": "three"}
data_json = json.dumps(data)
print(data_json)
{"one": 1, "two": 2.0, "three": "three"}
data = json.loads(data_json)
data["two"]
2.0
data["three"]
'three'
data = {"one": [1], "two": [1, 2], "three": [1, 2, 3]}
data_json = json.dumps(data, indent=True)
print(data_json)
{
 "one": [
  1
 ],
 "two": [
  1,
  2
 ],
 "three": [
  1,
  2,
  3
 ]
}
data = {
    "one": [1],
    "two": {"one": 1, "two": 2},
    "three": [(1,), (1, 2), (1, 2, 3)],
    "four": "a text string",
}
with open("data.json", "w") as f:
    json.dump(data, f)
!cat data.json
{"one": [1], "two": {"one": 1, "two": 2}, "three": [[1], [1, 2], [1, 2, 3]], "four": "a text string"}
with open("data.json", "r") as f:
    data_from_file = json.load(f)
data_from_file["two"]
{'one': 1, 'two': 2}
data_from_file["three"]
[[1], [1, 2], [1, 2, 3]]
!head -n 20 tokyo-metro.json
{
    "C": {
        "color": "#149848",
        "transfers": [
            [
                "C3",
                "F15"
            ],
            [
                "C4",
                "Z2"
            ],
            [
                "C4",
                "G2"
            ],
            [
                "C7",
                "M14"
            ],
!wc tokyo-metro.json
 1471  1508 26839 tokyo-metro.json
with open("tokyo-metro.json", "r") as f:
    data = json.load(f)
data.keys()
dict_keys(['C', 'G', 'F', 'H', 'M', 'N', 'T', 'Y', 'Z'])
data["C"].keys()
dict_keys(['color', 'transfers', 'travel_times'])
data["C"]["color"]
'#149848'
data["C"]["transfers"]
[['C3', 'F15'], ['C4', 'Z2'], ['C4', 'G2'], ['C7', 'M14'], ['C7', 'N6'], ['C7', 'G6'], ['C8', 'M15'], ['C8', 'H6'], ['C9', 'H7'], ['C9', 'Y18'], ['C11', 'T9'], ['C11', 'M18'], ['C11', 'Z8'], ['C12', 'M19'], ['C18', 'H21']]
[(s, e, tt) for s, e, tt in data["C"]["travel_times"] if tt == 1]
[('C3', 'C4', 1), ('C7', 'C8', 1), ('C9', 'C10', 1)]
data
{'C': {'color': '#149848', 'transfers': [['C3', 'F15'], ['C4', 'Z2'], ['C4', 'G2'], ['C7', 'M14'], ['C7', 'N6'], ['C7', 'G6'], ['C8', 'M15'], ['C8', 'H6'], ['C9', 'H7'], ['C9', 'Y18'], ['C11', 'T9'], ['C11', 'M18'], ['C11', 'Z8'], ['C12', 'M19'], ['C18', 'H21']], 'travel_times': [['C1', 'C2', 2], ['C2', 'C3', 2], ['C3', 'C4', 1], ['C4', 'C5', 2], ['C5', 'C6', 2], ['C6', 'C7', 2], ['C7', 'C8', 1], ['C8', 'C9', 3], ['C9', 'C10', 1], ['C10', 'C11', 2], ['C11', 'C12', 2], ['C12', 'C13', 2], ['C13', 'C14', 2], ['C14', 'C15', 2], ['C15', 'C16', 2], ['C16', 'C17', 3], ['C17', 'C18', 3], ['C18', 'C19', 3]]}, 'G': {'color': '#f59230', 'transfers': [['G1', 'Z1'], ['G1', 'F16'], ['G2', 'Z2'], ['G2', 'C4'], ['G4', 'Z3'], ['G5', 'M13'], ['G5', 'Y16'], ['G5', 'Z4'], ['G5', 'N7'], ['G6', 'N6'], ['G6', 'M14'], ['G6', 'C7'], ['G9', 'M16'], ['G9', 'H8'], ['G11', 'T10'], ['G12', 'Z9'], ['G15', 'H16'], ['G16', 'H17']], 'travel_times': [['G1', 'G2', 2], ['G2', 'G3', 1], ['G3', 'G4', 2], ['G4', 'G5', 2], ['G5', 'G6', 2], ['G6', 'G7', 2], ['G7', 'G8', 2], ['G8', 'G9', 2], ['G9', 'G10', 1], ['G10', 'G11', 2], ['G11', 'G12', 2], ['G12', 'G13', 1], ['G13', 'G14', 2], ['G14', 'G15', 2], ['G15', 'G16', 1], ['G16', 'G17', 2], ['G17', 'G18', 1], ['G18', 'G19', 2]]}, 'F': {'color': '#b96528', 'transfers': [['F1', 'Y1'], ['F2', 'Y2'], ['F3', 'Y3'], ['F4', 'Y4'], ['F5', 'Y5'], ['F6', 'Y6'], ['F7', 'Y7'], ['F8', 'Y8'], ['F9', 'Y9'], ['F9', 'M25'], ['F13', 'M9'], ['F15', 'C3'], ['F16', 'Z1'], ['F16', 'G1']], 'travel_times': [['F1', 'F2', 3], ['F2', 'F3', 2], ['F3', 'F4', 3], ['F4', 'F5', 2], ['F5', 'F6', 2], ['F6', 'F7', 2], ['F7', 'F8', 2], ['F8', 'F9', 2], ['F9', 'F10', 3], ['F10', 'F11', 2], ['F11', 'F12', 2], ['F12', 'F13', 2], ['F13', 'F14', 3], ['F14', 'F15', 2], ['F15', 'F16', 2]]}, 'H': {'color': '#9cacb5', 'transfers': [['H6', 'M15'], ['H6', 'C8'], ['H7', 'Y18'], ['H7', 'C9'], ['H8', 'M16'], ['H8', 'G9'], ['H12', 'T11'], ['H16', 'G15'], ['H17', 'G16'], ['H21', 'C18']], 'travel_times': 
[['H1', 'H2', 3], ['H2', 'H3', 3], ['H3', 'H4', 3], ['H4', 'H5', 3], ['H5', 'H6', 2], ['H6', 'H7', 3], ['H7', 'H8', 1], ['H8', 'H9', 2], ['H9', 'H10', 2], ['H10', 'H11', 2], ['H11', 'H12', 1], ['H12', 'H13', 3], ['H13', 'H14', 1], ['H14', 'H15', 2], ['H15', 'H16', 2], ['H16', 'H17', 1], ['H17', 'H18', 2], ['H18', 'H19', 2], ['H19', 'H20', 2], ['H20', 'H21', 3]]}, 'M': {'color': '#ff0000', 'transfers': [['M9', 'F13'], ['M12', 'N8'], ['M13', 'G5'], ['M13', 'Y16'], ['M13', 'Z4'], ['M13', 'N7'], ['M14', 'C7'], ['M14', 'G6'], ['M14', 'N6'], ['M15', 'H6'], ['M15', 'C8'], ['M16', 'G9'], ['M16', 'H8'], ['M18', 'T9'], ['M18', 'C11'], ['M18', 'Z8'], ['M19', 'C12'], ['M22', 'N11'], ['M25', 'Y9'], ['M25', 'F9']], 'travel_times': [['M1', 'M2', 2], ['M2', 'M3', 2], ['M3', 'M4', 2], ['M4', 'M5', 2], ['M5', 'M6', 2], ['M6', 'M7', 2], ['M7', 'M8', 2], ['M8', 'M9', 2], ['M9', 'M10', 1], ['M10', 'M11', 2], ['M11', 'M12', 2], ['M12', 'M13', 3], ['M13', 'M14', 2], ['M14', 'M15', 1], ['M15', 'M16', 3], ['M16', 'M17', 2], ['M17', 'M18', 2], ['M18', 'M19', 2], ['M19', 'M20', 1], ['M20', 'M21', 2], ['M21', 'M22', 2], ['M22', 'M23', 3], ['M23', 'M24', 2], ['M24', 'M25', 3], ['m3', 'm4', 2], ['m4', 'm5', 2], ['m5', 'M6', 2]]}, 'N': {'color': '#1aaca9', 'transfers': [['N1', 'T1'], ['N2', 'T2'], ['N3', 'T3'], ['N6', 'G6'], ['N6', 'M14'], ['N6', 'C7'], ['N7', 'Y16'], ['N7', 'Z4'], ['N7', 'G5'], ['N7', 'M13'], ['N8', 'M12'], ['N9', 'Y14'], ['N10', 'Y13'], ['N10', 'T6'], ['N11', 'M22']], 'travel_times': [['N1', 'N2', 2], ['N2', 'N3', 2], ['N3', 'N4', 2], ['N4', 'N5', 2], ['N5', 'N6', 2], ['N6', 'N7', 2], ['N7', 'N8', 2], ['N8', 'N9', 2], ['N9', 'N10', 2], ['N10', 'N11', 2], ['N11', 'N12', 3], ['N12', 'N13', 2], ['N13', 'N14', 2], ['N14', 'N15', 3], ['N15', 'N16', 1], ['N16', 'N17', 3], ['N17', 'N18', 2], ['N18', 'N19', 2]]}, 'T': {'color': '#1aa7d8', 'transfers': [['T6', 'N10'], ['T6', 'Y13'], ['T7', 'Z6'], ['T9', 'M18'], ['T9', 'C11'], ['T9', 'Z8'], ['T10', 'G11'], ['T11', 'H12']], 
'travel_times': [['T1', 'T2', 0], ['T2', 'T3', 3], ['T3', 'T4', 6], ['T4', 'T5', 9], ['T5', 'T6', 11], ['T6', 'T7', 13], ['T7', 'T8', 14], ['T8', 'T9', 16], ['T9', 'T10', 18], ['T10', 'T11', 20], ['T11', 'T12', 21], ['T12', 'T13', 24], ['T13', 'T14', 26], ['T14', 'T15', 27], ['T15', 'T16', 30], ['T16', 'T17', 33], ['T17', 'T18', 35], ['T18', 'T19', 37], ['T19', 'T20', 39], ['T20', 'T21', 41], ['T21', 'T22', 43], ['T22', 'T23', 46], ['T23', 'T24', 49]]}, 'Y': {'color': '#ede7c3', 'transfers': [['Y1', 'F1'], ['Y2', 'F2'], ['Y3', 'F3'], ['Y4', 'F4'], ['Y5', 'F5'], ['Y6', 'F6'], ['Y7', 'F7'], ['Y8', 'F8'], ['Y9', 'F9'], ['Y9', 'M25'], ['Y13', 'T6'], ['Y13', 'N10'], ['Y14', 'N9'], ['Y16', 'Z4'], ['Y16', 'N7'], ['Y16', 'G5'], ['Y16', 'M13'], ['Y18', 'H7'], ['Y18', 'C9']], 'travel_times': [['Y1', 'Y2', 4], ['Y2', 'Y3', 2], ['Y3', 'Y4', 3], ['Y4', 'Y5', 2], ['Y5', 'Y6', 2], ['Y6', 'Y7', 2], ['Y7', 'Y8', 2], ['Y8', 'Y9', 3], ['Y9', 'Y10', 2], ['Y10', 'Y11', 2], ['Y11', 'Y12', 2], ['Y12', 'Y13', 3], ['Y13', 'Y14', 2], ['Y14', 'Y15', 2], ['Y15', 'Y16', 1], ['Y16', 'Y17', 2], ['Y17', 'Y18', 2], ['Y18', 'Y19', 2], ['Y19', 'Y20', 2], ['Y20', 'Y21', 2], ['Y21', 'Y22', 2], ['Y22', 'Y23', 3], ['Y23', 'Y24', 2]]}, 'Z': {'color': '#a384bf', 'transfers': [['Z1', 'F16'], ['Z1', 'G1'], ['Z2', 'C4'], ['Z2', 'G2'], ['Z3', 'G4'], ['Z4', 'Y16'], ['Z4', 'N7'], ['Z4', 'M13'], ['Z4', 'G5'], ['Z6', 'T7'], ['Z8', 'M18'], ['Z8', 'C11'], ['Z8', 'T9'], ['Z9', 'G12']], 'travel_times': [['Z1', 'Z2', 3], ['Z2', 'Z3', 2], ['Z3', 'Z4', 2], ['Z4', 'Z5', 2], ['Z5', 'Z6', 2], ['Z6', 'Z7', 2], ['Z7', 'Z8', 2], ['Z8', 'Z9', 2], ['Z9', 'Z10', 3], ['Z10', 'Z11', 3], ['Z11', 'Z12', 3], ['Z12', 'Z13', 2], ['Z13', 'Z14', 2]]}}
!ls -lh tokyo-metro.json
-rw-r--r-- 1 carlosal1015 carlosal1015 27K Jan 22 15:31 tokyo-metro.json
data_pack = msgpack.packb(data)
# del data
type(data_pack)
bytes
len(data_pack)
3021
with open("tokyo-metro.msgpack", "wb") as f:
    f.write(data_pack)
!ls -lh tokyo-metro.msgpack
-rw-r--r-- 1 carlosal1015 carlosal1015 3.0K Jan 23 15:19 tokyo-metro.msgpack
with open("tokyo-metro.msgpack", "rb") as f:
    data_msgpack = f.read()
    data = msgpack.unpackb(data_msgpack)
list(data.keys())
['C', 'G', 'F', 'H', 'M', 'N', 'T', 'Y', 'Z']
import pickle
with open("tokyo-metro.pickle", "wb") as f:
    pickle.dump(data, f)
del data
!ls -lh tokyo-metro.pickle
-rw-r--r-- 1 carlosal1015 carlosal1015 5.2K Jan 23 15:19 tokyo-metro.pickle
with open("tokyo-metro.pickle", "rb") as f:
    data = pickle.load(f)
data.keys()
dict_keys(['C', 'G', 'F', 'H', 'M', 'N', 'T', 'Y', 'Z'])
References
  1. Johansson, R. (2024). Numerical Python: Scientific Computing and Data Science Applications with Numpy, SciPy and Matplotlib. Apress. https://doi.org/10.1007/979-8-8688-0413-7