Chapter 18: Code listing
Robert Johansson
Source code listings for Numerical Python - Scientific Computing and Data Science Applications with Numpy, SciPy and Matplotlib (ISBN 979-8-8688-0412-0).
Imports¶
import numpy as np
np.random.seed(0)
import pandas as pd
import csv
import json
import h5py
import tables
import pickle
# import cPickle
import msgpack
CSV¶
%%writefile playerstats-2013-2014.csv
# 2013-2014 / Regular Season / All Skaters / Summary / Points
Rank,Player,Team,Pos,GP,G,A,P,+/-,PIM,PPG,PPP,SHG,SHP,GW,OT,S,S%,TOI/GP,Shift/GP,FO%
1,Sidney Crosby,PIT,C,80,36,68,104,+18,46,11,38,0,0,5,1,259,13.9,21:58,24.0,52.5
2,Ryan Getzlaf,ANA,C,77,31,56,87,+28,31,5,23,0,0,7,1,204,15.2,21:17,25.2,49.0
3,Claude Giroux,PHI,C,82,28,58,86,+7,46,7,37,0,0,7,1,223,12.6,20:26,25.1,52.9
4,Tyler Seguin,DAL,C,80,37,47,84,+16,18,11,25,0,0,8,0,294,12.6,19:20,23.4,41.5
5,Corey Perry,ANA,R,81,43,39,82,+32,65,8,18,0,0,9,1,280,15.4,19:28,23.2,36.0
Overwriting playerstats-2013-2014.csv
%%writefile playerstats-2013-2014-top30.csv
# 2013-2014 / Regular Season / All Skaters / Summary / Points
Rank,Player,Team,Pos,GP,G,A,P,+/-,PIM,PPG,PPP,SHG,SHP,GW,OT,S,S%,TOI/GP,Shift/GP,FO%
1,Sidney Crosby,PIT,C,80,36,68,104,+18,46,11,38,0,0,5,1,259,13.9,21:58,24.0,52.5
2,Ryan Getzlaf,ANA,C,77,31,56,87,+28,31,5,23,0,0,7,1,204,15.2,21:17,25.2,49.0
3,Claude Giroux,PHI,C,82,28,58,86,+7,46,7,37,0,0,7,1,223,12.6,20:26,25.1,52.9
4,Tyler Seguin,DAL,C,80,37,47,84,+16,18,11,25,0,0,8,0,294,12.6,19:20,23.4,41.5
5,Corey Perry,ANA,R,81,43,39,82,+32,65,8,18,0,0,9,1,280,15.4,19:28,23.2,36.0
6,Phil Kessel,TOR,R,82,37,43,80,-5,27,8,20,0,0,6,0,305,12.1,20:39,24.5,14.3
7,Taylor Hall,EDM,L,75,27,53,80,-15,44,7,17,0,1,1,1,250,10.8,20:00,25.4,45.7
8,Alex Ovechkin,WSH,L,78,51,28,79,-35,48,24,39,0,1,10,3,386,13.2,20:32,21.8,66.7
9,Joe Pavelski,SJS,C,82,41,38,79,+23,32,16,31,1,2,3,0,225,18.2,19:51,27.1,56.0
10,Jamie Benn,DAL,L,81,34,45,79,+21,64,5,19,1,3,3,1,279,12.2,19:09,25.0,52.8
11,Nicklas Backstrom,WSH,C,82,18,61,79,-20,54,6,44,1,1,1,0,196,9.2,19:48,23.3,50.4
12,Patrick Sharp,CHI,L,82,34,44,78,+13,40,10,25,0,0,3,1,313,10.9,18:53,22.7,54.6
13,Joe Thornton,SJS,C,82,11,65,76,+20,32,2,19,0,1,3,1,122,9.0,18:55,26.3,56.1
14,Erik Karlsson,OTT,D,82,20,54,74,-15,36,5,31,0,0,1,0,257,7.8,27:04,28.6,0.0
15,Evgeni Malkin,PIT,C,60,23,49,72,+10,62,7,30,0,0,3,0,191,12.0,20:03,21.4,48.8
16,Patrick Marleau,SJS,L,82,33,37,70,+0,18,11,23,2,2,4,0,285,11.6,20:31,27.3,52.9
17,Anze Kopitar,LAK,C,82,29,41,70,+34,24,10,23,0,0,9,2,200,14.5,20:53,25.4,53.3
18,Matt Duchene,COL,C,71,23,47,70,+8,19,5,17,0,0,6,1,217,10.6,18:29,22.0,50.3
19,Martin St. Louis,"TBL, NYR",R,81,30,39,69,+13,10,9,21,1,2,5,1,204,14.7,20:56,25.7,40.7
20,Patrick Kane,CHI,R,69,29,40,69,+7,22,10,25,0,0,6,0,227,12.8,19:36,22.9,50.0
21,Blake Wheeler,WPG,R,82,28,41,69,+4,63,8,19,0,0,4,2,225,12.4,18:41,24.0,37.5
22,Kyle Okposo,NYI,R,71,27,42,69,-9,51,5,15,0,0,4,1,195,13.8,20:26,22.2,47.5
23,David Krejci,BOS,C,80,19,50,69,+39,28,3,19,0,0,6,1,169,11.2,19:07,21.3,51.2
24,Chris Kunitz,PIT,L,78,35,33,68,+25,66,13,22,0,0,8,0,218,16.1,19:09,22.2,75.0
25,Jonathan Toews,CHI,C,76,28,40,68,+26,34,5,15,3,5,5,0,193,14.5,20:28,25.9,57.2
26,Thomas Vanek,"BUF, NYI, MTL",L,78,27,41,68,+7,46,8,18,0,0,4,0,248,10.9,19:21,21.6,43.5
27,Jaromir Jagr,NJD,R,82,24,43,67,+16,46,5,17,0,0,6,1,231,10.4,19:09,22.8,0.0
28,John Tavares,NYI,C,59,24,42,66,-6,40,8,25,0,0,4,0,188,12.8,21:14,22.3,49.1
29,Jason Spezza,OTT,C,75,23,43,66,-26,46,9,22,0,0,5,0,223,10.3,18:12,23.8,54.0
30,Jordan Eberle,EDM,R,80,28,37,65,-11,18,7,20,1,1,4,1,200,14.0,19:32,25.4,38.1
Overwriting playerstats-2013-2014-top30.csv
!head -n 5 playerstats-2013-2014-top30.csv
# 2013-2014 / Regular Season / All Skaters / Summary / Points
Rank,Player,Team,Pos,GP,G,A,P,+/-,PIM,PPG,PPP,SHG,SHP,GW,OT,S,S%,TOI/GP,Shift/GP,FO%
1,Sidney Crosby,PIT,C,80,36,68,104,+18,46,11,38,0,0,5,1,259,13.9,21:58,24.0,52.5
2,Ryan Getzlaf,ANA,C,77,31,56,87,+28,31,5,23,0,0,7,1,204,15.2,21:17,25.2,49.0
3,Claude Giroux,PHI,C,82,28,58,86,+7,46,7,37,0,0,7,1,223,12.6,20:26,25.1,52.9
rows = []
with open("playerstats-2013-2014.csv") as f:
    csvreader = csv.reader(f)
    rows = [fields for fields in csvreader]
rows[1][1:6]
['Player', 'Team', 'Pos', 'GP', 'G']
rows[2][1:6]
['Sidney Crosby', 'PIT', 'C', '80', '36']
data = np.random.randn(100, 3)
np.savetxt(
    "data.csv",
    data,
    delimiter=",",
    header="x, y, z",
    comments="# Random x, y, z coordinates\n",
)
!head -n 5 data.csv
# Random x, y, z coordinates
x, y, z
1.764052345967664026e+00,4.001572083672232938e-01,9.787379841057392005e-01
2.240893199201457797e+00,1.867557990149967484e+00,-9.772778798764110153e-01
9.500884175255893682e-01,-1.513572082976978872e-01,-1.032188517935578448e-01
data_load = np.loadtxt("data.csv", skiprows=2, delimiter=",")
data_load[1, :]
array([ 2.2408932 , 1.86755799, -0.97727788])
data_load.dtype
dtype('float64')
(data == data_load).all()
np.True_
data = np.loadtxt("playerstats-2013-2014.csv", skiprows=2, delimiter=",", dtype=bytes)
data[0][1:6]
array([b'Sidney Crosby', b'PIT', b'C', b'80', b'36'], dtype='|S13')
np.loadtxt("playerstats-2013-2014.csv", skiprows=2, delimiter=",", usecols=[6, 7, 8])
array([[ 68., 104., 18.],
       [ 56., 87., 28.],
       [ 58., 86., 7.],
       [ 47., 84., 16.],
       [ 39., 82., 32.]])
df = pd.read_csv("playerstats-2013-2014.csv", skiprows=1)
df = df.set_index("Rank")
df[["Player", "GP", "G", "A", "P"]]
Loading...
df.info()<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, 1 to 5
Data columns (total 20 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Player 5 non-null object
1 Team 5 non-null object
2 Pos 5 non-null object
3 GP 5 non-null int64
4 G 5 non-null int64
5 A 5 non-null int64
6 P 5 non-null int64
7 +/- 5 non-null int64
8 PIM 5 non-null int64
9 PPG 5 non-null int64
10 PPP 5 non-null int64
11 SHG 5 non-null int64
12 SHP 5 non-null int64
13 GW 5 non-null int64
14 OT 5 non-null int64
15 S 5 non-null int64
16 S% 5 non-null float64
17 TOI/GP 5 non-null object
18 Shift/GP 5 non-null float64
19 FO% 5 non-null float64
dtypes: float64(3), int64(13), object(4)
memory usage: 840.0+ bytes
df[["Player", "GP", "G", "A", "P"]].to_csv("playerstats-2013-2014-subset.csv")
!head -n 5 playerstats-2013-2014-subset.csv
Rank,Player,GP,G,A,P
1,Sidney Crosby,80,36,68,104
2,Ryan Getzlaf,77,31,56,87
3,Claude Giroux,82,28,58,86
4,Tyler Seguin,80,37,47,84
HDF5¶
h5py¶
import h5py
# mode = "w", "r", "w-", "r+", "a"
f = h5py.File("data.h5", "w")
f.mode
'r+'
f.flush()
f.close()
f = h5py.File("data.h5", "w")
f.name
'/'
grp1 = f.create_group("experiment1")
grp1.name
'/experiment1'
grp2_meas = f.create_group("experiment2/measurement")
grp2_meas.name
'/experiment2/measurement'
grp2_sim = f.create_group("experiment2/simulation")
grp2_sim.name
'/experiment2/simulation'
f["/experiment1"]
<HDF5 group "/experiment1" (0 members)>
f["/experiment2/simulation"]
<HDF5 group "/experiment2/simulation" (0 members)>
grp_expr2 = f["/experiment2"]
grp_expr2["simulation"]
<HDF5 group "/experiment2/simulation" (0 members)>
list(f.keys())
['experiment1', 'experiment2']
list(f.items())
[('experiment1', <HDF5 group "/experiment1" (0 members)>),
 ('experiment2', <HDF5 group "/experiment2" (2 members)>)]
f.visit(lambda x: print(x))
experiment1
experiment2
experiment2/measurement
experiment2/simulation
f.visititems(lambda name, value: print(name, value))experiment1 <HDF5 group "/experiment1" (0 members)>
experiment2 <HDF5 group "/experiment2" (2 members)>
experiment2/measurement <HDF5 group "/experiment2/measurement" (0 members)>
experiment2/simulation <HDF5 group "/experiment2/simulation" (0 members)>
"experiment1" in f
True
"simulation" in f["experiment2"]
True
"experiment3" in f
False
f.flush()
!ls -l *.h5
-rw-r--r-- 1 carlosal1015 carlosal1015 4272 Jan 23 15:18 data.h5
-rw-r--r-- 1 carlosal1015 carlosal1015 70847 Jan 23 15:14 playerstats-2013-2014.h5
-rw-r--r-- 1 carlosal1015 carlosal1015 2119192 Jan 23 15:14 store.h5
!h5ls -r data.h5
data.h5: unable to open file
data1 = np.arange(10)data2 = np.random.randn(100, 100)f["array1"] = data1f["/experiment2/measurement/meas1"] = data2f.visititems(lambda name, value: print(name, value))array1 <HDF5 dataset "array1": shape (10,), type "<i8">
experiment1 <HDF5 group "/experiment1" (0 members)>
experiment2 <HDF5 group "/experiment2" (2 members)>
experiment2/measurement <HDF5 group "/experiment2/measurement" (1 members)>
experiment2/measurement/meas1 <HDF5 dataset "meas1": shape (100, 100), type "<f8">
experiment2/simulation <HDF5 group "/experiment2/simulation" (0 members)>
ds = f["array1"]ds<HDF5 dataset "array1": shape (10,), type "<i8">ds.name'/array1'ds.dtypedtype('int64')ds.shape(10,)ds.len()10# help(ds)ds<HDF5 dataset "array1": shape (10,), type "<i8">h5py.__version__'3.15.1'# ds.value
np.array(ds)array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])ds = f["/experiment2/measurement/meas1"]ds<HDF5 dataset "meas1": shape (100, 100), type "<f8">ds.dtypedtype('<f8')ds.shape(100, 100)data_full = ds[...]type(data_full)numpy.ndarraydata_full.shape(100, 100)data_col = ds[:, 0]data_col.shape(100,)type(data_col)numpy.ndarrayds[10:20:3, 10:20:3]array([[ 0.60270766, -0.34804638, -0.813596 , -1.29737966],
[ 0.91320192, -1.06343294, 0.22734595, 0.52759738],
[ 1.25774422, -0.32775492, 1.4849256 , 0.28005786],
[-0.84907287, -0.30000358, 1.79691852, -0.19871506]])ds[[1, 2, 3], :].shape(3, 100)ds[[1, 2, 3], :].shape(3, 100)mask = ds[:, 0] > 2.0mask.shape, mask.dtype((100,), dtype('bool'))ds[mask, 0]array([2.04253623, 2.1041854 , 2.05689385])ds[mask, :5]array([[ 2.04253623, -0.91946118, 0.11467003, -0.1374237 , 1.36552692],
[ 2.1041854 , 0.22725706, -1.1291663 , -0.28133197, -0.7394167 ],
[ 2.05689385, 0.18041971, -0.06670925, -0.02835398, 0.48480475]])# create empty data sets, assign and update datasetsds = f.create_dataset("array2", data=np.random.randint(10, size=10))ds<HDF5 dataset "array2": shape (10,), type "<i8"># ds.value
np.array(ds)array([0, 2, 2, 4, 7, 3, 7, 2, 4, 1])ds = f.create_dataset("/experiment2/simulation/data1", shape=(5, 5), fillvalue=-1)ds<HDF5 dataset "data1": shape (5, 5), type "<f4"># ds.value
np.array(ds)array([[-1., -1., -1., -1., -1.],
[-1., -1., -1., -1., -1.],
[-1., -1., -1., -1., -1.],
[-1., -1., -1., -1., -1.],
[-1., -1., -1., -1., -1.]], dtype=float32)ds = f.create_dataset(
"/experiment1/simulation/data1",
shape=(5000, 5000, 5000),
fillvalue=0,
compression="gzip",
)ds<HDF5 dataset "data1": shape (5000, 5000, 5000), type "<f4">ds[:, 0, 0] = np.random.rand(5000)ds[1, :, 0] += np.random.rand(5000)ds[:2, :5, 0]array([[0.6939344 , 0. , 0. , 0. , 0. ],
[1.4819994 , 0.01639538, 0.54387355, 0.11130908, 0.9928771 ]],
dtype=float32)ds.fillvaluenp.float32(0.0)f["experiment1"].visititems(lambda name, value: print(name, value))simulation <HDF5 group "/experiment1/simulation" (1 members)>
simulation/data1 <HDF5 dataset "data1": shape (5000, 5000, 5000), type "<f4">
float(np.prod(ds.shape) * ds[0, 0, 0].nbytes) / (1024**3) # Gb
465.66128730773926
f.flush()
f.filename
'data.h5'
!ls -lh data.h5
-rw-r--r-- 1 carlosal1015 carlosal1015 358K Jan 23 15:18 data.h5
del f["/experiment1/simulation/data1"]
f["experiment1"].visititems(lambda name, value: print(name, value))
simulation <HDF5 group "/experiment1/simulation" (0 members)>
f.close()
# attributes
f = h5py.File("data.h5", mode="a") # ADDED mode="a"
f.attrs
<Attributes of HDF5 object at 136578732379808>
f.attrs["desc"] = "Result sets from experiments and simulations"
f["experiment1"].attrs["date"] = "2015-1-1"
f["experiment2"].attrs["date"] = "2015-1-2"
f["experiment2/simulation/data1"].attrs["k"] = 1.5
f["experiment2/simulation/data1"].attrs["T"] = 1000
list(f["experiment1"].attrs.keys())
['date']
list(f["experiment2/simulation/data1"].attrs.items())
[('T', np.int64(1000)), ('k', np.float64(1.5))]
"T" in f["experiment2/simulation/data1"].attrs
True
del f["experiment2/simulation/data1"].attrs["T"]
"T" in f["experiment2/simulation/data1"].attrs
False
f["experiment2/simulation/data1"].attrs["t"] = np.array([1, 2, 3])
f["experiment2/simulation/data1"].attrs["t"]
array([1, 2, 3])
f.close()
pytables¶
tables.__version__'3.10.2'df = pd.read_csv("playerstats-2013-2014-top30.csv", skiprows=1)
df = df.set_index("Rank")df[["Player", "Pos", "GP", "P", "G", "A", "S%", "Shift/GP"]].head(5)Loading...
f = tables.open_file("playerstats-2013-2014.h5", mode="w")
grp = f.create_group(
    "/", "season_2013_2014", title="NHL player statistics for the 2013/2014 season"
)
grp
/season_2013_2014 (Group) np.str_('NHL player statistics for the 2013/2014 season')
  children := []
f.root
/ (RootGroup) np.str_('')
  children := ['season_2013_2014' (Group)]
class PlayerStat(tables.IsDescription):
    player = tables.StringCol(20, dflt="")
    position = tables.StringCol(1, dflt="C")
    games_played = tables.UInt8Col(dflt=0)
    points = tables.UInt16Col(dflt=0)
    goals = tables.UInt16Col(dflt=0)
    assists = tables.UInt16Col(dflt=0)
    shooting_percentage = tables.Float64Col(dflt=0.0)
    shifts_per_game_played = tables.Float64Col(dflt=0.0)
top30_table = f.create_table(grp, "top30", PlayerStat, "Top 30 point leaders")
playerstat = top30_table.row
type(playerstat)
tables.tableextension.Row
for index, row_series in df.iterrows():
    playerstat["player"] = row_series["Player"]
    playerstat["position"] = row_series["Pos"]
    playerstat["games_played"] = row_series["GP"]
    playerstat["points"] = row_series["P"]
    playerstat["goals"] = row_series["G"]
    playerstat["assists"] = row_series["A"]
    playerstat["shooting_percentage"] = row_series["S%"]
    playerstat["shifts_per_game_played"] = row_series["Shift/GP"]
    playerstat.append()
top30_table.flush()
top30_table.cols.player[:5]
array([b'Sidney Crosby', b'Ryan Getzlaf', b'Claude Giroux',
       b'Tyler Seguin', b'Corey Perry'], dtype='|S20')
top30_table.cols.points[:5]
array([104, 87, 86, 84, 82], dtype=uint16)
def print_playerstat(row):
    print(
        "%20s\t%s\t%s\t%s"
        % (row["player"].decode("UTF-8"), row["points"], row["goals"], row["assists"])
    )
for row in top30_table.iterrows():
    print_playerstat(row)
Sidney Crosby 104 36 68
Ryan Getzlaf 87 31 56
Claude Giroux 86 28 58
Tyler Seguin 84 37 47
Corey Perry 82 43 39
Phil Kessel 80 37 43
Taylor Hall 80 27 53
Alex Ovechkin 79 51 28
Joe Pavelski 79 41 38
Jamie Benn 79 34 45
Nicklas Backstrom 79 18 61
Patrick Sharp 78 34 44
Joe Thornton 76 11 65
Erik Karlsson 74 20 54
Evgeni Malkin 72 23 49
Patrick Marleau 70 33 37
Anze Kopitar 70 29 41
Matt Duchene 70 23 47
Martin St. Louis 69 30 39
Patrick Kane 69 29 40
Blake Wheeler 69 28 41
Kyle Okposo 69 27 42
David Krejci 69 19 50
Chris Kunitz 68 35 33
Jonathan Toews 68 28 40
Thomas Vanek 68 27 41
Jaromir Jagr 67 24 43
John Tavares 66 24 42
Jason Spezza 66 23 43
Jordan Eberle 65 28 37
for row in top30_table.where("(points > 75) & (points <= 80)"):
    print_playerstat(row)
Phil Kessel 80 37 43
Taylor Hall 80 27 53
Alex Ovechkin 79 51 28
Joe Pavelski 79 41 38
Jamie Benn 79 34 45
Nicklas Backstrom 79 18 61
Patrick Sharp 78 34 44
Joe Thornton 76 11 65
for row in top30_table.where("(goals > 40) & (points < 80)"):
    print_playerstat(row)
Alex Ovechkin 79 51 28
Joe Pavelski 79 41 38
f
File(filename=playerstats-2013-2014.h5, title=np.str_(''), mode='w', root_uep='/', filters=Filters(complevel=0, shuffle=False, bitshuffle=False, fletcher32=False, least_significant_digit=None))
/ (RootGroup) np.str_('')
/season_2013_2014 (Group) np.str_('NHL player statistics for the 2013/2014 season')
/season_2013_2014/top30 (Table(np.int64(30),)) np.str_('Top 30 point leaders')
description := {
"assists": UInt16Col(shape=(), dflt=np.uint16(0), pos=0),
"games_played": UInt8Col(shape=(), dflt=np.uint8(0), pos=1),
"goals": UInt16Col(shape=(), dflt=np.uint16(0), pos=2),
"player": StringCol(itemsize=20, shape=(), dflt=np.bytes_(b''), pos=3),
"points": UInt16Col(shape=(), dflt=np.uint16(0), pos=4),
"position": StringCol(itemsize=1, shape=(), dflt=np.bytes_(b'C'), pos=5),
"shifts_per_game_played": Float64Col(shape=(), dflt=np.float64(0.0), pos=6),
"shooting_percentage": Float64Col(shape=(), dflt=np.float64(0.0), pos=7)}
byteorder := 'little'
chunkshape := (np.int64(1489),)
f.flush()
f.close()
!h5ls -rv playerstats-2013-2014.h5
Opened "playerstats-2013-2014.h5" with sec2 driver.
/ Group
Attribute: CLASS scalar
Type: 5-byte null-terminated UTF-8 string
Attribute: PYTABLES_FORMAT_VERSION scalar
Type: 3-byte null-terminated UTF-8 string
Attribute: TITLE null
Type: 1-byte null-terminated UTF-8 string
Attribute: VERSION scalar
Type: 3-byte null-terminated UTF-8 string
Location: 1:48
Links: 1
Modified: 2026-01-23 15:18:57 -05
/season_2013_2014 Group
Attribute: CLASS scalar
Type: 5-byte null-terminated UTF-8 string
Attribute: TITLE scalar
Type: 46-byte null-terminated UTF-8 string
Attribute: VERSION scalar
Type: 3-byte null-terminated UTF-8 string
Location: 1:280
Links: 1
Modified: 2026-01-23 15:18:57 -05
/season_2013_2014/top30 Dataset {30/Inf}
Attribute: CLASS scalar
Type: 5-byte null-terminated UTF-8 string
Attribute: FIELD_0_FILL scalar
Type: native unsigned short
Attribute: FIELD_0_NAME scalar
Type: 7-byte null-terminated UTF-8 string
Attribute: FIELD_1_FILL scalar
Type: native unsigned char
Attribute: FIELD_1_NAME scalar
Type: 12-byte null-terminated UTF-8 string
Attribute: FIELD_2_FILL scalar
Type: native unsigned short
Attribute: FIELD_2_NAME scalar
Type: 5-byte null-terminated UTF-8 string
Attribute: FIELD_3_FILL scalar
Type: 1-byte null-terminated ASCII string
Attribute: FIELD_3_NAME scalar
Type: 6-byte null-terminated UTF-8 string
Attribute: FIELD_4_FILL scalar
Type: native unsigned short
Attribute: FIELD_4_NAME scalar
Type: 6-byte null-terminated UTF-8 string
Attribute: FIELD_5_FILL scalar
Type: 1-byte null-terminated ASCII string
Attribute: FIELD_5_NAME scalar
Type: 8-byte null-terminated UTF-8 string
Attribute: FIELD_6_FILL scalar
Type: native double
Attribute: FIELD_6_NAME scalar
Type: 22-byte null-terminated UTF-8 string
Attribute: FIELD_7_FILL scalar
Type: native double
Attribute: FIELD_7_NAME scalar
Type: 19-byte null-terminated UTF-8 string
Attribute: NROWS scalar
Type: native long
Attribute: TITLE scalar
Type: 20-byte null-terminated UTF-8 string
Attribute: VERSION scalar
Type: 3-byte null-terminated UTF-8 string
Location: 1:609
Links: 1
Modified: 2026-01-23 15:18:57 -05
Chunks: {1489} 65516 bytes
Storage: 1320 logical bytes, 65516 allocated bytes, 2.01% utilization
Type: struct {
"assists" +0 native unsigned short
"games_played" +2 native unsigned char
"goals" +3 native unsigned short
"player" +5 20-byte null-terminated ASCII string
"points" +25 native unsigned short
"position" +27 1-byte null-terminated ASCII string
"shifts_per_game_played" +28 native double
"shooting_percentage" +36 native double
} 44 bytes
Pandas hdfstore¶
import pandas as pd
store = pd.HDFStore("store.h5")
df = pd.DataFrame(np.random.rand(5, 5))
store["df1"] = df
df = pd.read_csv("playerstats-2013-2014-top30.csv", skiprows=1)
store["df2"] = df
store.keys()
['/df1', '/df2']
"df2" in store
True
df = store["df1"]
store.root
/ (RootGroup) np.str_('')
  children := ['df1' (Group), 'df2' (Group)]
store.close()
f = h5py.File("store.h5")
f.visititems(lambda x, y: print(x, "\t" * int(3 - len(str(x)) // 8), y))
df1 <HDF5 group "/df1" (4 members)>
df1/axis0 <HDF5 dataset "axis0": shape (5,), type "<i8">
df1/axis1 <HDF5 dataset "axis1": shape (5,), type "<i8">
df1/block0_items <HDF5 dataset "block0_items": shape (5,), type "<i8">
df1/block0_values <HDF5 dataset "block0_values": shape (5, 5), type "<f8">
df2 <HDF5 group "/df2" (8 members)>
df2/axis0 <HDF5 dataset "axis0": shape (21,), type "|S8">
df2/axis1 <HDF5 dataset "axis1": shape (30,), type "<i8">
df2/block0_items <HDF5 dataset "block0_items": shape (3,), type "|S8">
df2/block0_values <HDF5 dataset "block0_values": shape (30, 3), type "<f8">
df2/block1_items <HDF5 dataset "block1_items": shape (14,), type "|S4">
df2/block1_values <HDF5 dataset "block1_values": shape (30, 14), type "<i8">
df2/block2_items <HDF5 dataset "block2_items": shape (4,), type "|S6">
df2/block2_values <HDF5 dataset "block2_values": shape (1,), type "|O">
f["/df2/block0_items"][:]array([b'S%', b'Shift/GP', b'FO%'], dtype='|S8')f["/df2/block0_values"][:3]array([[13.9, 24. , 52.5],
[15.2, 25.2, 49. ],
[12.6, 25.1, 52.9]])np.array(f["/df2/block1_items"])array([b'Rank', b'GP', b'G', b'A', b'P', b'+/-', b'PIM', b'PPG', b'PPP',
b'SHG', b'SHP', b'GW', b'OT', b'S'], dtype='|S4')f["/df2/block1_values"][:3, :5]array([[ 1, 80, 36, 68, 104],
[ 2, 77, 31, 56, 87],
[ 3, 82, 28, 58, 86]])Parquet¶
df = pd.read_csv(
"temperature_outdoor_2014.tsv", delimiter="\t", names=["time", "temperature"]
)
df.time = (
pd.to_datetime(df.time.values, unit="s")
.tz_localize("UTC")
.tz_convert("Europe/Stockholm")
)
df["dt"] = df.time.dt.strftime("%Y-%m-%d")df.head()Loading...
df.to_parquet("temperature_outdoor_2014.parquet", index=None, partition_cols=["dt"])!ls temperature_outdoor_2014.parquet | headdt=2014-01-01
dt=2014-01-02
dt=2014-01-03
dt=2014-01-04
dt=2014-01-05
dt=2014-01-06
dt=2014-01-07
dt=2014-01-08
dt=2014-01-09
dt=2014-01-10
df.to_parquet("temperature_outdoor_2014_no_partitions.parquet", index=None)!file temperature_outdoor_2014_no_partitions.parquet | headtemperature_outdoor_2014_no_partitions.parquet: Apache Parquet
!ls temperature_outdoor_2014.parquet/dt=2014-01-01 | headd7c32bdc528844a5a8c45dc24659b6e5-0.parquet
df_20140401 = pd.read_parquet("temperature_outdoor_2014.parquet/dt=2014-04-01")df_20140401.head()Loading...
import pyarrow.parquet as pqtable = pq.read_table(
"temperature_outdoor_2014.parquet", columns=["time", "temperature"]
)type(table)pyarrow.lib.Table# help(table)df2 = table.to_pandas()df2.head()Loading...
JSON¶
data = ["string", 1.0, 2, None]
data_json = json.dumps(data)
data_json
'["string", 1.0, 2, null]'
data2 = json.loads(data_json)
data
['string', 1.0, 2, None]
data[0]
'string'
data = {"one": 1, "two": 2.0, "three": "three"}
data_json = json.dumps(data)
print(data_json)
{"one": 1, "two": 2.0, "three": "three"}
data = json.loads(data_json)
data["two"]
2.0
data["three"]
'three'
data = {"one": [1], "two": [1, 2], "three": [1, 2, 3]}
data_json = json.dumps(data, indent=True)
print(data_json)
{
"one": [
1
],
"two": [
1,
2
],
"three": [
1,
2,
3
]
}
data = {
"one": [1],
"two": {"one": 1, "two": 2},
"three": [(1,), (1, 2), (1, 2, 3)],
"four": "a text string",
}with open("data.json", "w") as f:
json.dump(data, f)!cat data.json{"one": [1], "two": {"one": 1, "two": 2}, "three": [[1], [1, 2], [1, 2, 3]], "four": "a text string"}with open("data.json", "r") as f:
data_from_file = json.load(f)data_from_file["two"]{'one': 1, 'two': 2}data_from_file["three"][[1], [1, 2], [1, 2, 3]]!head -n 20 tokyo-metro.json{
"C": {
"color": "#149848",
"transfers": [
[
"C3",
"F15"
],
[
"C4",
"Z2"
],
[
"C4",
"G2"
],
[
"C7",
"M14"
],
!wc tokyo-metro.json 1471 1508 26839 tokyo-metro.json
with open("tokyo-metro.json", "r") as f:
data = json.load(f)data.keys()dict_keys(['C', 'G', 'F', 'H', 'M', 'N', 'T', 'Y', 'Z'])data["C"].keys()dict_keys(['color', 'transfers', 'travel_times'])data["C"]["color"]'#149848'data["C"]["transfers"][['C3', 'F15'],
['C4', 'Z2'],
['C4', 'G2'],
['C7', 'M14'],
['C7', 'N6'],
['C7', 'G6'],
['C8', 'M15'],
['C8', 'H6'],
['C9', 'H7'],
['C9', 'Y18'],
['C11', 'T9'],
['C11', 'M18'],
['C11', 'Z8'],
['C12', 'M19'],
['C18', 'H21']][(s, e, tt) for s, e, tt in data["C"]["travel_times"] if tt == 1][('C3', 'C4', 1), ('C7', 'C8', 1), ('C9', 'C10', 1)]data{'C': {'color': '#149848',
'transfers': [['C3', 'F15'],
['C4', 'Z2'],
['C4', 'G2'],
['C7', 'M14'],
['C7', 'N6'],
['C7', 'G6'],
['C8', 'M15'],
['C8', 'H6'],
['C9', 'H7'],
['C9', 'Y18'],
['C11', 'T9'],
['C11', 'M18'],
['C11', 'Z8'],
['C12', 'M19'],
['C18', 'H21']],
'travel_times': [['C1', 'C2', 2],
['C2', 'C3', 2],
['C3', 'C4', 1],
['C4', 'C5', 2],
['C5', 'C6', 2],
['C6', 'C7', 2],
['C7', 'C8', 1],
['C8', 'C9', 3],
['C9', 'C10', 1],
['C10', 'C11', 2],
['C11', 'C12', 2],
['C12', 'C13', 2],
['C13', 'C14', 2],
['C14', 'C15', 2],
['C15', 'C16', 2],
['C16', 'C17', 3],
['C17', 'C18', 3],
['C18', 'C19', 3]]},
'G': {'color': '#f59230',
'transfers': [['G1', 'Z1'],
['G1', 'F16'],
['G2', 'Z2'],
['G2', 'C4'],
['G4', 'Z3'],
['G5', 'M13'],
['G5', 'Y16'],
['G5', 'Z4'],
['G5', 'N7'],
['G6', 'N6'],
['G6', 'M14'],
['G6', 'C7'],
['G9', 'M16'],
['G9', 'H8'],
['G11', 'T10'],
['G12', 'Z9'],
['G15', 'H16'],
['G16', 'H17']],
'travel_times': [['G1', 'G2', 2],
['G2', 'G3', 1],
['G3', 'G4', 2],
['G4', 'G5', 2],
['G5', 'G6', 2],
['G6', 'G7', 2],
['G7', 'G8', 2],
['G8', 'G9', 2],
['G9', 'G10', 1],
['G10', 'G11', 2],
['G11', 'G12', 2],
['G12', 'G13', 1],
['G13', 'G14', 2],
['G14', 'G15', 2],
['G15', 'G16', 1],
['G16', 'G17', 2],
['G17', 'G18', 1],
['G18', 'G19', 2]]},
'F': {'color': '#b96528',
'transfers': [['F1', 'Y1'],
['F2', 'Y2'],
['F3', 'Y3'],
['F4', 'Y4'],
['F5', 'Y5'],
['F6', 'Y6'],
['F7', 'Y7'],
['F8', 'Y8'],
['F9', 'Y9'],
['F9', 'M25'],
['F13', 'M9'],
['F15', 'C3'],
['F16', 'Z1'],
['F16', 'G1']],
'travel_times': [['F1', 'F2', 3],
['F2', 'F3', 2],
['F3', 'F4', 3],
['F4', 'F5', 2],
['F5', 'F6', 2],
['F6', 'F7', 2],
['F7', 'F8', 2],
['F8', 'F9', 2],
['F9', 'F10', 3],
['F10', 'F11', 2],
['F11', 'F12', 2],
['F12', 'F13', 2],
['F13', 'F14', 3],
['F14', 'F15', 2],
['F15', 'F16', 2]]},
'H': {'color': '#9cacb5',
'transfers': [['H6', 'M15'],
['H6', 'C8'],
['H7', 'Y18'],
['H7', 'C9'],
['H8', 'M16'],
['H8', 'G9'],
['H12', 'T11'],
['H16', 'G15'],
['H17', 'G16'],
['H21', 'C18']],
'travel_times': [['H1', 'H2', 3],
['H2', 'H3', 3],
['H3', 'H4', 3],
['H4', 'H5', 3],
['H5', 'H6', 2],
['H6', 'H7', 3],
['H7', 'H8', 1],
['H8', 'H9', 2],
['H9', 'H10', 2],
['H10', 'H11', 2],
['H11', 'H12', 1],
['H12', 'H13', 3],
['H13', 'H14', 1],
['H14', 'H15', 2],
['H15', 'H16', 2],
['H16', 'H17', 1],
['H17', 'H18', 2],
['H18', 'H19', 2],
['H19', 'H20', 2],
['H20', 'H21', 3]]},
'M': {'color': '#ff0000',
'transfers': [['M9', 'F13'],
['M12', 'N8'],
['M13', 'G5'],
['M13', 'Y16'],
['M13', 'Z4'],
['M13', 'N7'],
['M14', 'C7'],
['M14', 'G6'],
['M14', 'N6'],
['M15', 'H6'],
['M15', 'C8'],
['M16', 'G9'],
['M16', 'H8'],
['M18', 'T9'],
['M18', 'C11'],
['M18', 'Z8'],
['M19', 'C12'],
['M22', 'N11'],
['M25', 'Y9'],
['M25', 'F9']],
'travel_times': [['M1', 'M2', 2],
['M2', 'M3', 2],
['M3', 'M4', 2],
['M4', 'M5', 2],
['M5', 'M6', 2],
['M6', 'M7', 2],
['M7', 'M8', 2],
['M8', 'M9', 2],
['M9', 'M10', 1],
['M10', 'M11', 2],
['M11', 'M12', 2],
['M12', 'M13', 3],
['M13', 'M14', 2],
['M14', 'M15', 1],
['M15', 'M16', 3],
['M16', 'M17', 2],
['M17', 'M18', 2],
['M18', 'M19', 2],
['M19', 'M20', 1],
['M20', 'M21', 2],
['M21', 'M22', 2],
['M22', 'M23', 3],
['M23', 'M24', 2],
['M24', 'M25', 3],
['m3', 'm4', 2],
['m4', 'm5', 2],
['m5', 'M6', 2]]},
'N': {'color': '#1aaca9',
'transfers': [['N1', 'T1'],
['N2', 'T2'],
['N3', 'T3'],
['N6', 'G6'],
['N6', 'M14'],
['N6', 'C7'],
['N7', 'Y16'],
['N7', 'Z4'],
['N7', 'G5'],
['N7', 'M13'],
['N8', 'M12'],
['N9', 'Y14'],
['N10', 'Y13'],
['N10', 'T6'],
['N11', 'M22']],
'travel_times': [['N1', 'N2', 2],
['N2', 'N3', 2],
['N3', 'N4', 2],
['N4', 'N5', 2],
['N5', 'N6', 2],
['N6', 'N7', 2],
['N7', 'N8', 2],
['N8', 'N9', 2],
['N9', 'N10', 2],
['N10', 'N11', 2],
['N11', 'N12', 3],
['N12', 'N13', 2],
['N13', 'N14', 2],
['N14', 'N15', 3],
['N15', 'N16', 1],
['N16', 'N17', 3],
['N17', 'N18', 2],
['N18', 'N19', 2]]},
'T': {'color': '#1aa7d8',
'transfers': [['T6', 'N10'],
['T6', 'Y13'],
['T7', 'Z6'],
['T9', 'M18'],
['T9', 'C11'],
['T9', 'Z8'],
['T10', 'G11'],
['T11', 'H12']],
'travel_times': [['T1', 'T2', 0],
['T2', 'T3', 3],
['T3', 'T4', 6],
['T4', 'T5', 9],
['T5', 'T6', 11],
['T6', 'T7', 13],
['T7', 'T8', 14],
['T8', 'T9', 16],
['T9', 'T10', 18],
['T10', 'T11', 20],
['T11', 'T12', 21],
['T12', 'T13', 24],
['T13', 'T14', 26],
['T14', 'T15', 27],
['T15', 'T16', 30],
['T16', 'T17', 33],
['T17', 'T18', 35],
['T18', 'T19', 37],
['T19', 'T20', 39],
['T20', 'T21', 41],
['T21', 'T22', 43],
['T22', 'T23', 46],
['T23', 'T24', 49]]},
'Y': {'color': '#ede7c3',
'transfers': [['Y1', 'F1'],
['Y2', 'F2'],
['Y3', 'F3'],
['Y4', 'F4'],
['Y5', 'F5'],
['Y6', 'F6'],
['Y7', 'F7'],
['Y8', 'F8'],
['Y9', 'F9'],
['Y9', 'M25'],
['Y13', 'T6'],
['Y13', 'N10'],
['Y14', 'N9'],
['Y16', 'Z4'],
['Y16', 'N7'],
['Y16', 'G5'],
['Y16', 'M13'],
['Y18', 'H7'],
['Y18', 'C9']],
'travel_times': [['Y1', 'Y2', 4],
['Y2', 'Y3', 2],
['Y3', 'Y4', 3],
['Y4', 'Y5', 2],
['Y5', 'Y6', 2],
['Y6', 'Y7', 2],
['Y7', 'Y8', 2],
['Y8', 'Y9', 3],
['Y9', 'Y10', 2],
['Y10', 'Y11', 2],
['Y11', 'Y12', 2],
['Y12', 'Y13', 3],
['Y13', 'Y14', 2],
['Y14', 'Y15', 2],
['Y15', 'Y16', 1],
['Y16', 'Y17', 2],
['Y17', 'Y18', 2],
['Y18', 'Y19', 2],
['Y19', 'Y20', 2],
['Y20', 'Y21', 2],
['Y21', 'Y22', 2],
['Y22', 'Y23', 3],
['Y23', 'Y24', 2]]},
'Z': {'color': '#a384bf',
'transfers': [['Z1', 'F16'],
['Z1', 'G1'],
['Z2', 'C4'],
['Z2', 'G2'],
['Z3', 'G4'],
['Z4', 'Y16'],
['Z4', 'N7'],
['Z4', 'M13'],
['Z4', 'G5'],
['Z6', 'T7'],
['Z8', 'M18'],
['Z8', 'C11'],
['Z8', 'T9'],
['Z9', 'G12']],
'travel_times': [['Z1', 'Z2', 3],
['Z2', 'Z3', 2],
['Z3', 'Z4', 2],
['Z4', 'Z5', 2],
['Z5', 'Z6', 2],
['Z6', 'Z7', 2],
['Z7', 'Z8', 2],
['Z8', 'Z9', 2],
['Z9', 'Z10', 3],
['Z10', 'Z11', 3],
['Z11', 'Z12', 3],
['Z12', 'Z13', 2],
['Z13', 'Z14', 2]]}}!ls -lh tokyo-metro.json-rw-r--r-- 1 carlosal1015 carlosal1015 27K Jan 22 15:31 tokyo-metro.json
data_pack = msgpack.packb(data)
# del data
type(data_pack)
bytes
len(data_pack)
3021
with open("tokyo-metro.msgpack", "wb") as f:
    f.write(data_pack)
!ls -lh tokyo-metro.msgpack
-rw-r--r-- 1 carlosal1015 carlosal1015 3.0K Jan 23 15:19 tokyo-metro.msgpack
with open("tokyo-metro.msgpack", "rb") as f:
    data_msgpack = f.read()
    data = msgpack.unpackb(data_msgpack)
list(data.keys())
['C', 'G', 'F', 'H', 'M', 'N', 'T', 'Y', 'Z']
import pickle
with open("tokyo-metro.pickle", "wb") as f:
    pickle.dump(data, f)
del data
!ls -lh tokyo-metro.pickle
-rw-r--r-- 1 carlosal1015 carlosal1015 5.2K Jan 23 15:19 tokyo-metro.pickle
with open("tokyo-metro.pickle", "rb") as f:
    data = pickle.load(f)
data.keys()
dict_keys(['C', 'G', 'F', 'H', 'M', 'N', 'T', 'Y', 'Z'])
- Johansson, R. (2024). Numerical Python: Scientific Computing and Data Science Applications with Numpy, SciPy and Matplotlib. Apress. 10.1007/979-8-8688-0413-7