Skip to article content

Pre-pre-school

Back to Article
Chapter 14: Statistical modelling
Download Notebook

Chapter 14: Statistical modelling

import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.graphics.api as smg
import patsy
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl

mpl.rcParams["mathtext.fontset"] = "stix"
mpl.rcParams["font.family"] = "serif"
mpl.rcParams["font.sans-serif"] = "stix"
import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
sns.set(style="whitegrid")

Statistical models and patsy formula

np.random.seed(123456789)
y = np.array([1, 2, 3, 4, 5])
x1 = np.array([6, 7, 8, 9, 10])
x2 = np.array([11, 12, 13, 14, 15])
X = np.vstack([np.ones(5), x1, x2, x1 * x2]).T
X
array([[ 1., 6., 11., 66.], [ 1., 7., 12., 84.], [ 1., 8., 13., 104.], [ 1., 9., 14., 126.], [ 1., 10., 15., 150.]])
beta, res, rank, sval = np.linalg.lstsq(X, y)
beta
array([-5.55555556e-01, 1.88888889e+00, -8.88888889e-01, -8.88900666e-16])
data = {"y": y, "x1": x1, "x2": x2}
y, X = patsy.dmatrices("y ~ 1 + x1 + x2 + x1*x2", data)
y
DesignMatrix with shape (5, 1) y 1 2 3 4 5 Terms: 'y' (column 0)
X
DesignMatrix with shape (5, 4) Intercept x1 x2 x1:x2 1 6 11 66 1 7 12 84 1 8 13 104 1 9 14 126 1 10 15 150 Terms: 'Intercept' (column 0) 'x1' (column 1) 'x2' (column 2) 'x1:x2' (column 3)
type(X)
patsy.design_info.DesignMatrix
np.array(X)
array([[ 1., 6., 11., 66.], [ 1., 7., 12., 84.], [ 1., 8., 13., 104.], [ 1., 9., 14., 126.], [ 1., 10., 15., 150.]])
df_data = pd.DataFrame(data)
y, X = patsy.dmatrices("y ~ 1 + x1 + x2 + x1:x2", df_data, return_type="dataframe")
X
Loading...
model = sm.OLS(y, X)
result = model.fit()
result.params
Intercept -5.555556e-01 x1 1.888889e+00 x2 -8.888889e-01 x1:x2 -1.110223e-15 dtype: float64
model = smf.ols("y ~ 1 + x1 + x2 + x1:x2", df_data)
result = model.fit()
result.params
Intercept -5.555556e-01 x1 1.888889e+00 x2 -8.888889e-01 x1:x2 -1.110223e-15 dtype: float64
print(result.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 7.401e+26
Date:                Fri, 23 Jan 2026   Prob (F-statistic):           1.35e-27
Time:                        15:03:47   Log-Likelihood:                 145.84
No. Observations:                   5   AIC:                            -285.7
Df Residuals:                       2   BIC:                            -286.9
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.5556   1.87e-13  -2.96e+12      0.000      -0.556      -0.556
x1             1.8889   6.99e-13    2.7e+12      0.000       1.889       1.889
x2            -0.8889   2.38e-13  -3.73e+12      0.000      -0.889      -0.889
x1:x2       -1.11e-15    2.2e-14     -0.051      0.964   -9.56e-14    9.34e-14
==============================================================================
Omnibus:                          nan   Durbin-Watson:                   0.003
Prob(Omnibus):                    nan   Jarque-Bera (JB):                0.645
Skew:                           0.375   Prob(JB):                        0.724
Kurtosis:                       1.408   Cond. No.                     9.62e+17
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 6.68e-32. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
/usr/lib/python3.14/site-packages/statsmodels/stats/stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 5 samples were given.
  warn("omni_normtest is not valid with less than 8 observations; %i "
beta
array([-5.55555556e-01, 1.88888889e+00, -8.88888889e-01, -8.88900666e-16])
from collections import defaultdict
data = defaultdict(lambda: np.array([1, 2, 3]))
patsy.dmatrices("y ~ a", data=data)[1].design_info.term_names
['Intercept', 'a']
patsy.dmatrices("y ~ 1 + a + b", data=data)[1].design_info.term_names
['Intercept', 'a', 'b']
patsy.dmatrices("y ~ -1 + a + b", data=data)[1].design_info.term_names
['a', 'b']
patsy.dmatrices("y ~ a * b", data=data)[1].design_info.term_names
['Intercept', 'a', 'b', 'a:b']
patsy.dmatrices("y ~ a * b * c", data=data)[1].design_info.term_names
['Intercept', 'a', 'b', 'a:b', 'c', 'a:c', 'b:c', 'a:b:c']
patsy.dmatrices("y ~ a * b * c - a:b:c", data=data)[1].design_info.term_names
['Intercept', 'a', 'b', 'a:b', 'c', 'a:c', 'b:c']
data = {k: np.array([]) for k in ["y", "a", "b", "c"]}
patsy.dmatrices("y ~ a + b", data=data)[1].design_info.term_names
['Intercept', 'a', 'b']
patsy.dmatrices("y ~ I(a + b)", data=data)[1].design_info.term_names
['Intercept', 'I(a + b)']
patsy.dmatrices("y ~ a*a", data=data)[1].design_info.term_names
['Intercept', 'a']
patsy.dmatrices("y ~ I(a**2)", data=data)[1].design_info.term_names
['Intercept', 'I(a ** 2)']
patsy.dmatrices("y ~ np.log(a) + b", data=data)[1].design_info.term_names
['Intercept', 'np.log(a)', 'b']
z = lambda x1, x2: x1 + x2
patsy.dmatrices("y ~ z(a, b)", data=data)[1].design_info.term_names
['Intercept', 'z(a, b)']

Categorical variables

data = {"y": [1, 2, 3], "a": [1, 2, 3]}
patsy.dmatrices("y ~ - 1 + a", data=data, return_type="dataframe")[1]
Loading...
patsy.dmatrices("y ~ - 1 + C(a)", data=data, return_type="dataframe")[1]
Loading...
data = {"y": [1, 2, 3], "a": ["type A", "type B", "type C"]}
patsy.dmatrices("y ~ - 1 + a", data=data, return_type="dataframe")[1]
Loading...
patsy.dmatrices("y ~ - 1 + C(a, Poly)", data=data, return_type="dataframe")[1]
Loading...

Linear regression

np.random.seed(123456789)
N = 100
x1 = np.random.randn(N)
x2 = np.random.randn(N)
data = pd.DataFrame({"x1": x1, "x2": x2})
def y_true(x1, x2):
    return 1 + 2 * x1 + 3 * x2 + 4 * x1 * x2
data["y_true"] = y_true(x1, x2)
e = np.random.randn(N)
data["y"] = data["y_true"] + e
data.head()
Loading...
fig, axes = plt.subplots(1, 2, figsize=(8, 4))

axes[0].scatter(data["x1"], data["y"])
axes[1].scatter(data["x2"], data["y"])

fig.tight_layout()
<Figure size 6000x3000 with 2 Axes>
data.shape
(100, 4)
model = smf.ols("y ~ x1 + x2", data)
result = model.fit()
print(result.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.380
Model:                            OLS   Adj. R-squared:                  0.367
Method:                 Least Squares   F-statistic:                     29.76
Date:                Fri, 23 Jan 2026   Prob (F-statistic):           8.36e-11
Time:                        15:03:49   Log-Likelihood:                -271.52
No. Observations:                 100   AIC:                             549.0
Df Residuals:                      97   BIC:                             556.9
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.9868      0.382      2.581      0.011       0.228       1.746
x1             1.0810      0.391      2.766      0.007       0.305       1.857
x2             3.0793      0.432      7.134      0.000       2.223       3.936
==============================================================================
Omnibus:                       19.951   Durbin-Watson:                   1.682
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               49.964
Skew:                          -0.660   Prob(JB):                     1.41e-11
Kurtosis:                       6.201   Cond. No.                         1.32
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
result.rsquared
np.float64(0.3802538325513255)
result.resid.head()
0 -3.370455 1 -11.153477 2 -11.721319 3 -0.948410 4 0.306215 dtype: float64
z, p = stats.normaltest(result.resid.values)
p
np.float64(4.6524990253009316e-05)
result.params
Intercept 0.986826 x1 1.081044 x2 3.079284 dtype: float64
fig, ax = plt.subplots(figsize=(8, 4))
smg.qqplot(result.resid, ax=ax)

fig.tight_layout()
fig.savefig("ch14-qqplot-model-1.pdf")
<Figure size 6000x3000 with 1 Axes>
model = smf.ols("y ~ x1 + x2 + x1*x2", data)
result = model.fit()
print(result.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.955
Model:                            OLS   Adj. R-squared:                  0.954
Method:                 Least Squares   F-statistic:                     684.5
Date:                Fri, 23 Jan 2026   Prob (F-statistic):           1.21e-64
Time:                        15:03:51   Log-Likelihood:                -140.01
No. Observations:                 100   AIC:                             288.0
Df Residuals:                      96   BIC:                             298.4
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.8706      0.103      8.433      0.000       0.666       1.076
x1             1.9693      0.108     18.160      0.000       1.754       2.185
x2             2.9670      0.117     25.466      0.000       2.736       3.198
x1:x2          3.9440      0.112     35.159      0.000       3.721       4.167
==============================================================================
Omnibus:                        2.950   Durbin-Watson:                   2.072
Prob(Omnibus):                  0.229   Jarque-Bera (JB):                2.734
Skew:                           0.327   Prob(JB):                        0.255
Kurtosis:                       2.521   Cond. No.                         1.38
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
result.params
Intercept 0.870620 x1 1.969345 x2 2.967004 x1:x2 3.943993 dtype: float64
result.rsquared
np.float64(0.9553393745884368)
z, p = stats.normaltest(result.resid.values)
p
np.float64(0.22874710482505198)
fig, ax = plt.subplots(figsize=(8, 4))
smg.qqplot(result.resid, ax=ax)

fig.tight_layout()
fig.savefig("ch14-qqplot-model-2.pdf")
<Figure size 6000x3000 with 1 Axes>
x = np.linspace(-1, 1, 50)
X1, X2 = np.meshgrid(x, x)
new_data = pd.DataFrame({"x1": X1.ravel(), "x2": X2.ravel()})
y_pred = result.predict(new_data)
y_pred.shape
(2500,)
y_pred = y_pred.values.reshape(50, 50)
fig, axes = plt.subplots(1, 2, figsize=(12, 5), sharey=True)


def plot_y_contour(ax, Y, title):
    c = ax.contourf(X1, X2, Y, 15, cmap=plt.cm.RdBu)
    ax.set_xlabel(r"$x_1$", fontsize=20)
    ax.set_ylabel(r"$x_2$", fontsize=20)
    ax.set_title(title)
    cb = fig.colorbar(c, ax=ax)
    cb.set_label(r"$y$", fontsize=20)


plot_y_contour(axes[0], y_true(X1, X2), "true relation")
plot_y_contour(axes[1], y_pred, "fitted model")

fig.tight_layout()
fig.savefig("ch14-comparison-model-true.pdf")
<Figure size 9000x3750 with 4 Axes>

Datasets from R

dataset = sm.datasets.get_rdataset("Icecream", "Ecdat")
dataset.title
'Ice Cream Consumption'
print(dataset.__doc__)
.. container::

   .. container::

      ======== ===============
      Icecream R Documentation
      ======== ===============

      .. rubric:: Ice Cream Consumption
         :name: ice-cream-consumption

      .. rubric:: Description
         :name: description

      four–weekly observations from 1951–03–18 to 1953–07–11

      *number of observations* : 30

      *observation* : country

      *country* : United States

      .. rubric:: Usage
         :name: usage

      .. code:: R

         data(Icecream)

      .. rubric:: Format
         :name: format

      A time series containing :

      cons
         consumption of ice cream per head (in pints);

      income
         average family income per week (in US Dollars);

      price
         price of ice cream (per pint);

      temp
         average temperature (in Fahrenheit);

      .. rubric:: Source
         :name: source

      Hildreth, C. and J. Lu (1960) *Demand relations with
      autocorrelated disturbances*, Technical Bulletin No 2765, Michigan
      State University.

      .. rubric:: References
         :name: references

      Verbeek, Marno (2004) *A Guide to Modern Econometrics*, John Wiley
      and Sons, chapter 4.

      .. rubric:: See Also
         :name: see-also

      ``Index.Source``, ``Index.Economics``, ``Index.Econometrics``,
      ``Index.Observations``,

      ``Index.Time.Series``

dataset.data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   cons    30 non-null     float64
 1   income  30 non-null     int64  
 2   price   30 non-null     float64
 3   temp    30 non-null     int64  
dtypes: float64(2), int64(2)
memory usage: 1.1 KB
model = smf.ols("cons ~ -1 + price + temp", data=dataset.data)
result = model.fit()
print(result.summary())
                                 OLS Regression Results                                
=======================================================================================
Dep. Variable:                   cons   R-squared (uncentered):                   0.986
Model:                            OLS   Adj. R-squared (uncentered):              0.985
Method:                 Least Squares   F-statistic:                              1001.
Date:                Fri, 23 Jan 2026   Prob (F-statistic):                    9.03e-27
Time:                        15:03:59   Log-Likelihood:                          51.903
No. Observations:                  30   AIC:                                     -99.81
Df Residuals:                      28   BIC:                                     -97.00
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
price          0.7254      0.093      7.805      0.000       0.535       0.916
temp           0.0032      0.000      6.549      0.000       0.002       0.004
==============================================================================
Omnibus:                        5.350   Durbin-Watson:                   0.637
Prob(Omnibus):                  0.069   Jarque-Bera (JB):                3.675
Skew:                           0.776   Prob(JB):                        0.159
Kurtosis:                       3.729   Cond. No.                         593.
==============================================================================

Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

smg.plot_fit(result, 0, ax=ax1)
smg.plot_fit(result, 1, ax=ax2)

fig.tight_layout()
fig.savefig("ch14-regressionplots.pdf")
<Figure size 9000x3000 with 2 Axes>
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

sns.regplot(dataset.data, x="price", y="cons", ax=ax1)
sns.regplot(dataset.data, x="temp", y="cons", ax=ax2)
fig.tight_layout()
fig.savefig("ch14-regressionplots-seaborn.pdf")
<Figure size 9000x3000 with 2 Axes>

Discrete regression, logistic regression

df = sm.datasets.get_rdataset("iris").data
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Sepal.Length  150 non-null    float64
 1   Sepal.Width   150 non-null    float64
 2   Petal.Length  150 non-null    float64
 3   Petal.Width   150 non-null    float64
 4   Species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
df.Species.unique()
array(['setosa', 'versicolor', 'virginica'], dtype=object)
df_subset = df[(df.Species == "versicolor") | (df.Species == "virginica")].copy()
df_subset = df[df.Species.isin(["versicolor", "virginica"])].copy()
df_subset.Species = df_subset.Species.map({"versicolor": 1, "virginica": 0})
df_subset.rename(
    columns={
        "Sepal.Length": "Sepal_Length",
        "Sepal.Width": "Sepal_Width",
        "Petal.Length": "Petal_Length",
        "Petal.Width": "Petal_Width",
    },
    inplace=True,
)
df_subset.head(3)
Loading...
model = smf.logit(
    "Species ~ Sepal_Length + Sepal_Width + Petal_Length + Petal_Width", data=df_subset
)
result = model.fit()
Optimization terminated successfully.
         Current function value: 0.059493
         Iterations 12
print(result.summary())
                           Logit Regression Results                           
==============================================================================
Dep. Variable:                Species   No. Observations:                  100
Model:                          Logit   Df Residuals:                       95
Method:                           MLE   Df Model:                            4
Date:                Fri, 23 Jan 2026   Pseudo R-squ.:                  0.9142
Time:                        15:04:10   Log-Likelihood:                -5.9493
converged:                       True   LL-Null:                       -69.315
Covariance Type:            nonrobust   LLR p-value:                 1.947e-26
================================================================================
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       42.6378     25.708      1.659      0.097      -7.748      93.024
Sepal_Length     2.4652      2.394      1.030      0.303      -2.228       7.158
Sepal_Width      6.6809      4.480      1.491      0.136      -2.099      15.461
Petal_Length    -9.4294      4.737     -1.990      0.047     -18.714      -0.145
Petal_Width    -18.2861      9.743     -1.877      0.061     -37.381       0.809
================================================================================

Possibly complete quasi-separation: A fraction 0.60 of observations can be
perfectly predicted. This might indicate that there is complete
quasi-separation. In this case some parameters will not be identified.
print(result.get_margeff().summary())
        Logit Marginal Effects       
=====================================
Dep. Variable:                Species
Method:                          dydx
At:                           overall
================================================================================
                  dy/dx    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
Sepal_Length     0.0445      0.038      1.163      0.245      -0.031       0.120
Sepal_Width      0.1207      0.064      1.891      0.059      -0.004       0.246
Petal_Length    -0.1703      0.057     -2.965      0.003      -0.283      -0.058
Petal_Width     -0.3303      0.110     -2.998      0.003      -0.546      -0.114
================================================================================

Note: Sepal_Length and Sepal_Width do not seem to contribute much to predictiveness of the model.

model = smf.logit("Species ~ Petal_Length + Petal_Width", data=df_subset)
result = model.fit()
Optimization terminated successfully.
         Current function value: 0.102818
         Iterations 10
print(result.summary())
                           Logit Regression Results                           
==============================================================================
Dep. Variable:                Species   No. Observations:                  100
Model:                          Logit   Df Residuals:                       97
Method:                           MLE   Df Model:                            2
Date:                Fri, 23 Jan 2026   Pseudo R-squ.:                  0.8517
Time:                        15:04:10   Log-Likelihood:                -10.282
converged:                       True   LL-Null:                       -69.315
Covariance Type:            nonrobust   LLR p-value:                 2.303e-26
================================================================================
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       45.2723     13.612      3.326      0.001      18.594      71.951
Petal_Length    -5.7545      2.306     -2.496      0.013     -10.274      -1.235
Petal_Width    -10.4467      3.756     -2.782      0.005     -17.808      -3.086
================================================================================

Possibly complete quasi-separation: A fraction 0.34 of observations can be
perfectly predicted. This might indicate that there is complete
quasi-separation. In this case some parameters will not be identified.
print(result.get_margeff().summary())
        Logit Marginal Effects       
=====================================
Dep. Variable:                Species
Method:                          dydx
At:                           overall
================================================================================
                  dy/dx    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
Petal_Length    -0.1736      0.052     -3.347      0.001      -0.275      -0.072
Petal_Width     -0.3151      0.068     -4.608      0.000      -0.449      -0.181
================================================================================
params = result.params
beta0 = -params["Intercept"] / params["Petal_Width"]
beta1 = -params["Petal_Length"] / params["Petal_Width"]
df_new = pd.DataFrame(
    {
        "Petal_Length": np.random.randn(20) * 0.5 + 5,
        "Petal_Width": np.random.randn(20) * 0.5 + 1.7,
    }
)
df_new["P-Species"] = result.predict(df_new)
df_new["P-Species"].head(3)
0 0.995472 1 0.799899 2 0.000033 Name: P-Species, dtype: float64
df_new["Species"] = (df_new["P-Species"] > 0.5).astype(int)
df_new.head()
Loading...
fig, ax = plt.subplots(1, 1, figsize=(8, 4))

ax.plot(
    df_subset[df_subset.Species == 0].Petal_Length.values,
    df_subset[df_subset.Species == 0].Petal_Width.values,
    "s",
    label="virginica",
)
ax.plot(
    df_new[df_new.Species == 0].Petal_Length.values,
    df_new[df_new.Species == 0].Petal_Width.values,
    "o",
    markersize=10,
    color="steelblue",
    label="virginica (pred.)",
)

ax.plot(
    df_subset[df_subset.Species == 1].Petal_Length.values,
    df_subset[df_subset.Species == 1].Petal_Width.values,
    "s",
    label="versicolor",
)
ax.plot(
    df_new[df_new.Species == 1].Petal_Length.values,
    df_new[df_new.Species == 1].Petal_Width.values,
    "o",
    markersize=10,
    color="green",
    label="versicolor (pred.)",
)

_x = np.array([4.0, 6.1])
ax.plot(_x, beta0 + beta1 * _x, "k")

ax.set_xlabel("Petal length")
ax.set_ylabel("Petal width")
ax.legend(loc=2)
fig.tight_layout()
fig.savefig("ch14-logit.pdf")
<Figure size 6000x3000 with 1 Axes>

Poisson distribution

dataset = sm.datasets.get_rdataset("discoveries")
df = dataset.data.set_index("time").rename(columns={"value": "discoveries"})
df.head(10).T
Loading...
fig, ax = plt.subplots(1, 1, figsize=(16, 4))
df.plot(kind="bar", ax=ax)
fig.tight_layout()
fig.savefig("ch14-discoveries.pdf")
<Figure size 12000x3000 with 1 Axes>
model = smf.poisson("discoveries ~ 1", data=df)
result = model.fit()
Optimization terminated successfully.
         Current function value: 2.168457
         Iterations 1
print(result.summary())
                          Poisson Regression Results                          
==============================================================================
Dep. Variable:            discoveries   No. Observations:                  100
Model:                        Poisson   Df Residuals:                       99
Method:                           MLE   Df Model:                            0
Date:                Fri, 23 Jan 2026   Pseudo R-squ.:                   0.000
Time:                        15:04:57   Log-Likelihood:                -216.85
converged:                       True   LL-Null:                       -216.85
Covariance Type:            nonrobust   LLR p-value:                       nan
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.1314      0.057     19.920      0.000       1.020       1.243
==============================================================================
lmbda = np.exp(result.params)
X = stats.poisson(lmbda)
result.conf_int()
Loading...
X_ci_l = stats.poisson(np.exp(result.conf_int().values)[0, 0])
X_ci_u = stats.poisson(np.exp(result.conf_int().values)[0, 1])
v, k = np.histogram(df.values, bins=12, range=(0, 12), density=True)
fig, ax = plt.subplots(1, 1, figsize=(12, 4))
ax.bar(k[:-1], v, color="steelblue", align="center", label="Dicoveries per year")
ax.bar(
    k - 0.125,
    X_ci_l.pmf(k),
    color="red",
    alpha=0.5,
    align="center",
    width=0.25,
    label="Poisson fit (CI, lower)",
)
ax.bar(k, X.pmf(k), color="green", align="center", width=0.5, label="Poisson fit")
ax.bar(
    k + 0.125,
    X_ci_u.pmf(k),
    color="red",
    alpha=0.5,
    align="center",
    width=0.25,
    label="Poisson fit (CI, upper)",
)

ax.legend()
fig.tight_layout()
fig.savefig("ch14-discoveries-per-year.pdf")
<Figure size 9000x3000 with 1 Axes>

Time series

df = pd.read_csv(
    "temperature_outdoor_2014.tsv", header=None, delimiter="\t", names=["time", "temp"]
)
df.time = pd.to_datetime(df.time, unit="s")
df = df.set_index("time").resample("H").mean()
/tmp/ipykernel_50130/3892922595.py:5: FutureWarning: 'H' is deprecated and will be removed in a future version, please use 'h' instead.
  df = df.set_index("time").resample("H").mean()
df_march = df[df.index.month == 3]
df_april = df[df.index.month == 4]
df_march.plot(figsize=(12, 4));
<Figure size 9000x3000 with 1 Axes>
fig, axes = plt.subplots(1, 4, figsize=(12, 3))
smg.tsa.plot_acf(df_march.temp, lags=72, ax=axes[0])
smg.tsa.plot_acf(df_march.temp.diff().dropna(), lags=72, ax=axes[1])
smg.tsa.plot_acf(df_march.temp.diff().diff().dropna(), lags=72, ax=axes[2])
smg.tsa.plot_acf(df_march.temp.diff().diff().diff().dropna(), lags=72, ax=axes[3])
fig.tight_layout()
fig.savefig("ch14-timeseries-autocorrelation.pdf")
<Figure size 9000x2250 with 4 Axes>
from statsmodels.tsa import ar_model, arima_model
sm.tsa.AR
statsmodels.tsa.ar_model.AR
# help(sm.tsa)
model = ar_model.AutoReg(df_march.temp, lags=72)
# model = sm.tsa.AR(df_march.temp)
result = model.fit()
sm.stats.durbin_watson(result.resid)
np.float64(1.9985623006353086)
fig, ax = plt.subplots(1, 1, figsize=(8, 3))
smg.tsa.plot_acf(result.resid, lags=72, ax=ax)
fig.tight_layout()
fig.savefig("ch14-timeseries-resid-acf.pdf")
<Figure size 6000x2250 with 1 Axes>
fig, ax = plt.subplots(1, 1, figsize=(12, 4))
ax.plot(df_march.index.values[-72:], df_march.temp.values[-72:], label="train data")
ax.plot(df_april.index.values[:72], df_april.temp.values[:72], label="actual outcome")
ax.plot(
    pd.date_range("2014-04-01", "2014-04-4", freq="H").values,
    result.predict("2014-04-01", "2014-04-4"),
    label="predicted outcome",
)

ax.legend()
fig.tight_layout()
fig.savefig("ch14-timeseries-prediction.pdf")
/tmp/ipykernel_50130/2609899811.py:5: FutureWarning: 'H' is deprecated and will be removed in a future version, please use 'h' instead.
  pd.date_range("2014-04-01", "2014-04-4", freq="H").values,
<Figure size 9000x3000 with 1 Axes>
# Using ARMA model on daily average temperatures
df_march = df_march.resample("D").mean()
df_april = df_april.resample("D").mean()
import statsmodels.tsa.arima
statsmodels.tsa.arima.model.ARIMA
statsmodels.tsa.arima.model.ARIMA
# model = sm.tsa.ARMA(df_march, (4, 1))
model = statsmodels.tsa.arima.model.ARIMA(df_march.temp, order=(4, 0, 1))
result = model.fit()
fig, ax = plt.subplots(1, 1, figsize=(12, 4))
ax.plot(df_march.index.values[-7:], df_march.temp.values[-7:], "s-", label="train data")
ax.plot(
    df_april.index.values[:7], df_april.temp.values[:7], "s-", label="actual outcome"
)
ax.plot(
    pd.date_range("2014-04-01", "2014-04-7").values,
    result.predict("2014-04-01", "2014-04-7"),
    "s-",
    label="predicted outcome",
)
ax.legend()
fig.tight_layout()
<Figure size 9000x3000 with 1 Axes>
References
  1. Johansson, R. (2024). Numerical Python: Scientific Computing and Data Science Applications with Numpy, SciPy and Matplotlib. Apress. 10.1007/979-8-8688-0413-7