Forecasting count data¶
Many timeseries are composed of counts, which are non-negative integers. For example, the number of cars that pass through a toll booth in a given hour, the number of people who visit a website in a given day, or the number of sales of a product. The original Prophet model struggles to handle this type of data, as it assumes that the data is continuous and normally distributed. In this tutorial, we will show you how to use the prophetverse
library to model count data, with a prophet-like model that uses Negative Binomial likelihood.
# Disable warnings
import warnings
warnings.simplefilter(action="ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
from sktime.transformations.series.fourier import FourierFeatures
from sktime.forecasting.compose import ForecastingPipeline
from numpyro import distributions as dist
Import dataset¶
We use a dataset Melbourne Pedestrian Counts
from forecastingdata, which contains the hourly pedestrian counts in Melbourne, Australia, from a set of sensors located in different parts of the city.
from sktime.datasets import load_forecastingdata
def _parse_data(df):
dfs = []
# iterrows
for index, row in df.iterrows():
_df = pd.DataFrame(
data={
"pedestrian_count": row["series_value"],
"date": pd.period_range(
row["start_timestamp"], periods=len(row["series_value"]), freq="H"
),
},
)
_df["series_name"] = row["series_name"]
dfs.append(_df)
return pd.concat(dfs).set_index(["series_name", "date"])
def load_dataset():
df, _ = load_forecastingdata("pedestrian_counts_dataset")
return _parse_data(df)
y = load_dataset()
# We take only one time series for simplicity
y = y.loc["T2"]
split_index = 24*365
y_train, y_test = y.iloc[:split_index], y.iloc[split_index+1:split_index*2+1]
y_train.head()
pedestrian_count | |
---|---|
date | |
2009-05-01 00:00 | 52.0 |
2009-05-01 01:00 | 34.0 |
2009-05-01 02:00 | 19.0 |
2009-05-01 03:00 | 14.0 |
2009-05-01 04:00 | 15.0 |
y_train.iloc[:24*21].plot(figsize=(10, 3), marker="o", color="black", legend=True)
<Axes: xlabel='date'>
ax = y_train["pedestrian_count"].rename("Train").plot(figsize=(20,7))
y_test["pedestrian_count"].rename("Test").plot(ax=ax)
ax.legend()
plt.show()
Fitting models¶
The series has some clear patterns: a daily seasonality, a weekly seasonality, and a yearly seasonality. It also has many zeros, and a model assuming normal distributed observations would not be able to capture this. First, let's fit and forecast with the standard prophet, then see how the negative binomial model performs.
Prophet with normal likelihood (bad for count data)¶
from prophetverse.sktime import Prophetverse
from prophetverse.effects.fourier import LinearFourierSeasonality
from prophetverse.utils.regex import no_input_columns
# Here we set the prior for the seasonality effect
# And the coefficients for it
exogenous_effects = [
(
"seasonality",
LinearFourierSeasonality(
sp_list=[24, 24 * 7, 24 * 365.5],
fourier_terms_list=[2, 2, 10],
freq="H",
prior_scale=0.5,
effect_mode="multiplicative",
),
no_input_columns,
),
]
model = Prophetverse(
trend="flat",
changepoint_interval=300,
changepoint_prior_scale=0.001,
exogenous_effects=exogenous_effects,
noise_scale=0.05,
optimizer_steps=20000,
optimizer_name="Adam",
optimizer_kwargs={"step_size": 0.001},
inference_method="map",
mcmc_warmup=500,
mcmc_samples=2000,
likelihood="normal",
)
model.fit(y=y_train)
100%|██████████| 20000/20000 [00:02<00:00, 7476.35it/s, init loss: 28421.7793, avg. loss [19001-20000]: -11899.9804]
Prophetverse(changepoint_interval=300, exogenous_effects=[('seasonality', LinearFourierSeasonality(effect_mode='multiplicative', fourier_terms_list=[2, 2, 10], freq='H', prior_scale=0.5, sp_list=[24, 168, 8772.0]), '^$')], mcmc_warmup=500, optimizer_kwargs={'step_size': 0.001}, optimizer_steps=20000, trend='flat')Please rerun this cell to show the HTML repr or trust the notebook.
Prophetverse(changepoint_interval=300, exogenous_effects=[('seasonality', LinearFourierSeasonality(effect_mode='multiplicative', fourier_terms_list=[2, 2, 10], freq='H', prior_scale=0.5, sp_list=[24, 168, 8772.0]), '^$')], mcmc_warmup=500, optimizer_kwargs={'step_size': 0.001}, optimizer_steps=20000, trend='flat')
LinearFourierSeasonality(effect_mode='multiplicative', fourier_terms_list=[2, 2, 10], freq='H', prior_scale=0.5, sp_list=[24, 168, 8772.0])
forecast_horizon = y_train.index[-100:].union(y_test.index[:300])
fig, ax = plt.subplots(figsize=(10, 3))
preds_normal = model.predict(fh=forecast_horizon)
preds_normal["pedestrian_count"].rename("Normal model").plot.line(
ax=ax, legend=False, color="tab:blue"
)
ax.scatter(y_train.index, y_train, marker="o", color="k", s=2, alpha=0.5, label="Train")
ax.scatter(y_test.index, y_test, marker="o", color="green", s=2, alpha=0.5, label="Test")
ax.set_title("Prophet with normal likelihood")
ax.legend()
fig.show()
quantiles = model.predict_quantiles(fh=forecast_horizon, alpha=[0.1, 0.9])
fig, ax = plt.subplots(figsize=(10, 3))
# Plot area between quantiles
ax.fill_between(quantiles.index.to_timestamp(), quantiles.iloc[:, 0], quantiles.iloc[:, -1], alpha=0.5)
ax.scatter(forecast_horizon.to_timestamp(), y.loc[forecast_horizon], marker="o", color="k", s=2, alpha=1)
ax.axvline(y_train.index[-1].to_timestamp(), color="r", linestyle="--")
fig.show()
The model forecasts some negative values - which don't make sense. Damn you, normal distribution! Let's try the negative binomial model.
Prophet with negative binomial likelihood¶
model.set_params(likelihood="negbinomial")
model.fit(y=y_train)
100%|██████████| 20000/20000 [00:19<00:00, 1046.00it/s, init loss: 27705970.0000, avg. loss [19001-20000]: 59806.3395]
Prophetverse(changepoint_interval=300, exogenous_effects=[('seasonality', LinearFourierSeasonality(effect_mode='multiplicative', fourier_terms_list=[2, 2, 10], freq='H', prior_scale=0.5, sp_list=[24, 168, 8772.0]), '^$')], likelihood='negbinomial', mcmc_warmup=500, optimizer_kwargs={'step_size': 0.001}, optimizer_steps=20000, trend='flat')Please rerun this cell to show the HTML repr or trust the notebook.
Prophetverse(changepoint_interval=300, exogenous_effects=[('seasonality', LinearFourierSeasonality(effect_mode='multiplicative', fourier_terms_list=[2, 2, 10], freq='H', prior_scale=0.5, sp_list=[24, 168, 8772.0]), '^$')], likelihood='negbinomial', mcmc_warmup=500, optimizer_kwargs={'step_size': 0.001}, optimizer_steps=20000, trend='flat')
LinearFourierSeasonality(effect_mode='multiplicative', fourier_terms_list=[2, 2, 10], freq='H', prior_scale=0.5, sp_list=[24, 168, 8772.0])
fig, ax = plt.subplots(figsize=(10, 3))
preds_negbin = model.predict(fh=forecast_horizon)
preds_negbin["pedestrian_count"].rename("Neg. Binomial model").plot.line(
ax=ax, legend=False, color="tab:purple"
)
ax.scatter(y_train.index, y_train, marker="o", color="k", s=2, alpha=0.5, label="Train")
ax.scatter(
y_test.index, y_test, marker="o", color="green", s=2, alpha=0.5, label="Test"
)
ax.set_title("Prophet with Negative Binomial likelihood")
ax.legend()
fig.show()
quantiles = model.predict_quantiles(fh=forecast_horizon, alpha=[0.1, 0.9])
fig, ax = plt.subplots(figsize=(10, 3))
# Plot area between quantiles
ax.fill_between(
quantiles.index.to_timestamp(),
quantiles.iloc[:, 0],
quantiles.iloc[:, -1],
alpha=0.5,
)
ax.scatter(
forecast_horizon.to_timestamp(),
y.loc[forecast_horizon],
marker="o",
color="k",
s=2,
alpha=1,
)
ax.axvline(y_train.index[-1].to_timestamp(), color="r", linestyle="--")
fig.show()
Comparing both forecasts side by side¶
fig, ax = plt.subplots(figsize=(9, 5))
preds_negbin["pedestrian_count"].rename("Neg. Binomial model").plot.line(
ax=ax, legend=False, color="tab:purple"
)
preds_normal["pedestrian_count"].rename("Normal model").plot.line(
ax=ax, legend=False, color="tab:blue"
)
ax.scatter(y_train.index, y_train, marker="o", color="k", s=6, alpha=0.5, label="Train")
ax.scatter(
y_test.index, y_test, marker="o", color="green", s=6, alpha=0.5, label="Test"
)
ax.set_title("Forecasting pedestrian counts")
# Remove xlabel
ax.set_xlabel("")
ax.axvline(y_train.index[-1].to_timestamp(), color="black", linestyle="--", alpha=0.3, zorder=-1)
fig.legend(loc="center", ncol=4, bbox_to_anchor=(0.5, 0.8))
fig.tight_layout()
fig.show()