第9章 モデルへの外部変数の追加
In [12]:
Copied!
from typing import Union
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import product
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.stats.diagnostic import acorr_ljungbox
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')
from typing import Union
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import product
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.stats.diagnostic import acorr_ljungbox
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')
SARIMAXモデルを調べる¶
In [2]:
Copied!
df_macro = sm.datasets.macrodata.load_pandas().data
df_macro.index = pd.PeriodIndex(
df_macro.apply(lambda ser: f"{int(ser['year'])}Q{int(ser['quarter'])}", axis=1),
freq='Q', name='date'
)
df_macro.drop(columns=['year', 'quarter'], inplace=True)
df_macro.head(10)
df_macro = sm.datasets.macrodata.load_pandas().data
df_macro.index = pd.PeriodIndex(
df_macro.apply(lambda ser: f"{int(ser['year'])}Q{int(ser['quarter'])}", axis=1),
freq='Q', name='date'
)
df_macro.drop(columns=['year', 'quarter'], inplace=True)
df_macro.head(10)
Out[2]:
realgdp | realcons | realinv | realgovt | realdpi | cpi | m1 | tbilrate | unemp | pop | infl | realint | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
date | ||||||||||||
1959Q1 | 2710.349 | 1707.4 | 286.898 | 470.045 | 1886.9 | 28.98 | 139.7 | 2.82 | 5.8 | 177.146 | 0.00 | 0.00 |
1959Q2 | 2778.801 | 1733.7 | 310.859 | 481.301 | 1919.7 | 29.15 | 141.7 | 3.08 | 5.1 | 177.830 | 2.34 | 0.74 |
1959Q3 | 2775.488 | 1751.8 | 289.226 | 491.260 | 1916.4 | 29.35 | 140.5 | 3.82 | 5.3 | 178.657 | 2.74 | 1.09 |
1959Q4 | 2785.204 | 1753.7 | 299.356 | 484.052 | 1931.3 | 29.37 | 140.0 | 4.33 | 5.6 | 179.386 | 0.27 | 4.06 |
1960Q1 | 2847.699 | 1770.5 | 331.722 | 462.199 | 1955.5 | 29.54 | 139.6 | 3.50 | 5.2 | 180.007 | 2.31 | 1.19 |
1960Q2 | 2834.390 | 1792.9 | 298.152 | 460.400 | 1966.1 | 29.55 | 140.2 | 2.68 | 5.2 | 180.671 | 0.14 | 2.55 |
1960Q3 | 2839.022 | 1785.8 | 296.375 | 474.676 | 1967.8 | 29.75 | 140.9 | 2.36 | 5.6 | 181.528 | 2.70 | -0.34 |
1960Q4 | 2802.616 | 1788.2 | 259.764 | 476.434 | 1966.6 | 29.84 | 141.1 | 2.29 | 6.3 | 182.287 | 1.21 | 1.08 |
1961Q1 | 2819.264 | 1787.7 | 266.405 | 475.854 | 1984.5 | 29.81 | 142.1 | 2.37 | 6.8 | 182.992 | -0.40 | 2.77 |
1961Q2 | 2872.005 | 1814.3 | 286.246 | 480.328 | 2014.4 | 29.92 | 142.9 | 2.29 | 7.0 | 183.691 | 1.47 | 0.81 |
変数 | 説明 |
---|---|
realgdp | 実質国内総生産(目的変数または内生変数) |
realcons | 実質個人消費支出 |
realinv | 実質民間国内総投資額 |
realgovt | 連邦政府の実質消費支出および投資額 |
realdpi | 実質個人可処分所得 |
cpi | 四半期末消費者物価指数 |
m1 | M1名目貨幣供給量 |
tbilrate | 3ヶ月物短期国債の四半期月次平均 |
unemp | 季節調整失業率 |
pop | 四半期末人口 |
infl | インフレ率 |
realint | 実質金利率 |
In [3]:
Copied!
fig, axes = plt.subplots(2, 3, figsize=(11, 6), sharex=True)
axes = axes.flatten()
for i in range(6):
df_macro.iloc[:, i].plot(ax=axes[i])
axes[i].set_title(df_macro.columns[i])
fig.tight_layout()
fig, axes = plt.subplots(2, 3, figsize=(11, 6), sharex=True)
axes = axes.flatten()
for i in range(6):
df_macro.iloc[:, i].plot(ax=axes[i])
axes[i].set_title(df_macro.columns[i])
fig.tight_layout()
SARIMAXモデルを使って実質GDPを予測する¶
In [29]:
Copied!
# 目的変数、外生変数を定義
# シンプルのため外生変数は5つに制限
target = df_macro[['realgdp']]
exog = df_macro.iloc[:, 1:6]
# train, testにわける
target_train = target[:200]
exog_train = exog[:200]
# 目的変数、外生変数を定義
# シンプルのため外生変数は5つに制限
target = df_macro[['realgdp']]
exog = df_macro.iloc[:, 1:6]
# train, testにわける
target_train = target[:200]
exog_train = exog[:200]
In [31]:
Copied!
# 定常性の確認
for d in [0, 1]:
ADF_result = adfuller(np.diff(target['realgdp'], n=d))
print(f'実質GDP({d}次差分)ADF検定')
print(f'ADF Statistic: {ADF_result[0]:.3f}')
print(f'p-value: {ADF_result[1]:.3f}')
# 定常性の確認
for d in [0, 1]:
ADF_result = adfuller(np.diff(target['realgdp'], n=d))
print(f'実質GDP({d}次差分)ADF検定')
print(f'ADF Statistic: {ADF_result[0]:.3f}')
print(f'p-value: {ADF_result[1]:.3f}')
実質GDP(0次差分)ADF検定 ADF Statistic: 1.750 p-value: 0.998 実質GDP(1次差分)ADF検定 ADF Statistic: -6.306 p-value: 0.000
In [32]:
Copied!
# 一意なSARIMAXモデルをすべて適合させる関数
def optimize_SARIMAX(
endog: Union[pd.Series, list],
exog: Union[pd.Series, list],
orders: list,
d: int,
D: int,
s: int
) -> pd.DataFrame:
"""
endog: 時系列データ
exog: 外生変数
orders: SARIMAX(p, d, q)のpとqの値の組み合わせ
d: 和分次数
D: 季節差分の次数
s: 頻度
"""
res = []
for order in tqdm(orders):
try:
model = SARIMAX(
endog,
order=(order[0], d, order[1]),
seasonal_order=(order[2], D, order[3], s),
simple_differencing=False
).fit(disp=False)
res.append([order, model.aic])
except:
continue
df_res = (
pd.DataFrame(res, columns=['(p,q,P,Q)', 'AIC'])
.sort_values(by='AIC', ascending=True)
.reset_index(drop=True)
)
return df_res
# 一意なSARIMAXモデルをすべて適合させる関数
def optimize_SARIMAX(
endog: Union[pd.Series, list],
exog: Union[pd.Series, list],
orders: list,
d: int,
D: int,
s: int
) -> pd.DataFrame:
"""
endog: 時系列データ
exog: 外生変数
orders: SARIMAX(p, d, q)のpとqの値の組み合わせ
d: 和分次数
D: 季節差分の次数
s: 頻度
"""
res = []
for order in tqdm(orders):
try:
model = SARIMAX(
endog,
order=(order[0], d, order[1]),
seasonal_order=(order[2], D, order[3], s),
simple_differencing=False
).fit(disp=False)
res.append([order, model.aic])
except:
continue
df_res = (
pd.DataFrame(res, columns=['(p,q,P,Q)', 'AIC'])
.sort_values(by='AIC', ascending=True)
.reset_index(drop=True)
)
return df_res
In [33]:
Copied!
d = 1
D = 0
s = 4
orders = list(product(range(4), repeat=4)) # p,q,P,Qをそれぞれ0~4まで動かす
optimize_SARIMAX(target_train, exog_train, orders, d, D, s)
d = 1
D = 0
s = 4
orders = list(product(range(4), repeat=4)) # p,q,P,Qをそれぞれ0~4まで動かす
optimize_SARIMAX(target_train, exog_train, orders, d, D, s)
0%| | 0/256 [00:00<?, ?it/s]
Out[33]:
(p,q,P,Q) | AIC | |
---|---|---|
0 | (0, 1, 2, 3) | 266.143045 |
1 | (3, 2, 3, 0) | 2145.460369 |
2 | (3, 3, 1, 1) | 2145.820653 |
3 | (3, 3, 0, 1) | 2145.979871 |
4 | (3, 1, 3, 0) | 2146.012430 |
... | ... | ... |
247 | (0, 0, 0, 2) | 2234.293056 |
248 | (0, 0, 0, 3) | 2235.079581 |
249 | (0, 1, 0, 0) | 2242.318380 |
250 | (0, 0, 0, 1) | 2250.954180 |
251 | (0, 0, 0, 0) | 2297.594527 |
252 rows × 2 columns
In [34]:
Copied!
# (p,q,P,Q)=(0,1,2,3)がなぜか一番小さい
# オーダーも小さすぎる気がするので無視する
model = SARIMAX(target_train, exog_train, order=(3,1,3), seasonal_order=(0,0,0,4), simple_differencing=False).fit(disp=False)
print(model.summary())
# (p,q,P,Q)=(0,1,2,3)がなぜか一番小さい
# オーダーも小さすぎる気がするので無視する
model = SARIMAX(target_train, exog_train, order=(3,1,3), seasonal_order=(0,0,0,4), simple_differencing=False).fit(disp=False)
print(model.summary())
SARIMAX Results ============================================================================== Dep. Variable: realgdp No. Observations: 200 Model: SARIMAX(3, 1, 3) Log Likelihood -859.412 Date: Fri, 17 Nov 2023 AIC 1742.824 Time: 20:11:00 BIC 1782.344 Sample: 03-31-1959 HQIC 1758.819 - 12-31-2008 Covariance Type: opg ============================================================================== coef std err z P>|z| [0.025 0.975] ------------------------------------------------------------------------------ realcons 0.9708 0.045 21.505 0.000 0.882 1.059 realinv 1.0132 0.033 30.693 0.000 0.948 1.078 realgovt 0.7283 0.127 5.728 0.000 0.479 0.977 realdpi 0.0102 0.025 0.408 0.683 -0.039 0.059 cpi 5.8719 1.299 4.520 0.000 3.326 8.418 ar.L1 1.0636 0.402 2.646 0.008 0.276 1.852 ar.L2 0.4957 0.708 0.700 0.484 -0.892 1.883 ar.L3 -0.6766 0.340 -1.990 0.047 -1.343 -0.010 ma.L1 -1.1028 0.434 -2.541 0.011 -1.953 -0.252 ma.L2 -0.3284 0.775 -0.424 0.672 -1.847 1.191 ma.L3 0.6518 0.407 1.600 0.110 -0.146 1.450 sigma2 331.2909 30.734 10.779 0.000 271.053 391.529 =================================================================================== Ljung-Box (L1) (Q): 0.01 Jarque-Bera (JB): 13.34 Prob(Q): 0.92 Prob(JB): 0.00 Heteroskedasticity (H): 3.57 Skew: 0.32 Prob(H) (two-sided): 0.00 Kurtosis: 4.10 =================================================================================== Warnings: [1] Covariance matrix calculated using the outer product of gradients (complex-step).
In [35]:
Copied!
model.plot_diagnostics(figsize=(10, 8))
plt.tight_layout()
model.plot_diagnostics(figsize=(10, 8))
plt.tight_layout()
In [36]:
Copied!
# 残差が無相関であることが確認できる
acorr_ljungbox(model.resid, np.arange(1, 11))
# 残差が無相関であることが確認できる
acorr_ljungbox(model.resid, np.arange(1, 11))
Out[36]:
lb_stat | lb_pvalue | |
---|---|---|
1 | 0.092071 | 0.761560 |
2 | 0.198871 | 0.905348 |
3 | 0.200938 | 0.977438 |
4 | 1.257216 | 0.868591 |
5 | 1.258003 | 0.939193 |
6 | 1.285134 | 0.972471 |
7 | 1.476883 | 0.983099 |
8 | 1.486838 | 0.992924 |
9 | 1.859312 | 0.993502 |
10 | 1.866005 | 0.997268 |
In [81]:
Copied!
def rolling_forecast(
endog: Union[pd.Series, list],
exog: Union[pd.Series, list],
train_len: int,
horizon: int,
window: int,
method: str
) -> list:
total_len = train_len + horizon
preds = []
if method == 'last':
for i in range(train_len, total_len, window):
val_last = endog.iloc[i-1, 0]
preds.extend(val_last for _ in range(window))
elif method == 'SARIMAX':
for i in range(train_len, total_len, window):
model = SARIMAX(
endog[:i],
exog[:i],
order=(3,1,3),
seasonal_order=(0,0,0,4),
simple_differencing=False
).fit(disp=False)
pred = model.forecast(window, exog=exog[i:i+window]).values
preds.extend(pred)
else:
raise ValueError('choose method form [last, SARIMAX]')
return preds
def rolling_forecast(
endog: Union[pd.Series, list],
exog: Union[pd.Series, list],
train_len: int,
horizon: int,
window: int,
method: str
) -> list:
total_len = train_len + horizon
preds = []
if method == 'last':
for i in range(train_len, total_len, window):
val_last = endog.iloc[i-1, 0]
preds.extend(val_last for _ in range(window))
elif method == 'SARIMAX':
for i in range(train_len, total_len, window):
model = SARIMAX(
endog[:i],
exog[:i],
order=(3,1,3),
seasonal_order=(0,0,0,4),
simple_differencing=False
).fit(disp=False)
pred = model.forecast(window, exog=exog[i:i+window]).values
preds.extend(pred)
else:
raise ValueError('choose method form [last, SARIMAX]')
return preds
In [82]:
Copied!
target_train = target[:196]
target_test = target[196:]
TRAIN_LEN = len(target_train)
HORIZON = len(target_test)
WINDOW = 1
target_test['pred_last'] = rolling_forecast(target, exog, TRAIN_LEN, HORIZON, WINDOW, 'last')
target_test['pred_SARIMAX'] = rolling_forecast(target, exog, TRAIN_LEN, HORIZON, WINDOW, 'SARIMAX')
target_train = target[:196]
target_test = target[196:]
TRAIN_LEN = len(target_train)
HORIZON = len(target_test)
WINDOW = 1
target_test['pred_last'] = rolling_forecast(target, exog, TRAIN_LEN, HORIZON, WINDOW, 'last')
target_test['pred_SARIMAX'] = rolling_forecast(target, exog, TRAIN_LEN, HORIZON, WINDOW, 'SARIMAX')
In [86]:
Copied!
def mape(y_true, y_pred):
return np.mean(np.abs((y_pred - y_true) / y_true))
xs = ['pred_last', 'pred_SARIMAX']
ys = [mape(target_test['realgdp'], target_test[x]) for x in xs]
plt.bar(xs, ys)
plt.xlabel('Baseline')
plt.ylabel('MAPE')
# set y value to percentage
plt.gca().yaxis.set_major_formatter(lambda y, _: '{:.01%}'.format(y))
def mape(y_true, y_pred):
return np.mean(np.abs((y_pred - y_true) / y_true))
xs = ['pred_last', 'pred_SARIMAX']
ys = [mape(target_test['realgdp'], target_test[x]) for x in xs]
plt.bar(xs, ys)
plt.xlabel('Baseline')
plt.ylabel('MAPE')
# set y value to percentage
plt.gca().yaxis.set_major_formatter(lambda y, _: '{:.01%}'.format(y))
In [ ]:
Copied!