第6章 複雑な時系列のモデル化
データセンターの帯域幅使用量を予測する¶
In [1]:
Copied!
from typing import Union
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import product
from statsmodels.tsa.arima_process import ArmaProcess
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tqdm.auto import tqdm
from typing import Union
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import product
from statsmodels.tsa.arima_process import ArmaProcess
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tqdm.auto import tqdm
In [2]:
Copied!
url = 'https://raw.githubusercontent.com/marcopeix/TimeSeriesForecastingInPython/master/data/bandwidth.csv'
df = pd.read_csv(url)
df.index = pd.date_range('2019-01-01', freq='H', periods=10000, name='Time')
# plot
fig, ax = plt.subplots()
ax.set_ylabel('Hourly bandwith usage (MBps)')
df['hourly_bandwidth'].plot(ax=ax)
plt.show()
url = 'https://raw.githubusercontent.com/marcopeix/TimeSeriesForecastingInPython/master/data/bandwidth.csv'
df = pd.read_csv(url)
df.index = pd.date_range('2019-01-01', freq='H', periods=10000, name='Time')
# plot
fig, ax = plt.subplots()
ax.set_ylabel('Hourly bandwith usage (MBps)')
df['hourly_bandwidth'].plot(ax=ax)
plt.show()
自己回帰移動平均(ARMA)プロセスを調べる¶
定常的なARMAプロセスを特定する¶
$$ y_t = 0.33 y_{t-1} + 0.9 \varepsilon_{t-1} + \varepsilon_{t} $$
In [20]:
Copied!
ar1 = np.array([1, -0.33])
ma1 = np.array([1, 0.9])
np.random.seed(42)
ARMA_1_1 = ArmaProcess(ar1, ma1).generate_sample(nsample=1000)
ar1 = np.array([1, -0.33])
ma1 = np.array([1, 0.9])
np.random.seed(42)
ARMA_1_1 = ArmaProcess(ar1, ma1).generate_sample(nsample=1000)
In [21]:
Copied!
ADF_result = adfuller(ARMA_1_1)
print(f'ADF Statistic: {ADF_result[0]:.3f}')
print(f'p-value: {ADF_result[1]:.3f}')
# plot correlation
fig, axes = plt.subplots(1, 2, figsize=[12, 4])
plot_acf(ARMA_1_1, auto_ylims=True, lags=20, ax=axes[0])
plot_pacf(ARMA_1_1, auto_ylims=True, lags=20, ax=axes[1])
plt.show()
ADF_result = adfuller(ARMA_1_1)
print(f'ADF Statistic: {ADF_result[0]:.3f}')
print(f'p-value: {ADF_result[1]:.3f}')
# plot correlation
fig, axes = plt.subplots(1, 2, figsize=[12, 4])
plot_acf(ARMA_1_1, auto_ylims=True, lags=20, ax=axes[0])
plot_pacf(ARMA_1_1, auto_ylims=True, lags=20, ax=axes[1])
plt.show()
ADF Statistic: -6.430 p-value: 0.000
一般的なモデル化手続きを考え出す¶
In [22]:
Copied!
# 一意なARMA(p,q)モデルをすべて適合させる関数
def optimize_ARMA(endog: Union[pd.Series, list], orders: list) -> pd.DataFrame:
"""
endog: 時系列データ
orders: ARMA(p,q)のpとqの値の組み合わせ
"""
res = []
for order in tqdm(orders):
try:
model = SARIMAX(
endog,
order=(order[0], 0, order[1]),
simple_differencing=False
).fit(disp=False)
res.append([order, model.aic])
except:
continue
df_res = (
pd.DataFrame(res, columns=['(p,q)', 'AIC'])
.sort_values(by='AIC', ascending=True)
.reset_index(drop=True)
)
return df_res
# 一意なARMA(p,q)モデルをすべて適合させる関数
def optimize_ARMA(endog: Union[pd.Series, list], orders: list) -> pd.DataFrame:
"""
endog: 時系列データ
orders: ARMA(p,q)のpとqの値の組み合わせ
"""
res = []
for order in tqdm(orders):
try:
model = SARIMAX(
endog,
order=(order[0], 0, order[1]),
simple_differencing=False
).fit(disp=False)
res.append([order, model.aic])
except:
continue
df_res = (
pd.DataFrame(res, columns=['(p,q)', 'AIC'])
.sort_values(by='AIC', ascending=True)
.reset_index(drop=True)
)
return df_res
In [23]:
Copied!
orders = list(product(range(4), range(4))) # p,qをそれぞれ0~3まで動かす
df_res = optimize_ARMA(ARMA_1_1, orders)
df_res # きちんと(1,1)が最小になるようにするにはサンプルを何度か取り直す必要があった
orders = list(product(range(4), range(4))) # p,qをそれぞれ0~3まで動かす
df_res = optimize_ARMA(ARMA_1_1, orders)
df_res # きちんと(1,1)が最小になるようにするにはサンプルを何度か取り直す必要があった
0%| | 0/16 [00:00<?, ?it/s]
/home/yoneda/.local/lib/python3.10/site-packages/statsmodels/tsa/statespace/sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters. warn('Non-invertible starting MA parameters found.' /home/yoneda/.local/lib/python3.10/site-packages/statsmodels/tsa/statespace/sarimax.py:966: UserWarning: Non-stationary starting autoregressive parameters found. Using zeros as starting parameters. warn('Non-stationary starting autoregressive parameters' /home/yoneda/.local/lib/python3.10/site-packages/statsmodels/base/model.py:607: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals warnings.warn("Maximum Likelihood optimization failed to "
Out[23]:
(p,q) | AIC | |
---|---|---|
0 | (1, 1) | 2801.407785 |
1 | (2, 1) | 2802.906070 |
2 | (1, 2) | 2802.967762 |
3 | (0, 3) | 2803.666793 |
4 | (1, 3) | 2804.524027 |
5 | (3, 1) | 2804.588567 |
6 | (2, 2) | 2804.822282 |
7 | (3, 3) | 2806.162939 |
8 | (2, 3) | 2806.175380 |
9 | (3, 2) | 2806.894930 |
10 | (0, 2) | 2812.840730 |
11 | (0, 1) | 2891.869245 |
12 | (3, 0) | 2981.643911 |
13 | (2, 0) | 3042.627787 |
14 | (1, 0) | 3207.291261 |
15 | (0, 0) | 3780.418416 |
In [24]:
Copied!
# 残差分析
model = SARIMAX(ARMA_1_1, order=(1, 0, 1), simple_differencing=False).fit(disp=False)
model.plot_diagnostics(figsize=(10, 8))
plt.show()
# 残差分析
model = SARIMAX(ARMA_1_1, order=(1, 0, 1), simple_differencing=False).fit(disp=False)
model.plot_diagnostics(figsize=(10, 8))
plt.show()
In [25]:
Copied!
# 残差が無相関かの評価
acorr_ljungbox(model.resid, np.arange(1, 11))
# 残差が無相関かの評価
acorr_ljungbox(model.resid, np.arange(1, 11))
Out[25]:
lb_stat | lb_pvalue | |
---|---|---|
1 | 0.030706 | 0.860897 |
2 | 0.528021 | 0.767966 |
3 | 0.540904 | 0.909817 |
4 | 2.079774 | 0.721089 |
5 | 2.271897 | 0.810385 |
6 | 2.896262 | 0.821750 |
7 | 2.900378 | 0.894063 |
8 | 4.418799 | 0.817501 |
9 | 4.787567 | 0.852419 |
10 | 5.246805 | 0.874093 |
一般的なモデル化手続きを適用する¶
In [9]:
Copied!
ADF_result = adfuller(df['hourly_bandwidth'])
print('帯域使用量ADF検定')
print(f'ADF Statistic: {ADF_result[0]:.3f}')
print(f'p-value: {ADF_result[1]:.3f}')
df_diff = df.diff().dropna()
ADF_result = adfuller(df_diff['hourly_bandwidth'])
print('帯域使用量(1次差分) ADF検定')
print(f'ADF Statistic: {ADF_result[0]:.3f}')
print(f'p-value: {ADF_result[1]:.3f}')
fig, axes = plt.subplots(1, 2, figsize=[12, 4])
df['hourly_bandwidth'].plot(ax=axes[0])
df_diff['hourly_bandwidth'].plot(ax=axes[1])
axes[0].set_ylabel('Hourly bandwith usage (MBps')
axes[1].set_ylabel('Hourly bandwith usage (MBps) (difference)')
plt.show()
ADF_result = adfuller(df['hourly_bandwidth'])
print('帯域使用量ADF検定')
print(f'ADF Statistic: {ADF_result[0]:.3f}')
print(f'p-value: {ADF_result[1]:.3f}')
df_diff = df.diff().dropna()
ADF_result = adfuller(df_diff['hourly_bandwidth'])
print('帯域使用量(1次差分) ADF検定')
print(f'ADF Statistic: {ADF_result[0]:.3f}')
print(f'p-value: {ADF_result[1]:.3f}')
fig, axes = plt.subplots(1, 2, figsize=[12, 4])
df['hourly_bandwidth'].plot(ax=axes[0])
df_diff['hourly_bandwidth'].plot(ax=axes[1])
axes[0].set_ylabel('Hourly bandwith usage (MBps')
axes[1].set_ylabel('Hourly bandwith usage (MBps) (difference)')
plt.show()
帯域使用量ADF検定 ADF Statistic: -0.871 p-value: 0.797 帯域使用量(1次差分) ADF検定 ADF Statistic: -20.695 p-value: 0.000
In [10]:
Copied!
train = df_diff[:-168]
test = df_diff[-168:]
fig, axes = plt.subplots(2, 1, figsize=[8, 8], sharex=True)
df['hourly_bandwidth'].plot(ax=axes[0])
df_diff['hourly_bandwidth'].plot(ax=axes[1])
axes[0].set_ylabel('Hourly bandwith usage (MBps)')
axes[1].set_ylabel('Diff. Hourly bandwith usage (MBps)')
for i in range(2):
axes[i].axvspan(test.index[0], test.index[-1], color='tab:orange', alpha=0.5)
train = df_diff[:-168]
test = df_diff[-168:]
fig, axes = plt.subplots(2, 1, figsize=[8, 8], sharex=True)
df['hourly_bandwidth'].plot(ax=axes[0])
df_diff['hourly_bandwidth'].plot(ax=axes[1])
axes[0].set_ylabel('Hourly bandwith usage (MBps)')
axes[1].set_ylabel('Diff. Hourly bandwith usage (MBps)')
for i in range(2):
axes[i].axvspan(test.index[0], test.index[-1], color='tab:orange', alpha=0.5)
In [11]:
Copied!
optimize_ARMA(train['hourly_bandwidth'], orders)
optimize_ARMA(train['hourly_bandwidth'], orders)
0%| | 0/16 [00:00<?, ?it/s]
/home/yoneda/.local/lib/python3.10/site-packages/statsmodels/tsa/statespace/sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters. warn('Non-invertible starting MA parameters found.' /home/yoneda/.local/lib/python3.10/site-packages/statsmodels/tsa/statespace/sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters. warn('Non-invertible starting MA parameters found.' /home/yoneda/.local/lib/python3.10/site-packages/statsmodels/tsa/statespace/sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters. warn('Non-invertible starting MA parameters found.'
Out[11]:
(p,q) | AIC | |
---|---|---|
0 | (3, 2) | 27991.063879 |
1 | (2, 3) | 27991.287509 |
2 | (2, 2) | 27991.603598 |
3 | (3, 3) | 27993.416924 |
4 | (1, 3) | 28003.349550 |
5 | (1, 2) | 28051.351401 |
6 | (3, 1) | 28071.155496 |
7 | (3, 0) | 28095.618186 |
8 | (2, 1) | 28097.250766 |
9 | (2, 0) | 28098.407664 |
10 | (1, 1) | 28172.510044 |
11 | (1, 0) | 28941.056983 |
12 | (0, 3) | 31355.802141 |
13 | (0, 2) | 33531.179284 |
14 | (0, 1) | 39402.269523 |
15 | (0, 0) | 49035.184224 |
In [12]:
Copied!
# 上の結果から上位3つのペアは有望そうである。
# 特に(2, 2)は変数が少ないので複雑でない。
# (2, 2)モデルの残差分析を行う
# 残差分析
model = SARIMAX(train['hourly_bandwidth'], order=(2, 0, 2), simple_differencing=False).fit(disp=False)
model.plot_diagnostics(figsize=(10, 8))
plt.show()
# 上の結果から上位3つのペアは有望そうである。
# 特に(2, 2)は変数が少ないので複雑でない。
# (2, 2)モデルの残差分析を行う
# 残差分析
model = SARIMAX(train['hourly_bandwidth'], order=(2, 0, 2), simple_differencing=False).fit(disp=False)
model.plot_diagnostics(figsize=(10, 8))
plt.show()
In [13]:
Copied!
# 残差が無相関かの評価
acorr_ljungbox(model.resid, np.arange(1, 11))
# 残差が無相関かの評価
acorr_ljungbox(model.resid, np.arange(1, 11))
Out[13]:
lb_stat | lb_pvalue | |
---|---|---|
1 | 0.042190 | 0.837257 |
2 | 0.418364 | 0.811247 |
3 | 0.520271 | 0.914416 |
4 | 0.850554 | 0.931545 |
5 | 0.850841 | 0.973678 |
6 | 1.111754 | 0.981019 |
7 | 2.124864 | 0.952607 |
8 | 3.230558 | 0.919067 |
9 | 3.248662 | 0.953615 |
10 | 3.588289 | 0.964015 |
帯域幅使用量を予測する¶
In [14]:
Copied!
def rolling_forecast(df: pd.DataFrame, train_len: int, horizon: int, window :int, method: str) -> list:
total_len = train_len + horizon
preds = []
if method == 'mean':
for i in range(train_len, total_len, window):
mean = np.mean(df[:i].values)
preds.extend(mean for _ in range(window))
elif method == 'last':
for i in range(train_len, total_len, window):
last_val = df.iloc[i-1, 0]
preds.extend(last_val for _ in range(window))
elif method == 'ARMA':
for i in range(train_len, total_len, window):
model = SARIMAX(df[:i], order=(2, 0, 2))
res = model.fit(disp=False)
pred = res.get_prediction(0, i + window - 1).predicted_mean.iloc[-window:]
preds.extend(pred)
else:
raise ValueError(f"Invalid method: {method}, choose from ['mean', 'last', 'ARMA']")
return preds
def rolling_forecast(df: pd.DataFrame, train_len: int, horizon: int, window :int, method: str) -> list:
total_len = train_len + horizon
preds = []
if method == 'mean':
for i in range(train_len, total_len, window):
mean = np.mean(df[:i].values)
preds.extend(mean for _ in range(window))
elif method == 'last':
for i in range(train_len, total_len, window):
last_val = df.iloc[i-1, 0]
preds.extend(last_val for _ in range(window))
elif method == 'ARMA':
for i in range(train_len, total_len, window):
model = SARIMAX(df[:i], order=(2, 0, 2))
res = model.fit(disp=False)
pred = res.get_prediction(0, i + window - 1).predicted_mean.iloc[-window:]
preds.extend(pred)
else:
raise ValueError(f"Invalid method: {method}, choose from ['mean', 'last', 'ARMA']")
return preds
In [15]:
Copied!
TRAIN_LEN = len(train)
HORIZON = len(test)
WINDOW = 2
methods = ['mean', 'last', 'ARMA']
for method in methods:
test[method] = rolling_forecast(df_diff, TRAIN_LEN, HORIZON, WINDOW, method)
TRAIN_LEN = len(train)
HORIZON = len(test)
WINDOW = 2
methods = ['mean', 'last', 'ARMA']
for method in methods:
test[method] = rolling_forecast(df_diff, TRAIN_LEN, HORIZON, WINDOW, method)
/tmp/ipykernel_824/1314583584.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy test[method] = rolling_forecast(df_diff, TRAIN_LEN, HORIZON, WINDOW, method) /tmp/ipykernel_824/1314583584.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy test[method] = rolling_forecast(df_diff, TRAIN_LEN, HORIZON, WINDOW, method) /tmp/ipykernel_824/1314583584.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy test[method] = rolling_forecast(df_diff, TRAIN_LEN, HORIZON, WINDOW, method)
In [16]:
Copied!
fig, ax = plt.subplots()
df_diff['hourly_bandwidth'].iloc[-200:].plot(ax=ax, label='actual')
ax.axvspan(test.index[0], test.index[-1], color='tab:orange', alpha=0.5)
ax.set_ylabel('Diff. Hourly bandwith usage (MBps)')
for method in methods:
test[method].plot(ax=ax, label=method, ls='dashed')
ax.legend()
plt.show()
fig, ax = plt.subplots()
df_diff['hourly_bandwidth'].iloc[-200:].plot(ax=ax, label='actual')
ax.axvspan(test.index[0], test.index[-1], color='tab:orange', alpha=0.5)
ax.set_ylabel('Diff. Hourly bandwith usage (MBps)')
for method in methods:
test[method].plot(ax=ax, label=method, ls='dashed')
ax.legend()
plt.show()
In [17]:
Copied!
plt.xlabel('Methods')
plt.ylabel('MSE')
plt.bar(
methods,
[mean_squared_error(test['hourly_bandwidth'], test[method]) for method in methods]
)
plt.show()
plt.xlabel('Methods')
plt.ylabel('MSE')
plt.bar(
methods,
[mean_squared_error(test['hourly_bandwidth'], test[method]) for method in methods]
)
plt.show()
In [18]:
Copied!
df['pred_ARMA'] = pd.Series()
df['pred_ARMA'].iloc[-168:] = df['hourly_bandwidth'].iloc[-168] + test['ARMA'].cumsum()
# plot
fig, ax = plt.subplots()
df['hourly_bandwidth'].iloc[-200:].plot(ax=ax, label='actual')
ax.axvspan(test.index[0], test.index[-1], color='tab:orange', alpha=0.5)
ax.set_ylabel('Hourly bandwith usage (MBps)')
df['pred_ARMA'].iloc[-200:].plot(ax=ax, label=method, ls='dashed')
ax.legend()
plt.show()
df['pred_ARMA'] = pd.Series()
df['pred_ARMA'].iloc[-168:] = df['hourly_bandwidth'].iloc[-168] + test['ARMA'].cumsum()
# plot
fig, ax = plt.subplots()
df['hourly_bandwidth'].iloc[-200:].plot(ax=ax, label='actual')
ax.axvspan(test.index[0], test.index[-1], color='tab:orange', alpha=0.5)
ax.set_ylabel('Hourly bandwith usage (MBps)')
df['pred_ARMA'].iloc[-200:].plot(ax=ax, label=method, ls='dashed')
ax.legend()
plt.show()
/tmp/ipykernel_824/2455051698.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['pred_ARMA'].iloc[-168:] = df['hourly_bandwidth'].iloc[-168] + test['ARMA'].cumsum()
In [19]:
Copied!
# calculate MAE
print(f"MAE: {mean_absolute_error(df['hourly_bandwidth'].iloc[-168:], df['pred_ARMA'].iloc[-168:]):.3f}")
# calculate MAE
print(f"MAE: {mean_absolute_error(df['hourly_bandwidth'].iloc[-168:], df['pred_ARMA'].iloc[-168:]):.3f}")
MAE: 14.000
In [ ]:
Copied!