第6章複雑な時系列のモデル化

データセンターの帯域幅使用量を予測する¶

In [1]:

Copied!





from typing import Union
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import product
from statsmodels.tsa.arima_process import ArmaProcess
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tqdm.auto import tqdm
from typing import Union
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import product
from statsmodels.tsa.arima_process import ArmaProcess
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tqdm.auto import tqdm

In [2]:

Copied!





url = 'https://raw.githubusercontent.com/marcopeix/TimeSeriesForecastingInPython/master/data/bandwidth.csv'
df = pd.read_csv(url)
df.index = pd.date_range('2019-01-01', freq='H', periods=10000, name='Time')

# plot
fig, ax = plt.subplots()
ax.set_ylabel('Hourly bandwith usage (MBps)')
df['hourly_bandwidth'].plot(ax=ax)
plt.show()
url = 'https://raw.githubusercontent.com/marcopeix/TimeSeriesForecastingInPython/master/data/bandwidth.csv'
df = pd.read_csv(url)
df.index = pd.date_range('2019-01-01', freq='H', periods=10000, name='Time')

# plot
fig, ax = plt.subplots()
ax.set_ylabel('Hourly bandwith usage (MBps)')
df['hourly_bandwidth'].plot(ax=ax)
plt.show()

No description has been provided for this image

自己回帰移動平均(ARMA)プロセスを調べる¶

定常的なARMAプロセスを特定する¶

$$ y_t = 0.33 y_{t-1} + 0.9 \varepsilon_{t-1} + \varepsilon_{t} $$

In [20]:

Copied!

ar1 = np.array([1, -0.33])
ma1 = np.array([1, 0.9])

np.random.seed(42)

ARMA_1_1 = ArmaProcess(ar1, ma1).generate_sample(nsample=1000)
ar1 = np.array([1, -0.33])
ma1 = np.array([1, 0.9])

np.random.seed(42)

ARMA_1_1 = ArmaProcess(ar1, ma1).generate_sample(nsample=1000)

In [21]:

Copied!





ADF_result = adfuller(ARMA_1_1)
print(f'ADF Statistic: {ADF_result[0]:.3f}')
print(f'p-value: {ADF_result[1]:.3f}')

# plot correlation
fig, axes = plt.subplots(1, 2, figsize=[12, 4])
plot_acf(ARMA_1_1, auto_ylims=True, lags=20, ax=axes[0])
plot_pacf(ARMA_1_1, auto_ylims=True, lags=20, ax=axes[1])
plt.show()
ADF_result = adfuller(ARMA_1_1)
print(f'ADF Statistic: {ADF_result[0]:.3f}')
print(f'p-value: {ADF_result[1]:.3f}')

# plot correlation
fig, axes = plt.subplots(1, 2, figsize=[12, 4])
plot_acf(ARMA_1_1, auto_ylims=True, lags=20, ax=axes[0])
plot_pacf(ARMA_1_1, auto_ylims=True, lags=20, ax=axes[1])
plt.show()

ADF Statistic: -6.430
p-value: 0.000

一般的なモデル化手続きを考え出す¶

In [22]:

Copied!





# 一意なARMA(p,q)モデルをすべて適合させる関数
def optimize_ARMA(endog: Union[pd.Series, list], orders: list) -> pd.DataFrame:
    """
    endog: 時系列データ
    orders: ARMA(p,q)のpとqの値の組み合わせ
    """
    res = []
    for order in tqdm(orders):
        try:
            model = SARIMAX(
                endog, 
                order=(order[0], 0, order[1]),
                simple_differencing=False
            ).fit(disp=False)
            res.append([order, model.aic])
        except:
            continue
    df_res = (
        pd.DataFrame(res, columns=['(p,q)', 'AIC'])
        .sort_values(by='AIC', ascending=True)
        .reset_index(drop=True)
    )
    return df_res
# 一意なARMA(p,q)モデルをすべて適合させる関数
def optimize_ARMA(endog: Union[pd.Series, list], orders: list) -> pd.DataFrame:
    """
    endog: 時系列データ
    orders: ARMA(p,q)のpとqの値の組み合わせ
    """
    res = []
    for order in tqdm(orders):
        try:
            model = SARIMAX(
                endog, 
                order=(order[0], 0, order[1]),
                simple_differencing=False
            ).fit(disp=False)
            res.append([order, model.aic])
        except:
            continue
    df_res = (
        pd.DataFrame(res, columns=['(p,q)', 'AIC'])
        .sort_values(by='AIC', ascending=True)
        .reset_index(drop=True)
    )
    return df_res

In [23]:

Copied!

orders = list(product(range(4), range(4))) # p,qをそれぞれ0~3まで動かす
df_res = optimize_ARMA(ARMA_1_1, orders)
df_res # きちんと(1,1)が最小になるようにするにはサンプルを何度か取り直す必要があった
orders = list(product(range(4), range(4))) # p,qをそれぞれ0~3まで動かす
df_res = optimize_ARMA(ARMA_1_1, orders)
df_res # きちんと(1,1)が最小になるようにするにはサンプルを何度か取り直す必要があった

  0%|          | 0/16 [00:00<?, ?it/s]

/home/yoneda/.local/lib/python3.10/site-packages/statsmodels/tsa/statespace/sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters.
  warn('Non-invertible starting MA parameters found.'
/home/yoneda/.local/lib/python3.10/site-packages/statsmodels/tsa/statespace/sarimax.py:966: UserWarning: Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.
  warn('Non-stationary starting autoregressive parameters'
/home/yoneda/.local/lib/python3.10/site-packages/statsmodels/base/model.py:607: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  warnings.warn("Maximum Likelihood optimization failed to "

Out[23]:

	(p,q)	AIC
0	(1, 1)	2801.407785
1	(2, 1)	2802.906070
2	(1, 2)	2802.967762
3	(0, 3)	2803.666793
4	(1, 3)	2804.524027
5	(3, 1)	2804.588567
6	(2, 2)	2804.822282
7	(3, 3)	2806.162939
8	(2, 3)	2806.175380
9	(3, 2)	2806.894930
10	(0, 2)	2812.840730
11	(0, 1)	2891.869245
12	(3, 0)	2981.643911
13	(2, 0)	3042.627787
14	(1, 0)	3207.291261
15	(0, 0)	3780.418416

In [24]:

Copied!





# 残差分析
model = SARIMAX(ARMA_1_1, order=(1, 0, 1), simple_differencing=False).fit(disp=False)
model.plot_diagnostics(figsize=(10, 8))
plt.show()
# 残差分析
model = SARIMAX(ARMA_1_1, order=(1, 0, 1), simple_differencing=False).fit(disp=False)
model.plot_diagnostics(figsize=(10, 8))
plt.show()

In [25]:

Copied!

# 残差が無相関かの評価
acorr_ljungbox(model.resid, np.arange(1, 11))
# 残差が無相関かの評価
acorr_ljungbox(model.resid, np.arange(1, 11))

Out[25]:

	lb_stat	lb_pvalue
1	0.030706	0.860897
2	0.528021	0.767966
3	0.540904	0.909817
4	2.079774	0.721089
5	2.271897	0.810385
6	2.896262	0.821750
7	2.900378	0.894063
8	4.418799	0.817501
9	4.787567	0.852419
10	5.246805	0.874093

一般的なモデル化手続きを適用する¶

In [9]:

Copied!





ADF_result = adfuller(df['hourly_bandwidth'])
print('帯域使用量ADF検定')
print(f'ADF Statistic: {ADF_result[0]:.3f}')
print(f'p-value: {ADF_result[1]:.3f}')

df_diff = df.diff().dropna()
ADF_result = adfuller(df_diff['hourly_bandwidth'])
print('帯域使用量(1次差分) ADF検定')
print(f'ADF Statistic: {ADF_result[0]:.3f}')
print(f'p-value: {ADF_result[1]:.3f}')

fig, axes = plt.subplots(1, 2, figsize=[12, 4])
df['hourly_bandwidth'].plot(ax=axes[0])
df_diff['hourly_bandwidth'].plot(ax=axes[1])
axes[0].set_ylabel('Hourly bandwith usage (MBps')
axes[1].set_ylabel('Hourly bandwith usage (MBps) (difference)')
plt.show()
ADF_result = adfuller(df['hourly_bandwidth'])
print('帯域使用量ADF検定')
print(f'ADF Statistic: {ADF_result[0]:.3f}')
print(f'p-value: {ADF_result[1]:.3f}')

df_diff = df.diff().dropna()
ADF_result = adfuller(df_diff['hourly_bandwidth'])
print('帯域使用量(1次差分) ADF検定')
print(f'ADF Statistic: {ADF_result[0]:.3f}')
print(f'p-value: {ADF_result[1]:.3f}')

fig, axes = plt.subplots(1, 2, figsize=[12, 4])
df['hourly_bandwidth'].plot(ax=axes[0])
df_diff['hourly_bandwidth'].plot(ax=axes[1])
axes[0].set_ylabel('Hourly bandwith usage (MBps')
axes[1].set_ylabel('Hourly bandwith usage (MBps) (difference)')
plt.show()

帯域使用量ADF検定
ADF Statistic: -0.871
p-value: 0.797
帯域使用量(1次差分) ADF検定
ADF Statistic: -20.695
p-value: 0.000

In [10]:

Copied!





train = df_diff[:-168]
test = df_diff[-168:]

fig, axes = plt.subplots(2, 1, figsize=[8, 8], sharex=True)
df['hourly_bandwidth'].plot(ax=axes[0])
df_diff['hourly_bandwidth'].plot(ax=axes[1])
axes[0].set_ylabel('Hourly bandwith usage (MBps)')
axes[1].set_ylabel('Diff. Hourly bandwith usage (MBps)')
for i in range(2):
    axes[i].axvspan(test.index[0], test.index[-1], color='tab:orange', alpha=0.5)
train = df_diff[:-168]
test = df_diff[-168:]

fig, axes = plt.subplots(2, 1, figsize=[8, 8], sharex=True)
df['hourly_bandwidth'].plot(ax=axes[0])
df_diff['hourly_bandwidth'].plot(ax=axes[1])
axes[0].set_ylabel('Hourly bandwith usage (MBps)')
axes[1].set_ylabel('Diff. Hourly bandwith usage (MBps)')
for i in range(2):
    axes[i].axvspan(test.index[0], test.index[-1], color='tab:orange', alpha=0.5)

In [11]:

Copied!

optimize_ARMA(train['hourly_bandwidth'], orders)
optimize_ARMA(train['hourly_bandwidth'], orders)

  0%|          | 0/16 [00:00<?, ?it/s]

/home/yoneda/.local/lib/python3.10/site-packages/statsmodels/tsa/statespace/sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters.
  warn('Non-invertible starting MA parameters found.'
/home/yoneda/.local/lib/python3.10/site-packages/statsmodels/tsa/statespace/sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters.
  warn('Non-invertible starting MA parameters found.'
/home/yoneda/.local/lib/python3.10/site-packages/statsmodels/tsa/statespace/sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters.
  warn('Non-invertible starting MA parameters found.'

Out[11]:

	(p,q)	AIC
0	(3, 2)	27991.063879
1	(2, 3)	27991.287509
2	(2, 2)	27991.603598
3	(3, 3)	27993.416924
4	(1, 3)	28003.349550
5	(1, 2)	28051.351401
6	(3, 1)	28071.155496
7	(3, 0)	28095.618186
8	(2, 1)	28097.250766
9	(2, 0)	28098.407664
10	(1, 1)	28172.510044
11	(1, 0)	28941.056983
12	(0, 3)	31355.802141
13	(0, 2)	33531.179284
14	(0, 1)	39402.269523
15	(0, 0)	49035.184224

In [12]:

Copied!





# 上の結果から上位3つのペアは有望そうである。
# 特に(2, 2)は変数が少ないので複雑でない。
# (2, 2)モデルの残差分析を行う
# 残差分析
model = SARIMAX(train['hourly_bandwidth'], order=(2, 0, 2), simple_differencing=False).fit(disp=False)
model.plot_diagnostics(figsize=(10, 8))
plt.show()
# 上の結果から上位3つのペアは有望そうである。
# 特に(2, 2)は変数が少ないので複雑でない。
# (2, 2)モデルの残差分析を行う
# 残差分析
model = SARIMAX(train['hourly_bandwidth'], order=(2, 0, 2), simple_differencing=False).fit(disp=False)
model.plot_diagnostics(figsize=(10, 8))
plt.show()

In [13]:

Copied!

# 残差が無相関かの評価
acorr_ljungbox(model.resid, np.arange(1, 11))
# 残差が無相関かの評価
acorr_ljungbox(model.resid, np.arange(1, 11))

Out[13]:

	lb_stat	lb_pvalue
1	0.042190	0.837257
2	0.418364	0.811247
3	0.520271	0.914416
4	0.850554	0.931545
5	0.850841	0.973678
6	1.111754	0.981019
7	2.124864	0.952607
8	3.230558	0.919067
9	3.248662	0.953615
10	3.588289	0.964015

帯域幅使用量を予測する¶

In [14]:

Copied!





def rolling_forecast(df: pd.DataFrame, train_len: int, horizon: int, window :int, method: str) -> list:
    total_len = train_len + horizon
    preds = []
    if method == 'mean':
        for i in range(train_len, total_len, window):
            mean = np.mean(df[:i].values)
            preds.extend(mean for _ in range(window))
    
    elif method == 'last':
        for i in range(train_len, total_len, window):
            last_val = df.iloc[i-1, 0]
            preds.extend(last_val for _ in range(window))
    
    elif method == 'ARMA':
        for i in range(train_len, total_len, window):
            model = SARIMAX(df[:i], order=(2, 0, 2))
            res = model.fit(disp=False)
            pred = res.get_prediction(0, i + window - 1).predicted_mean.iloc[-window:]
            preds.extend(pred)
    else:
        raise ValueError(f"Invalid method: {method}, choose from ['mean', 'last', 'ARMA']")
    return preds
def rolling_forecast(df: pd.DataFrame, train_len: int, horizon: int, window :int, method: str) -> list:
    total_len = train_len + horizon
    preds = []
    if method == 'mean':
        for i in range(train_len, total_len, window):
            mean = np.mean(df[:i].values)
            preds.extend(mean for _ in range(window))
    
    elif method == 'last':
        for i in range(train_len, total_len, window):
            last_val = df.iloc[i-1, 0]
            preds.extend(last_val for _ in range(window))
    
    elif method == 'ARMA':
        for i in range(train_len, total_len, window):
            model = SARIMAX(df[:i], order=(2, 0, 2))
            res = model.fit(disp=False)
            pred = res.get_prediction(0, i + window - 1).predicted_mean.iloc[-window:]
            preds.extend(pred)
    else:
        raise ValueError(f"Invalid method: {method}, choose from ['mean', 'last', 'ARMA']")
    return preds

In [15]:

Copied!





TRAIN_LEN = len(train)
HORIZON = len(test)
WINDOW = 2

methods = ['mean', 'last', 'ARMA']
for method in methods:
    test[method] = rolling_forecast(df_diff, TRAIN_LEN, HORIZON, WINDOW, method)
TRAIN_LEN = len(train)
HORIZON = len(test)
WINDOW = 2

methods = ['mean', 'last', 'ARMA']
for method in methods:
    test[method] = rolling_forecast(df_diff, TRAIN_LEN, HORIZON, WINDOW, method)

/tmp/ipykernel_824/1314583584.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[method] = rolling_forecast(df_diff, TRAIN_LEN, HORIZON, WINDOW, method)
/tmp/ipykernel_824/1314583584.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[method] = rolling_forecast(df_diff, TRAIN_LEN, HORIZON, WINDOW, method)
/tmp/ipykernel_824/1314583584.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[method] = rolling_forecast(df_diff, TRAIN_LEN, HORIZON, WINDOW, method)

In [16]:

Copied!





fig, ax = plt.subplots()
df_diff['hourly_bandwidth'].iloc[-200:].plot(ax=ax, label='actual')
ax.axvspan(test.index[0], test.index[-1], color='tab:orange', alpha=0.5)
ax.set_ylabel('Diff. Hourly bandwith usage (MBps)')
for method in methods:
    test[method].plot(ax=ax, label=method, ls='dashed')
ax.legend()
plt.show()
fig, ax = plt.subplots()
df_diff['hourly_bandwidth'].iloc[-200:].plot(ax=ax, label='actual')
ax.axvspan(test.index[0], test.index[-1], color='tab:orange', alpha=0.5)
ax.set_ylabel('Diff. Hourly bandwith usage (MBps)')
for method in methods:
    test[method].plot(ax=ax, label=method, ls='dashed')
ax.legend()
plt.show()

In [17]:

Copied!





plt.xlabel('Methods')
plt.ylabel('MSE')
plt.bar(
    methods,
    [mean_squared_error(test['hourly_bandwidth'], test[method]) for method in methods]
)
plt.show()
plt.xlabel('Methods')
plt.ylabel('MSE')
plt.bar(
    methods,
    [mean_squared_error(test['hourly_bandwidth'], test[method]) for method in methods]
)
plt.show()

In [18]:

Copied!





df['pred_ARMA'] = pd.Series()
df['pred_ARMA'].iloc[-168:] = df['hourly_bandwidth'].iloc[-168] + test['ARMA'].cumsum()

# plot
fig, ax = plt.subplots()
df['hourly_bandwidth'].iloc[-200:].plot(ax=ax, label='actual')
ax.axvspan(test.index[0], test.index[-1], color='tab:orange', alpha=0.5)
ax.set_ylabel('Hourly bandwith usage (MBps)')
df['pred_ARMA'].iloc[-200:].plot(ax=ax, label=method, ls='dashed')
ax.legend()
plt.show()
df['pred_ARMA'] = pd.Series()
df['pred_ARMA'].iloc[-168:] = df['hourly_bandwidth'].iloc[-168] + test['ARMA'].cumsum()

# plot
fig, ax = plt.subplots()
df['hourly_bandwidth'].iloc[-200:].plot(ax=ax, label='actual')
ax.axvspan(test.index[0], test.index[-1], color='tab:orange', alpha=0.5)
ax.set_ylabel('Hourly bandwith usage (MBps)')
df['pred_ARMA'].iloc[-200:].plot(ax=ax, label=method, ls='dashed')
ax.legend()
plt.show()

/tmp/ipykernel_824/2455051698.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pred_ARMA'].iloc[-168:] = df['hourly_bandwidth'].iloc[-168] + test['ARMA'].cumsum()

In [19]:

Copied!

# calculate MAE
print(f"MAE: {mean_absolute_error(df['hourly_bandwidth'].iloc[-168:], df['pred_ARMA'].iloc[-168:]):.3f}")
# calculate MAE
print(f"MAE: {mean_absolute_error(df['hourly_bandwidth'].iloc[-168:], df['pred_ARMA'].iloc[-168:]):.3f}")

MAE: 14.000

In [ ]:

第6章 複雑な時系列のモデル化