第4章 移動平均プロセスのモデル化
移動平均(MA)プロセスを定義する¶
In [115]:
Copied!
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, mean_absolute_error
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, mean_absolute_error
In [51]:
Copied!
url = 'https://raw.githubusercontent.com/marcopeix/TimeSeriesForecastingInPython/master/data/widget_sales.csv'
df = pd.read_csv(url)
df.index = pd.date_range('2019-01-01', periods=500, name='Time')
url = 'https://raw.githubusercontent.com/marcopeix/TimeSeriesForecastingInPython/master/data/widget_sales.csv'
df = pd.read_csv(url)
df.index = pd.date_range('2019-01-01', periods=500, name='Time')
In [60]:
Copied!
ADF_result = adfuller(df['widget_sales'])
print('製品売上高ADF検定')
print(f'ADF Statistic: {ADF_result[0]:.3f}')
print(f'p-value: {ADF_result[1]:.3f}')
df_diff = df.diff().dropna()
ADF_result = adfuller(df_diff['widget_sales'])
print('製品売上高(1次差分) ADF検定')
print(f'ADF Statistic: {ADF_result[0]:.3f}')
print(f'p-value: {ADF_result[1]:.3f}')
fig, axes = plt.subplots(1, 2, figsize=[12, 4])
df['widget_sales'].plot(ax=axes[0])
df_diff['widget_sales'].plot(ax=axes[1])
axes[0].set_ylabel('Widget sales (k$)')
axes[1].set_ylabel('Widget sales - diff (k$)')
plt.show()
ADF_result = adfuller(df['widget_sales'])
print('製品売上高ADF検定')
print(f'ADF Statistic: {ADF_result[0]:.3f}')
print(f'p-value: {ADF_result[1]:.3f}')
df_diff = df.diff().dropna()
ADF_result = adfuller(df_diff['widget_sales'])
print('製品売上高(1次差分) ADF検定')
print(f'ADF Statistic: {ADF_result[0]:.3f}')
print(f'p-value: {ADF_result[1]:.3f}')
fig, axes = plt.subplots(1, 2, figsize=[12, 4])
df['widget_sales'].plot(ax=axes[0])
df_diff['widget_sales'].plot(ax=axes[1])
axes[0].set_ylabel('Widget sales (k$)')
axes[1].set_ylabel('Widget sales - diff (k$)')
plt.show()
製品売上高ADF検定 ADF Statistic: -1.512 p-value: 0.527 製品売上高(1次差分) ADF検定 ADF Statistic: -10.577 p-value: 0.000
In [53]:
Copied!
plot_acf(df_diff, auto_ylims=True, lags=30)
plt.show()
plot_acf(df_diff, auto_ylims=True, lags=30)
plt.show()
MAプロセスを予測する¶
In [79]:
Copied!
train = df_diff[:int(0.9 * len(df_diff))]
test = df_diff[int(0.9 * len(df_diff)):]
fig, axes = plt.subplots(2, 1, figsize=[8, 8], sharex=True)
df['widget_sales'].plot(ax=axes[0])
df_diff['widget_sales'].plot(ax=axes[1])
axes[0].set_ylabel('Widget sales (k$)')
axes[1].set_ylabel('Widget sales - diff (k$)')
for i in range(2):
axes[i].axvspan(test.index[0], test.index[-1], color='tab:orange', alpha=0.5)
train = df_diff[:int(0.9 * len(df_diff))]
test = df_diff[int(0.9 * len(df_diff)):]
fig, axes = plt.subplots(2, 1, figsize=[8, 8], sharex=True)
df['widget_sales'].plot(ax=axes[0])
df_diff['widget_sales'].plot(ax=axes[1])
axes[0].set_ylabel('Widget sales (k$)')
axes[1].set_ylabel('Widget sales - diff (k$)')
for i in range(2):
axes[i].axvspan(test.index[0], test.index[-1], color='tab:orange', alpha=0.5)
In [101]:
Copied!
def rolling_forecast(df: pd.DataFrame, train_len: int, horizon: int, window :int, method: str) -> list:
total_len = train_len + horizon
preds = []
if method == 'mean':
for i in range(train_len, total_len, window):
mean = np.mean(df[:i].values)
preds.extend(mean for _ in range(window))
elif method == 'last':
for i in range(train_len, total_len, window):
last_val = df.iloc[i-1, 0]
preds.extend(last_val for _ in range(window))
elif method == 'MA':
for i in range(train_len, total_len, window):
model = SARIMAX(df[:i], order=(0, 0, 2))
res = model.fit(disp=False)
pred = res.get_prediction(0, i + window - 1).predicted_mean.iloc[-window:]
preds.extend(pred)
else:
raise ValueError(f"Invalid method: {method}, choose from ['mean', 'last', 'MA']")
return preds
def rolling_forecast(df: pd.DataFrame, train_len: int, horizon: int, window :int, method: str) -> list:
total_len = train_len + horizon
preds = []
if method == 'mean':
for i in range(train_len, total_len, window):
mean = np.mean(df[:i].values)
preds.extend(mean for _ in range(window))
elif method == 'last':
for i in range(train_len, total_len, window):
last_val = df.iloc[i-1, 0]
preds.extend(last_val for _ in range(window))
elif method == 'MA':
for i in range(train_len, total_len, window):
model = SARIMAX(df[:i], order=(0, 0, 2))
res = model.fit(disp=False)
pred = res.get_prediction(0, i + window - 1).predicted_mean.iloc[-window:]
preds.extend(pred)
else:
raise ValueError(f"Invalid method: {method}, choose from ['mean', 'last', 'MA']")
return preds
In [102]:
Copied!
TRAIN_LEN = len(train)
HORIZON = len(test)
WINDOW = 2
methods = ['mean', 'last', 'MA']
for method in methods:
test[method] = rolling_forecast(df_diff, TRAIN_LEN, HORIZON, WINDOW, method)
TRAIN_LEN = len(train)
HORIZON = len(test)
WINDOW = 2
methods = ['mean', 'last', 'MA']
for method in methods:
test[method] = rolling_forecast(df_diff, TRAIN_LEN, HORIZON, WINDOW, method)
/tmp/ipykernel_61933/3131777319.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy test[method] = rolling_forecast(df_diff, TRAIN_LEN, HORIZON, WINDOW, method) /tmp/ipykernel_61933/3131777319.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy test[method] = rolling_forecast(df_diff, TRAIN_LEN, HORIZON, WINDOW, method)
In [108]:
Copied!
fig, ax = plt.subplots()
df_diff['widget_sales'].iloc[-70:].plot(ax=ax, label='actual')
ax.axvspan(test.index[0], test.index[-1], color='tab:orange', alpha=0.5)
ax.set_ylabel('Widget sales - diff (k$)')
for method in methods:
test[method].plot(ax=ax, label=method, ls='dashed')
ax.legend()
plt.show()
fig, ax = plt.subplots()
df_diff['widget_sales'].iloc[-70:].plot(ax=ax, label='actual')
ax.axvspan(test.index[0], test.index[-1], color='tab:orange', alpha=0.5)
ax.set_ylabel('Widget sales - diff (k$)')
for method in methods:
test[method].plot(ax=ax, label=method, ls='dashed')
ax.legend()
plt.show()
In [107]:
Copied!
plt.xlabel('Methods')
plt.ylabel('MSE')
plt.bar(
methods,
[mean_squared_error(test['widget_sales'], test[method]) for method in methods]
)
plt.show()
plt.xlabel('Methods')
plt.ylabel('MSE')
plt.bar(
methods,
[mean_squared_error(test['widget_sales'], test[method]) for method in methods]
)
plt.show()
In [119]:
Copied!
df['pred_MA'] = pd.Series()
df['pred_MA'].iloc[450:] = df['widget_sales'].iloc[450] + test['MA'].cumsum()
# plot
fig, ax = plt.subplots()
df['widget_sales'].iloc[-90:].plot(ax=ax, label='actual')
ax.axvspan(test.index[0], test.index[-1], color='tab:orange', alpha=0.5)
ax.set_ylabel('Widget sales (k$)')
df['pred_MA'].iloc[-90:].plot(ax=ax, label=method, ls='dashed')
ax.legend()
plt.show()
df['pred_MA'] = pd.Series()
df['pred_MA'].iloc[450:] = df['widget_sales'].iloc[450] + test['MA'].cumsum()
# plot
fig, ax = plt.subplots()
df['widget_sales'].iloc[-90:].plot(ax=ax, label='actual')
ax.axvspan(test.index[0], test.index[-1], color='tab:orange', alpha=0.5)
ax.set_ylabel('Widget sales (k$)')
df['pred_MA'].iloc[-90:].plot(ax=ax, label=method, ls='dashed')
ax.legend()
plt.show()
/tmp/ipykernel_61933/3235568680.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['pred_MA'].iloc[450:] = df['widget_sales'].iloc[450] + test['MA'].cumsum()
In [121]:
Copied!
# calculate MAE
print(f"MAE: {mean_absolute_error(df['widget_sales'].iloc[450:], df['pred_MA'].iloc[450:]):.3f}")
# calculate MAE
print(f"MAE: {mean_absolute_error(df['widget_sales'].iloc[450:], df['pred_MA'].iloc[450:]):.3f}")
MAE: 2.324
In [ ]:
Copied!