Chapter 13: Data Windows and Baselines for Deep Learning
In [28]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.metrics import MeanAbsoluteError
Creating a data window
In [2]:
url_train = 'https://raw.githubusercontent.com/marcopeix/TimeSeriesForecastingInPython/master/data/train.csv'
url_val = 'https://raw.githubusercontent.com/marcopeix/TimeSeriesForecastingInPython/master/data/val.csv'
url_test = 'https://raw.githubusercontent.com/marcopeix/TimeSeriesForecastingInPython/master/data/test.csv'
df_train = pd.read_csv(url_train, index_col=0)
df_val = pd.read_csv(url_val, index_col=0)
df_test = pd.read_csv(url_test, index_col=0)
In [30]:
class DataWindow:
    def __init__(self, input_width, label_width, shift, df_train, df_val, df_test, label_columns=None):
        # window sizes
        self.input_width = input_width
        self.label_width = label_width
        self.shift = shift
        self.total_window_size = input_width + shift

        # data splits
        self.df_train = df_train
        self.df_val = df_val
        self.df_test = df_test

        # label columns
        self.label_columns = label_columns
        if label_columns is not None:
            self.label_columns_indices = {name: i for i, name in enumerate(label_columns)}
        self.column_indices = {name: i for i, name in enumerate(self.df_train.columns)}

        # input slice and indices
        self.input_slice = slice(0, input_width)
        self.input_indices = np.arange(self.total_window_size)[self.input_slice]

        # label start position and slice
        self.label_start = self.total_window_size - self.label_width
        self.labels_slice = slice(self.label_start, None)
        self.label_indices = np.arange(self.total_window_size)[self.labels_slice]

    def split_to_inputs_labels(self, features):
        # Split a batch of full windows into an input part and a label part
        inputs = features[:, self.input_slice, :]
        labels = features[:, self.labels_slice, :]
        if self.label_columns is not None:
            labels = tf.stack([labels[:, :, self.column_indices[name]] for name in self.label_columns], axis=-1)
        inputs.set_shape([None, self.input_width, None])
        labels.set_shape([None, self.label_width, None])
        return inputs, labels

    def plot(self, plot_col: str, model=None, max_subplots=3):
        inputs, labels = self.sample_batch
        plt.figure(figsize=(12, 8))
        plot_col_index = self.column_indices[plot_col]
        n_max = min(max_subplots, len(inputs))
        for n in range(n_max):
            plt.subplot(n_max, 1, n+1)
            plt.ylabel(f'{plot_col} [scaled]')
            plt.plot(self.input_indices, inputs[n, :, plot_col_index], label='Inputs', marker='.', zorder=-10)
            if self.label_columns:
                label_col_index = self.label_columns_indices.get(plot_col, None)
            else:
                label_col_index = plot_col_index
            if label_col_index is None:
                continue
            plt.scatter(self.label_indices, labels[n, :, label_col_index], edgecolors='k', label='Labels', c='tab:green', s=64)
            if model is not None:
                predictions = model(inputs)
                plt.scatter(self.label_indices, predictions[n, :, label_col_index], marker='X', edgecolors='k', label='Predictions', c='tab:red', s=64)
            if n == 0:
                plt.legend()
        plt.xlabel('Time (h)')

    def make_dataset(self, data):
        # Turn a DataFrame into a batched tf.data.Dataset of (inputs, labels) windows
        data = np.array(data, dtype=np.float32)
        ds = tf.keras.utils.timeseries_dataset_from_array(
            data=data,
            targets=None,
            sequence_length=self.total_window_size,
            sequence_stride=1,
            shuffle=True,
            batch_size=32,
        )
        ds = ds.map(self.split_to_inputs_labels)
        return ds

    @property
    def train(self):
        return self.make_dataset(self.df_train)

    @property
    def val(self):
        return self.make_dataset(self.df_val)

    @property
    def test(self):
        return self.make_dataset(self.df_test)

    @property
    def sample_batch(self):
        """Get and cache an example batch of `inputs, labels` for plotting."""
        result = getattr(self, '_sample_batch', None)
        if result is None:
            result = next(iter(self.train))
            self._sample_batch = result
        return result
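To make the index bookkeeping above concrete, here is a minimal sketch (the window sizes are chosen only for illustration) that builds a window and prints the index layout computed in the constructor.

# Illustration only: 24 input steps, forecasting 1 step, 1 step ahead
demo_window = DataWindow(input_width=24, label_width=1, shift=1,
                         df_train=df_train, df_val=df_val, df_test=df_test,
                         label_columns=['traffic_volume'])
print(demo_window.total_window_size)  # 25 = input_width + shift
print(demo_window.input_indices)      # indices 0 through 23
print(demo_window.label_indices)      # [24]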
Implementing baseline models
In [49]:
# Start with a naive baseline: use the previous step as the prediction for the next step
single_step_window = DataWindow(input_width=1, label_width=1, shift=1,
                                df_train=df_train, df_val=df_val, df_test=df_test,
                                label_columns=['traffic_volume'])
wide_window = DataWindow(input_width=24, label_width=24, shift=1,
                         df_train=df_train, df_val=df_val, df_test=df_test,
                         label_columns=['traffic_volume'])
In [50]:
class Baseline(Model):
    def __init__(self, label_index=None):
        super().__init__()
        self.label_index = label_index

    def call(self, inputs):
        # No label column specified: return every feature as-is
        if self.label_index is None:
            return inputs
        # Several label columns: select each one and stack them on the feature axis
        elif isinstance(self.label_index, list):
            tensors = []
            for index in self.label_index:
                res = inputs[:, :, index]
                res = res[:, :, tf.newaxis]
                tensors.append(res)
            return tf.concat(tensors, axis=-1)
        # Single label column: return it with an explicit feature dimension
        else:
            res = inputs[:, :, self.label_index]
            return res[:, :, tf.newaxis]
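Because Baseline just echoes the selected column(s), its output already has the (batch, time, features) shape that Keras compares against the labels. A quick shape check, sketched here using the single_step_window defined above:

# Sketch: confirm the baseline's output shape matches the labels
demo_baseline = Baseline(label_index=df_train.columns.get_loc('traffic_volume'))
inputs, labels = single_step_window.sample_batch
print(inputs.shape, labels.shape, demo_baseline(inputs).shape)
# e.g. (32, 1, n_features) (32, 1, 1) (32, 1, 1)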
In [51]:
column_indices = {name: i for i, name in enumerate(df_train.columns)}
baseline_last = Baseline(column_indices['traffic_volume'])
baseline_last.compile(loss=MeanSquaredError(), metrics=[MeanAbsoluteError()])
In [52]:
performance_val = {}
performance_test = {}
performance_val['Baseline - Last'] = baseline_last.evaluate(single_step_window.val)
performance_test['Baseline - Last'] = baseline_last.evaluate(single_step_window.test, verbose=0)
110/110 [==============================] - 0s 2ms/step - loss: 684054.1250 - mean_absolute_error: 595.2283
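The MAE reported above can be cross-checked without Keras: for a last-value baseline the per-step error is just the absolute difference between consecutive observations, so a rough estimate (ignoring batching details) can be computed directly from df_val.

# Rough cross-check of the last-value baseline MAE on the validation set
y_val = df_val['traffic_volume'].to_numpy()
print(np.mean(np.abs(y_val[1:] - y_val[:-1])))  # should land close to the MAE reported above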
In [53]:
# single_step_window.plot('traffic_volume', baseline_last) is hard to read, so use wide_window to see more context
wide_window.plot('traffic_volume', baseline_last)
In [60]:
# Multi-step case: predict the next 24 hours from the previous 24 hours
multi_window = DataWindow(input_width=24, label_width=24, shift=24,
                          df_train=df_train, df_val=df_val, df_test=df_test,
                          label_columns=['traffic_volume'])

class MultiStepLastBaseline(Model):
    def __init__(self, label_index=None):
        super().__init__()
        self.label_index = label_index

    def call(self, inputs):
        # Repeat the last observed time step over the 24-step horizon
        if self.label_index is None:
            return tf.tile(inputs[:, -1:, :], [1, 24, 1])
        return tf.tile(inputs[:, -1:, self.label_index:], [1, 24, 1])

class RepeatBaseline(Model):
    def __init__(self, label_index=None):
        super().__init__()
        self.label_index = label_index

    def call(self, inputs):
        # Echo the input sequence back as the forecast
        return inputs[:, :, self.label_index:]
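The two classes differ only in how they fill the 24-step horizon: MultiStepLastBaseline tiles the last observed value, while RepeatBaseline echoes the whole input sequence. A toy tensor (illustrative values only) makes the difference visible:

# Toy batch: 1 sample, 3 time steps, 1 feature with values 1, 2, 3
toy = tf.constant([[[1.0], [2.0], [3.0]]])
print(MultiStepLastBaseline(label_index=0)(toy)[0, :, 0])  # 3.0 repeated 24 times
print(RepeatBaseline(label_index=0)(toy)[0, :, 0])         # 1.0, 2.0, 3.0 echoed back
# With input_width=24 (as in multi_window), the echoed sequence covers the full 24-step horizon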
In [61]:
ms_baseline_last = MultiStepLastBaseline(label_index=column_indices['traffic_volume'])
ms_baseline_last.compile(loss=MeanSquaredError(), metrics=[MeanAbsoluteError()])
ms_baseline_repeat = RepeatBaseline(label_index=column_indices['traffic_volume'])
ms_baseline_repeat.compile(loss=MeanSquaredError(), metrics=[MeanAbsoluteError()])
ms_val_performance = {}
ms_test_performance = {}
ms_val_performance['Baseline - Last'] = ms_baseline_last.evaluate(multi_window.val)
ms_test_performance['Baseline - Last'] = ms_baseline_last.evaluate(multi_window.test)
ms_val_performance['Baseline - Repeat'] = ms_baseline_repeat.evaluate(multi_window.val)
ms_test_performance['Baseline - Repeat'] = ms_baseline_repeat.evaluate(multi_window.test)
multi_window.plot('traffic_volume', ms_baseline_last)
multi_window.plot('traffic_volume', ms_baseline_repeat)
109/109 [==============================] - 1s 5ms/step - loss: 12769637.0000 - mean_absolute_error: 2974.6951
54/54 [==============================] - 0s 5ms/step - loss: 12498208.0000 - mean_absolute_error: 2964.0754
109/109 [==============================] - 2s 3ms/step - loss: 10588796.0000 - mean_absolute_error: 2431.1047
54/54 [==============================] - 0s 3ms/step - loss: 10346166.0000 - mean_absolute_error: 2403.8494
In [63]:
# Multi-output baseline model: predict both temperature and traffic volume
col_names = ['temp', 'traffic_volume']
mo_single_step_window = DataWindow(input_width=1, label_width=1, shift=1,
                                   df_train=df_train, df_val=df_val, df_test=df_test,
                                   label_columns=col_names)
mo_wide_window = DataWindow(input_width=24, label_width=24, shift=1,
                            df_train=df_train, df_val=df_val, df_test=df_test,
                            label_columns=col_names)
mo_baseline_last = Baseline(label_index=[column_indices[col] for col in col_names])
mo_baseline_last.compile(loss=MeanSquaredError(), metrics=[MeanAbsoluteError()])
mo_val_performance = {}
mo_test_performance = {}
# The book evaluates with mo_wide_window, but strictly speaking mo_single_step_window should be used here
mo_val_performance['Baseline - Last'] = mo_baseline_last.evaluate(mo_single_step_window.val)
# NOTE: this evaluates the validation set again; use mo_single_step_window.test for true test performance
mo_test_performance['Baseline - Last'] = mo_baseline_last.evaluate(mo_single_step_window.val)
for col in col_names:
    mo_wide_window.plot(col, mo_baseline_last)
110/110 [==============================] - 0s 2ms/step - loss: 342027.7812 - mean_absolute_error: 298.0399
110/110 [==============================] - 0s 2ms/step - loss: 342027.8438 - mean_absolute_error: 298.0400
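With all baselines evaluated, the validation MAEs collected above can be compared side by side. A minimal sketch, assuming the cells above have been run so the dictionaries are populated:

# evaluate() returns [loss, mean_absolute_error]; index 1 is the MAE
for label, perf in [('single-step', performance_val),
                    ('multi-step', ms_val_performance),
                    ('multi-output', mo_val_performance)]:
    for name, metrics in perf.items():
        print(f'{label} {name}: MAE = {metrics[1]:.2f}')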