
Commit a71545a

add model selection and seq2seq model and multi-step forecasting in example
shanyu-sys authored and shane-huang committed Mar 16, 2020
1 parent 5ec62f9 commit a71545a
Showing 9 changed files with 920 additions and 242 deletions.
976 changes: 761 additions & 215 deletions apps/automl/nyc_taxi_dataset.ipynb

Large diffs are not rendered by default.

11 changes: 4 additions & 7 deletions pyzoo/test/zoo/automl/model/test_VanillaLSTM.py
@@ -14,9 +14,6 @@
# limitations under the License.
#

-import shutil
-import tempfile

import pytest
from zoo.automl.model.VanillaLSTM import *
from zoo.automl.feature.time_sequence import TimeSequenceFeatureTransformer
@@ -43,7 +40,7 @@ def test_fit_eval(self):
"dropout_2": 0.2,
"batch_size": 32,
}
-model = VanillaLSTM(check_optional_config=False)
+model = VanillaLSTM(check_optional_config=False, future_seq_len=future_seq_len)
print("fit_eval:", model.fit_eval(x_train, y_train, **config))

def test_evaluate(self):
@@ -68,7 +65,7 @@ def test_evaluate(self):
"dropout_2": 0.2,
"batch_size": 32,
}
-model = VanillaLSTM(check_optional_config=False)
+model = VanillaLSTM(check_optional_config=False, future_seq_len=future_seq_len)
model.fit_eval(x_train, y_train, **config)
mse, rs = model.evaluate(x_val, y_val, metric=['mean_squared_error', 'r_square'])
print("Mean squared error is:", mse)
@@ -96,7 +93,7 @@ def test_predict(self):
"dropout_2": 0.2,
"batch_size": 32,
}
-model = VanillaLSTM(check_optional_config=False)
+model = VanillaLSTM(check_optional_config=False, future_seq_len=future_seq_len)
model.fit_eval(x_train, y_train, **config)
y_pred = model.predict(x_test)
assert y_pred.shape == (x_test.shape[0], 1)
@@ -124,7 +121,7 @@ def test_save_restore(self):
"batch_size": 32,
}

-model = VanillaLSTM(check_optional_config=False)
+model = VanillaLSTM(check_optional_config=False, future_seq_len=future_seq_len)
model.fit_eval(x_train, y_train, **config)
predict_before = model.predict(x_test)

37 changes: 35 additions & 2 deletions pyzoo/test/zoo/automl/pipeline/test_time_sequence.py
@@ -135,7 +135,8 @@ def test_evaluate_predict_2(self):
assert np.array_equal(mse1, mse2)
assert np.array_equal(rs1, rs2)

-def test_save_restore(self):
+def test_save_restore_1(self):
+    future_seq_len = 1
sample_num = 100
train_df = pd.DataFrame({"datetime": pd.date_range('1/1/2019', periods=sample_num),
"value": np.random.randn(sample_num)})
@@ -145,6 +146,7 @@ def test_save_restore(self):

tsp = TimeSequencePredictor(dt_col="datetime",
target_col="value",
+future_seq_len=future_seq_len,
extra_features_col=None, )
pipeline = tsp.fit(train_df)
pred = pipeline.predict(test_df)
@@ -161,6 +163,37 @@
finally:
shutil.rmtree(dirname)

+def test_save_restore_2(self):
+    future_seq_len = 2
+    sample_num = 100
+    train_df = pd.DataFrame({"datetime": pd.date_range('1/1/2019', periods=sample_num),
+                             "value": np.random.randn(sample_num)})
+    sample_num = 64
+    test_df = pd.DataFrame({"datetime": pd.date_range('1/10/2019', periods=sample_num),
+                            "value": np.random.randn(sample_num)})
+
+    tsp = TimeSequencePredictor(dt_col="datetime",
+                                target_col="value",
+                                future_seq_len=future_seq_len,
+                                extra_features_col=None, )
+    pipeline = tsp.fit(train_df)
+    pred = pipeline.predict(test_df)
+
+    dirname = tempfile.mkdtemp(prefix="saved_pipeline")
+    try:
+        save_pipeline_file = os.path.join(dirname, "my.ppl")
+        pipeline.save(save_pipeline_file)
+        assert os.path.isfile(save_pipeline_file)
+        new_pipeline = load_ts_pipeline(save_pipeline_file)
+
+        new_pred = new_pipeline.predict(test_df)
+        print(pred)
+        print(new_pred)
+        columns = ["value_{}".format(i) for i in range(future_seq_len)]
+        np.testing.assert_allclose(pred[columns].values, new_pred[columns].values)
+    finally:
+        shutil.rmtree(dirname)


if __name__ == '__main__':
-pytest.main([__file__])
+pytest.main([__file__])
15 changes: 8 additions & 7 deletions pyzoo/zoo/automl/common/util.py
@@ -33,16 +33,17 @@ def split_input_df(input_df, val_split_ratio=0, test_split_ratio=0.1):
:return:
"""
# suitable to nyc taxi dataset.
-input_df.insert(loc=0, column="datetime", value=pd.to_datetime(input_df["timestamp"]))
+df = input_df.copy()
+df.insert(loc=0, column="datetime", value=pd.to_datetime(input_df["timestamp"]))
# input_df["datetime"] = pd.to_datetime(input_df["timestamp"])
-input_df = input_df.drop(columns="timestamp")
+df = df.drop(columns="timestamp")

-val_size = int(len(input_df) * val_split_ratio)
-test_size = int(len(input_df) * test_split_ratio)
+val_size = int(len(df) * val_split_ratio)
+test_size = int(len(df) * test_split_ratio)

-train_df = input_df.iloc[:-(test_size + val_size)].copy()
-val_df = input_df.iloc[-(test_size + val_size):-test_size].copy()
-test_df = input_df.iloc[-test_size:].copy()
+train_df = df.iloc[:-(test_size + val_size)]
+val_df = df.iloc[-(test_size + val_size):-test_size]
+test_df = df.iloc[-test_size:]

val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)
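The hunk above makes split_input_df non-destructive: it now works on a copy instead of inserting and dropping columns on the caller's DataFrame. A minimal sketch of the resulting behavior, assuming the function returns (train_df, val_df, test_df); the return statement sits outside this hunk, so that part is an assumption:

    import numpy as np
    import pandas as pd
    from zoo.automl.common.util import split_input_df

    # A raw frame in the nyc taxi format: a "timestamp" column plus a target column.
    raw_df = pd.DataFrame({"timestamp": pd.date_range('1/1/2019', periods=100),
                           "value": np.random.randn(100)})
    train_df, val_df, test_df = split_input_df(raw_df, val_split_ratio=0.1, test_split_ratio=0.1)

    # The caller's frame is no longer mutated in place ...
    assert "timestamp" in raw_df.columns
    # ... while the returned splits carry the converted "datetime" column.
    assert "datetime" in train_df.columns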
8 changes: 2 additions & 6 deletions pyzoo/zoo/automl/model/VanillaLSTM.py
@@ -26,16 +26,13 @@

class VanillaLSTM(BaseModel):

-def __init__(self, check_optional_config=True):
+def __init__(self, check_optional_config=True, future_seq_len=1):
"""
Constructor of Vanilla LSTM model
"""
self.model = None
self.check_optional_config = check_optional_config
-self.future_seq_len = None
-
-def _get_len(self, y):
-    self.future_seq_len = y.shape[1]
+self.future_seq_len = future_seq_len

def _build(self, **config):
"""
@@ -78,7 +75,6 @@ def fit_eval(self, x, y, validation_data=None, **config):
:param config: optimization hyper parameters
:return: the resulting metric
"""
-self._get_len(y)
# if model is not initialized, __build the model
if self.model is None:
self._build(**config)
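With this change the forecast horizon is fixed at construction time rather than inferred from y inside fit_eval. A short sketch of the updated VanillaLSTM API, following the tests above; the array shapes are illustrative, and any hyperparameters not visible in the tests are left at their defaults:

    import numpy as np
    from zoo.automl.model.VanillaLSTM import VanillaLSTM

    # Illustrative shapes: 100 samples, a past window of 50 steps, 4 features.
    x_train = np.random.randn(100, 50, 4)
    y_train = np.random.randn(100, 1)
    x_test = np.random.randn(10, 50, 4)

    config = {
        "dropout_1": 0.2,   # keys taken from the tests above; other hyperparameters elided
        "dropout_2": 0.2,
        "batch_size": 32,
    }
    model = VanillaLSTM(check_optional_config=False, future_seq_len=1)
    model.fit_eval(x_train, y_train, **config)
    y_pred = model.predict(x_test)   # expected shape (10, 1), matching test_predict above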
1 change: 1 addition & 0 deletions pyzoo/zoo/automl/model/abstract.py
@@ -23,6 +23,7 @@ class BaseModel(ABC):
"""

check_optional_config = False
+future_seq_len = None

@abstractmethod
def fit_eval(self, x, y, validation_data=None, **config):
102 changes: 102 additions & 0 deletions pyzoo/zoo/automl/model/time_sequence.py
@@ -0,0 +1,102 @@
#
# Copyright 2018 Analytics Zoo Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from .abstract import BaseModel
from .VanillaLSTM import VanillaLSTM
from .Seq2Seq import LSTMSeq2Seq


class TimeSequenceModel(BaseModel):
"""
TimeSequenceModel wraps model selection: it chooses the underlying forecasting model based on the future sequence length.
"""
def __init__(self, check_optional_config=False, future_seq_len=None):
"""
Constructor of the time sequence model
:param check_optional_config: whether to check optional config values
:param future_seq_len: the future sequence length used for model selection
"""
if future_seq_len:
self._model_selection(future_seq_len, check_optional_config)

def _model_selection(self, future_seq_len, check_optional_config=False, verbose=1):
if future_seq_len == 1:
self.model = VanillaLSTM(check_optional_config=check_optional_config,
future_seq_len=future_seq_len)
if verbose == 1:
print("Model selection: Vanilla LSTM model is selected.")
else:
self.model = LSTMSeq2Seq(check_optional_config=check_optional_config,
future_seq_len=future_seq_len)
if verbose == 1:
print("Model selection: LSTM Seq2Seq model is selected.")

def fit_eval(self, x, y, validation_data=None, **config):
"""
fit for one iteration
:param x: 3-d array in format (no. of samples, past sequence length, 2+feature length), in the last
dimension, the 1st col is the time index (data type needs to be numpy datetime type, e.g. "datetime64"),
the 2nd col is the target value (data type should be numeric)
:param y: 2-d numpy array in format (no. of samples, future sequence length) if future sequence length > 1,
or 1-d numpy array in format (no. of samples, ) if future sequence length = 1
:param validation_data: tuple in format (x_test,y_test), data used for validation. If this is specified,
validation result will be the optimization target for automl. Otherwise, train metric will be the optimization
target.
:param config: optimization hyper parameters
:return: the resulting metric
"""
return self.model.fit_eval(x, y, validation_data=validation_data, **config)

def evaluate(self, x, y, metric=['mean_squared_error']):
"""
Evaluate on x, y
:param x: input
:param y: target
:param metric: a list of metrics in string format
:return: a list of metric evaluation results
"""
return self.model.evaluate(x, y, metric)

def predict(self, x):
"""
Prediction on x.
:param x: input
:return: predicted y
"""
return self.model.predict(x)

def save(self, model_path, config_path):
"""
save model to file.
:param model_path: the model file.
:param config_path: the config file
:return:
"""
self.model.save(model_path, config_path)

def restore(self, model_path, **config):
self._model_selection(future_seq_len=config["future_seq_len"], verbose=0)
self.model.restore(model_path, **config)

def _get_required_parameters(self):
return self.model._get_required_parameters()

def _get_optional_parameters(self):
return self.model._get_optional_parameters()





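The new TimeSequenceModel wrapper selects the underlying model from the forecast horizon: future_seq_len of 1 gives a VanillaLSTM, anything longer gives an LSTMSeq2Seq, and fit_eval, evaluate, predict, save and restore are delegated to the selected model. A minimal usage sketch:

    from zoo.automl.model import TimeSequenceModel

    single_step = TimeSequenceModel(check_optional_config=False, future_seq_len=1)
    # prints "Model selection: Vanilla LSTM model is selected."

    multi_step = TimeSequenceModel(check_optional_config=False, future_seq_len=3)
    # prints "Model selection: LSTM Seq2Seq model is selected."

On restore, the wrapper re-runs this selection from the saved config's future_seq_len before loading the underlying model.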
5 changes: 3 additions & 2 deletions pyzoo/zoo/automl/pipeline/time_sequence.py
@@ -20,7 +20,7 @@
from zoo.automl.pipeline.abstract import Pipeline
from zoo.automl.common.util import *
from zoo.automl.feature.time_sequence import TimeSequenceFeatureTransformer
-from zoo.automl.model import VanillaLSTM
+from zoo.automl.model import TimeSequenceModel


class TimeSequencePipeline(Pipeline):
@@ -69,7 +69,8 @@ def save(self, file):

def load_ts_pipeline(file):
feature_transformers = TimeSequenceFeatureTransformer()
-model = VanillaLSTM(check_optional_config=False)
+model = TimeSequenceModel(check_optional_config=False)

restore_zip(file, feature_transformers, model)
ts_pipeline = TimeSequencePipeline(feature_transformers, model)
print("Restore pipeline from", file)
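Because load_ts_pipeline now restores a TimeSequenceModel rather than a hard-coded VanillaLSTM, a saved pipeline can be reloaded regardless of its forecast horizon. A save/load roundtrip modeled on the tests above; the data is random and purely illustrative, and import paths are inferred from the file locations in this commit:

    import os
    import tempfile
    import numpy as np
    import pandas as pd
    from zoo.automl.regression.time_sequence_predictor import TimeSequencePredictor
    from zoo.automl.pipeline.time_sequence import load_ts_pipeline

    train_df = pd.DataFrame({"datetime": pd.date_range('1/1/2019', periods=100),
                             "value": np.random.randn(100)})
    test_df = pd.DataFrame({"datetime": pd.date_range('1/10/2019', periods=64),
                            "value": np.random.randn(64)})

    pipeline = TimeSequencePredictor(dt_col="datetime", target_col="value",
                                     future_seq_len=1, extra_features_col=None).fit(train_df)

    save_file = os.path.join(tempfile.mkdtemp(), "my.ppl")
    pipeline.save(save_file)
    new_pipeline = load_ts_pipeline(save_file)       # rebuilds the model via TimeSequenceModel
    new_pred = new_pipeline.predict(test_df)         # matches pipeline.predict(test_df)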
7 changes: 4 additions & 3 deletions pyzoo/zoo/automl/regression/time_sequence_predictor.py
@@ -26,7 +26,7 @@
from zoo.automl.common.metrics import Evaluator
from zoo.automl.feature.time_sequence import TimeSequenceFeatureTransformer

-from zoo.automl.model import VanillaLSTM
+from zoo.automl.model import TimeSequenceModel
from zoo.automl.pipeline.time_sequence import TimeSequencePipeline
from zoo.automl.common.util import *
from abc import ABC, abstractmethod
@@ -224,8 +224,9 @@ def _hp_search(self,
self.drop_missing)

feature_list = ft.get_feature_list(input_df)
-# model
-model = VanillaLSTM(check_optional_config=False)
+
+# model = VanillaLSTM(check_optional_config=False)
+model = TimeSequenceModel(check_optional_config=False, future_seq_len=self.future_seq_len)

# prepare parameters for search engine
search_space = recipe.search_space(feature_list)
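With TimeSequenceModel plugged into the hyper-parameter search, TimeSequencePredictor now supports multi-step forecasting directly through future_seq_len: any value above 1 routes to the LSTM Seq2Seq model and the prediction gains one output column per forecast step. A sketch following the tests in test_time_sequence.py:

    import numpy as np
    import pandas as pd
    from zoo.automl.regression.time_sequence_predictor import TimeSequencePredictor

    train_df = pd.DataFrame({"datetime": pd.date_range('1/1/2019', periods=100),
                             "value": np.random.randn(100)})
    test_df = pd.DataFrame({"datetime": pd.date_range('1/10/2019', periods=64),
                            "value": np.random.randn(64)})

    tsp = TimeSequencePredictor(dt_col="datetime",
                                target_col="value",
                                future_seq_len=2,       # > 1 selects the LSTM Seq2Seq model
                                extra_features_col=None)
    pipeline = tsp.fit(train_df)
    pred = pipeline.predict(test_df)
    print(pred[["value_0", "value_1"]].head())          # one output column per forecast step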
