
Commit a71545a

add model selection and seq2seq model and multi-step forecasting in example
shanyu-sys authored and shane-huang committed Mar 16, 2020
1 parent 5ec62f9 commit a71545a
Showing 9 changed files with 920 additions and 242 deletions.
976 changes: 761 additions & 215 deletions apps/automl/nyc_taxi_dataset.ipynb

Large diffs are not rendered by default.

11 changes: 4 additions & 7 deletions pyzoo/test/zoo/automl/model/test_VanillaLSTM.py
@@ -14,9 +14,6 @@
# limitations under the License.
#

-import shutil
-import tempfile

import pytest
from zoo.automl.model.VanillaLSTM import *
from zoo.automl.feature.time_sequence import TimeSequenceFeatureTransformer
@@ -43,7 +40,7 @@ def test_fit_eval(self):
"dropout_2": 0.2,
"batch_size": 32,
}
-model = VanillaLSTM(check_optional_config=False)
+model = VanillaLSTM(check_optional_config=False, future_seq_len=future_seq_len)
print("fit_eval:", model.fit_eval(x_train, y_train, **config))

def test_evaluate(self):
@@ -68,7 +65,7 @@ def test_evaluate(self):
"dropout_2": 0.2,
"batch_size": 32,
}
-model = VanillaLSTM(check_optional_config=False)
+model = VanillaLSTM(check_optional_config=False, future_seq_len=future_seq_len)
model.fit_eval(x_train, y_train, **config)
mse, rs = model.evaluate(x_val, y_val, metric=['mean_squared_error', 'r_square'])
print("Mean squared error is:", mse)
@@ -96,7 +93,7 @@ def test_predict(self):
"dropout_2": 0.2,
"batch_size": 32,
}
-model = VanillaLSTM(check_optional_config=False)
+model = VanillaLSTM(check_optional_config=False, future_seq_len=future_seq_len)
model.fit_eval(x_train, y_train, **config)
y_pred = model.predict(x_test)
assert y_pred.shape == (x_test.shape[0], 1)
@@ -124,7 +121,7 @@ def test_save_restore(self):
"batch_size": 32,
}

-model = VanillaLSTM(check_optional_config=False)
+model = VanillaLSTM(check_optional_config=False, future_seq_len=future_seq_len)
model.fit_eval(x_train, y_train, **config)
predict_before = model.predict(x_test)

37 changes: 35 additions & 2 deletions pyzoo/test/zoo/automl/pipeline/test_time_sequence.py
@@ -135,7 +135,8 @@ def test_evaluate_predict_2(self):
assert np.array_equal(mse1, mse2)
assert np.array_equal(rs1, rs2)

-def test_save_restore(self):
+def test_save_restore_1(self):
+    future_seq_len = 1
sample_num = 100
train_df = pd.DataFrame({"datetime": pd.date_range('1/1/2019', periods=sample_num),
"value": np.random.randn(sample_num)})
@@ -145,6 +146,7 @@ def test_save_restore(self):

tsp = TimeSequencePredictor(dt_col="datetime",
target_col="value",
+future_seq_len=future_seq_len,
extra_features_col=None, )
pipeline = tsp.fit(train_df)
pred = pipeline.predict(test_df)
@@ -161,6 +163,37 @@
finally:
shutil.rmtree(dirname)

+def test_save_restore_2(self):
+    future_seq_len = 2
+    sample_num = 100
+    train_df = pd.DataFrame({"datetime": pd.date_range('1/1/2019', periods=sample_num),
+                             "value": np.random.randn(sample_num)})
+    sample_num = 64
+    test_df = pd.DataFrame({"datetime": pd.date_range('1/10/2019', periods=sample_num),
+                            "value": np.random.randn(sample_num)})
+
+    tsp = TimeSequencePredictor(dt_col="datetime",
+                                target_col="value",
+                                future_seq_len=future_seq_len,
+                                extra_features_col=None, )
+    pipeline = tsp.fit(train_df)
+    pred = pipeline.predict(test_df)
+
+    dirname = tempfile.mkdtemp(prefix="saved_pipeline")
+    try:
+        save_pipeline_file = os.path.join(dirname, "my.ppl")
+        pipeline.save(save_pipeline_file)
+        assert os.path.isfile(save_pipeline_file)
+        new_pipeline = load_ts_pipeline(save_pipeline_file)
+
+        new_pred = new_pipeline.predict(test_df)
+        print(pred)
+        print(new_pred)
+        columns = ["value_{}".format(i) for i in range(future_seq_len)]
+        np.testing.assert_allclose(pred[columns].values, new_pred[columns].values)
+    finally:
+        shutil.rmtree(dirname)


if __name__ == '__main__':
-pytest.main([__file__])
+pytest.main([__file__])
15 changes: 8 additions & 7 deletions pyzoo/zoo/automl/common/util.py
@@ -33,16 +33,17 @@ def split_input_df(input_df, val_split_ratio=0, test_split_ratio=0.1):
:return:
"""
# suitable to nyc taxi dataset.
-input_df.insert(loc=0, column="datetime", value=pd.to_datetime(input_df["timestamp"]))
+df = input_df.copy()
+df.insert(loc=0, column="datetime", value=pd.to_datetime(input_df["timestamp"]))
# input_df["datetime"] = pd.to_datetime(input_df["timestamp"])
-input_df = input_df.drop(columns="timestamp")
+df = df.drop(columns="timestamp")

-val_size = int(len(input_df) * val_split_ratio)
-test_size = int(len(input_df) * test_split_ratio)
+val_size = int(len(df) * val_split_ratio)
+test_size = int(len(df) * test_split_ratio)

-train_df = input_df.iloc[:-(test_size + val_size)].copy()
-val_df = input_df.iloc[-(test_size + val_size):-test_size].copy()
-test_df = input_df.iloc[-test_size:].copy()
+train_df = df.iloc[:-(test_size + val_size)]
+val_df = df.iloc[-(test_size + val_size):-test_size]
+test_df = df.iloc[-test_size:]

val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)
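The hunk above makes split_input_df non-destructive: it now works on a copy instead of inserting and dropping columns on the caller's DataFrame. A minimal sketch of the resulting behavior, assuming the function returns (train_df, val_df, test_df); the return statement sits outside this hunk, so that part is an assumption:

    import numpy as np
    import pandas as pd
    from zoo.automl.common.util import split_input_df

    # A raw frame in the nyc taxi format: a "timestamp" column plus a target column.
    raw_df = pd.DataFrame({"timestamp": pd.date_range('1/1/2019', periods=100),
                           "value": np.random.randn(100)})
    train_df, val_df, test_df = split_input_df(raw_df, val_split_ratio=0.1, test_split_ratio=0.1)

    # The caller's frame is no longer mutated in place ...
    assert "timestamp" in raw_df.columns
    # ... while the returned splits carry the converted "datetime" column.
    assert "datetime" in train_df.columns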
8 changes: 2 additions & 6 deletions pyzoo/zoo/automl/model/VanillaLSTM.py
@@ -26,16 +26,13 @@

class VanillaLSTM(BaseModel):

-def __init__(self, check_optional_config=True):
+def __init__(self, check_optional_config=True, future_seq_len=1):
"""
Constructor of Vanilla LSTM model
"""
self.model = None
self.check_optional_config = check_optional_config
-self.future_seq_len = None
-
-def _get_len(self, y):
-    self.future_seq_len = y.shape[1]
+self.future_seq_len = future_seq_len

def _build(self, **config):
"""
@@ -78,7 +75,6 @@ def fit_eval(self, x, y, validation_data=None, **config):
:param config: optimization hyper parameters
:return: the resulting metric
"""
-self._get_len(y)
# if model is not initialized, __build the model
if self.model is None:
self._build(**config)
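With this change the forecast horizon is fixed at construction time rather than inferred from y inside fit_eval. A short sketch of the updated VanillaLSTM API, following the tests above; the array shapes are illustrative, and any hyperparameters not visible in the tests are left at their defaults:

    import numpy as np
    from zoo.automl.model.VanillaLSTM import VanillaLSTM

    # Illustrative shapes: 100 samples, a past window of 50 steps, 4 features.
    x_train = np.random.randn(100, 50, 4)
    y_train = np.random.randn(100, 1)
    x_test = np.random.randn(10, 50, 4)

    config = {
        "dropout_1": 0.2,   # keys taken from the tests above; other hyperparameters elided
        "dropout_2": 0.2,
        "batch_size": 32,
    }
    model = VanillaLSTM(check_optional_config=False, future_seq_len=1)
    model.fit_eval(x_train, y_train, **config)
    y_pred = model.predict(x_test)   # expected shape (10, 1), matching test_predict above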
1 change: 1 addition & 0 deletions pyzoo/zoo/automl/model/abstract.py
@@ -23,6 +23,7 @@ class BaseModel(ABC):
"""

check_optional_config = False
+future_seq_len = None

@abstractmethod
def fit_eval(self, x, y, validation_data=None, **config):
102 changes: 102 additions & 0 deletions pyzoo/zoo/automl/model/time_sequence.py
@@ -0,0 +1,102 @@
#
# Copyright 2018 Analytics Zoo Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from .abstract import BaseModel
from .VanillaLSTM import VanillaLSTM
from .Seq2Seq import LSTMSeq2Seq


class TimeSequenceModel(BaseModel):
"""
TimeSequenceModel wraps model selection: it chooses the underlying forecasting model based on the future sequence length.
"""
def __init__(self, check_optional_config=False, future_seq_len=None):
"""
Constructor of the time sequence model
:param check_optional_config: whether to check optional config values
:param future_seq_len: the future sequence length used for model selection
"""
if future_seq_len:
self._model_selection(future_seq_len, check_optional_config)

def _model_selection(self, future_seq_len, check_optional_config=False, verbose=1):
if future_seq_len == 1:
self.model = VanillaLSTM(check_optional_config=check_optional_config,
future_seq_len=future_seq_len)
if verbose == 1:
print("Model selection: Vanilla LSTM model is selected.")
else:
self.model = LSTMSeq2Seq(check_optional_config=check_optional_config,
future_seq_len=future_seq_len)
if verbose == 1:
print("Model selection: LSTM Seq2Seq model is selected.")

def fit_eval(self, x, y, validation_data=None, **config):
"""
fit for one iteration
:param x: 3-d array in format (no. of samples, past sequence length, 2+feature length), in the last
dimension, the 1st col is the time index (data type needs to be numpy datetime type, e.g. "datetime64"),
the 2nd col is the target value (data type should be numeric)
:param y: 2-d numpy array in format (no. of samples, future sequence length) if future sequence length > 1,
or 1-d numpy array in format (no. of samples, ) if future sequence length = 1
:param validation_data: tuple in format (x_test,y_test), data used for validation. If this is specified,
validation result will be the optimization target for automl. Otherwise, train metric will be the optimization
target.
:param config: optimization hyper parameters
:return: the resulting metric
"""
return self.model.fit_eval(x, y, validation_data=validation_data, **config)

def evaluate(self, x, y, metric=['mean_squared_error']):
"""
Evaluate on x, y
:param x: input
:param y: target
:param metric: a list of metrics in string format
:return: a list of metric evaluation results
"""
return self.model.evaluate(x, y, metric)

def predict(self, x):
"""
Prediction on x.
:param x: input
:return: predicted y
"""
return self.model.predict(x)

def save(self, model_path, config_path):
"""
save model to file.
:param model_path: the model file.
:param config_path: the config file
:return:
"""
self.model.save(model_path, config_path)

def restore(self, model_path, **config):
self._model_selection(future_seq_len=config["future_seq_len"], verbose=0)
self.model.restore(model_path, **config)

def _get_required_parameters(self):
return self.model._get_required_parameters()

def _get_optional_parameters(self):
return self.model._get_optional_parameters()





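The new TimeSequenceModel wrapper selects the underlying model from the forecast horizon: future_seq_len of 1 gives a VanillaLSTM, anything longer gives an LSTMSeq2Seq, and fit_eval, evaluate, predict, save and restore are delegated to the selected model. A minimal usage sketch:

    from zoo.automl.model import TimeSequenceModel

    single_step = TimeSequenceModel(check_optional_config=False, future_seq_len=1)
    # prints "Model selection: Vanilla LSTM model is selected."

    multi_step = TimeSequenceModel(check_optional_config=False, future_seq_len=3)
    # prints "Model selection: LSTM Seq2Seq model is selected."

On restore, the wrapper re-runs this selection from the saved config's future_seq_len before loading the underlying model.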
5 changes: 3 additions & 2 deletions pyzoo/zoo/automl/pipeline/time_sequence.py
@@ -20,7 +20,7 @@
from zoo.automl.pipeline.abstract import Pipeline
from zoo.automl.common.util import *
from zoo.automl.feature.time_sequence import TimeSequenceFeatureTransformer
-from zoo.automl.model import VanillaLSTM
+from zoo.automl.model import TimeSequenceModel


class TimeSequencePipeline(Pipeline):
@@ -69,7 +69,8 @@ def save(self, file):

def load_ts_pipeline(file):
feature_transformers = TimeSequenceFeatureTransformer()
-model = VanillaLSTM(check_optional_config=False)
+model = TimeSequenceModel(check_optional_config=False)

restore_zip(file, feature_transformers, model)
ts_pipeline = TimeSequencePipeline(feature_transformers, model)
print("Restore pipeline from", file)
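Because load_ts_pipeline now restores a TimeSequenceModel rather than a hard-coded VanillaLSTM, a saved pipeline can be reloaded regardless of its forecast horizon. A save/load roundtrip modeled on the tests above; the data is random and purely illustrative, and import paths are inferred from the file locations in this commit:

    import os
    import tempfile
    import numpy as np
    import pandas as pd
    from zoo.automl.regression.time_sequence_predictor import TimeSequencePredictor
    from zoo.automl.pipeline.time_sequence import load_ts_pipeline

    train_df = pd.DataFrame({"datetime": pd.date_range('1/1/2019', periods=100),
                             "value": np.random.randn(100)})
    test_df = pd.DataFrame({"datetime": pd.date_range('1/10/2019', periods=64),
                            "value": np.random.randn(64)})

    pipeline = TimeSequencePredictor(dt_col="datetime", target_col="value",
                                     future_seq_len=1, extra_features_col=None).fit(train_df)

    save_file = os.path.join(tempfile.mkdtemp(), "my.ppl")
    pipeline.save(save_file)
    new_pipeline = load_ts_pipeline(save_file)       # rebuilds the model via TimeSequenceModel
    new_pred = new_pipeline.predict(test_df)         # matches pipeline.predict(test_df)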
7 changes: 4 additions & 3 deletions pyzoo/zoo/automl/regression/time_sequence_predictor.py
@@ -26,7 +26,7 @@
from zoo.automl.common.metrics import Evaluator
from zoo.automl.feature.time_sequence import TimeSequenceFeatureTransformer

-from zoo.automl.model import VanillaLSTM
+from zoo.automl.model import TimeSequenceModel
from zoo.automl.pipeline.time_sequence import TimeSequencePipeline
from zoo.automl.common.util import *
from abc import ABC, abstractmethod
@@ -224,8 +224,9 @@ def _hp_search(self,
self.drop_missing)

feature_list = ft.get_feature_list(input_df)
-# model
-model = VanillaLSTM(check_optional_config=False)
+
+# model = VanillaLSTM(check_optional_config=False)
+model = TimeSequenceModel(check_optional_config=False, future_seq_len=self.future_seq_len)

# prepare parameters for search engine
search_space = recipe.search_space(feature_list)
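With TimeSequenceModel plugged into the hyper-parameter search, TimeSequencePredictor now supports multi-step forecasting directly through future_seq_len: any value above 1 routes to the LSTM Seq2Seq model and the prediction gains one output column per forecast step. A sketch following the tests in test_time_sequence.py:

    import numpy as np
    import pandas as pd
    from zoo.automl.regression.time_sequence_predictor import TimeSequencePredictor

    train_df = pd.DataFrame({"datetime": pd.date_range('1/1/2019', periods=100),
                             "value": np.random.randn(100)})
    test_df = pd.DataFrame({"datetime": pd.date_range('1/10/2019', periods=64),
                            "value": np.random.randn(64)})

    tsp = TimeSequencePredictor(dt_col="datetime",
                                target_col="value",
                                future_seq_len=2,       # > 1 selects the LSTM Seq2Seq model
                                extra_features_col=None)
    pipeline = tsp.fit(train_df)
    pred = pipeline.predict(test_df)
    print(pred[["value_0", "value_1"]].head())          # one output column per forecast step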
