From 90b9145ac16af174216595f83c26904926f6b299 Mon Sep 17 00:00:00 2001 From: tailaiw <29800495+tailaiw@users.noreply.github.com> Date: Tue, 18 Feb 2020 14:01:05 -0600 Subject: [PATCH] Optimized the logic applying univariate model to DataFrame (#67) * optimized applying univariate model to DF * updated version number and changelogs * minor optimization * Fixed a bug that model trained with Series cannot be applied to DataFrame due to name matching error * modified docstrings * updated version number * updated changelog --- docs/conf.py | 2 +- docs/releasehistory.rst | 7 +++ setup.cfg | 2 +- src/adtk/__init__.py | 2 +- src/adtk/_base.py | 79 +++++++++++++++++++++----- src/adtk/_detector_base.py | 21 ++++--- src/adtk/_transformer_base.py | 16 ++++-- src/adtk/detector/detector_1d.py | 77 +------------------------ src/adtk/transformer/transformer_1d.py | 60 +------------------ tests/test_detector1d.py | 21 +++++++ tests/test_series_name.py | 44 ++++++++++++-- 11 files changed, 164 insertions(+), 167 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index d6fabd9..51b976d 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -66,7 +66,7 @@ # The short X.Y version. version = "0.5" # The full version, including alpha/beta/rc tags. -release = "0.5.3" +release = "0.5.4" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
diff --git a/docs/releasehistory.rst b/docs/releasehistory.rst index b86b061..f62eb85 100644 --- a/docs/releasehistory.rst +++ b/docs/releasehistory.rst @@ -2,6 +2,13 @@ Release History *************** +Version 0.5.4 (Feb 18, 2020) +=================================== +- Optimized the workflow of how a univariate model is applied to pandas DataFrame + - Added more informative error messages + - Fixed some bugs resulting in model-column matching error due to inconsistency between output Series names and DataFrame columns + - Clarified the workflow in the documentation + Version 0.5.3 (Feb 12, 2020) =================================== - Quick hotfix to avoid errors caused by statsmodels v0.11 by requiring statsmodels dependency <0.11 diff --git a/setup.cfg b/setup.cfg index 2127f22..3ad827b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = adtk -version = 0.5.3 +version = 0.5.4 author = Arundo Analytics, Inc. maintainer = Tailai Wen maintainer_email = tailai.wen@arundo.com diff --git a/src/adtk/__init__.py b/src/adtk/__init__.py index ef749f2..19cf1b7 100644 --- a/src/adtk/__init__.py +++ b/src/adtk/__init__.py @@ -20,4 +20,4 @@ """ -__version__ = "0.5.3" +__version__ = "0.5.4" diff --git a/src/adtk/_base.py b/src/adtk/_base.py index cdb57ca..268663c 100644 --- a/src/adtk/_base.py +++ b/src/adtk/_base.py @@ -11,7 +11,9 @@ class _Model(ABC): def __init__(self, **kwargs): for key, value in kwargs.items(): setattr(self, key, value) - self._fitted = False + self._fitted = ( + 0 + ) # 0 for not fitted, 1 for fitted, 2 for univariate model fitted by DF @abstractmethod def _fit(self, ts): @@ -91,21 +93,35 @@ def _fit(self, ts): s = ts.copy() self._fit_core(s) self._models = None + self._fitted = 1 elif isinstance(ts, pd.DataFrame): df = ts.copy() + if df.columns.duplicated().any(): + raise ValueError( + "Input DataFrame must have unique column names." 
+ ) if self._need_fit: self._update_models(df.columns) # fit model for each column for col in df.columns: self._models[col].fit(df[col]) + self._fitted = 2 + else: + pass else: raise TypeError("Input must be a pandas Series or DataFrame.") - self._fitted = True def _predict(self, ts): - if self._need_fit and (not self._fitted): + if self._need_fit and (self._fitted == 0): raise RuntimeError("The model must be trained first.") if isinstance(ts, pd.Series): + if self._need_fit and ( + self._fitted == 2 + ): # fitted by DF, to be applied to Series + raise RuntimeError( + "The model was trained by a pandas DataFrame object, " + "it can only be applied to a pandas DataFrame object." + ) s = ts.copy() predicted = self._predict_core(s) # if a Series-to-Series operation, make sure Series name keeps @@ -113,15 +129,42 @@ def _predict(self, ts): predicted.name = ts.name elif isinstance(ts, pd.DataFrame): df = ts.copy() - # if the model doesn't neef fit, initialize or reset a model for - # each column - if not self._need_fit: - self._update_models(df.columns) - # predict for each column - predicted = pd.concat( - [self._models[col]._predict(df[col]) for col in df.columns], - axis=1, - ) + if df.columns.duplicated().any(): + raise ValueError( + "Input DataFrame must have unique column names." 
+ ) + if (not self._need_fit) or (self._fitted == 1): + # apply the model to each column + predicted = [] + for col in df.columns: + predicted_this_col = self._predict(df[col]) + if isinstance(predicted_this_col, pd.DataFrame): + predicted_this_col = predicted_this_col.rename( + columns={ + col1: "{}_{}".format(col, col1) + for col1 in predicted_this_col.columns + } + ) + predicted.append(predicted_this_col) + predicted = pd.concat(predicted, axis=1) + else: + # predict for each column + if not (set(self._models.keys()) >= set(df.columns)): + raise ValueError( + "The model was trained by a pandas DataFrame with " + "columns {}, but the input DataFrame contains columns " + "{} which are unknown to the model.".format( + list(set(self._models.keys())), + list(set(df.columns) - set(self._models.keys())), + ) + ) + predicted = pd.concat( + [ + self._models[col]._predict(df[col]) + for col in df.columns + ], + axis=1, + ) else: raise TypeError("Input must be a pandas Series or DataFrame.") # make sure index freq is the same (because pandas has a bug that some @@ -153,16 +196,24 @@ def fit_predict(self, ts): class _ModelHD(_Model): def _fit(self, df): if isinstance(df, pd.DataFrame): + if df.columns.duplicated().any(): + raise ValueError( + "Input DataFrame must have unique column names." + ) df_copy = df.copy() self._fit_core(df_copy) else: raise TypeError("Input must be a pandas DataFrame.") - self._fitted = True + self._fitted = 1 def _predict(self, df): - if self._need_fit and (not self._fitted): + if self._need_fit and (self._fitted == 0): raise RuntimeError("The model must be trained first.") if isinstance(df, pd.DataFrame): + if df.columns.duplicated().any(): + raise ValueError( + "Input DataFrame must have unique column names." 
+ ) df_copy = df.copy() predicted = self._predict_core(df_copy) else: diff --git a/src/adtk/_detector_base.py b/src/adtk/_detector_base.py index d855290..e9eb4bf 100644 --- a/src/adtk/_detector_base.py +++ b/src/adtk/_detector_base.py @@ -25,9 +25,14 @@ def detect(self, ts, return_list=False): Parameters ---------- ts: pandas.Series or pandas.DataFrame - Time series to detect anomalies from. - If a DataFrame with k columns, k univariate detectors will be - applied to them independently. + Time series to detect anomalies from. If a DataFrame with k + columns, it is treated as k independent univariate time series. + + - If the detector was trained with a Series, the detector will be + applied to each univariate series independently; + - If the detector was trained with a DataFrame, i.e. the detector + is essentially k detectors, those detectors will be applied to + each univariate series respectively. return_list: bool, optional Whether to return a list of anomalous time stamps, or a binary @@ -66,8 +71,9 @@ def fit_detect(self, ts, return_list=False): ---------- ts: pandas.Series or pandas.DataFrame Time series to be used for training and be detected for anomalies. - If a DataFrame with k columns, k univariate detectors will be - trained and applied to them independently. + If a DataFrame with k columns, it is treated as k independent + univariate time series, and k univariate detectors will be trained + and applied to each series independently. return_list: bool, optional Whether to return a list of anomalous time stamps, or a binary @@ -109,8 +115,9 @@ def score(self, ts, anomaly_true, scoring="recall", **kwargs): ---------- ts: pandas Series or pandas.DataFrame Time series to detect anomalies from. - If a DataFrame with k columns, k univariate detectors will be - applied to them independently.
+ If a DataFrame with k columns, it is treated as k independent + univariate time series, and the detector(s) will be applied to + each series independently. anomaly_true: pandas.Series, pandas.DataFrame, list, or dict True anomalies. diff --git a/src/adtk/_transformer_base.py b/src/adtk/_transformer_base.py index ec49b2c..823cebf 100644 --- a/src/adtk/_transformer_base.py +++ b/src/adtk/_transformer_base.py @@ -21,9 +21,14 @@ def transform(self, ts): Parameters ---------- ts: pandas.Series or pandas.DataFrame - Time series to be transformed. - If a DataFrame with k columns, k univariate transformers will be - applied to them independently. + Time series to be transformed. If a DataFrame with k columns, it is + treated as k independent univariate time series. + + - If the transformer was trained with a Series, the transformer + will be applied to each univariate series independently; + - If the transformer was trained with a DataFrame, i.e. the + transformer is essentially k transformers, those transformers + will be applied to each univariate series respectively. Returns ------- @@ -41,8 +46,9 @@ def fit_transform(self, ts): ---------- ts: pandas.Series or pandas.DataFrame Time series to be used for training and be transformed. - If a DataFrame with k columns, k univariate transformers will be - applied to them independently. + If a DataFrame with k columns, it is treated as k independent + univariate time series, and k univariate transformers will be + trained and applied to each series independently. Returns ------- diff --git a/src/adtk/detector/detector_1d.py b/src/adtk/detector/detector_1d.py index 48151ab..d795d46 100644 --- a/src/adtk/detector/detector_1d.py +++ b/src/adtk/detector/detector_1d.py @@ -39,13 +39,6 @@ class CustomizedDetector1D(_Detector1D): """Detector derived from a user-given function and parameters. - This is an univariate detector. When it is applied to a multivariate time - series (i.e.
pandas DataFrame), it will be applied to every series - independently. All parameters can be defined as a dict object where key- - value pairs are series names (i.e. column names of DataFrame) and the - model parameter for that series. If not, then the same parameter will be - applied to all series. - Parameters ---------- detect_func: function @@ -133,13 +126,6 @@ class ThresholdAD(_Detector1D): This detector compares time series values with user-given thresholds, and identifies time points as anomalous when values are beyond the thresholds. - This is an univariate detector. When it is applied to a multivariate time - series (i.e. pandas DataFrame), it will be applied to every series - independently. All parameters can be defined as a dict object where key- - value pairs are series names (i.e. column names of DataFrame) and the - model parameter for that series. If not, then the same parameter will be - applied to all series. - Parameters ---------- low: float, optional @@ -178,13 +164,6 @@ class QuantileAD(_Detector1D): of historical data, and identifies time points as anomalous when values are beyond the thresholds. - This is an univariate detector. When it is applied to a multivariate time - series (i.e. pandas DataFrame), it will be applied to every series - independently. All parameters can be defined as a dict object where key- - value pairs are series names (i.e. column names of DataFrame) and the - model parameter for that series. If not, then the same parameter will be - applied to all series. - Parameters ---------- low: float, optional @@ -239,13 +218,6 @@ class InterQuartileRangeAD(_Detector1D): historical data, and identifies time points as anomalous when differences are beyond the inter-quartile range times a user-given factor c. - This is an univariate detector. When it is applied to a multivariate time - series (i.e. pandas DataFrame), it will be applied to every series - independently. 
All parameters can be defined as a dict object where key- - value pairs are series names (i.e. column names of DataFrame) and the - model parameter for that series. If not, then the same parameter will be - applied to all series. - Parameters ---------- c: float, or 2-tuple (float, float), optional @@ -317,13 +289,6 @@ class GeneralizedESDTestAD(_Detector1D): follow an approximately normal distribution. Please only use this detector when this assumption holds. - This is an univariate detector. When it is applied to a multivariate time - series (i.e. pandas DataFrame), it will be applied to every series - independently. All parameters can be defined as a dict object where key- - value pairs are series names (i.e. column names of DataFrame) and the - model parameter for that series. If not, then the same parameter will be - applied to all series. - [1] Rosner, Bernard (May 1983), Percentage Points for a Generalized ESD Many-Outlier Procedure,Technometrics, 25(2), pp. 165-172. @@ -412,13 +377,6 @@ class PersistAD(_Detector1D): This detector is internally implemented as a `Pipenet` object. Advanced users may learn more details by checking attribute `pipe_`. - This is an univariate detector. When it is applied to a multivariate time - series (i.e. pandas DataFrame), it will be applied to every series - independently. All parameters can be defined as a dict object where key- - value pairs are series names (i.e. column names of DataFrame) and the - model parameter for that series. If not, then the same parameter will be - applied to all series. - Parameters ---------- window: int, optional @@ -575,13 +533,6 @@ class LevelShiftAD(_Detector1D): This detector is internally implemented as a `Pipenet` object. Advanced users may learn more details by checking attribute `pipe_`. - This is an univariate detector. When it is applied to a multivariate time - series (i.e. pandas DataFrame), it will be applied to every series - independently. 
All parameters can be defined as a dict object where key- - value pairs are series names (i.e. column names of DataFrame) and the - model parameter for that series. If not, then the same parameter will be - applied to all series. - Parameters ---------- window: int, optional @@ -723,13 +674,6 @@ class VolatilityShiftAD(_Detector1D): This detector is internally implemented as a `Pipenet` object. Advanced users may learn more details by checking attribute `pipe_`. - This is an univariate detector. When it is applied to a multivariate time - series (i.e. pandas DataFrame), it will be applied to every series - independently. All parameters can be defined as a dict object where key- - value pairs are series names (i.e. column names of DataFrame) and the - model parameter for that series. If not, then the same parameter will be - applied to all series. - Parameters ---------- window: int, optional @@ -886,13 +830,6 @@ class AutoregressionAD(_Detector1D): This detector is internally implemented aattribute `pipe_`.nced users may learn more details by checking attribute `pipe_`. - This is an univariate detector. When it is applied to a multivariate time - series (i.e. pandas DataFrame), it will be applied to every series - independently. All parameters can be defined as a dict object where key- - value pairs are series names (i.e. column names of DataFrame) and the - model parameter for that series. If not, then the same parameter will be - applied to all series. - Parameters ---------- n_steps: int, optional @@ -1042,13 +979,6 @@ class SeasonalAD(_Detector1D): This detector is internally implemented aattribute `pipe_`.nced users may learn more details by checking attribute `pipe_`. - This is an univariate detector. When it is applied to a multivariate time - series (i.e. pandas DataFrame), it will be applied to every series - independently. All parameters can be defined as a dict object where key- - value pairs are series names (i.e. 
column names of DataFrame) and the - model parameter for that series. If not, then the same parameter will be - applied to all series. - Parameters ---------- freq: int, optional @@ -1084,12 +1014,7 @@ class SeasonalAD(_Detector1D): """ - _default_params = { - "freq": None, - "side": "both", - "c": 3.0, - "trend": False, - } + _default_params = {"freq": None, "side": "both", "c": 3.0, "trend": False} def __init__( self, diff --git a/src/adtk/transformer/transformer_1d.py b/src/adtk/transformer/transformer_1d.py index ab1500c..4679282 100644 --- a/src/adtk/transformer/transformer_1d.py +++ b/src/adtk/transformer/transformer_1d.py @@ -49,14 +49,6 @@ class CustomizedTransformer1D(_Transformer1D): fit_func_params: dict, optional Parameters of fit_func. Default: None. - - This is an univariate transformer. When it is applied to a multivariate - time series (i.e. pandas DataFrame), it will be applied to every series - independently. All parameters can be defined as a dict object where key- - value pairs are series names (i.e. column names of DataFrame) and the - model parameter for that series. If not, then the same parameter will be - applied to all series. - """ _need_fit = False @@ -125,10 +117,6 @@ class StandardScale(_Transformer1D): """Transformer that scales time series such that mean is equal to 0 and standard deviation is equal to 1. - This is an univariate transformer. When it is applied to a multivariate - time series (i.e. pandas DataFrame), it will be applied to every series - independently. - """ _need_fit = False @@ -153,13 +141,6 @@ class RollingAggregate(_Transformer1D): """Transformer that roll a sliding window along a time series, and aggregates using a user-selected operation. - This is an univariate transformer. When it is applied to a multivariate - time series (i.e. pandas DataFrame), it will be applied to every series - independently. All parameters can be defined as a dict object where key- - value pairs are series names (i.e. 
column names of DataFrame) and the - model parameter for that series. If not, then the same parameter will be - applied to all series. - Parameters ---------- agg: str or function @@ -372,11 +353,7 @@ def agg_wrapped(x): if isinstance(s_rolling, pd.Series): s_rolling.name = s.name - else: - if s.name is not None: - s_rolling.columns = [ - "{}_{}".format(s.name, col) for col in s_rolling.columns - ] + return s_rolling @@ -385,13 +362,6 @@ class DoubleRollingAggregate(_Transformer1D): series, aggregates using a user-given operation, and calcuates the difference of aggregated metrics between two sliding windows. - This is an univariate transformer. When it is applied to a multivariate - time series (i.e. pandas DataFrame), it will be applied to every series - independently. All parameters can be defined as a dict object where key- - value pairs are series names (i.e. column names of DataFrame) and the - model parameter for that series. If not, then the same parameter will be - applied to all series. - Parameters ---------- agg: str, function, or tuple @@ -698,14 +668,6 @@ class ClassicSeasonalDecomposition(_Transformer1D): seasonal_: pandas.Series Seasonal pattern extracted from training series. - - This is an univariate transformer. When it is applied to a multivariate - time series (i.e. pandas DataFrame), it will be applied to every series - independently. All parameters can be defined as a dict object where key- - value pairs are series names (i.e. column names of DataFrame) and the - model parameter for that series. If not, then the same parameter will be - applied to all series. - """ _default_params = {"freq": None, "trend": False} @@ -859,13 +821,6 @@ def _predict_core(self, s): def _identify_seasonal_period(s, low_autocorr=0.1, high_autocorr=0.3): """Identify seasonal period of a time series based on autocorrelation. - This is an univariate transformer. When it is applied to a multivariate - time series (i.e. 
pandas DataFrame), it will be applied to every series - independently. All parameters can be defined as a dict object where key- - value pairs are series names (i.e. column names of DataFrame) and the - model parameter for that series. If not, then the same parameter will be - applied to all series. - Parameters ---------- s: pandas Series or DataFrame @@ -925,13 +880,6 @@ class Retrospect(_Transformer1D): u_[t-5], and a series y_t are needed to learn the relationship between control and outcome. - This is an univariate transformer. When it is applied to a multivariate - time series (i.e. pandas DataFrame), it will be applied to every series - independently. All parameters can be defined as a dict object where key- - value pairs are series names (i.e. column names of DataFrame) and the - model parameter for that series. If not, then the same parameter will be - applied to all series. - Parameters ---------- n_steps: int, optional @@ -1003,11 +951,7 @@ def _predict_core(self, s): df = pd.DataFrame(index=s.index) df = df.assign( **{ - ( - "t-{}".format(i) - if s.name is None - else "{}_t-{}".format(s.name, i) - ): s.shift(i) + ("t-{}".format(i)): s.shift(i) for i in range(till, till + n_steps * step_size, step_size) } ) diff --git a/tests/test_detector1d.py b/tests/test_detector1d.py index 4cec3dc..386ab5f 100644 --- a/tests/test_detector1d.py +++ b/tests/test_detector1d.py @@ -382,6 +382,27 @@ def test_dataframe(testCase): pd.testing.assert_frame_equal(a, a_true, check_dtype=False) +@pytest.mark.parametrize("testCase", testCases) +def test_fit_series_predict_dataframe(testCase): + """Test fit the detector with a series and predict with dataframe.""" + s = pd.Series( + testCase["s"], + pd.date_range(start="2017-1-1", periods=len(testCase["s"]), freq="D"), + ) + df = pd.concat([s.rename("A"), s.rename("B")], axis=1) + model = testCase["model"](**testCase["params"]) + a_true = pd.Series(testCase["a"], index=s.index) + a_true = pd.concat([a_true.rename("A"), 
a_true.rename("B")], axis=1) + if testCase["pandas_bug"] and (parse(pd.__version__) < parse("0.25")): + with pytest.raises(PandasBugError): + model.fit(s) + a = model.detect(df) + else: + model.fit(s) + a = model.detect(df) + pd.testing.assert_frame_equal(a, a_true, check_dtype=False) + + def test_autoregressive_ad_dataframe(): """Make sure deepcopy works """ diff --git a/tests/test_series_name.py b/tests/test_series_name.py index a92843b..dde3546 100644 --- a/tests/test_series_name.py +++ b/tests/test_series_name.py @@ -11,6 +11,12 @@ from sklearn.cluster import KMeans from sklearn.linear_model import LinearRegression +# We have 4 types of models +# - one-to-one: input a univariate series, output a univariate series +# - one-to-many: input a univariate series, output a multivariate series +# - many-to-one: input a multivariate series, output a univariate series +# - many-to-many: input a multivariate series, output a multivariate series + one2one_models = [ detector.ThresholdAD(), detector.QuantileAD(), @@ -58,7 +64,11 @@ @pytest.mark.parametrize("model", one2one_models) -def test_one2one_s2s_wo_name(model): +def test_one2one_s2s_w_name(model): + """ + if a one-to-one model is applied to a Series, it should keep the Series + name unchanged + """ s_name = pd.Series( np.arange(100), index=pd.date_range(start="2017-1-1", periods=100, freq="D"), @@ -69,7 +79,11 @@ def test_one2one_s2s_wo_name(model): @pytest.mark.parametrize("model", one2one_models) -def test_one2one_s2s_w_name(model): +def test_one2one_s2s_wo_name(model): + """ + if a one-to-one model is applied to a Series, it should keep the Series + name unchanged + """ s_no_name = pd.Series( np.arange(100), index=pd.date_range(start="2017-1-1", periods=100, freq="D"), @@ -80,6 +94,10 @@ def test_one2one_s2s_w_name(model): @pytest.mark.parametrize("model", one2one_models) def test_one2one_df2df(model): + """ + if a one-to-one model is applied to a DataFrame, it should keep the column + names unchanged + """ df = 
pd.DataFrame( np.arange(300).reshape(100, 3), index=pd.date_range(start="2017-1-1", periods=100, freq="D"), @@ -91,6 +109,10 @@ def test_one2one_df2df(model): @pytest.mark.parametrize("model", one2one_models) def test_one2one_df2list(model): + """ + if a one-to-one model (detector) is applied to a DataFrame and returns a + dict, the output dict keys should match the input column names + """ if hasattr(model, "fit_detect"): df = pd.DataFrame( np.arange(300).reshape(100, 3), @@ -106,18 +128,25 @@ def test_one2one_df2list(model): @pytest.mark.parametrize("model", one2many_models) def test_one2many_s2df_w_name(model): + """ + if a one-to-many model is applied to a Series, the output should not have + prefix in column names, no matter whether the input Series has a name. + """ s_name = pd.Series( np.arange(100), index=pd.date_range(start="2017-1-1", periods=100, freq="D"), name="A", ) result = model.fit_predict(s_name) - assert all([col[:2] == "A_" for col in result.columns]) - assert all([col[2:4] != "A_" for col in result.columns]) + assert all([col[:2] != "A_" for col in result.columns]) @pytest.mark.parametrize("model", one2many_models) def test_one2many_s2df_wo_name(model): + """ + if a one-to-many model is applied to a Series, the output should not have + prefix in column names, no matter whether the input Series has a name. + """ s_no_name = pd.Series( np.arange(100), index=pd.date_range(start="2017-1-1", periods=100, freq="D"), @@ -128,6 +157,10 @@ def test_one2many_s2df_wo_name(model): @pytest.mark.parametrize("model", one2many_models) def test_one2many_df2df(model): + """ + if a one-to-many model is applied to a DataFrame, the output should have + prefix in column names to indicate the input columns they correspond. 
+ """ df = pd.DataFrame( np.arange(300).reshape(100, 3), index=pd.date_range(start="2017-1-1", periods=100, freq="D"), @@ -149,6 +182,9 @@ def test_one2many_df2df(model): @pytest.mark.parametrize("model", many2one_models) def test_many2one(model): + """ + The output Series from a many-to-one model should NOT have name + """ df = pd.DataFrame( np.arange(300).reshape(100, 3), index=pd.date_range(start="2017-1-1", periods=100, freq="D"),