Merge branch 'master' of https://github.com/blue-yonder/tsfresh

xiehaizheng · Sep 19, 2018 · 52e50bd · 52e50bd
2 parents fd4fea2 + a53fb6a
commit 52e50bd
Show file tree

Hide file tree

Showing 9 changed files with 55 additions and 105 deletions.
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -10,6 +10,8 @@ Unreleased
 - change chunking in energy_ratio_by_chunks to use all data points
 - fix warning for spkt_welch_density
 - adapt default settings for "value_count" and "range_count"
+- added
+    - maxlag parameter to agg_autocorrelation function
 
 Version 0.11.1
 ==============

diff --git a/docs/api/tests.integrations.rst b/docs/api/tests.integrations.rst
diff --git a/docs/api/tests.rst b/docs/api/tests.rst
diff --git a/docs/api/tests.units.rst b/docs/api/tests.units.rst
diff --git a/docs/conf.py b/docs/conf.py
@@ -87,7 +87,7 @@
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
-exclude_patterns = ['_build']
+exclude_patterns = ['_build', 'api/tests*']
 
 # The reST default role (used for this markup: `text`) to use for all documents.
 # default_role = None

diff --git a/rdocs-requirements.txt b/rdocs-requirements.txt
@@ -1,3 +1,3 @@
-Sphinx>=1.6.4
+Sphinx==1.6.4
 sphinx_rtd_theme>=0.2.4
 -r requirements.txt
diff --git a/tests/units/feature_extraction/test_feature_calculations.py b/tests/units/feature_extraction/test_feature_calculations.py
@@ -129,36 +129,51 @@ def test_sum(self):
         self.assertEqualOnAllArrayTypes(sum_values, [-1.2, -2, -3, -4], -10.2)
         self.assertEqualOnAllArrayTypes(sum_values, [], 0)
 
-    def test_agg_autocorrelation(self):
+    def test_agg_autocorrelation_returns_correct_values(self):
 
-        param = [{"f_agg": "mean"}]
+        param = [{"f_agg": "mean", "maxlag": 10}]
         x = [1, 1, 1, 1, 1, 1, 1]
         expected_res = 0
-        res = dict(agg_autocorrelation(x, param=param))["f_agg_\"mean\""]
+        res = dict(agg_autocorrelation(x, param=param))["f_agg_\"mean\"__maxlag_10"]
         self.assertAlmostEqual(res, expected_res, places=4)
 
         x = [1, 2, -3]
         expected_res = 1 / np.var(x) * (((1 * 2 + 2 * (-3)) / 2 + (1 * -3)) / 2)
-        res = dict(agg_autocorrelation(x, param=param))["f_agg_\"mean\""]
+        res = dict(agg_autocorrelation(x, param=param))["f_agg_\"mean\"__maxlag_10"]
         self.assertAlmostEqual(res, expected_res, places=4)
 
         np.random.seed(42)
         x = np.random.normal(size=3000)
         expected_res = 0
-        res = dict(agg_autocorrelation(x, param=param))["f_agg_\"mean\""]
+        res = dict(agg_autocorrelation(x, param=param))["f_agg_\"mean\"__maxlag_10"]
         self.assertAlmostEqual(res, expected_res, places=2)
 
-        param=[{"f_agg": "median"}]
+        param = [{"f_agg": "median", "maxlag": 10}]
         x = [1, 1, 1, 1, 1, 1, 1]
         expected_res = 0
-        res = dict(agg_autocorrelation(x, param=param))["f_agg_\"median\""]
+        res = dict(agg_autocorrelation(x, param=param))["f_agg_\"median\"__maxlag_10"]
         self.assertAlmostEqual(res, expected_res, places=4)
 
         x = [1, 2, -3]
         expected_res = 1 / np.var(x) * (((1 * 2 + 2 * (-3)) / 2 + (1 * -3)) / 2)
-        res = dict(agg_autocorrelation(x, param=param))["f_agg_\"median\""]
+        res = dict(agg_autocorrelation(x, param=param))["f_agg_\"median\"__maxlag_10"]
         self.assertAlmostEqual(res, expected_res, places=4)
 
+    def test_agg_autocorrelation_returns_max_lag_does_not_affect_other_results(self):
+
+        param = [{"f_agg": "mean", "maxlag": 1},
+                 {"f_agg": "mean", "maxlag": 10}]
+        x = range(10)
+        res1 = dict(agg_autocorrelation(x, param=param))["f_agg_\"mean\"__maxlag_1"]
+        res10 = dict(agg_autocorrelation(x, param=param))["f_agg_\"mean\"__maxlag_10"]
+        self.assertAlmostEqual(res1, 0.77777777, places=4)
+        self.assertAlmostEqual(res10, -0.64983164983165, places=4)
+
+        param = [{"f_agg": "mean", "maxlag": 1}]
+        x = range(10)
+        res1 = dict(agg_autocorrelation(x, param=param))["f_agg_\"mean\"__maxlag_1"]
+        self.assertAlmostEqual(res1, 0.77777777, places=4)
+
     def test_partial_autocorrelation(self):
 
         # Test for altering time series

diff --git a/tsfresh/feature_extraction/feature_calculators.py b/tsfresh/feature_extraction/feature_calculators.py
@@ -276,33 +276,48 @@ def sum_values(x):
 
 @set_property("fctype", "combiner")
 def agg_autocorrelation(x, param):
-    """
-    Calculates the value of an aggregation function f_agg (e.g. var or mean) of the autocorrelation
-    (Compare to http://en.wikipedia.org/wiki/Autocorrelation#Estimation), taken over different all possible lags
-    (1 to length of x)
+    r"""
+    Calculates the value of an aggregation function :math:`f_{agg}` (e.g. the variance or the mean) over the
+    autocorrelation :math:`R(l)` for different lags. The autocorrelation :math:`R(l)` for lag :math:`l` is defined as
 
     .. math::
 
-        \\frac{1}{n-1} \\sum_{l=1,\ldots, n} \\frac{1}{(n-l)\sigma^{2}} \\sum_{t=1}^{n-l}(X_{t}-\\mu )(X_{t+l}-\\mu)
+        R(l) = \frac{1}{(n-l)\sigma^{2}} \sum_{t=1}^{n-l}(X_{t}-\mu )(X_{t+l}-\mu)
 
-    where :math:`n` is the length of the time series :math:`X_i`, :math:`\sigma^2` its variance and :math:`\mu` its
-    mean.
+    where :math:`X_i` are the values of the time series, :math:`n` its length. Finally, :math:`\sigma^2` and
+    :math:`\mu` are estimators for its variance and mean
+    (See `Estimation of the Autocorrelation function <http://en.wikipedia.org/wiki/Autocorrelation#Estimation>`_).
+
+    The :math:`R(l)` for different lags :math:`l` form a vector. This feature calculator applies the aggregation
+    function :math:`f_{agg}` to this vector and returns
+
+    .. math::
+
+        f_{agg} \left( R(1), \ldots, R(m)\right) \quad \text{for} \quad m = \min(n - 1, maxlag).
+
+    Here :math:`maxlag` is the value of the ``"maxlag"`` key in each dictionary of ``param``.
 
     :param x: the time series to calculate the feature of
     :type x: pandas.Series
-    :param param: contains dictionaries {"attr": x} with x str, name of a numpy function (e.g. mean, var, std, median),
-                   the name of the aggregator function that is applied to the autocorrelations
+    :param param: contains dictionaries {"f_agg": x, "maxlag": n} with x str, the name of a numpy function
+                  (e.g. mean, var, std, median), i.e. the name of the aggregator function that is applied to the
+                  autocorrelations. Further, n is an int, the maximal number of lags to consider.
     :type param: list
     :return: the value of this feature
     :return type: float
     """
+    # if the time series is longer than the following threshold, we use fft to calculate the acf
+    THRESHOLD_TO_USE_FFT = 1250
     var = np.var(x)
     n = len(x)
+    max_maxlag = max([config["maxlag"] for config in param])
+
     if np.abs(var) < 10**-10 or n == 1:
-        a = 0
+        a = [0] * len(x)
     else:
-        a = acf(x, unbiased=True, fft=n > 1250)[1:]
-    return [("f_agg_\"{}\"".format(config["f_agg"]), getattr(np, config["f_agg"])(a)) for config in param]
+        a = acf(x, unbiased=True, fft=n > THRESHOLD_TO_USE_FFT, nlags=max_maxlag)[1:]
+    return [("f_agg_\"{}\"__maxlag_{}".format(config["f_agg"], config["maxlag"]),
+             getattr(np, config["f_agg"])(a[:int(config["maxlag"])])) for config in param]
 
 
 @set_property("fctype", "combiner")

diff --git a/tsfresh/feature_extraction/settings.py b/tsfresh/feature_extraction/settings.py
@@ -114,7 +114,7 @@ def __init__(self):
             "large_standard_deviation": [{"r": r * 0.05} for r in range(1, 20)],
             "quantile": [{"q": q} for q in [.1, .2, .3, .4, .6, .7, .8, .9]],
             "autocorrelation": [{"lag": lag} for lag in range(10)],
-            "agg_autocorrelation": [{"f_agg": s} for s in ["mean", "median", "var"]],
+            "agg_autocorrelation": [{"f_agg": s, "maxlag": 40} for s in ["mean", "median", "var"]],
             "partial_autocorrelation": [{"lag": lag} for lag in range(10)],
             "number_cwt_peaks": [{"n": n} for n in [1, 5]],
             "number_peaks": [{"n": n} for n in [1, 3, 5, 10, 50]],