ENH: Improve histogram bins="auto" for data with little variance (num…

…py#10739) Now falls back on sturges estimator when the IQR is zero
pchanial · Apr 10, 2018 · 918a167 · 918a167
1 parent ab4e4c9
commit 918a167
Show file tree

Hide file tree

Showing 3 changed files with 45 additions and 9 deletions.
diff --git a/doc/release/1.15.0-notes.rst b/doc/release/1.15.0-notes.rst
@@ -146,6 +146,11 @@ as usual with `errstate`.
 Dates, times, and timedeltas can now be histogrammed. The bin edges must be
 passed explicitly, and are not yet computed automatically.
 
+``histogram`` "auto" estimator handles limited variance better
+------------------------------------------------------------------------
+No longer does an IQR of 0 result in `n_bins=1`, rather the number of bins
+chosen is related to the data size in this situation
+
 ``histogramdd`` allows explicit ranges to be given in a subset of axes
 ----------------------------------------------------------------------
 The ``range`` argument of `histogramdd` can now contain ``None`` values to

diff --git a/numpy/lib/histograms.py b/numpy/lib/histograms.py
@@ -167,12 +167,22 @@ def _hist_bin_fd(x):
 def _hist_bin_auto(x):
     """
     Histogram bin estimator that uses the minimum width of the
-    Freedman-Diaconis and Sturges estimators.
+    Freedman-Diaconis and Sturges estimators if the FD bandwidth is non zero
+    and the Sturges estimator if the FD bandwidth is 0.
 
     The FD estimator is usually the most robust method, but its width
-    estimate tends to be too large for small `x`. The Sturges estimator
-    is quite good for small (<1000) datasets and is the default in the R
-    language. This method gives good off the shelf behaviour.
+    estimate tends to be too large for small `x` and bad for data with limited
+    variance. The Sturges estimator is quite good for small (<1000) datasets
+    and is the default in the R language. This method gives good off the shelf
+    behaviour.
+
+    .. versionchanged:: 1.15.0
+    If there is limited variance the IQR can be 0, which results in the
+    FD bin width being 0 too. This is not a valid bin width, so
+    ``np.histogram_bin_edges`` chooses 1 bin instead, which may not be optimal.
+    If the IQR is 0, it's unlikely any variance based estimators will be of
+    use, so we revert to the sturges estimator, which only uses the size of the
+    dataset in its calculation.
 
     Parameters
     ----------
@@ -188,10 +198,13 @@ def _hist_bin_auto(x):
     --------
     _hist_bin_fd, _hist_bin_sturges
     """
-    # There is no need to check for zero here. If ptp is, so is IQR and
-    # vice versa. Either both are zero or neither one is.
-    return min(_hist_bin_fd(x), _hist_bin_sturges(x))
-
+    fd_bw = _hist_bin_fd(x)
+    sturges_bw = _hist_bin_sturges(x)
+    if fd_bw:
+        return min(fd_bw, sturges_bw)
+    else:
+        # limited variance, so we return a len dependent bw estimator
+        return sturges_bw
 
 # Private dict initialized at module load time
 _hist_bin_selectors = {'auto': _hist_bin_auto,
@@ -440,7 +453,7 @@ def histogram_bin_edges(a, bins=10, range=None, weights=None):
     below, :math:`h` is the binwidth and :math:`n_h` is the number of
     bins. All estimators that compute bin counts are recast to bin width
     using the `ptp` of the data. The final bin count is obtained from
-    ``np.round(np.ceil(range / h))`.
+    ``np.round(np.ceil(range / h))``.
 
     'Auto' (maximum of the 'Sturges' and 'FD' estimators)
         A compromise to get a good value. For small datasets the Sturges

diff --git a/numpy/lib/tests/test_histograms.py b/numpy/lib/tests/test_histograms.py
@@ -443,6 +443,24 @@ def test_novariance(self):
             assert_equal(len(a), numbins, err_msg="{0} estimator, "
                          "No Variance test".format(estimator))
 
+    def test_limited_variance(self):
+        """
+        Check when IQR is 0, but variance exists, we return the sturges value
+        and not the fd value.
+        """
+        lim_var_data = np.ones(1000)
+        lim_var_data[:3] = 0
+        lim_var_data[-4:] = 100
+
+        edges_auto = histogram_bin_edges(lim_var_data, 'auto')
+        assert_equal(edges_auto, np.linspace(0, 100, 12))
+
+        edges_fd = histogram_bin_edges(lim_var_data, 'fd')
+        assert_equal(edges_fd, np.array([0, 100]))
+
+        edges_sturges = histogram_bin_edges(lim_var_data, 'sturges')
+        assert_equal(edges_sturges, np.linspace(0, 100, 12))
+
     def test_outlier(self):
         """
         Check the FD, Scott and Doane with outliers.