Skip to content

Commit

Permalink
BUG: lib: Fix histogram problem with signed integer arrays.
Browse files Browse the repository at this point in the history
An input such as

    np.histogram(np.array([-2, 0, 127], dtype=np.int8), bins="auto")

would raise the exception

    ValueError: Number of samples, -1, must be non-negative.

The problem was that the peak-to-peak value for the input array was
computed with the `ptp` method, which returned a negative value for a
signed integer array whenever the true range was larger than the maximum
value of the array's data type (in the example above, 127 - (-2) = 129
does not fit in int8).

The fix is to use a peak-to-peak function that returns an
unsigned value for signed integer arrays.

Closes gh-14379.
  • Loading branch information
WarrenWeckesser committed Oct 15, 2019
1 parent dc20ec8 commit 3ff4924
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 5 deletions.
20 changes: 15 additions & 5 deletions numpy/lib/histograms.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,16 @@
_range = range


def _ptp(x):
    """Return the peak-to-peak (maximum minus minimum) value of `x`.

    The difference is formed with `_unsigned_subtract` instead of the
    array's own ``ptp`` method, so that a signed integer array whose range
    cannot be represented in its own data type does not produce a negative
    result.  For signed integer arrays the returned value is unsigned.
    """
    # Evaluate the maximum first, then the minimum, as the one-line form did.
    largest = x.max()
    smallest = x.min()
    return _unsigned_subtract(largest, smallest)


def _hist_bin_sqrt(x, range):
"""
Square root histogram bin estimator.
Expand All @@ -40,7 +50,7 @@ def _hist_bin_sqrt(x, range):
h : An estimate of the optimal bin width for the given data.
"""
del range # unused
return x.ptp() / np.sqrt(x.size)
return _ptp(x) / np.sqrt(x.size)


def _hist_bin_sturges(x, range):
Expand All @@ -63,7 +73,7 @@ def _hist_bin_sturges(x, range):
h : An estimate of the optimal bin width for the given data.
"""
del range # unused
return x.ptp() / (np.log2(x.size) + 1.0)
return _ptp(x) / (np.log2(x.size) + 1.0)


def _hist_bin_rice(x, range):
Expand All @@ -87,7 +97,7 @@ def _hist_bin_rice(x, range):
h : An estimate of the optimal bin width for the given data.
"""
del range # unused
return x.ptp() / (2.0 * x.size ** (1.0 / 3))
return _ptp(x) / (2.0 * x.size ** (1.0 / 3))


def _hist_bin_scott(x, range):
Expand Down Expand Up @@ -137,7 +147,7 @@ def _hist_bin_stone(x, range):
"""

n = x.size
ptp_x = np.ptp(x)
ptp_x = _ptp(x)
if n <= 1 or ptp_x == 0:
return 0

Expand Down Expand Up @@ -184,7 +194,7 @@ def _hist_bin_doane(x, range):
np.true_divide(temp, sigma, temp)
np.power(temp, 3, temp)
g1 = np.mean(temp)
return x.ptp() / (1.0 + np.log2(x.size) +
return _ptp(x) / (1.0 + np.log2(x.size) +
np.log2(1.0 + np.absolute(g1) / sg1))
return 0.0

Expand Down
11 changes: 11 additions & 0 deletions numpy/lib/tests/test_histograms.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
assert_array_almost_equal, assert_raises, assert_allclose,
assert_array_max_ulp, assert_raises_regex, suppress_warnings,
)
import pytest


class TestHistogram(object):
Expand Down Expand Up @@ -591,6 +592,16 @@ def test_simple_range(self):
msg += " with datasize of {0}".format(testlen)
assert_equal(len(a), numbins, err_msg=msg)

@pytest.mark.parametrize("bins", ['auto', 'fd', 'doane', 'scott',
                                  'stone', 'rice', 'sturges'])
def test_signed_integer_data(self, bins):
    # Regression test for gh-14379: an int8 array whose range (129) does
    # not fit in int8 must be binned exactly like the same values stored
    # in a wider integer type.
    narrow = np.array([-2, 0, 127], dtype=np.int8)
    wide = narrow.astype(np.int32)

    counts_narrow, edges_narrow = np.histogram(narrow, bins=bins)
    counts_wide, edges_wide = np.histogram(wide, bins=bins)

    assert_array_equal(counts_narrow, counts_wide)
    assert_array_equal(edges_narrow, edges_wide)

def test_simple_weighted(self):
"""
Check that weighted data raises a TypeError
Expand Down

0 comments on commit 3ff4924

Please sign in to comment.