From f3f91be7920f52cd6abace64d7e14b876e5f054f Mon Sep 17 00:00:00 2001 From: Marten van Kerkwijk Date: Wed, 8 Feb 2017 22:17:49 -0500 Subject: [PATCH 1/2] BUG: MaskedArray __eq__ wrong for masked scalar, multi-d recarray In the process of trying to fix the "questionable behaviour in `MaskedArray.__eq__`" (gh-8589), it became clear that the code was buggy. E.g., `ma == ma[0]` failed if `ma` held a structured dtype; multi-d structured dtypes failed generally; and, more worryingly, a masked scalar comparison could be wrong: `np.ma.MaskedArray(1, mask=True) == 0` yields True. This commit solves these problems, adding tests to prevent regression. In the process, it also ensures that the results for structured arrays always equals what one would get by logically combining the results over individual parts of the structure. --- doc/release/1.13.0-notes.rst | 9 +++ numpy/ma/core.py | 139 ++++++++++++++++++----------------- numpy/ma/tests/test_core.py | 89 ++++++++++++++++++++-- 3 files changed, 164 insertions(+), 73 deletions(-) diff --git a/doc/release/1.13.0-notes.rst b/doc/release/1.13.0-notes.rst index 049653ea4b1e..49fd6735dda3 100644 --- a/doc/release/1.13.0-notes.rst +++ b/doc/release/1.13.0-notes.rst @@ -179,6 +179,15 @@ Better default repr for ``ndarray`` subclasses Subclasses of ndarray with no ``repr`` specialization now correctly indent their data and type lines. +More reliable comparisons of masked arrays +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Comparisons of masked arrays were buggy for masked scalars and failed for +structured arrays with dimension higher than one. Both problems are now +solved. In the process, it was ensured that in getting the result for a +structured array, masked fields are properly ignored, i.e., the result is equal +if all fields that are non-masked in both are equal, thus making the behaviour +identical to what one gets by comparing an unstructured masked array and then +doing ``.all()`` over some axis. Changes ======= diff --git a/numpy/ma/core.py b/numpy/ma/core.py index 3b2b39b18511..35d4b72bc469 100644 --- a/numpy/ma/core.py +++ b/numpy/ma/core.py @@ -23,6 +23,7 @@ from __future__ import division, absolute_import, print_function import sys +import operator import warnings from functools import reduce @@ -1733,7 +1734,8 @@ def _recursive_mask_or(m1, m2, newmask): if (dtype1 != dtype2): raise ValueError("Incompatible dtypes '%s'<>'%s'" % (dtype1, dtype2)) if dtype1.names: - newmask = np.empty_like(m1) + # Allocate an output mask array with the properly broadcast shape. + newmask = np.empty(np.broadcast(m1, m2).shape, dtype1) _recursive_mask_or(m1, m2, newmask) return newmask return make_mask(umath.logical_or(m1, m2), copy=copy, shrink=shrink) @@ -3873,81 +3875,84 @@ def _delegate_binop(self, other): return True return False - def __eq__(self, other): - """ - Check whether other equals self elementwise. + def _comparison(self, other, compare): + """Compare self with other using operator.eq or operator.ne. + When either of the elements is masked, the result is masked as well, + but the underlying boolean data are still set, with self and other + considered equal if both are masked, and unequal otherwise. + + For structured arrays, all fields are combined, with masked values + ignored. The result is masked if all fields were masked, with self + and other considered equal only if both were fully masked. """ - if self is masked: - return masked omask = getmask(other) - if omask is nomask: - check = self.filled(0).__eq__(other) - try: - check = check.view(type(self)) - check._mask = self._mask - except AttributeError: - # Dang, we have a bool instead of an array: return the bool - return check + smask = self.mask + mask = mask_or(smask, omask, copy=True) + + odata = getdata(other) + if mask.dtype.names: + # For possibly masked structured arrays we need to be careful, + # since the standard structured array comparison will use all + # fields, masked or not. To avoid masked fields influencing the + # outcome, we set all masked fields in self to other, so they'll + # count as equal. To prepare, we ensure we have the right shape. + broadcast_shape = np.broadcast(self, odata).shape + sbroadcast = np.broadcast_to(self, broadcast_shape, subok=True) + sbroadcast._mask = mask + sdata = sbroadcast.filled(odata) + # Now take care of the mask; the merged mask should have an item + # masked if all fields were masked (in one and/or other). + mask = (mask == np.ones((), mask.dtype)) + else: - odata = filled(other, 0) - check = self.filled(0).__eq__(odata).view(type(self)) - if self._mask is nomask: - check._mask = omask - else: - mask = mask_or(self._mask, omask) - if mask.dtype.names: - if mask.size > 1: - axis = 1 - else: - axis = None - try: - mask = mask.view((bool_, len(self.dtype))).all(axis) - except (ValueError, np.AxisError): - # TODO: what error are we trying to catch here? - # invalid axis, or invalid view? - mask = np.all([[f[n].all() for n in mask.dtype.names] - for f in mask], axis=axis) - check._mask = mask + # For regular arrays, just use the data as they come. + sdata = self.data + + check = compare(sdata, odata) + + if isinstance(check, (np.bool_, bool)): + return masked if mask else check + + if mask is not nomask: + # Adjust elements that were masked, which should be treated + # as equal if masked in both, unequal if masked in one. + # Note that this works automatically for structured arrays too. + check = np.where(mask, compare(smask, omask), check) + if mask.shape != check.shape: + # Guarantee consistency of the shape, making a copy since the + # the mask may need to get written to later. + mask = np.broadcast_to(mask, check.shape).copy() + + check = check.view(type(self)) + check._mask = mask return check - def __ne__(self, other): + def __eq__(self, other): + """Check whether other equals self elementwise. + + When either of the elements is masked, the result is masked as well, + but the underlying boolean data are still set, with self and other + considered equal if both are masked, and unequal otherwise. + + For structured arrays, all fields are combined, with masked values + ignored. The result is masked if all fields were masked, with self + and other considered equal only if both were fully masked. """ - Check whether other doesn't equal self elementwise + return self._comparison(other, operator.eq) + + def __ne__(self, other): + """Check whether other does not equal self elementwise. + + When either of the elements is masked, the result is masked as well, + but the underlying boolean data are still set, with self and other + considered equal if both are masked, and unequal otherwise. + For structured arrays, all fields are combined, with masked values + ignored. The result is masked if all fields were masked, with self + and other considered equal only if both were fully masked. """ - if self is masked: - return masked - omask = getmask(other) - if omask is nomask: - check = self.filled(0).__ne__(other) - try: - check = check.view(type(self)) - check._mask = self._mask - except AttributeError: - # In case check is a boolean (or a numpy.bool) - return check - else: - odata = filled(other, 0) - check = self.filled(0).__ne__(odata).view(type(self)) - if self._mask is nomask: - check._mask = omask - else: - mask = mask_or(self._mask, omask) - if mask.dtype.names: - if mask.size > 1: - axis = 1 - else: - axis = None - try: - mask = mask.view((bool_, len(self.dtype))).all(axis) - except (ValueError, np.AxisError): - # TODO: what error are we trying to catch here? - # invalid axis, or invalid view? - mask = np.all([[f[n].all() for n in mask.dtype.names] - for f in mask], axis=axis) - check._mask = mask - return check + return self._comparison(other, operator.ne) def __add__(self, other): """ diff --git a/numpy/ma/tests/test_core.py b/numpy/ma/tests/test_core.py index f9d032f097c4..d64f1acdc0d4 100644 --- a/numpy/ma/tests/test_core.py +++ b/numpy/ma/tests/test_core.py @@ -1335,32 +1335,95 @@ def test_eq_on_structured(self): ndtype = [('A', int), ('B', int)] a = array([(1, 1), (2, 2)], mask=[(0, 1), (0, 0)], dtype=ndtype) test = (a == a) - assert_equal(test, [True, True]) + assert_equal(test.data, [True, True]) + assert_equal(test.mask, [False, False]) + test = (a == a[0]) + assert_equal(test.data, [True, False]) assert_equal(test.mask, [False, False]) b = array([(1, 1), (2, 2)], mask=[(1, 0), (0, 0)], dtype=ndtype) test = (a == b) - assert_equal(test, [False, True]) + assert_equal(test.data, [False, True]) + assert_equal(test.mask, [True, False]) + test = (a[0] == b) + assert_equal(test.data, [False, False]) assert_equal(test.mask, [True, False]) b = array([(1, 1), (2, 2)], mask=[(0, 1), (1, 0)], dtype=ndtype) test = (a == b) - assert_equal(test, [True, False]) + assert_equal(test.data, [True, True]) assert_equal(test.mask, [False, False]) + # complicated dtype, 2-dimensional array. + ndtype = [('A', int), ('B', [('BA', int), ('BB', int)])] + a = array([[(1, (1, 1)), (2, (2, 2))], + [(3, (3, 3)), (4, (4, 4))]], + mask=[[(0, (1, 0)), (0, (0, 1))], + [(1, (0, 0)), (1, (1, 1))]], dtype=ndtype) + test = (a[0, 0] == a) + assert_equal(test.data, [[True, False], [False, False]]) + assert_equal(test.mask, [[False, False], [False, True]]) def test_ne_on_structured(self): # Test the equality of structured arrays ndtype = [('A', int), ('B', int)] a = array([(1, 1), (2, 2)], mask=[(0, 1), (0, 0)], dtype=ndtype) test = (a != a) - assert_equal(test, [False, False]) + assert_equal(test.data, [False, False]) + assert_equal(test.mask, [False, False]) + test = (a != a[0]) + assert_equal(test.data, [False, True]) assert_equal(test.mask, [False, False]) b = array([(1, 1), (2, 2)], mask=[(1, 0), (0, 0)], dtype=ndtype) test = (a != b) - assert_equal(test, [True, False]) + assert_equal(test.data, [True, False]) + assert_equal(test.mask, [True, False]) + test = (a[0] != b) + assert_equal(test.data, [True, True]) assert_equal(test.mask, [True, False]) b = array([(1, 1), (2, 2)], mask=[(0, 1), (1, 0)], dtype=ndtype) test = (a != b) - assert_equal(test, [False, True]) + assert_equal(test.data, [False, False]) assert_equal(test.mask, [False, False]) + # complicated dtype, 2-dimensional array. + ndtype = [('A', int), ('B', [('BA', int), ('BB', int)])] + a = array([[(1, (1, 1)), (2, (2, 2))], + [(3, (3, 3)), (4, (4, 4))]], + mask=[[(0, (1, 0)), (0, (0, 1))], + [(1, (0, 0)), (1, (1, 1))]], dtype=ndtype) + test = (a[0, 0] != a) + assert_equal(test.data, [[False, True], [True, True]]) + assert_equal(test.mask, [[False, False], [False, True]]) + + def test_eq_ne_structured_extra(self): + # ensure simple examples are symmetric and make sense. + # from https://github.com/numpy/numpy/pull/8590#discussion_r101126465 + dt = np.dtype('i4,i4') + for m1 in (mvoid((1, 2), mask=(0, 0), dtype=dt), + mvoid((1, 2), mask=(0, 1), dtype=dt), + mvoid((1, 2), mask=(1, 0), dtype=dt), + mvoid((1, 2), mask=(1, 1), dtype=dt)): + ma1 = m1.view(MaskedArray) + r1 = ma1.view('2i4') + for m2 in (mvoid((1, 1), dtype=dt), + mvoid((1, 0), mask=(0, 1), dtype=dt), + mvoid((3, 2), mask=(0, 1), dtype=dt)): + ma2 = m2.view(MaskedArray) + r2 = ma2.view('2i4') + eq_expected = (r1 == r2).all() + assert_equal(m1 == m2, eq_expected) + assert_equal(m2 == m1, eq_expected) + assert_equal(ma1 == m2, eq_expected) + assert_equal(m1 == ma2, eq_expected) + assert_equal(ma1 == ma2, eq_expected) + # Also check it is the same if we do it element by element. + el_by_el = [m1[name] == m2[name] for name in dt.names] + assert_equal(array(el_by_el, dtype=bool).all(), eq_expected) + ne_expected = (r1 != r2).any() + assert_equal(m1 != m2, ne_expected) + assert_equal(m2 != m1, ne_expected) + assert_equal(ma1 != m2, ne_expected) + assert_equal(m1 != ma2, ne_expected) + assert_equal(ma1 != ma2, ne_expected) + el_by_el = [m1[name] != m2[name] for name in dt.names] + assert_equal(array(el_by_el, dtype=bool).any(), ne_expected) def test_eq_with_None(self): # Really, comparisons with None should not be done, but check them @@ -1393,6 +1456,20 @@ def test_eq_with_scalar(self): assert_equal(a == 0, False) assert_equal(a != 1, False) assert_equal(a != 0, True) + b = array(1, mask=True) + assert_equal(b == 0, masked) + assert_equal(b == 1, masked) + assert_equal(b != 0, masked) + assert_equal(b != 1, masked) + + def test_eq_different_dimensions(self): + m1 = array([[0, 1], [1, 2]]) + m2 = array([1, 1], mask=[0, 1]) + test = (m1 == m2) + assert_equal(test, [[False, False], + [True, False]]) + assert_equal(test.mask, [[False, True], + [False, True]]) def test_numpyarithmetics(self): # Check that the mask is not back-propagated when using numpy functions From 3435dd9881d84ba2b91faa478555a3c58e7921af Mon Sep 17 00:00:00 2001 From: Marten van Kerkwijk Date: Tue, 21 Feb 2017 09:50:42 -0500 Subject: [PATCH 2/2] BUG: ensure masked array comparison with regular void works. --- numpy/ma/core.py | 20 +++++--------------- numpy/ma/tests/test_core.py | 30 ++++++++++++++++++++---------- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/numpy/ma/core.py b/numpy/ma/core.py index 35d4b72bc469..af8a523ab80c 100644 --- a/numpy/ma/core.py +++ b/numpy/ma/core.py @@ -1603,21 +1603,11 @@ def make_mask(m, copy=False, shrink=True, dtype=MaskType): """ if m is nomask: return nomask - elif isinstance(m, ndarray): - # We won't return after this point to make sure we can shrink the mask - # Fill the mask in case there are missing data - m = filled(m, True) - # Make sure the input dtype is valid - dtype = make_mask_descr(dtype) - if m.dtype == dtype: - if copy: - result = m.copy() - else: - result = m - else: - result = np.array(m, dtype=dtype, copy=copy) - else: - result = np.array(filled(m, True), dtype=MaskType) + + # Make sure the input dtype is valid. + dtype = make_mask_descr(dtype) + # Fill the mask in case there are missing data; turn it into an ndarray. + result = np.array(filled(m, True), copy=copy, dtype=dtype, subok=True) # Bas les masques ! if shrink and (not result.dtype.names) and (not result.any()): return nomask diff --git a/numpy/ma/tests/test_core.py b/numpy/ma/tests/test_core.py index d64f1acdc0d4..ca1ef16c4f0d 100644 --- a/numpy/ma/tests/test_core.py +++ b/numpy/ma/tests/test_core.py @@ -1402,7 +1402,8 @@ def test_eq_ne_structured_extra(self): mvoid((1, 2), mask=(1, 1), dtype=dt)): ma1 = m1.view(MaskedArray) r1 = ma1.view('2i4') - for m2 in (mvoid((1, 1), dtype=dt), + for m2 in (np.array((1, 1), dtype=dt), + mvoid((1, 1), dtype=dt), mvoid((1, 0), mask=(0, 1), dtype=dt), mvoid((3, 2), mask=(0, 1), dtype=dt)): ma2 = m2.view(MaskedArray) @@ -1463,13 +1464,15 @@ def test_eq_with_scalar(self): assert_equal(b != 1, masked) def test_eq_different_dimensions(self): - m1 = array([[0, 1], [1, 2]]) - m2 = array([1, 1], mask=[0, 1]) - test = (m1 == m2) - assert_equal(test, [[False, False], - [True, False]]) - assert_equal(test.mask, [[False, True], - [False, True]]) + m1 = array([1, 1], mask=[0, 1]) + # test comparison with both masked and regular arrays. + for m2 in (array([[0, 1], [1, 2]]), + np.array([[0, 1], [1, 2]])): + test = (m1 == m2) + assert_equal(test.data, [[False, False], + [True, False]]) + assert_equal(test.mask, [[False, True], + [False, True]]) def test_numpyarithmetics(self): # Check that the mask is not back-propagated when using numpy functions @@ -4055,7 +4058,15 @@ def test_make_mask(self): test = make_mask(mask, dtype=mask.dtype) assert_equal(test.dtype, bdtype) assert_equal(test, np.array([(0, 0), (0, 1)], dtype=bdtype)) - + # Ensure this also works for void + mask = np.array((False, True), dtype='?,?')[()] + assert_(isinstance(mask, np.void)) + test = make_mask(mask, dtype=mask.dtype) + assert_equal(test, mask) + assert_(test is not mask) + mask = np.array((0, 1), dtype='i4,i4')[()] + test2 = make_mask(mask, dtype=mask.dtype) + assert_equal(test2, test) # test that nomask is returned when m is nomask. bools = [True, False] dtypes = [MaskType, np.float] @@ -4064,7 +4075,6 @@ def test_make_mask(self): res = make_mask(nomask, copy=cpy, shrink=shr, dtype=dt) assert_(res is nomask, msgformat % (cpy, shr, dt)) - def test_mask_or(self): # Initialize mtype = [('a', np.bool), ('b', np.bool)]