Merge pull request numpy#10411 from ahaldane/revert_multifield_view

BUG: Revert multifield-indexing adds padding bytes for NumPy 1.15.
merraksh · Jun 11, 2018 · 1b92080 · 1b92080
2 parents 8525d3e + e08eced
commit 1b92080
Show file tree

Hide file tree

Showing 10 changed files with 297 additions and 34 deletions.
diff --git a/doc/release/1.15.0-notes.rst b/doc/release/1.15.0-notes.rst
@@ -154,6 +154,16 @@ Since ``np.ma.masked`` is a readonly scalar, copying should be a no-op. These
 functions now behave consistently with ``np.copy()``.
 
 
+Multifield Indexing of Structured Arrays will still return a copy
+-----------------------------------------------------------------
+The change that multi-field indexing of structured arrays returns a view
+instead of a copy is pushed back to 1.16. A new method
+``numpy.lib.recfunctions.repack_fields`` has been introduced to help mitigate
+the effects of this change, which can be used to write code compatible with
+both numpy 1.15 and 1.16. For more information on how to update code to account
+for this future change see "basics/structured arrays/accessing multiple fields"
+in the user guide.
+
 C API changes
 =============
 

diff --git a/numpy/core/src/multiarray/common.c b/numpy/core/src/multiarray/common.c
@@ -424,7 +424,7 @@ PyArray_DTypeFromObjectHelper(PyObject *obj, int maxdims,
      * __len__ is not defined.
      */
     if (maxdims == 0 || !PySequence_Check(obj) || PySequence_Size(obj) < 0) {
-        // clear any PySequence_Size error, which corrupts further calls to it
+        /* clear any PySequence_Size error which corrupts further calls */
         PyErr_Clear();
 
         if (*out_dtype == NULL || (*out_dtype)->type_num != NPY_OBJECT) {

diff --git a/numpy/core/src/multiarray/convert.c b/numpy/core/src/multiarray/convert.c
@@ -613,11 +613,14 @@ PyArray_View(PyArrayObject *self, PyArray_Descr *type, PyTypeObject *pytype)
         subtype = Py_TYPE(self);
     }
 
-    if (type != NULL && (PyArray_FLAGS(self) & NPY_ARRAY_WARN_ON_WRITE)) {
+    dtype = PyArray_DESCR(self);
+
+    if (type != NULL && !PyArray_EquivTypes(dtype, type) &&
+            (PyArray_FLAGS(self) & NPY_ARRAY_WARN_ON_WRITE)) {
         const char *msg =
             "Numpy has detected that you may be viewing or writing to an array "
             "returned by selecting multiple fields in a structured array. \n\n"
-            "This code may break in numpy 1.13 because this will return a view "
+            "This code may break in numpy 1.16 because this will return a view "
             "instead of a copy -- see release notes for details.";
         /* 2016-09-19, 1.12 */
         if (DEPRECATE_FUTUREWARNING(msg) < 0) {
@@ -629,7 +632,6 @@ PyArray_View(PyArrayObject *self, PyArray_Descr *type, PyTypeObject *pytype)
 
     flags = PyArray_FLAGS(self);
 
-    dtype = PyArray_DESCR(self);
     Py_INCREF(dtype);
     ret = (PyArrayObject *)PyArray_NewFromDescr_int(
             subtype, dtype,

diff --git a/numpy/core/src/multiarray/mapping.c b/numpy/core/src/multiarray/mapping.c
@@ -1386,15 +1386,55 @@ array_subscript_asarray(PyArrayObject *self, PyObject *op)
     return PyArray_EnsureAnyArray(array_subscript(self, op));
 }
 
+/*
+ * Helper function for _get_field_view which turns a multifield
+ * view into a "packed" copy, as done in numpy 1.15 and before.
+ * In numpy 1.16 this function should be removed.
+ */
+NPY_NO_EXPORT int
+_multifield_view_to_copy(PyArrayObject **view) {
+    static PyObject *copyfunc = NULL;
+    PyObject *viewcopy;
+
+    /* return a repacked copy of the view */
+    npy_cache_import("numpy.lib.recfunctions", "repack_fields", &copyfunc);
+    if (copyfunc == NULL) {
+        goto view_fail;
+    }
+
+    PyArray_CLEARFLAGS(*view, NPY_ARRAY_WARN_ON_WRITE);
+    viewcopy = PyObject_CallFunction(copyfunc, "O", *view);
+    if (viewcopy == NULL) {
+        goto view_fail;
+    }
+    Py_DECREF(*view);
+    *view = (PyArrayObject*)viewcopy;
+
+    /* warn when writing to the copy */
+    PyArray_ENABLEFLAGS(*view, NPY_ARRAY_WARN_ON_WRITE);
+    return 0;
+
+view_fail:
+    Py_DECREF(*view);
+    *view = NULL;
+    return 0;
+}
+
 /*
  * Attempts to subscript an array using a field name or list of field names.
  *
  * If an error occurred, return 0 and set view to NULL. If the subscript is not
  * a string or list of strings, return -1 and set view to NULL. Otherwise
  * return 0 and set view to point to a new view into arr for the given fields.
+ *
+ * In numpy 1.15 and before, in the case of a list of field names the returned
+ * view will actually be a copy by default, with fields packed together.
+ * The `force_view` argument causes a view to be returned. This argument can be
+ * removed in 1.16 when we plan to return a view always.
  */
 NPY_NO_EXPORT int
-_get_field_view(PyArrayObject *arr, PyObject *ind, PyArrayObject **view)
+_get_field_view(PyArrayObject *arr, PyObject *ind, PyArrayObject **view,
+                int force_view)
 {
     *view = NULL;
 
@@ -1489,12 +1529,12 @@ _get_field_view(PyArrayObject *arr, PyObject *ind, PyArrayObject **view)
                 Py_DECREF(names);
                 return 0;
             }
-            // disallow use of titles as index
+            /* disallow use of titles as index */
             if (PyTuple_Size(tup) == 3) {
                 PyObject *title = PyTuple_GET_ITEM(tup, 2);
                 int titlecmp = PyObject_RichCompareBool(title, name, Py_EQ);
                 if (titlecmp == 1) {
-                    // if title == name, we were given a title, not a field name
+                    /* if title == name, we got a title, not a field name */
                     PyErr_SetString(PyExc_KeyError,
                                 "cannot use field titles in multi-field index");
                 }
@@ -1507,7 +1547,7 @@ _get_field_view(PyArrayObject *arr, PyObject *ind, PyArrayObject **view)
                 }
                 Py_DECREF(title);
             }
-            // disallow duplicate field indices
+            /* disallow duplicate field indices */
             if (PyDict_Contains(fields, name)) {
                 PyObject *errmsg = PyUString_FromString(
                                        "duplicate field of name ");
@@ -1552,10 +1592,16 @@ _get_field_view(PyArrayObject *arr, PyObject *ind, PyArrayObject **view)
                 PyArray_FLAGS(arr),
                 (PyObject *)arr, (PyObject *)arr,
                 0, 1);
+
         if (*view == NULL) {
             return 0;
         }
-        return 0;
+
+        /* the code below can be replaced by "return 0" in 1.16 */
+        if (force_view) {
+            return 0;
+        }
+        return _multifield_view_to_copy(view);
     }
     return -1;
 }
@@ -1583,7 +1629,7 @@ array_subscript(PyArrayObject *self, PyObject *op)
     /* return fields if op is a string index */
     if (PyDataType_HASFIELDS(PyArray_DESCR(self))) {
         PyArrayObject *view;
-        int ret = _get_field_view(self, op, &view);
+        int ret = _get_field_view(self, op, &view, 0);
         if (ret == 0){
             if (view == NULL) {
                 return NULL;
@@ -1865,7 +1911,7 @@ array_assign_subscript(PyArrayObject *self, PyObject *ind, PyObject *op)
     /* field access */
     if (PyDataType_HASFIELDS(PyArray_DESCR(self))){
         PyArrayObject *view;
-        int ret = _get_field_view(self, ind, &view);
+        int ret = _get_field_view(self, ind, &view, 1);
         if (ret == 0){
             if (view == NULL) {
                 return -1;

diff --git a/numpy/core/src/private/npy_import.h b/numpy/core/src/private/npy_import.h
@@ -29,4 +29,18 @@ npy_cache_import(const char *module, const char *attr, PyObject **cache)
     }
 }
 
+NPY_INLINE static PyObject *
+npy_import(const char *module, const char *attr)
+{
+    PyObject *mod = PyImport_ImportModule(module);
+    PyObject *ret = NULL;
+
+    if (mod != NULL) {
+        ret = PyObject_GetAttrString(mod, attr);
+    }
+    Py_XDECREF(mod);
+
+    return ret;
+}
+
 #endif
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
@@ -4796,9 +4796,25 @@ def test_field_names(self):
             fn2 = func('f2')
             b[fn2] = 3
 
-            assert_equal(b[['f1', 'f2']][0].tolist(), (2, 3))
-            assert_equal(b[['f2', 'f1']][0].tolist(), (3, 2))
-            assert_equal(b[['f1', 'f3']][0].tolist(), (2, (1,)))
+            # In 1.16 code below can be replaced by:
+            # assert_equal(b[['f1', 'f2']][0].tolist(), (2, 3))
+            # assert_equal(b[['f2', 'f1']][0].tolist(), (3, 2))
+            # assert_equal(b[['f1', 'f3']][0].tolist(), (2, (1,)))
+            with suppress_warnings() as sup:
+                sup.filter(FutureWarning,
+                           ".* selecting multiple fields .*")
+
+                assert_equal(b[['f1', 'f2']][0].tolist(), (2, 3))
+                assert_equal(b[['f2', 'f1']][0].tolist(), (3, 2))
+                assert_equal(b[['f1', 'f3']][0].tolist(), (2, (1,)))
+                # view of subfield view/copy
+                assert_equal(b[['f1', 'f2']][0].view(('i4', 2)).tolist(),
+                             (2, 3))
+                assert_equal(b[['f2', 'f1']][0].view(('i4', 2)).tolist(),
+                             (3, 2))
+                view_dtype = [('f1', 'i4'), ('f3', [('', 'i4')])]
+                assert_equal(b[['f1', 'f3']][0].view(view_dtype).tolist(),
+                             (2, (1,)))
 
         # non-ascii unicode field indexing is well behaved
         if not is_py3:
@@ -4808,6 +4824,51 @@ def test_field_names(self):
             assert_raises(ValueError, a.__setitem__, u'\u03e0', 1)
             assert_raises(ValueError, a.__getitem__, u'\u03e0')
 
+    # can be removed in 1.16
+    def test_field_names_deprecation(self):
+
+        def collect_warnings(f, *args, **kwargs):
+            with warnings.catch_warnings(record=True) as log:
+                warnings.simplefilter("always")
+                f(*args, **kwargs)
+            return [w.category for w in log]
+
+        a = np.zeros((1,), dtype=[('f1', 'i4'),
+                                  ('f2', 'i4'),
+                                  ('f3', [('sf1', 'i4')])])
+        a['f1'][0] = 1
+        a['f2'][0] = 2
+        a['f3'][0] = (3,)
+        b = np.zeros((1,), dtype=[('f1', 'i4'),
+                                  ('f2', 'i4'),
+                                  ('f3', [('sf1', 'i4')])])
+        b['f1'][0] = 1
+        b['f2'][0] = 2
+        b['f3'][0] = (3,)
+
+        # All the different functions raise a warning, but not an error
+        assert_equal(collect_warnings(a[['f1', 'f2']].__setitem__, 0, (10, 20)),
+                     [FutureWarning])
+        # For <=1.12 a is not modified, but it will be in 1.13
+        assert_equal(a, b)
+
+        # Views also warn
+        subset = a[['f1', 'f2']]
+        subset_view = subset.view()
+        assert_equal(collect_warnings(subset_view['f1'].__setitem__, 0, 10),
+                     [FutureWarning])
+        # But the write goes through:
+        assert_equal(subset['f1'][0], 10)
+        # Only one warning per multiple field indexing, though (even if there
+        # are multiple views involved):
+        assert_equal(collect_warnings(subset['f1'].__setitem__, 0, 10), [])
+
+        # make sure views of a multi-field index warn too
+        c = np.zeros(3, dtype='i8,i8,i8')
+        assert_equal(collect_warnings(c[['f0', 'f2']].view, 'i8,i8'),
+                     [FutureWarning])
+
+
     def test_record_hash(self):
         a = np.array([(1, 2), (1, 2)], dtype='i1,i2')
         a.flags.writeable = False

diff --git a/numpy/core/tests/test_records.py b/numpy/core/tests/test_records.py
@@ -11,6 +11,7 @@
 import warnings
 import textwrap
 from os import path
+import pytest
 
 import numpy as np
 from numpy.testing import (
@@ -360,6 +361,7 @@ def test_nonwriteable_setfield(self):
         with assert_raises(ValueError):
             r.setfield([2,3], *r.dtype.fields['f'])
 
+    @pytest.mark.xfail(reason="See gh-10411, becomes real error in 1.16")
     def test_out_of_order_fields(self):
         # names in the same order, padding added to descr
         x = self.data[['col1', 'col2']]

diff --git a/numpy/doc/structured_arrays.py b/numpy/doc/structured_arrays.py
@@ -133,10 +133,9 @@
 
      Offsets may be chosen such that the fields overlap, though this will mean
      that assigning to one field may clobber any overlapping field's data. As
-     an exception, fields of :class:`numpy.object` type .. (see 
-     :ref:`object arrays <arrays.object>`) cannot overlap with other fields,
-     because of the risk of clobbering the internal object pointer and then
-     dereferencing it.
+     an exception, fields of :class:`numpy.object` type cannot overlap with
+     other fields, because of the risk of clobbering the internal object
+     pointer and then dereferencing it.
 
      The optional 'aligned' value can be set to ``True`` to make the automatic
      offset computation use aligned offsets (see :ref:`offsets-and-alignment`),
@@ -235,6 +234,11 @@
 alignment conditions, the array will have the ``ALIGNED`` :ref:`flag
 <numpy.ndarray.flags>` set.
 
+A convenience function :func:`numpy.lib.recfunctions.repack_fields` converts an
+aligned dtype or array to a packed one and vice versa. It takes either a dtype
+or structured ndarray as an argument, and returns a copy with fields re-packed,
+with or without padding bytes.
+
 .. _titles:
 
 Field Titles
@@ -396,27 +400,61 @@
 Accessing Multiple Fields
 ```````````````````````````
 
-One can index a structured array with a multi-field index, where the index is a
-list of field names::
+One can index and assign to a structured array with a multi-field index, where
+the index is a list of field names.
+
+.. warning::
+    The behavior of multi-field indexes will change from Numpy 1.15 to Numpy
+    1.16.
 
- >>> a = np.zeros(3, dtype=[('a', 'i8'), ('b', 'i4'), ('c', 'f8')])
+In Numpy 1.16, the result of indexing with a multi-field index will be a view
+into the original array, as follows::
+
+ >>> a = np.zeros(3, dtype=[('a', 'i4'), ('b', 'i4'), ('c', 'f4')])
  >>> a[['a', 'c']]
- array([(0, 0.0), (0, 0.0), (0, 0.0)],
-       dtype={'names':['a','c'], 'formats':['<i8','<f8'], 'offsets':[0,11], 'itemsize':19})
+ array([(0, 0.), (0, 0.), (0, 0.)],
+      dtype={'names':['a','c'], 'formats':['<i4','<f4'], 'offsets':[0,8], 'itemsize':12})
+
+Assignment to the view modifies the original array. The view's fields will be
+in the order they were indexed. Note that unlike for single-field indexing, the
+view's dtype has the same itemsize as the original array, and has fields at the
+same offsets as in the original array, and unindexed fields are merely missing.
+
+In Numpy 1.15, indexing an array with a multi-field index returns a copy of
+the result above for 1.16, but with fields packed together in memory as if
+passed through :func:`numpy.lib.recfunctions.repack_fields`. This is the
+behavior since Numpy 1.7.
+
+.. warning::
+   The new behavior in Numpy 1.16 leads to extra "padding" bytes at the
+   location of unindexed fields. You will need to update any code which depends
+   on the data having a "packed" layout. For instance code such as::
+
+    >>> a[['a','c']].view('i8')  # will fail in Numpy 1.16
+    ValueError: When changing to a smaller dtype, its size must be a divisor of the size of original dtype
+
+   will need to be changed. This code has raised a ``FutureWarning`` since
+   Numpy 1.12.
+
+   The following is a recommended fix, which will behave identically in Numpy
+   1.15 and Numpy 1.16::
+
+    >>> from numpy.lib.recfunctions import repack_fields
+    >>> repack_fields(a[['a','c']]).view('i8')  # supported 1.15 and 1.16
+    array([0, 0, 0])
+
+Assigning to an array with a multi-field index will behave the same in Numpy
+1.15 and Numpy 1.16. In both versions the assignment will modify the original
+array::
+
  >>> a[['a', 'c']] = (2, 3)
  >>> a
  array([(2, 0, 3.0), (2, 0, 3.0), (2, 0, 3.0)],
        dtype=[('a', '<i8'), ('b', '<i4'), ('c', '<f8')])
 
-The resulting array is a view into the original array, such that assignment to
-the view modifies the original array. The view's fields will be in the order
-they were indexed. Note that unlike for single-field indexing, the view's dtype
-has the same itemsize as the original array, and has fields at the same offsets
-as in the original array, and unindexed fields are merely missing.
-
-Since the view is a structured array itself, it obeys the assignment rules
-described above. For example, this means that one can swap the values of two
-fields using appropriate multi-field indexes::
+This obeys the structured array assignment rules described above. For example,
+this means that one can swap the values of two fields using appropriate
+multi-field indexes::
 
  >>> a[['a', 'c']] = a[['c', 'a']]