From 812e359b50919fb99942c674098ce8b8c506ab5d Mon Sep 17 00:00:00 2001
From: Allan Haldane <allan.haldane@gmail.com>
Date: Thu, 27 Dec 2018 21:45:26 -0500
Subject: [PATCH] BUG: fix uint alignment asserts in lowlevel loops

---
 doc/source/reference/alignment.rst            | 24 ++++++-----
 .../core/src/multiarray/array_assign_array.c  | 12 +++++-
 .../core/src/multiarray/array_assign_scalar.c | 17 +++++---
 numpy/core/src/multiarray/common.h            |  2 +
 numpy/core/src/multiarray/ctors.c             |  3 +-
 numpy/core/src/multiarray/dtype_transfer.c    | 15 ++++---
 .../multiarray/lowlevel_strided_loops.c.src   | 40 +++++++++----------
 numpy/core/src/multiarray/mapping.c           |  6 ++-
 numpy/core/src/multiarray/nditer_constr.c     | 34 ++++++++++++----
 numpy/core/tests/test_multiarray.py           | 16 ++++++--
 10 files changed, 114 insertions(+), 55 deletions(-)

diff --git a/doc/source/reference/alignment.rst b/doc/source/reference/alignment.rst
index c749972b420d..b182a1e7b6b3 100644
--- a/doc/source/reference/alignment.rst
+++ b/doc/source/reference/alignment.rst
@@ -34,6 +34,14 @@ datatype is implemented as ``struct { float real, imag; }``. This has "true"
 alignment of 4 and "uint" alignment of 8 (equal to the true alignment of
 ``uint64``).
 
+Some cases where uint and true alignment are different (default gcc linux):
+   arch     type        true-aln    uint-aln
+   ----     ----        --------    --------
+   x86_64   complex64          4           8
+   x86_64   float128          16           8
+   x86      float96            4           -
+
+
 Variables in Numpy which control and describe alignment
 -------------------------------------------------------
 
@@ -82,17 +90,15 @@ Here is how the variables above are used:
     appropriate N. Otherwise numpy copies by doing ``memcpy(dst, src, N)``.
  5. Nditer code: Since this often calls the strided copy code, it must
     check for "uint alignment".
- 6. Cast code: if the array is "uint aligned" this will essentially do
-    ``*dst = CASTFUNC(*src)``. If not, it does
+ 6. Cast code: This checks for "true" alignment, as it does
+    ``*dst = CASTFUNC(*src)`` if aligned. Otherwise, it does
     ``memmove(srcval, src); dstval = CASTFUNC(srcval); memmove(dst, dstval)``
     where dstval/srcval are aligned.
 
-Note that in principle, only "true alignment" is required for casting code.
-However, because the casting code and copy code are deeply intertwined they
-both use "uint" alignment. This should be safe assuming uint alignment is
-always larger than true alignment, though it can cause unnecessary buffering if
-an array is "true aligned" but not "uint aligned". If there is ever a big
-rewrite of this code it would be good to allow them to use different
-alignments.
+Note that the strided-copy and strided-cast code are deeply intertwined and so
+any arrays being processed by them must be both uint and true aligned, even
+though te copy-code only needs uint alignment and the cast code only true
+alignment.  If there is ever a big rewrite of this code it would be good to
+allow them to use different alignments.
 
 
diff --git a/numpy/core/src/multiarray/array_assign_array.c b/numpy/core/src/multiarray/array_assign_array.c
index f692e03076f3..69a527dfab55 100644
--- a/numpy/core/src/multiarray/array_assign_array.c
+++ b/numpy/core/src/multiarray/array_assign_array.c
@@ -48,11 +48,15 @@ raw_array_assign_array(int ndim, npy_intp *shape,
 
     NPY_BEGIN_THREADS_DEF;
 
-    /* Check alignment */
+    /* Check both uint and true alignment */
     aligned = raw_array_is_aligned(ndim, shape, dst_data, dst_strides,
                                    npy_uint_alignment(dst_dtype->elsize)) &&
+              raw_array_is_aligned(ndim, shape, dst_data, dst_strides,
+                                   dst_dtype->alignment) &&
               raw_array_is_aligned(ndim, shape, src_data, src_strides,
                                    npy_uint_alignment(src_dtype->elsize));
+              raw_array_is_aligned(ndim, shape, src_data, src_strides,
+                                   src_dtype->alignment);
 
     /* Use raw iteration with no heap allocation */
     if (PyArray_PrepareTwoRawArrayIter(
@@ -133,11 +137,15 @@ raw_array_wheremasked_assign_array(int ndim, npy_intp *shape,
 
     NPY_BEGIN_THREADS_DEF;
 
-    /* Check alignment */
+    /* Check both uint and true alignment */
     aligned = raw_array_is_aligned(ndim, shape, dst_data, dst_strides,
                                    npy_uint_alignment(dst_dtype->elsize)) &&
+              raw_array_is_aligned(ndim, shape, dst_data, dst_strides,
+                                   dst_dtype->alignment) &&
               raw_array_is_aligned(ndim, shape, src_data, src_strides,
                                    npy_uint_alignment(src_dtype->elsize));
+              raw_array_is_aligned(ndim, shape, src_data, src_strides,
+                                   src_dtype->alignment);
 
     /* Use raw iteration with no heap allocation */
     if (PyArray_PrepareThreeRawArrayIter(
diff --git a/numpy/core/src/multiarray/array_assign_scalar.c b/numpy/core/src/multiarray/array_assign_scalar.c
index 841a41850793..ecb5be47b50a 100644
--- a/numpy/core/src/multiarray/array_assign_scalar.c
+++ b/numpy/core/src/multiarray/array_assign_scalar.c
@@ -45,10 +45,13 @@ raw_array_assign_scalar(int ndim, npy_intp *shape,
 
     NPY_BEGIN_THREADS_DEF;
 
-    /* Check alignment */
+    /* Check both uint and true alignment */
     aligned = raw_array_is_aligned(ndim, shape, dst_data, dst_strides,
                                    npy_uint_alignment(dst_dtype->elsize)) &&
-              npy_is_aligned(src_data, npy_uint_alignment(src_dtype->elsize));
+              raw_array_is_aligned(ndim, shape, dst_data, dst_strides,
+                                   dst_dtype->alignment) &&
+              npy_is_aligned(src_data, npy_uint_alignment(src_dtype->elsize) &&
+              npy_is_aligned(src_data, src_dtype->alignment));
 
     /* Use raw iteration with no heap allocation */
     if (PyArray_PrepareOneRawArrayIter(
@@ -116,10 +119,13 @@ raw_array_wheremasked_assign_scalar(int ndim, npy_intp *shape,
 
     NPY_BEGIN_THREADS_DEF;
 
-    /* Check alignment */
+    /* Check both uint and true alignment */
     aligned = raw_array_is_aligned(ndim, shape, dst_data, dst_strides,
                                    npy_uint_alignment(dst_dtype->elsize)) &&
-              npy_is_aligned(src_data, npy_uint_alignment(src_dtype->elsize));
+              raw_array_is_aligned(ndim, shape, dst_data, dst_strides,
+                                   dst_dtype->alignment) &&
+              npy_is_aligned(src_data, npy_uint_alignment(src_dtype->elsize) &&
+              npy_is_aligned(src_data, src_dtype->alignment));
 
     /* Use raw iteration with no heap allocation */
     if (PyArray_PrepareTwoRawArrayIter(
@@ -220,7 +226,8 @@ PyArray_AssignRawScalar(PyArrayObject *dst,
      * we also skip this if 'dst' has an object dtype.
      */
     if ((!PyArray_EquivTypes(PyArray_DESCR(dst), src_dtype) ||
-            !npy_is_aligned(src_data, npy_uint_alignment(src_dtype->elsize))) &&
+            !(npy_is_aligned(src_data, npy_uint_alignment(src_dtype->elsize)) &&
+              npy_is_aligned(src_data, src_dtype->alignment))) &&
                     PyArray_SIZE(dst) > 1 &&
                     !PyDataType_REFCHK(PyArray_DESCR(dst))) {
         char *tmp_src_data;
diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h
index 2b8d3d3a42ea..0e162903d4ea 100644
--- a/numpy/core/src/multiarray/common.h
+++ b/numpy/core/src/multiarray/common.h
@@ -182,6 +182,7 @@ check_and_adjust_axis(int *axis, int ndim)
 
 /* used for some alignment checks */
 #define _ALIGN(type) offsetof(struct {char c; type v;}, v)
+#define _UINT_ALIGN(type) npy_uint_alignment(sizeof(type))
 /*
  * Disable harmless compiler warning "4116: unnamed type definition in
  * parentheses" which is caused by the _ALIGN macro.
@@ -201,6 +202,7 @@ npy_is_aligned(const void * p, const npy_uintp alignment)
      * Assumes cast from pointer to uintp gives a sensible representation we
      * can use bitwise & on (not required by C standard, but used by glibc).
      * This test is faster than a direct modulo.
+     * Note alignment value of 0 is allowed and returns False.
      */
     return ((npy_uintp)(p) & ((alignment) - 1)) == 0;
 }
diff --git a/numpy/core/src/multiarray/ctors.c b/numpy/core/src/multiarray/ctors.c
index f77e414daa6d..a17621946014 100644
--- a/numpy/core/src/multiarray/ctors.c
+++ b/numpy/core/src/multiarray/ctors.c
@@ -2827,7 +2827,8 @@ PyArray_CopyAsFlat(PyArrayObject *dst, PyArrayObject *src, NPY_ORDER order)
      * contiguous strides, etc.
      */
     if (PyArray_GetDTypeTransferFunction(
-                    IsUintAligned(src) && IsUintAligned(dst),
+                    IsUintAligned(src) && IsAligned(src) &&
+                    IsUintAligned(dst) && IsAligned(dst),
                     src_stride, dst_stride,
                     PyArray_DESCR(src), PyArray_DESCR(dst),
                     0,
diff --git a/numpy/core/src/multiarray/dtype_transfer.c b/numpy/core/src/multiarray/dtype_transfer.c
index 2b29d4f8c9dc..63b1ead25cb3 100644
--- a/numpy/core/src/multiarray/dtype_transfer.c
+++ b/numpy/core/src/multiarray/dtype_transfer.c
@@ -26,6 +26,7 @@
 #include "_datetime.h"
 #include "datetime_strings.h"
 #include "descriptor.h"
+#include "array_assign.h"
 
 #include "shape.h"
 #include "lowlevel_strided_loops.h"
@@ -3765,11 +3766,15 @@ PyArray_CastRawArrays(npy_intp count,
         return NPY_SUCCEED;
     }
 
-    /* Check data alignment */
-    aligned = (((npy_intp)src | src_stride) &
-                                (src_dtype->alignment - 1)) == 0 &&
-              (((npy_intp)dst | dst_stride) &
-                                (dst_dtype->alignment - 1)) == 0;
+    /* Check data alignment, both uint and true */
+    aligned = raw_array_is_aligned(1, &count, dst, &dst_stride,
+                                   npy_uint_alignment(dst_dtype->elsize)) &&
+              raw_array_is_aligned(1, &count, dst, &dst_stride,
+                                   dst_dtype->alignment) &&
+              raw_array_is_aligned(1, &count, src, &src_stride,
+                                   npy_uint_alignment(src_dtype->elsize)) &&
+              raw_array_is_aligned(1, &count, src, &src_stride,
+                                   src_dtype->alignment);
 
     /* Get the function to do the casting */
     if (PyArray_GetDTypeTransferFunction(aligned,
diff --git a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
index 896e466c8930..16bacf1abc48 100644
--- a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
+++ b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
@@ -82,7 +82,7 @@
 /**begin repeat
  * #elsize = 1, 2, 4, 8, 16#
  * #elsize_half = 0, 1, 2, 4, 8#
- * #type = npy_uint8, npy_uint16, npy_uint32, npy_uint64, npy_uint128#
+ * #type = npy_uint8, npy_uint16, npy_uint32, npy_uint64, npy_uint64#
  */
 /**begin repeat1
  * #oper = strided_to_strided, strided_to_contig,
@@ -119,10 +119,10 @@ static void
                         npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                         NpyAuxData *NPY_UNUSED(data))
 {
-#if @is_aligned@ && @elsize@ != 16
+#if @is_aligned@
     /* sanity check */
-    assert(N == 0 || npy_is_aligned(dst, _ALIGN(@type@)));
-    assert(N == 0 || npy_is_aligned(src, _ALIGN(@type@)));
+    assert(N == 0 || npy_is_aligned(dst, _UINT_ALIGN(@type@)));
+    assert(N == 0 || npy_is_aligned(src, _UINT_ALIGN(@type@)));
 #endif
     /*printf("fn @prefix@_@oper@_size@elsize@\n");*/
     while (N > 0) {
@@ -201,8 +201,8 @@ static NPY_GCC_OPT_3 void
     }
 #if @is_aligned@ && @elsize@ != 16
     /* sanity check */
-    assert(N == 0 || npy_is_aligned(dst, _ALIGN(@type@)));
-    assert(N == 0 || npy_is_aligned(src, _ALIGN(@type@)));
+    assert(N == 0 || npy_is_aligned(dst, _UINT_ALIGN(@type@)));
+    assert(N == 0 || npy_is_aligned(src, _UINT_ALIGN(@type@)));
 #endif
 #if @elsize@ == 1 && @dst_contig@
     memset(dst, *src, N);
@@ -808,12 +808,8 @@ static NPY_GCC_OPT_3 void
 
 #if @aligned@
    /* sanity check */
-#  if !@is_complex1@
     assert(N == 0 || npy_is_aligned(src, _ALIGN(_TYPE1)));
-#  endif
-#  if !@is_complex2@
     assert(N == 0 || npy_is_aligned(dst, _ALIGN(_TYPE2)));
-#  endif
 #endif
 
     /*printf("@prefix@_cast_@name1@_to_@name2@\n");*/
@@ -1425,7 +1421,7 @@ mapiter_trivial_@name@(PyArrayObject *self, PyArrayObject *ind,
         while (itersize--) {
             char * self_ptr;
             npy_intp indval = *((npy_intp*)ind_ptr);
-            assert(npy_is_aligned(ind_ptr, _ALIGN(npy_intp)));
+            assert(npy_is_aligned(ind_ptr, _UINT_ALIGN(npy_intp)));
 #if @isget@
             if (check_and_adjust_index(&indval, fancy_dim, 0, _save) < 0 ) {
                 return -1;
@@ -1439,8 +1435,8 @@ mapiter_trivial_@name@(PyArrayObject *self, PyArrayObject *ind,
 
 #if @isget@
 #if @elsize@
-            assert(npy_is_aligned(result_ptr, _ALIGN(@copytype@)));
-            assert(npy_is_aligned(self_ptr, _ALIGN(@copytype@)));
+            assert(npy_is_aligned(result_ptr, _UINT_ALIGN(@copytype@)));
+            assert(npy_is_aligned(self_ptr, _UINT_ALIGN(@copytype@)));
             *(@copytype@ *)result_ptr = *(@copytype@ *)self_ptr;
 #else
             copyswap(result_ptr, self_ptr, 0, self);
@@ -1448,8 +1444,8 @@ mapiter_trivial_@name@(PyArrayObject *self, PyArrayObject *ind,
 
 #else /* !@isget@ */
 #if @elsize@
-            assert(npy_is_aligned(result_ptr, _ALIGN(@copytype@)));
-            assert(npy_is_aligned(self_ptr, _ALIGN(@copytype@)));
+            assert(npy_is_aligned(result_ptr, _UINT_ALIGN(@copytype@)));
+            assert(npy_is_aligned(self_ptr, _UINT_ALIGN(@copytype@)));
             *(@copytype@ *)self_ptr = *(@copytype@ *)result_ptr;
 #else
             copyswap(self_ptr, result_ptr, 0, self);
@@ -1571,7 +1567,7 @@ mapiter_@name@(PyArrayMapIterObject *mit)
                         for (i=0; i < @numiter@; i++) {
                             npy_intp indval = *((npy_intp*)outer_ptrs[i]);
                             assert(npy_is_aligned(outer_ptrs[i],
-                                                  _ALIGN(npy_intp)));
+                                                  _UINT_ALIGN(npy_intp)));
 
 #if @isget@ && @one_iter@
                             if (check_and_adjust_index(&indval, fancy_dims[i],
@@ -1591,16 +1587,20 @@ mapiter_@name@(PyArrayMapIterObject *mit)
 
 #if @isget@
 #if @elsize@
-                        assert(npy_is_aligned(outer_ptrs[i], _ALIGN(@copytype@)));
-                        assert(npy_is_aligned(self_ptr, _ALIGN(@copytype@)));
+                        assert(npy_is_aligned(outer_ptrs[i],
+                                              _UINT_ALIGN(@copytype@)));
+                        assert(npy_is_aligned(self_ptr,
+                                              _UINT_ALIGN(@copytype@)));
                         *(@copytype@ *)(outer_ptrs[i]) = *(@copytype@ *)self_ptr;
 #else
                         copyswap(outer_ptrs[i], self_ptr, 0, array);
 #endif
 #else /* !@isget@ */
 #if @elsize@
-                        assert(npy_is_aligned(outer_ptrs[i], _ALIGN(@copytype@)));
-                        assert(npy_is_aligned(self_ptr, _ALIGN(@copytype@)));
+                        assert(npy_is_aligned(outer_ptrs[i],
+                               _UINT_ALIGN(@copytype@)));
+                        assert(npy_is_aligned(self_ptr,
+                               _UINT_ALIGN(@copytype@)));
                         *(@copytype@ *)self_ptr = *(@copytype@ *)(outer_ptrs[i]);
 #else
                         copyswap(self_ptr, outer_ptrs[i], 0, array);
diff --git a/numpy/core/src/multiarray/mapping.c b/numpy/core/src/multiarray/mapping.c
index 1b05faeebf37..17edd2bbf474 100644
--- a/numpy/core/src/multiarray/mapping.c
+++ b/numpy/core/src/multiarray/mapping.c
@@ -1064,7 +1064,8 @@ array_boolean_subscript(PyArrayObject *self,
 
         /* Get a dtype transfer function */
         NpyIter_GetInnerFixedStrideArray(iter, fixed_strides);
-        if (PyArray_GetDTypeTransferFunction(IsUintAligned(self),
+        if (PyArray_GetDTypeTransferFunction(
+                        IsUintAligned(self) && IsAligned(self),
                         fixed_strides[0], itemsize,
                         dtype, dtype,
                         0,
@@ -1253,7 +1254,8 @@ array_assign_boolean_subscript(PyArrayObject *self,
         /* Get a dtype transfer function */
         NpyIter_GetInnerFixedStrideArray(iter, fixed_strides);
         if (PyArray_GetDTypeTransferFunction(
-                        IsUintAligned(self) && IsUintAligned(v),
+                        IsUintAligned(self) && IsAligned(self) &&
+                        IsUintAligned(v) && IsAligned(v),
                         v_stride, fixed_strides[0],
                         PyArray_DESCR(v), PyArray_DESCR(self),
                         0,
diff --git a/numpy/core/src/multiarray/nditer_constr.c b/numpy/core/src/multiarray/nditer_constr.c
index dbb24f26bc00..c0341f8828bf 100644
--- a/numpy/core/src/multiarray/nditer_constr.c
+++ b/numpy/core/src/multiarray/nditer_constr.c
@@ -1132,7 +1132,7 @@ npyiter_prepare_one_operand(PyArrayObject **op,
         /* Check if the operand is aligned */
         if (op_flags & NPY_ITER_ALIGNED) {
             /* Check alignment */
-            if (!IsUintAligned(*op)) {
+            if (!(IsUintAligned(*op) && IsAligned(*op))) {
                 NPY_IT_DBG_PRINT("Iterator: Setting NPY_OP_ITFLAG_CAST "
                                     "because of NPY_ITER_ALIGNED\n");
                 *op_itflags |= NPY_OP_ITFLAG_CAST;
@@ -2851,8 +2851,14 @@ npyiter_allocate_arrays(NpyIter *iter,
             npyiter_replace_axisdata(iter, iop, op[iop], ondim,
                     PyArray_DATA(op[iop]), op_axes ? op_axes[iop] : NULL);
 
-            /* New arrays are aligned and need no cast */
-            op_itflags[iop] |= NPY_OP_ITFLAG_ALIGNED;
+            /*
+             * New arrays are guaranteed true-aligned, but copy/cast code
+             * needs uint-alignment in addition.
+             */
+            if (IsUintAligned(out)) {
+                op_itflags[iop] |= NPY_OP_ITFLAG_ALIGNED;
+            }
+            /* New arrays need no cast */
             op_itflags[iop] &= ~NPY_OP_ITFLAG_CAST;
         }
         /*
@@ -2888,11 +2894,17 @@ npyiter_allocate_arrays(NpyIter *iter,
                     PyArray_DATA(op[iop]), NULL);
 
             /*
-             * New arrays are aligned need no cast, and in the case
+             * New arrays are guaranteed true-aligned, but copy/cast code
+             * needs uint-alignment in addition.
+             */
+            if (IsUintAligned(temp)) {
+                op_itflags[iop] |= NPY_OP_ITFLAG_ALIGNED;
+            }
+            /*
+             * New arrays need no cast, and in the case
              * of scalars, always have stride 0 so never need buffering
              */
-            op_itflags[iop] |= (NPY_OP_ITFLAG_ALIGNED |
-                                  NPY_OP_ITFLAG_BUFNEVER);
+            op_itflags[iop] |= NPY_OP_ITFLAG_BUFNEVER;
             op_itflags[iop] &= ~NPY_OP_ITFLAG_CAST;
             if (itflags & NPY_ITFLAG_BUFFER) {
                 NBF_STRIDES(bufferdata)[iop] = 0;
@@ -2953,8 +2965,14 @@ npyiter_allocate_arrays(NpyIter *iter,
             npyiter_replace_axisdata(iter, iop, op[iop], ondim,
                     PyArray_DATA(op[iop]), op_axes ? op_axes[iop] : NULL);
 
-            /* The temporary copy is aligned and needs no cast */
-            op_itflags[iop] |= NPY_OP_ITFLAG_ALIGNED;
+            /*
+             * New arrays are guaranteed true-aligned, but copy/cast code
+             * additionally needs uint-alignment in addition.
+             */
+            if (IsUintAligned(temp)) {
+                op_itflags[iop] |= NPY_OP_ITFLAG_ALIGNED;
+            }
+            /* The temporary copy needs no cast */
             op_itflags[iop] &= ~NPY_OP_ITFLAG_CAST;
         }
         else {
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index 91f4526bd439..70cf74c4a19d 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -54,7 +54,12 @@
 
 
 def _aligned_zeros(shape, dtype=float, order="C", align=None):
-    """Allocate a new ndarray with aligned memory."""
+    """
+    Allocate a new ndarray with aligned memory.
+
+    The ndarray is guaranteed *not* aligned to twice the requested alignment.
+    Eg, if align=4, guarantees it is not aligned to 8. If align=None uses
+    dtype.alignment."""
     dtype = np.dtype(dtype)
     if dtype == np.dtype(object):
         # Can't do this, fall back to standard allocation (which
@@ -67,10 +72,15 @@ def _aligned_zeros(shape, dtype=float, order="C", align=None):
     if not hasattr(shape, '__len__'):
         shape = (shape,)
     size = functools.reduce(operator.mul, shape) * dtype.itemsize
-    buf = np.empty(size + align + 1, np.uint8)
-    offset = buf.__array_interface__['data'][0] % align
+    buf = np.empty(size + 2*align + 1, np.uint8)
+
+    ptr = buf.__array_interface__['data'][0]
+    offset = ptr % align
     if offset != 0:
         offset = align - offset
+    if (ptr % (2*align)) == 0:
+        offset += align
+
     # Note: slices producing 0-size arrays do not necessarily change
     # data pointer --- so we use and allocate size+1
     buf = buf[offset:offset+size+1][:-1]