Skip to content

Commit

Permalink
Merge pull request numpy#3931 from juliantaylor/memchr-move
Browse files Browse the repository at this point in the history
move memchr like code to a function
  • Loading branch information
charris committed Oct 17, 2013
2 parents cf0869e + 7d4ea16 commit 3b3fa76
Show file tree
Hide file tree
Showing 11 changed files with 136 additions and 54 deletions.
4 changes: 4 additions & 0 deletions numpy/core/include/numpy/npy_cpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,4 +108,8 @@
#endif
#endif

/*
 * Defined when building for x86 or AMD64 (NPY_CPU_X86 / NPY_CPU_AMD64).
 * Code can test for this macro to enable paths that load wider words
 * through pointers that are not guaranteed to be aligned.
 */
#if (defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64))
#define NPY_CPU_HAVE_UNALIGNED_ACCESS
#endif

#endif
1 change: 1 addition & 0 deletions numpy/core/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -894,6 +894,7 @@ def generate_umath_c(ext, build_dir):

umath_deps = [
generate_umath_py,
join('src', 'multiarray', 'common.h'),
join('src', 'umath', 'simd.inc.src'),
join(codegen_dir, 'generate_ufunc_api.py'),
join('src', 'private', 'ufunc_override.h')] + npymath_sources
Expand Down
1 change: 1 addition & 0 deletions numpy/core/setup_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ def check_api_version(apiversion, codegen_dir):
("__builtin_bswap32", '5u'),
("__builtin_bswap64", '5u'),
("__builtin_expect", '5, 0'),
("__builtin_ctz", '5'),
("_mm_load_ps", '(float*)0', "xmmintrin.h"), # SSE
("_mm_load_pd", '(double*)0', "emmintrin.h"), # SSE2
]
Expand Down
2 changes: 1 addition & 1 deletion numpy/core/src/multiarray/common.c
Original file line number Diff line number Diff line change
Expand Up @@ -706,7 +706,7 @@ _IsAligned(PyArrayObject *ap)
aligned |= (npy_uintp)PyArray_STRIDES(ap)[i];
#endif /* not NPY_RELAXED_STRIDES_CHECKING */
}
return npy_is_aligned(aligned, alignment);
return npy_is_aligned((void *)aligned, alignment);
}

NPY_NO_EXPORT npy_bool
Expand Down
66 changes: 66 additions & 0 deletions numpy/core/src/multiarray/common.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#ifndef _NPY_PRIVATE_COMMON_H_
#define _NPY_PRIVATE_COMMON_H_
#include <numpy/npy_common.h>
#include <numpy/npy_cpu.h>

#define error_converting(x) (((x) == -1) && PyErr_Occurred())

Expand Down Expand Up @@ -84,6 +85,71 @@ npy_is_aligned(const void * p, const npy_uintp alignment)
}
}

/*
 * memchr with stride and invert argument
 * intended for small searches where a call out to libc memchr is costly.
 *
 * haystack    - first byte to examine
 * needle      - byte value to search for
 * stride      - distance in bytes between consecutive elements; may be
 *               positive, negative or zero
 * size        - number of elements (not bytes) to examine
 * subloopsize - receives the number of elements stepped over before the
 *               search stopped
 * invert      - if zero, stop at the first element equal to needle;
 *               otherwise stop at the first element different from needle
 *
 * Returns a pointer to the element the search stopped at. Compared to
 * memchr it returns one stride past the last element
 * (haystack + size * stride) instead of NULL if no element stops the
 * search; *subloopsize is then equal to size.
 * With stride == 0 the pointer cannot advance, so haystack is returned and
 * *subloopsize is either 0 or size.
 */
static NPY_INLINE char *
npy_memchr(char * haystack, char needle,
           npy_intp stride, npy_intp size, npy_intp * subloopsize, int invert)
{
    char * p = haystack;
    npy_intp count = 0;

    /* nothing to look at, do not touch the memory at all */
    if (size <= 0) {
        *subloopsize = 0;
        return haystack;
    }

    if (stride == 0) {
        /* all elements alias the first byte, one comparison decides */
        const int stop = invert ? (*p != needle) : (*p == needle);
        *subloopsize = stop ? 0 : size;
        return haystack;
    }

    /*
     * The loops below are bounded by an element counter instead of an end
     * pointer: an end pointer is only meaningful for stride == 1 and is
     * never reached when walking backwards with a negative stride.
     */
    if (!invert) {
        /*
         * this is usually the path to determine elements to process,
         * performance less important here.
         * memchr has large setup cost if 0 byte is close to start.
         */
        while (count < size && *p != needle) {
            ++count;
            p += stride;
        }
    }
    else {
        /* usually find elements to skip path */
#ifdef NPY_CPU_HAVE_UNALIGNED_ACCESS
        if (needle == 0 && stride == 1) {
            /*
             * Skip whole words of zero bytes. The block end is derived from
             * the element count so no load reaches past the buffer; the
             * first non-zero word is left to the byte loop below, which
             * keeps this independent of byte order.
             */
            char * const block_end = haystack +
                (size - size % (npy_intp)sizeof(unsigned int));
            while (p < block_end && *(const unsigned int *)p == 0) {
                p += sizeof(unsigned int);
            }
            count = p - haystack;
        }
#endif
        while (count < size && *p == needle) {
            ++count;
            p += stride;
        }
    }

    *subloopsize = count;
    return p;
}

#include "ucsnarrow.h"

#endif
24 changes: 4 additions & 20 deletions numpy/core/src/multiarray/dtype_transfer.c
Original file line number Diff line number Diff line change
Expand Up @@ -3059,22 +3059,14 @@ void _strided_masked_wrapper_decsrcref_transfer_function(

while (N > 0) {
/* Skip masked values, still calling decsrcref for move_references */
subloopsize = 0;
while (subloopsize < N && !*mask) {
++subloopsize;
mask += mask_stride;
}
mask = npy_memchr((char *)mask, 0, mask_stride, N, &subloopsize, 1);
decsrcref_stransfer(NULL, 0, src, src_stride,
subloopsize, src_itemsize, decsrcref_transferdata);
dst += subloopsize * dst_stride;
src += subloopsize * src_stride;
N -= subloopsize;
/* Process unmasked values */
subloopsize = 0;
while (subloopsize < N && *mask) {
++subloopsize;
mask += mask_stride;
}
mask = npy_memchr((char *)mask, 0, mask_stride, N, &subloopsize, 0);
unmasked_stransfer(dst, dst_stride, src, src_stride,
subloopsize, src_itemsize, unmasked_transferdata);
dst += subloopsize * dst_stride;
Expand Down Expand Up @@ -3102,20 +3094,12 @@ void _strided_masked_wrapper_transfer_function(

while (N > 0) {
/* Skip masked values */
subloopsize = 0;
while (subloopsize < N && !*mask) {
++subloopsize;
mask += mask_stride;
}
mask = npy_memchr((char *)mask, 0, mask_stride, N, &subloopsize, 1);
dst += subloopsize * dst_stride;
src += subloopsize * src_stride;
N -= subloopsize;
/* Process unmasked values */
subloopsize = 0;
while (subloopsize < N && *mask) {
++subloopsize;
mask += mask_stride;
}
mask = npy_memchr((char *)mask, 0, mask_stride, N, &subloopsize, 0);
unmasked_stransfer(dst, dst_stride, src, src_stride,
subloopsize, src_itemsize, unmasked_transferdata);
dst += subloopsize * dst_stride;
Expand Down
4 changes: 2 additions & 2 deletions numpy/core/src/multiarray/item_selection.c
Original file line number Diff line number Diff line change
Expand Up @@ -2414,7 +2414,7 @@ count_nonzero_bytes_128(npy_uint64 * w)
*/
if (NPY_UNLIKELY(((w1 | w2) & 0xFEFEFEFEFEFEFEFEULL) != 0)) {
/* reload from pointer to avoid an unnecessary stack spill with gcc */
char * c = w;
char * c = (char *)w;
npy_uintp i, count = 0;
for (i = 0; i < 16; i++) {
count += (c[i] != 0);
Expand Down Expand Up @@ -2469,7 +2469,7 @@ count_boolean_trues(int ndim, char *data, npy_intp *ashape, npy_intp *astrides)
if (npy_is_aligned(data, sizeof(npy_uint64))) {
npy_uintp stride = 2 * sizeof(npy_uint64);
for (; d < e - (shape[0] % stride); d += stride) {
count += count_nonzero_bytes_128(d);
count += count_nonzero_bytes_128((npy_uint64 *)d);
}
}
for (; d < e; ++d) {
Expand Down
2 changes: 1 addition & 1 deletion numpy/core/src/multiarray/lowlevel_strided_loops.c.src
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
* instructions (16 byte).
* So this flag can only be enabled if autovectorization is disabled.
*/
#if (defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64))
#ifdef NPY_CPU_HAVE_UNALIGNED_ACCESS
# define NPY_USE_UNALIGNED_ACCESS 0
#else
# define NPY_USE_UNALIGNED_ACCESS 0
Expand Down
28 changes: 8 additions & 20 deletions numpy/core/src/multiarray/mapping.c
Original file line number Diff line number Diff line change
Expand Up @@ -725,19 +725,13 @@ array_boolean_subscript(PyArrayObject *self,

while (innersize > 0) {
/* Skip masked values */
subloopsize = 0;
while (subloopsize < innersize && *bmask_data == 0) {
++subloopsize;
bmask_data += bmask_stride;
}
bmask_data = npy_memchr(bmask_data, 0, bmask_stride,
innersize, &subloopsize, 1);
innersize -= subloopsize;
self_data += subloopsize * self_stride;
/* Process unmasked values */
subloopsize = 0;
while (subloopsize < innersize && *bmask_data != 0) {
++subloopsize;
bmask_data += bmask_stride;
}
bmask_data = npy_memchr(bmask_data, 0, bmask_stride, innersize,
&subloopsize, 0);
stransfer(ret_data, itemsize, self_data, self_stride,
subloopsize, itemsize, transferdata);
innersize -= subloopsize;
Expand Down Expand Up @@ -884,19 +878,13 @@ array_ass_boolean_subscript(PyArrayObject *self,

while (innersize > 0) {
/* Skip masked values */
subloopsize = 0;
while (subloopsize < innersize && *bmask_data == 0) {
++subloopsize;
bmask_data += bmask_stride;
}
bmask_data = npy_memchr(bmask_data, 0, bmask_stride,
innersize, &subloopsize, 1);
innersize -= subloopsize;
self_data += subloopsize * self_stride;
/* Process unmasked values */
subloopsize = 0;
while (subloopsize < innersize && *bmask_data != 0) {
++subloopsize;
bmask_data += bmask_stride;
}
bmask_data = npy_memchr(bmask_data, 0, bmask_stride, innersize,
&subloopsize, 0);
stransfer(self_data, self_stride, v_data, v_stride,
subloopsize, src_itemsize, transferdata);
innersize -= subloopsize;
Expand Down
13 changes: 3 additions & 10 deletions numpy/core/src/umath/ufunc_type_resolution.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

#include "numpy/ufuncobject.h"
#include "ufunc_type_resolution.h"
#include "common.h"

static const char *
npy_casting_to_string(NPY_CASTING casting)
Expand Down Expand Up @@ -1343,11 +1344,7 @@ unmasked_ufunc_loop_as_masked(
/* Process the data as runs of unmasked values */
do {
/* Skip masked values */
subloopsize = 0;
while (subloopsize < loopsize && !*mask) {
++subloopsize;
mask += mask_stride;
}
mask = npy_memchr(mask, 0, mask_stride, loopsize, &subloopsize, 1);
for (iargs = 0; iargs < nargs; ++iargs) {
dataptrs[iargs] += subloopsize * strides[iargs];
}
Expand All @@ -1356,11 +1353,7 @@ unmasked_ufunc_loop_as_masked(
* Process unmasked values (assumes unmasked loop doesn't
* mess with the 'args' pointer values)
*/
subloopsize = 0;
while (subloopsize < loopsize && *mask) {
++subloopsize;
mask += mask_stride;
}
mask = npy_memchr(mask, 0, mask_stride, loopsize, &subloopsize, 0);
unmasked_innerloop(dataptrs, &subloopsize, strides,
unmasked_innerloopdata);
for (iargs = 0; iargs < nargs; ++iargs) {
Expand Down
45 changes: 45 additions & 0 deletions numpy/core/tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,51 @@ def test_copyto():
# 'dst' must be an array
assert_raises(TypeError, np.copyto, [1, 2, 3], [2, 3, 4])

def test_copyto_permut():
    """Check np.copyto(..., where=mask) against every boolean mask of length 9.

    Each mask is applied forwards, reversed, and with strides of 2 and -2.
    It is then applied again with its true entries stored as byte values
    other than 1. Scalar True/False masks are checked at the end.
    """
    # test all permutation of possible masks, 9 should be sufficient for
    # current 4 byte unrolled code
    power = 9
    d = np.ones(power)
    for i in range(2**power):
        r = np.zeros(power)
        # bit x of i selects element x, so i = 0 .. 2**power - 1 walks
        # through every possible mask exactly once
        l = [((i >> x) & 1) != 0 for x in range(power)]
        mask = np.array(l)
        np.copyto(r, d, where=mask)
        assert_array_equal(r == 1, l)
        assert_equal(r.sum(), sum(l))

        # reversed mask (negative stride)
        r = np.zeros(power)
        np.copyto(r, d, where=mask[::-1])
        assert_array_equal(r == 1, l[::-1])
        assert_equal(r.sum(), sum(l))

        # every second element
        r = np.zeros(power)
        np.copyto(r[::2], d[::2], where=mask[::2])
        assert_array_equal(r[::2] == 1, l[::2])
        assert_equal(r[::2].sum(), sum(l[::2]))

        # every second element, walking the mask backwards
        r = np.zeros(power)
        np.copyto(r[::2], d[::2], where=mask[::-2])
        assert_array_equal(r[::2] == 1, l[::-2])
        assert_equal(r[::2].sum(), sum(l[::-2]))

        # any non-zero byte in the mask has to count as True
        for c in [0xFF, 0x7F, 0x02, 0x10]:
            r = np.zeros(power)
            mask = np.array(l)
            # view of the mask's own buffer, so the write below changes
            # the bytes that copyto will read
            imask = mask.view(np.uint8)
            imask[imask != 0] = c
            np.copyto(r, d, where=mask)
            assert_array_equal(r == 1, l)
            assert_equal(r.sum(), sum(l))

    # scalar masks
    r = np.zeros(power)
    np.copyto(r, d, where=True)
    assert_equal(r.sum(), r.size)
    r = np.ones(power)
    d = np.zeros(power)
    np.copyto(r, d, where=False)
    assert_equal(r.sum(), r.size)

def test_copy_order():
a = np.arange(24).reshape(2, 1, 3, 4)
b = a.copy(order='F')
Expand Down

0 comments on commit 3b3fa76

Please sign in to comment.