forked from numpy/numpy
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
MAINT: Extract the loop macros into their own header
This makes it possible to use them in other source files (if we go down a source-per-ufunc approach later), and if nothing else makes a very long file a little less long.
- Loading branch information
1 parent
4df5bfd
commit fe8072a
Showing
2 changed files
with
206 additions
and
193 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,204 @@ | ||
/** | ||
* Macros to help build fast ufunc inner loops. | ||
* | ||
* These expect to have access to the arguments of a typical ufunc loop, | ||
* | ||
* char **args | ||
* npy_intp *dimensions | ||
* npy_intp *steps | ||
*/ | ||
#ifndef _NPY_UMATH_FAST_LOOP_MACROS_H_ | ||
#define _NPY_UMATH_FAST_LOOP_MACROS_H_ | ||
|
||
#include "simd.inc" | ||
|
||
/*
 * Unary loop, input and output both contiguous.
 *
 * True when the input step (steps[0]) equals sizeof(tin) and the output
 * step (steps[1]) equals sizeof(tout).  Reads `steps` from the enclosing
 * scope.
 */
#define IS_UNARY_CONT(tin, tout) (steps[0] == sizeof(tin) && \
                                  steps[1] == sizeof(tout))
|
||
/*
 * True when a binary loop is being used as a reduction: the output pointer
 * (args[2]) is the same as the first input pointer (args[0]), and both of
 * their steps are zero.  Reads `args` and `steps` from the enclosing scope.
 */
#define IS_BINARY_REDUCE ((args[0] == args[2])\
        && (steps[0] == steps[2])\
        && (steps[0] == 0))
|
||
/*
 * Binary loop, both inputs and the output contiguous.
 *
 * True when steps[0] and steps[1] both equal sizeof(tin) and steps[2]
 * equals sizeof(tout).  Reads `steps` from the enclosing scope.
 */
#define IS_BINARY_CONT(tin, tout) (steps[0] == sizeof(tin) && \
                                   steps[1] == sizeof(tin) && \
                                   steps[2] == sizeof(tout))
/*
 * Binary loop, contiguous second input and output, first input a scalar.
 *
 * True when steps[0] is zero (the first operand does not advance),
 * steps[1] equals sizeof(tin) and steps[2] equals sizeof(tout).
 * Reads `steps` from the enclosing scope.
 */
#define IS_BINARY_CONT_S1(tin, tout) (steps[0] == 0 && \
                                   steps[1] == sizeof(tin) && \
                                   steps[2] == sizeof(tout))
/*
 * Binary loop, contiguous first input and output, second input a scalar.
 *
 * True when steps[0] equals sizeof(tin), steps[1] is zero (the second
 * operand does not advance) and steps[2] equals sizeof(tout).
 * Reads `steps` from the enclosing scope.
 */
#define IS_BINARY_CONT_S2(tin, tout) (steps[0] == sizeof(tin) && \
                                   steps[1] == 0 && \
                                   steps[2] == sizeof(tout))
|
||
/*
 * Loop header that walks only the operand in args[1].
 *
 * Declares, in the current scope:
 *   op1  - byte pointer starting at args[1], advanced by os1 per iteration
 *   os1  - steps[1]
 *   n    - dimensions[0], the iteration count
 *   i    - loop counter
 * The macro ends with the `for` header, so it must be followed directly by
 * the loop body.  Because it declares variables it can be expanded only
 * once per scope.
 */
#define OUTPUT_LOOP\
    char *op1 = args[1];\
    npy_intp os1 = steps[1];\
    npy_intp n = dimensions[0];\
    npy_intp i;\
    for(i = 0; i < n; i++, op1 += os1)
|
||
/*
 * Loop header for one input (args[0]) and one output (args[1]).
 *
 * Declares, in the current scope:
 *   ip1, op1 - byte pointers starting at args[0] / args[1]
 *   is1, os1 - their byte steps, steps[0] / steps[1]
 *   n        - dimensions[0], the iteration count
 *   i        - loop counter
 * Both pointers are advanced by their step after every iteration.  The
 * macro ends with the `for` header and must be followed by the loop body.
 */
#define UNARY_LOOP\
    char *ip1 = args[0], *op1 = args[1];\
    npy_intp is1 = steps[0], os1 = steps[1];\
    npy_intp n = dimensions[0];\
    npy_intp i;\
    for(i = 0; i < n; i++, ip1 += is1, op1 += os1)
|
||
/*
 * Unary loop body used by the contiguous specialization below.
 *
 * For every element this loads the input into `const tin in`, points
 * `tout * out` at the output element, and then runs `op`; `op` should be
 * the code working on `in` and storing the result through `out`.
 * Combine with NPY_GCC_OPT_3 to allow autovectorization.
 * Should only be used where it's worthwhile, to avoid code bloat.
 */
#define BASE_UNARY_LOOP(tin, tout, op) \
    UNARY_LOOP { \
        const tin in = *(tin *)ip1; \
        tout * out = (tout *)op1; \
        op; \
    }
/*
 * Unary loop with a contiguous specialization.
 *
 * Every branch expands the same BASE_UNARY_LOOP(tin, tout, op).  The
 * branches differ only in the condition that guards them (contiguous and
 * in place, contiguous and not in place, everything else), so the compiler
 * sees each copy of the loop under a known condition.  Do not collapse the
 * identical-looking branches.
 * Wrapped in do { } while (0) so it can be used as a single statement.
 */
#define UNARY_LOOP_FAST(tin, tout, op) \
    do { \
        /* condition allows compiler to optimize the generic macro */ \
        if (IS_UNARY_CONT(tin, tout)) { \
            if (args[0] == args[1]) { \
                /* contiguous, input and output are the same buffer */ \
                BASE_UNARY_LOOP(tin, tout, op) \
            } \
            else { \
                /* contiguous, distinct start pointers */ \
                BASE_UNARY_LOOP(tin, tout, op) \
            } \
        } \
        else { \
            /* arbitrary steps */ \
            BASE_UNARY_LOOP(tin, tout, op) \
        } \
    } \
    while (0)
|
||
/*
 * Loop header for one input (args[0]) and two outputs (args[1], args[2]).
 *
 * Declares, in the current scope:
 *   ip1, op1, op2 - byte pointers starting at args[0], args[1], args[2]
 *   is1, os1, os2 - their byte steps, steps[0], steps[1], steps[2]
 *   n             - dimensions[0], the iteration count
 *   i             - loop counter
 * All three pointers are advanced by their step after every iteration.
 * Must be followed by the loop body.
 */
#define UNARY_LOOP_TWO_OUT\
    char *ip1 = args[0], *op1 = args[1], *op2 = args[2];\
    npy_intp is1 = steps[0], os1 = steps[1], os2 = steps[2];\
    npy_intp n = dimensions[0];\
    npy_intp i;\
    for(i = 0; i < n; i++, ip1 += is1, op1 += os1, op2 += os2)
|
||
/*
 * Loop header for two inputs (args[0], args[1]) and one output (args[2]).
 *
 * Declares, in the current scope:
 *   ip1, ip2, op1 - byte pointers starting at args[0], args[1], args[2]
 *   is1, is2, os1 - their byte steps, steps[0], steps[1], steps[2]
 *   n             - dimensions[0], the iteration count
 *   i             - loop counter
 * All three pointers are advanced by their step after every iteration.
 * Must be followed by the loop body.
 */
#define BINARY_LOOP\
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];\
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];\
    npy_intp n = dimensions[0];\
    npy_intp i;\
    for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1)
|
||
/*
 * Binary loop body used by the contiguous specialization below.
 *
 * For every element this loads the two inputs into `const tin in1` and
 * `const tin in2`, points `tout * out` at the output element, and then runs
 * `op`; `op` should be the code working on `in1`, `in2` and storing the
 * result through `out`.
 * Combine with NPY_GCC_OPT_3 to allow autovectorization.
 * Should only be used where it's worthwhile, to avoid code bloat.
 */
#define BASE_BINARY_LOOP(tin, tout, op) \
    BINARY_LOOP { \
        const tin in1 = *(tin *)ip1; \
        const tin in2 = *(tin *)ip2; \
        tout * out = (tout *)op1; \
        op; \
    }
/*
 * unfortunately gcc 6/7 regressed and we need to give it additional hints to
 * vectorize inplace operations (PR80198)
 * must only be used after op1 == ip1 or ip2 has been checked
 * TODO: using ivdep might allow other compilers to vectorize too
 *
 * IVDEP_LOOP expands to the `GCC ivdep` pragma when __GNUC__ is at least 6
 * and to nothing otherwise.  It has to be placed immediately before the
 * loop statement it applies to.  The defined() test keeps the check free
 * of undefined-macro warnings on compilers that do not define __GNUC__.
 */
#if defined(__GNUC__) && (__GNUC__ >= 6)
#define IVDEP_LOOP _Pragma("GCC ivdep")
#else
#define IVDEP_LOOP
#endif
/*
 * In-place variant of BASE_BINARY_LOOP.
 *
 * Declares the same variables and runs the same loop as BINARY_LOOP, with
 * the same `in1`, `in2`, `out` locals for `op`.  The loop header is written
 * out here rather than reusing BINARY_LOOP so that IVDEP_LOOP can sit
 * directly in front of the `for` statement.
 */
#define BASE_BINARY_LOOP_INP(tin, tout, op) \
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];\
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];\
    npy_intp n = dimensions[0];\
    npy_intp i;\
    IVDEP_LOOP \
    for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1) { \
        const tin in1 = *(tin *)ip1; \
        const tin in2 = *(tin *)ip2; \
        tout * out = (tout *)op1; \
        op; \
    }
/*
 * Binary loop where one operand is a scalar that is read once up front.
 *
 *   cin  - name to give the constant (scalar) operand, e.g. in1
 *   cinp - pointer expression the constant is loaded from, before the loop
 *   vin  - name to give the varying operand, e.g. in2
 *   vinp - pointer expression the varying operand is loaded from on every
 *          iteration (typically ip1 or ip2 from BINARY_LOOP)
 *
 * `op` sees `cin`, `vin` and `tout * out` (pointing at op1).
 * The pointer arguments are parenthesized so that an expression such as
 * `p + k` is cast as a whole rather than being parsed as `*(tin *)p + k`.
 */
#define BASE_BINARY_LOOP_S(tin, tout, cin, cinp, vin, vinp, op) \
    const tin cin = *(tin *)(cinp); \
    BINARY_LOOP { \
        const tin vin = *(tin *)(vinp); \
        tout * out = (tout *)op1; \
        op; \
    }
/*
 * PR80198 again, scalar works without the pragma
 *
 * In-place variant of BASE_BINARY_LOOP_S: the result is written back through
 * `vinp`, i.e. `out` points at the varying input element instead of op1.
 * Parameters are as for BASE_BINARY_LOOP_S.  The pointer arguments are
 * parenthesized so that an expression such as `p + k` is cast as a whole.
 */
#define BASE_BINARY_LOOP_S_INP(tin, tout, cin, cinp, vin, vinp, op) \
    const tin cin = *(tin *)(cinp); \
    BINARY_LOOP { \
        const tin vin = *(tin *)(vinp); \
        tout * out = (tout *)(vinp); \
        op; \
    }
/*
 * Binary loop with specializations for contiguous and scalar operands.
 *
 * Dispatch, in order:
 *   1. IS_BINARY_CONT: both inputs and the output contiguous.
 *        - output starts at the first input and is at least
 *          NPY_MAX_SIMD_SIZE away from the second: in-place loop
 *        - output starts at the second input and is at least
 *          NPY_MAX_SIMD_SIZE away from the first: in-place loop
 *        - otherwise: plain loop
 *   2. IS_BINARY_CONT_S1: first input is a scalar (in1 is read once from
 *      args[0]); in-place form when the output starts at the second input.
 *   3. IS_BINARY_CONT_S2: second input is a scalar (in2 is read once from
 *      args[1]); in-place form when the output starts at the first input.
 *   4. Anything else: plain loop.
 *
 * `op` sees `in1`, `in2` and `tout * out` in every branch.  Several
 * branches expand to the same loop on purpose; they differ only in the
 * condition guarding them.
 * Wrapped in do { } while (0) so it can be used as a single statement.
 */
#define BINARY_LOOP_FAST(tin, tout, op) \
    do { \
        /* condition allows compiler to optimize the generic macro */ \
        if (IS_BINARY_CONT(tin, tout)) { \
            if (abs_ptrdiff(args[2], args[0]) == 0 && \
                    abs_ptrdiff(args[2], args[1]) >= NPY_MAX_SIMD_SIZE) { \
                BASE_BINARY_LOOP_INP(tin, tout, op) \
            } \
            else if (abs_ptrdiff(args[2], args[1]) == 0 && \
                         abs_ptrdiff(args[2], args[0]) >= NPY_MAX_SIMD_SIZE) { \
                BASE_BINARY_LOOP_INP(tin, tout, op) \
            } \
            else { \
                BASE_BINARY_LOOP(tin, tout, op) \
            } \
        } \
        else if (IS_BINARY_CONT_S1(tin, tout)) { \
            if (abs_ptrdiff(args[2], args[1]) == 0) { \
                BASE_BINARY_LOOP_S_INP(tin, tout, in1, args[0], in2, ip2, op) \
            } \
            else { \
                BASE_BINARY_LOOP_S(tin, tout, in1, args[0], in2, ip2, op) \
            } \
        } \
        else if (IS_BINARY_CONT_S2(tin, tout)) { \
            if (abs_ptrdiff(args[2], args[0]) == 0) { \
                BASE_BINARY_LOOP_S_INP(tin, tout, in2, args[1], in1, ip1, op) \
            } \
            else { \
                BASE_BINARY_LOOP_S(tin, tout, in2, args[1], in1, ip1, op) \
            } \
        } \
        else { \
            BASE_BINARY_LOOP(tin, tout, op) \
        } \
    } \
    while (0)
|
||
/*
 * Loop header that walks only the second operand (args[1]).
 *
 * Declares, in the current scope:
 *   ip2 - byte pointer starting at args[1], advanced by is2 per iteration
 *   is2 - steps[1]
 *   n   - dimensions[0], the iteration count
 *   i   - loop counter
 * Must be followed by the loop body.
 */
#define BINARY_REDUCE_LOOP_INNER\
    char *ip2 = args[1]; \
    npy_intp is2 = steps[1]; \
    npy_intp n = dimensions[0]; \
    npy_intp i; \
    for(i = 0; i < n; i++, ip2 += is2)
|
||
/*
 * Loop header for a reduction with an accumulator of type TYPE.
 *
 * Declares `iop1` (byte pointer to args[0]) and `io1`, a local copy of the
 * TYPE value stored there, then opens BINARY_REDUCE_LOOP_INNER over the
 * second operand.  The macro never writes `io1` back; storing the result
 * through `iop1` after the loop is left to the caller.
 */
#define BINARY_REDUCE_LOOP(TYPE)\
    char *iop1 = args[0]; \
    TYPE io1 = *(TYPE *)iop1; \
    BINARY_REDUCE_LOOP_INNER
|
||
/*
 * Loop header for two inputs (args[0], args[1]) and two outputs
 * (args[2], args[3]).
 *
 * Declares, in the current scope:
 *   ip1, ip2, op1, op2 - byte pointers starting at args[0] .. args[3]
 *   is1, is2, os1, os2 - their byte steps, steps[0] .. steps[3]
 *   n                  - dimensions[0], the iteration count
 *   i                  - loop counter
 * All four pointers are advanced by their step after every iteration.
 * Must be followed by the loop body.
 */
#define BINARY_LOOP_TWO_OUT\
    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2], *op2 = args[3];\
    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2], os2 = steps[3];\
    npy_intp n = dimensions[0];\
    npy_intp i;\
    for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1, op2 += os2)
|
||
|
||
#endif /* _NPY_UMATH_FAST_LOOP_MACROS_H_ */ |
Oops, something went wrong.