Skip to content

Commit

Permalink
Merge pull request numpy#23351 from r-devulap/avx512fp16
Browse files Browse the repository at this point in the history
ENH: Use AVX512-FP16 SVML content for float16 umath functions
  • Loading branch information
seiko2plus authored Jan 22, 2024
2 parents 09eb0ce + bc27299 commit 1d0edc1
Show file tree
Hide file tree
Showing 6 changed files with 86 additions and 6 deletions.
9 changes: 8 additions & 1 deletion numpy/_core/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -1139,6 +1139,13 @@ src_umath = umath_gen_headers + [
# here. Note that this migration is desirable; we then get the performance
# benefits for all platforms rather than only for AVX512 on 64-bit Linux, and
# may be able to avoid the accuracy regressions in SVML.
#
# Combined list of baseline and dispatched CPU feature names.
CPU_FEATURES_NAMES = CPU_BASELINE_NAMES + CPU_DISPATCH_NAMES
# Suffixes used to build the SVML assembly file names
# ('svml_z0_<func>_<suffix>.s').
# NOTE(review): 's_la' is listed twice -- confirm whether the last entry was
# meant to be 's_ha' or is just a harmless duplicate.
svml_file_suffix = ['d_la', 's_la', 'd_ha', 's_la']
# Only add the 'h_la' sources when AVX512_SPR is among the feature names
# (presumably the float16 variants -- TODO confirm against the SVML sources).
if CPU_FEATURES_NAMES.contains('AVX512_SPR')
svml_file_suffix += ['h_la']
endif

svml_objects = []
if use_svml
foreach svml_func : [
Expand All @@ -1153,7 +1160,7 @@ if use_svml
'pow', 'sin', 'sinh', 'tan',
'tanh'
]
foreach svml_sfx : ['d_la', 's_la', 'd_ha', 's_la']
foreach svml_sfx : svml_file_suffix
svml_objects += [
'src/umath/svml/linux/avx512/svml_z0_'+svml_func+'_'+svml_sfx+'.s'
]
Expand Down
24 changes: 24 additions & 0 deletions numpy/_core/src/common/npy_svml.h
Original file line number Diff line number Diff line change
@@ -1,3 +1,27 @@
/*
 * External array-based routines declared for the AVX512_SPR target.
 *
 * Each one takes a const npy_half* input, an npy_half* output and an
 * npy_intp element count, and returns nothing. They are declared only when
 * NPY_SIMD is non-zero and both NPY_HAVE_AVX512_SPR and NPY_CAN_LINK_SVML
 * are defined. Presumably the definitions come from the SVML assembly
 * sources linked into the build -- TODO confirm.
 *
 * NOTE(review): atan2 is a two-argument function, yet __svml_atan2s32 is
 * declared with the same single-input signature as the unary routines;
 * verify this prototype against the actual SVML symbol before calling it.
 */
#if NPY_SIMD && defined(NPY_HAVE_AVX512_SPR) && defined(NPY_CAN_LINK_SVML)
extern void __svml_exps32(const npy_half*, npy_half*, npy_intp);
extern void __svml_exp2s32(const npy_half*, npy_half*, npy_intp);
extern void __svml_logs32(const npy_half*, npy_half*, npy_intp);
extern void __svml_log2s32(const npy_half*, npy_half*, npy_intp);
extern void __svml_log10s32(const npy_half*, npy_half*, npy_intp);
extern void __svml_expm1s32(const npy_half*, npy_half*, npy_intp);
extern void __svml_log1ps32(const npy_half*, npy_half*, npy_intp);
extern void __svml_cbrts32(const npy_half*, npy_half*, npy_intp);
extern void __svml_sins32(const npy_half*, npy_half*, npy_intp);
extern void __svml_coss32(const npy_half*, npy_half*, npy_intp);
extern void __svml_tans32(const npy_half*, npy_half*, npy_intp);
extern void __svml_asins32(const npy_half*, npy_half*, npy_intp);
extern void __svml_acoss32(const npy_half*, npy_half*, npy_intp);
extern void __svml_atans32(const npy_half*, npy_half*, npy_intp);
extern void __svml_atan2s32(const npy_half*, npy_half*, npy_intp);
extern void __svml_sinhs32(const npy_half*, npy_half*, npy_intp);
extern void __svml_coshs32(const npy_half*, npy_half*, npy_intp);
extern void __svml_tanhs32(const npy_half*, npy_half*, npy_intp);
extern void __svml_asinhs32(const npy_half*, npy_half*, npy_intp);
extern void __svml_acoshs32(const npy_half*, npy_half*, npy_intp);
extern void __svml_atanhs32(const npy_half*, npy_half*, npy_intp);
#endif

#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
extern __m512 __svml_expf16(__m512 x);
extern __m512 __svml_exp2f16(__m512 x);
Expand Down
14 changes: 11 additions & 3 deletions numpy/_core/src/umath/loops_umath_fp.dispatch.c.src
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*@targets
** $maxopt baseline avx512_skx
** $maxopt baseline avx512_skx avx512_spr
*/
#include "numpy/npy_math.h"
#include "simd/simd.h"
Expand Down Expand Up @@ -156,7 +156,8 @@ avx512_@func@_f16(const npy_half *src, npy_half *dst, npy_intp len)
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(HALF_@func@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
#if defined(NPY_HAVE_AVX512_SPR) || defined(NPY_HAVE_AVX512_SKX)
#if NPY_SIMD && defined(NPY_CAN_LINK_SVML)
const npy_half *src = (npy_half*)args[0];
npy_half *dst = (npy_half*)args[1];
const int lsize = sizeof(src[0]);
Expand All @@ -166,10 +167,17 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(HALF_@func@)
if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
(ssrc == 1) &&
(sdst == 1)) {
#if defined(NPY_HAVE_AVX512_SPR)
__svml_@intrin@s32(src, dst, len);
return;
#endif
#if defined(NPY_HAVE_AVX512_SKX)
avx512_@intrin@_f16(src, dst, len);
return;
}
#endif
}
#endif // NPY_SIMD && NPY_CAN_LINK_SVML
#endif // SPR or SKX
UNARY_LOOP {
const npy_float in1 = npy_half_to_float(*(npy_half *)ip1);
*((npy_half *)op1) = npy_float_to_half(npy_@intrin@f(in1));
Expand Down
2 changes: 1 addition & 1 deletion numpy/_core/src/umath/svml
2 changes: 1 addition & 1 deletion numpy/_core/tests/test_umath.py
Original file line number Diff line number Diff line change
Expand Up @@ -1667,7 +1667,7 @@ def test_tanh(self):
for dt in ['e', 'f', 'd']:
in_arr = np.array(in_, dtype=dt)
out_arr = np.array(out, dtype=dt)
assert_equal(np.tanh(in_arr), out_arr)
assert_array_max_ulp(np.tanh(in_arr), out_arr, 3)

def test_arcsinh(self):
in_ = [np.nan, -np.nan, np.inf, -np.inf]
Expand Down
41 changes: 41 additions & 0 deletions numpy/_core/tests/test_umath_accuracy.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@

# AVX-capable means AVX512F is reported, or both FMA3 and AVX2 are.
IS_AVX = (
    __cpu_features__.get('AVX512F', False)
    or (__cpu_features__.get('FMA3', False)
        and __cpu_features__.get('AVX2', False))
)

# Whether the CPU feature table reports AVX512FP16.
IS_AVX512FP16 = __cpu_features__.get('AVX512FP16', False)

# only run on linux with AVX, also avoid old glibc (numpy/numpy#20448).
runtest = (sys.platform.startswith('linux')
and IS_AVX and not _glibc_older_than("2.17"))
Expand Down Expand Up @@ -68,6 +71,8 @@ def test_validate_transcendentals(self):
maxulperr = data_subset['ulperr'].max()
assert_array_max_ulp(npfunc(inval), outval, maxulperr)

@pytest.mark.skipif(IS_AVX512FP16,
reason = "SVML FP16 have slightly higher ULP errors")
@pytest.mark.parametrize("ufunc", UNARY_OBJECT_UFUNCS)
def test_validate_fp16_transcendentals(self, ufunc):
with np.errstate(all='ignore'):
Expand All @@ -76,3 +81,39 @@ def test_validate_fp16_transcendentals(self, ufunc):
datafp32 = datafp16.astype(np.float32)
assert_array_max_ulp(ufunc(datafp16), ufunc(datafp32),
maxulp=1, dtype=np.float16)

@pytest.mark.skipif(not IS_AVX512FP16,
                    reason="lower ULP only apply for SVML FP16")
def test_validate_svml_fp16(self):
    """Compare float16 ufunc results with float32 results, per function.

    Every 16-bit pattern is reinterpreted as a float16 input. For each
    listed ufunc, the float16 result must agree with the result computed
    on the same values in float32, to within the function's tabulated
    maximum ULP error rounded up to a whole number.
    """
    max_ulp_err = {
        "arccos": 2.54,
        "arccosh": 2.09,
        "arcsin": 3.06,
        "arcsinh": 1.51,
        "arctan": 2.61,
        "arctanh": 1.88,
        "cbrt": 1.57,
        "cos": 1.43,
        "cosh": 1.33,
        "exp2": 1.33,
        "exp": 1.27,
        "expm1": 0.53,
        "log": 1.80,
        "log10": 1.27,
        "log1p": 1.88,
        "log2": 1.80,
        "sin": 1.88,
        "sinh": 2.05,
        "tan": 2.26,
        "tanh": 3.00,
    }

    with np.errstate(all='ignore'):
        # uint16 represents 0..65535 directly; int16 tops out at 32767, so
        # requesting 65536 int16 values depended on integer wraparound to
        # produce the same bit patterns.
        bits = np.arange(65536, dtype=np.uint16)
        datafp16 = np.frombuffer(bits.tobytes(), dtype=np.float16)
        datafp32 = datafp16.astype(np.float32)
        for func, max_err in max_ulp_err.items():
            ufunc = getattr(np, func)
            # Tolerances are compared as whole ULPs, so round up.
            ulp = np.ceil(max_err)
            assert_array_max_ulp(ufunc(datafp16), ufunc(datafp32),
                                 maxulp=ulp, dtype=np.float16)

0 comments on commit 1d0edc1

Please sign in to comment.