Merge pull request numpy#23351 from r-devulap/avx512fp16

ENH: Use AVX512-FP16 SVML content for float16 umath functions
hdlee · Jan 22, 2024 · 1d0edc1 · 1d0edc1
2 parents 09eb0ce + bc27299
commit 1d0edc1
Show file tree

Hide file tree

Showing 6 changed files with 86 additions and 6 deletions.
diff --git a/numpy/_core/meson.build b/numpy/_core/meson.build
@@ -1139,6 +1139,13 @@ src_umath = umath_gen_headers + [
 # here. Note that this migration is desirable; we then get the performance
 # benefits for all platforms rather than only for AVX512 on 64-bit Linux, and
 # may be able to avoid the accuracy regressions in SVML.
+# SVML source-file suffixes; 'h_la' is appended only when AVX512_SPR is a baseline or dispatch feature.
+CPU_FEATURES_NAMES = CPU_BASELINE_NAMES + CPU_DISPATCH_NAMES
+svml_file_suffix = ['d_la', 's_la', 'd_ha', 's_la']  # NOTE(review): 's_la' is listed twice - confirm whether 's_ha' was intended
+if CPU_FEATURES_NAMES.contains('AVX512_SPR')
+  svml_file_suffix += ['h_la']
+endif
+
 svml_objects = []
 if use_svml
   foreach svml_func : [
@@ -1153,7 +1160,7 @@ if use_svml
     'pow', 'sin', 'sinh', 'tan',
     'tanh'
   ]
-    foreach svml_sfx : ['d_la', 's_la', 'd_ha', 's_la']
+    foreach svml_sfx : svml_file_suffix
       svml_objects += [
         'src/umath/svml/linux/avx512/svml_z0_'+svml_func+'_'+svml_sfx+'.s'
       ]

diff --git a/numpy/_core/src/common/npy_svml.h b/numpy/_core/src/common/npy_svml.h
@@ -1,3 +1,27 @@
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SPR) && defined(NPY_CAN_LINK_SVML)
+extern void __svml_exps32(const npy_half*, npy_half*, npy_intp); // args presumably (src, dst, len) - confirm against callers
+extern void __svml_exp2s32(const npy_half*, npy_half*, npy_intp);
+extern void __svml_logs32(const npy_half*, npy_half*, npy_intp);
+extern void __svml_log2s32(const npy_half*, npy_half*, npy_intp);
+extern void __svml_log10s32(const npy_half*, npy_half*, npy_intp);
+extern void __svml_expm1s32(const npy_half*, npy_half*, npy_intp);
+extern void __svml_log1ps32(const npy_half*, npy_half*, npy_intp);
+extern void __svml_cbrts32(const npy_half*, npy_half*, npy_intp);
+extern void __svml_sins32(const npy_half*, npy_half*, npy_intp);
+extern void __svml_coss32(const npy_half*, npy_half*, npy_intp);
+extern void __svml_tans32(const npy_half*, npy_half*, npy_intp);
+extern void __svml_asins32(const npy_half*, npy_half*, npy_intp);
+extern void __svml_acoss32(const npy_half*, npy_half*, npy_intp);
+extern void __svml_atans32(const npy_half*, npy_half*, npy_intp);
+extern void __svml_atan2s32(const npy_half*, npy_half*, npy_intp); // NOTE(review): atan2 takes two inputs but shares the single-input signature - confirm
+extern void __svml_sinhs32(const npy_half*, npy_half*, npy_intp);
+extern void __svml_coshs32(const npy_half*, npy_half*, npy_intp);
+extern void __svml_tanhs32(const npy_half*, npy_half*, npy_intp);
+extern void __svml_asinhs32(const npy_half*, npy_half*, npy_intp);
+extern void __svml_acoshs32(const npy_half*, npy_half*, npy_intp);
+extern void __svml_atanhs32(const npy_half*, npy_half*, npy_intp);
+#endif // NPY_SIMD && NPY_HAVE_AVX512_SPR && NPY_CAN_LINK_SVML
+
 #if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
 extern __m512 __svml_expf16(__m512 x);
 extern __m512 __svml_exp2f16(__m512 x);

diff --git a/numpy/_core/src/umath/loops_umath_fp.dispatch.c.src b/numpy/_core/src/umath/loops_umath_fp.dispatch.c.src
@@ -1,5 +1,5 @@
 /*@targets
- ** $maxopt baseline avx512_skx
+ ** $maxopt baseline avx512_skx avx512_spr
  */
 #include "numpy/npy_math.h"
 #include "simd/simd.h"
@@ -156,7 +156,8 @@ avx512_@func@_f16(const npy_half *src, npy_half *dst, npy_intp len)
 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(HALF_@func@)
 (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
 {
-#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+#if defined(NPY_HAVE_AVX512_SPR) || defined(NPY_HAVE_AVX512_SKX)
+#if NPY_SIMD && defined(NPY_CAN_LINK_SVML)
     const npy_half *src = (npy_half*)args[0];
           npy_half *dst = (npy_half*)args[1];
     const int lsize = sizeof(src[0]);
@@ -166,10 +167,17 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(HALF_@func@)
     if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
         (ssrc == 1) &&
         (sdst == 1)) {
+#if defined(NPY_HAVE_AVX512_SPR)
+        __svml_@intrin@s32(src, dst, len);
+        return;
+#endif
+#if defined(NPY_HAVE_AVX512_SKX)
         avx512_@intrin@_f16(src, dst, len);
         return;
-    }
 #endif
+    }
+#endif // NPY_SIMD && NPY_CAN_LINK_SVML
+#endif // SPR or SKX
     UNARY_LOOP {
         const npy_float in1 = npy_half_to_float(*(npy_half *)ip1);
         *((npy_half *)op1) = npy_float_to_half(npy_@intrin@f(in1));

diff --git a/numpy/_core/src/umath/svml b/numpy/_core/src/umath/svml
diff --git a/numpy/_core/tests/test_umath.py b/numpy/_core/tests/test_umath.py
@@ -1667,7 +1667,7 @@ def test_tanh(self):
         for dt in ['e', 'f', 'd']:
             in_arr = np.array(in_, dtype=dt)
             out_arr = np.array(out, dtype=dt)
-            assert_equal(np.tanh(in_arr), out_arr)
+            assert_array_max_ulp(np.tanh(in_arr), out_arr, 3)
 
     def test_arcsinh(self):
         in_ = [np.nan, -np.nan, np.inf, -np.inf]

diff --git a/numpy/_core/tests/test_umath_accuracy.py b/numpy/_core/tests/test_umath_accuracy.py
@@ -18,6 +18,9 @@
 
 IS_AVX = __cpu_features__.get('AVX512F', False) or \
         (__cpu_features__.get('FMA3', False) and __cpu_features__.get('AVX2', False))
+
+IS_AVX512FP16 = __cpu_features__.get('AVX512FP16', False)
+
 # only run on linux with AVX, also avoid old glibc (numpy/numpy#20448).
 runtest = (sys.platform.startswith('linux')
            and IS_AVX and not _glibc_older_than("2.17"))
@@ -68,6 +71,8 @@ def test_validate_transcendentals(self):
                         maxulperr = data_subset['ulperr'].max()
                         assert_array_max_ulp(npfunc(inval), outval, maxulperr)
 
+    @pytest.mark.skipif(IS_AVX512FP16,
+            reason = "SVML FP16 have slightly higher ULP errors")
     @pytest.mark.parametrize("ufunc", UNARY_OBJECT_UFUNCS)
     def test_validate_fp16_transcendentals(self, ufunc):
         with np.errstate(all='ignore'):
@@ -76,3 +81,39 @@ def test_validate_fp16_transcendentals(self, ufunc):
             datafp32 = datafp16.astype(np.float32)
             assert_array_max_ulp(ufunc(datafp16), ufunc(datafp32),
                     maxulp=1, dtype=np.float16)
+
+    @pytest.mark.skipif(not IS_AVX512FP16,  # runs only when IS_AVX512FP16 is truthy
+                               reason="lower ULP only apply for SVML FP16")
+    def test_validate_svml_fp16(self):
+        max_ulp_err = {  # per-ufunc ULP bound; rounded up with np.ceil before use below
+                "arccos": 2.54,
+                "arccosh": 2.09,
+                "arcsin": 3.06,
+                "arcsinh": 1.51,
+                "arctan": 2.61,
+                "arctanh": 1.88,
+                "cbrt": 1.57,
+                "cos": 1.43,
+                "cosh": 1.33,
+                "exp2": 1.33,
+                "exp": 1.27,
+                "expm1": 0.53,
+                "log": 1.80,
+                "log10": 1.27,
+                "log1p": 1.88,
+                "log2": 1.80,
+                "sin": 1.88,
+                "sinh": 2.05,
+                "tan": 2.26,
+                "tanh": 3.00,
+                }
+
+        with np.errstate(all='ignore'):  # silence floating-point warnings raised inside this block
+            arr = np.arange(65536, dtype=np.int16)  # presumably wraps so every 16-bit pattern occurs once - confirm
+            datafp16 = np.frombuffer(arr.tobytes(), dtype=np.float16)  # reinterpret the raw bytes as float16
+            datafp32 = datafp16.astype(np.float32)  # same values as float32, fed to the reference computation
+            for func in max_ulp_err:
+                ufunc = getattr(np, func)  # look the ufunc up on np by its dict key
+                ulp = np.ceil(max_ulp_err[func])  # round the fractional bound up to a whole number of ULPs
+                assert_array_max_ulp(ufunc(datafp16), ufunc(datafp32),  # float16 result vs. float32 result
+                        maxulp=ulp, dtype=np.float16)