Skip to content

Commit

Permalink
x86emul: AVX512-FP16 testing
Browse files Browse the repository at this point in the history
Naming of some of the builtins isn't fully consistent with that of pre-
existing ones, so there's a need for a new BR2() wrapper macro.

With the tests providing some proof of proper functioning of the
emulator code, also enable use of the feature by guests, as there's no
other infrastructure involved in enabling this ISA extension.

Signed-off-by: Jan Beulich <[email protected]>
Acked-by: Andrew Cooper <[email protected]>
Acked-by: Henry Wang <[email protected]> # CHANGELOG
  • Loading branch information
jbeulich committed Jun 5, 2023
1 parent b3880c3 commit e291c4c
Show file tree
Hide file tree
Showing 7 changed files with 292 additions and 4 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
- Bus-lock detection, used by Xen to mitigate (by rate-limiting) the system
wide impact of a guest misusing atomic instructions.
- xl/libxl can customize SMBIOS strings for HVM guests.
- Add support for AVX512-FP16 on x86.

## [4.17.0](https://xenbits.xen.org/gitweb/?p=xen.git;a=shortlog;h=RELEASE-4.17.0) - 2022-12-12

Expand Down
7 changes: 5 additions & 2 deletions tools/tests/x86_emulator/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ vpath %.c $(XEN_ROOT)/xen/lib/x86

CFLAGS += $(CFLAGS_xeninclude)

SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f avx512bw avx512dq avx512er avx512vbmi
SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f avx512bw avx512dq avx512er avx512vbmi avx512fp16
FMA := fma4 fma
SG := avx2-sg avx512f-sg avx512vl-sg
AES := ssse3-aes avx-aes avx2-vaes avx512bw-vaes
Expand Down Expand Up @@ -91,6 +91,9 @@ avx512vbmi-vecs := $(avx512bw-vecs)
avx512vbmi-ints := $(avx512bw-ints)
avx512vbmi-flts := $(avx512bw-flts)
avx512vbmi2-vecs := $(avx512bw-vecs)
avx512fp16-vecs := $(avx512bw-vecs)
avx512fp16-ints :=
avx512fp16-flts := 2

avx512f-opmask-vecs := 2
avx512dq-opmask-vecs := 1 2
Expand Down Expand Up @@ -248,7 +251,7 @@ $(addsuffix .c,$(GF)):

$(addsuffix .h,$(SIMD) $(FMA) $(SG) $(AES) $(CLMUL) $(SHA) $(GF)): simd.h

xop.h avx512f.h: simd-fma.c
xop.h avx512f.h avx512fp16.h: simd-fma.c

endif # 32-bit override

Expand Down
165 changes: 165 additions & 0 deletions tools/tests/x86_emulator/simd-fma.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ ENTRY(fma_test);
# define fmaddsub(x, y, z) BR(vfmaddsubps, _mask, x, y, z, ~0)
# elif FLOAT_SIZE == 8
# define fmaddsub(x, y, z) BR(vfmaddsubpd, _mask, x, y, z, ~0)
# elif FLOAT_SIZE == 2
# define fmaddsub(x, y, z) BR(vfmaddsubph, _mask, x, y, z, ~0)
# endif
#elif VEC_SIZE == 16
# if FLOAT_SIZE == 4
Expand Down Expand Up @@ -70,6 +72,75 @@ ENTRY(fma_test);
# endif
#endif

#ifdef __AVX512FP16__
# define I (1.if16)
# if VEC_SIZE > FLOAT_SIZE
# define CELEM_COUNT (ELEM_COUNT / 2)
static const unsigned int conj_mask = 0x80000000;
# define conj(z) ({ \
vec_t r_; \
asm ( "vpxord %2%{1to%c3%}, %1, %0" \
: "=v" (r_) \
: "v" (z), "m" (conj_mask), "i" (CELEM_COUNT) ); \
r_; \
})
# define _cmul_vv(a, b, c) BR2(vf##c##mulcph, , a, b)
# define _cmul_vs(a, b, c) ({ \
vec_t r_; \
_Complex _Float16 b_ = (b); \
asm ( "vf"#c"mulcph %2%{1to%c3%}, %1, %0" \
: "=v" (r_) \
: "v" (a), "m" (b_), "i" (CELEM_COUNT) ); \
r_; \
})
# define cmadd_vv(a, b, c) BR2(vfmaddcph, , a, b, c)
# define cmadd_vs(a, b, c) ({ \
_Complex _Float16 b_ = (b); \
vec_t r_; \
asm ( "vfmaddcph %2%{1to%c3%}, %1, %0" \
: "=v" (r_) \
: "v" (a), "m" (b_), "i" (CELEM_COUNT), "0" (c) ); \
r_; \
})
# else
# define CELEM_COUNT 1
typedef _Float16 __attribute__((vector_size(4))) cvec_t;
# define conj(z) ({ \
cvec_t r_; \
asm ( "xor $0x80000000, %0" : "=rm" (r_) : "0" (z) ); \
r_; \
})
# define _cmul_vv(a, b, c) ({ \
cvec_t r_; \
/* "=&x" to force destination to be different from both sources */ \
asm ( "vf"#c"mulcsh %2, %1, %0" : "=&x" (r_) : "x" (a), "m" (b) ); \
r_; \
})
# define _cmul_vs(a, b, c) ({ \
_Complex _Float16 b_ = (b); \
cvec_t r_; \
/* "=&x" to force destination to be different from both sources */ \
asm ( "vf"#c"mulcsh %2, %1, %0" : "=&x" (r_) : "x" (a), "m" (b_) ); \
r_; \
})
# define cmadd_vv(a, b, c) ({ \
cvec_t r_ = (c); \
asm ( "vfmaddcsh %2, %1, %0" : "+x" (r_) : "x" (a), "m" (b) ); \
r_; \
})
# define cmadd_vs(a, b, c) ({ \
_Complex _Float16 b_ = (b); \
cvec_t r_ = (c); \
asm ( "vfmaddcsh %2, %1, %0" : "+x" (r_) : "x" (a), "m" (b_) ); \
r_; \
})
# endif
# define cmul_vv(a, b) _cmul_vv(a, b, )
# define cmulc_vv(a, b) _cmul_vv(a, b, c)
# define cmul_vs(a, b) _cmul_vs(a, b, )
# define cmulc_vs(a, b) _cmul_vs(a, b, c)
#endif

int fma_test(void)
{
unsigned int i;
Expand Down Expand Up @@ -156,5 +227,99 @@ int fma_test(void)
touch(inv);
#endif

#ifdef CELEM_COUNT

/*
 * Complex arithmetic checks, built only when the complex helper macros
 * (which define CELEM_COUNT) are available. Each check verifies an algebraic
 * identity and returns the source line number of the failing check.
 *
 * NOTE(review): touch() is defined outside of this view; it is presumably
 * there to keep the compiler from folding or re-using values across the
 * operations under test - confirm against its definition.
 */
# if VEC_SIZE > FLOAT_SIZE
/* Packed case: complex vectors are ordinary vec_t, compared using eq(). */
# define cvec_t vec_t
# define ceq eq
# else
{
/* Cannot re-use the function-scope variables (for being too small). */
cvec_t x, y, z, src = { 1, 2 }, inv = { 2, 1 }, one = { 1, 1 };
/*
 * Scalar case: compare with vcmpph using predicate 0 and require the low two
 * mask bits (real and imaginary part) to both be set.
 */
# define ceq(x, y) ({ \
unsigned int r_; \
asm ( "vcmpph $0, %1, %2, %0" : "=k" (r_) : "x" (x), "x" (y) ); \
(r_ & 3) == 3; \
})
# endif

/* (a * i)² == -a² */
x = cmul_vs(src, I);
y = cmul_vv(x, x);
x = -src;
touch(src);
z = cmul_vv(x, src);
if ( !ceq(y, z) ) return __LINE__;

/* conj(a * b) == conj(a) * conj(b) */
touch(src);
x = conj(src);
touch(inv);
y = cmulc_vv(x, inv);
touch(src);
touch(inv);
z = conj(cmul_vv(src, inv));
if ( !ceq(y, z) ) return __LINE__;

/* a * conj(a) == |a|² */
touch(src);
y = src;
touch(src);
x = cmulc_vv(y, src);
/* Element-wise squares, such that y[i] + y[i + 1] is |a|² of each pair. */
y *= y;
for ( i = 0; i < ELEM_COUNT; i += 2 )
{
if ( x[i] != y[i] + y[i + 1] ) return __LINE__;
/* The imaginary part of the product has to be zero. */
if ( x[i + 1] ) return __LINE__;
}

/* a * b == b * a + 0 */
touch(src);
touch(inv);
x = cmul_vv(src, inv);
touch(src);
touch(inv);
y = cmadd_vv(inv, src, (cvec_t){});
if ( !ceq(x, y) ) return __LINE__;

/* a * 1 + b == b * 1 + a */
touch(src);
touch(inv);
x = cmadd_vs(src, 1, inv);
/* Build the complex constant 1 + 0i in every element pair of z. */
for ( i = 0; i < ELEM_COUNT; i += 2 )
{
z[i] = 1;
z[i + 1] = 0;
}
touch(z);
y = cmadd_vv(inv, z, src);
if ( !ceq(x, y) ) return __LINE__;

/* (a + b) * c == a * c + b * c */
touch(one);
touch(inv);
x = cmul_vv(src + one, inv);
touch(inv);
y = cmul_vv(one, inv);
touch(inv);
z = cmadd_vv(src, inv, y);
if ( !ceq(x, z) ) return __LINE__;

/* a * i + conj(a) == (Re(a) - Im(a)) * (1 + i) */
x = cmadd_vs(src, I, conj(src));
for ( i = 0; i < ELEM_COUNT; i += 2 )
{
/* Real and imaginary part of the result both equal Re(a) - Im(a). */
typeof(x[0]) val = src[i] - src[i + 1];

if ( x[i] != val ) return __LINE__;
if ( x[i + 1] != val ) return __LINE__;
}

# if VEC_SIZE == FLOAT_SIZE
/* Close the nested scope opened for the scalar case above. */
}
# endif

#endif /* CELEM_COUNT */

return 0;
}
90 changes: 89 additions & 1 deletion tools/tests/x86_emulator/simd.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,14 @@ ENTRY(simd_test);
asm ( "vcmpsd $0, %1, %2, %0" : "=k" (r_) : "m" (x_), "v" (y_) ); \
r_ == 1; \
})
# elif VEC_SIZE == 2
/*
 * Scalar half-precision equality: compare element 0 of x (as a memory
 * operand) with element 0 of y (placed in an otherwise zero 16-byte vector)
 * using vcmpsh with predicate 0. True when the resulting mask is exactly 1.
 */
# define eq(x, y) ({ \
_Float16 x_ = (x)[0]; \
_Float16 __attribute__((vector_size(16))) y_ = { (y)[0] }; \
unsigned int r_; \
asm ( "vcmpsh $0, %1, %2, %0" : "=k" (r_) : "m" (x_), "v" (y_) ); \
r_ == 1; \
})
# elif FLOAT_SIZE == 4
/*
* gcc's (up to at least 8.2) __builtin_ia32_cmpps256_mask() has an anomaly in
Expand All @@ -31,6 +39,8 @@ ENTRY(simd_test);
# define eq(x, y) ((BR(cmpps, _mask, x, y, 0, -1) & ALL_TRUE) == ALL_TRUE)
# elif FLOAT_SIZE == 8
# define eq(x, y) (BR(cmppd, _mask, x, y, 0, -1) == ALL_TRUE)
# elif FLOAT_SIZE == 2
# define eq(x, y) (B(cmpph, _mask, x, y, 0, -1) == ALL_TRUE)
# elif (INT_SIZE == 1 || UINT_SIZE == 1) && defined(__AVX512BW__)
# define eq(x, y) (B(pcmpeqb, _mask, (vqi_t)(x), (vqi_t)(y), -1) == ALL_TRUE)
# elif (INT_SIZE == 2 || UINT_SIZE == 2) && defined(__AVX512BW__)
Expand Down Expand Up @@ -116,6 +126,14 @@ static inline bool _to_bool(byte_vec_t bv)
asm ( "vcvtusi2sd%z1 %1, %0, %0" : "=v" (t_) : "m" (u_) ); \
(vec_t){ t_[0] }; \
})
# elif FLOAT_SIZE == 2
/*
 * Round-trip element 0 of x through an "unsigned type" integer: vcvtsh2usi
 * converts it to an integer in a general purpose register, vcvtusi2sh converts
 * the stored integer back (%z1 supplies the operand size suffix). The result
 * is a vec_t initialized from just that converted element.
 */
# define to_u_int(type, x) ({ \
unsigned type u_; \
_Float16 __attribute__((vector_size(16))) t_; \
asm ( "vcvtsh2usi %1, %0" : "=r" (u_) : "m" ((x)[0]) ); \
asm ( "vcvtusi2sh%z1 %1, %0, %0" : "=v" (t_) : "m" (u_) ); \
(vec_t){ t_[0] }; \
})
# endif
# define to_uint(x) to_u_int(int, x)
# ifdef __x86_64__
Expand Down Expand Up @@ -153,6 +171,43 @@ static inline bool _to_bool(byte_vec_t bv)
# define to_wint(x) BR(cvtqq2pd, _mask, BR(cvtpd2qq, _mask, x, (vdi_t)undef(), ~0), undef(), ~0)
# define to_uwint(x) BR(cvtuqq2pd, _mask, BR(cvtpd2uqq, _mask, x, (vdi_t)undef(), ~0), undef(), ~0)
# endif
# elif FLOAT_SIZE == 2
/*
 * Round-trip a half-precision vector through same-width (16-bit) signed and
 * unsigned integers respectively.
 */
# define to_int(x) BR2(vcvtw2ph, _mask, BR2(vcvtph2w, _mask, x, (vhi_t)undef(), ~0), undef(), ~0)
# define to_uint(x) BR2(vcvtuw2ph, _mask, BR2(vcvtph2uw, _mask, x, (vhi_t)undef(), ~0), undef(), ~0)
/*
 * Per vector size helpers for to_w_int() below: low_half() / high_half()
 * extract one half of a vector, insert_half(x, y, p) yields x with y inserted
 * at half index p (0 = low, 1 = high).
 */
# if VEC_SIZE == 16
/*
 * NOTE(review): no narrowing is done here; only the low 64 bits of these
 * values are meaningful to the users below. The half selection assumes B_()
 * maps to the like-named compiler builtins - confirm against its definition.
 */
# define low_half(x) (x)
# define high_half(x) ((vec_t)B_(movhlps, , (vsf_t)undef(), (vsf_t)(x)))
# define insert_half(x, y, p) ((vec_t)((p) ? B_(movlhps, , (vsf_t)(x), (vsf_t)(y)) \
: B_(shufps, , (vsf_t)(y), (vsf_t)(x), 0b11100100)))
# elif VEC_SIZE == 32
# define _half(x, lh) ((vhf_half_t)B(extracti32x4_, _mask, (vsi_t)(x), lh, (vsi_half_t){}, ~0))
# define low_half(x) _half(x, 0)
# define high_half(x) _half(x, 1)
# define insert_half(x, y, p) \
((vec_t)B(inserti32x4_, _mask, (vsi_t)(x), (vsi_half_t)(y), p, (vsi_t)undef(), ~0))
# elif VEC_SIZE == 64
# define _half(x, lh) \
((vhf_half_t)__builtin_ia32_extracti64x4_mask((vdi_t)(x), lh, (vdi_half_t){}, ~0))
# define low_half(x) _half(x, 0)
# define high_half(x) _half(x, 1)
# define insert_half(x, y, p) \
((vec_t)__builtin_ia32_inserti64x4_mask((vdi_t)(x), (vdi_half_t)(y), p, (vdi_t)undef(), ~0))
# endif
/*
 * Round-trip a half-precision vector through double-width (32-bit) integers.
 * As the integer vector is twice as wide, each half of x is converted
 * separately (vcvtph2[u]dq), converted back (vcvt[u]dq2ph), and the two
 * results are then reassembled into a full vector. The second argument is
 * pasted into the builtin names: empty for signed, "u" for unsigned.
 */
# define to_w_int(x, s) ({ \
vhf_half_t t_ = low_half(x); \
vsi_t lo_, hi_; \
touch(t_); \
lo_ = BR2(vcvtph2 ## s ## dq, _mask, t_, (vsi_t)undef(), ~0); \
t_ = high_half(x); \
touch(t_); \
hi_ = BR2(vcvtph2 ## s ## dq, _mask, t_, (vsi_t)undef(), ~0); \
touch(lo_); touch(hi_); \
insert_half(insert_half(undef(), \
BR2(vcvt ## s ## dq2ph, _mask, lo_, (vhf_half_t){}, ~0), 0), \
BR2(vcvt ## s ## dq2ph, _mask, hi_, (vhf_half_t){}, ~0), 1); \
})
/* Signed and unsigned flavours of the wide integer round trip. */
# define to_wint(x) to_w_int(x, )
# define to_uwint(x) to_w_int(x, u)
# endif
#elif VEC_SIZE == 16 && defined(__SSE2__)
# if FLOAT_SIZE == 4
Expand Down Expand Up @@ -240,10 +295,18 @@ static inline vec_t movlhps(vec_t x, vec_t y) {
# define scale(x, y) scalar_2op(x, y, "vscalefsd %[in2], %[in1], %[out]")
# define sqrt(x) scalar_1op(x, "vsqrtsd %[in], %[out], %[out]")
# define trunc(x) scalar_1op(x, "vrndscalesd $0b1011, %[in], %[out], %[out]")
# elif FLOAT_SIZE == 2
/*
 * Scalar half-precision (...sh) flavours of the math helpers, expressed via
 * the scalar_1op() / scalar_2op() wrappers defined outside of this view.
 * trunc() uses vrndscalesh with immediate 0b1011; see the insn's imm8
 * description for the exact rounding control this selects.
 */
# define getexp(x) scalar_1op(x, "vgetexpsh %[in], %[out], %[out]")
# define getmant(x) scalar_1op(x, "vgetmantsh $0, %[in], %[out], %[out]")
# define recip(x) scalar_1op(x, "vrcpsh %[in], %[out], %[out]")
# define rsqrt(x) scalar_1op(x, "vrsqrtsh %[in], %[out], %[out]")
# define scale(x, y) scalar_2op(x, y, "vscalefsh %[in2], %[in1], %[out]")
# define sqrt(x) scalar_1op(x, "vsqrtsh %[in], %[out], %[out]")
# define trunc(x) scalar_1op(x, "vrndscalesh $0b1011, %[in], %[out], %[out]")
# endif
#elif defined(FLOAT_SIZE) && defined(__AVX512F__) && \
(VEC_SIZE == 64 || defined(__AVX512VL__))
# if ELEM_COUNT == 8 /* vextractf{32,64}x4 */ || \
# if (ELEM_COUNT == 8 && ELEM_SIZE >= 4) /* vextractf{32,64}x4 */ || \
(ELEM_COUNT == 16 && ELEM_SIZE == 4 && defined(__AVX512DQ__)) /* vextractf32x8 */ || \
(ELEM_COUNT == 4 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextractf64x2 */
# define _half(x, lh) ({ \
Expand Down Expand Up @@ -398,6 +461,21 @@ static inline vec_t movlhps(vec_t x, vec_t y) {
VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0), \
0b01010101, undef(), ~0)
# endif
# elif FLOAT_SIZE == 2
/*
 * Packed half-precision (...ph) flavours of the math and conversion helpers.
 * All are invoked with an all-ones mask (~0). B(), BR() and BR2() are wrapper
 * macros defined outside of this view; which one is used for a given
 * operation presumably follows how the underlying builtin is named.
 * frac() and trunc() pass the same control value (0b1011) to reduceph and
 * rndscaleph respectively.
 */
# define frac(x) BR2(reduceph, _mask, x, 0b00001011, undef(), ~0)
# define getexp(x) BR(getexpph, _mask, x, undef(), ~0)
# define getmant(x) BR(getmantph, _mask, x, 0, undef(), ~0)
# define max(x, y) BR2(maxph, _mask, x, y, undef(), ~0)
# define min(x, y) BR2(minph, _mask, x, y, undef(), ~0)
# define scale(x, y) BR2(scalefph, _mask, x, y, undef(), ~0)
# define recip(x) B(rcpph, _mask, x, undef(), ~0)
# define rsqrt(x) B(rsqrtph, _mask, x, undef(), ~0)
/* Narrowing conversions to half precision: from single (1) and double (2). */
# define shrink1(x) BR2(vcvtps2phx, _mask, (vsf_t)(x), (vhf_half_t){}, ~0)
# define shrink2(x) BR2(vcvtpd2ph, _mask, (vdf_t)(x), (vhf_quarter_t){}, ~0)
# define sqrt(x) BR2(sqrtph, _mask, x, undef(), ~0)
# define trunc(x) BR2(rndscaleph, _mask, x, 0b1011, undef(), ~0)
/* Widening conversions from half precision: to single (1) and double (2). */
# define widen1(x) ((vec_t)BR2(vcvtph2psx, _mask, x, (vsf_t)undef(), ~0))
# define widen2(x) ((vec_t)BR2(vcvtph2pd, _mask, x, (vdf_t)undef(), ~0))
# endif
#elif FLOAT_SIZE == 4 && defined(__SSE__)
# if VEC_SIZE == 32 && defined(__AVX__)
Expand Down Expand Up @@ -920,6 +998,16 @@ static inline vec_t movlhps(vec_t x, vec_t y) {
# define dup_lo(x) B(movddup, _mask, x, undef(), ~0)
# endif
#endif
/*
 * Half-precision dup_hi() / dup_lo(): treat the vector as 16-bit integers and
 * apply pshuflw followed by pshufhw with the same selector. 0b11110101 picks
 * words 1, 1, 3, 3 of each group of four (duplicating the odd-indexed
 * elements), 0b10100000 picks words 0, 0, 2, 2 (duplicating the even-indexed
 * ones).
 */
#if FLOAT_SIZE == 2 && ELEM_COUNT > 1
# define dup_hi(x) ((vec_t)B(pshufhw, _mask, \
B(pshuflw, _mask, (vhi_t)(x), 0b11110101, \
(vhi_t)undef(), ~0), \
0b11110101, (vhi_t)undef(), ~0))
# define dup_lo(x) ((vec_t)B(pshufhw, _mask, \
B(pshuflw, _mask, (vhi_t)(x), 0b10100000, \
(vhi_t)undef(), ~0), \
0b10100000, (vhi_t)undef(), ~0))
#endif
#if VEC_SIZE == 16 && defined(__SSSE3__) && !defined(__AVX512VL__)
# if INT_SIZE == 1
# define abs(x) ((vec_t)__builtin_ia32_pabsb128((vqi_t)(x)))
Expand Down
Loading

0 comments on commit e291c4c

Please sign in to comment.