CMake: update the x86 intrinsic checks
Merge all the existing checks into a single pass-or-fail check, since
all our supported compilers support all the intrinsics up to Cannon
Lake. The two I've recently added (AVX512VBMI2 and VAES) aren't yet
supported everywhere, so they keep their own checks.
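
In miniature, the pass-or-fail idea looks like the sketch below
(illustrative only; the real, much fuller test is the new
config.tests/x86intrin/main.cpp in this commit): a single translation
unit that touches one intrinsic per required extension, so it compiles
as a whole or not at all.

    #include <immintrin.h>

    // Toy pass-or-fail probe: if the compiler cannot target any one of
    // these extensions, the whole translation unit fails to compile.
    int probe()
    {
        unsigned r = 0;
        _rdrand32_step(&r);                     // RDRND
        __m512i z = _mm512_setzero_si512();     // AVX512F
        __m128i lo = _mm512_castsi512_si128(z); // plain cast, no new ISA
        return _mm_cvtsi128_si32(lo) + int(r);  // SSE2
    }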

For some reason, all intrinsics seem to be disabled on Android. It looks
like some support was missing during the CMake port and was never
revisited. I'm leaving it be.

As for WASM, the consensus from discussion with the maintainers is that
the WASM emulation of x86 intrinsics is too hit-and-miss. No one is
testing the performance, least of all the person writing such code (me).
The emulation layer also makes a non-obvious selection of what is
supported natively and what is merely emulated. Using the actual WASM
intrinsics is preferred, but that's someone else's job.

Change-Id: Ib42b3adc93bf4d43bd55fffd16c10d66208e8384
Reviewed-by: Tor Arne Vestbø <[email protected]>
Reviewed-by: Morten Johan Sørvig <[email protected]>
Reviewed-by: Lorn Potter <[email protected]>
Reviewed-by: Kai Koehne <[email protected]>
thiagomacieira committed Jun 28, 2022
1 parent 500c116 commit db342f4
Showing 8 changed files with 158 additions and 381 deletions.
8 changes: 4 additions & 4 deletions cmake/QtBaseGlobalTargets.cmake
@@ -91,11 +91,11 @@ if(MACOS AND QT_IS_MACOS_UNIVERSAL
         AND __qt_osx_first_arch IN_LIST __qt_apple_silicon_arches)
     # The test in configure.cmake will not be run, but we know that
     # the compiler supports these intrinsics
-    set(QT_FORCE_FEATURE_sse2 ON CACHE INTERNAL "Force enable sse2 due to platform requirements.")
+    set(QT_FORCE_FEATURE_x86intrin ON CACHE INTERNAL "Force-enable x86 intrinsics due to platform requirements.")
     set(__QtFeature_custom_enabled_cache_variables
-        TEST_subarch_sse2
-        FEATURE_sse2
-        QT_FEATURE_sse2)
+        TEST_x86intrin
+        FEATURE_x86intrin
+        QT_FEATURE_x86intrin)
 endif()

if(MACOS AND QT_IS_MACOS_UNIVERSAL AND __qt_osx_first_arch STREQUAL "x86_64")
…
1 change: 0 additions & 1 deletion cmake/QtCompilerOptimization.cmake
@@ -13,7 +13,6 @@ if (MSVC)
     set(QT_CFLAGS_AESNI "${QT_CFLAGS_SSE2}")
     set(QT_CFLAGS_SHANI "${QT_CFLAGS_SSE2}")

-    # FIXME to be Visual Studio version specific, like in mkspecs/common/msvc-version.conf
     set(QT_CFLAGS_AVX "-arch:AVX")
     set(QT_CFLAGS_AVX2 "-arch:AVX2")
     set(QT_CFLAGS_F16C "-arch:AVX")
…
216 changes: 0 additions & 216 deletions config.tests/x86_simd/main.cpp
@@ -12,222 +12,6 @@
# define attribute_target(x)
#endif

#if T(SSE2)
attribute_target("sse2") void test_sse2()
{
__m128i a = _mm_setzero_si128();
_mm_maskmoveu_si128(a, _mm_setzero_si128(), 0);
}
#endif

#if T(SSE3)
attribute_target("sse3") void test_sse3()
{
__m128d a = _mm_set1_pd(6.28);
__m128d b = _mm_set1_pd(3.14);
__m128d result = _mm_addsub_pd(a, b);
(void) _mm_movedup_pd(result);
}
#endif

#if T(SSSE3)
attribute_target("ssse3") void test_ssse3()
{
__m128i a = _mm_set1_epi32(42);
_mm_abs_epi8(a);
(void) _mm_sign_epi16(a, _mm_set1_epi32(64));
}
#endif

#if T(SSE4_1)
attribute_target("sse4.1") void test_sse4_1()
{
__m128 a = _mm_setzero_ps();
_mm_ceil_ps(a);
__m128i result = _mm_mullo_epi32(_mm_set1_epi32(42), _mm_set1_epi32(64));
(void)result;
}
#endif

#if T(SSE4_2)
attribute_target("sse4.2") void test_sse4_2()
{
__m128i a = _mm_setzero_si128();
__m128i b = _mm_set1_epi32(42);
(void) _mm_cmpestrm(a, 16, b, 16, 0);
}
#endif

#if T(AESNI)
attribute_target("aes,sse4.2") void test_aesni()
{
__m128i a = _mm_setzero_si128();
__m128i b = _mm_aesenc_si128(a, a);
__m128i c = _mm_aesdec_si128(a, b);
(void)c;
}
#endif

#if T(F16C)
attribute_target("f16c") void test_f16c()
{
__m128i a = _mm_setzero_si128();
__m128 b = _mm_cvtph_ps(a);
__m256 b256 = _mm256_cvtph_ps(a);
(void) _mm_cvtps_ph(b, 0);
(void) _mm256_cvtps_ph(b256, 0);
}
#endif

#if T(RDRND)
attribute_target("rdrnd") int test_rdrnd()
{
unsigned short us;
unsigned int ui;
if (_rdrand16_step(&us))
return 1;
if (_rdrand32_step(&ui))
return 1;
# if defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || defined(_M_X64)
unsigned long long ull;
if (_rdrand64_step(&ull))
return 1;
# endif
return 0;
}
#endif

#if T(RDSEED)
attribute_target("rdseed") int test_rdseed()
{
unsigned short us;
unsigned int ui;
if (_rdseed16_step(&us))
return 1;
if (_rdseed32_step(&ui))
return 1;
# if defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || defined(_M_X64)
unsigned long long ull;
if (_rdseed64_step(&ull))
return 1;
# endif
return 0;
}
#endif

#if T(SHANI)
attribute_target("sha") void test_shani()
{
__m128i a = _mm_setzero_si128();
__m128i b = _mm_sha1rnds4_epu32(a, a, 0);
__m128i c = _mm_sha1msg1_epu32(a, b);
__m128i d = _mm_sha256msg2_epu32(b, c);
(void)d;
}
#endif

#if T(AVX)
attribute_target("avx") void test_avx()
{
__m256d a = _mm256_setzero_pd();
__m256d b = _mm256_set1_pd(42.42);
(void) _mm256_add_pd(a, b);
}
#endif

#if T(AVX2)
attribute_target("avx2") void test_avx2()
{
_mm256_zeroall();
__m256i a = _mm256_setzero_si256();
__m256i b = _mm256_and_si256(a, a);
(void) _mm256_add_epi8(a, b);
}
#endif

#if T(AVX512F)
attribute_target("avx512f") void test_avx512f(char *ptr)
{
/* AVX512 Foundation */
__mmask16 m = ~1;
__m512i i;
__m512d d;
__m512 f;
i = _mm512_maskz_loadu_epi32(0, ptr);
d = _mm512_loadu_pd((double *)ptr + 64);
f = _mm512_loadu_ps((float *)ptr + 128);
_mm512_mask_storeu_epi64(ptr, m, i);
_mm512_mask_storeu_ps(ptr + 64, m, f);
_mm512_mask_storeu_pd(ptr + 128, m, d);
}
#endif

#if T(AVX512ER)
attribute_target("avx512er") void test_avx512er()
{
/* AVX512 Exponential and Reciprocal */
__m512 f;
f = _mm512_exp2a23_round_ps(f, 8);
}
#endif

#if T(AVX512CD)
attribute_target("avx512cd") void test_avx512cd()
{
/* AVX512 Conflict Detection */
__mmask16 m = ~1;
__m512i i;
i = _mm512_maskz_conflict_epi32(m, i);
}
#endif

#if T(AVX512PF)
attribute_target("avx512pf") void test_avx512pf(void *ptr)
{
/* AVX512 Prefetch */
__m512i i;
__mmask16 m = 0xf;
_mm512_mask_prefetch_i64scatter_pd(ptr, m, i, 2, 2);
}
#endif

#if T(AVX512DQ)
attribute_target("avx512dq") void test_avx512dq()
{
/* AVX512 Doubleword and Quadword support */
__m512i i;
__mmask16 m = ~1;
m = _mm512_movepi32_mask(i);
}
#endif

#if T(AVX512BW)
attribute_target("avx512bw") void test_avx512bw(char *ptr)
{
/* AVX512 Byte and Word support */
__m512i i;
__mmask16 m = ~1;
i = _mm512_mask_loadu_epi8(i, m, ptr - 8);
}
#endif

#if T(AVX512VL)
attribute_target("avx512vl") void test_avx512vl(char *ptr)
{
/* AVX512 Vector Length */
__mmask16 m = ~1;
__m256i i2 = _mm256_maskz_loadu_epi32(0, ptr);
_mm256_mask_storeu_epi32(ptr + 1, m, i2);
}
#endif

#if T(AVX512IFMA)
attribute_target("avx512ifma") void test_avx512ifma()
{
/* AVX512 Integer Fused Multiply-Add */
__m512i i;
i = _mm512_madd52lo_epu64(i, i, i);
}
#endif

#if T(AVX512VBMI2)
attribute_target("avx512vl,avx512vbmi2") void test_avx512vbmi2()
{
…
9 changes: 9 additions & 0 deletions config.tests/x86intrin/CMakeLists.txt
@@ -0,0 +1,9 @@
cmake_minimum_required(VERSION 3.16)
project(x86intrin LANGUAGES CXX)
add_executable(x86intrin main.cpp)
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU|IntelLLVM|QCC")
target_compile_options(x86intrin PUBLIC
"-march=cannonlake" "-mrdrnd" "-mrdseed" "-maes" "-msha" "-w")
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
target_compile_options(x86intrin PUBLIC "-arch:AVX512" "-W0")
endif()
46 changes: 46 additions & 0 deletions config.tests/x86intrin/main.cpp
@@ -0,0 +1,46 @@
// Copyright (C) 2022 Intel Corporation.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only

#include <immintrin.h>

// Skylake AVX512 was added to GCC 4.9, Clang 3.7, and MSVC 2015.
// Cannon Lake was added to GCC 5, Clang 3.8, and MSVC 2017 15.7,
// so that's our minimum.
// Ice Lake was completed with GCC 8, Clang 6, and MSVC 2017 15.8.

int test(int argc, char **argv)
{
unsigned randomvalue;
_rdrand32_step(&randomvalue); // RDRND (IVB)
_rdseed32_step(&randomvalue); // RDSEED (BDW)
unsigned mask = _blsmsk_u32(argc); // BMI (HSW)
int clz = _lzcnt_u32(mask); // LZCNT (HSW)
int ctz = _tzcnt_u32(mask); // BMI (HSW)
mask = _bzhi_u32(-1, argc); // BMI2 (HSW)

__m128d d = _mm_setzero_pd(); // SSE2
d = _mm_cvtsi32_sd(d, argc); // SSE2
__m256d d2 = _mm256_broadcastsd_pd(d); // AVX (SNB)
d2 = _mm256_fmadd_pd(d2, d2, d2); // FMA (HSW)

__m128 f = _mm256_cvtpd_ps(d2); // AVX (SNB)
__m128i a = _mm_cvtps_ph(f, 0); // F16C (IVB)
__m128i b = _mm_aesenc_si128(a, a); // AESNI (WSM)
__m128i c = _mm_sha1rnds4_epu32(a, a, 0); // SHA (CNL)
__m128i e = _mm_sha1msg1_epu32(a, b); // SHA (CNL)
__m128i g = _mm_sha256msg2_epu32(b, c); // SHA (CNL)

__m512i zero = _mm512_setzero_si512(); // AVX512F (SKX)
__m512i data = _mm512_maskz_loadu_epi8(mask, argv[0]); // AVX512BW (SKX)
__m256i ptrs = _mm256_maskz_loadu_epi64(mask, argv); // AVX512VL (SKX)
__m512i data2 = _mm512_broadcast_i64x4(ptrs); // AVX512DQ (SKX)
__m256i data3 = _mm256_madd52lo_epu64(ptrs, ptrs, ptrs);// AVX512IFMA (CNL)
data2 = _mm512_multishift_epi64_epi8(data, data2); // AVX512VBMI (CNL)

return _mm256_extract_epi32(data3, 0); // AVX2 (HSW)
}

int main(int argc, char **argv)
{
return test(argc, argv);
}
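
Note that passing this compile test only proves the compiler can emit
these instructions; whether the CPU running the binary has them is a
separate question that Qt answers at runtime (its CPU detection lives
in qsimd, not in this test). A hedged sketch of that distinction, using
the GCC/Clang builtin:

    #include <cstdio>

    int main()
    {
        // Compile-time support is not runtime availability: ask the CPU.
        if (__builtin_cpu_supports("avx512f"))
            std::puts("AVX-512 code paths are safe to call");
        else
            std::puts("fall back to the SSE2 baseline");
    }
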
4 changes: 0 additions & 4 deletions config_help.txt
@@ -112,10 +112,6 @@ Build options:

 -c++std <edition> .... Select C++ standard <edition> [c++2b/c++20/c++17/c++14/c++11]

--sse2 ................ Use SSE2 instructions [auto]
--sse3/-ssse3/-sse4.1/-sse4.2/-avx/-avx2/-avx512
-                       Enable use of particular x86 instructions [auto]
-                       Enabled ones are still subject to runtime detection.
 -mips_dsp/-mips_dspr2  Use MIPS DSP/rev2 instructions [auto]

 -qreal <type> ........ typedef qreal to the specified type. [double]