From 5231fe48737972143d180e1fedf04ea8f32f70f9 Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Tue, 5 Apr 2022 16:36:23 +0100
Subject: [PATCH] [SYCL][CUDA] bfloat16 in oneapi namespace and also supporting
 CUDA (#5393)

There is a bug in the verify_logic function in the bfloat16_type.cpp test (C accessor is not written to) - I'm not sure how this did not lead to a failure already.  With the bug fixed the test passes for the CUDA backend with this patch. I've added a draft test file that also increases the coverage to test unary minus operator here: https://github.com/intel/llvm-test-suite/pull/889.

Note that the unary neg intrinsic added here that is used in unary minus will be pulled down from upstream via e.g. https://reviews.llvm.org/D117887.
---
 ...idoc => sycl_ext_oneapi_bfloat16.asciidoc} | 31 ++++++++++---------
 sycl/include/CL/sycl/feature_test.hpp.in      |  2 +-
 .../experimental/bfloat16.hpp                 | 28 ++++++++++++++---
 sycl/test/extensions/bfloat16.cpp             |  4 +--
 4 files changed, 43 insertions(+), 22 deletions(-)
 rename sycl/doc/extensions/experimental/{sycl_ext_intel_bf16_conversion.asciidoc => sycl_ext_oneapi_bfloat16.asciidoc} (92%)
 rename sycl/include/sycl/ext/{intel => oneapi}/experimental/bfloat16.hpp (90%)

diff --git a/sycl/doc/extensions/experimental/sycl_ext_intel_bf16_conversion.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16.asciidoc
similarity index 92%
rename from sycl/doc/extensions/experimental/sycl_ext_intel_bf16_conversion.asciidoc
rename to sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16.asciidoc
index 9b1018ced0b34..88b6c73b02514 100644
--- a/sycl/doc/extensions/experimental/sycl_ext_intel_bf16_conversion.asciidoc
+++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16.asciidoc
@@ -1,4 +1,4 @@
-= SYCL_INTEL_bf16_conversion
+= sycl_ext_oneapi_bfloat16
 
 :source-highlighter: coderay
 :coderay-linenums-mode: table
@@ -24,7 +24,7 @@
 
 IMPORTANT: This specification is a draft.
 
-Copyright (c) 2021 Intel Corporation. All rights reserved.
+Copyright (c) 2021-2022 Intel Corporation. All rights reserved.
 
 NOTE: Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are
 trademarks of The Khronos Group Inc.  OpenCL(TM) is a trademark of Apple Inc.
@@ -32,7 +32,7 @@ used by permission by Khronos.
 
 == Dependencies
 
-This extension is written against the SYCL 2020 specification, Revision 3.
+This extension is written against the SYCL 2020 specification, Revision 4.
 
 == Status
 
@@ -48,7 +48,7 @@ products.
 
 == Version
 
-Revision: 3
+Revision: 4
 
 == Introduction
 
@@ -57,7 +57,7 @@ floating-point type(`float`) to `bfloat16` type and vice versa. The extension
 doesn't add support for `bfloat16` type as such, instead it uses 16-bit integer
 type(`uint16_t`) as a storage for `bfloat16` values.
 
-The purpose of conversion from float to bfloat16 is to reduce ammount of memory
+The purpose of conversion from float to bfloat16 is to reduce the amount of memory
 required to store floating-point numbers. Computations are expected to be done with
 32-bit floating-point values.
 
@@ -73,7 +73,7 @@ command (e.g. from `parallel_for`).
 This extension provides a feature-test macro as described in the core SYCL
 specification section 6.3.3 "Feature test macros". Therefore, an implementation
 supporting this extension must predefine the macro
-`SYCL_EXT_INTEL_BF16_CONVERSION` to one of the values defined in the table
+`SYCL_EXT_ONEAPI_BFLOAT16` to one of the values defined in the table
 below. Applications can test for the existence of this macro to determine if
 the implementation supports this feature, or applications can test the macro’s
  value to determine which of the extension’s APIs the implementation supports.
@@ -91,19 +91,19 @@ the implementation supports this feature, or applications can test the macro’s
 namespace sycl {
 enum class aspect {
   ...
-  ext_intel_bf16_conversion
+  ext_oneapi_bfloat16
 }
 }
 ----
 
-If a SYCL device has the `ext_intel_bf16_conversion` aspect, then it natively
+If a SYCL device has the `ext_oneapi_bfloat16` aspect, then it natively
 supports conversion of values of `float` type to `bfloat16` and back.
 
 If the device doesn't have the aspect, objects of `bfloat16` class must not be
 used in the device code.
 
-**NOTE**: The `ext_intel_bf16_conversion` aspect is not yet supported.  The
-`bfloat16` class is currently supported only on Xe HP GPU.
+**NOTE**: The `ext_oneapi_bfloat16` aspect is not yet supported.  The
+`bfloat16` class is currently supported only on Xe HP GPU and Nvidia A100 GPU.
 
 == New `bfloat16` class
 
@@ -115,7 +115,7 @@ mode.
 ----
 namespace sycl {
 namespace ext {
-namespace intel {
+namespace oneapi {
 namespace experimental {
 
 class bfloat16 {
@@ -171,7 +171,7 @@ public:
 };
 
 } // namespace experimental
-} // namespace intel
+} // namespace oneapi
 } // namespace ext
 } // namespace sycl
 ----
@@ -277,9 +277,9 @@ OP is `==, !=, <, >, <=, >=`
 [source]
 ----
 #include <sycl/sycl.hpp>
-#include <sycl/ext/intel/experimental/bfloat16.hpp>
+#include <sycl/ext/oneapi/experimental/bfloat16.hpp>
 
-using sycl::ext::intel::experimental::bfloat16;
+using sycl::ext::oneapi::experimental::bfloat16;
 
 bfloat16 operator+(const bfloat16 &lhs, const bfloat16 &rhs) {
   return static_cast<float>(lhs) + static_cast<float>(rhs);
@@ -304,7 +304,7 @@ int main (int argc, char *argv[]) {
   sycl::queue deviceQueue{dev};
   sycl::buffer<float, 1> buf {data, sycl::range<1> {3}};
 
-  if (dev.has(sycl::aspect::ext_intel_bf16_conversion)) {
+  if (dev.has(sycl::aspect::ext_oneapi_bfloat16)) {
     deviceQueue.submit ([&] (sycl::handler& cgh) {
       auto numbers = buf.get_access<sycl::access::mode::read_write> (cgh);
       cgh.single_task<class simple_kernel> ([=] () {
@@ -332,4 +332,5 @@ None.
                              Add operator overloadings +
                              Apply code review suggestions
 |3|2021-08-18|Alexey Sotkin |Remove `uint16_t` constructor
+|4|2022-03-07|Aidan Belton and Jack Kirk |Switch from Intel vendor specific to oneapi
 |========================================
diff --git a/sycl/include/CL/sycl/feature_test.hpp.in b/sycl/include/CL/sycl/feature_test.hpp.in
index 89596995fcef1..cf7bf86335f4f 100644
--- a/sycl/include/CL/sycl/feature_test.hpp.in
+++ b/sycl/include/CL/sycl/feature_test.hpp.in
@@ -55,7 +55,7 @@ namespace sycl {
 #define SYCL_EXT_ONEAPI_SUB_GROUP 1
 #define SYCL_EXT_ONEAPI_PROPERTIES 1
 #define SYCL_EXT_ONEAPI_NATIVE_MATH 1
-#define SYCL_EXT_INTEL_BF16_CONVERSION 1
+#define SYCL_EXT_ONEAPI_BFLOAT16 1
 #define SYCL_EXT_INTEL_DATAFLOW_PIPES 1
 #ifdef __clang__
 #if __has_extension(sycl_extended_atomics)
diff --git a/sycl/include/sycl/ext/intel/experimental/bfloat16.hpp b/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp
similarity index 90%
rename from sycl/include/sycl/ext/intel/experimental/bfloat16.hpp
rename to sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp
index 5a51f3746e225..1190c80631928 100644
--- a/sycl/include/sycl/ext/intel/experimental/bfloat16.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp
@@ -14,10 +14,10 @@
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
 namespace ext {
-namespace intel {
+namespace oneapi {
 namespace experimental {
 
-class [[sycl_detail::uses_aspects(ext_intel_bf16_conversion)]] bfloat16 {
+class bfloat16 {
   using storage_t = uint16_t;
   storage_t value;
 
@@ -29,7 +29,11 @@ class [[sycl_detail::uses_aspects(ext_intel_bf16_conversion)]] bfloat16 {
   // Explicit conversion functions
   static storage_t from_float(const float &a) {
 #if defined(__SYCL_DEVICE_ONLY__)
+#if defined(__NVPTX__)
+    return __nvvm_f2bf16_rn(a);
+#else
     return __spirv_ConvertFToBF16INTEL(a);
+#endif
 #else
     throw exception{errc::feature_not_supported,
                     "Bfloat16 conversion is not supported on host device"};
@@ -37,7 +41,14 @@ class [[sycl_detail::uses_aspects(ext_intel_bf16_conversion)]] bfloat16 {
   }
   static float to_float(const storage_t &a) {
 #if defined(__SYCL_DEVICE_ONLY__)
+#if defined(__NVPTX__)
+    uint32_t y = a;
+    y = y << 16;
+    float *res = reinterpret_cast<float *>(&y);
+    return *res;
+#else
     return __spirv_ConvertBF16ToFINTEL(a);
+#endif
 #else
     throw exception{errc::feature_not_supported,
                     "Bfloat16 conversion is not supported on host device"};
@@ -70,7 +81,16 @@ class [[sycl_detail::uses_aspects(ext_intel_bf16_conversion)]] bfloat16 {
 
   // Unary minus operator overloading
   friend bfloat16 operator-(bfloat16 &lhs) {
-    return bfloat16{-to_float(lhs.value)};
+#if defined(__SYCL_DEVICE_ONLY__)
+#if defined(__NVPTX__)
+    return from_bits(__nvvm_neg_bf16(lhs.value));
+#else
+    return bfloat16{-__spirv_ConvertBF16ToFINTEL(lhs.value)};
+#endif
+#else
+    throw exception{errc::feature_not_supported,
+                    "Bfloat16 unary minus is not supported on host device"};
+#endif
   }
 
 // Increment and decrement operators overloading
@@ -143,7 +163,7 @@ class [[sycl_detail::uses_aspects(ext_intel_bf16_conversion)]] bfloat16 {
 };
 
 } // namespace experimental
-} // namespace intel
+} // namespace oneapi
 } // namespace ext
 
 } // namespace sycl
diff --git a/sycl/test/extensions/bfloat16.cpp b/sycl/test/extensions/bfloat16.cpp
index 80696140b2dc8..847be9508bdb5 100644
--- a/sycl/test/extensions/bfloat16.cpp
+++ b/sycl/test/extensions/bfloat16.cpp
@@ -2,10 +2,10 @@
 
 // UNSUPPORTED: cuda || hip_amd
 
-#include <sycl/ext/intel/experimental/bfloat16.hpp>
+#include <sycl/ext/oneapi/experimental/bfloat16.hpp>
 #include <sycl/sycl.hpp>
 
-using sycl::ext::intel::experimental::bfloat16;
+using sycl::ext::oneapi::experimental::bfloat16;
 
 SYCL_EXTERNAL uint16_t some_bf16_intrinsic(uint16_t x, uint16_t y);
 SYCL_EXTERNAL void foo(long x, sycl::half y);