From b476add70e65f39f87c1f58b657f53f406ad6a62 Mon Sep 17 00:00:00 2001
From: "aidan.belton" <aidan.belton@codeplay.com>
Date: Thu, 17 Feb 2022 10:19:00 +0000
Subject: [PATCH 1/9] add aspect for bf16

---
 sycl/include/CL/sycl/aspects.hpp            |  1 +
 sycl/include/CL/sycl/detail/pi.h            |  1 +
 sycl/include/CL/sycl/info/device_traits.def |  1 +
 sycl/include/CL/sycl/info/info_desc.hpp     |  3 ++-
 sycl/plugins/cuda/pi_cuda.cpp               | 10 ++++++++++
 sycl/plugins/hip/pi_hip.cpp                 |  1 +
 sycl/plugins/level_zero/pi_level_zero.cpp   |  2 ++
 sycl/plugins/opencl/pi_opencl.cpp           |  2 ++
 sycl/source/detail/device_impl.cpp          |  2 ++
 sycl/source/detail/device_info.hpp          | 20 ++++++++++++++++++++
 sycl/test/abi/sycl_symbols_linux.dump       |  1 +
 11 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/sycl/include/CL/sycl/aspects.hpp b/sycl/include/CL/sycl/aspects.hpp
index 5b5d16175977a..f386a135c7eb8 100644
--- a/sycl/include/CL/sycl/aspects.hpp
+++ b/sycl/include/CL/sycl/aspects.hpp
@@ -49,6 +49,7 @@ enum class aspect {
   ext_oneapi_native_assert = 31,
   host_debuggable = 32,
   ext_intel_gpu_hw_threads_per_eu = 33,
+  bf16 = 34
 };
 
 } // namespace sycl
diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h
index 0e15200d19a6b..0736a88f1ec66 100644
--- a/sycl/include/CL/sycl/detail/pi.h
+++ b/sycl/include/CL/sycl/detail/pi.h
@@ -317,6 +317,7 @@ typedef enum {
   PI_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES = 0x11000,
   PI_DEVICE_INFO_GPU_HW_THREADS_PER_EU = 0x10112,
   PI_DEVICE_INFO_BACKEND_VERSION = 0x10113,
+  PI_DEVICE_INFO_BF16 = 0x12000,
   PI_EXT_ONEAPI_DEVICE_INFO_MAX_GLOBAL_WORK_GROUPS = 0x20000,
   PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_1D = 0x20001,
   PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_2D = 0x20002,
diff --git a/sycl/include/CL/sycl/info/device_traits.def b/sycl/include/CL/sycl/info/device_traits.def
index 3e1692df2e943..c75c72889553a 100644
--- a/sycl/include/CL/sycl/info/device_traits.def
+++ b/sycl/include/CL/sycl/info/device_traits.def
@@ -27,6 +27,7 @@ __SYCL_PARAM_TRAITS_SPEC(device, atomic_memory_order_capabilities,
                          std::vector<cl::sycl::memory_order>)
 __SYCL_PARAM_TRAITS_SPEC(device, atomic_memory_scope_capabilities,
                          std::vector<cl::sycl::memory_scope>)
+__SYCL_PARAM_TRAITS_SPEC(device, bf16, bool)
 __SYCL_PARAM_TRAITS_SPEC(device, max_read_image_args, pi_uint32)
 __SYCL_PARAM_TRAITS_SPEC(device, max_write_image_args, pi_uint32)
 __SYCL_PARAM_TRAITS_SPEC(device, image2d_max_width, size_t)
diff --git a/sycl/include/CL/sycl/info/info_desc.hpp b/sycl/include/CL/sycl/info/info_desc.hpp
index f56f06247f51e..168b64d95ea4f 100644
--- a/sycl/include/CL/sycl/info/info_desc.hpp
+++ b/sycl/include/CL/sycl/info/info_desc.hpp
@@ -174,7 +174,8 @@ enum class device : cl_device_info {
   ext_oneapi_max_work_groups_2d = PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_2D,
   ext_oneapi_max_work_groups_3d = PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_3D,
   atomic_memory_scope_capabilities =
-      PI_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES
+      PI_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES,
+  bf16 = PI_DEVICE_INFO_BF16
 };
 
 enum class device_type : pi_uint64 {
diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp
index 777e97c0f2570..f5f4792a56a64 100644
--- a/sycl/plugins/cuda/pi_cuda.cpp
+++ b/sycl/plugins/cuda/pi_cuda.cpp
@@ -1173,6 +1173,16 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
     return getInfo(param_value_size, param_value, param_value_size_ret,
                    capabilities);
   }
+  case PI_DEVICE_INFO_BF16: {
+    int major = 0;
+    cl::sycl::detail::pi::assertion(
+        cuDeviceGetAttribute(&major,
+                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
+                             device->get()) == CUDA_SUCCESS);
+
+    bool bf16 = (major >= 8) ? true : false;
+    return getInfo(param_value_size, param_value, param_value_size_ret, bf16);
+  }
   case PI_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: {
     // NVIDIA devices only support one sub-group size (the warp size)
     int warpSize = 0;
diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp
index bd54d668eade0..afbeb256cb7ee 100644
--- a/sycl/plugins/hip/pi_hip.cpp
+++ b/sycl/plugins/hip/pi_hip.cpp
@@ -1669,6 +1669,7 @@ pi_result hip_piDeviceGetInfo(pi_device device, pi_device_info param_name,
   case PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE:
   case PI_DEVICE_INFO_GPU_HW_THREADS_PER_EU:
   case PI_DEVICE_INFO_MAX_MEM_BANDWIDTH:
+  case PI_DEVICE_INFO_BF16:
     return PI_INVALID_VALUE;
 
   default:
diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp
index bde8efc6dc89e..c9b24159c6b93 100644
--- a/sycl/plugins/level_zero/pi_level_zero.cpp
+++ b/sycl/plugins/level_zero/pi_level_zero.cpp
@@ -2892,6 +2892,8 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName,
   case PI_DEVICE_INFO_MAX_MEM_BANDWIDTH:
     // currently not supported in level zero runtime
     return PI_INVALID_VALUE;
+  case PI_DEVICE_INFO_BF16:
+    return PI_INVALID_VALUE;
 
   // TODO: Implement.
   case PI_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES:
diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp
index 6eddf92c95964..eb083a691c462 100644
--- a/sycl/plugins/opencl/pi_opencl.cpp
+++ b/sycl/plugins/opencl/pi_opencl.cpp
@@ -230,6 +230,8 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName,
     std::memcpy(paramValue, &result, sizeof(cl_bool));
     return PI_SUCCESS;
   }
+  case PI_DEVICE_INFO_BF16:
+    return PI_INVALID_VALUE;
   case PI_DEVICE_INFO_IMAGE_SRGB: {
     cl_bool result = true;
     std::memcpy(paramValue, &result, sizeof(cl_bool));
diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp
index bd66022b52405..6743808d11af0 100644
--- a/sycl/source/detail/device_impl.cpp
+++ b/sycl/source/detail/device_impl.cpp
@@ -276,6 +276,8 @@ bool device_impl::has(aspect Aspect) const {
     return has_extension("cl_khr_fp16");
   case aspect::fp64:
     return has_extension("cl_khr_fp64");
+  case aspect::bf16:
+    return get_info<info::device::bf16>();
   case aspect::int64_base_atomics:
     return has_extension("cl_khr_int64_base_atomics");
   case aspect::int64_extended_atomics:
diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp
index 9193825c9f4ba..741e948e29771 100644
--- a/sycl/source/detail/device_info.hpp
+++ b/sycl/source/detail/device_info.hpp
@@ -264,6 +264,22 @@ struct get_device_info<std::vector<memory_scope>,
   }
 };
 
+// Specialization for bf16
+template <> struct get_device_info<bool, info::device::bf16> {
+  static bool get(RT::PiDevice dev, const plugin &Plugin) {
+
+    bool result = false;
+
+    RT::PiResult Err = Plugin.call_nocheck<PiApiKind::piDeviceGetInfo>(
+        dev, pi::cast<RT::PiDeviceInfo>(info::device::bf16), sizeof(result),
+        &result, nullptr);
+    if (Err != PI_SUCCESS) {
+      return false;
+    }
+    return result;
+  }
+};
+
 // Specialization for exec_capabilities, OpenCL returns a bitfield
 template <>
 struct get_device_info<std::vector<info::execution_capability>,
@@ -769,6 +785,10 @@ get_device_info_host<info::device::atomic_memory_scope_capabilities>() {
           memory_scope::work_group, memory_scope::device, memory_scope::system};
 }
 
+template <> inline bool get_device_info_host<info::device::bf16>() {
+  return false;
+}
+
 template <>
 inline cl_uint get_device_info_host<info::device::max_read_image_args>() {
   // current value is the required minimum
diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump
index 291140d0a933c..ab8697011d01a 100644
--- a/sycl/test/abi/sycl_symbols_linux.dump
+++ b/sycl/test/abi/sycl_symbols_linux.dump
@@ -4297,6 +4297,7 @@ _ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65809EEENS3_12param_traitsIS4_XT_
 _ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65810EEENS3_12param_traitsIS4_XT_EE11return_typeEv
 _ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65811EEENS3_12param_traitsIS4_XT_EE11return_typeEv
 _ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE69632EEENS3_12param_traitsIS4_XT_EE11return_typeEv
+_ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE73728EEENS3_12param_traitsIS4_XT_EE11return_typeEv
 _ZNK2cl4sycl6device9getNativeEv
 _ZNK2cl4sycl6kernel11get_backendEv
 _ZNK2cl4sycl6kernel11get_contextEv

From 25a67072d61d9f55326d4525c0e051ce0b3f4eee Mon Sep 17 00:00:00 2001
From: "aidan.belton" <aidan.belton@codeplay.com>
Date: Tue, 8 Mar 2022 17:49:48 +0000
Subject: [PATCH 2/9] make aspect oneapi extension

---
 sycl/include/CL/sycl/aspects.hpp            | 2 +-
 sycl/include/CL/sycl/info/device_traits.def | 2 +-
 sycl/include/CL/sycl/info/info_desc.hpp     | 2 +-
 sycl/plugins/cuda/pi_cuda.cpp               | 7 ++++---
 sycl/plugins/hip/pi_hip.cpp                 | 2 +-
 sycl/plugins/level_zero/pi_level_zero.cpp   | 2 +-
 sycl/source/detail/device_impl.cpp          | 4 ++--
 sycl/source/detail/device_info.hpp          | 9 +++++----
 8 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/sycl/include/CL/sycl/aspects.hpp b/sycl/include/CL/sycl/aspects.hpp
index f386a135c7eb8..51d9a5a2c1a08 100644
--- a/sycl/include/CL/sycl/aspects.hpp
+++ b/sycl/include/CL/sycl/aspects.hpp
@@ -49,7 +49,7 @@ enum class aspect {
   ext_oneapi_native_assert = 31,
   host_debuggable = 32,
   ext_intel_gpu_hw_threads_per_eu = 33,
-  bf16 = 34
+  ext_oneapi_bfloat16 = 34,
 };
 
 } // namespace sycl
diff --git a/sycl/include/CL/sycl/info/device_traits.def b/sycl/include/CL/sycl/info/device_traits.def
index c75c72889553a..34385674b29da 100644
--- a/sycl/include/CL/sycl/info/device_traits.def
+++ b/sycl/include/CL/sycl/info/device_traits.def
@@ -27,7 +27,7 @@ __SYCL_PARAM_TRAITS_SPEC(device, atomic_memory_order_capabilities,
                          std::vector<cl::sycl::memory_order>)
 __SYCL_PARAM_TRAITS_SPEC(device, atomic_memory_scope_capabilities,
                          std::vector<cl::sycl::memory_scope>)
-__SYCL_PARAM_TRAITS_SPEC(device, bf16, bool)
+__SYCL_PARAM_TRAITS_SPEC(device, ext_oneapi_bfloat16, bool)
 __SYCL_PARAM_TRAITS_SPEC(device, max_read_image_args, pi_uint32)
 __SYCL_PARAM_TRAITS_SPEC(device, max_write_image_args, pi_uint32)
 __SYCL_PARAM_TRAITS_SPEC(device, image2d_max_width, size_t)
diff --git a/sycl/include/CL/sycl/info/info_desc.hpp b/sycl/include/CL/sycl/info/info_desc.hpp
index 168b64d95ea4f..513aa94f52913 100644
--- a/sycl/include/CL/sycl/info/info_desc.hpp
+++ b/sycl/include/CL/sycl/info/info_desc.hpp
@@ -175,7 +175,7 @@ enum class device : cl_device_info {
   ext_oneapi_max_work_groups_3d = PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_3D,
   atomic_memory_scope_capabilities =
       PI_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES,
-  bf16 = PI_DEVICE_INFO_BF16
+  ext_oneapi_bfloat16 = PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16
 };
 
 enum class device_type : pi_uint64 {
diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp
index f5f4792a56a64..708430f614805 100644
--- a/sycl/plugins/cuda/pi_cuda.cpp
+++ b/sycl/plugins/cuda/pi_cuda.cpp
@@ -1173,15 +1173,16 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
     return getInfo(param_value_size, param_value, param_value_size_ret,
                    capabilities);
   }
-  case PI_DEVICE_INFO_BF16: {
+  case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16: {
     int major = 0;
     cl::sycl::detail::pi::assertion(
         cuDeviceGetAttribute(&major,
                              CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                              device->get()) == CUDA_SUCCESS);
 
-    bool bf16 = (major >= 8) ? true : false;
-    return getInfo(param_value_size, param_value, param_value_size_ret, bf16);
+    bool bfloat16 = (major >= 8) ? true : false;
+    return getInfo(param_value_size, param_value, param_value_size_ret,
+                   bfloat16);
   }
   case PI_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: {
     // NVIDIA devices only support one sub-group size (the warp size)
diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp
index afbeb256cb7ee..f8cb01f237228 100644
--- a/sycl/plugins/hip/pi_hip.cpp
+++ b/sycl/plugins/hip/pi_hip.cpp
@@ -1669,7 +1669,7 @@ pi_result hip_piDeviceGetInfo(pi_device device, pi_device_info param_name,
   case PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE:
   case PI_DEVICE_INFO_GPU_HW_THREADS_PER_EU:
   case PI_DEVICE_INFO_MAX_MEM_BANDWIDTH:
-  case PI_DEVICE_INFO_BF16:
+  case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16:
     return PI_INVALID_VALUE;
 
   default:
diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp
index c9b24159c6b93..68efc560ee800 100644
--- a/sycl/plugins/level_zero/pi_level_zero.cpp
+++ b/sycl/plugins/level_zero/pi_level_zero.cpp
@@ -2892,7 +2892,7 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName,
   case PI_DEVICE_INFO_MAX_MEM_BANDWIDTH:
     // currently not supported in level zero runtime
     return PI_INVALID_VALUE;
-  case PI_DEVICE_INFO_BF16:
+  case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16:
     return PI_INVALID_VALUE;
 
   // TODO: Implement.
diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp
index 6743808d11af0..a976d2dfe825a 100644
--- a/sycl/source/detail/device_impl.cpp
+++ b/sycl/source/detail/device_impl.cpp
@@ -276,8 +276,8 @@ bool device_impl::has(aspect Aspect) const {
     return has_extension("cl_khr_fp16");
   case aspect::fp64:
     return has_extension("cl_khr_fp64");
-  case aspect::bf16:
-    return get_info<info::device::bf16>();
+  case aspect::ext_oneapi_bfloat16:
+    return get_info<info::device::ext_oneapi_bfloat16>();
   case aspect::int64_base_atomics:
     return has_extension("cl_khr_int64_base_atomics");
   case aspect::int64_extended_atomics:
diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp
index 741e948e29771..d1c1e4493dbec 100644
--- a/sycl/source/detail/device_info.hpp
+++ b/sycl/source/detail/device_info.hpp
@@ -265,14 +265,14 @@ struct get_device_info<std::vector<memory_scope>,
 };
 
 // Specialization for bf16
-template <> struct get_device_info<bool, info::device::bf16> {
+template <> struct get_device_info<bool, info::device::ext_oneapi_bfloat16> {
   static bool get(RT::PiDevice dev, const plugin &Plugin) {
 
     bool result = false;
 
     RT::PiResult Err = Plugin.call_nocheck<PiApiKind::piDeviceGetInfo>(
-        dev, pi::cast<RT::PiDeviceInfo>(info::device::bf16), sizeof(result),
-        &result, nullptr);
+        dev, pi::cast<RT::PiDeviceInfo>(info::device::ext_oneapi_bfloat16),
+        sizeof(result), &result, nullptr);
     if (Err != PI_SUCCESS) {
       return false;
     }
@@ -785,7 +785,8 @@ get_device_info_host<info::device::atomic_memory_scope_capabilities>() {
           memory_scope::work_group, memory_scope::device, memory_scope::system};
 }
 
-template <> inline bool get_device_info_host<info::device::bf16>() {
+template <>
+inline bool get_device_info_host<info::device::ext_oneapi_bfloat16>() {
   return false;
 }
 

From e055600eb415ecfaed33675da50b54146e427219 Mon Sep 17 00:00:00 2001
From: "aidan.belton" <aidan.belton@codeplay.com>
Date: Wed, 9 Mar 2022 09:53:45 +0000
Subject: [PATCH 3/9] Update bf16_conversion extension

---
 .../SYCL_EXT_ONEAPI_BF16_CONVERSION.asciidoc  | 335 ++++++++++++++++++
 .../sycl/ext/oneapi/experimental/bfloat16.hpp |  27 +-
 2 files changed, 337 insertions(+), 25 deletions(-)
 create mode 100644 sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_BF16_CONVERSION.asciidoc

diff --git a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_BF16_CONVERSION.asciidoc b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_BF16_CONVERSION.asciidoc
new file mode 100644
index 0000000000000..49547f55e940c
--- /dev/null
+++ b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_BF16_CONVERSION.asciidoc
@@ -0,0 +1,335 @@
+= SYCL_ONEAPI_bfloat16
+
+:source-highlighter: coderay
+:coderay-linenums-mode: table
+
+// This section needs to be after the document title.
+:doctype: book
+:toc2:
+:toc: left
+:encoding: utf-8
+:lang: en
+
+:blank: pass:[ +]
+
+// Set the default source code type in this document to C++,
+// for syntax highlighting purposes.  This is needed because
+// docbook uses c++ and html5 uses cpp.
+:language: {basebackend@docbook:c++:cpp}
+
+// This is necessary for asciidoc, but not for asciidoctor
+:cpp: C++
+
+== Notice
+
+IMPORTANT: This specification is a draft.
+
+Copyright (c) 2021 Intel Corporation. All rights reserved.
+
+NOTE: Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are
+trademarks of The Khronos Group Inc.  OpenCL(TM) is a trademark of Apple Inc.
+used by permission by Khronos.
+
+== Dependencies
+
+This extension is written against the SYCL 2020 specification, Revision 3.
+
+== Status
+
+Draft
+
+This is a preview extension specification, intended to provide early access to
+a feature for review and community feedback. When the feature matures, this
+specification may be released as a formal extension.
+
+Because the interfaces defined by this specification are not final and are
+subject to change they are not intended to be used by shipping software
+products.
+
+== Version
+
+Revision: 3
+
+== Introduction
+
+This extension adds functionality to convert value of single-precision
+floating-point type(`float`) to `bfloat16` type and vice versa. The extension
+doesn't add support for `bfloat16` type as such, instead it uses 16-bit integer
+type(`uint16_t`) as a storage for `bfloat16` values.
+
+The purpose of conversion from float to bfloat16 is to reduce ammount of memory
+required to store floating-point numbers. Computations are expected to be done with
+32-bit floating-point values.
+
+This extension is an optional kernel feature as described in
+https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:optional-kernel-features[section 5.7]
+of the SYCL 2020 spec. Therefore, attempting to submit a kernel using this
+feature to a device that does not support it should cause a synchronous
+`errc::kernel_not_supported` exception to be thrown from the kernel invocation
+command (e.g. from `parallel_for`).
+
+== Feature test macro
+
+This extension provides a feature-test macro as described in the core SYCL
+specification section 6.3.3 "Feature test macros". Therefore, an implementation
+supporting this extension must predefine the macro
+`SYCL_EXT_ONEAPI_BFLOAT16_CONVERSION` to one of the values defined in the table
+below. Applications can test for the existence of this macro to determine if
+the implementation supports this feature, or applications can test the macro’s
+ value to determine which of the extension’s APIs the implementation supports.
+
+[%header,cols="1,5"]
+|===
+|Value |Description
+|1     |Initial extension version. Base features are supported.
+|===
+
+== Extension to `enum class aspect`
+
+[source]
+----
+namespace sycl {
+enum class aspect {
+  ...
+  ext_oneapi_bfloat16
+}
+}
+----
+
+If a SYCL device has the `ext_oneapi_bfloat16` aspect, then it natively
+supports conversion of values of `float` type to `bfloat16` and back.
+
+If the device doesn't have the aspect, objects of `bfloat16` class must not be
+used in the device code.
+
+**NOTE**: The `ext_oneapi_bfloat16` aspect is not yet supported.  The
+`bfloat16` class is currently supported only on Xe HP GPU.
+
+== New `bfloat16` class
+
+The `bfloat16` class below provides the conversion functionality. Conversion
+from `float` to `bfloat16` is done with round to nearest even(RTE) rounding
+mode.
+
+[source]
+----
+namespace sycl {
+namespace ext {
+namespace oneapi {
+namespace experimental {
+
+class bfloat16 {
+  using storage_t = uint16_t;
+  storage_t value;
+
+public:
+  bfloat16() = default;
+  bfloat16(const bfloat16 &) = default;
+  ~bfloat16() = default;
+
+  // Explicit conversion functions
+  static storage_t from_float(const float &a);
+  static float to_float(const storage_t &a);
+
+  // Convert from float to bfloat16
+  bfloat16(const float &a);
+  bfloat16 &operator=(const float &a);
+
+  // Convert from bfloat16 to float
+  operator float() const;
+
+  // Get bfloat16 as uint16.
+  operator storage_t() const;
+
+  // Convert to bool type
+  explicit operator bool();
+
+  friend bfloat16 operator-(bfloat16 &bf) { /* ... */ }
+
+  // OP is: prefix ++, --
+  friend bfloat16 &operatorOP(bfloat16 &bf) { /* ... */ }
+
+  // OP is: postfix ++, --
+  friend bfloat16 operatorOP(bfloat16 &bf, int) { /* ... */ }
+
+  // OP is: +=, -=, *=, /=
+  friend bfloat16 &operatorOP(bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ }
+
+  // OP is +, -, *, /
+  friend bfloat16 operatorOP(const bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ }
+  template <typename T>
+  friend bfloat16 operatorOP(const bfloat16 &lhs, const T &rhs) { /* ... */ }
+  template <typename T>
+  friend bfloat16 operatorOP(const T &lhs, const bfloat16 &rhs) { /* ... */ }
+
+  // OP is ==,!=, <, >, <=, >=
+  friend bool operatorOP(const bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ }
+  template <typename T>
+  friend bool operatorOP(const bfloat16 &lhs, const T &rhs) { /* ... */ }
+  template <typename T>
+  friend bool operatorOP(const T &lhs, const bfloat16 &rhs) { /* ... */ }
+};
+
+} // namespace experimental
+} // namespace oneapi
+} // namespace ext
+} // namespace sycl
+----
+
+Table 1. Member functions of `bfloat16` class.
+|===
+| Member Function | Description
+
+|  `static storage_t from_float(const float &a);`
+|  Explicitly convert from `float` to `bfloat16`.
+
+|  `static float to_float(const storage_t &a);`
+|  Interpret `a` as `bfloat16` and explicitly convert it to `float`.
+
+| `bfloat16(const float& a);`
+| Construct `bfloat16` from `float`. Converts `float` to `bfloat16`.
+
+| `bfloat16 &operator=(const float &a);`
+| Replace the value with `a` converted to `bfloat16`
+
+| `operator float() const;`
+|  Return `bfloat16` value converted to `float`.
+
+| `operator storage_t() const;`
+| Return `uint16_t` value, whose bits represent `bfloat16` value.
+
+| `explicit operator bool() { /* ... */ }`
+| Convert `bfloat16` to `bool` type. Return `false` if the value equals to
+  zero, return `true` otherwise.
+
+| `friend bfloat16 operator-(bfloat16 &bf) { /* ... */ }`
+| Construct new instance of `bfloat16` class with negated value of the `bf`.
+
+| `friend bfloat16 &operatorOP(bfloat16 &bf) { /* ... */ }`
+| Perform an in-place `OP` prefix arithmetic operation on the `bf`,
+  assigning the result to the `bf` and return the `bf`.
+
+  OP is: `++, --`
+
+| `friend bfloat16 operatorOP(bfloat16 &bf, int) { /* ... */ }`
+| Perform an in-place `OP` postfix arithmetic operation on `bf`, assigning
+  the result to the `bf` and return a copy of `bf` before the operation is
+  performed.
+
+  OP is: `++, --`
+
+| `friend bfloat16 operatorOP(const bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ }`
+| Perform an in-place `OP` arithmetic operation between the `lhs` and the `rhs`
+  and return the `lhs`.
+
+  OP is: `+=, -=, *=, /=`
+
+| `friend type operatorOP(const bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ }`
+| Construct a new instance of the `bfloat16` class with the value of the new
+  `bfloat16` instance being the result of an OP arithmetic operation between
+  the `lhs` `bfloat16` and `rhs` `bfloat16` values.
+
+  OP is `+, -, *, /`
+
+| `template <typename T>
+  friend bfloat16 operatorOP(const bfloat16 &lhs, const T &rhs) { /* ... */ }`
+| Construct a new instance of the `bfloat16` class with the value of the new
+  `bfloat16` instance being the result of an OP arithmetic operation between
+  the `lhs` `bfloat16` value and `rhs` of template type `T`. Type `T` must be
+  convertible to `float`.
+
+  OP is `+, -, *, /`
+
+| `template <typename T>
+  friend bfloat16 operatorOP(const T &lhs, const bfloat16 &rhs) { /* ... */ }`
+| Construct a new instance of the `bfloat16` class with the value of the new
+  `bfloat16` instance being the result of an OP arithmetic operation between
+  the `lhs` of template type `T` and `rhs` `bfloat16` value. Type `T` must be
+  convertible to `float`.
+
+  OP is `+, -, *, /`
+
+| `friend bool operatorOP(const bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ }`
+| Perform comparison operation OP between `lhs` `bfloat16` and `rhs` `bfloat16`
+  values and return the result as a boolean value.
+
+OP is `==, !=, <, >, <=, >=`
+
+| `template <typename T>
+  friend bool operatorOP(const bfloat16 &lhs, const T &rhs) { /* ... */ }`
+| Perform comparison operation OP between `lhs` `bfloat16` and `rhs` of
+  template type `T` and return the result as a boolean value. Type `T` must be
+  convertible to `float`.
+
+OP is `==, !=, <, >, <=, >=`
+
+| `template <typename T>
+  friend bool operatorOP(const T &lhs, const bfloat16 &rhs) { /* ... */ }`
+| Perform comparison operation OP between `lhs` of template type `T` and `rhs`
+  `bfloat16` value and return the result as a boolean value. Type `T` must be
+  convertible to `float`.
+
+OP is `==, !=, <, >, <=, >=`
+|===
+
+== Example
+
+[source]
+----
+#include <sycl/sycl.hpp>
+#include <sycl/ext/oneapi/experimental/bfloat16.hpp>
+
+using sycl::ext::oneapi::experimental::bfloat16;
+
+bfloat16 operator+(const bfloat16 &lhs, const bfloat16 &rhs) {
+  return static_cast<float>(lhs) + static_cast<float>(rhs);
+}
+
+float foo(float a, float b) {
+  // Convert from float to bfloat16.
+  bfloat16 A {a};
+  bfloat16 B {b};
+
+  // Convert A and B from bfloat16 to float, do addition on floating-pointer
+  // numbers, then convert the result to bfloat16 and store it in C.
+  bfloat16 C = A + B;
+
+  // Return the result converted from bfloat16 to float.
+  return C;
+}
+
+int main (int argc, char *argv[]) {
+  float data[3] = {7.0, 8.1, 0.0};
+  sycl::device dev;
+  sycl::queue deviceQueue{dev};
+  sycl::buffer<float, 1> buf {data, sycl::range<1> {3}};
+
+  if (dev.has(sycl::aspect::ext_oneapi_bfloat16)) {
+    deviceQueue.submit ([&] (sycl::handler& cgh) {
+      auto numbers = buf.get_access<sycl::access::mode::read_write> (cgh);
+      cgh.single_task<class simple_kernel> ([=] () {
+        numbers[2] = foo(numbers[0], numbers[1]);
+      });
+    });
+  }
+  return 0;
+}
+----
+
+== Issues
+
+None.
+
+== Revision History
+
+[cols="5,15,15,70"]
+[grid="rows"]
+[options="header"]
+|========================================
+|Rev|Date|Author|Changes
+|1|2021-08-02|Alexey Sotkin |Initial public working draft
+|2|2021-08-17|Alexey Sotkin |Add explicit conversion functions +
+                             Add operator overloadings +
+                             Apply code review suggestions
+|3|2021-08-18|Alexey Sotkin |Remove `uint16_t` constructor
+|========================================
diff --git a/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp b/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp
index 39bad8d1ecc35..77878e6d6e5e1 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp
@@ -17,7 +17,7 @@ namespace ext {
 namespace oneapi {
 namespace experimental {
 
-class bfloat16 {
+class [[sycl_detail::uses_aspects(ext_oneapi_bfloat16)]] bfloat16 {
   using storage_t = uint16_t;
   storage_t value;
 
@@ -29,29 +29,16 @@ class bfloat16 {
   // Explicit conversion functions
   static storage_t from_float(const float &a) {
 #if defined(__SYCL_DEVICE_ONLY__)
-#if defined(__NVPTX__)
-    return __nvvm_f2bf16_rn(a);
-#else
     return __spirv_ConvertFToBF16INTEL(a);
-#endif
 #else
-    (void)a;
     throw exception{errc::feature_not_supported,
                     "Bfloat16 conversion is not supported on host device"};
 #endif
   }
   static float to_float(const storage_t &a) {
 #if defined(__SYCL_DEVICE_ONLY__)
-#if defined(__NVPTX__)
-    uint32_t y = a;
-    y = y << 16;
-    float *res = reinterpret_cast<float *>(&y);
-    return *res;
-#else
     return __spirv_ConvertBF16ToFINTEL(a);
-#endif
 #else
-    (void)a;
     throw exception{errc::feature_not_supported,
                     "Bfloat16 conversion is not supported on host device"};
 #endif
@@ -83,17 +70,7 @@ class bfloat16 {
 
   // Unary minus operator overloading
   friend bfloat16 operator-(bfloat16 &lhs) {
-#if defined(__SYCL_DEVICE_ONLY__)
-#if defined(__NVPTX__)
-    return from_bits(__nvvm_neg_bf16(lhs.value));
-#else
-    return bfloat16{-__spirv_ConvertBF16ToFINTEL(lhs.value)};
-#endif
-#else
-    (void)lhs;
-    throw exception{errc::feature_not_supported,
-                    "Bfloat16 unary minus is not supported on host device"};
-#endif
+    return bfloat16{-to_float(lhs.value)};
   }
 
 // Increment and decrement operators overloading

From 992ba74d51fade745ca0aaae62f1e79c73d1699f Mon Sep 17 00:00:00 2001
From: "aidan.belton" <aidan.belton@codeplay.com>
Date: Tue, 15 Mar 2022 13:48:55 +0000
Subject: [PATCH 4/9] Undo update bf16_conversion extension

---
 ...> SYCL_EXT_INTEL_BF16_CONVERSION.asciidoc} | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)
 rename sycl/doc/extensions/experimental/{SYCL_EXT_ONEAPI_BF16_CONVERSION.asciidoc => SYCL_EXT_INTEL_BF16_CONVERSION.asciidoc} (95%)

diff --git a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_BF16_CONVERSION.asciidoc b/sycl/doc/extensions/experimental/SYCL_EXT_INTEL_BF16_CONVERSION.asciidoc
similarity index 95%
rename from sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_BF16_CONVERSION.asciidoc
rename to sycl/doc/extensions/experimental/SYCL_EXT_INTEL_BF16_CONVERSION.asciidoc
index 49547f55e940c..9b1018ced0b34 100644
--- a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_BF16_CONVERSION.asciidoc
+++ b/sycl/doc/extensions/experimental/SYCL_EXT_INTEL_BF16_CONVERSION.asciidoc
@@ -1,4 +1,4 @@
-= SYCL_ONEAPI_bfloat16
+= SYCL_INTEL_bf16_conversion
 
 :source-highlighter: coderay
 :coderay-linenums-mode: table
@@ -73,7 +73,7 @@ command (e.g. from `parallel_for`).
 This extension provides a feature-test macro as described in the core SYCL
 specification section 6.3.3 "Feature test macros". Therefore, an implementation
 supporting this extension must predefine the macro
-`SYCL_EXT_ONEAPI_BFLOAT16_CONVERSION` to one of the values defined in the table
+`SYCL_EXT_INTEL_BF16_CONVERSION` to one of the values defined in the table
 below. Applications can test for the existence of this macro to determine if
 the implementation supports this feature, or applications can test the macro’s
  value to determine which of the extension’s APIs the implementation supports.
@@ -91,18 +91,18 @@ the implementation supports this feature, or applications can test the macro’s
 namespace sycl {
 enum class aspect {
   ...
-  ext_oneapi_bfloat16
+  ext_intel_bf16_conversion
 }
 }
 ----
 
-If a SYCL device has the `ext_oneapi_bfloat16` aspect, then it natively
+If a SYCL device has the `ext_intel_bf16_conversion` aspect, then it natively
 supports conversion of values of `float` type to `bfloat16` and back.
 
 If the device doesn't have the aspect, objects of `bfloat16` class must not be
 used in the device code.
 
-**NOTE**: The `ext_oneapi_bfloat16` aspect is not yet supported.  The
+**NOTE**: The `ext_intel_bf16_conversion` aspect is not yet supported.  The
 `bfloat16` class is currently supported only on Xe HP GPU.
 
 == New `bfloat16` class
@@ -115,7 +115,7 @@ mode.
 ----
 namespace sycl {
 namespace ext {
-namespace oneapi {
+namespace intel {
 namespace experimental {
 
 class bfloat16 {
@@ -171,7 +171,7 @@ public:
 };
 
 } // namespace experimental
-} // namespace oneapi
+} // namespace intel
 } // namespace ext
 } // namespace sycl
 ----
@@ -277,9 +277,9 @@ OP is `==, !=, <, >, <=, >=`
 [source]
 ----
 #include <sycl/sycl.hpp>
-#include <sycl/ext/oneapi/experimental/bfloat16.hpp>
+#include <sycl/ext/intel/experimental/bfloat16.hpp>
 
-using sycl::ext::oneapi::experimental::bfloat16;
+using sycl::ext::intel::experimental::bfloat16;
 
 bfloat16 operator+(const bfloat16 &lhs, const bfloat16 &rhs) {
   return static_cast<float>(lhs) + static_cast<float>(rhs);
@@ -304,7 +304,7 @@ int main (int argc, char *argv[]) {
   sycl::queue deviceQueue{dev};
   sycl::buffer<float, 1> buf {data, sycl::range<1> {3}};
 
-  if (dev.has(sycl::aspect::ext_oneapi_bfloat16)) {
+  if (dev.has(sycl::aspect::ext_intel_bf16_conversion)) {
     deviceQueue.submit ([&] (sycl::handler& cgh) {
       auto numbers = buf.get_access<sycl::access::mode::read_write> (cgh);
       cgh.single_task<class simple_kernel> ([=] () {

From 1ee0543a90bd4e5734a2172c50309fe22405b22f Mon Sep 17 00:00:00 2001
From: "aidan.belton" <aidan.belton@codeplay.com>
Date: Mon, 16 May 2022 12:48:12 +0100
Subject: [PATCH 5/9] fix rebase

---
 .../SYCL_EXT_INTEL_BF16_CONVERSION.asciidoc   | 335 ------------------
 sycl/include/CL/sycl/builtins.hpp             |  24 +-
 sycl/include/CL/sycl/detail/pi.h              |   2 +-
 sycl/include/CL/sycl/info/info_desc.hpp       |   2 +-
 .../sycl/ext/oneapi/experimental/bfloat16.hpp |  25 +-
 sycl/plugins/opencl/pi_opencl.cpp             |   2 +-
 6 files changed, 39 insertions(+), 351 deletions(-)
 delete mode 100644 sycl/doc/extensions/experimental/SYCL_EXT_INTEL_BF16_CONVERSION.asciidoc

diff --git a/sycl/doc/extensions/experimental/SYCL_EXT_INTEL_BF16_CONVERSION.asciidoc b/sycl/doc/extensions/experimental/SYCL_EXT_INTEL_BF16_CONVERSION.asciidoc
deleted file mode 100644
index 9b1018ced0b34..0000000000000
--- a/sycl/doc/extensions/experimental/SYCL_EXT_INTEL_BF16_CONVERSION.asciidoc
+++ /dev/null
@@ -1,335 +0,0 @@
-= SYCL_INTEL_bf16_conversion
-
-:source-highlighter: coderay
-:coderay-linenums-mode: table
-
-// This section needs to be after the document title.
-:doctype: book
-:toc2:
-:toc: left
-:encoding: utf-8
-:lang: en
-
-:blank: pass:[ +]
-
-// Set the default source code type in this document to C++,
-// for syntax highlighting purposes.  This is needed because
-// docbook uses c++ and html5 uses cpp.
-:language: {basebackend@docbook:c++:cpp}
-
-// This is necessary for asciidoc, but not for asciidoctor
-:cpp: C++
-
-== Notice
-
-IMPORTANT: This specification is a draft.
-
-Copyright (c) 2021 Intel Corporation. All rights reserved.
-
-NOTE: Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are
-trademarks of The Khronos Group Inc.  OpenCL(TM) is a trademark of Apple Inc.
-used by permission by Khronos.
-
-== Dependencies
-
-This extension is written against the SYCL 2020 specification, Revision 3.
-
-== Status
-
-Draft
-
-This is a preview extension specification, intended to provide early access to
-a feature for review and community feedback. When the feature matures, this
-specification may be released as a formal extension.
-
-Because the interfaces defined by this specification are not final and are
-subject to change they are not intended to be used by shipping software
-products.
-
-== Version
-
-Revision: 3
-
-== Introduction
-
-This extension adds functionality to convert value of single-precision
-floating-point type(`float`) to `bfloat16` type and vice versa. The extension
-doesn't add support for `bfloat16` type as such, instead it uses 16-bit integer
-type(`uint16_t`) as a storage for `bfloat16` values.
-
-The purpose of conversion from float to bfloat16 is to reduce ammount of memory
-required to store floating-point numbers. Computations are expected to be done with
-32-bit floating-point values.
-
-This extension is an optional kernel feature as described in
-https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:optional-kernel-features[section 5.7]
-of the SYCL 2020 spec. Therefore, attempting to submit a kernel using this
-feature to a device that does not support it should cause a synchronous
-`errc::kernel_not_supported` exception to be thrown from the kernel invocation
-command (e.g. from `parallel_for`).
-
-== Feature test macro
-
-This extension provides a feature-test macro as described in the core SYCL
-specification section 6.3.3 "Feature test macros". Therefore, an implementation
-supporting this extension must predefine the macro
-`SYCL_EXT_INTEL_BF16_CONVERSION` to one of the values defined in the table
-below. Applications can test for the existence of this macro to determine if
-the implementation supports this feature, or applications can test the macro’s
- value to determine which of the extension’s APIs the implementation supports.
-
-[%header,cols="1,5"]
-|===
-|Value |Description
-|1     |Initial extension version. Base features are supported.
-|===
-
-== Extension to `enum class aspect`
-
-[source]
-----
-namespace sycl {
-enum class aspect {
-  ...
-  ext_intel_bf16_conversion
-}
-}
-----
-
-If a SYCL device has the `ext_intel_bf16_conversion` aspect, then it natively
-supports conversion of values of `float` type to `bfloat16` and back.
-
-If the device doesn't have the aspect, objects of `bfloat16` class must not be
-used in the device code.
-
-**NOTE**: The `ext_intel_bf16_conversion` aspect is not yet supported.  The
-`bfloat16` class is currently supported only on Xe HP GPU.
-
-== New `bfloat16` class
-
-The `bfloat16` class below provides the conversion functionality. Conversion
-from `float` to `bfloat16` is done with round to nearest even(RTE) rounding
-mode.
-
-[source]
-----
-namespace sycl {
-namespace ext {
-namespace intel {
-namespace experimental {
-
-class bfloat16 {
-  using storage_t = uint16_t;
-  storage_t value;
-
-public:
-  bfloat16() = default;
-  bfloat16(const bfloat16 &) = default;
-  ~bfloat16() = default;
-
-  // Explicit conversion functions
-  static storage_t from_float(const float &a);
-  static float to_float(const storage_t &a);
-
-  // Convert from float to bfloat16
-  bfloat16(const float &a);
-  bfloat16 &operator=(const float &a);
-
-  // Convert from bfloat16 to float
-  operator float() const;
-
-  // Get bfloat16 as uint16.
-  operator storage_t() const;
-
-  // Convert to bool type
-  explicit operator bool();
-
-  friend bfloat16 operator-(bfloat16 &bf) { /* ... */ }
-
-  // OP is: prefix ++, --
-  friend bfloat16 &operatorOP(bfloat16 &bf) { /* ... */ }
-
-  // OP is: postfix ++, --
-  friend bfloat16 operatorOP(bfloat16 &bf, int) { /* ... */ }
-
-  // OP is: +=, -=, *=, /=
-  friend bfloat16 &operatorOP(bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ }
-
-  // OP is +, -, *, /
-  friend bfloat16 operatorOP(const bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ }
-  template <typename T>
-  friend bfloat16 operatorOP(const bfloat16 &lhs, const T &rhs) { /* ... */ }
-  template <typename T>
-  friend bfloat16 operatorOP(const T &lhs, const bfloat16 &rhs) { /* ... */ }
-
-  // OP is ==,!=, <, >, <=, >=
-  friend bool operatorOP(const bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ }
-  template <typename T>
-  friend bool operatorOP(const bfloat16 &lhs, const T &rhs) { /* ... */ }
-  template <typename T>
-  friend bool operatorOP(const T &lhs, const bfloat16 &rhs) { /* ... */ }
-};
-
-} // namespace experimental
-} // namespace intel
-} // namespace ext
-} // namespace sycl
-----
-
-Table 1. Member functions of `bfloat16` class.
-|===
-| Member Function | Description
-
-|  `static storage_t from_float(const float &a);`
-|  Explicitly convert from `float` to `bfloat16`.
-
-|  `static float to_float(const storage_t &a);`
-|  Interpret `a` as `bfloat16` and explicitly convert it to `float`.
-
-| `bfloat16(const float& a);`
-| Construct `bfloat16` from `float`. Converts `float` to `bfloat16`.
-
-| `bfloat16 &operator=(const float &a);`
-| Replace the value with `a` converted to `bfloat16`
-
-| `operator float() const;`
-|  Return `bfloat16` value converted to `float`.
-
-| `operator storage_t() const;`
-| Return `uint16_t` value, whose bits represent `bfloat16` value.
-
-| `explicit operator bool() { /* ... */ }`
-| Convert `bfloat16` to `bool` type. Return `false` if the value equals to
-  zero, return `true` otherwise.
-
-| `friend bfloat16 operator-(bfloat16 &bf) { /* ... */ }`
-| Construct new instance of `bfloat16` class with negated value of the `bf`.
-
-| `friend bfloat16 &operatorOP(bfloat16 &bf) { /* ... */ }`
-| Perform an in-place `OP` prefix arithmetic operation on the `bf`,
-  assigning the result to the `bf` and return the `bf`.
-
-  OP is: `++, --`
-
-| `friend bfloat16 operatorOP(bfloat16 &bf, int) { /* ... */ }`
-| Perform an in-place `OP` postfix arithmetic operation on `bf`, assigning
-  the result to the `bf` and return a copy of `bf` before the operation is
-  performed.
-
-  OP is: `++, --`
-
-| `friend bfloat16 operatorOP(const bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ }`
-| Perform an in-place `OP` arithmetic operation between the `lhs` and the `rhs`
-  and return the `lhs`.
-
-  OP is: `+=, -=, *=, /=`
-
-| `friend type operatorOP(const bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ }`
-| Construct a new instance of the `bfloat16` class with the value of the new
-  `bfloat16` instance being the result of an OP arithmetic operation between
-  the `lhs` `bfloat16` and `rhs` `bfloat16` values.
-
-  OP is `+, -, *, /`
-
-| `template <typename T>
-  friend bfloat16 operatorOP(const bfloat16 &lhs, const T &rhs) { /* ... */ }`
-| Construct a new instance of the `bfloat16` class with the value of the new
-  `bfloat16` instance being the result of an OP arithmetic operation between
-  the `lhs` `bfloat16` value and `rhs` of template type `T`. Type `T` must be
-  convertible to `float`.
-
-  OP is `+, -, *, /`
-
-| `template <typename T>
-  friend bfloat16 operatorOP(const T &lhs, const bfloat16 &rhs) { /* ... */ }`
-| Construct a new instance of the `bfloat16` class with the value of the new
-  `bfloat16` instance being the result of an OP arithmetic operation between
-  the `lhs` of template type `T` and `rhs` `bfloat16` value. Type `T` must be
-  convertible to `float`.
-
-  OP is `+, -, *, /`
-
-| `friend bool operatorOP(const bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ }`
-| Perform comparison operation OP between `lhs` `bfloat16` and `rhs` `bfloat16`
-  values and return the result as a boolean value.
-
-OP is `==, !=, <, >, <=, >=`
-
-| `template <typename T>
-  friend bool operatorOP(const bfloat16 &lhs, const T &rhs) { /* ... */ }`
-| Perform comparison operation OP between `lhs` `bfloat16` and `rhs` of
-  template type `T` and return the result as a boolean value. Type `T` must be
-  convertible to `float`.
-
-OP is `==, !=, <, >, <=, >=`
-
-| `template <typename T>
-  friend bool operatorOP(const T &lhs, const bfloat16 &rhs) { /* ... */ }`
-| Perform comparison operation OP between `lhs` of template type `T` and `rhs`
-  `bfloat16` value and return the result as a boolean value. Type `T` must be
-  convertible to `float`.
-
-OP is `==, !=, <, >, <=, >=`
-|===
-
-== Example
-
-[source]
-----
-#include <sycl/sycl.hpp>
-#include <sycl/ext/intel/experimental/bfloat16.hpp>
-
-using sycl::ext::intel::experimental::bfloat16;
-
-bfloat16 operator+(const bfloat16 &lhs, const bfloat16 &rhs) {
-  return static_cast<float>(lhs) + static_cast<float>(rhs);
-}
-
-float foo(float a, float b) {
-  // Convert from float to bfloat16.
-  bfloat16 A {a};
-  bfloat16 B {b};
-
-  // Convert A and B from bfloat16 to float, do addition on floating-pointer
-  // numbers, then convert the result to bfloat16 and store it in C.
-  bfloat16 C = A + B;
-
-  // Return the result converted from bfloat16 to float.
-  return C;
-}
-
-int main (int argc, char *argv[]) {
-  float data[3] = {7.0, 8.1, 0.0};
-  sycl::device dev;
-  sycl::queue deviceQueue{dev};
-  sycl::buffer<float, 1> buf {data, sycl::range<1> {3}};
-
-  if (dev.has(sycl::aspect::ext_intel_bf16_conversion)) {
-    deviceQueue.submit ([&] (sycl::handler& cgh) {
-      auto numbers = buf.get_access<sycl::access::mode::read_write> (cgh);
-      cgh.single_task<class simple_kernel> ([=] () {
-        numbers[2] = foo(numbers[0], numbers[1]);
-      });
-    });
-  }
-  return 0;
-}
-----
-
-== Issues
-
-None.
-
-== Revision History
-
-[cols="5,15,15,70"]
-[grid="rows"]
-[options="header"]
-|========================================
-|Rev|Date|Author|Changes
-|1|2021-08-02|Alexey Sotkin |Initial public working draft
-|2|2021-08-17|Alexey Sotkin |Add explicit conversion functions +
-                             Add operator overloadings +
-                             Apply code review suggestions
-|3|2021-08-18|Alexey Sotkin |Remove `uint16_t` constructor
-|========================================
diff --git a/sycl/include/CL/sycl/builtins.hpp b/sycl/include/CL/sycl/builtins.hpp
index c6cd1c03f87d0..c268009216ddc 100644
--- a/sycl/include/CL/sycl/builtins.hpp
+++ b/sycl/include/CL/sycl/builtins.hpp
@@ -1573,73 +1573,73 @@ detail::enable_if_t<detail::is_genfloatf<T>::value, T> tan(T x) __NOEXC {
 /* ----------------- -ffast-math functions. ---------------------------------*/
 // genfloatf cos (genfloatf x)
 template <typename T>
-detail::enable_if_t<detail::is_genfloatf<T>::value, T> cos(T x) __NOEXC {
+detail::enable_if_t<detail::is_genfloat<T>::value, T> cos(T x) __NOEXC {
   return native::cos(x);
 }
 
 // genfloatf exp (genfloatf x)
 template <typename T>
-detail::enable_if_t<detail::is_genfloatf<T>::value, T> exp(T x) __NOEXC {
+detail::enable_if_t<detail::is_genfloat<T>::value, T> exp(T x) __NOEXC {
   return native::exp(x);
 }
 
 // genfloatf exp2 (genfloatf x)
 template <typename T>
-detail::enable_if_t<detail::is_genfloatf<T>::value, T> exp2(T x) __NOEXC {
+detail::enable_if_t<detail::is_genfloat<T>::value, T> exp2(T x) __NOEXC {
   return native::exp2(x);
 }
 
 // genfloatf exp10 (genfloatf x)
 template <typename T>
-detail::enable_if_t<detail::is_genfloatf<T>::value, T> exp10(T x) __NOEXC {
+detail::enable_if_t<detail::is_genfloat<T>::value, T> exp10(T x) __NOEXC {
   return native::exp10(x);
 }
 
 // genfloatf log(genfloatf x)
 template <typename T>
-detail::enable_if_t<detail::is_genfloatf<T>::value, T> log(T x) __NOEXC {
+detail::enable_if_t<detail::is_genfloat<T>::value, T> log(T x) __NOEXC {
   return native::log(x);
 }
 
 // genfloatf log2 (genfloatf x)
 template <typename T>
-detail::enable_if_t<detail::is_genfloatf<T>::value, T> log2(T x) __NOEXC {
+detail::enable_if_t<detail::is_genfloat<T>::value, T> log2(T x) __NOEXC {
   return native::log2(x);
 }
 
 // genfloatf log10 (genfloatf x)
 template <typename T>
-detail::enable_if_t<detail::is_genfloatf<T>::value, T> log10(T x) __NOEXC {
+detail::enable_if_t<detail::is_genfloat<T>::value, T> log10(T x) __NOEXC {
   return native::log10(x);
 }
 
 // genfloatf powr (genfloatf x)
 template <typename T>
-detail::enable_if_t<detail::is_genfloatf<T>::value, T> powr(T x, T y) __NOEXC {
+detail::enable_if_t<detail::is_genfloat<T>::value, T> powr(T x, T y) __NOEXC {
   return native::powr(x, y);
 }
 
 // genfloatf rsqrt (genfloatf x)
 template <typename T>
-detail::enable_if_t<detail::is_genfloatf<T>::value, T> rsqrt(T x) __NOEXC {
+detail::enable_if_t<detail::is_genfloat<T>::value, T> rsqrt(T x) __NOEXC {
   return native::rsqrt(x);
 }
 
 // genfloatf sin (genfloatf x)
 template <typename T>
-detail::enable_if_t<detail::is_genfloatf<T>::value, T> sin(T x) __NOEXC {
+detail::enable_if_t<detail::is_genfloat<T>::value, T> sin(T x) __NOEXC {
   return native::sin(x);
 }
 
 // genfloatf sqrt (genfloatf x)
 template <typename T>
-detail::enable_if_t<detail::is_genfloatf<T>::value, T> sqrt(T x) __NOEXC {
+detail::enable_if_t<detail::is_genfloat<T>::value, T> sqrt(T x) __NOEXC {
   return native::sqrt(x);
 }
 
 // genfloatf tan (genfloatf x)
 template <typename T>
-detail::enable_if_t<detail::is_genfloatf<T>::value, T> tan(T x) __NOEXC {
+detail::enable_if_t<detail::is_genfloat<T>::value, T> tan(T x) __NOEXC {
   return native::tan(x);
 }
 
diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h
index 0736a88f1ec66..adf18437ed330 100644
--- a/sycl/include/CL/sycl/detail/pi.h
+++ b/sycl/include/CL/sycl/detail/pi.h
@@ -317,7 +317,7 @@ typedef enum {
   PI_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES = 0x11000,
   PI_DEVICE_INFO_GPU_HW_THREADS_PER_EU = 0x10112,
   PI_DEVICE_INFO_BACKEND_VERSION = 0x10113,
-  PI_DEVICE_INFO_BF16 = 0x12000,
+  PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16 = 0x12000,
   PI_EXT_ONEAPI_DEVICE_INFO_MAX_GLOBAL_WORK_GROUPS = 0x20000,
   PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_1D = 0x20001,
   PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_2D = 0x20002,
diff --git a/sycl/include/CL/sycl/info/info_desc.hpp b/sycl/include/CL/sycl/info/info_desc.hpp
index 513aa94f52913..8b0bc8be5a229 100644
--- a/sycl/include/CL/sycl/info/info_desc.hpp
+++ b/sycl/include/CL/sycl/info/info_desc.hpp
@@ -175,7 +175,7 @@ enum class device : cl_device_info {
   ext_oneapi_max_work_groups_3d = PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_3D,
   atomic_memory_scope_capabilities =
       PI_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES,
-  ext_oneapi_bfloat16 = PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16
+  ext_oneapi_bfloat16 = PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16,
 };
 
 enum class device_type : pi_uint64 {
diff --git a/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp b/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp
index 77878e6d6e5e1..dbb4dde1ada59 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp
@@ -29,16 +29,29 @@ class [[sycl_detail::uses_aspects(ext_oneapi_bfloat16)]] bfloat16 {
   // Explicit conversion functions
   static storage_t from_float(const float &a) {
 #if defined(__SYCL_DEVICE_ONLY__)
+#if defined(__NVPTX__)
+    return __nvvm_f2bf16_rn(a);
+#else
     return __spirv_ConvertFToBF16INTEL(a);
+#endif
 #else
+    (void)a;
     throw exception{errc::feature_not_supported,
                     "Bfloat16 conversion is not supported on host device"};
 #endif
   }
   static float to_float(const storage_t &a) {
 #if defined(__SYCL_DEVICE_ONLY__)
+#if defined(__NVPTX__)
+    uint32_t y = a;
+    y = y << 16;
+    float *res = reinterpret_cast<float *>(&y);
+    return *res;
+#else
     return __spirv_ConvertBF16ToFINTEL(a);
+#endif
 #else
+    (void)a;
     throw exception{errc::feature_not_supported,
                     "Bfloat16 conversion is not supported on host device"};
 #endif
@@ -70,7 +83,17 @@ class [[sycl_detail::uses_aspects(ext_oneapi_bfloat16)]] bfloat16 {
 
   // Unary minus operator overloading
   friend bfloat16 operator-(bfloat16 &lhs) {
-    return bfloat16{-to_float(lhs.value)};
+#if defined(__SYCL_DEVICE_ONLY__)
+#if defined(__NVPTX__)
+    return from_bits(__nvvm_neg_bf16(lhs.value));
+#else
+    return bfloat16{-__spirv_ConvertBF16ToFINTEL(lhs.value)};
+#endif
+#else
+    (void)lhs;
+    throw exception{errc::feature_not_supported,
+                    "Bfloat16 unary minus is not supported on host device"};
+#endif
   }
 
 // Increment and decrement operators overloading
diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp
index eb083a691c462..d5ebba9a76ea5 100644
--- a/sycl/plugins/opencl/pi_opencl.cpp
+++ b/sycl/plugins/opencl/pi_opencl.cpp
@@ -230,7 +230,7 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName,
     std::memcpy(paramValue, &result, sizeof(cl_bool));
     return PI_SUCCESS;
   }
-  case PI_DEVICE_INFO_BF16:
+  case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16:
     return PI_INVALID_VALUE;
   case PI_DEVICE_INFO_IMAGE_SRGB: {
     cl_bool result = true;

From abeca3f2ea1397763c8c68ef7c3c78abb6e6bac6 Mon Sep 17 00:00:00 2001
From: "aidan.belton" <aidan.belton@codeplay.com>
Date: Mon, 16 May 2022 14:15:35 +0100
Subject: [PATCH 6/9] Fix werror

---
 sycl/include/CL/sycl/builtins.hpp             | 24 +++++++++----------
 .../sycl/ext/oneapi/experimental/bfloat16.hpp |  2 +-
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/sycl/include/CL/sycl/builtins.hpp b/sycl/include/CL/sycl/builtins.hpp
index c268009216ddc..c6cd1c03f87d0 100644
--- a/sycl/include/CL/sycl/builtins.hpp
+++ b/sycl/include/CL/sycl/builtins.hpp
@@ -1573,73 +1573,73 @@ detail::enable_if_t<detail::is_genfloatf<T>::value, T> tan(T x) __NOEXC {
 /* ----------------- -ffast-math functions. ---------------------------------*/
 // genfloatf cos (genfloatf x)
 template <typename T>
-detail::enable_if_t<detail::is_genfloat<T>::value, T> cos(T x) __NOEXC {
+detail::enable_if_t<detail::is_genfloatf<T>::value, T> cos(T x) __NOEXC {
   return native::cos(x);
 }
 
 // genfloatf exp (genfloatf x)
 template <typename T>
-detail::enable_if_t<detail::is_genfloat<T>::value, T> exp(T x) __NOEXC {
+detail::enable_if_t<detail::is_genfloatf<T>::value, T> exp(T x) __NOEXC {
   return native::exp(x);
 }
 
 // genfloatf exp2 (genfloatf x)
 template <typename T>
-detail::enable_if_t<detail::is_genfloat<T>::value, T> exp2(T x) __NOEXC {
+detail::enable_if_t<detail::is_genfloatf<T>::value, T> exp2(T x) __NOEXC {
   return native::exp2(x);
 }
 
 // genfloatf exp10 (genfloatf x)
 template <typename T>
-detail::enable_if_t<detail::is_genfloat<T>::value, T> exp10(T x) __NOEXC {
+detail::enable_if_t<detail::is_genfloatf<T>::value, T> exp10(T x) __NOEXC {
   return native::exp10(x);
 }
 
 // genfloatf log(genfloatf x)
 template <typename T>
-detail::enable_if_t<detail::is_genfloat<T>::value, T> log(T x) __NOEXC {
+detail::enable_if_t<detail::is_genfloatf<T>::value, T> log(T x) __NOEXC {
   return native::log(x);
 }
 
 // genfloatf log2 (genfloatf x)
 template <typename T>
-detail::enable_if_t<detail::is_genfloat<T>::value, T> log2(T x) __NOEXC {
+detail::enable_if_t<detail::is_genfloatf<T>::value, T> log2(T x) __NOEXC {
   return native::log2(x);
 }
 
 // genfloatf log10 (genfloatf x)
 template <typename T>
-detail::enable_if_t<detail::is_genfloat<T>::value, T> log10(T x) __NOEXC {
+detail::enable_if_t<detail::is_genfloatf<T>::value, T> log10(T x) __NOEXC {
   return native::log10(x);
 }
 
 // genfloatf powr (genfloatf x)
 template <typename T>
-detail::enable_if_t<detail::is_genfloat<T>::value, T> powr(T x, T y) __NOEXC {
+detail::enable_if_t<detail::is_genfloatf<T>::value, T> powr(T x, T y) __NOEXC {
   return native::powr(x, y);
 }
 
 // genfloatf rsqrt (genfloatf x)
 template <typename T>
-detail::enable_if_t<detail::is_genfloat<T>::value, T> rsqrt(T x) __NOEXC {
+detail::enable_if_t<detail::is_genfloatf<T>::value, T> rsqrt(T x) __NOEXC {
   return native::rsqrt(x);
 }
 
 // genfloatf sin (genfloatf x)
 template <typename T>
-detail::enable_if_t<detail::is_genfloat<T>::value, T> sin(T x) __NOEXC {
+detail::enable_if_t<detail::is_genfloatf<T>::value, T> sin(T x) __NOEXC {
   return native::sin(x);
 }
 
 // genfloatf sqrt (genfloatf x)
 template <typename T>
-detail::enable_if_t<detail::is_genfloat<T>::value, T> sqrt(T x) __NOEXC {
+detail::enable_if_t<detail::is_genfloatf<T>::value, T> sqrt(T x) __NOEXC {
   return native::sqrt(x);
 }
 
 // genfloatf tan (genfloatf x)
 template <typename T>
-detail::enable_if_t<detail::is_genfloat<T>::value, T> tan(T x) __NOEXC {
+detail::enable_if_t<detail::is_genfloatf<T>::value, T> tan(T x) __NOEXC {
   return native::tan(x);
 }
 
diff --git a/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp b/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp
index dbb4dde1ada59..39bad8d1ecc35 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp
@@ -17,7 +17,7 @@ namespace ext {
 namespace oneapi {
 namespace experimental {
 
-class [[sycl_detail::uses_aspects(ext_oneapi_bfloat16)]] bfloat16 {
+class bfloat16 {
   using storage_t = uint16_t;
   storage_t value;
 

From cf60751762acc19a84f61d9ddbfec7e774e9990a Mon Sep 17 00:00:00 2001
From: "aidan.belton" <aidan.belton@codeplay.com>
Date: Tue, 17 May 2022 11:17:44 +0100
Subject: [PATCH 7/9] Add device info comment and update value

---
 sycl/include/CL/sycl/detail/pi.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h
index adf18437ed330..6780f49f108bc 100644
--- a/sycl/include/CL/sycl/detail/pi.h
+++ b/sycl/include/CL/sycl/detail/pi.h
@@ -317,7 +317,8 @@ typedef enum {
   PI_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES = 0x11000,
   PI_DEVICE_INFO_GPU_HW_THREADS_PER_EU = 0x10112,
   PI_DEVICE_INFO_BACKEND_VERSION = 0x10113,
-  PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16 = 0x12000,
+  // Return true if bfloat16 data type is supported by device
+  PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16 = 0x1FFFF,
   PI_EXT_ONEAPI_DEVICE_INFO_MAX_GLOBAL_WORK_GROUPS = 0x20000,
   PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_1D = 0x20001,
   PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_2D = 0x20002,

From 70fcad298a9e47685c79bf8719012020670eedc2 Mon Sep 17 00:00:00 2001
From: "aidan.belton" <aidan.belton@codeplay.com>
Date: Tue, 17 May 2022 11:52:14 +0100
Subject: [PATCH 8/9] Update symbols dump

---
 sycl/test/abi/sycl_symbols_linux.dump | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump
index ab8697011d01a..6a55d4427de28 100644
--- a/sycl/test/abi/sycl_symbols_linux.dump
+++ b/sycl/test/abi/sycl_symbols_linux.dump
@@ -4297,7 +4297,7 @@ _ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65809EEENS3_12param_traitsIS4_XT_
 _ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65810EEENS3_12param_traitsIS4_XT_EE11return_typeEv
 _ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65811EEENS3_12param_traitsIS4_XT_EE11return_typeEv
 _ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE69632EEENS3_12param_traitsIS4_XT_EE11return_typeEv
-_ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE73728EEENS3_12param_traitsIS4_XT_EE11return_typeEv
+_ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE131071EEENS3_12param_traitsIS4_XT_EE11return_typeEv
 _ZNK2cl4sycl6device9getNativeEv
 _ZNK2cl4sycl6kernel11get_backendEv
 _ZNK2cl4sycl6kernel11get_contextEv

From 43e26d9375ff7bb009d902113ee1f67d13f406e5 Mon Sep 17 00:00:00 2001
From: "aidan.belton" <aidan.belton@codeplay.com>
Date: Mon, 13 Jun 2022 09:36:38 +0100
Subject: [PATCH 9/9] Fix PI_INVALID_VALUE error

---
 sycl/plugins/opencl/pi_opencl.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp
index 3558747b1cb37..e75a7d12c5a9f 100644
--- a/sycl/plugins/opencl/pi_opencl.cpp
+++ b/sycl/plugins/opencl/pi_opencl.cpp
@@ -231,7 +231,7 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName,
     return PI_SUCCESS;
   }
   case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16:
-    return PI_INVALID_VALUE;
+    return PI_ERROR_INVALID_VALUE;
   case PI_DEVICE_INFO_IMAGE_SRGB: {
     cl_bool result = true;
     std::memcpy(paramValue, &result, sizeof(cl_bool));