From 6014cef1cda24c40c9bca217c2fa31c0e3e18ca1 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Wed, 3 Aug 2022 15:09:01 -0700
Subject: [PATCH 01/63] [SYCL] Move bfloat16 support from experimental to
 supported.

Signed-off-by: Rajiv Deodhar <rajiv.deodhar@intel.com>
---
 .../sycl_ext_oneapi_bfloat16.asciidoc         | 15 ++--
 .../oneapi/{experimental => }/bfloat16.hpp    |  2 -
 .../sycl/ext/oneapi/experimental/builtins.hpp |  2 +-
 .../sycl/ext/oneapi/matrix/matrix-jit.hpp     | 80 +++++++++----------
 sycl/test/extensions/bfloat16.cpp             |  4 +-
 5 files changed, 47 insertions(+), 56 deletions(-)
 rename sycl/doc/extensions/{experimental => supported}/sycl_ext_oneapi_bfloat16.asciidoc (97%)
 rename sycl/include/sycl/ext/oneapi/{experimental => }/bfloat16.hpp (99%)

diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16.asciidoc b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
similarity index 97%
rename from sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16.asciidoc
rename to sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
index bec08876ed084..8bc01a56077d7 100644
--- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16.asciidoc
+++ b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
@@ -135,13 +135,11 @@ public:
   bfloat16(const float &a);
   bfloat16 &operator=(const float &a);
 
-  // Convert from bfloat16 to float
+  // Convert bfloat16 to floating-point types
   operator float() const;
+  operator sycl::half() const;
 
-  // Get bfloat16 as uint16.
-  operator storage_t() const;
-
-  // Convert to bool type
+  // Convert bfloat16 to bool type
   explicit operator bool();
 
   friend bfloat16 operator-(bfloat16 &bf) { /* ... */ }
@@ -195,11 +193,11 @@ Table 1. Member functions of `bfloat16` class.
 | `operator float() const;`
 |  Return `bfloat16` value converted to `float`.
 
-| `operator storage_t() const;`
-| Return `uint16_t` value, whose bits represent `bfloat16` value.
+| `operator sycl::half() const;`
+| Return `bfloat16` value converted to `sycl::half`.
 
 | `explicit operator bool() { /* ... */ }`
-| Convert `bfloat16` to `bool` type. Return `false` if the value equals to
+| Convert `bfloat16` to `bool` type. Return `false` if the `value` equals to
   zero, return `true` otherwise.
 
 | `friend bfloat16 operator-(bfloat16 &bf) { /* ... */ }`
@@ -408,4 +406,5 @@ Compute absolute value of a `bfloat16`.
 |3|2021-08-18|Alexey Sotkin |Remove `uint16_t` constructor
 |4|2022-03-07|Aidan Belton and Jack Kirk |Switch from Intel vendor specific to oneapi
 |5|2022-04-05|Jack Kirk | Added section for bfloat16 math builtins
+|6|2022-08-03|Alexey Sotkin |Add `operator sycl::half()`
 |========================================
diff --git a/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp b/sycl/include/sycl/ext/oneapi/bfloat16.hpp
similarity index 99%
rename from sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp
rename to sycl/include/sycl/ext/oneapi/bfloat16.hpp
index 533976f36b890..033884b41a824 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp
+++ b/sycl/include/sycl/ext/oneapi/bfloat16.hpp
@@ -15,7 +15,6 @@ __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
 namespace ext {
 namespace oneapi {
-namespace experimental {
 
 class bfloat16 {
   using storage_t = uint16_t;
@@ -165,7 +164,6 @@ class bfloat16 {
   // for floating-point types.
 };
 
-} // namespace experimental
 } // namespace oneapi
 } // namespace ext
 
diff --git a/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp b/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp
index 4de7ebb92dec7..3bae87470e7b6 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp
@@ -15,7 +15,7 @@
 #include <sycl/detail/type_traits.hpp>
 
 #include <CL/__spirv/spirv_ops.hpp>
-#include <sycl/ext/oneapi/experimental/bfloat16.hpp>
+#include <sycl/ext/oneapi/bfloat16.hpp>
 
 // TODO Decide whether to mark functions with this attribute.
 #define __NOEXC /*noexcept*/
diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-jit.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-jit.hpp
index b287419843d7a..58aacf9c7c28a 100644
--- a/sycl/include/sycl/ext/oneapi/matrix/matrix-jit.hpp
+++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-jit.hpp
@@ -10,7 +10,7 @@
 
 #include <CL/__spirv/spirv_ops.hpp>
 #include <sycl/detail/defines_elementary.hpp>
-#include <sycl/ext/oneapi/experimental/bfloat16.hpp>
+#include <sycl/ext/oneapi/bfloat16.hpp>
 #include <sycl/feature_test.hpp>
 
 __SYCL_INLINE_NAMESPACE(cl) {
@@ -458,18 +458,16 @@ class wi_element<uint16_t, NumRows, NumCols, Layout, Group> {
 };
 
 template <size_t NumRows, size_t NumCols, matrix_layout Layout, typename Group>
-class wi_element<sycl::ext::oneapi::experimental::bfloat16, NumRows, NumCols,
-                 Layout, Group> {
-  joint_matrix<sycl::ext::oneapi::experimental::bfloat16, NumRows, NumCols,
-               Layout, Group> &M;
+class wi_element<sycl::ext::oneapi::bfloat16, NumRows, NumCols, Layout, Group> {
+  joint_matrix<sycl::ext::oneapi::bfloat16, NumRows, NumCols, Layout, Group> &M;
   std::size_t idx;
 
 public:
-  wi_element(joint_matrix<sycl::ext::oneapi::experimental::bfloat16, NumRows,
-                          NumCols, Layout, Group> &Mat,
+  wi_element(joint_matrix<sycl::ext::oneapi::bfloat16, NumRows, NumCols, Layout,
+                          Group> &Mat,
              std::size_t i)
       : M(Mat), idx(i) {}
-  operator sycl::ext::oneapi::experimental::bfloat16() {
+  operator sycl::ext::oneapi::bfloat16() {
 #ifdef __SYCL_DEVICE_ONLY__
     return __spirv_VectorExtractDynamic(M.spvm, idx);
 #else
@@ -488,7 +486,7 @@ class wi_element<sycl::ext::oneapi::experimental::bfloat16, NumRows, NumCols,
 #endif // __SYCL_DEVICE_ONLY__
   }
 
-  wi_element &operator=(const sycl::ext::oneapi::experimental::bfloat16 &rhs) {
+  wi_element &operator=(const sycl::ext::oneapi::bfloat16 &rhs) {
 #ifdef __SYCL_DEVICE_ONLY__
     M.spvm = __spirv_VectorInsertDynamic(M.spvm, rhs, idx);
     return *this;
@@ -499,9 +497,8 @@ class wi_element<sycl::ext::oneapi::experimental::bfloat16, NumRows, NumCols,
 #endif // __SYCL_DEVICE_ONLY__
   }
 
-  wi_element &
-  operator=(const wi_element<sycl::ext::oneapi::experimental::bfloat16, NumRows,
-                             NumCols, Layout, Group> &rhs) {
+  wi_element &operator=(const wi_element<sycl::ext::oneapi::bfloat16, NumRows,
+                                         NumCols, Layout, Group> &rhs) {
 #ifdef __SYCL_DEVICE_ONLY__
     M.spvm = __spirv_VectorInsertDynamic(
         M.spvm, __spirv_VectorExtractDynamic(rhs.M.spvm, rhs.idx), idx);
@@ -515,16 +512,14 @@ class wi_element<sycl::ext::oneapi::experimental::bfloat16, NumRows, NumCols,
 
 #if __SYCL_DEVICE_ONLY__
 #define OP(opassign, op)                                                       \
-  wi_element &operator opassign(                                               \
-      const sycl::ext::oneapi::experimental::bfloat16 &rhs) {                  \
+  wi_element &operator opassign(const sycl::ext::oneapi::bfloat16 &rhs) {      \
     M.spvm = __spirv_VectorInsertDynamic(                                      \
         M.spvm, __spirv_VectorExtractDynamic(M.spvm, idx) op rhs, idx);        \
     return *this;                                                              \
   }
 #else // __SYCL_DEVICE_ONLY__
 #define OP(opassign, op)                                                       \
-  wi_element &operator opassign(                                               \
-      const sycl::ext::oneapi::experimental::bfloat16 &rhs) {                  \
+  wi_element &operator opassign(const sycl::ext::oneapi::bfloat16 &rhs) {      \
     (void)rhs;                                                                 \
     throw runtime_error("joint matrix is not supported on host device.",       \
                         PI_ERROR_INVALID_DEVICE);                              \
@@ -539,34 +534,34 @@ class wi_element<sycl::ext::oneapi::experimental::bfloat16, NumRows, NumCols,
 #if __SYCL_DEVICE_ONLY__
 #define OP(type, op)                                                           \
   friend type operator op(                                                     \
-      const wi_element<sycl::ext::oneapi::experimental::bfloat16, NumRows,     \
-                       NumCols, Layout, Group> &lhs,                           \
-      const sycl::ext::oneapi::experimental::bfloat16 &rhs) {                  \
+      const wi_element<sycl::ext::oneapi::bfloat16, NumRows, NumCols, Layout,  \
+                       Group> &lhs,                                            \
+      const sycl::ext::oneapi::bfloat16 &rhs) {                                \
     return __spirv_VectorExtractDynamic(lhs.M.spvm, lhs.idx) op rhs;           \
   }                                                                            \
   friend type operator op(                                                     \
-      const sycl::ext::oneapi::experimental::bfloat16 &lhs,                    \
-      const wi_element<sycl::ext::oneapi::experimental::bfloat16, NumRows,     \
-                       NumCols, Layout, Group> &rhs) {                         \
+      const sycl::ext::oneapi::bfloat16 &lhs,                                  \
+      const wi_element<sycl::ext::oneapi::bfloat16, NumRows, NumCols, Layout,  \
+                       Group> &rhs) {                                          \
     return __spirv_VectorExtractDynamic(rhs.M.spvm, rhs.idx) op lhs;           \
   }
-  OP(sycl::ext::oneapi::experimental::bfloat16, +)
-  OP(sycl::ext::oneapi::experimental::bfloat16, -)
-  OP(sycl::ext::oneapi::experimental::bfloat16, *)
-  OP(sycl::ext::oneapi::experimental::bfloat16, /)
+  OP(sycl::ext::oneapi::bfloat16, +)
+  OP(sycl::ext::oneapi::bfloat16, -)
+  OP(sycl::ext::oneapi::bfloat16, *)
+  OP(sycl::ext::oneapi::bfloat16, /)
 #undef OP
 #define OP(type, op)                                                           \
   friend type operator op(                                                     \
-      const wi_element<sycl::ext::oneapi::experimental::bfloat16, NumRows,     \
-                       NumCols, Layout, Group> &lhs,                           \
-      const sycl::ext::oneapi::experimental::bfloat16 &rhs) {                  \
+      const wi_element<sycl::ext::oneapi::bfloat16, NumRows, NumCols, Layout,  \
+                       Group> &lhs,                                            \
+      const sycl::ext::oneapi::bfloat16 &rhs) {                                \
     return type{static_cast<float>(__spirv_VectorExtractDynamic(               \
         lhs.M.spvm, lhs.idx)) op static_cast<float>(rhs)};                     \
   }                                                                            \
   friend type operator op(                                                     \
-      const sycl::ext::oneapi::experimental::bfloat16 &lhs,                    \
-      const wi_element<sycl::ext::oneapi::experimental::bfloat16, NumRows,     \
-                       NumCols, Layout, Group> &rhs) {                         \
+      const sycl::ext::oneapi::bfloat16 &lhs,                                  \
+      const wi_element<sycl::ext::oneapi::bfloat16, NumRows, NumCols, Layout,  \
+                       Group> &rhs) {                                          \
     return type{static_cast<float>(__spirv_VectorExtractDynamic(               \
         rhs.M.spvm, rhs.idx)) op static_cast<float>(lhs)};                     \
   }
@@ -579,24 +574,23 @@ class wi_element<sycl::ext::oneapi::experimental::bfloat16, NumRows, NumCols,
 #undef OP
 #else // __SYCL_DEVICE_ONLY__
 #define OP(type, op)                                                           \
-  friend type operator op(                                                     \
-      const wi_element<sycl::ext::oneapi::experimental::bfloat16, NumRows,     \
-                       NumCols, Layout, Group> &,                              \
-      const sycl::ext::oneapi::experimental::bfloat16 &) {                     \
+  friend type operator op(const wi_element<sycl::ext::oneapi::bfloat16,        \
+                                           NumRows, NumCols, Layout, Group> &, \
+                          const sycl::ext::oneapi::bfloat16 &) {               \
     throw runtime_error("joint matrix is not supported on host device.",       \
                         PI_ERROR_INVALID_DEVICE);                              \
   }                                                                            \
   friend type operator op(                                                     \
-      const sycl::ext::oneapi::experimental::bfloat16 &,                       \
-      const wi_element<sycl::ext::oneapi::experimental::bfloat16, NumRows,     \
-                       NumCols, Layout, Group> &) {                            \
+      const sycl::ext::oneapi::bfloat16 &,                                     \
+      const wi_element<sycl::ext::oneapi::bfloat16, NumRows, NumCols, Layout,  \
+                       Group> &) {                                             \
     throw runtime_error("joint matrix is not supported on host device.",       \
                         PI_ERROR_INVALID_DEVICE);                              \
   }
-  OP(sycl::ext::oneapi::experimental::bfloat16, +)
-  OP(sycl::ext::oneapi::experimental::bfloat16, -)
-  OP(sycl::ext::oneapi::experimental::bfloat16, *)
-  OP(sycl::ext::oneapi::experimental::bfloat16, /)
+  OP(sycl::ext::oneapi::bfloat16, +)
+  OP(sycl::ext::oneapi::bfloat16, -)
+  OP(sycl::ext::oneapi::bfloat16, *)
+  OP(sycl::ext::oneapi::bfloat16, /)
   OP(bool, ==)
   OP(bool, !=)
   OP(bool, <)
diff --git a/sycl/test/extensions/bfloat16.cpp b/sycl/test/extensions/bfloat16.cpp
index 3666b32177116..63eecc8b30534 100644
--- a/sycl/test/extensions/bfloat16.cpp
+++ b/sycl/test/extensions/bfloat16.cpp
@@ -2,10 +2,10 @@
 
 // UNSUPPORTED: cuda || hip_amd
 
-#include <sycl/ext/oneapi/experimental/bfloat16.hpp>
+#include <sycl/ext/oneapi/bfloat16.hpp>
 #include <sycl/sycl.hpp>
 
-using sycl::ext::oneapi::experimental::bfloat16;
+using sycl::ext::oneapi::bfloat16;
 
 SYCL_EXTERNAL uint16_t some_bf16_intrinsic(uint16_t x, uint16_t y);
 SYCL_EXTERNAL void foo(long x, sycl::half y);

From bdd88e50f0165e339f85c9b134d49972de079370 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Wed, 3 Aug 2022 16:10:25 -0700
Subject: [PATCH 02/63] Corrections to tests.

---
 .../ext/oneapi/matrix/matrix-tensorcore.hpp     | 17 ++++++++---------
 .../matrix/matrix-nvptx-bfloat16-test.cpp       |  2 +-
 sycl/test/matrix/matrix-bfloat16-test.cpp       |  2 +-
 3 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp
index cf53bec8f943c..2c2cac78c6f14 100644
--- a/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp
+++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp
@@ -7,7 +7,7 @@
 // ===--------------------------------------------------------------------=== //
 
 #pragma once
-#include <sycl/ext/oneapi/experimental/bfloat16.hpp>
+#include <sycl/ext/oneapi/bfloat16.hpp>
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
@@ -219,8 +219,7 @@ struct joint_matrix_load_impl<
                 S, Use, NumRows, NumCols, Layout, sycl::sub_group> &res,
             multi_ptr<T, Space> src, size_t stride) {
     if constexpr (std::is_same<T, uint16_t>::value ||
-                  std::is_same<
-                      T, sycl::ext::oneapi::experimental::bfloat16>::value) {
+                  std::is_same<T, sycl::ext::oneapi::bfloat16>::value) {
       auto tileptr = reinterpret_cast<int32_t const *>(src.get());
       auto destptr = reinterpret_cast<int32_t *>(&res.wi_marray);
       if constexpr (NumRows == 16 && NumCols == 16) {
@@ -585,8 +584,8 @@ struct joint_matrix_mad_impl<
               get_layout_pair_id<LayoutA, LayoutB>(), 0);
         }
       } else if constexpr (std::is_same<T1, uint16_t>::value ||
-                           std::is_same<T1, sycl::ext::oneapi::experimental::
-                                                bfloat16>::value) {
+                           std::is_same<T1,
+                                        sycl::ext::oneapi::bfloat16>::value) {
         __mma_bf16_m16n16k16_mma_f32(
             reinterpret_cast<float *>(&D.wi_marray),
             reinterpret_cast<int32_t const *>(&A.wi_marray),
@@ -622,8 +621,8 @@ struct joint_matrix_mad_impl<
               get_layout_pair_id<LayoutA, LayoutB>(), 0);
         }
       } else if constexpr (std::is_same<T1, uint16_t>::value ||
-                           std::is_same<T1, sycl::ext::oneapi::experimental::
-                                                bfloat16>::value) {
+                           std::is_same<T1,
+                                        sycl::ext::oneapi::bfloat16>::value) {
         __mma_bf16_m8n32k16_mma_f32(
             reinterpret_cast<float *>(&D.wi_marray),
             reinterpret_cast<int32_t const *>(&A.wi_marray),
@@ -645,8 +644,8 @@ struct joint_matrix_mad_impl<
                                  get_layout_pair_id<LayoutA, LayoutB>(), 0);
         }
       } else if constexpr (std::is_same<T1, uint16_t>::value ||
-                           std::is_same<T1, sycl::ext::oneapi::experimental::
-                                                bfloat16>::value) {
+                           std::is_same<T1,
+                                        sycl::ext::oneapi::bfloat16>::value) {
         __mma_bf16_m32n8k16_mma_f32(
             reinterpret_cast<float *>(&D.wi_marray),
             reinterpret_cast<int32_t const *>(&A.wi_marray),
diff --git a/sycl/test/check_device_code/matrix/matrix-nvptx-bfloat16-test.cpp b/sycl/test/check_device_code/matrix/matrix-nvptx-bfloat16-test.cpp
index 73e3ed1b337e3..0fe27b1033d90 100644
--- a/sycl/test/check_device_code/matrix/matrix-nvptx-bfloat16-test.cpp
+++ b/sycl/test/check_device_code/matrix/matrix-nvptx-bfloat16-test.cpp
@@ -6,7 +6,7 @@
 
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
-using sycl::ext::oneapi::experimental::bfloat16;
+using sycl::ext::oneapi::bfloat16;
 
 constexpr int stride = 16;
 
diff --git a/sycl/test/matrix/matrix-bfloat16-test.cpp b/sycl/test/matrix/matrix-bfloat16-test.cpp
index 384714adef3d2..fffbf58c240f9 100644
--- a/sycl/test/matrix/matrix-bfloat16-test.cpp
+++ b/sycl/test/matrix/matrix-bfloat16-test.cpp
@@ -4,7 +4,7 @@
 #include <iostream>
 
 using namespace sycl::ext::oneapi::experimental::matrix;
-using bfloat16 = sycl::ext::oneapi::experimental::bfloat16;
+using bfloat16 = sycl::ext::oneapi::bfloat16;
 
 static constexpr auto TILE_SZ = 16;
 static constexpr auto TM = TILE_SZ - 1;

From 0fe18841176e963c3567cd0f51c3015c59b614b1 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Wed, 24 Aug 2022 15:09:26 -0700
Subject: [PATCH 03/63] Moved another file out of experimental space.

---
 sycl/include/sycl/ext/oneapi/{experimental => }/builtins.hpp | 0
 sycl/include/sycl/sycl.hpp                                   | 3 ++-
 2 files changed, 2 insertions(+), 1 deletion(-)
 rename sycl/include/sycl/ext/oneapi/{experimental => }/builtins.hpp (100%)

diff --git a/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp b/sycl/include/sycl/ext/oneapi/builtins.hpp
similarity index 100%
rename from sycl/include/sycl/ext/oneapi/experimental/builtins.hpp
rename to sycl/include/sycl/ext/oneapi/builtins.hpp
diff --git a/sycl/include/sycl/sycl.hpp b/sycl/include/sycl/sycl.hpp
index 587b9c6b34b78..faff26f9c2f3e 100644
--- a/sycl/include/sycl/sycl.hpp
+++ b/sycl/include/sycl/sycl.hpp
@@ -62,7 +62,8 @@
 #endif
 #include <sycl/ext/oneapi/device_global/device_global.hpp>
 #include <sycl/ext/oneapi/device_global/properties.hpp>
-#include <sycl/ext/oneapi/experimental/builtins.hpp>
+//#include <sycl/ext/oneapi/experimental/builtins.hpp>
+#include <sycl/ext/oneapi/builtins.hpp>
 #include <sycl/ext/oneapi/experimental/cuda/barrier.hpp>
 #include <sycl/ext/oneapi/filter_selector.hpp>
 #include <sycl/ext/oneapi/group_algorithm.hpp>

From feb9d5feb7d8e433162983b6746e442fe6f1503b Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Thu, 25 Aug 2022 14:52:59 -0700
Subject: [PATCH 04/63] Responses to review comments.

---
 .../sycl_ext_oneapi_bfloat16.asciidoc         | 142 +++++++++---------
 sycl/include/sycl/ext/oneapi/bfloat16.hpp     |   2 +
 2 files changed, 75 insertions(+), 69 deletions(-)

diff --git a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
index 8bc01a56077d7..78c76125837a5 100644
--- a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
+++ b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
@@ -22,13 +22,20 @@
 
 == Notice
 
-IMPORTANT: This specification is a draft.
+[%hardbreaks]
+Copyright (C) 2022-2022 Intel Corporation.  All rights reserved.
 
-Copyright (c) 2021-2022 Intel Corporation. All rights reserved.
+Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks
+of The Khronos Group Inc.  OpenCL(TM) is a trademark of Apple Inc. used by
+permission by Khronos.
+
+
+== Contact
+
+To report problems with this extension, please open a new issue at:
+
+https://github.com/intel/llvm/issues
 
-NOTE: Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are
-trademarks of The Khronos Group Inc.  OpenCL(TM) is a trademark of Apple Inc.
-used by permission by Khronos.
 
 == Dependencies
 
@@ -36,23 +43,21 @@ This extension is written against the SYCL 2020 specification, Revision 4.
 
 == Status
 
-Draft
-
-This is a preview extension specification, intended to provide early access to
-a feature for review and community feedback. When the feature matures, this
-specification may be released as a formal extension.
-
-Because the interfaces defined by this specification are not final and are
-subject to change they are not intended to be used by shipping software
-products.
+This extension is implemented and fully supported by DPC++.
+[NOTE]
+====
+This extension is currently implemented in {dpcpp} only for GPU devices that support bfloat16 natively. Attempting to use this extension in
+kernels that run on other devices may result in undefined behavior.
+Be aware that the compiler is not able to issue a diagnostic to warn you if this happens.
+====
 
 == Version
 
 Revision: 5
 
-== Introduction
+== Overview
 
-This extension adds functionality to convert value of single-precision
+This extension adds functionality to convert values of single-precision
 floating-point type(`float`) to `bfloat16` type and vice versa. The extension
 doesn't add support for `bfloat16` type as such, instead it uses 16-bit integer
 type(`uint16_t`) as a storage for `bfloat16` values.
@@ -68,7 +73,9 @@ feature to a device that does not support it should cause a synchronous
 `errc::kernel_not_supported` exception to be thrown from the kernel invocation
 command (e.g. from `parallel_for`).
 
-== Feature test macro
+== Specification
+
+=== Feature test macro
 
 This extension provides a feature-test macro as described in the core SYCL
 specification section 6.3.3 "Feature test macros". Therefore, an implementation
@@ -103,7 +110,7 @@ If the device doesn't have the aspect, objects of `bfloat16` class must not be
 used in the device code.
 
 **NOTE**: The `ext_oneapi_bfloat16` aspect is not yet supported.  The
-`bfloat16` class is currently supported only on Xe HP GPU and Nvidia GPUs with Compute Capability >= SM80.
+`bfloat16` class is currently supported only on Xe HP GPUs and Nvidia GPUs with Compute Capability >= SM80.
 
 == New `bfloat16` class
 
@@ -116,7 +123,6 @@ mode.
 namespace sycl {
 namespace ext {
 namespace oneapi {
-namespace experimental {
 
 class bfloat16 {
   using storage_t = uint16_t;
@@ -127,10 +133,6 @@ public:
   bfloat16(const bfloat16 &) = default;
   ~bfloat16() = default;
 
-  // Explicit conversion functions
-  static storage_t from_float(const float &a);
-  static float to_float(const storage_t &a);
-
   // Convert from float to bfloat16
   bfloat16(const float &a);
   bfloat16 &operator=(const float &a);
@@ -168,7 +170,6 @@ public:
   friend bool operatorOP(const T &lhs, const bfloat16 &rhs) { /* ... */ }
 };
 
-} // namespace experimental
 } // namespace oneapi
 } // namespace ext
 } // namespace sycl
@@ -178,12 +179,6 @@ Table 1. Member functions of `bfloat16` class.
 |===
 | Member Function | Description
 
-|  `static storage_t from_float(const float &a);`
-|  Explicitly convert from `float` to `bfloat16`.
-
-|  `static float to_float(const storage_t &a);`
-|  Interpret `a` as `bfloat16` and explicitly convert it to `float`.
-
 | `bfloat16(const float& a);`
 | Construct `bfloat16` from `float`. Converts `float` to `bfloat16`.
 
@@ -251,7 +246,7 @@ Table 1. Member functions of `bfloat16` class.
 | Perform comparison operation OP between `lhs` `bfloat16` and `rhs` `bfloat16`
   values and return the result as a boolean value.
 
-OP is `==, !=, <, >, <=, >=`
+OP is `+==, !=, <, >, <=, >=+`
 
 | `template <typename T>
   friend bool operatorOP(const bfloat16 &lhs, const T &rhs) { /* ... */ }`
@@ -259,7 +254,7 @@ OP is `==, !=, <, >, <=, >=`
   template type `T` and return the result as a boolean value. Type `T` must be
   convertible to `float`.
 
-OP is `==, !=, <, >, <=, >=`
+OP is `+==, !=, <, >, <=, >=+`
 
 | `template <typename T>
   friend bool operatorOP(const T &lhs, const bfloat16 &rhs) { /* ... */ }`
@@ -267,26 +262,23 @@ OP is `==, !=, <, >, <=, >=`
   `bfloat16` value and return the result as a boolean value. Type `T` must be
   convertible to `float`.
 
-OP is `==, !=, <, >, <=, >=`
+OP is `+==, !=, <, >, <=, >=+`
 |===
 
 == Example
 
 [source]
 ----
+#include <sycl/ext/oneapi/bfloat16.hpp>
 #include <sycl/sycl.hpp>
-#include <sycl/ext/oneapi/experimental/bfloat16.hpp>
 
-using sycl::ext::oneapi::experimental::bfloat16;
-
-bfloat16 operator+(const bfloat16 &lhs, const bfloat16 &rhs) {
-  return static_cast<float>(lhs) + static_cast<float>(rhs);
-}
+using namespace sycl;
+using sycl::ext::oneapi::bfloat16;
 
 float foo(float a, float b) {
   // Convert from float to bfloat16.
-  bfloat16 A {a};
-  bfloat16 B {b};
+  bfloat16 A{a};
+  bfloat16 B{b};
 
   // Convert A and B from bfloat16 to float, do addition on floating-pointer
   // numbers, then convert the result to bfloat16 and store it in C.
@@ -296,18 +288,17 @@ float foo(float a, float b) {
   return C;
 }
 
-int main (int argc, char *argv[]) {
+int main(int argc, char *argv[]) {
   float data[3] = {7.0, 8.1, 0.0};
-  sycl::device dev;
-  sycl::queue deviceQueue{dev};
-  sycl::buffer<float, 1> buf {data, sycl::range<1> {3}};
-
-  if (dev.has(sycl::aspect::ext_oneapi_bfloat16)) {
-    deviceQueue.submit ([&] (sycl::handler& cgh) {
-      auto numbers = buf.get_access<sycl::access::mode::read_write> (cgh);
-      cgh.single_task<class simple_kernel> ([=] () {
-        numbers[2] = foo(numbers[0], numbers[1]);
-      });
+  device dev;
+  queue deviceQueue{dev};
+  buffer buf{data, {3}};
+
+  if (dev.has(aspect::ext_oneapi_bfloat16)) {
+    deviceQueue.submit([&](handler &cgh) {
+      accessor numbers{buf, read_write};
+      cgh.single_task<class simple_kernel>(
+          [=]() { numbers[2] = foo(numbers[0], numbers[1]); });
     });
   }
   return 0;
@@ -325,11 +316,14 @@ The following functions are only available when `T` is `bfloat16` or `sycl::marr
 === fma
 
 ```c++
-namespace sycl::ext::oneapi::experimental {
+namespace sycl::ext::oneapi {
 
-template <typename T>
-T fma(T a, T b, T c);
-} // namespace sycl::ext::oneapi::experimental
+bfloat16 fma(bfloat16 a, bfloat16 b, bfloat16 c);
+
+template<size_t N>
+marray<bfloat16, N> fma(marray<bfloat16, N> a, marray<bfloat16, N> b, marray<bfloat16, N> c);
+
+} // namespace sycl::ext::oneapi
 ```
 
 ==== Description
@@ -340,10 +334,14 @@ Rounding of intermediate products shall not occur. The mantissa LSB rounds to th
 === fmax
 
 ```c++
-namespace sycl::ext::oneapi::experimental {
-template <typename T>
-T fmax(T x, T y);
-} // namespace sycl::ext::oneapi::experimental
+namespace sycl::ext::oneapi {
+
+bfloat16 fmax(bfloat16 x, bfloat16 y);
+
+template<size_t N>
+marray<bfloat16, N> fmax(marray<bfloat16, N> x, marray<bfloat16, N> y);
+
+} // namespace sycl::ext::oneapi
 ```
 
 ==== Description
@@ -358,10 +356,14 @@ NaNs, `fmax()` returns a NaN.
 === fmin
 
 ```c++
-namespace sycl::ext::oneapi::experimental {
-template <typename T>
-T fmin(T x, T y);
-} // namespace sycl::ext::oneapi::experimental
+namespace sycl::ext::oneapi {
+
+bfloat16 fmin(bfloat16 a, bfloat16 b);
+
+template<size_t N>
+marray<bfloat16, N> fmin(marray<bfloat16, N> a, marray<bfloat16, N> b);
+
+} // namespace sycl::ext::oneapi
 ```
 
 ==== Description
@@ -369,17 +371,19 @@ T fmin(T x, T y);
 Returns `y` if
 `y < x`, otherwise it
 returns `x`. If one argument is a
-NaN, `fmax()` returns the other
+NaN, `fmin()` returns the other
 argument. If both arguments are
-NaNs, `fmax()` returns a NaN.
+NaNs, `fmin()` returns a NaN.
 
 === fabs
 
 ```c++
-namespace sycl::ext::oneapi::experimental {
+namespace sycl::ext::oneapi {
+
 template <typename T>
 T fabs(T x);
-} // namespace sycl::ext::oneapi::experimental
+
+} // namespace sycl::ext::oneapi
 ```
 
 ==== Description
@@ -406,5 +410,5 @@ Compute absolute value of a `bfloat16`.
 |3|2021-08-18|Alexey Sotkin |Remove `uint16_t` constructor
 |4|2022-03-07|Aidan Belton and Jack Kirk |Switch from Intel vendor specific to oneapi
 |5|2022-04-05|Jack Kirk | Added section for bfloat16 math builtins
-|6|2022-08-03|Alexey Sotkin |Add `operator sycl::half()`
+|6|2022-08-24|Rajiv Deodhar |Add `operator sycl::half()` and some other conversions
 |========================================
diff --git a/sycl/include/sycl/ext/oneapi/bfloat16.hpp b/sycl/include/sycl/ext/oneapi/bfloat16.hpp
index c0ba6c8d94585..acb1fb429e6e1 100644
--- a/sycl/include/sycl/ext/oneapi/bfloat16.hpp
+++ b/sycl/include/sycl/ext/oneapi/bfloat16.hpp
@@ -29,6 +29,7 @@ class bfloat16 {
   bfloat16(const bfloat16 &) = default;
   ~bfloat16() = default;
 
+private:
   // Explicit conversion functions
   static storage_t from_float(const float &a) {
 #if defined(__SYCL_DEVICE_ONLY__)
@@ -69,6 +70,7 @@ class bfloat16 {
 #endif
   }
 
+public:
   static bfloat16 from_bits(const storage_t &a) {
     bfloat16 res;
     res.value = a;

From 129f53f450e7d91c05c8edbd744bfc497600c522 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Fri, 26 Aug 2022 16:00:01 -0700
Subject: [PATCH 05/63] Removed unneeded sycl::half conversion and updated doc.

---
 .../sycl_ext_oneapi_bfloat16.asciidoc         | 26 +++++++++----------
 sycl/include/sycl/ext/oneapi/bfloat16.hpp     |  1 -
 2 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
index 78c76125837a5..8462beb010e11 100644
--- a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
+++ b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
@@ -109,8 +109,7 @@ supports conversion of values of `float` type to `bfloat16` and back.
 If the device doesn't have the aspect, objects of `bfloat16` class must not be
 used in the device code.
 
-**NOTE**: The `ext_oneapi_bfloat16` aspect is not yet supported.  The
-`bfloat16` class is currently supported only on Xe HP GPUs and Nvidia GPUs with Compute Capability >= SM80.
+**NOTE**: The `bfloat16` class is currently supported only on Xe HP GPUs and Nvidia GPUs with Compute Capability >= SM80.
 
 == New `bfloat16` class
 
@@ -137,9 +136,8 @@ public:
   bfloat16(const float &a);
   bfloat16 &operator=(const float &a);
 
-  // Convert bfloat16 to floating-point types
+  // Convert bfloat16 to float
   operator float() const;
-  operator sycl::half() const;
 
   // Convert bfloat16 to bool type
   explicit operator bool();
@@ -188,9 +186,6 @@ Table 1. Member functions of `bfloat16` class.
 | `operator float() const;`
 |  Return `bfloat16` value converted to `float`.
 
-| `operator sycl::half() const;`
-| Return `bfloat16` value converted to `sycl::half`.
-
 | `explicit operator bool() { /* ... */ }`
 | Convert `bfloat16` to `bool` type. Return `false` if the `value` equals to
   zero, return `true` otherwise.
@@ -269,7 +264,6 @@ OP is `+==, !=, <, >, <=, >=+`
 
 [source]
 ----
-#include <sycl/ext/oneapi/bfloat16.hpp>
 #include <sycl/sycl.hpp>
 
 using namespace sycl;
@@ -280,27 +274,33 @@ float foo(float a, float b) {
   bfloat16 A{a};
   bfloat16 B{b};
 
-  // Convert A and B from bfloat16 to float, do addition on floating-pointer
+  // Convert A and B from bfloat16 to float, do addition on floating-point
   // numbers, then convert the result to bfloat16 and store it in C.
   bfloat16 C = A + B;
 
   // Return the result converted from bfloat16 to float.
+  // return sycl::ext::oneapi::float(C);
   return C;
 }
 
 int main(int argc, char *argv[]) {
   float data[3] = {7.0, 8.1, 0.0};
-  device dev;
+  device dev{gpu_selector()};
   queue deviceQueue{dev};
-  buffer buf{data, {3}};
+  buffer<float, 1> buf{data, 3};
 
   if (dev.has(aspect::ext_oneapi_bfloat16)) {
     deviceQueue.submit([&](handler &cgh) {
-      accessor numbers{buf, read_write};
+      accessor numbers{buf, cgh, read_write};
       cgh.single_task<class simple_kernel>(
           [=]() { numbers[2] = foo(numbers[0], numbers[1]); });
     });
+  } else {
+    std::cout << "No bfloat16 support\n";
+    return 1;
   }
+  host_accessor hostOutAcc{buf, read_only};
+  std::cout << "Result = " << hostOutAcc[2] << std::endl;
   return 0;
 }
 ----
@@ -410,5 +410,5 @@ Compute absolute value of a `bfloat16`.
 |3|2021-08-18|Alexey Sotkin |Remove `uint16_t` constructor
 |4|2022-03-07|Aidan Belton and Jack Kirk |Switch from Intel vendor specific to oneapi
 |5|2022-04-05|Jack Kirk | Added section for bfloat16 math builtins
-|6|2022-08-24|Rajiv Deodhar |Add `operator sycl::half()` and some other conversions
+|6|2022-08-24|Rajiv Deodhar |Move bfloat16 from experimental to supported
 |========================================
diff --git a/sycl/include/sycl/ext/oneapi/bfloat16.hpp b/sycl/include/sycl/ext/oneapi/bfloat16.hpp
index acb1fb429e6e1..316461f823a49 100644
--- a/sycl/include/sycl/ext/oneapi/bfloat16.hpp
+++ b/sycl/include/sycl/ext/oneapi/bfloat16.hpp
@@ -87,7 +87,6 @@ class bfloat16 {
 
   // Implicit conversion from bfloat16 to float
   operator float() const { return to_float(value); }
-  operator sycl::half() const { return to_float(value); }
 
   // Get raw bits representation of bfloat16
   storage_t raw() const { return value; }

From 2115f091f2ac923fbf7b8a15d049cdf05281d2fb Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Mon, 29 Aug 2022 14:34:31 -0700
Subject: [PATCH 06/63] Added conversion from sycl::half to bfloat16.

---
 .../supported/sycl_ext_oneapi_bfloat16.asciidoc       |  7 +++++++
 sycl/include/sycl/ext/oneapi/bfloat16.hpp             | 11 +++++++++++
 2 files changed, 18 insertions(+)

diff --git a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
index 8462beb010e11..ff3f0c4e9354b 100644
--- a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
+++ b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
@@ -138,6 +138,13 @@ public:
 
   // Convert bfloat16 to float
   operator float() const;
+
+  // Convert from sycl::half to bfloat16
+  bfloat16(const sycl::half &a);
+  bfloat16 &operator=(const sycl::half &a);
+
+  // Convert bfloat16 to sycl::half
+  operator sycl::half() const;
 
   // Convert bfloat16 to bool type
   explicit operator bool();
diff --git a/sycl/include/sycl/ext/oneapi/bfloat16.hpp b/sycl/include/sycl/ext/oneapi/bfloat16.hpp
index 316461f823a49..3b66dbef9c76a 100644
--- a/sycl/include/sycl/ext/oneapi/bfloat16.hpp
+++ b/sycl/include/sycl/ext/oneapi/bfloat16.hpp
@@ -85,9 +85,20 @@ class bfloat16 {
     return *this;
   }
 
+  // Implicit conversion from sycl::half to bfloat16
+  bfloat16(const sycl::half &a) { value = from_float(a); }
+
+  bfloat16 &operator=(const sycl::half &rhs) {
+    value = from_float(rhs);
+    return *this;
+  }
+
   // Implicit conversion from bfloat16 to float
   operator float() const { return to_float(value); }
 
+  // Implicit conversion from bfloat16 to sycl::half
+  operator sycl::half() const { return to_float(value); }
+
   // Get raw bits representation of bfloat16
   storage_t raw() const { return value; }
 

From 3c2eb8085414cd1f3912aa4e09dbd9c1dcd3fdf4 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Tue, 30 Aug 2022 17:08:25 -0700
Subject: [PATCH 07/63] Cleanup of documentation.

---
 .../sycl_ext_oneapi_bfloat16.asciidoc         | 60 ++++++++-----------
 sycl/include/sycl/ext/oneapi/bfloat16.hpp     |  4 +-
 2 files changed, 28 insertions(+), 36 deletions(-)

diff --git a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
index ff3f0c4e9354b..e980c5dc1a984 100644
--- a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
+++ b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
@@ -39,39 +39,22 @@ https://github.com/intel/llvm/issues
 
 == Dependencies
 
-This extension is written against the SYCL 2020 specification, Revision 4.
+This extension is written against the SYCL 2020 specification, Revision 5.
 
 == Status
 
 This extension is implemented and fully supported by DPC++.
 [NOTE]
 ====
-This extension is currently implemented in {dpcpp} only for GPU devices that support bfloat16 natively. Attempting to use this extension in
+This extension is currently implemented in `dpcpp` only for GPU devices that support `bfloat16` natively. Attempting to use this extension in
 kernels that run on other devices may result in undefined behavior.
 Be aware that the compiler is not able to issue a diagnostic to warn you if this happens.
 ====
 
-== Version
-
-Revision: 5
-
 == Overview
 
-This extension adds functionality to convert values of single-precision
-floating-point type(`float`) to `bfloat16` type and vice versa. The extension
-doesn't add support for `bfloat16` type as such, instead it uses 16-bit integer
-type(`uint16_t`) as a storage for `bfloat16` values.
-
-The purpose of conversion from float to bfloat16 is to reduce the amount of memory
-required to store floating-point numbers. Computations are expected to be done with
-32-bit floating-point values.
+This extension adds support for a 16-bit floating point type `bfloat16`. This type occupies 16 bits of storage space, as does the `sycl::half` type. However, `bfloat16` allots 8 bits to the exponent instead of the 5 bits used by `sycl::half`, and 7 bits to the significand versus the 10 bits used by `sycl::half`. Thus, `bfloat16` has the same dynamic range as a 32-bit `float` but with reduced precision. This type is useful when the memory required to store the values must be reduced, and when the calculations require high dynamic range but can tolerate lower precision. Some implementations may still perform operations on this type using 32-bit math. For example, they may convert the `bfloat16` value to `float`, and then perform the operation on the 32-bit `float`.
 
-This extension is an optional kernel feature as described in
-https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:optional-kernel-features[section 5.7]
-of the SYCL 2020 spec. Therefore, attempting to submit a kernel using this
-feature to a device that does not support it should cause a synchronous
-`errc::kernel_not_supported` exception to be thrown from the kernel invocation
-command (e.g. from `parallel_for`).
 
 == Specification
 
@@ -91,7 +74,7 @@ the implementation supports this feature, or applications can test the macro’s
 |1     |Initial extension version. Base features are supported.
 |===
 
-== Extension to `enum class aspect`
+=== Extension to `enum class aspect`
 
 [source]
 ----
@@ -106,16 +89,18 @@ enum class aspect {
 If a SYCL device has the `ext_oneapi_bfloat16` aspect, then it natively
 supports conversion of values of `float` type to `bfloat16` and back.
 
-If the device doesn't have the aspect, objects of `bfloat16` class must not be
-used in the device code.
+This extension is an optional kernel feature as described in section 5.7 of the SYCL 2020 spec, with the associated aspect `ext_oneapi_bfloat16`. Applications can query whether the device has this aspect to determine if it supports kernels that use `bfloat16`. Attempting to submit a kernel using `bfloat16` to a device that does not support it causes a synchronous `errc::kernel_not_supported` exception to be thrown from the kernel invocation command (e.g. from `parallel_for`).
+
+[NOTE]
+====
+. DPC++ does not currently implement the `errc::kernel_not_supported` exception in this case. Attempting to submit a kernel using `bfloat16` to a device that does not have the `ext_oneapi_bfloat16` aspect results in undefined behavior.
+. The `bfloat16` class is currently supported only on Xe HP GPUs and Nvidia GPUs with Compute Capability >= SM80.
+====
 
-**NOTE**: The `bfloat16` class is currently supported only on Xe HP GPUs and Nvidia GPUs with Compute Capability >= SM80.
 
-== New `bfloat16` class
+=== New `bfloat16` class
 
-The `bfloat16` class below provides the conversion functionality. Conversion
-from `float` to `bfloat16` is done with round to nearest even(RTE) rounding
-mode.
+The `bfloat16` type represents a 16-bit floating point value. Conversions from `float` to `bfloat16` are done with round to nearest even (RTE) rounding mode.
 
 [source]
 ----
@@ -124,8 +109,6 @@ namespace ext {
 namespace oneapi {
 
 class bfloat16 {
-  using storage_t = uint16_t;
-  storage_t value;
 
 public:
   bfloat16() = default;
@@ -193,6 +176,15 @@ Table 1. Member functions of `bfloat16` class.
 | `operator float() const;`
 |  Return `bfloat16` value converted to `float`.
 
+| `bfloat16(const sycl::half& a);`
+| Construct `bfloat16` from `sycl::half`. Converts `sycl::half` to `bfloat16`.
+
+| `bfloat16 &operator=(const sycl::half &a);`
+| Replace the value with `a` converted to `bfloat16`.
+
+| `operator sycl::half() const;`
+|  Return `bfloat16` value converted to `sycl::half`.
+
 | `explicit operator bool() { /* ... */ }`
 | Convert `bfloat16` to `bool` type. Return `false` if the `value` equals to
   zero, return `true` otherwise.
@@ -286,7 +278,6 @@ float foo(float a, float b) {
   bfloat16 C = A + B;
 
   // Return the result converted from bfloat16 to float.
-  // return sycl::ext::oneapi::float(C);
   return C;
 }
 
@@ -299,8 +290,7 @@ int main(int argc, char *argv[]) {
   if (dev.has(aspect::ext_oneapi_bfloat16)) {
     deviceQueue.submit([&](handler &cgh) {
       accessor numbers{buf, cgh, read_write};
-      cgh.single_task<class simple_kernel>(
-          [=]() { numbers[2] = foo(numbers[0], numbers[1]); });
+      cgh.single_task([=]() { numbers[2] = foo(numbers[0], numbers[1]); });
     });
   } else {
     std::cout << "No bfloat16 support\n";
@@ -314,11 +304,11 @@ int main(int argc, char *argv[]) {
 
 == New bfloat16 math functions
 
-Many applications will require dedicated functions that take parameters of type `bfloat16`. This extension adds `bfloat16` support to the `fma`, `fmin`, `fmax` and `fabs` SYCL floating point math functions. These functions can be used as element wise operations on matrices, supplementing the `bfloat16` support in the sycl_ext_oneapi_matrix extension.
+Many applications will require dedicated functions that take parameters of type `bfloat16`. This extension adds `bfloat16` support to the `fma`, `fmin`, `fmax` and `fabs` SYCL floating point math functions. These functions can be used as element wise operations on matrices, supplementing the `bfloat16` support in the `sycl_ext_oneapi_matrix` extension.
 
 The descriptions of the `fma`, `fmin`, `fmax` and `fabs` SYCL floating point math functions can be found in the SYCL specification: https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#_math_functions.
 
-The following functions are only available when `T` is `bfloat16` or `sycl::marray<bfloat16, {N}>`, where `{N}` means any positive value of `size_t` type.
+
 
 === fma
 
diff --git a/sycl/include/sycl/ext/oneapi/bfloat16.hpp b/sycl/include/sycl/ext/oneapi/bfloat16.hpp
index 3b66dbef9c76a..5837b29acde51 100644
--- a/sycl/include/sycl/ext/oneapi/bfloat16.hpp
+++ b/sycl/include/sycl/ext/oneapi/bfloat16.hpp
@@ -52,6 +52,7 @@ class bfloat16 {
     return static_cast<uint16_t>((intStorage + roundingBias) >> 16);
 #endif
   }
+
   static float to_float(const storage_t &a) {
 #if defined(__SYCL_DEVICE_ONLY__)
 #if defined(__NVPTX__)
@@ -70,13 +71,14 @@ class bfloat16 {
 #endif
   }
 
-public:
   static bfloat16 from_bits(const storage_t &a) {
     bfloat16 res;
     res.value = a;
     return res;
   }
 
+public:
+
   // Implicit conversion from float to bfloat16
   bfloat16(const float &a) { value = from_float(a); }
 

From 74aa175eb2dcfe0de194e294c0489099a1822c11 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Fri, 2 Sep 2022 15:56:02 -0700
Subject: [PATCH 08/63] Hooked up bfloat16 aspect within OpenCL plugin.

---
 .../sycl_ext_oneapi_bfloat16.asciidoc         | 29 +++++++++----------
 sycl/include/sycl/ext/oneapi/bfloat16.hpp     |  1 -
 sycl/plugins/opencl/pi_opencl.cpp             | 14 +++++++--
 3 files changed, 25 insertions(+), 19 deletions(-)

diff --git a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
index e980c5dc1a984..6128aae0bda8e 100644
--- a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
+++ b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
@@ -46,7 +46,7 @@ This extension is written against the SYCL 2020 specification, Revision 5.
 This extension is implemented and fully supported by DPC++.
 [NOTE]
 ====
-This extension is currently implemented in `dpcpp` only for GPU devices that support `bfloat16` natively. Attempting to use this extension in
+This extension is currently implemented in DPC++ only for GPU devices that support `bfloat16` natively. Attempting to use this extension in
 kernels that run on other devices may result in undefined behavior.
 Be aware that the compiler is not able to issue a diagnostic to warn you if this happens.
 ====
@@ -86,15 +86,12 @@ enum class aspect {
 }
 ----
 
-If a SYCL device has the `ext_oneapi_bfloat16` aspect, then it natively
-supports conversion of values of `float` type to `bfloat16` and back.
-
 This extension is an optional kernel feature as described in section 5.7 of the SYCL 2020 spec, with the associated aspect `ext_oneapi_bfloat16`. Applications can query whether the device has this aspect to determine if it supports kernels that use `bfloat16`. Attempting to submit a kernel using `bfloat16` to a device that does not support it causes a synchronous `errc::kernel_not_supported` exception to be thrown from the kernel invocation command (e.g. from `parallel_for`).
 
 [NOTE]
 ====
-. DPC++ does not currently implement the `errc::kernel_not_supported` exception in this case. Attempting to submit a kernel using `bfloat16` to a device that does not have the `ext_oneapi_bfloat16` aspect results in undefined behavior.
-. The `bfloat16` class is currently supported only on Xe HP GPUs and Nvidia GPUs with Compute Capability >= SM80.
+. DPC++ does not currently implement the `errc::kernel_not_supported` exception in this case. Attempting to submit a kernel using `bfloat16` to a device that does not have the `ext_oneapi_bfloat16` aspect results in undefined behavior. 
+. The `bfloat16` class is currently supported only on Xe HP GPUs and Nvidia GPUs with Compute Capability >= SM80. 
 ====
 
 
@@ -259,7 +256,7 @@ OP is `+==, !=, <, >, <=, >=+`
 OP is `+==, !=, <, >, <=, >=+`
 |===
 
-== Example
+=== Example
 
 [source]
 ----
@@ -302,7 +299,7 @@ int main(int argc, char *argv[]) {
 }
 ----
 
-== New bfloat16 math functions
+=== New bfloat16 math functions
 
 Many applications will require dedicated functions that take parameters of type `bfloat16`. This extension adds `bfloat16` support to the `fma`, `fmin`, `fmax` and `fabs` SYCL floating point math functions. These functions can be used as element wise operations on matrices, supplementing the `bfloat16` support in the `sycl_ext_oneapi_matrix` extension.
 
@@ -310,7 +307,7 @@ The descriptions of the `fma`, `fmin`, `fmax` and `fabs` SYCL floating point mat
 
 
 
-=== fma
+==== fma
 
 ```c++
 namespace sycl::ext::oneapi {
@@ -323,12 +320,12 @@ marray<bfloat16, N> fma(marray<bfloat16, N> a, marray<bfloat16, N> b, marray<bfl
 } // namespace sycl::ext::oneapi
 ```
 
-==== Description
+===== Description
 
 Returns the correctly rounded floating-point representation of the sum of `c` with the infinitely precise product of `a` and `b`.
 Rounding of intermediate products shall not occur. The mantissa LSB rounds to the nearest even. Subnormal numbers are supported.
 
-=== fmax
+==== fmax
 
 ```c++
 namespace sycl::ext::oneapi {
@@ -341,7 +338,7 @@ marray<bfloat16, N> fmax(marray<bfloat16, N> x, marray<bfloat16, N> y);
 } // namespace sycl::ext::oneapi
 ```
 
-==== Description
+===== Description
 
 Returns `y` if
 `x < y`, otherwise it
@@ -350,7 +347,7 @@ NaN, `fmax()` returns the other
 argument. If both arguments are
 NaNs, `fmax()` returns a NaN.
 
-=== fmin
+==== fmin
 
 ```c++
 namespace sycl::ext::oneapi {
@@ -363,7 +360,7 @@ marray<bfloat16, N> fmin(marray<bfloat16, N> a, marray<bfloat16, N> b);
 } // namespace sycl::ext::oneapi
 ```
 
-==== Description
+===== Description
 
 Returns `y` if
 `y < x`, otherwise it
@@ -372,7 +369,7 @@ NaN, `fmin()` returns the other
 argument. If both arguments are
 NaNs, `fmin()` returns a NaN.
 
-=== fabs
+==== fabs
 
 ```c++
 namespace sycl::ext::oneapi {
@@ -383,7 +380,7 @@ T fabs(T x);
 } // namespace sycl::ext::oneapi
 ```
 
-==== Description
+===== Description
 
 Compute absolute value of a `bfloat16`.
 
diff --git a/sycl/include/sycl/ext/oneapi/bfloat16.hpp b/sycl/include/sycl/ext/oneapi/bfloat16.hpp
index 5837b29acde51..543dd3a8ba7fd 100644
--- a/sycl/include/sycl/ext/oneapi/bfloat16.hpp
+++ b/sycl/include/sycl/ext/oneapi/bfloat16.hpp
@@ -78,7 +78,6 @@ class bfloat16 {
   }
 
 public:
-
   // Implicit conversion from float to bfloat16
   bfloat16(const float &a) { value = from_float(a); }
 
diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp
index dbb3c763b4b2c..ca6e804e8ec03 100644
--- a/sycl/plugins/opencl/pi_opencl.cpp
+++ b/sycl/plugins/opencl/pi_opencl.cpp
@@ -228,8 +228,18 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName,
     std::memcpy(paramValue, &result, sizeof(cl_bool));
     return PI_SUCCESS;
   }
-  case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16:
-    return PI_ERROR_INVALID_VALUE;
+  case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16: {
+    size_t len = 0; // size in bytes of the device's extension string
+    const cl_device_id dev = cast<cl_device_id>(device);
+    cl_int err = clGetDeviceInfo(dev, CL_DEVICE_EXTENSIONS, 0, nullptr, &len);
+    std::string ext(len, '\0');
+    if (err == CL_SUCCESS && len != 0)
+      err = clGetDeviceInfo(dev, CL_DEVICE_EXTENSIONS, len, &ext[0], nullptr);
+    cl_bool result = ext.find("cl_intel_bfloat16_conversions") != ext.npos;
+    if (err == CL_SUCCESS) // report the aspect only if both queries succeeded
+      std::memcpy(paramValue, &result, sizeof(cl_bool));
+    return err == CL_SUCCESS ? PI_SUCCESS : static_cast<pi_result>(err);
+  }
   case PI_DEVICE_INFO_IMAGE_SRGB: {
     cl_bool result = true;
     std::memcpy(paramValue, &result, sizeof(cl_bool));

From bd05711836b16956bb3e2e2b381435e5a23fc364 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Wed, 7 Sep 2022 17:53:12 -0700
Subject: [PATCH 09/63] Support for bfloat16 aspect, and native or fallback
 support.

---
 libdevice/bfloat16_wrapper.cpp                | 26 +++++++++++
 libdevice/cmake/modules/SYCLLibdevice.cmake   |  4 ++
 libdevice/fallback-bfloat16.cpp               | 44 +++++++++++++++++++
 .../sycl-post-link/SYCLDeviceLibReqMask.cpp   |  5 +++
 .../sycl-post-link/SYCLDeviceLibReqMask.h     |  1 +
 sycl/include/sycl/ext/oneapi/bfloat16.hpp     | 13 ++++--
 sycl/plugins/level_zero/pi_level_zero.cpp     | 10 ++++-
 .../program_manager/program_manager.cpp       | 23 ++++++++--
 .../program_manager/program_manager.hpp       |  1 +
 9 files changed, 119 insertions(+), 8 deletions(-)
 create mode 100755 libdevice/bfloat16_wrapper.cpp
 create mode 100755 libdevice/fallback-bfloat16.cpp

diff --git a/libdevice/bfloat16_wrapper.cpp b/libdevice/bfloat16_wrapper.cpp
new file mode 100755
index 0000000000000..b2b8709f9dfbc
--- /dev/null
+++ b/libdevice/bfloat16_wrapper.cpp
@@ -0,0 +1,26 @@
+//==--- bfloat16_wrapper.cpp - wrappers for bfloat16 library functions ----==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===---------------------------------------------------------------------===//
+
+#include "device.h"
+
+#ifdef __SPIR__
+
+#include <CL/__spirv/spirv_ops.hpp>
+#include <cstdint>
+
+DEVICE_EXTERN_C_INLINE
+uint16_t __devicelib_ConvertFToBF16INTEL(const float &x) {
+  return __spirv_ConvertFToBF16INTEL(x);
+}
+
+DEVICE_EXTERN_C_INLINE
+float __devicelib_ConvertBF16ToFINTEL(const uint16_t &x) {
+  return __spirv_ConvertBF16ToFINTEL(x);
+}
+
+#endif // __SPIR__
diff --git a/libdevice/cmake/modules/SYCLLibdevice.cmake b/libdevice/cmake/modules/SYCLLibdevice.cmake
index 9c5e9133fef64..8935e8e3dd2d3 100644
--- a/libdevice/cmake/modules/SYCLLibdevice.cmake
+++ b/libdevice/cmake/modules/SYCLLibdevice.cmake
@@ -93,6 +93,7 @@ set(complex_obj_deps device_complex.h device.h sycl-compiler)
 set(cmath_obj_deps device_math.h device.h sycl-compiler)
 set(imf_obj_deps device_imf.hpp imf_half.hpp device.h sycl-compiler)
 set(itt_obj_deps device_itt.h spirv_vars.h device.h sycl-compiler)
+set(bfloat16_obj_deps sycl-compiler)
 
 add_devicelib_obj(libsycl-itt-stubs SRC itt_stubs.cpp DEP ${itt_obj_deps})
 add_devicelib_obj(libsycl-itt-compiler-wrappers SRC itt_compiler_wrappers.cpp DEP ${itt_obj_deps})
@@ -108,6 +109,7 @@ add_devicelib_obj(libsycl-imf-fp64 SRC imf_wrapper_fp64.cpp DEP ${imf_obj_deps})
 if(WIN32)
 add_devicelib_obj(libsycl-msvc-math SRC msvc_math.cpp DEP ${cmath_obj_deps})
 endif()
+add_devicelib_obj(libsycl-bfloat16 SRC bfloat16_wrapper.cpp DEP ${cmath_obj_deps} )
 
 add_fallback_devicelib(libsycl-fallback-cassert SRC fallback-cassert.cpp DEP ${crt_obj_deps} EXTRA_ARGS -fno-sycl-instrument-device-code)
 add_fallback_devicelib(libsycl-fallback-cstring SRC fallback-cstring.cpp DEP ${crt_obj_deps})
@@ -115,6 +117,8 @@ add_fallback_devicelib(libsycl-fallback-complex SRC fallback-complex.cpp DEP ${c
 add_fallback_devicelib(libsycl-fallback-complex-fp64 SRC fallback-complex-fp64.cpp DEP ${complex_obj_deps} )
 add_fallback_devicelib(libsycl-fallback-cmath SRC fallback-cmath.cpp DEP ${cmath_obj_deps})
 add_fallback_devicelib(libsycl-fallback-cmath-fp64 SRC fallback-cmath-fp64.cpp DEP ${cmath_obj_deps})
+add_fallback_devicelib(libsycl-fallback-bfloat16 SRC fallback-bfloat16.cpp DEP ${bfloat16_obj_deps})
+add_fallback_devicelib(libsycl-native-bfloat16 SRC bfloat16_wrapper.cpp DEP ${bfloat16_obj_deps})
 
 file(MAKE_DIRECTORY ${obj_binary_dir}/libdevice)
 set(imf_fallback_src_dir ${obj_binary_dir}/libdevice)
diff --git a/libdevice/fallback-bfloat16.cpp b/libdevice/fallback-bfloat16.cpp
new file mode 100755
index 0000000000000..b4c016bfab632
--- /dev/null
+++ b/libdevice/fallback-bfloat16.cpp
@@ -0,0 +1,44 @@
+//== fallback-bfloat16.cpp - fallback implementation of bfloat16 conversions ==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===-----------------------------------------------------------------------===//
+
+#include "device.h"
+
+#ifdef __SPIR__
+
+#include <cstdint>
+
+// To support fallback device libraries on-demand loading, please update the
+// DeviceLibFuncMap in llvm/tools/sycl-post-link/sycl-post-link.cpp if you add
+// or remove any item in this file.
+// TODO: generate the DeviceLibFuncMap in sycl-post-link.cpp automatically
+// during the build based on libdevice to avoid manually sync.
+
+DEVICE_EXTERN_C_INLINE uint16_t
+__devicelib_ConvertFToBF16INTEL(const float &a) {
+  union {
+    uint32_t intStorage;
+    float floatValue;
+  };
+  floatValue = a;
+  // NaN must map to bfloat16's qnan; rounding could turn it into inf or zero.
+  if ((intStorage & 0x7FFFFFFF) > 0x7F800000)
+    return 0xffc1;
+  // Do RNE and truncate
+  uint32_t roundingBias = ((intStorage >> 16) & 0x1) + 0x00007FFF;
+  return static_cast<uint16_t>((intStorage + roundingBias) >> 16);
+}
+
+DEVICE_EXTERN_C_INLINE float
+__devicelib_ConvertBF16ToFINTEL(const uint16_t &a) {
+  // Widen the bfloat16 bits, shift them into the upper half of a 32-bit word
+  // and reinterpret that word as a float without aliasing it via a float*.
+  const uint32_t bits = static_cast<uint32_t>(a) << 16;
+  return __builtin_bit_cast(float, bits);
+}
+
+#endif // __SPIR__
diff --git a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp
index 9ce9c56ac46f7..1989f5b01bf7f 100644
--- a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp
+++ b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp
@@ -409,6 +409,10 @@ SYCLDeviceLibFuncMap SDLMap = {
      DeviceLibExt::cl_intel_devicelib_imf_fp64},
     {"__devicelib_imf_longlong_as_double",
      DeviceLibExt::cl_intel_devicelib_imf_fp64},
+    { "__devicelib_ConvertFToBF16INTEL",
+     DeviceLibExt::cl_intel_devicelib_bfloat16},
+    { "__devicelib_ConvertBF16ToFINTEL",
+     DeviceLibExt::cl_intel_devicelib_bfloat16},
 };
 
 // Each fallback device library corresponds to one bit in "require mask" which
@@ -423,6 +427,7 @@ SYCLDeviceLibFuncMap SDLMap = {
 // fallback-cstring:      0x20
 // fallback-imf:          0x40
 // fallback-imf-fp64:     0x80
+// fallback-bfloat16:     0x100
 uint32_t getDeviceLibBits(const std::string &FuncName) {
   auto DeviceLibFuncIter = SDLMap.find(FuncName);
   return ((DeviceLibFuncIter == SDLMap.end())
diff --git a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.h b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.h
index 15cae43da0779..d1a28c7c4d7e1 100644
--- a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.h
+++ b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.h
@@ -34,6 +34,7 @@ enum class DeviceLibExt : std::uint32_t {
   cl_intel_devicelib_cstring,
   cl_intel_devicelib_imf,
   cl_intel_devicelib_imf_fp64,
+  cl_intel_devicelib_bfloat16,
 };
 
 uint32_t getSYCLDeviceLibReqMask(const Module &M);
diff --git a/sycl/include/sycl/ext/oneapi/bfloat16.hpp b/sycl/include/sycl/ext/oneapi/bfloat16.hpp
index 543dd3a8ba7fd..e37261afe3fc5 100644
--- a/sycl/include/sycl/ext/oneapi/bfloat16.hpp
+++ b/sycl/include/sycl/ext/oneapi/bfloat16.hpp
@@ -15,6 +15,11 @@
 #include <cmath>
 #endif
 
+extern "C" SYCL_EXTERNAL uint16_t
+__devicelib_ConvertFToBF16INTEL(const float &) noexcept;
+extern "C" SYCL_EXTERNAL float
+__devicelib_ConvertBF16ToFINTEL(const uint16_t &) noexcept;
+
 namespace sycl {
 __SYCL_INLINE_VER_NAMESPACE(_V1) {
 namespace ext {
@@ -36,10 +41,11 @@ class bfloat16 {
 #if defined(__NVPTX__)
     return __nvvm_f2bf16_rn(a);
 #else
-    return __spirv_ConvertFToBF16INTEL(a);
+    //return __spirv_ConvertFToBF16INTEL(a);
+    return __devicelib_ConvertFToBF16INTEL(a);
 #endif
 #else
-    // In case of float value is nan - propagate bfloat16's qnan
+    // In case float value is nan - propagate bfloat16's qnan
     if (std::isnan(a))
       return 0xffc1;
     union {
@@ -61,7 +67,8 @@ class bfloat16 {
     float *res = reinterpret_cast<float *>(&y);
     return *res;
 #else
-    return __spirv_ConvertBF16ToFINTEL(a);
+    //return __spirv_ConvertBF16ToFINTEL(a);
+    return __devicelib_ConvertBF16ToFINTEL(a);
 #endif
 #else
     // Shift temporary variable to silence the warning
diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp
index 09ebb1068c8c5..e63878c4ccd70 100644
--- a/sycl/plugins/level_zero/pi_level_zero.cpp
+++ b/sycl/plugins/level_zero/pi_level_zero.cpp
@@ -3151,8 +3151,14 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName,
   case PI_DEVICE_INFO_MAX_MEM_BANDWIDTH:
     // currently not supported in level zero runtime
     return PI_ERROR_INVALID_VALUE;
-  case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16:
-    return PI_ERROR_INVALID_VALUE;
+  case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16: {
+    // L0 does not yet tell us if bfloat16 is supported.
+    // TBD change the way we detect bfloat16 support.
+    // For now, assume ATS and PVC support it.
+    return ReturnValue(
+        bool{(Device->ZeDeviceProperties->deviceId & 0xfff) == 0x201 ||
+             (Device->ZeDeviceProperties->deviceId & 0xff0) == 0xbd0});
+  }
 
   // TODO: Implement.
   case PI_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES:
diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp
index f1b08b27cad79..f1d6e6df52e72 100644
--- a/sycl/source/detail/program_manager/program_manager.cpp
+++ b/sycl/source/detail/program_manager/program_manager.cpp
@@ -772,6 +772,8 @@ static const char *getDeviceLibFilename(DeviceLibExt Extension) {
     return "libsycl-fallback-imf.spv";
   case DeviceLibExt::cl_intel_devicelib_imf_fp64:
     return "libsycl-fallback-imf-fp64.spv";
+  case DeviceLibExt::cl_intel_devicelib_bfloat16:
+    return "libsycl-fallback-bfloat16.spv";
   }
   throw compile_program_error("Unhandled (new?) device library extension",
                               PI_ERROR_INVALID_OPERATION);
@@ -795,6 +797,8 @@ static const char *getDeviceLibExtensionStr(DeviceLibExt Extension) {
     return "cl_intel_devicelib_imf";
   case DeviceLibExt::cl_intel_devicelib_imf_fp64:
     return "cl_intel_devicelib_imf_fp64";
+  case DeviceLibExt::cl_intel_devicelib_bfloat16:
+    return "cl_intel_devicelib_bfloat16";
   }
   throw compile_program_error("Unhandled (new?) device library extension",
                               PI_ERROR_INVALID_OPERATION);
@@ -802,9 +806,15 @@ static const char *getDeviceLibExtensionStr(DeviceLibExt Extension) {
 
 static RT::PiProgram loadDeviceLibFallback(const ContextImplPtr Context,
                                            DeviceLibExt Extension,
-                                           const RT::PiDevice &Device) {
+                                           const RT::PiDevice &Device,
+                                           bool UseNativeLib) {
 
   const char *LibFileName = getDeviceLibFilename(Extension);
+  std::string LibFileNameStr(LibFileName);
+  if (UseNativeLib) {
+    LibFileNameStr.replace(8, 8, "native");
+    LibFileName = LibFileNameStr.c_str();
+  }
 
   auto LockedCache = Context->acquireCachedLibPrograms();
   auto CachedLibPrograms = LockedCache.get();
@@ -959,7 +969,8 @@ getDeviceLibPrograms(const ContextImplPtr Context, const RT::PiDevice &Device,
       {DeviceLibExt::cl_intel_devicelib_complex_fp64, false},
       {DeviceLibExt::cl_intel_devicelib_cstring, false},
       {DeviceLibExt::cl_intel_devicelib_imf, false},
-      {DeviceLibExt::cl_intel_devicelib_imf_fp64, false}};
+      {DeviceLibExt::cl_intel_devicelib_imf_fp64, false},
+      {DeviceLibExt::cl_intel_devicelib_bfloat16, false}};
 
   // Disable all devicelib extensions requiring fp64 support if at least
   // one underlying device doesn't support cl_khr_fp64.
@@ -997,8 +1008,14 @@ getDeviceLibPrograms(const ContextImplPtr Context, const RT::PiDevice &Device,
     bool DeviceSupports = DevExtList.npos != DevExtList.find(ExtStr);
 
     if (!DeviceSupports || InhibitNativeImpl) {
-      Programs.push_back(loadDeviceLibFallback(Context, Ext, Device));
+      Programs.push_back(loadDeviceLibFallback(Context, Ext, Device, false));
       FallbackIsLoaded = true;
+    } else {
+      // bfloat16 needs native library if device supports it
+      if (Ext == DeviceLibExt::cl_intel_devicelib_bfloat16) {
+        Programs.push_back(loadDeviceLibFallback(Context, Ext, Device, true));
+        FallbackIsLoaded = true;
+      }
     }
   }
   return Programs;
diff --git a/sycl/source/detail/program_manager/program_manager.hpp b/sycl/source/detail/program_manager/program_manager.hpp
index 25efbc1525d0a..59e702bf9b47e 100644
--- a/sycl/source/detail/program_manager/program_manager.hpp
+++ b/sycl/source/detail/program_manager/program_manager.hpp
@@ -66,6 +66,7 @@ enum class DeviceLibExt : std::uint32_t {
   cl_intel_devicelib_cstring,
   cl_intel_devicelib_imf,
   cl_intel_devicelib_imf_fp64,
+  cl_intel_devicelib_bfloat16,
 };
 
 // Provides single loading and building OpenCL programs with unique contexts

From 2ad68f644fcffefbd3aee20dc0674f8936dca364 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Wed, 7 Sep 2022 18:08:24 -0700
Subject: [PATCH 10/63] Formatting changes.

---
 libdevice/fallback-bfloat16.cpp               |  4 +-
 .../sycl-post-link/SYCLDeviceLibReqMask.cpp   |  4 +-
 sycl/include/sycl/ext/oneapi/bfloat16.hpp     | 45 +++++++++----------
 sycl/include/sycl/sycl.hpp                    |  1 -
 4 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/libdevice/fallback-bfloat16.cpp b/libdevice/fallback-bfloat16.cpp
index b4c016bfab632..3a7787edbf58b 100755
--- a/libdevice/fallback-bfloat16.cpp
+++ b/libdevice/fallback-bfloat16.cpp
@@ -1,10 +1,10 @@
-//== fallback-bfloat16.cpp - fallback implementation of bfloat16 conversions ==//
+//==------- fallback-bfloat16.cpp - bfloat16 conversions in software -------==//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-//===-----------------------------------------------------------------------===//
+//===---------------------------------------------------------------------===//
 
 #include "device.h"
 
diff --git a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp
index 1989f5b01bf7f..feec4727a420d 100644
--- a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp
+++ b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp
@@ -409,9 +409,9 @@ SYCLDeviceLibFuncMap SDLMap = {
      DeviceLibExt::cl_intel_devicelib_imf_fp64},
     {"__devicelib_imf_longlong_as_double",
      DeviceLibExt::cl_intel_devicelib_imf_fp64},
-    { "__devicelib_ConvertFToBF16INTEL",
+    {"__devicelib_ConvertFToBF16INTEL",
      DeviceLibExt::cl_intel_devicelib_bfloat16},
-    { "__devicelib_ConvertBF16ToFINTEL",
+    {"__devicelib_ConvertBF16ToFINTEL",
      DeviceLibExt::cl_intel_devicelib_bfloat16},
 };
 
diff --git a/sycl/include/sycl/ext/oneapi/bfloat16.hpp b/sycl/include/sycl/ext/oneapi/bfloat16.hpp
index 20d51e1b40e85..81f90b1377478 100644
--- a/sycl/include/sycl/ext/oneapi/bfloat16.hpp
+++ b/sycl/include/sycl/ext/oneapi/bfloat16.hpp
@@ -41,7 +41,6 @@ class bfloat16 {
 #if defined(__NVPTX__)
     return __nvvm_f2bf16_rn(a);
 #else
-    //return __spirv_ConvertFToBF16INTEL(a);
     return __devicelib_ConvertFToBF16INTEL(a);
 #endif
 #else
@@ -67,13 +66,13 @@ class bfloat16 {
     float *res = reinterpret_cast<float *>(&y);
     return *res;
 #else
-    //return __spirv_ConvertBF16ToFINTEL(a);
     return __devicelib_ConvertBF16ToFINTEL(a);
 #endif
 #else
+    // Shift temporary variable to silence the warning
     uint32_t bits = a;
     bits <<= 16;
-    return sycl::bit_cast<float>(bits);
+    return static_cast<float>(bits);
 #endif
   }
 
@@ -139,11 +138,11 @@ class bfloat16 {
     operator op(lhs);                                                          \
     return old;                                                                \
   }
-  OP(++)
-  OP(--)
+    OP(++)
+    OP(--)
 #undef OP
 
-  // Assignment operators overloading
+    // Assignment operators overloading
 #define OP(op)                                                                 \
   friend bfloat16 &operator op(bfloat16 &lhs, const bfloat16 &rhs) {           \
     float f = static_cast<float>(lhs);                                         \
@@ -161,10 +160,10 @@ class bfloat16 {
     f op static_cast<float>(rhs);                                              \
     return lhs = f;                                                            \
   }
-  OP(+=)
-  OP(-=)
-  OP(*=)
-  OP(/=)
+    OP(+=)
+    OP(-=)
+    OP(*=)
+    OP(/=)
 #undef OP
 
 // Binary operators overloading
@@ -180,21 +179,21 @@ class bfloat16 {
   friend type operator op(const T &lhs, const bfloat16 &rhs) {                 \
     return type{static_cast<float>(lhs) op static_cast<float>(rhs)};           \
   }
-  OP(bfloat16, +)
-  OP(bfloat16, -)
-  OP(bfloat16, *)
-  OP(bfloat16, /)
-  OP(bool, ==)
-  OP(bool, !=)
-  OP(bool, <)
-  OP(bool, >)
-  OP(bool, <=)
-  OP(bool, >=)
+    OP(bfloat16, +)
+    OP(bfloat16, -)
+    OP(bfloat16, *)
+    OP(bfloat16, /)
+    OP(bool, ==)
+    OP(bool, !=)
+    OP(bool, <)
+    OP(bool, >)
+    OP(bool, <=)
+    OP(bool, >=)
 #undef OP
 
-  // Bitwise(|,&,~,^), modulo(%) and shift(<<,>>) operations are not supported
-  // for floating-point types.
-};
+    // Bitwise(|,&,~,^), modulo(%) and shift(<<,>>) operations are not supported
+    // for floating-point types.
+  };
 
 } // namespace oneapi
 } // namespace ext
diff --git a/sycl/include/sycl/sycl.hpp b/sycl/include/sycl/sycl.hpp
index 99cf80239ede7..f79e57484f2a9 100644
--- a/sycl/include/sycl/sycl.hpp
+++ b/sycl/include/sycl/sycl.hpp
@@ -61,7 +61,6 @@
 #endif
 #include <sycl/ext/oneapi/device_global/device_global.hpp>
 #include <sycl/ext/oneapi/device_global/properties.hpp>
-//#include <sycl/ext/oneapi/experimental/builtins.hpp>
 #include <sycl/ext/oneapi/builtins.hpp>
 #include <sycl/ext/oneapi/experimental/cuda/barrier.hpp>
 #include <sycl/ext/oneapi/filter_selector.hpp>

From 4b78c035406efa43c4c7e703c919af1f355945d9 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Wed, 7 Sep 2022 18:46:07 -0700
Subject: [PATCH 11/63] Formatting changes.

---
 sycl/include/sycl/ext/oneapi/bfloat16.hpp | 40 +++++++++++------------
 sycl/include/sycl/sycl.hpp                |  2 +-
 2 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/sycl/include/sycl/ext/oneapi/bfloat16.hpp b/sycl/include/sycl/ext/oneapi/bfloat16.hpp
index 81f90b1377478..9a50c06bec992 100644
--- a/sycl/include/sycl/ext/oneapi/bfloat16.hpp
+++ b/sycl/include/sycl/ext/oneapi/bfloat16.hpp
@@ -138,11 +138,11 @@ class bfloat16 {
     operator op(lhs);                                                          \
     return old;                                                                \
   }
-    OP(++)
-    OP(--)
+  OP(++)
+  OP(--)
 #undef OP
 
-    // Assignment operators overloading
+  // Assignment operators overloading
 #define OP(op)                                                                 \
   friend bfloat16 &operator op(bfloat16 &lhs, const bfloat16 &rhs) {           \
     float f = static_cast<float>(lhs);                                         \
@@ -160,10 +160,10 @@ class bfloat16 {
     f op static_cast<float>(rhs);                                              \
     return lhs = f;                                                            \
   }
-    OP(+=)
-    OP(-=)
-    OP(*=)
-    OP(/=)
+  OP(+=)
+  OP(-=)
+  OP(*=)
+  OP(/=)
 #undef OP
 
 // Binary operators overloading
@@ -179,21 +179,21 @@ class bfloat16 {
   friend type operator op(const T &lhs, const bfloat16 &rhs) {                 \
     return type{static_cast<float>(lhs) op static_cast<float>(rhs)};           \
   }
-    OP(bfloat16, +)
-    OP(bfloat16, -)
-    OP(bfloat16, *)
-    OP(bfloat16, /)
-    OP(bool, ==)
-    OP(bool, !=)
-    OP(bool, <)
-    OP(bool, >)
-    OP(bool, <=)
-    OP(bool, >=)
+  OP(bfloat16, +)
+  OP(bfloat16, -)
+  OP(bfloat16, *)
+  OP(bfloat16, /)
+  OP(bool, ==)
+  OP(bool, !=)
+  OP(bool, <)
+  OP(bool, >)
+  OP(bool, <=)
+  OP(bool, >=)
 #undef OP
 
-    // Bitwise(|,&,~,^), modulo(%) and shift(<<,>>) operations are not supported
-    // for floating-point types.
-  };
+  // Bitwise(|,&,~,^), modulo(%) and shift(<<,>>) operations are not supported
+  // for floating-point types.
+};
 
 } // namespace oneapi
 } // namespace ext
diff --git a/sycl/include/sycl/sycl.hpp b/sycl/include/sycl/sycl.hpp
index f79e57484f2a9..fcdc9643ff4d4 100644
--- a/sycl/include/sycl/sycl.hpp
+++ b/sycl/include/sycl/sycl.hpp
@@ -59,9 +59,9 @@
 #if SYCL_EXT_ONEAPI_BACKEND_LEVEL_ZERO
 #include <sycl/ext/oneapi/backend/level_zero.hpp>
 #endif
+#include <sycl/ext/oneapi/builtins.hpp>
 #include <sycl/ext/oneapi/device_global/device_global.hpp>
 #include <sycl/ext/oneapi/device_global/properties.hpp>
-#include <sycl/ext/oneapi/builtins.hpp>
 #include <sycl/ext/oneapi/experimental/cuda/barrier.hpp>
 #include <sycl/ext/oneapi/filter_selector.hpp>
 #include <sycl/ext/oneapi/group_algorithm.hpp>

From 0fce16d04e99a8f5361f6ff1e50a59a8d49e80fd Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Wed, 7 Sep 2022 23:09:18 -0700
Subject: [PATCH 12/63] Update to documentation.

---
 .../sycl_ext_oneapi_bfloat16.asciidoc         | 21 +++++++++----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
index 6128aae0bda8e..46ef6e15b9fd4 100644
--- a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
+++ b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
@@ -44,12 +44,7 @@ This extension is written against the SYCL 2020 specification, Revision 5.
 == Status
 
 This extension is implemented and fully supported by DPC++.
-[NOTE]
-====
-This extension is currently implemented in DPC++ only for GPU devices that support `bfloat16` natively. Attempting to use this extension in
-kernels that run on other devices may result in undefined behavior.
-Be aware that the compiler is not able to issue a diagnostic to warn you if this happens.
-====
+
 
 == Overview
 
@@ -66,7 +61,12 @@ supporting this extension must predefine the macro
 `SYCL_EXT_ONEAPI_BFLOAT16` to one of the values defined in the table
 below. Applications can test for the existence of this macro to determine if
 the implementation supports this feature, or applications can test the macro’s
- value to determine which of the extension’s APIs the implementation supports.
+ value to determine which of the extension’s APIs the implementation supports. Attempting to submit a kernel using `bfloat16` to a device that does not support it causes a synchronous `errc::kernel_not_supported` exception to be thrown from the kernel invocation command (e.g. from `parallel_for`).
+
+[NOTE]
+====
+. DPC++ does not currently implement the `errc::kernel_not_supported` exception in this case. Attempting to submit a kernel using `bfloat16` to a device that does not support this extension results in undefined behavior.
+====
 
 [%header,cols="1,5"]
 |===
@@ -86,12 +86,11 @@ enum class aspect {
 }
 ----
 
-This extension is an optional kernel feature as described in section 5.7 of the SYCL 2020 spec, with the associated aspect `ext_oneapi_bfloat16`. Applications can query whether the device has this aspect to determine if it supports kernels that use `bfloat16`. Attempting to submit a kernel using `bfloat16` to a device that does not support it causes a synchronous `errc::kernel_not_supported` exception to be thrown from the kernel invocation command (e.g. from `parallel_for`).
+This extension is an optional kernel feature as described in section 5.7 of the SYCL 2020 spec, with the associated aspect `ext_oneapi_bfloat16`. Applications can query whether the device has this aspect to determine if it supports the  `bfloat16` type in hardware.
 
 [NOTE]
-====
-. DPC++ does not currently implement the `errc::kernel_not_supported` exception in this case. Attempting to submit a kernel using `bfloat16` to a device that does not have the `ext_oneapi_bfloat16` aspect results in undefined behavior. 
-. The `bfloat16` class is currently supported only on Xe HP GPUs and Nvidia GPUs with Compute Capability >= SM80. 
+==== 
+. The `bfloat16` class is currently supported in hardware only on Xe HP GPUs and Nvidia GPUs with Compute Capability >= SM80. 
 ====
 
 

From 4bcb383d952ded23070fb776a6c6ad2266e2ef8b Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Thu, 8 Sep 2022 12:57:50 -0700
Subject: [PATCH 13/63] Deprecate bfloat16 aspect.

---
 .../supported/sycl_ext_oneapi_bfloat16.asciidoc  |  9 ++++-----
 sycl/plugins/level_zero/pi_level_zero.cpp        | 16 ++++++++++------
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
index 46ef6e15b9fd4..922bddd63da26 100644
--- a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
+++ b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
@@ -86,13 +86,12 @@ enum class aspect {
 }
 ----
 
-This extension is an optional kernel feature as described in section 5.7 of the SYCL 2020 spec, with the associated aspect `ext_oneapi_bfloat16`. Applications can query whether the device has this aspect to determine if it supports the  `bfloat16` type in hardware.
-
+This extension is an optional kernel feature as described in section 5.7 of the SYCL 2020 spec, with the associated aspect `ext_oneapi_bfloat16`. Applications can query whether the device has this aspect to determine if it supports `bfloat16` conversions.
+The `bfloat16` class is currently supported in hardware on Xe HP GPUs and Nvidia GPUs with Compute Capability >= SM80. On other devices it is emulated in software.
 [NOTE]
-==== 
-. The `bfloat16` class is currently supported in hardware only on Xe HP GPUs and Nvidia GPUs with Compute Capability >= SM80. 
 ====
-
+Aspect `ext_oneapi_bfloat16` is deprecated because `bfloat16` is supported on all devices.
+====
 
 === New `bfloat16` class
 
diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp
index 1ae2b369a4c09..db3a493584088 100644
--- a/sycl/plugins/level_zero/pi_level_zero.cpp
+++ b/sycl/plugins/level_zero/pi_level_zero.cpp
@@ -2748,6 +2748,13 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName,
       // Supports reading and writing of images.
       SupportedExtensions += ("cl_khr_3d_image_writes ");
 
+    // L0 does not tell us if bfloat16 is supported.
+    // For now, assume ATS and PVC support it.
+    // TODO: change the way we detect bfloat16 support.
+    if ((Device->ZeDeviceProperties->deviceId & 0xfff) == 0x201 ||
+        (Device->ZeDeviceProperties->deviceId & 0xff0) == 0xbd0)
+      SupportedExtensions += ("cl_intel_bfloat16_conversions ");
+
     return ReturnValue(SupportedExtensions.c_str());
   }
   case PI_DEVICE_INFO_NAME:
@@ -3195,12 +3202,9 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName,
     // currently not supported in level zero runtime
     return PI_ERROR_INVALID_VALUE;
   case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16: {
-    // L0 does not yet tell us if bfloat16 is supported.
-    // TBD change the way we detect bfloat16 support.
-    // For now, assume ATS and PVC support it.
-    return ReturnValue(
-        bool{(Device->ZeDeviceProperties->deviceId & 0xfff) == 0x201 ||
-             (Device->ZeDeviceProperties->deviceId & 0xff0) == 0xbd0});
+    // bfloat16 is implemented in hardware or emulated, so it is always
+    // supported.
+    return ReturnValue(bool{true});
   }
 
   // TODO: Implement.

From 35308f870745b56c1a48f9fe4ded26b64f0df8dd Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Thu, 8 Sep 2022 17:31:36 -0700
Subject: [PATCH 14/63] Fixes for ESIMD.

---
 libdevice/fallback-bfloat16.cpp                        | 10 ++++++----
 .../ext/intel/esimd/detail/bfloat16_type_traits.hpp    |  4 ++--
 sycl/include/sycl/ext/oneapi/bfloat16.hpp              | 10 ++++++----
 sycl/source/detail/program_manager/program_manager.cpp |  2 +-
 4 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/libdevice/fallback-bfloat16.cpp b/libdevice/fallback-bfloat16.cpp
index 3a7787edbf58b..66bd9128b1fc7 100755
--- a/libdevice/fallback-bfloat16.cpp
+++ b/libdevice/fallback-bfloat16.cpp
@@ -35,10 +35,12 @@ __devicelib_ConvertFToBF16INTEL(const float &a) {
 
 DEVICE_EXTERN_C_INLINE float
 __devicelib_ConvertBF16ToFINTEL(const uint16_t &a) {
-  uint32_t y = a;
-  y = y << 16;
-  float *res = reinterpret_cast<float *>(&y);
-  return *res;
+  union {
+    uint32_t intStorage;
+    float floatValue;
+  };
+  intStorage = a << 16;
+  return floatValue;
 }
 
 #endif // __SPIR__
diff --git a/sycl/include/sycl/ext/intel/esimd/detail/bfloat16_type_traits.hpp b/sycl/include/sycl/ext/intel/esimd/detail/bfloat16_type_traits.hpp
index 95bdeb5e63c5b..3883ee0382ba0 100644
--- a/sycl/include/sycl/ext/intel/esimd/detail/bfloat16_type_traits.hpp
+++ b/sycl/include/sycl/ext/intel/esimd/detail/bfloat16_type_traits.hpp
@@ -13,7 +13,7 @@
 #include <sycl/ext/intel/esimd/detail/elem_type_traits.hpp>
 #include <sycl/ext/intel/esimd/detail/intrin.hpp>
 
-#include <sycl/ext/oneapi/experimental/bfloat16.hpp>
+#include <sycl/ext/oneapi/bfloat16.hpp>
 
 /// @cond ESIMD_DETAIL
 
@@ -21,7 +21,7 @@ namespace sycl {
 __SYCL_INLINE_VER_NAMESPACE(_V1) {
 namespace ext::intel::esimd::detail {
 
-using bfloat16 = sycl::ext::oneapi::experimental::bfloat16;
+using bfloat16 = sycl::ext::oneapi::bfloat16;
 
 template <> struct element_type_traits<bfloat16> {
   // TODO map the raw type to __bf16 once SPIRV target supports it:
diff --git a/sycl/include/sycl/ext/oneapi/bfloat16.hpp b/sycl/include/sycl/ext/oneapi/bfloat16.hpp
index 9a50c06bec992..4430d62e49988 100644
--- a/sycl/include/sycl/ext/oneapi/bfloat16.hpp
+++ b/sycl/include/sycl/ext/oneapi/bfloat16.hpp
@@ -69,10 +69,12 @@ class bfloat16 {
     return __devicelib_ConvertBF16ToFINTEL(a);
 #endif
 #else
-    // Shift temporary variable to silence the warning
-    uint32_t bits = a;
-    bits <<= 16;
-    return static_cast<float>(bits);
+    union {
+      uint32_t intStorage;
+      float floatValue;
+    };
+    intStorage = a << 16;
+    return floatValue;
 #endif
   }
 
diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp
index f1d6e6df52e72..f7dcd48117a49 100644
--- a/sycl/source/detail/program_manager/program_manager.cpp
+++ b/sycl/source/detail/program_manager/program_manager.cpp
@@ -798,7 +798,7 @@ static const char *getDeviceLibExtensionStr(DeviceLibExt Extension) {
   case DeviceLibExt::cl_intel_devicelib_imf_fp64:
     return "cl_intel_devicelib_imf_fp64";
   case DeviceLibExt::cl_intel_devicelib_bfloat16:
-    return "cl_intel_devicelib_bfloat16";
+    return "cl_intel_bfloat16_conversions";
   }
   throw compile_program_error("Unhandled (new?) device library extension",
                               PI_ERROR_INVALID_OPERATION);

From fa045e2e7af07a0fb3f538e24797f488de2f29df Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Fri, 9 Sep 2022 14:20:48 -0700
Subject: [PATCH 15/63] Reinstated to_float and from_float, used by NVidia,
 updated doc.

---
 .../sycl_ext_oneapi_bfloat16.asciidoc         | 94 +++++++++++++------
 sycl/include/sycl/ext/oneapi/bfloat16.hpp     |  2 -
 2 files changed, 66 insertions(+), 30 deletions(-)

diff --git a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
index 922bddd63da26..7c80261396d89 100644
--- a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
+++ b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
@@ -48,8 +48,19 @@ This extension is implemented and fully supported by DPC++.
 
 == Overview
 
-This extension adds support for a 16-bit floating point type `bfloat16`. This type occupies 16 bits of storage space as does the `sycl::half` type. However, `bfloat16` allots 8 bits to the exponent instead of the 5 bits used by `sycl::half` and 7 bits to the significand versus 10 bits used by `sycl::half`. Thus, `bfloat16` has the same dynamic range as a 32-bit `float` but with reduced precision. This type is useful when memory required to store the values must be reduced, and when the calculations require high dynamic range but can tolerate lower-precision. Some implementations may still perform operations on this type using 32-bit math. For example, they may convert the `bfloat16` value to `float`, and then perform the operation on the 32-bit `float`.
+This extension adds support for a 16-bit floating point type `bfloat16`.
+This type occupies 16 bits of storage space as does the `sycl::half` type.
+However, `bfloat16` allots 8 bits to the exponent instead of the 5 bits used by
+`sycl::half` and 7 bits to the significand versus 10 bits used by `sycl::half`.
+Thus, `bfloat16` has the same dynamic range as a 32-bit `float` but with reduced
+precision. This type is useful when memory required to store the values must be
+reduced, and when the calculations require high dynamic range but can tolerate
+lower precision. Some implementations may still perform operations on this type
+using 32-bit math. For example, they may convert the `bfloat16` value to
+`float`, and then perform the operation on the 32-bit `float`.
 
+[NOTE]
+The `bfloat16` type is supported on all devices. DPC++ currently supports this type natively on Intel Xe HP GPUs and Nvidia GPUs with Compute Capability >= SM80. On other devices it is emulated in software.
 
 == Specification
 
@@ -61,11 +72,16 @@ supporting this extension must predefine the macro
 `SYCL_EXT_ONEAPI_BFLOAT16` to one of the values defined in the table
 below. Applications can test for the existence of this macro to determine if
 the implementation supports this feature, or applications can test the macro’s
- value to determine which of the extension’s APIs the implementation supports. Attempting to submit a kernel using `bfloat16` to a device that does not support it causes a synchronous `errc::kernel_not_supported` exception to be thrown from the kernel invocation command (e.g. from `parallel_for`).
+value to determine which of the extension’s APIs the implementation supports.
+Attempting to submit a kernel using `bfloat16` to a device that does not
+support it causes a synchronous `errc::kernel_not_supported` exception to be
+thrown from the kernel invocation command (e.g. from `parallel_for`).
 
 [NOTE]
 ====
-. DPC++ does not currently implement the `errc::kernel_not_supported` exception in this case. Attempting to submit a kernel using `bfloat16` to a device that does not support this extension results in undefined behavior.
+. DPC++ does not currently implement the `errc::kernel_not_supported`
+exception in this case. Attempting to submit a kernel using `bfloat16`
+to a device that does not support this extension results in undefined behavior.
 ====
 
 [%header,cols="1,5"]
@@ -74,28 +90,12 @@ the implementation supports this feature, or applications can test the macro’s
 |1     |Initial extension version. Base features are supported.
 |===
 
-=== Extension to `enum class aspect`
-
-[source]
-----
-namespace sycl {
-enum class aspect {
-  ...
-  ext_oneapi_bfloat16
-}
-}
-----
-
-This extension is an optional kernel feature as described in section 5.7 of the SYCL 2020 spec, with the associated aspect `ext_oneapi_bfloat16`. Applications can query whether the device has this aspect to determine if it supports `bfloat16` conversions.
-The `bfloat16` class is currently supported in hardware on Xe HP GPUs and Nvidia GPUs with Compute Capability >= SM80. On other devices it is emulated in software.
-[NOTE]
-====
-Aspect `ext_oneapi_bfloat16` is deprecated because `bfloat16` is supported on all devices.
-====
 
 === New `bfloat16` class
 
-The `bfloat16` type represents a 16-bit floating point value. Conversions from `float` to `bfloat16` are done with round to nearest even (RTE) rounding mode.
+The `bfloat16` type represents a 16-bit floating point value.
+Conversions from `float` to `bfloat16` are done with round to
+nearest even (RTE) rounding mode.
 
 [source]
 ----
@@ -104,12 +104,17 @@ namespace ext {
 namespace oneapi {
 
 class bfloat16 {
+using storage_t = uint16_t;
 
 public:
   bfloat16() = default;
   bfloat16(const bfloat16 &) = default;
   ~bfloat16() = default;
 
+  // Explicit conversion functions
+  static storage_t from_float(const float &a);
+  static float to_float(const storage_t &a);
+
   // Convert from float to bfloat16
   bfloat16(const float &a);
   bfloat16 &operator=(const float &a);
@@ -299,9 +304,15 @@ int main(int argc, char *argv[]) {
 
 === New bfloat16 math functions
 
-Many applications will require dedicated functions that take parameters of type `bfloat16`. This extension adds `bfloat16` support to the `fma`, `fmin`, `fmax` and `fabs` SYCL floating point math functions. These functions can be used as element wise operations on matrices, supplementing the `bfloat16` support in the `sycl_ext_oneapi_matrix` extension.
+Many applications will require dedicated functions that take parameters of
+type `bfloat16`. This extension adds `bfloat16` support to the `fma`, `fmin`,
+`fmax` and `fabs` SYCL floating point math functions. These functions can be
+used as element-wise operations on matrices, supplementing the `bfloat16`
+support in the `sycl_ext_oneapi_matrix` extension.
 
-The descriptions of the `fma`, `fmin`, `fmax` and `fabs` SYCL floating point math functions can be found in the SYCL specification: https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#_math_functions.
+The descriptions of the `fma`, `fmin`, `fmax` and `fabs` SYCL floating point
+math functions can be found in the SYCL specification:
+https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#_math_functions.
 
 
 
@@ -320,8 +331,10 @@ marray<bfloat16, N> fma(marray<bfloat16, N> a, marray<bfloat16, N> b, marray<bfl
 
 ===== Description
 
-Returns the correctly rounded floating-point representation of the sum of `c` with the infinitely precise product of `a` and `b`.
-Rounding of intermediate products shall not occur. The mantissa LSB rounds to the nearest even. Subnormal numbers are supported.
+Returns the correctly rounded floating-point representation of the sum of `c`
+with the infinitely precise product of `a` and `b`.
+Rounding of intermediate products shall not occur.
+The mantissa LSB rounds to the nearest even. Subnormal numbers are supported.
 
 ==== fmax
 
@@ -382,11 +395,36 @@ T fabs(T x);
 
 Compute absolute value of a `bfloat16`.
 
+== Deprecated Features
+
+=== Extension to `enum class aspect`
+
+[source]
+----
+namespace sycl {
+enum class aspect {
+  ...
+  ext_oneapi_bfloat16
+}
+}
+----
+
+This extension adds a new aspect named `ext_oneapi_bfloat16`, but usage of this
+aspect is deprecated. It used to indicate whether a device supports `bfloat16`,
+but all devices are now required to support `bfloat16` when an implementation
+supports this extension. Therefore, this aspect now returns true for all devices.
+
+
 == Issues
 
-1. The CUDA backend does not have a use case that would necessitate support of the `vec` class in bfloat16 math functions, and `marray` would always be preferred over `vec` if `vec` support were to be added in the CUDA backend. For portability reasons, support for the `vec` class can be easily added if other backends require it.
+1. The CUDA backend does not have a use case that would necessitate
+support of the `vec` class in bfloat16 math functions, and `marray`
+would always be preferred over `vec` if `vec` support were to be
+added in the CUDA backend. For portability reasons, support for the
+`vec` class can be easily added if other backends require it.
 
-2. We should decide on a roadmap to extend support of `bfloat16` to other SYCL 2020 math functions.
+2. We should decide on a roadmap to extend support of `bfloat16`
+to other SYCL 2020 math functions.
 
 == Revision History
 
diff --git a/sycl/include/sycl/ext/oneapi/bfloat16.hpp b/sycl/include/sycl/ext/oneapi/bfloat16.hpp
index 4430d62e49988..c24aad77a2d70 100644
--- a/sycl/include/sycl/ext/oneapi/bfloat16.hpp
+++ b/sycl/include/sycl/ext/oneapi/bfloat16.hpp
@@ -34,7 +34,6 @@ class bfloat16 {
   bfloat16(const bfloat16 &) = default;
   ~bfloat16() = default;
 
-private:
   // Explicit conversion functions
   static storage_t from_float(const float &a) {
 #if defined(__SYCL_DEVICE_ONLY__)
@@ -84,7 +83,6 @@ class bfloat16 {
     return res;
   }
 
-public:
   // Implicit conversion from float to bfloat16
   bfloat16(const float &a) { value = from_float(a); }
 

From b12fd94315558c1ac2eadfc0da960c753be7a52a Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Mon, 12 Sep 2022 14:43:37 -0700
Subject: [PATCH 16/63] Update to doc.

---
 .../supported/sycl_ext_oneapi_bfloat16.asciidoc          | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
index 7c80261396d89..9f1e5d6e882da 100644
--- a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
+++ b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
@@ -73,16 +73,7 @@ supporting this extension must predefine the macro
 below. Applications can test for the existence of this macro to determine if
 the implementation supports this feature, or applications can test the macro’s
 value to determine which of the extension’s APIs the implementation supports.
-Attempting to submit a kernel using `bfloat16` to a device that does not
-support it causes a synchronous `errc::kernel_not_supported` exception to be
-thrown from the kernel invocation command (e.g. from `parallel_for`).
 
-[NOTE]
-====
-. DPC++ does not currently implement the `errc::kernel_not_supported`
-exception in this case. Attempting to submit a kernel using `bfloat16`
-to a device that does not support this extension results in undefined behavior.
-====
 
 [%header,cols="1,5"]
 |===

From f217eb4d8f59c8ee2f9df1fd36e9423a01f2925b Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Wed, 14 Sep 2022 16:22:25 -0700
Subject: [PATCH 17/63] Corrections to headers.

---
 sycl/include/sycl/ext/oneapi/bfloat16.hpp     |  4 +-
 sycl/include/sycl/ext/oneapi/builtins.hpp     | 16 ++---
 .../sycl/ext/oneapi/matrix/matrix-jit-use.hpp | 68 +++++++++----------
 3 files changed, 44 insertions(+), 44 deletions(-)

diff --git a/sycl/include/sycl/ext/oneapi/bfloat16.hpp b/sycl/include/sycl/ext/oneapi/bfloat16.hpp
index c24aad77a2d70..719fa7233573d 100644
--- a/sycl/include/sycl/ext/oneapi/bfloat16.hpp
+++ b/sycl/include/sycl/ext/oneapi/bfloat16.hpp
@@ -34,6 +34,7 @@ class bfloat16 {
   bfloat16(const bfloat16 &) = default;
   ~bfloat16() = default;
 
+private:
   // Explicit conversion functions
   static storage_t from_float(const float &a) {
 #if defined(__SYCL_DEVICE_ONLY__)
@@ -83,6 +84,7 @@ class bfloat16 {
     return res;
   }
 
+public:
   // Implicit conversion from float to bfloat16
   bfloat16(const float &a) { value = from_float(a); }
 
@@ -117,7 +119,7 @@ class bfloat16 {
 #if defined(__NVPTX__)
     return from_bits(__nvvm_neg_bf16(lhs.value));
 #else
-    return bfloat16{-__spirv_ConvertBF16ToFINTEL(lhs.value)};
+    return bfloat16{-__devicelib_ConvertBF16ToFINTEL(lhs.value)};
 #endif
 #else
     (void)lhs;
diff --git a/sycl/include/sycl/ext/oneapi/builtins.hpp b/sycl/include/sycl/ext/oneapi/builtins.hpp
index 2e316ed1270e2..fd8627f52c0b4 100644
--- a/sycl/include/sycl/ext/oneapi/builtins.hpp
+++ b/sycl/include/sycl/ext/oneapi/builtins.hpp
@@ -127,7 +127,7 @@ inline __SYCL_ALWAYS_INLINE
 template <typename T>
 std::enable_if_t<std::is_same<T, bfloat16>::value, T> fabs(T x) {
 #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
-  return bfloat16::from_bits(__clc_fabs(x.raw()));
+  return sycl::bit_cast<bfloat16>(__clc_fabs(x.raw()));
 #else
   std::ignore = x;
   throw runtime_error("bfloat16 is not currently supported on the host device.",
@@ -146,7 +146,7 @@ sycl::marray<bfloat16, N> fabs(sycl::marray<bfloat16, N> x) {
   }
 
   if (N % 2) {
-    res[N - 1] = bfloat16::from_bits(__clc_fabs(x[N - 1].raw()));
+    res[N - 1] = sycl::bit_cast<bfloat16>(__clc_fabs(x[N - 1].raw()));
   }
   return res;
 #else
@@ -159,7 +159,7 @@ sycl::marray<bfloat16, N> fabs(sycl::marray<bfloat16, N> x) {
 template <typename T>
 std::enable_if_t<std::is_same<T, bfloat16>::value, T> fmin(T x, T y) {
 #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
-  return bfloat16::from_bits(__clc_fmin(x.raw(), y.raw()));
+  return sycl::bit_cast<bfloat16>(__clc_fmin(x.raw(), y.raw()));
 #else
   std::ignore = x;
   std::ignore = y;
@@ -182,7 +182,7 @@ sycl::marray<bfloat16, N> fmin(sycl::marray<bfloat16, N> x,
 
   if (N % 2) {
     res[N - 1] =
-        bfloat16::from_bits(__clc_fmin(x[N - 1].raw(), y[N - 1].raw()));
+        sycl::bit_cast<bfloat16>(__clc_fmin(x[N - 1].raw(), y[N - 1].raw()));
   }
 
   return res;
@@ -197,7 +197,7 @@ sycl::marray<bfloat16, N> fmin(sycl::marray<bfloat16, N> x,
 template <typename T>
 std::enable_if_t<std::is_same<T, bfloat16>::value, T> fmax(T x, T y) {
 #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
-  return bfloat16::from_bits(__clc_fmax(x.raw(), y.raw()));
+  return sycl::bit_cast<bfloat16>(__clc_fmax(x.raw(), y.raw()));
 #else
   std::ignore = x;
   std::ignore = y;
@@ -220,7 +220,7 @@ sycl::marray<bfloat16, N> fmax(sycl::marray<bfloat16, N> x,
 
   if (N % 2) {
     res[N - 1] =
-        bfloat16::from_bits(__clc_fmax(x[N - 1].raw(), y[N - 1].raw()));
+        sycl::bit_cast<bfloat16>(__clc_fmax(x[N - 1].raw(), y[N - 1].raw()));
   }
   return res;
 #else
@@ -234,7 +234,7 @@ sycl::marray<bfloat16, N> fmax(sycl::marray<bfloat16, N> x,
 template <typename T>
 std::enable_if_t<std::is_same<T, bfloat16>::value, T> fma(T x, T y, T z) {
 #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
-  return bfloat16::from_bits(__clc_fma(x.raw(), y.raw(), z.raw()));
+  return sycl::bit_cast<bfloat16>(__clc_fma(x.raw(), y.raw(), z.raw()));
 #else
   std::ignore = x;
   std::ignore = y;
@@ -259,7 +259,7 @@ sycl::marray<bfloat16, N> fma(sycl::marray<bfloat16, N> x,
   }
 
   if (N % 2) {
-    res[N - 1] = bfloat16::from_bits(
+    res[N - 1] = sycl::bit_cast<bfloat16>(
         __clc_fma(x[N - 1].raw(), y[N - 1].raw(), z[N - 1].raw()));
   }
   return res;
diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp
index f8ff4aeaa047c..42735921c5d2f 100644
--- a/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp
+++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp
@@ -327,8 +327,7 @@ class wi_element {
 // represent bf16 type. Since the AMX and DPAS implementations don't support
 // uint16_t, this interpretation is possible. This design choice was made before
 // the introduction of SYCL experimental bfloat16 type. Our plan is to move
-// towards using the SYCL bfloat16. But since it is still experimental, we will
-// probably keep both uint16 interpretation and SYCL bfloat16.
+// towards using the SYCL bfloat16.
 template <size_t NumRows, size_t NumCols, use Use, layout Layout,
           typename Group>
 class wi_element<uint16_t, NumRows, NumCols, Use, Layout, Group> {
@@ -478,18 +477,18 @@ class wi_element<uint16_t, NumRows, NumCols, Use, Layout, Group> {
 
 template <size_t NumRows, size_t NumCols, use Use, layout Layout,
           typename Group>
-class wi_element<sycl::ext::oneapi::experimental::bfloat16, NumRows, NumCols,
-                 Use, Layout, Group> {
-  joint_matrix<sycl::ext::oneapi::experimental::bfloat16, NumRows, NumCols, Use,
-               Layout, Group> &M;
+class wi_element<sycl::ext::oneapi::bfloat16, NumRows, NumCols, Use, Layout,
+                 Group> {
+  joint_matrix<sycl::ext::oneapi::bfloat16, NumRows, NumCols, Use, Layout,
+               Group> &M;
   std::size_t idx;
 
 public:
-  wi_element(joint_matrix<sycl::ext::oneapi::experimental::bfloat16, NumRows,
-                          NumCols, Use, Layout, Group> &Mat,
+  wi_element(joint_matrix<sycl::ext::oneapi::bfloat16, NumRows, NumCols, Use,
+                          Layout, Group> &Mat,
              std::size_t i)
       : M(Mat), idx(i) {}
-  operator sycl::ext::oneapi::experimental::bfloat16() {
+  operator sycl::ext::oneapi::bfloat16() {
 #ifdef __SYCL_DEVICE_ONLY__
     return __spirv_VectorExtractDynamic(M.spvm, idx);
 #else
@@ -508,7 +507,7 @@ class wi_element<sycl::ext::oneapi::experimental::bfloat16, NumRows, NumCols,
 #endif // __SYCL_DEVICE_ONLY__
   }
 
-  wi_element &operator=(const sycl::ext::oneapi::experimental::bfloat16 &rhs) {
+  wi_element &operator=(const sycl::ext::oneapi::bfloat16 &rhs) {
 #ifdef __SYCL_DEVICE_ONLY__
     M.spvm = __spirv_VectorInsertDynamic(M.spvm, rhs, idx);
     return *this;
@@ -519,9 +518,8 @@ class wi_element<sycl::ext::oneapi::experimental::bfloat16, NumRows, NumCols,
 #endif // __SYCL_DEVICE_ONLY__
   }
 
-  wi_element &
-  operator=(const wi_element<sycl::ext::oneapi::experimental::bfloat16, NumRows,
-                             NumCols, Use, Layout, Group> &rhs) {
+  wi_element &operator=(const wi_element<sycl::ext::oneapi::bfloat16, NumRows,
+                                         NumCols, Use, Layout, Group> &rhs) {
 #ifdef __SYCL_DEVICE_ONLY__
     M.spvm = __spirv_VectorInsertDynamic(
         M.spvm, __spirv_VectorExtractDynamic(rhs.M.spvm, rhs.idx), idx);
@@ -536,7 +534,7 @@ class wi_element<sycl::ext::oneapi::experimental::bfloat16, NumRows, NumCols,
 #if __SYCL_DEVICE_ONLY__
 #define OP(opassign, op)                                                       \
   wi_element &operator opassign(                                               \
-      const sycl::ext::oneapi::experimental::bfloat16 &rhs) {                  \
+      const sycl::ext::oneapi::bfloat16 &rhs) {                                \
     M.spvm = __spirv_VectorInsertDynamic(                                      \
         M.spvm, __spirv_VectorExtractDynamic(M.spvm, idx) op rhs, idx);        \
     return *this;                                                              \
@@ -544,7 +542,7 @@ class wi_element<sycl::ext::oneapi::experimental::bfloat16, NumRows, NumCols,
 #else // __SYCL_DEVICE_ONLY__
 #define OP(opassign, op)                                                       \
   wi_element &operator opassign(                                               \
-      const sycl::ext::oneapi::experimental::bfloat16 &rhs) {                  \
+      const sycl::ext::oneapi::bfloat16 &rhs) {                                \
     (void)rhs;                                                                 \
     throw runtime_error("joint matrix is not supported on host device.",       \
                         PI_ERROR_INVALID_DEVICE);                              \
@@ -559,33 +557,33 @@ class wi_element<sycl::ext::oneapi::experimental::bfloat16, NumRows, NumCols,
 #if __SYCL_DEVICE_ONLY__
 #define OP(type, op)                                                           \
   friend type operator op(                                                     \
-      const wi_element<sycl::ext::oneapi::experimental::bfloat16, NumRows,     \
+      const wi_element<sycl::ext::oneapi::bfloat16, NumRows,                   \
                        NumCols, Use, Layout, Group> &lhs,                      \
-      const sycl::ext::oneapi::experimental::bfloat16 &rhs) {                  \
+      const sycl::ext::oneapi::bfloat16 &rhs) {                                \
     return __spirv_VectorExtractDynamic(lhs.M.spvm, lhs.idx) op rhs;           \
   }                                                                            \
   friend type operator op(                                                     \
-      const sycl::ext::oneapi::experimental::bfloat16 &lhs,                    \
-      const wi_element<sycl::ext::oneapi::experimental::bfloat16, NumRows,     \
+      const sycl::ext::oneapi::bfloat16 &lhs,                                  \
+      const wi_element<sycl::ext::oneapi::bfloat16, NumRows,                   \
                        NumCols, Use, Layout, Group> &rhs) {                    \
     return __spirv_VectorExtractDynamic(rhs.M.spvm, rhs.idx) op lhs;           \
   }
-  OP(sycl::ext::oneapi::experimental::bfloat16, +)
-  OP(sycl::ext::oneapi::experimental::bfloat16, -)
-  OP(sycl::ext::oneapi::experimental::bfloat16, *)
-  OP(sycl::ext::oneapi::experimental::bfloat16, /)
+  OP(sycl::ext::oneapi::bfloat16, +)
+  OP(sycl::ext::oneapi::bfloat16, -)
+  OP(sycl::ext::oneapi::bfloat16, *)
+  OP(sycl::ext::oneapi::bfloat16, /)
 #undef OP
 #define OP(type, op)                                                           \
   friend type operator op(                                                     \
-      const wi_element<sycl::ext::oneapi::experimental::bfloat16, NumRows,     \
+      const wi_element<sycl::ext::oneapi::bfloat16, NumRows,                   \
                        NumCols, Use, Layout, Group> &lhs,                      \
-      const sycl::ext::oneapi::experimental::bfloat16 &rhs) {                  \
+      const sycl::ext::oneapi::bfloat16 &rhs) {                                \
     return type{static_cast<float>(__spirv_VectorExtractDynamic(               \
         lhs.M.spvm, lhs.idx)) op static_cast<float>(rhs)};                     \
   }                                                                            \
   friend type operator op(                                                     \
-      const sycl::ext::oneapi::experimental::bfloat16 &lhs,                    \
-      const wi_element<sycl::ext::oneapi::experimental::bfloat16, NumRows,     \
+      const sycl::ext::oneapi::bfloat16 &lhs,                                  \
+      const wi_element<sycl::ext::oneapi::bfloat16, NumRows,                   \
                        NumCols, Use, Layout, Group> &rhs) {                    \
     return type{static_cast<float>(__spirv_VectorExtractDynamic(               \
         rhs.M.spvm, rhs.idx)) op static_cast<float>(lhs)};                     \
@@ -600,23 +598,23 @@ class wi_element<sycl::ext::oneapi::experimental::bfloat16, NumRows, NumCols,
 #else // __SYCL_DEVICE_ONLY__
 #define OP(type, op)                                                           \
   friend type operator op(                                                     \
-      const wi_element<sycl::ext::oneapi::experimental::bfloat16, NumRows,     \
+      const wi_element<sycl::ext::oneapi::bfloat16, NumRows,                   \
                        NumCols, Use, Layout, Group> &,                         \
-      const sycl::ext::oneapi::experimental::bfloat16 &) {                     \
+      const sycl::ext::oneapi::bfloat16 &) {                                   \
     throw runtime_error("joint matrix is not supported on host device.",       \
                         PI_ERROR_INVALID_DEVICE);                              \
   }                                                                            \
   friend type operator op(                                                     \
-      const sycl::ext::oneapi::experimental::bfloat16 &,                       \
-      const wi_element<sycl::ext::oneapi::experimental::bfloat16, NumRows,     \
+      const sycl::ext::oneapi::bfloat16 &,                                     \
+      const wi_element<sycl::ext::oneapi::bfloat16, NumRows,                   \
                        NumCols, Use, Layout, Group> &) {                       \
     throw runtime_error("joint matrix is not supported on host device.",       \
                         PI_ERROR_INVALID_DEVICE);                              \
   }
-  OP(sycl::ext::oneapi::experimental::bfloat16, +)
-  OP(sycl::ext::oneapi::experimental::bfloat16, -)
-  OP(sycl::ext::oneapi::experimental::bfloat16, *)
-  OP(sycl::ext::oneapi::experimental::bfloat16, /)
+  OP(sycl::ext::oneapi::bfloat16, +)
+  OP(sycl::ext::oneapi::bfloat16, -)
+  OP(sycl::ext::oneapi::bfloat16, *)
+  OP(sycl::ext::oneapi::bfloat16, /)
   OP(bool, ==)
   OP(bool, !=)
   OP(bool, <)

From a908b11ac539e6b384b704bf176174b8a574a359 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Wed, 14 Sep 2022 16:29:58 -0700
Subject: [PATCH 18/63] Formatting change.

---
 .../sycl/ext/oneapi/matrix/matrix-jit-use.hpp | 30 +++++++++----------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp
index 42735921c5d2f..2708866ecb5e9 100644
--- a/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp
+++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-jit-use.hpp
@@ -533,16 +533,14 @@ class wi_element<sycl::ext::oneapi::bfloat16, NumRows, NumCols, Use, Layout,
 
 #if __SYCL_DEVICE_ONLY__
 #define OP(opassign, op)                                                       \
-  wi_element &operator opassign(                                               \
-      const sycl::ext::oneapi::bfloat16 &rhs) {                                \
+  wi_element &operator opassign(const sycl::ext::oneapi::bfloat16 &rhs) {      \
     M.spvm = __spirv_VectorInsertDynamic(                                      \
         M.spvm, __spirv_VectorExtractDynamic(M.spvm, idx) op rhs, idx);        \
     return *this;                                                              \
   }
 #else // __SYCL_DEVICE_ONLY__
 #define OP(opassign, op)                                                       \
-  wi_element &operator opassign(                                               \
-      const sycl::ext::oneapi::bfloat16 &rhs) {                                \
+  wi_element &operator opassign(const sycl::ext::oneapi::bfloat16 &rhs) {      \
     (void)rhs;                                                                 \
     throw runtime_error("joint matrix is not supported on host device.",       \
                         PI_ERROR_INVALID_DEVICE);                              \
@@ -557,15 +555,15 @@ class wi_element<sycl::ext::oneapi::bfloat16, NumRows, NumCols, Use, Layout,
 #if __SYCL_DEVICE_ONLY__
 #define OP(type, op)                                                           \
   friend type operator op(                                                     \
-      const wi_element<sycl::ext::oneapi::bfloat16, NumRows,                   \
-                       NumCols, Use, Layout, Group> &lhs,                      \
+      const wi_element<sycl::ext::oneapi::bfloat16, NumRows, NumCols, Use,     \
+                       Layout, Group> &lhs,                                    \
       const sycl::ext::oneapi::bfloat16 &rhs) {                                \
     return __spirv_VectorExtractDynamic(lhs.M.spvm, lhs.idx) op rhs;           \
   }                                                                            \
   friend type operator op(                                                     \
       const sycl::ext::oneapi::bfloat16 &lhs,                                  \
-      const wi_element<sycl::ext::oneapi::bfloat16, NumRows,                   \
-                       NumCols, Use, Layout, Group> &rhs) {                    \
+      const wi_element<sycl::ext::oneapi::bfloat16, NumRows, NumCols, Use,     \
+                       Layout, Group> &rhs) {                                  \
     return __spirv_VectorExtractDynamic(rhs.M.spvm, rhs.idx) op lhs;           \
   }
   OP(sycl::ext::oneapi::bfloat16, +)
@@ -575,16 +573,16 @@ class wi_element<sycl::ext::oneapi::bfloat16, NumRows, NumCols, Use, Layout,
 #undef OP
 #define OP(type, op)                                                           \
   friend type operator op(                                                     \
-      const wi_element<sycl::ext::oneapi::bfloat16, NumRows,                   \
-                       NumCols, Use, Layout, Group> &lhs,                      \
+      const wi_element<sycl::ext::oneapi::bfloat16, NumRows, NumCols, Use,     \
+                       Layout, Group> &lhs,                                    \
       const sycl::ext::oneapi::bfloat16 &rhs) {                                \
     return type{static_cast<float>(__spirv_VectorExtractDynamic(               \
         lhs.M.spvm, lhs.idx)) op static_cast<float>(rhs)};                     \
   }                                                                            \
   friend type operator op(                                                     \
       const sycl::ext::oneapi::bfloat16 &lhs,                                  \
-      const wi_element<sycl::ext::oneapi::bfloat16, NumRows,                   \
-                       NumCols, Use, Layout, Group> &rhs) {                    \
+      const wi_element<sycl::ext::oneapi::bfloat16, NumRows, NumCols, Use,     \
+                       Layout, Group> &rhs) {                                  \
     return type{static_cast<float>(__spirv_VectorExtractDynamic(               \
         rhs.M.spvm, rhs.idx)) op static_cast<float>(lhs)};                     \
   }
@@ -598,16 +596,16 @@ class wi_element<sycl::ext::oneapi::bfloat16, NumRows, NumCols, Use, Layout,
 #else // __SYCL_DEVICE_ONLY__
 #define OP(type, op)                                                           \
   friend type operator op(                                                     \
-      const wi_element<sycl::ext::oneapi::bfloat16, NumRows,                   \
-                       NumCols, Use, Layout, Group> &,                         \
+      const wi_element<sycl::ext::oneapi::bfloat16, NumRows, NumCols, Use,     \
+                       Layout, Group> &,                                       \
       const sycl::ext::oneapi::bfloat16 &) {                                   \
     throw runtime_error("joint matrix is not supported on host device.",       \
                         PI_ERROR_INVALID_DEVICE);                              \
   }                                                                            \
   friend type operator op(                                                     \
       const sycl::ext::oneapi::bfloat16 &,                                     \
-      const wi_element<sycl::ext::oneapi::bfloat16, NumRows,                   \
-                       NumCols, Use, Layout, Group> &) {                       \
+      const wi_element<sycl::ext::oneapi::bfloat16, NumRows, NumCols, Use,     \
+                       Layout, Group> &) {                                     \
     throw runtime_error("joint matrix is not supported on host device.",       \
                         PI_ERROR_INVALID_DEVICE);                              \
   }

From aab4c78b59fa2077ae7861b2c8d16ece30032d05 Mon Sep 17 00:00:00 2001
From: JackAKirk <chezjakirk@gmail.com>
Date: Thu, 15 Sep 2022 17:08:04 +0100
Subject: [PATCH 19/63] bfloat16 class supports all sm_xx devices.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 clang/lib/Basic/Targets/NVPTX.cpp         |  2 +-
 sycl/include/sycl/ext/oneapi/bfloat16.hpp | 27 ++++++++++++++++-------
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp
index f8e74c634df52..37904d1381c6e 100644
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -181,7 +181,7 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
                                        MacroBuilder &Builder) const {
   Builder.defineMacro("__PTX__");
   Builder.defineMacro("__NVPTX__");
-  if (Opts.CUDAIsDevice || Opts.OpenMPIsDevice) {
+  if (Opts.CUDAIsDevice || Opts.OpenMPIsDevice || Opts.SYCLIsDevice) {
     // Set __CUDA_ARCH__ for the GPU specified.
     std::string CUDAArchCode = [this] {
       switch (GPU) {
diff --git a/sycl/include/sycl/ext/oneapi/bfloat16.hpp b/sycl/include/sycl/ext/oneapi/bfloat16.hpp
index 719fa7233573d..7caa7718993ff 100644
--- a/sycl/include/sycl/ext/oneapi/bfloat16.hpp
+++ b/sycl/include/sycl/ext/oneapi/bfloat16.hpp
@@ -39,7 +39,21 @@ class bfloat16 {
   static storage_t from_float(const float &a) {
 #if defined(__SYCL_DEVICE_ONLY__)
 #if defined(__NVPTX__)
+#if (__CUDA_ARCH__ >= 800)
     return __nvvm_f2bf16_rn(a);
+#else
+    // TODO std::isnan not defined in device code
+    // if (std::isnan(a))
+    // return 0xffc1;
+    union {
+      uint32_t intStorage;
+      float floatValue;
+    };
+    floatValue = a;
+    // Do RNE and truncate
+    uint32_t roundingBias = ((intStorage >> 16) & 0x1) + 0x00007FFF;
+    return static_cast<uint16_t>((intStorage + roundingBias) >> 16);
+#endif
 #else
     return __devicelib_ConvertFToBF16INTEL(a);
 #endif
@@ -59,15 +73,8 @@ class bfloat16 {
   }
 
   static float to_float(const storage_t &a) {
-#if defined(__SYCL_DEVICE_ONLY__)
-#if defined(__NVPTX__)
-    uint32_t y = a;
-    y = y << 16;
-    float *res = reinterpret_cast<float *>(&y);
-    return *res;
-#else
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__SPIR__)
     return __devicelib_ConvertBF16ToFINTEL(a);
-#endif
 #else
     union {
       uint32_t intStorage;
@@ -117,7 +124,11 @@ class bfloat16 {
   friend bfloat16 operator-(bfloat16 &lhs) {
 #if defined(__SYCL_DEVICE_ONLY__)
 #if defined(__NVPTX__)
+#if (__CUDA_ARCH__ >= 800)
     return from_bits(__nvvm_neg_bf16(lhs.value));
+#else
+    return -to_float(lhs.value);
+#endif
 #else
     return bfloat16{-__devicelib_ConvertBF16ToFINTEL(lhs.value)};
 #endif

From 4d7a22bee7fec61918ed63b06a4e0df91827aaa9 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Thu, 15 Sep 2022 19:04:21 -0700
Subject: [PATCH 20/63] Changes to keep bfloat math functions experimental for
 now.

---
 libdevice/fallback-bfloat16.cpp               |   6 +-
 .../sycl_ext_oneapi_bfloat16_math.asciidoc    | 139 +++++++++
 .../sycl_ext_oneapi_bfloat16.asciidoc         | 108 +------
 sycl/include/sycl/ext/oneapi/builtins.hpp     | 281 ------------------
 .../ext/oneapi/experimental/bfloat16_math.hpp |   0
 .../sycl/ext/oneapi/experimental/builtins.hpp | 134 +++++++++
 sycl/include/sycl/sycl.hpp                    |   3 +-
 7 files changed, 281 insertions(+), 390 deletions(-)
 create mode 100644 sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math.asciidoc
 delete mode 100644 sycl/include/sycl/ext/oneapi/builtins.hpp
 create mode 100644 sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp
 create mode 100755 sycl/include/sycl/ext/oneapi/experimental/builtins.hpp

diff --git a/libdevice/fallback-bfloat16.cpp b/libdevice/fallback-bfloat16.cpp
index 66bd9128b1fc7..0f92b1bcfa728 100755
--- a/libdevice/fallback-bfloat16.cpp
+++ b/libdevice/fallback-bfloat16.cpp
@@ -18,11 +18,13 @@
 // TODO: generate the DeviceLibFuncMap in sycl-post-link.cpp automatically
 // during the build based on libdevice to avoid manually sync.
 
+extern "C" SYCL_EXTERNAL int __builtin_spirv_OpIsNan_f32(float);
+
 DEVICE_EXTERN_C_INLINE uint16_t
 __devicelib_ConvertFToBF16INTEL(const float &a) {
   // In case float value is nan - propagate bfloat16's qnan
-  // if (std::isnan(a))
-  //  return 0xffc1;
+  if (__builtin_spirv_OpIsNan_f32(a))
+    return 0xffc1;
   union {
     uint32_t intStorage;
     float floatValue;
diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math.asciidoc
new file mode 100644
index 0000000000000..622ae302d5e47
--- /dev/null
+++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math.asciidoc
@@ -0,0 +1,139 @@
+= sycl_ext_oneapi_bfloat16_math
+
+:source-highlighter: coderay
+:coderay-linenums-mode: table
+
+// This section needs to be after the document title.
+:doctype: book
+:toc2:
+:toc: left
+:encoding: utf-8
+:lang: en
+
+:blank: pass:[ +]
+
+// Set the default source code type in this document to C++,
+// for syntax highlighting purposes.  This is needed because
+// docbook uses c++ and html5 uses cpp.
+:language: {basebackend@docbook:c++:cpp}
+
+// This is necessary for asciidoc, but not for asciidoctor
+:cpp: C++
+
+== Notice
+
+IMPORTANT: This specification is a draft.
+
+Copyright (c) 2021-2022 Intel Corporation. All rights reserved.
+
+NOTE: Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are
+trademarks of The Khronos Group Inc.  OpenCL(TM) is a trademark of Apple Inc.
+used by permission by Khronos.
+
+== Dependencies
+
+This extension is written against the SYCL 2020 specification, Revision 5.
+
+== Status
+
+Draft
+
+This is a preview extension specification, intended to provide early access to
+a feature for review and community feedback. When the feature matures, this
+specification may be released as a formal extension.
+
+Because the interfaces defined by this specification are not final and are
+subject to change they are not intended to be used by shipping software
+products.
+
+== Version
+
+Revision: 1
+
+== Introduction
+
+This extension adds `bfloat16` support to the `fma`, `fmin`, `fmax` and `fabs` SYCL floating point math functions. These functions can be used as element-wise operations on matrices, supplementing the `bfloat16` support in the sycl_ext_oneapi_matrix extension.
+
+The descriptions of the `fma`, `fmin`, `fmax` and `fabs` SYCL floating point math functions can be found in the SYCL specification: https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#_math_functions.
+
+The following functions are only available when `T` is `bfloat16` or `sycl::marray<bfloat16, {N}>`, where `{N}` means any positive value of `size_t` type.
+
+=== fma
+
+```c++
+namespace sycl::ext::oneapi::experimental {
+
+template <typename T>
+T fma(T a, T b, T c);
+} // namespace sycl::ext::oneapi::experimental
+```
+
+==== Description
+
+Returns the correctly rounded floating-point representation of the sum of `c` with the infinitely precise product of `a` and `b`.
+Rounding of intermediate products shall not occur. The mantissa LSB rounds to the nearest even. Subnormal numbers are supported.
+
+=== fmax
+
+```c++
+namespace sycl::ext::oneapi::experimental {
+template <typename T>
+T fmax(T x, T y);
+} // namespace sycl::ext::oneapi::experimental
+```
+
+==== Description
+
+Returns `y` if
+`x < y`, otherwise it
+returns `x`. If one argument is a
+NaN, `fmax()` returns the other
+argument. If both arguments are
+NaNs, `fmax()` returns a NaN.
+
+=== fmin
+
+```c++
+namespace sycl::ext::oneapi::experimental {
+template <typename T>
+T fmin(T x, T y);
+} // namespace sycl::ext::oneapi::experimental
+```
+
+==== Description
+
+Returns `y` if
+`y < x`, otherwise it
+returns `x`. If one argument is a
+NaN, `fmin()` returns the other
+argument. If both arguments are
+NaNs, `fmin()` returns a NaN.
+
+=== fabs
+
+```c++
+namespace sycl::ext::oneapi::experimental {
+template <typename T>
+T fabs(T x);
+} // namespace sycl::ext::oneapi::experimental
+```
+
+==== Description
+
+Compute absolute value of a `bfloat16`.
+
+== Issues
+
+1. The CUDA backend does not have a use case that would necessitate support of the `vec` class in bfloat16 math functions, and `marray` would always be preferred over `vec` if `vec` support were to be added in the CUDA backend. For portability reasons, support for the `vec` class can be easily added if other backends require it.
+
+2. We should decide on a roadmap to extend support of `bfloat16` to other SYCL 2020 math functions.
+
+== Revision History
+
+[cols="5,15,15,70"]
+[grid="rows"]
+[options="header"]
+|========================================
+|Rev|Date|Author|Changes
+|1|2022-09-15|Rajiv Deodhar |Initial public working draft
+|========================================
diff --git a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
index 9f1e5d6e882da..8e369b0f34b29 100644
--- a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
+++ b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
@@ -293,99 +293,6 @@ int main(int argc, char *argv[]) {
 }
 ----
 
-=== New bfloat16 math functions
-
-Many applications will require dedicated functions that take parameters of
-type `bfloat16`. This extension adds `bfloat16` support to the `fma`, `fmin`,
-`fmax` and `fabs` SYCL floating point math functions. These functions can be
-used as element wise operations on matrices, supplementing the `bfloat16`
-support in the `sycl_ext_oneapi_matrix` extension.
-
-The descriptions of the `fma`, `fmin`, `fmax` and `fabs` SYCL floating point
-math functions can be found in the SYCL specification:
-https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#_math_functions.
-
-
-
-==== fma
-
-```c++
-namespace sycl::ext::oneapi {
-
-bfloat16 fma(bfloat16 a, bfloat16 b, bfloat16 c);
-
-template<size_t N>
-marray<bfloat16, N> fma(marray<bfloat16, N> a, marray<bfloat16, N> b, marray<bfloat16, N> c);
-
-} // namespace sycl::ext::oneapi
-```
-
-===== Description
-
-Returns the correctly rounded floating-point representation of the sum of `c`
-with the infinitely precise product of `a` and `b`.
-Rounding of intermediate products shall not occur.
-The mantissa LSB rounds to the nearest even. Subnormal numbers are supported.
-
-==== fmax
-
-```c++
-namespace sycl::ext::oneapi {
-
-bfloat16 fmax(bfloat16 x, bfloat16 y);
-
-template<size_t N>
-marray<bfloat16, N> fmax(marray<bfloat16, N> x, marray<bfloat16, N> y);
-
-} // namespace sycl::ext::oneapi
-```
-
-===== Description
-
-Returns `y` if
-`x < y`, otherwise it
-returns `x`. If one argument is a
-NaN, `fmax()` returns the other
-argument. If both arguments are
-NaNs, `fmax()` returns a NaN.
-
-==== fmin
-
-```c++
-namespace sycl::ext::oneapi {
-
-bfloat16 fmin(bfloat16 a, bfloat16 b);
-
-template<size_t N>
-marray<bfloat16, N> fmin(marray<bfloat16, N> a, marray<bfloat16, N> b);
-
-} // namespace sycl::ext::oneapi
-```
-
-===== Description
-
-Returns `y` if
-`y < x`, otherwise it
-returns `x`. If one argument is a
-NaN, `fmin()` returns the other
-argument. If both arguments are
-NaNs, `fmin()` returns a NaN.
-
-==== fabs
-
-```c++
-namespace sycl::ext::oneapi {
-
-template <typename T>
-T fabs(T x);
-
-} // namespace sycl::ext::oneapi
-```
-
-===== Description
-
-Compute absolute value of a `bfloat16`.
-
 == Deprecated Features
 
 === Extension to `enum class aspect`
@@ -405,18 +312,6 @@ aspect is deprecated. It used to indicate whether a device supports `bfloat16`,
 but all devices are now required to support `bfloat16` when an implementation
 supports this extension. Therefore, this aspect now returns true for all devices.
 
-
-== Issues
-
-1. The CUDA backend does not have a use case that would necessitate
-support of the `vec` class in bfloat16 math functions, and `marray`
-would always be preferred over `vec` if `vec` support were to be
-added in the CUDA backend. For portability reasons, support for the
-`vec` class can be easily added if other backends require it.
-
-2. We should decide on a roadmap to extend support of `bfloat16`
-to other SYCL 2020 math functions.
-
 == Revision History
 
 [cols="5,15,15,70"]
@@ -431,5 +326,6 @@ to other SYCL 2020 math functions.
 |3|2021-08-18|Alexey Sotkin |Remove `uint16_t` constructor
 |4|2022-03-07|Aidan Belton and Jack Kirk |Switch from Intel vendor specific to oneapi
 |5|2022-04-05|Jack Kirk | Added section for bfloat16 math builtins
-|6|2022-08-24|Rajiv Deodhar |Move bfloat16 from experimental to supported
+|6|2022-09-15|Rajiv Deodhar |Move bfloat16 from experimental to supported
+and leave math functions as experimental
 |========================================
diff --git a/sycl/include/sycl/ext/oneapi/builtins.hpp b/sycl/include/sycl/ext/oneapi/builtins.hpp
deleted file mode 100644
index fd8627f52c0b4..0000000000000
--- a/sycl/include/sycl/ext/oneapi/builtins.hpp
+++ /dev/null
@@ -1,281 +0,0 @@
-//==------ builtins.hpp - Non-standard SYCL built-in functions -------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#pragma once
-
-#include <sycl/builtins.hpp>
-#include <sycl/detail/builtins.hpp>
-#include <sycl/detail/generic_type_lists.hpp>
-#include <sycl/detail/generic_type_traits.hpp>
-#include <sycl/detail/type_traits.hpp>
-
-#include <CL/__spirv/spirv_ops.hpp>
-#include <sycl/ext/oneapi/bfloat16.hpp>
-
-// TODO Decide whether to mark functions with this attribute.
-#define __NOEXC /*noexcept*/
-
-#ifdef __SYCL_DEVICE_ONLY__
-#define __SYCL_CONSTANT_AS __attribute__((opencl_constant))
-#else
-#define __SYCL_CONSTANT_AS
-#endif
-
-namespace sycl {
-__SYCL_INLINE_VER_NAMESPACE(_V1) {
-namespace ext {
-namespace oneapi {
-namespace experimental {
-namespace detail {
-template <size_t N>
-uint32_t to_uint32_t(sycl::marray<bfloat16, N> x, size_t start) {
-  uint32_t res;
-  std::memcpy(&res, &x[start], sizeof(uint32_t));
-  return res;
-}
-} // namespace detail
-
-// Provides functionality to print data from kernels in a C way:
-// - On non-host devices this function is directly mapped to printf from
-//   OpenCL C
-// - On host device, this function should be equivalent to standard printf
-//   function from C/C++.
-//
-// Please refer to corresponding section in OpenCL C specification to find
-// information about format string and its differences from standard C rules.
-//
-// This function is placed under 'experimental' namespace on purpose, because it
-// has too much caveats you need to be aware of before using it. Please find
-// them below and read carefully before using it:
-//
-// - According to the OpenCL spec, the format string must be
-// resolvable at compile time i.e. cannot be dynamically created by the
-// executing program.
-//
-// - According to the OpenCL spec, the format string must reside in constant
-// address space. The constant address space declarations might get "tricky",
-// see test/built-ins/printf.cpp for examples.
-// In simple cases (compile-time known string contents, direct declaration of
-// the format literal inside the printf call, etc.), the compiler should handle
-// the automatic address space conversion.
-// FIXME: Once the extension to generic address space is fully supported, the
-// constant AS version may need to be deprecated.
-//
-// - The format string is interpreted according to the OpenCL C spec, where all
-// data types has fixed size, opposed to C++ types which doesn't guarantee
-// the exact width of particular data types (except, may be, char). This might
-// lead to unexpected result, for example: %ld in OpenCL C means that printed
-// argument has 'long' type which is 64-bit wide by the OpenCL C spec. However,
-// by C++ spec long is just at least 32-bit wide, so, you need to ensure (by
-// performing a cast, for example) that if you use %ld specifier, you pass
-// 64-bit argument to the sycl::experimental::printf
-//
-// - OpenCL spec defines several additional features, like, for example, 'v'
-// modifier which allows to print OpenCL vectors: note that these features are
-// not available on host device and therefore their usage should be either
-// guarded using __SYCL_DEVICE_ONLY__ preprocessor macro or avoided in favor
-// of more portable solutions if needed
-//
-template <typename FormatT, typename... Args>
-int printf(const FormatT *__format, Args... args) {
-#if defined(__SYCL_DEVICE_ONLY__) && defined(__SPIR__)
-  return __spirv_ocl_printf(__format, args...);
-#else
-  return ::printf(__format, args...);
-#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__SPIR__)
-}
-
-namespace native {
-
-// genfloatfh tanh (genfloatfh x)
-template <typename T>
-inline __SYCL_ALWAYS_INLINE
-    sycl::detail::enable_if_t<sycl::detail::is_genfloatf<T>::value ||
-                                  sycl::detail::is_genfloath<T>::value,
-                              T>
-    tanh(T x) __NOEXC {
-#if defined(__NVPTX__)
-  using _ocl_T = sycl::detail::ConvertToOpenCLType_t<T>;
-  _ocl_T arg1 = sycl::detail::convertDataToType<T, _ocl_T>(x);
-  return sycl::detail::convertDataToType<_ocl_T, T>(__clc_native_tanh(arg1));
-#else
-  return __sycl_std::__invoke_tanh<T>(x);
-#endif
-}
-
-// genfloath exp2 (genfloath x)
-template <typename T>
-inline __SYCL_ALWAYS_INLINE
-    sycl::detail::enable_if_t<sycl::detail::is_genfloath<T>::value, T>
-    exp2(T x) __NOEXC {
-#if defined(__NVPTX__)
-  using _ocl_T = sycl::detail::ConvertToOpenCLType_t<T>;
-  _ocl_T arg1 = sycl::detail::convertDataToType<T, _ocl_T>(x);
-  return sycl::detail::convertDataToType<_ocl_T, T>(__clc_native_exp2(arg1));
-#else
-  return __sycl_std::__invoke_exp2<T>(x);
-#endif
-}
-
-} // namespace native
-
-template <typename T>
-std::enable_if_t<std::is_same<T, bfloat16>::value, T> fabs(T x) {
-#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
-  return sycl::bit_cast<bfloat16>(__clc_fabs(x.raw()));
-#else
-  std::ignore = x;
-  throw runtime_error("bfloat16 is not currently supported on the host device.",
-                      PI_ERROR_INVALID_DEVICE);
-#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
-}
-
-template <size_t N>
-sycl::marray<bfloat16, N> fabs(sycl::marray<bfloat16, N> x) {
-#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
-  sycl::marray<bfloat16, N> res;
-
-  for (size_t i = 0; i < N / 2; i++) {
-    auto partial_res = __clc_fabs(detail::to_uint32_t(x, i * 2));
-    std::memcpy(&res[i * 2], &partial_res, sizeof(uint32_t));
-  }
-
-  if (N % 2) {
-    res[N - 1] = sycl::bit_cast<bfloat16>(__clc_fabs(x[N - 1].raw()));
-  }
-  return res;
-#else
-  std::ignore = x;
-  throw runtime_error("bfloat16 is not currently supported on the host device.",
-                      PI_ERROR_INVALID_DEVICE);
-#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
-}
-
-template <typename T>
-std::enable_if_t<std::is_same<T, bfloat16>::value, T> fmin(T x, T y) {
-#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
-  return sycl::bit_cast<bfloat16>(__clc_fmin(x.raw(), y.raw()));
-#else
-  std::ignore = x;
-  std::ignore = y;
-  throw runtime_error("bfloat16 is not currently supported on the host device.",
-                      PI_ERROR_INVALID_DEVICE);
-#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
-}
-
-template <size_t N>
-sycl::marray<bfloat16, N> fmin(sycl::marray<bfloat16, N> x,
-                               sycl::marray<bfloat16, N> y) {
-#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
-  sycl::marray<bfloat16, N> res;
-
-  for (size_t i = 0; i < N / 2; i++) {
-    auto partial_res = __clc_fmin(detail::to_uint32_t(x, i * 2),
-                                  detail::to_uint32_t(y, i * 2));
-    std::memcpy(&res[i * 2], &partial_res, sizeof(uint32_t));
-  }
-
-  if (N % 2) {
-    res[N - 1] =
-        sycl::bit_cast<bfloat16>(__clc_fmin(x[N - 1].raw(), y[N - 1].raw()));
-  }
-
-  return res;
-#else
-  std::ignore = x;
-  std::ignore = y;
-  throw runtime_error("bfloat16 is not currently supported on the host device.",
-                      PI_ERROR_INVALID_DEVICE);
-#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
-}
-
-template <typename T>
-std::enable_if_t<std::is_same<T, bfloat16>::value, T> fmax(T x, T y) {
-#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
-  return sycl::bit_cast<bfloat16>(__clc_fmax(x.raw(), y.raw()));
-#else
-  std::ignore = x;
-  std::ignore = y;
-  throw runtime_error("bfloat16 is not currently supported on the host device.",
-                      PI_ERROR_INVALID_DEVICE);
-#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
-}
-
-template <size_t N>
-sycl::marray<bfloat16, N> fmax(sycl::marray<bfloat16, N> x,
-                               sycl::marray<bfloat16, N> y) {
-#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
-  sycl::marray<bfloat16, N> res;
-
-  for (size_t i = 0; i < N / 2; i++) {
-    auto partial_res = __clc_fmax(detail::to_uint32_t(x, i * 2),
-                                  detail::to_uint32_t(y, i * 2));
-    std::memcpy(&res[i * 2], &partial_res, sizeof(uint32_t));
-  }
-
-  if (N % 2) {
-    res[N - 1] =
-        sycl::bit_cast<bfloat16>(__clc_fmax(x[N - 1].raw(), y[N - 1].raw()));
-  }
-  return res;
-#else
-  std::ignore = x;
-  std::ignore = y;
-  throw runtime_error("bfloat16 is not currently supported on the host device.",
-                      PI_ERROR_INVALID_DEVICE);
-#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
-}
-
-template <typename T>
-std::enable_if_t<std::is_same<T, bfloat16>::value, T> fma(T x, T y, T z) {
-#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
-  return sycl::bit_cast<bfloat16>(__clc_fma(x.raw(), y.raw(), z.raw()));
-#else
-  std::ignore = x;
-  std::ignore = y;
-  std::ignore = z;
-  throw runtime_error("bfloat16 is not currently supported on the host device.",
-                      PI_ERROR_INVALID_DEVICE);
-#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
-}
-
-template <size_t N>
-sycl::marray<bfloat16, N> fma(sycl::marray<bfloat16, N> x,
-                              sycl::marray<bfloat16, N> y,
-                              sycl::marray<bfloat16, N> z) {
-#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
-  sycl::marray<bfloat16, N> res;
-
-  for (size_t i = 0; i < N / 2; i++) {
-    auto partial_res =
-        __clc_fma(detail::to_uint32_t(x, i * 2), detail::to_uint32_t(y, i * 2),
-                  detail::to_uint32_t(z, i * 2));
-    std::memcpy(&res[i * 2], &partial_res, sizeof(uint32_t));
-  }
-
-  if (N % 2) {
-    res[N - 1] = sycl::bit_cast<bfloat16>(
-        __clc_fma(x[N - 1].raw(), y[N - 1].raw(), z[N - 1].raw()));
-  }
-  return res;
-#else
-  std::ignore = x;
-  std::ignore = y;
-  std::ignore = z;
-  throw runtime_error("bfloat16 is not currently supported on the host device.",
-                      PI_ERROR_INVALID_DEVICE);
-#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
-}
-
-} // namespace experimental
-} // namespace oneapi
-} // namespace ext
-} // __SYCL_INLINE_VER_NAMESPACE(_V1)
-} // namespace sycl
-
-#undef __SYCL_CONSTANT_AS
diff --git a/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp b/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp b/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp
new file mode 100755
index 0000000000000..e676cf1013295
--- /dev/null
+++ b/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp
@@ -0,0 +1,134 @@
+//==------ builtins.hpp - Non-standard SYCL built-in functions -------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <sycl/builtins.hpp>
+#include <sycl/detail/builtins.hpp>
+#include <sycl/detail/generic_type_lists.hpp>
+#include <sycl/detail/generic_type_traits.hpp>
+#include <sycl/detail/type_traits.hpp>
+
+#include <CL/__spirv/spirv_ops.hpp>
+#include <sycl/ext/oneapi/bfloat16.hpp>
+
+// TODO Decide whether to mark functions with this attribute.
+#define __NOEXC /*noexcept*/
+
+#ifdef __SYCL_DEVICE_ONLY__
+#define __SYCL_CONSTANT_AS __attribute__((opencl_constant))
+#else
+#define __SYCL_CONSTANT_AS
+#endif
+
+namespace sycl {
+__SYCL_INLINE_VER_NAMESPACE(_V1) {
+namespace ext {
+namespace oneapi {
+namespace experimental {
+
+namespace detail {
+template <size_t N>
+uint32_t to_uint32_t(sycl::marray<bfloat16, N> x, size_t start) {
+  uint32_t res;
+  std::memcpy(&res, &x[start], sizeof(uint32_t));
+  return res;
+}
+} // namespace detail
+
+// Provides functionality to print data from kernels in a C way:
+// - On non-host devices this function is directly mapped to printf from
+//   OpenCL C
+// - On host device, this function should be equivalent to standard printf
+//   function from C/C++.
+//
+// Please refer to corresponding section in OpenCL C specification to find
+// information about format string and its differences from standard C rules.
+//
+// This function is placed under 'experimental' namespace on purpose, because it
+// has too many caveats you need to be aware of before using it. Please find
+// them below and read carefully before using it:
+//
+// - According to the OpenCL spec, the format string must be
+// resolvable at compile time i.e. cannot be dynamically created by the
+// executing program.
+//
+// - According to the OpenCL spec, the format string must reside in constant
+// address space. The constant address space declarations might get "tricky",
+// see test/built-ins/printf.cpp for examples.
+// In simple cases (compile-time known string contents, direct declaration of
+// the format literal inside the printf call, etc.), the compiler should handle
+// the automatic address space conversion.
+// FIXME: Once the extension to generic address space is fully supported, the
+// constant AS version may need to be deprecated.
+//
+// - The format string is interpreted according to the OpenCL C spec, where all
+// data types have a fixed size, unlike C++ types, which do not guarantee
+// the exact width of particular data types (except, maybe, char). This might
+// lead to unexpected results. For example, %ld in OpenCL C means the printed
+// argument has type 'long', which is 64 bits wide per the OpenCL C spec, but
+// the C++ spec only requires long to be at least 32 bits wide. So if you use
+// the %ld specifier, make sure (by performing a cast, for example) that you
+// pass a 64-bit argument to sycl::ext::oneapi::experimental::printf.
+//
+// - The OpenCL spec defines several additional features, such as the 'v'
+// modifier, which allows printing OpenCL vectors. Note that these features are
+// not available on the host device, so their usage should either be guarded
+// with the __SYCL_DEVICE_ONLY__ preprocessor macro or avoided in favor of
+// more portable solutions.
+//
+template <typename FormatT, typename... Args>
+int printf(const FormatT *__format, Args... args) {
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__SPIR__)
+  return __spirv_ocl_printf(__format, args...);
+#else
+  return ::printf(__format, args...);
+#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__SPIR__)
+}
+
+namespace native {
+
+// genfloatfh tanh (genfloatfh x)
+template <typename T>
+inline __SYCL_ALWAYS_INLINE
+    sycl::detail::enable_if_t<sycl::detail::is_genfloatf<T>::value ||
+                                  sycl::detail::is_genfloath<T>::value,
+                              T>
+    tanh(T x) __NOEXC {
+#if defined(__NVPTX__)
+  using _ocl_T = sycl::detail::ConvertToOpenCLType_t<T>;
+  _ocl_T arg1 = sycl::detail::convertDataToType<T, _ocl_T>(x);
+  return sycl::detail::convertDataToType<_ocl_T, T>(__clc_native_tanh(arg1));
+#else
+  return __sycl_std::__invoke_tanh<T>(x);
+#endif
+}
+
+// genfloath exp2 (genfloath x)
+template <typename T>
+inline __SYCL_ALWAYS_INLINE
+    sycl::detail::enable_if_t<sycl::detail::is_genfloath<T>::value, T>
+    exp2(T x) __NOEXC {
+#if defined(__NVPTX__)
+  using _ocl_T = sycl::detail::ConvertToOpenCLType_t<T>;
+  _ocl_T arg1 = sycl::detail::convertDataToType<T, _ocl_T>(x);
+  return sycl::detail::convertDataToType<_ocl_T, T>(__clc_native_exp2(arg1));
+#else
+  return __sycl_std::__invoke_exp2<T>(x);
+#endif
+}
+
+} // namespace native
+
+} // namespace experimental
+} // namespace oneapi
+} // namespace ext
+} // __SYCL_INLINE_VER_NAMESPACE(_V1)
+} // namespace sycl
+
+#undef __SYCL_CONSTANT_AS
diff --git a/sycl/include/sycl/sycl.hpp b/sycl/include/sycl/sycl.hpp
index fcdc9643ff4d4..39007301ab778 100644
--- a/sycl/include/sycl/sycl.hpp
+++ b/sycl/include/sycl/sycl.hpp
@@ -59,9 +59,10 @@
 #if SYCL_EXT_ONEAPI_BACKEND_LEVEL_ZERO
 #include <sycl/ext/oneapi/backend/level_zero.hpp>
 #endif
-#include <sycl/ext/oneapi/builtins.hpp>
 #include <sycl/ext/oneapi/device_global/device_global.hpp>
 #include <sycl/ext/oneapi/device_global/properties.hpp>
+#include <sycl/ext/oneapi/experimental/bfloat16_math.hpp>
+#include <sycl/ext/oneapi/experimental/builtins.hpp>
 #include <sycl/ext/oneapi/experimental/cuda/barrier.hpp>
 #include <sycl/ext/oneapi/filter_selector.hpp>
 #include <sycl/ext/oneapi/group_algorithm.hpp>

From b9accad4958b12a7ccbc6560f9c794eec33795bb Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Fri, 16 Sep 2022 15:49:41 -0700
Subject: [PATCH 21/63] Cleanup of bfloat16_math extension.

---
 .../sycl_ext_oneapi_bfloat16_math.asciidoc    | 92 +++++++++++--------
 .../sycl_ext_oneapi_bfloat16.asciidoc         |  4 -
 sycl/include/sycl/ext/oneapi/bfloat16.hpp     |  3 -
 sycl/include/sycl/feature_test.hpp.in         |  2 +-
 sycl/source/detail/device_info.hpp            | 13 +--
 5 files changed, 60 insertions(+), 54 deletions(-)

diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math.asciidoc
index 622ae302d5e47..fe860015977a3 100644
--- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math.asciidoc
+++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math.asciidoc
@@ -22,43 +22,73 @@
 
 == Notice
 
-IMPORTANT: This specification is a draft.
+Copyright © 2022-2022 Intel Corporation. All rights reserved.
 
-Copyright (c) 2021-2022 Intel Corporation. All rights reserved.
+Khronos® is a registered trademark and SYCL™ and SPIR™ are trademarks of The Khronos Group Inc. OpenCL™ is a trademark of Apple Inc. used by permission by Khronos.
 
-NOTE: Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are
-trademarks of The Khronos Group Inc.  OpenCL(TM) is a trademark of Apple Inc.
-used by permission by Khronos.
+== Contact
 
-== Dependencies
+To report problems with this extension, please open a new issue at:
 
-This extension is written against the SYCL 2020 specification, Revision 5.
+https://github.com/intel/llvm/issues
 
-== Status
+== Dependencies
 
-Draft
+This extension is written against the SYCL 2020 revision 5 specification. All references below to the "core SYCL specification" or to section numbers in the SYCL specification refer to that revision.
 
-This is a preview extension specification, intended to provide early access to
-a feature for review and community feedback. When the feature matures, this
-specification may be released as a formal extension.
+This extension depends on the following other SYCL extension:
 
-Because the interfaces defined by this specification are not final and are
-subject to change they are not intended to be used by shipping software
-products.
+* sycl_ext_oneapi_bfloat16
 
-== Version
+== Status
 
-Revision: 1
+This is an experimental extension specification, intended to provide early
+access to features and gather community feedback. Interfaces defined in this
+specification are implemented in DPC\++ but they are not finalized and may change incompatibly in future versions of DPC++ without prior notice.
+Shipping software products should not rely on APIs defined in this specification.
 
-== Introduction
+== Overview
 
 This extension adds `bfloat16` support to the `fma`, `fmin`, `fmax` and `fabs` SYCL floating point math functions. These functions can be used as element wise operations on matrices, supplementing the `bfloat16` support in the sycl_ext_oneapi_matrix extension.
 
 The descriptions of the `fma`, `fmin`, `fmax` and `fabs` SYCL floating point math functions can be found in the SYCL specification: https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#_math_functions.
 
+== Specification
+
+=== Feature test macro
+
+This extension provides a feature-test macro as described in the core SYCL
+specification section 6.3.3 "Feature test macros". Therefore, an implementation
+supporting this extension must predefine the macro 
+`SYCL_EXT_ONEAPI_BFLOAT16_MATH` to one of the values defined in the table
+below. Applications can test for the existence of this macro to determine if
+the implementation supports this feature, or applications can test the macro's value to determine which of the extension's APIs the implementation supports.
+ 
+[%header,cols="1,5"]
+|===
+|Value |Description
+|1     |Initial extension version. Base features are supported.
+|===   
+
+=== Extension to `enum class aspect`
+
+[source]
+----
+namespace sycl {
+enum class aspect {
+  ...
+  ext_oneapi_bfloat16
+}
+}
+----
+
+If a SYCL device has the `ext_oneapi_bfloat16` aspect, then it supports the `bfloat16` math functions described in the next section.
+
+=== Math Functions
+
 The following functions are only available when `T` is `bfloat16` or `sycl::marray<bfloat16, {N}>`, where `{N}` means any positive value of `size_t` type.
 
-=== fma
+==== fma
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
@@ -68,12 +98,12 @@ T fma(T a, T b, T c);
 } // namespace sycl::ext::oneapi::experimental
 ```
 
-==== Description
+===== Description
 
 Returns the correctly rounded floating-point representation of the sum of `c` with the infinitely precise product of `a` and `b`.
 Rounding of intermediate products shall not occur. The mantissa LSB rounds to the nearest even. Subnormal numbers are supported.
 
-=== fmax
+==== fmax
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
@@ -82,7 +112,7 @@ T fmax(T x, T y);
 } // namespace sycl::ext::oneapi::experimental
 ```
 
-==== Description
+===== Description
 
 Returns `y` if
 `x < y`, otherwise it
@@ -91,7 +121,7 @@ NaN, `fmax()` returns the other
 argument. If both arguments are
 NaNs, `fmax()` returns a NaN.
 
-=== fmin
+==== fmin
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
@@ -100,7 +130,7 @@ T fmin(T x, T y);
 } // namespace sycl::ext::oneapi::experimental
 ```
 
-==== Description
+===== Description
 
 Returns `y` if
 `y < x`, otherwise it
@@ -109,7 +139,7 @@ NaN, `fmax()` returns the other
 argument. If both arguments are
 NaNs, `fmax()` returns a NaN.
 
-=== fabs
+==== fabs
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
@@ -118,7 +148,7 @@ T fabs(T x);
 } // namespace sycl::ext::oneapi::experimental
 ```
 
-==== Description
+===== Description
 
 Compute absolute value of a `bfloat16`.
 
@@ -127,13 +157,3 @@ Compute absolute value of a `bfloat16`.
 1. The CUDA backend does not have a use case that would necessitate support of the `vec` class in bfloat16 math functions, and `marray` would always be preferred over `vec` if `vec` support were to be added in the CUDA backend. For portability reasons, support for the `vec` class can be easily added if other backends require it.
 
 2. We should decide on a roadmap to extend support of `bfloat16` to other SYCL 2020 math functions.
-
-== Revision History
-
-[cols="5,15,15,70"]
-[grid="rows"]
-[options="header"]
-|========================================
-|Rev|Date|Author|Changes
-|1|2022-09-15|Rajiv Deodhar |Initial public working draft
-|========================================
diff --git a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
index 8e369b0f34b29..b98f2c48e9db3 100644
--- a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
+++ b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
@@ -102,10 +102,6 @@ public:
   bfloat16(const bfloat16 &) = default;
   ~bfloat16() = default;
 
-  // Explicit conversion functions
-  static storage_t from_float(const float &a);
-  static float to_float(const storage_t &a);
-
   // Convert from float to bfloat16
   bfloat16(const float &a);
   bfloat16 &operator=(const float &a);
diff --git a/sycl/include/sycl/ext/oneapi/bfloat16.hpp b/sycl/include/sycl/ext/oneapi/bfloat16.hpp
index 7caa7718993ff..802190a307121 100644
--- a/sycl/include/sycl/ext/oneapi/bfloat16.hpp
+++ b/sycl/include/sycl/ext/oneapi/bfloat16.hpp
@@ -114,9 +114,6 @@ class bfloat16 {
   // Implicit conversion from bfloat16 to sycl::half
   operator sycl::half() const { return to_float(value); }
 
-  // Get raw bits representation of bfloat16
-  storage_t raw() const { return value; }
-
   // Logical operators (!,||,&&) are covered if we can cast to bool
   explicit operator bool() { return to_float(value) != 0.0f; }
 
diff --git a/sycl/include/sycl/feature_test.hpp.in b/sycl/include/sycl/feature_test.hpp.in
index 59d2ebed77e39..8a3a94fe3a3ef 100644
--- a/sycl/include/sycl/feature_test.hpp.in
+++ b/sycl/include/sycl/feature_test.hpp.in
@@ -55,7 +55,6 @@ __SYCL_INLINE_VER_NAMESPACE(_V1) {
 #define SYCL_EXT_ONEAPI_SUB_GROUP 1
 #define SYCL_EXT_ONEAPI_PROPERTIES 1
 #define SYCL_EXT_ONEAPI_NATIVE_MATH 1
-#define SYCL_EXT_ONEAPI_BFLOAT16 1
 #define SYCL_EXT_INTEL_DATAFLOW_PIPES 1
 #ifdef __clang__
 #if __has_extension(sycl_extended_atomics)
@@ -74,6 +73,7 @@ __SYCL_INLINE_VER_NAMESPACE(_V1) {
 #cmakedefine01 SYCL_BUILD_PI_CUDA
 #if SYCL_BUILD_PI_CUDA
 #define SYCL_EXT_ONEAPI_BACKEND_CUDA 1
+#define SYCL_EXT_ONEAPI_BFLOAT16_MATH 1
 #endif
 #cmakedefine01 SYCL_BUILD_PI_ESIMD_EMULATOR
 #if SYCL_BUILD_PI_ESIMD_EMULATOR
diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp
index 6f00fe20cd001..70fe8ba5c483a 100644
--- a/sycl/source/detail/device_info.hpp
+++ b/sycl/source/detail/device_info.hpp
@@ -283,16 +283,9 @@ struct get_device_info_impl<std::vector<memory_scope>,
 template <>
 struct get_device_info_impl<bool, info::device::ext_oneapi_bfloat16> {
   static bool get(RT::PiDevice dev, const plugin &Plugin) {
-
-    bool result = false;
-
-    RT::PiResult Err = Plugin.call_nocheck<PiApiKind::piDeviceGetInfo>(
-        dev, PiInfoCode<info::device::ext_oneapi_bfloat16>::value,
-        sizeof(result), &result, nullptr);
-    if (Err != PI_SUCCESS) {
-      return false;
-    }
-    return result;
+    // This aspect indicates support for bfloat16 math functions,
+    // which we don't do yet.
+    return false;
   }
 };
 

From ca7880a8bb847e4a8c340e4bc1448410d6ac3763 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Mon, 19 Sep 2022 10:14:40 -0700
Subject: [PATCH 22/63] Document updates and minor changes.

---
 libdevice/fallback-bfloat16.cpp               |  4 +--
 .../sycl_ext_oneapi_bfloat16.asciidoc         | 34 ++++---------------
 2 files changed, 7 insertions(+), 31 deletions(-)

diff --git a/libdevice/fallback-bfloat16.cpp b/libdevice/fallback-bfloat16.cpp
index 0f92b1bcfa728..e5596ff4871dd 100755
--- a/libdevice/fallback-bfloat16.cpp
+++ b/libdevice/fallback-bfloat16.cpp
@@ -18,12 +18,10 @@
 // TODO: generate the DeviceLibFuncMap in sycl-post-link.cpp automatically
 // during the build based on libdevice to avoid manually sync.
 
-extern "C" SYCL_EXTERNAL int __builtin_spirv_OpIsNan_f32(float);
-
 DEVICE_EXTERN_C_INLINE uint16_t
 __devicelib_ConvertFToBF16INTEL(const float &a) {
   // In case float value is nan - propagate bfloat16's qnan
-  if (__builtin_spirv_OpIsNan_f32(a))
+  if (__spirv_IsNan(a))
     return 0xffc1;
   union {
     uint32_t intStorage;
diff --git a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
index b98f2c48e9db3..fda2f91f16c05 100644
--- a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
+++ b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
@@ -274,40 +274,18 @@ int main(int argc, char *argv[]) {
   queue deviceQueue{dev};
   buffer<float, 1> buf{data, 3};
 
-  if (dev.has(aspect::ext_oneapi_bfloat16)) {
-    deviceQueue.submit([&](handler &cgh) {
-      accessor numbers{buf, cgh, read_write};
-      cgh.single_task([=]() { numbers[2] = foo(numbers[0], numbers[1]); });
-    });
-  } else {
-    std::cout << "No bfloat16 support\n";
-    return 1;
-  }
+  deviceQueue.submit([&](handler &cgh) {
+    accessor numbers{buf, cgh, read_write};
+    cgh.single_task([=]() { numbers[2] = foo(numbers[0], numbers[1]); });
+  });
+
   host_accessor hostOutAcc{buf, read_only};
   std::cout << "Result = " << hostOutAcc[2] << std::endl;
+  
   return 0;
 }
 ----
 
-== Deprecated Features
-
-=== Extension to `enum class aspect`
-
-[source]
-----
-namespace sycl {
-enum class aspect {
-  ...
-  ext_oneapi_bfloat16
-}
-}
-----
-
-This extension adds a new aspect named `ext_oneapi_bfloat16`, but usage of this
-aspect is deprecated. It used to indicate whether a device supports `bfloat16`,
-but all devices are now required to support `bfloat16` when an implementation
-supports this extension. Therefore, this aspect now returns true for all devices.
-
 == Revision History
 
 [cols="5,15,15,70"]

From dc3b2b5ce1c1fd3d5ce8b5e993e00227dc7db3fd Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Mon, 19 Sep 2022 13:17:11 -0700
Subject: [PATCH 23/63] Fixes for long lines in doc, a different way to check
 for NaN.

---
 sycl/include/sycl/ext/oneapi/bfloat16.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sycl/include/sycl/ext/oneapi/bfloat16.hpp b/sycl/include/sycl/ext/oneapi/bfloat16.hpp
index 802190a307121..5d58ec4963296 100644
--- a/sycl/include/sycl/ext/oneapi/bfloat16.hpp
+++ b/sycl/include/sycl/ext/oneapi/bfloat16.hpp
@@ -42,9 +42,9 @@ class bfloat16 {
 #if (__CUDA_ARCH__ >= 800)
     return __nvvm_f2bf16_rn(a);
 #else
-    // TODO std::isnan not defined in device code
-    // if (std::isnan(a))
-    // return 0xffc1;
+    // TODO find a better way to check for NaN
+    if (a != a)
+      return 0xffc1;
     union {
       uint32_t intStorage;
       float floatValue;

From 1aa6ad30b35a923222236da0ad8e5095f83d97b1 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Tue, 20 Sep 2022 16:32:09 -0700
Subject: [PATCH 24/63] Broke long lines into multiple lines.

---
 .../sycl_ext_oneapi_bfloat16.asciidoc         | 35 +++++++++++--------
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
index fda2f91f16c05..f04bd92de0b63 100644
--- a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
+++ b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
@@ -52,15 +52,17 @@ This extension adds support for a 16-bit floating point type `bfloat16`.
 This type occupies 16 bits of storage space as does the `sycl::half` type.
 However, `bfloat16` allots 8 bits to the exponent instead of the 5 bits used by
 `sycl::half` and 7 bits to the significand versus 10 bits used by `sycl::half`.
-Thus, `bfloat16` has the same dynamic range as a 32-bit `float` but with reduced
-precision. This type is useful when memory required to store the values must be
-reduced, and when the calculations require high dynamic range but can tolerate
-lower-precision. Some implementations may still perform operations on this type
-using 32-bit math. For example, they may convert the `bfloat16` value to
-`float`, and then perform the operation on the 32-bit `float`.
+Thus, `bfloat16` has the same dynamic range as a 32-bit `float` but with
+reduced precision. This type is useful when memory required to store the values
+must be reduced, and when the calculations require high dynamic range but can
+tolerate lower-precision. Some implementations may still perform operations
+ on this type using 32-bit math. For example, they may convert the `bfloat16`
+ value to `float`, and then perform the operation on the 32-bit `float`.
 
 [NOTE]
-The bfloat16 type is supported on all devices. DPC++ currently supports this type natively on Intel Xe HP GPUs and Nvidia GPUs with Compute Capability >= SM80. On other devices it is emulated in software.
+The bfloat16 type is supported on all devices. DPC++ currently supports this
+type natively on Intel Xe HP GPUs and Nvidia GPUs with
+Compute Capability >= SM80. On other devices it is emulated in software.
 
 == Specification
 
@@ -95,7 +97,6 @@ namespace ext {
 namespace oneapi {
 
 class bfloat16 {
-using storage_t = uint16_t;
 
 public:
   bfloat16() = default;
@@ -131,14 +132,16 @@ public:
   friend bfloat16 &operatorOP(bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ }
 
   // OP is +, -, *, /
-  friend bfloat16 operatorOP(const bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ }
+  friend bfloat16 operatorOP(const bfloat16 &lhs, const bfloat16 &rhs)
+  { /* ... */ }
   template <typename T>
   friend bfloat16 operatorOP(const bfloat16 &lhs, const T &rhs) { /* ... */ }
   template <typename T>
   friend bfloat16 operatorOP(const T &lhs, const bfloat16 &rhs) { /* ... */ }
 
   // OP is ==,!=, <, >, <=, >=
-  friend bool operatorOP(const bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ }
+  friend bool operatorOP(const bfloat16 &lhs, const bfloat16 &rhs)
+  { /* ... */ }
   template <typename T>
   friend bool operatorOP(const bfloat16 &lhs, const T &rhs) { /* ... */ }
   template <typename T>
@@ -192,13 +195,15 @@ Table 1. Member functions of `bfloat16` class.
 
   OP is: `++, --`
 
-| `friend bfloat16 operatorOP(const bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ }`
+| `friend bfloat16 operatorOP(const bfloat16 &lhs, const bfloat16 &rhs)
+{ /* ... */ }`
 | Perform an in-place `OP` arithmetic operation between the `lhs` and the `rhs`
   and return the `lhs`.
 
   OP is: `+=, -=, *=, /=`
 
-| `friend type operatorOP(const bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ }`
+| `friend type operatorOP(const bfloat16 &lhs, const bfloat16 &rhs)
+{ /* ... */ }`
 | Construct a new instance of the `bfloat16` class with the value of the new
   `bfloat16` instance being the result of an OP arithmetic operation between
   the `lhs` `bfloat16` and `rhs` `bfloat16` values.
@@ -223,7 +228,8 @@ Table 1. Member functions of `bfloat16` class.
 
   OP is `+, -, *, /`
 
-| `friend bool operatorOP(const bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ }`
+| `friend bool operatorOP(const bfloat16 &lhs, const bfloat16 &rhs)
+{ /* ... */ }`
 | Perform comparison operation OP between `lhs` `bfloat16` and `rhs` `bfloat16`
   values and return the result as a boolean value.
 
@@ -298,7 +304,8 @@ int main(int argc, char *argv[]) {
                              Add operator overloadings +
                              Apply code review suggestions
 |3|2021-08-18|Alexey Sotkin |Remove `uint16_t` constructor
-|4|2022-03-07|Aidan Belton and Jack Kirk |Switch from Intel vendor specific to oneapi
+|4|2022-03-07|Aidan Belton and Jack Kirk |Switch from Intel vendor specific
+ to oneapi
 |5|2022-04-05|Jack Kirk | Added section for bfloat16 math builtins
 |6|2022-09-15|Rajiv Deodhar |Move bfloat16 from experimental to supported
 and leave math functions as experimental

From 802f5020f7ab09f5c76370da92686231f91f20c8 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Wed, 21 Sep 2022 16:56:35 -0700
Subject: [PATCH 25/63] Changed library order on Windows.

---
 libdevice/cmake/modules/SYCLLibdevice.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libdevice/cmake/modules/SYCLLibdevice.cmake b/libdevice/cmake/modules/SYCLLibdevice.cmake
index a23e462d1e61e..9fc004aa26378 100644
--- a/libdevice/cmake/modules/SYCLLibdevice.cmake
+++ b/libdevice/cmake/modules/SYCLLibdevice.cmake
@@ -114,10 +114,10 @@ add_devicelib_obj(libsycl-cmath SRC cmath_wrapper.cpp DEP ${cmath_obj_deps})
 add_devicelib_obj(libsycl-cmath-fp64 SRC cmath_wrapper_fp64.cpp DEP ${cmath_obj_deps} )
 add_devicelib_obj(libsycl-imf SRC imf_wrapper.cpp DEP ${imf_obj_deps})
 add_devicelib_obj(libsycl-imf-fp64 SRC imf_wrapper_fp64.cpp DEP ${imf_obj_deps})
+add_devicelib_obj(libsycl-bfloat16 SRC bfloat16_wrapper.cpp DEP ${cmath_obj_deps} )
 if(WIN32)
 add_devicelib_obj(libsycl-msvc-math SRC msvc_math.cpp DEP ${cmath_obj_deps})
 endif()
-add_devicelib_obj(libsycl-bfloat16 SRC bfloat16_wrapper.cpp DEP ${cmath_obj_deps} )
 
 add_fallback_devicelib(libsycl-fallback-cassert SRC fallback-cassert.cpp DEP ${crt_obj_deps} EXTRA_ARGS -fno-sycl-instrument-device-code)
 add_fallback_devicelib(libsycl-fallback-cstring SRC fallback-cstring.cpp DEP ${crt_obj_deps})

From 190f2a3180f00be81d66e2add3bf77478d3f1acf Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Thu, 22 Sep 2022 14:16:49 -0700
Subject: [PATCH 26/63] Fix for AOT compilation and correction to new headers.

---
 clang/lib/Driver/Driver.cpp                    | 6 ++++--
 sycl/include/sycl/ext/intel/esimd/xmx/dpas.hpp | 7 +++----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 78c5172148b57..cb84ce9e0c644 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -5064,7 +5064,8 @@ class OffloadingActionBuilder final {
       // of "internal" libraries cannot be affected via -fno-sycl-device-lib.
       llvm::StringMap<bool> devicelib_link_info = {
           {"libc", true},        {"libm-fp32", true},   {"libm-fp64", true},
-          {"libimf-fp32", true}, {"libimf-fp64", true}, {"internal", true}};
+          {"libimf-fp32", true}, {"libimf-fp64", true}, {"libm-bfloat16", true},
+          {"internal", true}};
       if (Arg *A = Args.getLastArg(options::OPT_fsycl_device_lib_EQ,
                                    options::OPT_fno_sycl_device_lib_EQ)) {
         if (A->getValues().size() == 0)
@@ -5122,7 +5123,8 @@ class OffloadingActionBuilder final {
           {"libsycl-fallback-cmath", "libm-fp32"},
           {"libsycl-fallback-cmath-fp64", "libm-fp64"},
           {"libsycl-fallback-imf", "libimf-fp32"},
-          {"libsycl-fallback-imf-fp64", "libimf-fp64"}};
+          {"libsycl-fallback-imf-fp64", "libimf-fp64"},
+          {"libsycl-native-bfloat16", "libm-bfloat16"}};
       // ITT annotation libraries are linked in separately whenever the device
       // code instrumentation is enabled.
       const SYCLDeviceLibsList sycl_device_annotation_libs = {
diff --git a/sycl/include/sycl/ext/intel/esimd/xmx/dpas.hpp b/sycl/include/sycl/ext/intel/esimd/xmx/dpas.hpp
index 258a7393e2d34..bdc309dd17b10 100644
--- a/sycl/include/sycl/ext/intel/esimd/xmx/dpas.hpp
+++ b/sycl/include/sycl/ext/intel/esimd/xmx/dpas.hpp
@@ -14,7 +14,7 @@
 #include <sycl/ext/intel/esimd/detail/types.hpp>
 #include <sycl/ext/intel/esimd/xmx/common.hpp>
 #include <sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp>
-#include <sycl/ext/oneapi/experimental/bfloat16.hpp>
+#include <sycl/ext/oneapi/bfloat16.hpp>
 
 namespace sycl {
 __SYCL_INLINE_VER_NAMESPACE(_V1) {
@@ -27,8 +27,7 @@ template <typename T> constexpr dpas_argument_type dpas_precision_from_type() {
   // TODO: add support for tfloat32 here.
   if constexpr (std::is_same_v<T, sycl::half>)
     return dpas_argument_type::FP16;
-  else if constexpr (std::is_same_v<T,
-                                    sycl::ext::oneapi::experimental::bfloat16>)
+  else if constexpr (std::is_same_v<T, sycl::ext::oneapi::bfloat16>)
     return dpas_argument_type::BF16;
   else if constexpr (std::is_same_v<T, unsigned char>)
     return dpas_argument_type::U8;
@@ -143,7 +142,7 @@ constexpr int verify_parameters_and_deduce_exec_size() {
     }
   } else if constexpr (APrecision == dpas_argument_type::BF16 ||
                        BPrecision == dpas_argument_type::BF16) {
-    using bfloat16 = sycl::ext::oneapi::experimental::bfloat16;
+    using bfloat16 = sycl::ext::oneapi::bfloat16;
     if constexpr (ExecutionSize == 8) {
       static_assert(APrecision == BPrecision &&
                         __ESIMD_DNS::is_type<T, float, bfloat16>() &&

From 84c50f3c356efd84c1202994fb3081ae25360fdf Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Thu, 22 Sep 2022 20:26:24 -0700
Subject: [PATCH 27/63] Noted AOT limitation in doc.

---
 .../supported/sycl_ext_oneapi_bfloat16.asciidoc        | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
index f04bd92de0b63..abd2ce88b4ce4 100644
--- a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
+++ b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
@@ -43,8 +43,16 @@ This extension is written against the SYCL 2020 specification, Revision 5.
 
 == Status
 
-This extension is implemented and fully supported by DPC++.
+This extension is implemented and fully supported by DPC++, with the
+limitation noted below.
 
+[NOTE]
+This extension has limited support in ahead-of-time (AOT) compilation mode when
+the target device is an Intel GPU.  Software emulation of this feature does not
+work in this case, so AOT mode is only supported if the target device has
+native support for `bfloat16`. Currently, this is limited to Intel Xe HP GPUs.
+The DPC++ implementation has no such limitation in just-in-time (JIT)
+compilation mode.
 
 == Overview
 

From df058bae1740c5e52ee51455570606d4177b372a Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Sat, 24 Sep 2022 13:41:50 -0700
Subject: [PATCH 28/63] Adjustment for AOT compilation.

---
 .../program_manager/program_manager.cpp       | 22 +++++++++----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp
index d53a4477666ef..da026c84576fc 100644
--- a/sycl/source/detail/program_manager/program_manager.cpp
+++ b/sycl/source/detail/program_manager/program_manager.cpp
@@ -773,7 +773,7 @@ static const char *getDeviceLibFilename(DeviceLibExt Extension) {
   case DeviceLibExt::cl_intel_devicelib_imf_fp64:
     return "libsycl-fallback-imf-fp64.spv";
   case DeviceLibExt::cl_intel_devicelib_bfloat16:
-    return "libsycl-fallback-bfloat16.spv";
+    return "libsycl-native-bfloat16.spv";
   }
   throw compile_program_error("Unhandled (new?) device library extension",
                               PI_ERROR_INVALID_OPERATION);
@@ -807,12 +807,13 @@ static const char *getDeviceLibExtensionStr(DeviceLibExt Extension) {
 static RT::PiProgram loadDeviceLibFallback(const ContextImplPtr Context,
                                            DeviceLibExt Extension,
                                            const RT::PiDevice &Device,
-                                           bool UseNativeLib) {
+                                           bool UseFallbackLib) {
 
   const char *LibFileName = getDeviceLibFilename(Extension);
   std::string LibFileNameStr(LibFileName);
-  if (UseNativeLib) {
-    LibFileNameStr.replace(8, 8, "native");
+  if (UseFallbackLib) {
+    // Replace "native" with "fallback".
+    LibFileNameStr.replace(8, 6, "fallback");
     LibFileName = LibFileNameStr.c_str();
   }
 
@@ -1008,14 +1009,13 @@ getDeviceLibPrograms(const ContextImplPtr Context, const RT::PiDevice &Device,
     bool DeviceSupports = DevExtList.npos != DevExtList.find(ExtStr);
 
     if (!DeviceSupports || InhibitNativeImpl) {
-      Programs.push_back(loadDeviceLibFallback(Context, Ext, Device, false));
+      // Driver always links native bfloat16 library so that AOT will work for
+      // PVC. If device does not support bfloat16 then we replace native with
+      // fallback library.
+      Programs.push_back(loadDeviceLibFallback(
+          Context, Ext, Device,
+          Ext == DeviceLibExt::cl_intel_devicelib_bfloat16));
       FallbackIsLoaded = true;
-    } else {
-      // bfloat16 needs native library if device supports it
-      if (Ext == DeviceLibExt::cl_intel_devicelib_bfloat16) {
-        Programs.push_back(loadDeviceLibFallback(Context, Ext, Device, true));
-        FallbackIsLoaded = true;
-      }
     }
   }
   return Programs;

From fed4d1d3a7d9b766924a324489348a1b4ff6adc7 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Sun, 25 Sep 2022 21:28:51 -0700
Subject: [PATCH 29/63] Fixes for AOT builds.

---
 clang/lib/Driver/Driver.cpp                   | 54 ++++++++++++++++++-
 .../program_manager/program_manager.cpp       | 23 ++++----
 2 files changed, 64 insertions(+), 13 deletions(-)

diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index cb84ce9e0c644..1c4744a424173 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -5051,6 +5051,42 @@ class OffloadingActionBuilder final {
       }
     }
 
+    bool hasNativeBfloat16() {
+      const OptTable &Opts = C.getDriver().getOpts();
+      const char *DeviceOpt = nullptr;
+      for (auto *A : Args) {
+        llvm::Triple *TargetBE = nullptr;
+
+        auto GetTripleIt = [&, this](llvm::StringRef Triple) {
+          llvm::Triple TargetTriple{Triple};
+          auto TripleIt = llvm::find_if(SYCLTripleList, [&](auto &SYCLTriple) {
+            return SYCLTriple == TargetTriple;
+          });
+          return TripleIt != SYCLTripleList.end() ? &*TripleIt : nullptr;
+        };
+
+        if (A->getOption().matches(options::OPT_Xsycl_backend_EQ)) {
+          // Passing device args: -Xsycl-target-backend=<triple> -opt=val.
+          TargetBE = GetTripleIt(A->getValue(0));
+          if (TargetBE)
+            DeviceOpt = A->getValue(1);
+          else
+            continue;
+        } else if (A->getOption().matches(options::OPT_Xsycl_backend)) {
+          // Passing device args: -Xsycl-target-backend -opt=val.
+          TargetBE = &SYCLTripleList.front();
+          DeviceOpt = A->getValue(0);
+        } else {
+          continue;
+        };
+      }
+      if (DeviceOpt) {
+        if (strstr(DeviceOpt, "pvc") || strstr(DeviceOpt, "ats"))
+          return true;
+      }
+      return false;
+    }
+
     bool addSYCLDeviceLibs(const ToolChain *TC, ActionList &DeviceLinkObjects,
                            bool isSpirvAOT, bool isMSVCEnv) {
       struct DeviceLibOptInfo {
@@ -5123,7 +5159,10 @@ class OffloadingActionBuilder final {
           {"libsycl-fallback-cmath", "libm-fp32"},
           {"libsycl-fallback-cmath-fp64", "libm-fp64"},
           {"libsycl-fallback-imf", "libimf-fp32"},
-          {"libsycl-fallback-imf-fp64", "libimf-fp64"},
+          {"libsycl-fallback-imf-fp64", "libimf-fp64"}};
+      const SYCLDeviceLibsList sycl_device_bfloat16_fallback_lib = {
+          {"libsycl-fallback-bfloat16", "libm-bfloat16"}};
+      const SYCLDeviceLibsList sycl_device_bfloat16_native_lib = {
           {"libsycl-native-bfloat16", "libm-bfloat16"}};
       // ITT annotation libraries are linked in separately whenever the device
       // code instrumentation is enabled.
@@ -5174,6 +5213,19 @@ class OffloadingActionBuilder final {
       addInputs(sycl_device_wrapper_libs);
       if (isSpirvAOT || TC->getTriple().isNVPTX())
         addInputs(sycl_device_fallback_libs);
+
+      // Add native or fallback bfloat16 library.
+      if (TC->getTriple().isSPIR()) {
+        if (TC->getTriple().getSubArch() == llvm::Triple::SPIRSubArch_gen)
+          if (hasNativeBfloat16())
+            addInputs(sycl_device_bfloat16_native_lib);
+          else
+            addInputs(sycl_device_bfloat16_fallback_lib);
+        else if (TC->getTriple().getSubArch() ==
+                 llvm::Triple::SPIRSubArch_x86_64)
+          addInputs(sycl_device_bfloat16_fallback_lib);
+      }
+
       if (Args.hasFlag(options::OPT_fsycl_instrument_device_code,
                        options::OPT_fno_sycl_instrument_device_code, true))
         addInputs(sycl_device_annotation_libs);
diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp
index da026c84576fc..f2184ec379782 100644
--- a/sycl/source/detail/program_manager/program_manager.cpp
+++ b/sycl/source/detail/program_manager/program_manager.cpp
@@ -773,7 +773,7 @@ static const char *getDeviceLibFilename(DeviceLibExt Extension) {
   case DeviceLibExt::cl_intel_devicelib_imf_fp64:
     return "libsycl-fallback-imf-fp64.spv";
   case DeviceLibExt::cl_intel_devicelib_bfloat16:
-    return "libsycl-native-bfloat16.spv";
+    return "libsycl-fallback-bfloat16.spv";
   }
   throw compile_program_error("Unhandled (new?) device library extension",
                               PI_ERROR_INVALID_OPERATION);
@@ -807,13 +807,12 @@ static const char *getDeviceLibExtensionStr(DeviceLibExt Extension) {
 static RT::PiProgram loadDeviceLibFallback(const ContextImplPtr Context,
                                            DeviceLibExt Extension,
                                            const RT::PiDevice &Device,
-                                           bool UseFallbackLib) {
+                                           bool UseNativeLib) {
 
   const char *LibFileName = getDeviceLibFilename(Extension);
   std::string LibFileNameStr(LibFileName);
-  if (UseFallbackLib) {
-    // Replace "native" with "fallback".
-    LibFileNameStr.replace(8, 6, "fallback");
+  if (UseNativeLib) {
+    LibFileNameStr.replace(8, 8, "native");
     LibFileName = LibFileNameStr.c_str();
   }
 
@@ -1007,15 +1006,15 @@ getDeviceLibPrograms(const ContextImplPtr Context, const RT::PiDevice &Device,
     }
 
     bool DeviceSupports = DevExtList.npos != DevExtList.find(ExtStr);
-
     if (!DeviceSupports || InhibitNativeImpl) {
-      // Driver always links native bfloat16 library so that AOT will work for
-      // PVC. If device does not support bfloat16 then we replace native with
-      // fallback library.
-      Programs.push_back(loadDeviceLibFallback(
-          Context, Ext, Device,
-          Ext == DeviceLibExt::cl_intel_devicelib_bfloat16));
+      Programs.push_back(loadDeviceLibFallback(Context, Ext, Device, false));
       FallbackIsLoaded = true;
+    } else {
+      // bfloat16 needs native library if device supports it
+      if (Ext == DeviceLibExt::cl_intel_devicelib_bfloat16) {
+        Programs.push_back(loadDeviceLibFallback(Context, Ext, Device, true));
+        FallbackIsLoaded = true;
+      }
     }
   }
   return Programs;

From a82d73ad9622f3a7493c491368f1b0f2a8936b92 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Tue, 27 Sep 2022 00:07:31 -0700
Subject: [PATCH 30/63] Fixes for AOT multiple devices.

---
 clang/lib/Driver/Driver.cpp | 47 ++++++++++++++++++++++++++-----------
 1 file changed, 33 insertions(+), 14 deletions(-)

diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 3b298d1201d4b..1e20c1a0b4baf 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -5081,8 +5081,14 @@ class OffloadingActionBuilder final {
       }
     }
 
-    bool hasNativeBfloat16() {
+    // Return whether to use native bfloat16 library.
+    bool useNativeBfloat(const ToolChain *TC, bool &isAOT) {
+      isAOT = false;
+      if (!TC->getTriple().isSPIR())
+        return false;
+
       const OptTable &Opts = C.getDriver().getOpts();
+      const char *TargetOpt = nullptr;
       const char *DeviceOpt = nullptr;
       for (auto *A : Args) {
         llvm::Triple *TargetBE = nullptr;
@@ -5095,7 +5101,15 @@ class OffloadingActionBuilder final {
           return TripleIt != SYCLTripleList.end() ? &*TripleIt : nullptr;
         };
 
-        if (A->getOption().matches(options::OPT_Xsycl_backend_EQ)) {
+        if (A->getOption().matches(options::OPT_fsycl_targets_EQ)) {
+          // Passing arg: -fsycl-targets=<targets>.
+          isAOT = true;
+          TargetBE = GetTripleIt(A->getValue(0));
+          if (TargetBE)
+            TargetOpt = A->getValue(0);
+          else
+            continue;
+        } else if (A->getOption().matches(options::OPT_Xsycl_backend_EQ)) {
           // Passing device args: -Xsycl-target-backend=<triple> -opt=val.
           TargetBE = GetTripleIt(A->getValue(0));
           if (TargetBE)
@@ -5110,9 +5124,15 @@ class OffloadingActionBuilder final {
           continue;
         };
       }
-      if (DeviceOpt) {
-        if (strstr(DeviceOpt, "pvc") || strstr(DeviceOpt, "ats"))
-          return true;
+      if (TC->getTriple().getSubArch() != llvm::Triple::SPIRSubArch_gen)
+        return false;
+
+      if (TargetOpt && DeviceOpt) {
+        // Currently we support only single AOT target for bfloat16.
+        if (!(strstr(TargetOpt, "*") || strstr(TargetOpt, ",")))
+          return strstr(DeviceOpt, "pvc") || strstr(DeviceOpt, "ats");
+        else
+          return false;
       }
       return false;
     }
@@ -5244,15 +5264,14 @@ class OffloadingActionBuilder final {
       if (isSpirvAOT || TC->getTriple().isNVPTX())
         addInputs(sycl_device_fallback_libs);
 
-      // Add native or fallback bfloat16 library.
-      if (TC->getTriple().isSPIR()) {
-        if (TC->getTriple().getSubArch() == llvm::Triple::SPIRSubArch_gen)
-          if (hasNativeBfloat16())
-            addInputs(sycl_device_bfloat16_native_lib);
-          else
-            addInputs(sycl_device_bfloat16_fallback_lib);
-        else if (TC->getTriple().getSubArch() ==
-                 llvm::Triple::SPIRSubArch_x86_64)
+      bool isAOT;
+      bool useNativeBfloatLib = useNativeBfloat(TC, isAOT);
+      if (isAOT &&
+          TC->getTriple().getSubArch() != llvm::Triple::SPIRSubArch_fpga) {
+        // Add native or fallback bfloat16 library.
+        if (useNativeBfloatLib)
+          addInputs(sycl_device_bfloat16_native_lib);
+        else
           addInputs(sycl_device_bfloat16_fallback_lib);
       }
 

From 3fc888517f959d70c1aa97ea1c55020246bb7eeb Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Tue, 27 Sep 2022 09:14:45 -0700
Subject: [PATCH 31/63] Updated documentation.

---
 .../supported/sycl_ext_oneapi_bfloat16.asciidoc       | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
index abd2ce88b4ce4..048f15c3f49d9 100644
--- a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
+++ b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
@@ -41,18 +41,7 @@ https://github.com/intel/llvm/issues
 
 This extension is written against the SYCL 2020 specification, Revision 5.
 
-== Status
 
-This extension is implemented and fully supported by DPC++, with the
-limitation noted below.
-
-[NOTE]
-This extension has limited support in ahead-of-time (AOT) compilation mode when
-the target device is an Intel GPU.  Software emulation of this feature does not
-work in this case, so AOT mode is only supported if the target device has
-native support for `bfloat16`. Currently, this is limited to Intel Xe HP GPUs.
-The DPC++ implementation has no such limitation in just-in-time (JIT)
-compilation mode.
 
 == Overview
 

From 1ec6838a0a59f91488658e8b406fd9f0789ed81d Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Tue, 27 Sep 2022 14:13:54 -0700
Subject: [PATCH 32/63] Added back missing Status section in documentation.

---
 sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
index 048f15c3f49d9..930f4d2cf311e 100644
--- a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
+++ b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
@@ -41,7 +41,9 @@ https://github.com/intel/llvm/issues
 
 This extension is written against the SYCL 2020 specification, Revision 5.
 
+== Status
 
+This extension is implemented and fully supported by DPC++.
 
 == Overview
 

From c13564312745df9e59e409b55f073e07d37ed87a Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Fri, 30 Sep 2022 17:09:00 -0700
Subject: [PATCH 33/63] Added tests, corrected aspect check.

---
 clang/lib/Driver/Driver.cpp                   | 39 ++++++-------
 clang/test/Driver/Inputs/SYCL/c.cpp           |  1 +
 clang/test/Driver/sycl-bfloat16-lib-win.cpp   | 58 +++++++++++++++++++
 clang/test/Driver/sycl-bfloat16-lib.cpp       | 58 +++++++++++++++++++
 clang/test/Preprocessor/sycl-macro.cpp        |  3 +
 .../sycl_ext_oneapi_bfloat16.asciidoc         |  2 +-
 sycl/plugins/level_zero/pi_level_zero.cpp     |  5 +-
 sycl/plugins/opencl/pi_opencl.cpp             | 10 +---
 sycl/source/detail/device_info.hpp            | 12 +++-
 9 files changed, 152 insertions(+), 36 deletions(-)
 create mode 100644 clang/test/Driver/Inputs/SYCL/c.cpp
 create mode 100755 clang/test/Driver/sycl-bfloat16-lib-win.cpp
 create mode 100755 clang/test/Driver/sycl-bfloat16-lib.cpp

diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 43062c894de95..a6059b0cb03b1 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -5082,10 +5082,8 @@ class OffloadingActionBuilder final {
     }
 
     // Return whether to use native bfloat16 library.
-    bool useNativeBfloat(const ToolChain *TC, bool &isAOT) {
-      isAOT = false;
-      if (!TC->getTriple().isSPIR())
-        return false;
+    bool selectBfloatLibs(const ToolChain *TC, bool &useNative) {
+      bool needLibs = false;
 
       const OptTable &Opts = C.getDriver().getOpts();
       const char *TargetOpt = nullptr;
@@ -5103,38 +5101,37 @@ class OffloadingActionBuilder final {
 
         if (A->getOption().matches(options::OPT_fsycl_targets_EQ)) {
           // Passing arg: -fsycl-targets=<targets>.
-          isAOT = true;
+          needLibs = true;
           TargetBE = GetTripleIt(A->getValue(0));
           if (TargetBE)
             TargetOpt = A->getValue(0);
           else
             continue;
         } else if (A->getOption().matches(options::OPT_Xsycl_backend_EQ)) {
-          // Passing device args: -Xsycl-target-backend=<triple> -opt=val.
+          // Passing device args: -Xsycl-target-backend=<triple> <opt>
           TargetBE = GetTripleIt(A->getValue(0));
           if (TargetBE)
             DeviceOpt = A->getValue(1);
           else
             continue;
         } else if (A->getOption().matches(options::OPT_Xsycl_backend)) {
-          // Passing device args: -Xsycl-target-backend -opt=val.
+          // Passing device args: -Xsycl-target-backend <opt>
           TargetBE = &SYCLTripleList.front();
           DeviceOpt = A->getValue(0);
+        } else if (A->getOption().matches(options::OPT_Xs_separate)) {
+          // Passing device args: -Xs <opt>
+          DeviceOpt = A->getValue(0);
         } else {
           continue;
         };
       }
-      if (TC->getTriple().getSubArch() != llvm::Triple::SPIRSubArch_gen)
-        return false;
-
-      if (TargetOpt && DeviceOpt) {
-        // Currently we support only single AOT target for bfloat16.
-        if (!(strstr(TargetOpt, "*") || strstr(TargetOpt, ",")))
-          return strstr(DeviceOpt, "pvc") || strstr(DeviceOpt, "ats");
-        else
-          return false;
+      useNative = false;
+      if (needLibs && TC->getTriple().getArch() == llvm::Triple::spir64 &&
+          TC->getTriple().getSubArch() == llvm::Triple::SPIRSubArch_gen &&
+          TargetOpt && DeviceOpt) {
+        useNative = strstr(DeviceOpt, "pvc") || strstr(DeviceOpt, "ats");
       }
-      return false;
+      return needLibs;
     }
 
     bool addSYCLDeviceLibs(const ToolChain *TC, ActionList &DeviceLinkObjects,
@@ -5264,12 +5261,12 @@ class OffloadingActionBuilder final {
       if (isSpirvAOT || TC->getTriple().isNVPTX())
         addInputs(sycl_device_fallback_libs);
 
-      bool isAOT;
-      bool useNativeBfloatLib = useNativeBfloat(TC, isAOT);
-      if (isAOT &&
+      bool nativeBfloatLibs;
+      bool needBfloatLibs = selectBfloatLibs(TC, nativeBfloatLibs);
+      if (needBfloatLibs &&
           TC->getTriple().getSubArch() != llvm::Triple::SPIRSubArch_fpga) {
         // Add native or fallback bfloat16 library.
-        if (useNativeBfloatLib)
+        if (nativeBfloatLibs)
           addInputs(sycl_device_bfloat16_native_lib);
         else
           addInputs(sycl_device_bfloat16_fallback_lib);
diff --git a/clang/test/Driver/Inputs/SYCL/c.cpp b/clang/test/Driver/Inputs/SYCL/c.cpp
new file mode 100644
index 0000000000000..cf2ff6f441705
--- /dev/null
+++ b/clang/test/Driver/Inputs/SYCL/c.cpp
@@ -0,0 +1 @@
+#remark
diff --git a/clang/test/Driver/sycl-bfloat16-lib-win.cpp b/clang/test/Driver/sycl-bfloat16-lib-win.cpp
new file mode 100755
index 0000000000000..7cfd5d2cff5f5
--- /dev/null
+++ b/clang/test/Driver/sycl-bfloat16-lib-win.cpp
@@ -0,0 +1,58 @@
+///
+/// Check if bfloat16 native and fallback libraries are added on Windows
+///
+
+// REQUIRES: windows
+
+/// ###########################################################################
+/// test that no bfloat16 libraries are added in JIT mode
+// RUN: %clangxx -fsycl %s --sysroot=%S/Inputs/SYCL -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=BFLOAT16
+
+// test that a PVC AOT compilation uses the native library
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device pvc" %S/Inputs/SYCL/c.cpp -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=BFLOAT16-NATIVE
+
+// test that a gen9 AOT compilation uses the fallback library
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen9" %S/Inputs/SYCL/c.cpp -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=BFLOAT16-FALLBACK
+
+// test that a generic compilation uses the fallback library
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %S/Inputs/SYCL/c.cpp -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=BFLOAT16-FALLBACK
+
+// BFLOAT16-NOT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-{{fallback|native}}-bfloat16.obj" "-output={{.*}}libsycl-complex-{{.*}}.o" "-unbundle"
+
+// BFLOAT16-NATIVE: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-crt.obj" "-output={{.*}}libsycl-crt-{{.*}}.o" "-unbundle"
+// BFLOAT16-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-complex.obj" "-output={{.*}}libsycl-complex-{{.*}}.o" "-unbundle"
+// BFLOAT16-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-complex-fp64.obj" "-output={{.*}}libsycl-complex-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-cmath.obj" "-output={{.*}}libsycl-cmath-{{.*}}.o" "-unbundle"
+// BFLOAT16-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-cmath-fp64.obj" "-output={{.*}}libsycl-cmath-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-msvc-math.obj" "-output={{.*}}libsycl-msvc-math-{{.*}}.o" "-unbundle"
+// BFLOAT16-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-imf.obj" "-output={{.*}}libsycl-imf-{{.*}}.o" "-unbundle"
+// BFLOAT16-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-imf-fp64.obj" "-output={{.*}}libsycl-imf-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cassert.obj" "-output={{.*}}libsycl-fallback-cassert-{{.*}}.o" "-unbundle"
+// BFLOAT16-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cstring.obj" "-output={{.*}}libsycl-fallback-cstring-{{.*}}.o" "-unbundle"
+// BFLOAT16-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-complex.obj" "-output={{.*}}libsycl-fallback-complex-{{.*}}.o" "-unbundle"
+// BFLOAT16-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath.obj" "-output={{.*}}libsycl-fallback-cmath-{{.*}}.o" "-unbundle"
+// BFLOAT16-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath-fp64.obj" "-output={{.*}}libsycl-fallback-cmath-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-imf.obj" "-output={{.*}}libsycl-fallback-imf-{{.*}}.o" "-unbundle"
+// BFLOAT16-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-imf-fp64.obj" "-output={{.*}}libsycl-fallback-imf-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-native-bfloat16.obj"
+
+// BFLOAT16-FALLBACK: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-crt.obj" "-output={{.*}}libsycl-crt-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-complex.obj" "-output={{.*}}libsycl-complex-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-complex-fp64.obj" "-output={{.*}}libsycl-complex-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-cmath.obj" "-output={{.*}}libsycl-cmath-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-cmath-fp64.obj" "-output={{.*}}libsycl-cmath-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-msvc-math.obj" "-output={{.*}}libsycl-msvc-math-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-imf.obj" "-output={{.*}}libsycl-imf-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-imf-fp64.obj" "-output={{.*}}libsycl-imf-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cassert.obj" "-output={{.*}}libsycl-fallback-cassert-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cstring.obj" "-output={{.*}}libsycl-fallback-cstring-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-complex.obj" "-output={{.*}}libsycl-fallback-complex-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath.obj" "-output={{.*}}libsycl-fallback-cmath-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath-fp64.obj" "-output={{.*}}libsycl-fallback-cmath-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-imf.obj" "-output={{.*}}libsycl-fallback-imf-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-imf-fp64.obj" "-output={{.*}}libsycl-fallback-imf-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-bfloat16.obj"
\ No newline at end of file
diff --git a/clang/test/Driver/sycl-bfloat16-lib.cpp b/clang/test/Driver/sycl-bfloat16-lib.cpp
new file mode 100755
index 0000000000000..8d465f69568d6
--- /dev/null
+++ b/clang/test/Driver/sycl-bfloat16-lib.cpp
@@ -0,0 +1,58 @@
+///
+/// Check if bfloat16 native and fallback libraries are added on Linux
+///
+
+// UNSUPPORTED: system-windows
+
+/// ###########################################################################
+/// test that no bfloat16 libraries are added in JIT mode
+// RUN: %clangxx -fsycl %s --sysroot=%S/Inputs/SYCL -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=BFLOAT16
+
+// test that a PVC AOT compilation uses the native library
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device pvc" %S/Inputs/SYCL/c.cpp -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=BFLOAT16-NATIVE
+
+// test that a gen9 AOT compilation uses the fallback library
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen9" %S/Inputs/SYCL/c.cpp -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=BFLOAT16-FALLBACK
+
+// test that a generic compilation uses the fallback library
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %S/Inputs/SYCL/c.cpp -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=BFLOAT16-FALLBACK
+
+// BFLOAT16-NOT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-{{fallback|native}}-bfloat16.o" "-output={{.*}}libsycl-complex-{{.*}}.o" "-unbundle"
+
+// BFLOAT16-NATIVE: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-crt.o" "-output={{.*}}libsycl-crt-{{.*}}.o" "-unbundle"
+// BFLOAT16-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-complex.o" "-output={{.*}}libsycl-complex-{{.*}}.o" "-unbundle"
+// BFLOAT16-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-complex-fp64.o" "-output={{.*}}libsycl-complex-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-cmath.o" "-output={{.*}}libsycl-cmath-{{.*}}.o" "-unbundle"
+// BFLOAT16-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-cmath-fp64.o" "-output={{.*}}libsycl-cmath-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-imf.o" "-output={{.*}}libsycl-imf-{{.*}}.o" "-unbundle"
+// BFLOAT16-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-imf-fp64.o" "-output={{.*}}libsycl-imf-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cassert.o" "-output={{.*}}libsycl-fallback-cassert-{{.*}}.o" "-unbundle"
+// BFLOAT16-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cstring.o" "-output={{.*}}libsycl-fallback-cstring-{{.*}}.o" "-unbundle"
+// BFLOAT16-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-complex.o" "-output={{.*}}libsycl-fallback-complex-{{.*}}.o" "-unbundle"
+// BFLOAT16-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-complex-fp64.o" "-output={{.*}}libsycl-fallback-complex-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath.o" "-output={{.*}}libsycl-fallback-cmath-{{.*}}.o" "-unbundle"
+// BFLOAT16-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath-fp64.o" "-output={{.*}}libsycl-fallback-cmath-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-imf.o" "-output={{.*}}libsycl-fallback-imf-{{.*}}.o" "-unbundle"
+// BFLOAT16-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-imf-fp64.o" "-output={{.*}}libsycl-fallback-imf-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-native-bfloat16.o"
+
+// BFLOAT16-FALLBACK: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-crt.o" "-output={{.*}}libsycl-crt-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-complex.o" "-output={{.*}}libsycl-complex-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-complex-fp64.o" "-output={{.*}}libsycl-complex-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-cmath.o" "-output={{.*}}libsycl-cmath-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-cmath-fp64.o" "-output={{.*}}libsycl-cmath-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-imf.o" "-output={{.*}}libsycl-imf-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-imf-fp64.o" "-output={{.*}}libsycl-imf-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cassert.o" "-output={{.*}}libsycl-fallback-cassert-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cstring.o" "-output={{.*}}libsycl-fallback-cstring-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-complex.o" "-output={{.*}}libsycl-fallback-complex-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-complex-fp64.o" "-output={{.*}}libsycl-fallback-complex-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath.o" "-output={{.*}}libsycl-fallback-cmath-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath-fp64.o" "-output={{.*}}libsycl-fallback-cmath-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-imf.o" "-output={{.*}}libsycl-fallback-imf-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-imf-fp64.o" "-output={{.*}}libsycl-fallback-imf-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-bfloat16.o"
\ No newline at end of file
diff --git a/clang/test/Preprocessor/sycl-macro.cpp b/clang/test/Preprocessor/sycl-macro.cpp
index b2af292db71d4..26d73b70c89a4 100644
--- a/clang/test/Preprocessor/sycl-macro.cpp
+++ b/clang/test/Preprocessor/sycl-macro.cpp
@@ -9,6 +9,7 @@
 // RUNx: %clang_cc1 %s -fsycl-id-queries-fit-in-int -fsycl-is-device -E -dM -fms-compatibility | FileCheck --check-prefix=CHECK-MSVC %s
 // RUN: %clang_cc1 -fno-sycl-id-queries-fit-in-int %s -E -dM | FileCheck \
 // RUN: --check-prefix=CHECK-NO-SYCL_FIT_IN_INT %s
+// RUN: %clang_cc1 %s  -triple nvptx64-nvidia-cuda -target-cpu sm_80 -fsycl-is-device -E -dM | FileCheck --check-prefix=CHECK-CUDA %s
 
 // CHECK-NOT:#define __SYCL_DEVICE_ONLY__ 1
 // CHECK-NOT:#define SYCL_EXTERNAL
@@ -30,3 +31,5 @@
 
 // CHECK-NO-SYCL_FIT_IN_INT-NOT:#define __SYCL_ID_QUERIES_FIT_IN_INT__ 1
 // CHECK-SYCL-ID:#define __SYCL_ID_QUERIES_FIT_IN_INT__ 1
+
+// CHECK-CUDA:#define __CUDA_ARCH__ 800
\ No newline at end of file
diff --git a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
index 930f4d2cf311e..e5290f93a1fd6 100644
--- a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
+++ b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
@@ -54,7 +54,7 @@ However, `bfloat16` allots 8 bits to the exponent instead of the 5 bits used by
 Thus, `bfloat16` has the same dynamic range as a 32-bit `float` but with
 reduced precision. This type is useful when memory required to store the values
 must be reduced, and when the calculations require high dynamic range but can
-tolerate lower-precision. Some implementations may still perform operations
+tolerate lower precision. Some implementations may still perform operations
  on this type using 32-bit math. For example, they may convert the `bfloat16`
  value to `float`, and then perform the operation on the 32-bit `float`.
 
diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp
index 7b44f8a00f408..a442e0c7e9efb 100644
--- a/sycl/plugins/level_zero/pi_level_zero.cpp
+++ b/sycl/plugins/level_zero/pi_level_zero.cpp
@@ -3199,9 +3199,8 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName,
     // currently not supported in level zero runtime
     return PI_ERROR_INVALID_VALUE;
   case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16: {
-    // bfloat16 is implemented in hardware or emulated, so it is always
-    // supported.
-    return ReturnValue(bool{true});
+    // bfloat16 math functions are not yet supported on Intel GPUs.
+    return ReturnValue(bool{false});
   }
 
   // TODO: Implement.
diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp
index cdb90ce582017..f416149271dec 100644
--- a/sycl/plugins/opencl/pi_opencl.cpp
+++ b/sycl/plugins/opencl/pi_opencl.cpp
@@ -296,14 +296,8 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName,
     return PI_SUCCESS;
   }
   case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16: {
-    size_t extSize;
-    cl_bool result = clGetDeviceInfo(
-        cast<cl_device_id>(device), CL_DEVICE_EXTENSIONS, 0, nullptr, &extSize);
-    std::string extStr(extSize, '\0');
-    result = clGetDeviceInfo(cast<cl_device_id>(device), CL_DEVICE_EXTENSIONS,
-                             extSize, &extStr.front(), nullptr);
-    result =
-        (extStr.find("cl_intel_bfloat16_conversions") != std::string::npos);
+    // bfloat16 math functions are not yet supported on Intel GPUs.
+    cl_bool result = false;
     std::memcpy(paramValue, &result, sizeof(cl_bool));
     return PI_SUCCESS;
   }
diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp
index 29d5eb2705ed8..d06c84da40c20 100644
--- a/sycl/source/detail/device_info.hpp
+++ b/sycl/source/detail/device_info.hpp
@@ -283,9 +283,15 @@ struct get_device_info_impl<std::vector<memory_scope>,
 template <>
 struct get_device_info_impl<bool, info::device::ext_oneapi_bfloat16> {
   static bool get(RT::PiDevice dev, const plugin &Plugin) {
-    // This aspect indicates support for bfloat16 math functions,
-    // which we don't do yet.
-    return false;
+    bool result = false;
+
+    RT::PiResult Err = Plugin.call_nocheck<PiApiKind::piDeviceGetInfo>(
+        dev, PiInfoCode<info::device::ext_oneapi_bfloat16>::value,
+        sizeof(result), &result, nullptr);
+    if (Err != PI_SUCCESS) {
+      return false;
+    }
+    return result;
   }
 };
 

From 8876ac8650cb290fcef7369fe2e3e6a9ba146f1a Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Mon, 3 Oct 2022 08:58:53 -0700
Subject: [PATCH 34/63] Added missing newlines.

---
 clang/test/Driver/sycl-bfloat16-lib-win.cpp | 2 +-
 clang/test/Driver/sycl-bfloat16-lib.cpp     | 2 +-
 clang/test/Preprocessor/sycl-macro.cpp      | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/clang/test/Driver/sycl-bfloat16-lib-win.cpp b/clang/test/Driver/sycl-bfloat16-lib-win.cpp
index 7cfd5d2cff5f5..aae66e80b8996 100755
--- a/clang/test/Driver/sycl-bfloat16-lib-win.cpp
+++ b/clang/test/Driver/sycl-bfloat16-lib-win.cpp
@@ -55,4 +55,4 @@
 // BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath-fp64.obj" "-output={{.*}}libsycl-fallback-cmath-fp64-{{.*}}.o" "-unbundle"
 // BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-imf.obj" "-output={{.*}}libsycl-fallback-imf-{{.*}}.o" "-unbundle"
 // BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-imf-fp64.obj" "-output={{.*}}libsycl-fallback-imf-fp64-{{.*}}.o" "-unbundle"
-// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-bfloat16.obj"
\ No newline at end of file
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-bfloat16.obj"
diff --git a/clang/test/Driver/sycl-bfloat16-lib.cpp b/clang/test/Driver/sycl-bfloat16-lib.cpp
index 8d465f69568d6..c535eeea4dcf0 100755
--- a/clang/test/Driver/sycl-bfloat16-lib.cpp
+++ b/clang/test/Driver/sycl-bfloat16-lib.cpp
@@ -55,4 +55,4 @@
 // BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath-fp64.o" "-output={{.*}}libsycl-fallback-cmath-fp64-{{.*}}.o" "-unbundle"
 // BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-imf.o" "-output={{.*}}libsycl-fallback-imf-{{.*}}.o" "-unbundle"
 // BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-imf-fp64.o" "-output={{.*}}libsycl-fallback-imf-fp64-{{.*}}.o" "-unbundle"
-// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-bfloat16.o"
\ No newline at end of file
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-bfloat16.o"
diff --git a/clang/test/Preprocessor/sycl-macro.cpp b/clang/test/Preprocessor/sycl-macro.cpp
index 26d73b70c89a4..ba4708d6e7e9a 100644
--- a/clang/test/Preprocessor/sycl-macro.cpp
+++ b/clang/test/Preprocessor/sycl-macro.cpp
@@ -32,4 +32,4 @@
 // CHECK-NO-SYCL_FIT_IN_INT-NOT:#define __SYCL_ID_QUERIES_FIT_IN_INT__ 1
 // CHECK-SYCL-ID:#define __SYCL_ID_QUERIES_FIT_IN_INT__ 1
 
-// CHECK-CUDA:#define __CUDA_ARCH__ 800
\ No newline at end of file
+// CHECK-CUDA:#define __CUDA_ARCH__ 800

From 17673bff2cd59a73b205ce2ddcec1c7a01e365e4 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Mon, 3 Oct 2022 17:58:29 -0700
Subject: [PATCH 35/63] Corrections to tests and macros, added host code
 emulation.

---
 clang/lib/Driver/Driver.cpp                   | 24 +++++++------------
 clang/test/Driver/sycl-bfloat16-lib-win.cpp   |  6 ++++-
 clang/test/Driver/sycl-bfloat16-lib.cpp       |  6 ++++-
 .../sycl_ext_oneapi_bfloat16.asciidoc         |  3 ++-
 sycl/include/sycl/ext/oneapi/bfloat16.hpp     |  4 +---
 sycl/include/sycl/feature_test.hpp.in         |  1 +
 6 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index a6059b0cb03b1..0f4ba7a61ea67 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -5083,10 +5083,12 @@ class OffloadingActionBuilder final {
 
     // Return whether to use native bfloat16 library.
     bool selectBfloatLibs(const ToolChain *TC, bool &useNative) {
-      bool needLibs = false;
+      // bfloat16 libraries are added only for Gen AOT
+      if (!(TC->getTriple().getArch() == llvm::Triple::spir64 &&
+            TC->getTriple().getSubArch() == llvm::Triple::SPIRSubArch_gen))
+        return false;
 
       const OptTable &Opts = C.getDriver().getOpts();
-      const char *TargetOpt = nullptr;
       const char *DeviceOpt = nullptr;
       for (auto *A : Args) {
         llvm::Triple *TargetBE = nullptr;
@@ -5099,15 +5101,7 @@ class OffloadingActionBuilder final {
           return TripleIt != SYCLTripleList.end() ? &*TripleIt : nullptr;
         };
 
-        if (A->getOption().matches(options::OPT_fsycl_targets_EQ)) {
-          // Passing arg: -fsycl-targets=<targets>.
-          needLibs = true;
-          TargetBE = GetTripleIt(A->getValue(0));
-          if (TargetBE)
-            TargetOpt = A->getValue(0);
-          else
-            continue;
-        } else if (A->getOption().matches(options::OPT_Xsycl_backend_EQ)) {
+        if (A->getOption().matches(options::OPT_Xsycl_backend_EQ)) {
           // Passing device args: -Xsycl-target-backend=<triple> <opt>
           TargetBE = GetTripleIt(A->getValue(0));
           if (TargetBE)
@@ -5123,15 +5117,13 @@ class OffloadingActionBuilder final {
           DeviceOpt = A->getValue(0);
         } else {
           continue;
-        };
+        }
       }
       useNative = false;
-      if (needLibs && TC->getTriple().getArch() == llvm::Triple::spir64 &&
-          TC->getTriple().getSubArch() == llvm::Triple::SPIRSubArch_gen &&
-          TargetOpt && DeviceOpt) {
+      if (DeviceOpt) {
         useNative = strstr(DeviceOpt, "pvc") || strstr(DeviceOpt, "ats");
       }
-      return needLibs;
+      return true;
     }
 
     bool addSYCLDeviceLibs(const ToolChain *TC, ActionList &DeviceLinkObjects,
diff --git a/clang/test/Driver/sycl-bfloat16-lib-win.cpp b/clang/test/Driver/sycl-bfloat16-lib-win.cpp
index aae66e80b8996..eee9d42dc1b6b 100755
--- a/clang/test/Driver/sycl-bfloat16-lib-win.cpp
+++ b/clang/test/Driver/sycl-bfloat16-lib-win.cpp
@@ -9,6 +9,10 @@
 // RUN: %clangxx -fsycl %s --sysroot=%S/Inputs/SYCL -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=BFLOAT16
 
+// test that no bfloat16 libraries are added in JIT mode with generic target
+// RUN: %clangxx -fsycl -fsycl-targets=spir64 %S/Inputs/SYCL/c.cpp -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=BFLOAT16
+
 // test that a PVC AOT compilation uses the native library
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device pvc" %S/Inputs/SYCL/c.cpp -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=BFLOAT16-NATIVE
@@ -17,7 +21,7 @@
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen9" %S/Inputs/SYCL/c.cpp -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=BFLOAT16-FALLBACK
 
-// test that a generic compilation uses the fallback library
+// test that a generic AOT compilation uses the fallback library
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %S/Inputs/SYCL/c.cpp -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=BFLOAT16-FALLBACK
 
diff --git a/clang/test/Driver/sycl-bfloat16-lib.cpp b/clang/test/Driver/sycl-bfloat16-lib.cpp
index c535eeea4dcf0..76ee64721e5d0 100755
--- a/clang/test/Driver/sycl-bfloat16-lib.cpp
+++ b/clang/test/Driver/sycl-bfloat16-lib.cpp
@@ -17,10 +17,14 @@
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen9" %S/Inputs/SYCL/c.cpp -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=BFLOAT16-FALLBACK
 
-// test that a generic compilation uses the fallback library
+// test that a generic AOT compilation uses the fallback library
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %S/Inputs/SYCL/c.cpp -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=BFLOAT16-FALLBACK
 
+// test that a generic JIT compilation with no target switches uses no bfloat library
+// RUN: %clangxx -fsycl -fsycl-targets=spir64 %S/Inputs/SYCL/c.cpp -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=BFLOAT16
+
 // BFLOAT16-NOT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-{{fallback|native}}-bfloat16.o" "-output={{.*}}libsycl-complex-{{.*}}.o" "-unbundle"
 
 // BFLOAT16-NATIVE: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-crt.o" "-output={{.*}}libsycl-crt-{{.*}}.o" "-unbundle"
diff --git a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
index e5290f93a1fd6..416661f4039bc 100644
--- a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
+++ b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
@@ -61,7 +61,8 @@ tolerate lower precision. Some implementations may still perform operations
 [NOTE]
 The bfloat16 type is supported on all devices. DPC++ currently supports this
 type natively on Intel Xe HP GPUs and Nvidia GPUs with
-Compute Capability >= SM80. On other devices it is emulated in software.
+Compute Capability >= SM80. On other devices, and in host code, it is emulated
+in software.
 
 == Specification
 
diff --git a/sycl/include/sycl/ext/oneapi/bfloat16.hpp b/sycl/include/sycl/ext/oneapi/bfloat16.hpp
index 5d58ec4963296..2643cf35daaff 100644
--- a/sycl/include/sycl/ext/oneapi/bfloat16.hpp
+++ b/sycl/include/sycl/ext/oneapi/bfloat16.hpp
@@ -130,9 +130,7 @@ class bfloat16 {
     return bfloat16{-__devicelib_ConvertBF16ToFINTEL(lhs.value)};
 #endif
 #else
-    (void)lhs;
-    throw exception{errc::feature_not_supported,
-                    "Bfloat16 unary minus is not supported on host device"};
+    return -to_float(lhs.value);
 #endif
   }
 
diff --git a/sycl/include/sycl/feature_test.hpp.in b/sycl/include/sycl/feature_test.hpp.in
index 8a3a94fe3a3ef..42e1a6d1afd15 100644
--- a/sycl/include/sycl/feature_test.hpp.in
+++ b/sycl/include/sycl/feature_test.hpp.in
@@ -55,6 +55,7 @@ __SYCL_INLINE_VER_NAMESPACE(_V1) {
 #define SYCL_EXT_ONEAPI_SUB_GROUP 1
 #define SYCL_EXT_ONEAPI_PROPERTIES 1
 #define SYCL_EXT_ONEAPI_NATIVE_MATH 1
+#define SYCL_EXT_ONEAPI_BFLOAT16 1
 #define SYCL_EXT_INTEL_DATAFLOW_PIPES 1
 #ifdef __clang__
 #if __has_extension(sycl_extended_atomics)

From 8d4022895b303df8bd6c1758c11150303a1b1c13 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Tue, 4 Oct 2022 09:50:07 -0700
Subject: [PATCH 36/63] Small corrections.

---
 clang/lib/Driver/Driver.cpp                        |  3 +--
 clang/test/Driver/Inputs/SYCL/c.cpp                |  1 -
 clang/test/Driver/sycl-bfloat16-lib-win.cpp        |  8 ++++----
 clang/test/Driver/sycl-bfloat16-lib.cpp            | 14 +++++++-------
 .../supported/sycl_ext_oneapi_bfloat16.asciidoc    |  3 +++
 5 files changed, 15 insertions(+), 14 deletions(-)
 delete mode 100644 clang/test/Driver/Inputs/SYCL/c.cpp

diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 0f4ba7a61ea67..eb94956991fc4 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -5255,8 +5255,7 @@ class OffloadingActionBuilder final {
 
       bool nativeBfloatLibs;
       bool needBfloatLibs = selectBfloatLibs(TC, nativeBfloatLibs);
-      if (needBfloatLibs &&
-          TC->getTriple().getSubArch() != llvm::Triple::SPIRSubArch_fpga) {
+      if (needBfloatLibs) {
         // Add native or fallback bfloat16 library.
         if (nativeBfloatLibs)
           addInputs(sycl_device_bfloat16_native_lib);
diff --git a/clang/test/Driver/Inputs/SYCL/c.cpp b/clang/test/Driver/Inputs/SYCL/c.cpp
deleted file mode 100644
index cf2ff6f441705..0000000000000
--- a/clang/test/Driver/Inputs/SYCL/c.cpp
+++ /dev/null
@@ -1 +0,0 @@
-#remark
diff --git a/clang/test/Driver/sycl-bfloat16-lib-win.cpp b/clang/test/Driver/sycl-bfloat16-lib-win.cpp
index eee9d42dc1b6b..5914a6766113c 100755
--- a/clang/test/Driver/sycl-bfloat16-lib-win.cpp
+++ b/clang/test/Driver/sycl-bfloat16-lib-win.cpp
@@ -10,19 +10,19 @@
 // RUN:   | FileCheck %s -check-prefix=BFLOAT16
 
 // test that no bfloat16 libraries are added in JIT mode with generic target
-// RUN: %clangxx -fsycl -fsycl-targets=spir64 %S/Inputs/SYCL/c.cpp -### 2>&1 \
+// RUN: %clangxx -fsycl -fsycl-targets=spir64 %s -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=BFLOAT16
 
 // test that a PVC AOT compilation uses the native library
-// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device pvc" %S/Inputs/SYCL/c.cpp -### 2>&1 \
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device pvc" %s -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=BFLOAT16-NATIVE
 
 // test that a gen9 AOT compilation uses the fallback library
-// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen9" %S/Inputs/SYCL/c.cpp -### 2>&1 \
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen9" %s -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=BFLOAT16-FALLBACK
 
 // test that a generic AOT compilation uses the fallback library
-// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %S/Inputs/SYCL/c.cpp -### 2>&1 \
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %s -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=BFLOAT16-FALLBACK
 
 // BFLOAT16-NOT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-{{fallback|native}}-bfloat16.obj" "-output={{.*}}libsycl-complex-{{.*}}.o" "-unbundle"
diff --git a/clang/test/Driver/sycl-bfloat16-lib.cpp b/clang/test/Driver/sycl-bfloat16-lib.cpp
index 76ee64721e5d0..eb9833c615c15 100755
--- a/clang/test/Driver/sycl-bfloat16-lib.cpp
+++ b/clang/test/Driver/sycl-bfloat16-lib.cpp
@@ -9,22 +9,22 @@
 // RUN: %clangxx -fsycl %s --sysroot=%S/Inputs/SYCL -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=BFLOAT16
 
+// test that no bfloat16 libraries are added in JIT mode with generic target
+// RUN: %clangxx -fsycl -fsycl-targets=spir64 %s -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=BFLOAT16
+
 // test that a PVC AOT compilation uses the native library
-// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device pvc" %S/Inputs/SYCL/c.cpp -### 2>&1 \
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device pvc" %s -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=BFLOAT16-NATIVE
 
 // test that a gen9 AOT compilation uses the fallback library
-// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen9" %S/Inputs/SYCL/c.cpp -### 2>&1 \
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen9" %s -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=BFLOAT16-FALLBACK
 
 // test that a generic AOT compilation uses the fallback library
-// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %S/Inputs/SYCL/c.cpp -### 2>&1 \
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %s -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=BFLOAT16-FALLBACK
 
-// test that a generic JIT compilation with no target switches uses no bfloat library
-// RUN: %clangxx -fsycl -fsycl-targets=spir64 %S/Inputs/SYCL/c.cpp -### 2>&1 \
-// RUN:   | FileCheck %s -check-prefix=BFLOAT16
-
 // BFLOAT16-NOT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-{{fallback|native}}-bfloat16.o" "-output={{.*}}libsycl-complex-{{.*}}.o" "-unbundle"
 
 // BFLOAT16-NATIVE: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-crt.o" "-output={{.*}}libsycl-crt-{{.*}}.o" "-unbundle"
diff --git a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
index 416661f4039bc..ab86934321bda 100644
--- a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
+++ b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
@@ -90,6 +90,9 @@ The `bfloat16` type represents a 16-bit floating point value.
 Conversions from `float` to `bfloat16` are done with round to
 nearest even (RTE) rounding mode.
 
+The bfloat16 type and its operations are available in both device code and
+host code.
+
 [source]
 ----
 namespace sycl {

From cf8f6e01665537bb2ea2e3a98d6ecb657767ce37 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Tue, 4 Oct 2022 11:58:38 -0700
Subject: [PATCH 37/63] Fixes for AOT.

---
 clang/lib/Driver/Driver.cpp | 51 +++++++++++++++++++++++--------------
 1 file changed, 32 insertions(+), 19 deletions(-)

diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 779d9dc5e7aef..940feea5c5375 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -5106,48 +5106,61 @@ class OffloadingActionBuilder final {
     }
 
     // Return whether to use native bfloat16 library.
-    bool selectBfloatLibs(const ToolChain *TC, bool &useNative) {
-      // bfloat16 libraries are added only for Gen AOT
-      if (!(TC->getTriple().getArch() == llvm::Triple::spir64 &&
-            TC->getTriple().getSubArch() == llvm::Triple::SPIRSubArch_gen))
-        return false;
+    bool selectBfloatLibs(const ToolChain* TC, bool& useNative) {
+      bool needLibs = false;
 
-      const OptTable &Opts = C.getDriver().getOpts();
-      const char *DeviceOpt = nullptr;
-      for (auto *A : Args) {
-        llvm::Triple *TargetBE = nullptr;
+      const OptTable& Opts = C.getDriver().getOpts();
+      const char* TargetOpt = nullptr;
+      const char* DeviceOpt = nullptr;
+      for (auto* A : Args) {
+        llvm::Triple* TargetBE = nullptr;
 
         auto GetTripleIt = [&, this](llvm::StringRef Triple) {
-          llvm::Triple TargetTriple{Triple};
-          auto TripleIt = llvm::find_if(SYCLTripleList, [&](auto &SYCLTriple) {
+          llvm::Triple TargetTriple{ Triple };
+          auto TripleIt = llvm::find_if(SYCLTripleList, [&](auto& SYCLTriple) {
             return SYCLTriple == TargetTriple;
-          });
+            });
           return TripleIt != SYCLTripleList.end() ? &*TripleIt : nullptr;
         };
 
-        if (A->getOption().matches(options::OPT_Xsycl_backend_EQ)) {
+        if (A->getOption().matches(options::OPT_fsycl_targets_EQ)) {
+          // Passing arg: -fsycl-targets=<targets>.
+          needLibs =
+              TC->getTriple().getSubArch() != llvm::Triple::SPIRSubArch_fpga;
+          TargetBE = GetTripleIt(A->getValue(0));
+          if (TargetBE)
+            TargetOpt = A->getValue(0);
+          else
+            continue;
+        }
+        else if (A->getOption().matches(options::OPT_Xsycl_backend_EQ)) {
           // Passing device args: -Xsycl-target-backend=<triple> <opt>
           TargetBE = GetTripleIt(A->getValue(0));
           if (TargetBE)
             DeviceOpt = A->getValue(1);
           else
             continue;
-        } else if (A->getOption().matches(options::OPT_Xsycl_backend)) {
+        }
+        else if (A->getOption().matches(options::OPT_Xsycl_backend)) {
           // Passing device args: -Xsycl-target-backend <opt>
           TargetBE = &SYCLTripleList.front();
           DeviceOpt = A->getValue(0);
-        } else if (A->getOption().matches(options::OPT_Xs_separate)) {
+        }
+        else if (A->getOption().matches(options::OPT_Xs_separate)) {
           // Passing device args: -Xs <opt>
           DeviceOpt = A->getValue(0);
-        } else {
-          continue;
         }
+        else {
+          continue;
+        };
       }
       useNative = false;
-      if (DeviceOpt) {
+      if (needLibs && TC->getTriple().getArch() == llvm::Triple::spir64 &&
+        TC->getTriple().getSubArch() == llvm::Triple::SPIRSubArch_gen &&
+        TargetOpt && DeviceOpt) {
         useNative = strstr(DeviceOpt, "pvc") || strstr(DeviceOpt, "ats");
       }
-      return true;
+      return needLibs;
     }
 
     bool addSYCLDeviceLibs(const ToolChain *TC, ActionList &DeviceLinkObjects,

From 5e50646f02682aaf9ac2177785129646dd73ae56 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Tue, 4 Oct 2022 12:04:45 -0700
Subject: [PATCH 38/63] Formatting change.

---
 clang/lib/Driver/Driver.cpp | 34 +++++++++++++++-------------------
 1 file changed, 15 insertions(+), 19 deletions(-)

diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 940feea5c5375..3c7e5a3a43a11 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -5106,20 +5106,20 @@ class OffloadingActionBuilder final {
     }
 
     // Return whether to use native bfloat16 library.
-    bool selectBfloatLibs(const ToolChain* TC, bool& useNative) {
+    bool selectBfloatLibs(const ToolChain *TC, bool &useNative) {
       bool needLibs = false;
 
-      const OptTable& Opts = C.getDriver().getOpts();
-      const char* TargetOpt = nullptr;
-      const char* DeviceOpt = nullptr;
-      for (auto* A : Args) {
-        llvm::Triple* TargetBE = nullptr;
+      const OptTable &Opts = C.getDriver().getOpts();
+      const char *TargetOpt = nullptr;
+      const char *DeviceOpt = nullptr;
+      for (auto *A : Args) {
+        llvm::Triple *TargetBE = nullptr;
 
         auto GetTripleIt = [&, this](llvm::StringRef Triple) {
-          llvm::Triple TargetTriple{ Triple };
-          auto TripleIt = llvm::find_if(SYCLTripleList, [&](auto& SYCLTriple) {
+          llvm::Triple TargetTriple{Triple};
+          auto TripleIt = llvm::find_if(SYCLTripleList, [&](auto &SYCLTriple) {
             return SYCLTriple == TargetTriple;
-            });
+          });
           return TripleIt != SYCLTripleList.end() ? &*TripleIt : nullptr;
         };
 
@@ -5132,32 +5132,28 @@ class OffloadingActionBuilder final {
             TargetOpt = A->getValue(0);
           else
             continue;
-        }
-        else if (A->getOption().matches(options::OPT_Xsycl_backend_EQ)) {
+        } else if (A->getOption().matches(options::OPT_Xsycl_backend_EQ)) {
           // Passing device args: -Xsycl-target-backend=<triple> <opt>
           TargetBE = GetTripleIt(A->getValue(0));
           if (TargetBE)
             DeviceOpt = A->getValue(1);
           else
             continue;
-        }
-        else if (A->getOption().matches(options::OPT_Xsycl_backend)) {
+        } else if (A->getOption().matches(options::OPT_Xsycl_backend)) {
           // Passing device args: -Xsycl-target-backend <opt>
           TargetBE = &SYCLTripleList.front();
           DeviceOpt = A->getValue(0);
-        }
-        else if (A->getOption().matches(options::OPT_Xs_separate)) {
+        } else if (A->getOption().matches(options::OPT_Xs_separate)) {
           // Passing device args: -Xs <opt>
           DeviceOpt = A->getValue(0);
-        }
-        else {
+        } else {
           continue;
         };
       }
       useNative = false;
       if (needLibs && TC->getTriple().getArch() == llvm::Triple::spir64 &&
-        TC->getTriple().getSubArch() == llvm::Triple::SPIRSubArch_gen &&
-        TargetOpt && DeviceOpt) {
+          TC->getTriple().getSubArch() == llvm::Triple::SPIRSubArch_gen &&
+          TargetOpt && DeviceOpt) {
         useNative = strstr(DeviceOpt, "pvc") || strstr(DeviceOpt, "ats");
       }
       return needLibs;

From a7be7185daeff77f8b474774f7562621f9bb5ff1 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Wed, 5 Oct 2022 15:18:35 -0700
Subject: [PATCH 39/63] Renamed the bfloat aspects.

---
 clang/test/Driver/sycl-bfloat16-lib-win.cpp | 53 +++++++++++++--------
 clang/test/Driver/sycl-bfloat16-lib.cpp     | 53 +++++++++++++--------
 sycl/include/sycl/detail/pi.h               |  3 +-
 sycl/include/sycl/info/device_traits.def    |  2 +
 sycl/plugins/cuda/pi_cuda.cpp               |  3 +-
 sycl/plugins/hip/pi_hip.cpp                 |  1 +
 sycl/plugins/level_zero/pi_level_zero.cpp   |  4 ++
 sycl/plugins/opencl/pi_opencl.cpp           |  6 +++
 sycl/source/detail/device_info.hpp          | 26 +++++++++-
 9 files changed, 110 insertions(+), 41 deletions(-)

diff --git a/clang/test/Driver/sycl-bfloat16-lib-win.cpp b/clang/test/Driver/sycl-bfloat16-lib-win.cpp
index 5914a6766113c..c29eeed8601aa 100755
--- a/clang/test/Driver/sycl-bfloat16-lib-win.cpp
+++ b/clang/test/Driver/sycl-bfloat16-lib-win.cpp
@@ -9,9 +9,9 @@
 // RUN: %clangxx -fsycl %s --sysroot=%S/Inputs/SYCL -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=BFLOAT16
 
-// test that no bfloat16 libraries are added in JIT mode with generic target
+// test that fallback bfloat16 libraries are added in JIT mode with generic target
 // RUN: %clangxx -fsycl -fsycl-targets=spir64 %s -### 2>&1 \
-// RUN:   | FileCheck %s -check-prefix=BFLOAT16
+// RUN:   | FileCheck %s -check-prefix=BFLOAT16-FALLBACK
 
 // test that a PVC AOT compilation uses the native library
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device pvc" %s -### 2>&1 \
@@ -25,7 +25,22 @@
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %s -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=BFLOAT16-FALLBACK
 
-// BFLOAT16-NOT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-{{fallback|native}}-bfloat16.obj" "-output={{.*}}libsycl-complex-{{.*}}.o" "-unbundle"
+// BFLOAT16: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-crt.obj" "-output={{.*}}libsycl-crt-{{.*}}.o" "-unbundle"
+// BFLOAT16-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-complex.obj" "-output={{.*}}libsycl-complex-{{.*}}.o" "-unbundle"
+// BFLOAT16-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-complex-fp64.obj" "-output={{.*}}libsycl-complex-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-cmath.obj" "-output={{.*}}libsycl-cmath-{{.*}}.o" "-unbundle"
+// BFLOAT16-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-cmath-fp64.obj" "-output={{.*}}libsycl-cmath-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-mscv-math.obj" "-output={{.*}}libsycl-msvc-math-{{.*}}.o" "-unbundle"
+// BFLOAT16-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-imf.obj" "-output={{.*}}libsycl-imf-{{.*}}.o" "-unbundle"
+// BFLOAT16-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-imf-fp64.obj" "-output={{.*}}libsycl-imf-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-cassert.obj" "-output={{.*}}libsycl-fallback-cassert-{{.*}}.o" "-unbundle"
+// BFLOAT16-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-cstring.obj" "-output={{.*}}libsycl-fallback-cstring-{{.*}}.o" "-unbundle"
+// BFLOAT16-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-complex.obj" "-output={{.*}}libsycl-fallback-complex-{{.*}}.o" "-unbundle"
+// BFLOAT16-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath.obj" "-output={{.*}}libsycl-fallback-cmath-{{.*}}.o" "-unbundle"
+// BFLOAT16-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath-fp64.obj" "-output={{.*}}libsycl-fallback-cmath-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-imf.obj" "-output={{.*}}libsycl-fallback-imf-{{.*}}.o" "-unbundle"
+// BFLOAT16-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-imf-fp64.obj" "-output={{.*}}libsycl-fallback-imf-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NOT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-{{fallback|native}}-bfloat16.obj" "-output={{.*}}libsycl-{{fallback|native}}-{{.*}}.o" "-unbundle"
 
 // BFLOAT16-NATIVE: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-crt.obj" "-output={{.*}}libsycl-crt-{{.*}}.o" "-unbundle"
 // BFLOAT16-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-complex.obj" "-output={{.*}}libsycl-complex-{{.*}}.o" "-unbundle"
@@ -44,19 +59,19 @@
 // BFLOAT16-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-imf-fp64.obj" "-output={{.*}}libsycl-fallback-imf-fp64-{{.*}}.o" "-unbundle"
 // BFLOAT16-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-native-bfloat16.obj"
 
-// BFLOAT16-FALLBACK: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-crt.obj" "-output={{.*}}libsycl-crt-{{.*}}.o" "-unbundle"
-// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-complex.obj" "-output={{.*}}libsycl-complex-{{.*}}.o" "-unbundle"
-// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-complex-fp64.obj" "-output={{.*}}libsycl-complex-fp64-{{.*}}.o" "-unbundle"
-// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-cmath.obj" "-output={{.*}}libsycl-cmath-{{.*}}.o" "-unbundle"
-// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-cmath-fp64.obj" "-output={{.*}}libsycl-cmath-fp64-{{.*}}.o" "-unbundle"
-// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-mscv-math.obj" "-output={{.*}}libsycl-msvc-math-{{.*}}.o" "-unbundle"
-// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-imf.obj" "-output={{.*}}libsycl-imf-{{.*}}.o" "-unbundle"
-// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-imf-fp64.obj" "-output={{.*}}libsycl-imf-fp64-{{.*}}.o" "-unbundle"
-// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cassert.obj" "-output={{.*}}libsycl-fallback-cassert-{{.*}}.o" "-unbundle"
-// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cstring.obj" "-output={{.*}}libsycl-fallback-cstring-{{.*}}.o" "-unbundle"
-// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-complex.obj" "-output={{.*}}libsycl-fallback-complex-{{.*}}.o" "-unbundle"
-// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath.obj" "-output={{.*}}libsycl-fallback-cmath-{{.*}}.o" "-unbundle"
-// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath-fp64.obj" "-output={{.*}}libsycl-fallback-cmath-fp64-{{.*}}.o" "-unbundle"
-// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-imf.obj" "-output={{.*}}libsycl-fallback-imf-{{.*}}.o" "-unbundle"
-// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-imf-fp64.obj" "-output={{.*}}libsycl-fallback-imf-fp64-{{.*}}.o" "-unbundle"
-// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-bfloat16.obj"
+// BFLOAT16-FALLBACK: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-{{spir64_gen-|spir64-}}unknown-unknown" "-input={{.*}}libsycl-crt.obj" "-output={{.*}}libsycl-crt-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-{{spir64_gen-|spir64-}}unknown-unknown" "-input={{.*}}libsycl-complex.obj" "-output={{.*}}libsycl-complex-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-{{spir64_gen-|spir64-}}unknown-unknown" "-input={{.*}}libsycl-complex-fp64.obj" "-output={{.*}}libsycl-complex-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-{{spir64_gen-|spir64-}}unknown-unknown" "-input={{.*}}libsycl-cmath.obj" "-output={{.*}}libsycl-cmath-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-{{spir64_gen-|spir64-}}unknown-unknown" "-input={{.*}}libsycl-cmath-fp64.obj" "-output={{.*}}libsycl-cmath-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-{{spir64_gen-|spir64-}}unknown-unknown" "-input={{.*}}libsycl-mscv-math.obj" "-output={{.*}}libsycl-msvc-math-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-{{spir64_gen-|spir64-}}unknown-unknown" "-input={{.*}}libsycl-imf.obj" "-output={{.*}}libsycl-imf-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-{{spir64_gen-|spir64-}}unknown-unknown" "-input={{.*}}libsycl-imf-fp64.obj" "-output={{.*}}libsycl-imf-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-{{spir64_gen-|spir64-}}unknown-unknown" "-input={{.*}}libsycl-fallback-cassert.obj" "-output={{.*}}libsycl-fallback-cassert-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-{{spir64_gen-|spir64-}}unknown-unknown" "-input={{.*}}libsycl-fallback-cstring.obj" "-output={{.*}}libsycl-fallback-cstring-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-{{spir64_gen-|spir64-}}unknown-unknown" "-input={{.*}}libsycl-fallback-complex.obj" "-output={{.*}}libsycl-fallback-complex-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-{{spir64_gen-|spir64-}}unknown-unknown" "-input={{.*}}libsycl-fallback-cmath.obj" "-output={{.*}}libsycl-fallback-cmath-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-{{spir64_gen-|spir64-}}unknown-unknown" "-input={{.*}}libsycl-fallback-cmath-fp64.obj" "-output={{.*}}libsycl-fallback-cmath-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-{{spir64_gen-|spir64-}}unknown-unknown" "-input={{.*}}libsycl-fallback-imf.obj" "-output={{.*}}libsycl-fallback-imf-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-{{spir64_gen-|spir64-}}unknown-unknown" "-input={{.*}}libsycl-fallback-imf-fp64.obj" "-output={{.*}}libsycl-fallback-imf-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-{{spir64_gen-|spir64-}}unknown-unknown" "-input={{.*}}libsycl-fallback-bfloat16.obj"
diff --git a/clang/test/Driver/sycl-bfloat16-lib.cpp b/clang/test/Driver/sycl-bfloat16-lib.cpp
index eb9833c615c15..5fe572ebc6925 100755
--- a/clang/test/Driver/sycl-bfloat16-lib.cpp
+++ b/clang/test/Driver/sycl-bfloat16-lib.cpp
@@ -9,9 +9,9 @@
 // RUN: %clangxx -fsycl %s --sysroot=%S/Inputs/SYCL -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=BFLOAT16
 
-// test that no bfloat16 libraries are added in JIT mode with generic target
+// test that fallback bfloat16 libraries are added in JIT mode with generic target
 // RUN: %clangxx -fsycl -fsycl-targets=spir64 %s -### 2>&1 \
-// RUN:   | FileCheck %s -check-prefix=BFLOAT16
+// RUN:   | FileCheck %s -check-prefix=BFLOAT16-FALLBACK
 
 // test that a PVC AOT compilation uses the native library
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device pvc" %s -### 2>&1 \
@@ -25,7 +25,22 @@
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %s -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=BFLOAT16-FALLBACK
 
-// BFLOAT16-NOT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-{{fallback|native}}-bfloat16.o" "-output={{.*}}libsycl-complex-{{.*}}.o" "-unbundle"
+// BFLOAT16: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-crt.o" "-output={{.*}}libsycl-crt-{{.*}}.o" "-unbundle"
+// BFLOAT16-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-complex.o" "-output={{.*}}libsycl-complex-{{.*}}.o" "-unbundle"
+// BFLOAT16-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-complex-fp64.o" "-output={{.*}}libsycl-complex-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-cmath.o" "-output={{.*}}libsycl-cmath-{{.*}}.o" "-unbundle"
+// BFLOAT16-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-cmath-fp64.o" "-output={{.*}}libsycl-cmath-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-imf.o" "-output={{.*}}libsycl-imf-{{.*}}.o" "-unbundle"
+// BFLOAT16-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-imf-fp64.o" "-output={{.*}}libsycl-imf-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-cassert.o" "-output={{.*}}libsycl-fallback-cassert-{{.*}}.o" "-unbundle"
+// BFLOAT16-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-cstring.o" "-output={{.*}}libsycl-fallback-cstring-{{.*}}.o" "-unbundle"
+// BFLOAT16-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-complex.o" "-output={{.*}}libsycl-fallback-complex-{{.*}}.o" "-unbundle"
+// BFLOAT16-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-complex-fp64.o" "-output={{.*}}libsycl-fallback-complex-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath.o" "-output={{.*}}libsycl-fallback-cmath-{{.*}}.o" "-unbundle"
+// BFLOAT16-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath-fp64.o" "-output={{.*}}libsycl-fallback-cmath-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-imf.o" "-output={{.*}}libsycl-fallback-imf-{{.*}}.o" "-unbundle"
+// BFLOAT16-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-imf-fp64.o" "-output={{.*}}libsycl-fallback-imf-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NOT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-{{fallback|native}}-bfloat16.o" "-output={{.*}}libsycl-{{fallback|native}}-{{.*}}.o" "-unbundle"
 
 // BFLOAT16-NATIVE: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-crt.o" "-output={{.*}}libsycl-crt-{{.*}}.o" "-unbundle"
 // BFLOAT16-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-complex.o" "-output={{.*}}libsycl-complex-{{.*}}.o" "-unbundle"
@@ -44,19 +59,19 @@
 // BFLOAT16-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-imf-fp64.o" "-output={{.*}}libsycl-fallback-imf-fp64-{{.*}}.o" "-unbundle"
 // BFLOAT16-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-native-bfloat16.o"
 
-// BFLOAT16-FALLBACK: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-crt.o" "-output={{.*}}libsycl-crt-{{.*}}.o" "-unbundle"
-// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-complex.o" "-output={{.*}}libsycl-complex-{{.*}}.o" "-unbundle"
-// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-complex-fp64.o" "-output={{.*}}libsycl-complex-fp64-{{.*}}.o" "-unbundle"
-// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-cmath.o" "-output={{.*}}libsycl-cmath-{{.*}}.o" "-unbundle"
-// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-cmath-fp64.o" "-output={{.*}}libsycl-cmath-fp64-{{.*}}.o" "-unbundle"
-// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-imf.o" "-output={{.*}}libsycl-imf-{{.*}}.o" "-unbundle"
-// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-imf-fp64.o" "-output={{.*}}libsycl-imf-fp64-{{.*}}.o" "-unbundle"
-// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cassert.o" "-output={{.*}}libsycl-fallback-cassert-{{.*}}.o" "-unbundle"
-// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cstring.o" "-output={{.*}}libsycl-fallback-cstring-{{.*}}.o" "-unbundle"
-// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-complex.o" "-output={{.*}}libsycl-fallback-complex-{{.*}}.o" "-unbundle"
-// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-complex-fp64.o" "-output={{.*}}libsycl-fallback-complex-fp64-{{.*}}.o" "-unbundle"
-// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath.o" "-output={{.*}}libsycl-fallback-cmath-{{.*}}.o" "-unbundle"
-// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath-fp64.o" "-output={{.*}}libsycl-fallback-cmath-fp64-{{.*}}.o" "-unbundle"
-// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-imf.o" "-output={{.*}}libsycl-fallback-imf-{{.*}}.o" "-unbundle"
-// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-imf-fp64.o" "-output={{.*}}libsycl-fallback-imf-fp64-{{.*}}.o" "-unbundle"
-// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-bfloat16.o"
+// BFLOAT16-FALLBACK: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-{{spir64_gen-|spir64-}}unknown-unknown" "-input={{.*}}libsycl-crt.o" "-output={{.*}}libsycl-crt-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-{{spir64_gen-|spir64-}}unknown-unknown" "-input={{.*}}libsycl-complex.o" "-output={{.*}}libsycl-complex-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-{{spir64_gen-|spir64-}}unknown-unknown" "-input={{.*}}libsycl-complex-fp64.o" "-output={{.*}}libsycl-complex-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-{{spir64_gen-|spir64-}}unknown-unknown" "-input={{.*}}libsycl-cmath.o" "-output={{.*}}libsycl-cmath-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-{{spir64_gen-|spir64-}}unknown-unknown" "-input={{.*}}libsycl-cmath-fp64.o" "-output={{.*}}libsycl-cmath-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-{{spir64_gen-|spir64-}}unknown-unknown" "-input={{.*}}libsycl-imf.o" "-output={{.*}}libsycl-imf-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-{{spir64_gen-|spir64-}}unknown-unknown" "-input={{.*}}libsycl-imf-fp64.o" "-output={{.*}}libsycl-imf-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-{{spir64_gen-|spir64-}}unknown-unknown" "-input={{.*}}libsycl-fallback-cassert.o" "-output={{.*}}libsycl-fallback-cassert-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-{{spir64_gen-|spir64-}}unknown-unknown" "-input={{.*}}libsycl-fallback-cstring.o" "-output={{.*}}libsycl-fallback-cstring-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-{{spir64_gen-|spir64-}}unknown-unknown" "-input={{.*}}libsycl-fallback-complex.o" "-output={{.*}}libsycl-fallback-complex-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-{{spir64_gen-|spir64-}}unknown-unknown" "-input={{.*}}libsycl-fallback-complex-fp64.o" "-output={{.*}}libsycl-fallback-complex-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-{{spir64_gen-|spir64-}}unknown-unknown" "-input={{.*}}libsycl-fallback-cmath.o" "-output={{.*}}libsycl-fallback-cmath-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-{{spir64_gen-|spir64-}}unknown-unknown" "-input={{.*}}libsycl-fallback-cmath-fp64.o" "-output={{.*}}libsycl-fallback-cmath-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-{{spir64_gen-|spir64-}}unknown-unknown" "-input={{.*}}libsycl-fallback-imf.o" "-output={{.*}}libsycl-fallback-imf-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-{{spir64_gen-|spir64-}}unknown-unknown" "-input={{.*}}libsycl-fallback-imf-fp64.o" "-output={{.*}}libsycl-fallback-imf-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-{{spir64_gen-|spir64-}}unknown-unknown" "-input={{.*}}libsycl-fallback-bfloat16.o"
diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h
index 492e43534dcdd..d98f934b243fc 100644
--- a/sycl/include/sycl/detail/pi.h
+++ b/sycl/include/sycl/detail/pi.h
@@ -280,7 +280,8 @@ typedef enum {
   PI_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES = 0x11000,
   PI_DEVICE_INFO_GPU_HW_THREADS_PER_EU = 0x10112,
   PI_DEVICE_INFO_BACKEND_VERSION = 0x10113,
-  // Return true if bfloat16 data type is supported by device
+  // Return whether bfloat16 conversions and math are supported by device
+  PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16_MATH_FUNCTIONS = 0x1FFFE,
   PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16 = 0x1FFFF,
   PI_EXT_ONEAPI_DEVICE_INFO_MAX_GLOBAL_WORK_GROUPS = 0x20000,
   PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_1D = 0x20001,
diff --git a/sycl/include/sycl/info/device_traits.def b/sycl/include/sycl/info/device_traits.def
index 9d1813ea10ce6..49e5632e706fe 100644
--- a/sycl/include/sycl/info/device_traits.def
+++ b/sycl/include/sycl/info/device_traits.def
@@ -195,6 +195,8 @@ __SYCL_PARAM_TRAITS_SPEC(device, ext_intel_mem_channel, bool,
                          PI_MEM_PROPERTIES_CHANNEL)
 __SYCL_PARAM_TRAITS_SPEC(device, ext_oneapi_bfloat16, bool,
                          PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16)
+__SYCL_PARAM_TRAITS_SPEC(device, ext_oneapi_bfloat16_math_functions, bool,
+                         PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16_MATH_FUNCTIONS)
 
 //Deprecated oneapi/intel extension
 //TODO:Remove when possible
diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp
index 70fd058197ebc..7575721543666 100644
--- a/sycl/plugins/cuda/pi_cuda.cpp
+++ b/sycl/plugins/cuda/pi_cuda.cpp
@@ -1301,7 +1301,8 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
     return getInfo(param_value_size, param_value, param_value_size_ret,
                    capabilities);
   }
-  case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16: {
+  case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16:
+  case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16_MATH_FUNCTIONS: {
     int major = 0;
     sycl::detail::pi::assertion(
         cuDeviceGetAttribute(&major,
diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp
index 82f68dc3cf0f3..9bd51b115d10b 100644
--- a/sycl/plugins/hip/pi_hip.cpp
+++ b/sycl/plugins/hip/pi_hip.cpp
@@ -1802,6 +1802,7 @@ pi_result hip_piDeviceGetInfo(pi_device device, pi_device_info param_name,
   case PI_DEVICE_INFO_GPU_HW_THREADS_PER_EU:
   case PI_DEVICE_INFO_MAX_MEM_BANDWIDTH:
   case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16:
+  case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16_MATH_FUNCTIONS:
     return PI_ERROR_INVALID_VALUE;
 
   default:
diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp
index f3ab9d39fe74e..adec955fdbb03 100644
--- a/sycl/plugins/level_zero/pi_level_zero.cpp
+++ b/sycl/plugins/level_zero/pi_level_zero.cpp
@@ -3214,6 +3214,10 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName,
     // currently not supported in level zero runtime
     return PI_ERROR_INVALID_VALUE;
   case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16: {
+    // bfloat16 conversions are supported on Intel GPUs.
+    return ReturnValue(bool{true});
+  }
+  case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16_MATH_FUNCTIONS: {
     // bfloat16 math functions are not yet supported on Intel GPUs.
     return ReturnValue(bool{false});
   }
diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp
index f416149271dec..1bcbbba2e58d5 100644
--- a/sycl/plugins/opencl/pi_opencl.cpp
+++ b/sycl/plugins/opencl/pi_opencl.cpp
@@ -296,6 +296,12 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName,
     return PI_SUCCESS;
   }
   case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16: {
+    // bfloat16 conversions are supported on Intel GPUs.
+    cl_bool result = true;
+    std::memcpy(paramValue, &result, sizeof(cl_bool));
+    return PI_SUCCESS;
+  }
+  case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16_MATH_FUNCTIONS: {
     // bfloat16 math functions are not yet supported on Intel GPUs.
     cl_bool result = false;
     std::memcpy(paramValue, &result, sizeof(cl_bool));
diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp
index d06c84da40c20..7421c79a46e84 100644
--- a/sycl/source/detail/device_info.hpp
+++ b/sycl/source/detail/device_info.hpp
@@ -279,7 +279,7 @@ struct get_device_info_impl<std::vector<memory_scope>,
   }
 };
 
-// Specialization for bf16
+// Specialization for bf16 conversions
 template <>
 struct get_device_info_impl<bool, info::device::ext_oneapi_bfloat16> {
   static bool get(RT::PiDevice dev, const plugin &Plugin) {
@@ -295,6 +295,24 @@ struct get_device_info_impl<bool, info::device::ext_oneapi_bfloat16> {
   }
 };
 
+// Specialization for bf16 math functions
+template <>
+struct get_device_info_impl<bool,
+                            info::device::ext_oneapi_bfloat16_math_functions> {
+  static bool get(RT::PiDevice dev, const plugin &Plugin) {
+    bool result = false;
+
+    RT::PiResult Err = Plugin.call_nocheck<PiApiKind::piDeviceGetInfo>(
+        dev,
+        PiInfoCode<info::device::ext_oneapi_bfloat16_math_functions>::value,
+        sizeof(result), &result, nullptr);
+    if (Err != PI_SUCCESS) {
+      return false;
+    }
+    return result;
+  }
+};
+
 // Specialization for exec_capabilities, OpenCL returns a bitfield
 template <>
 struct get_device_info_impl<std::vector<info::execution_capability>,
@@ -1002,6 +1020,12 @@ get_device_info_host<info::device::atomic_memory_scope_capabilities>() {
 
 template <>
 inline bool get_device_info_host<info::device::ext_oneapi_bfloat16>() {
+  return true;
+}
+
+template <>
+inline bool
+get_device_info_host<info::device::ext_oneapi_bfloat16_math_functions>() {
   return false;
 }
 

From cac1c18f6038c352821a0065d89b70d697a649f4 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Wed, 5 Oct 2022 17:14:06 -0700
Subject: [PATCH 40/63] Fixes for generic JIT compilation.

---
 clang/lib/Driver/Driver.cpp             | 3 +--
 clang/test/Driver/sycl-bfloat16-lib.cpp | 4 ++--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 3c7e5a3a43a11..a724fb15a5321 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -5125,8 +5125,7 @@ class OffloadingActionBuilder final {
 
         if (A->getOption().matches(options::OPT_fsycl_targets_EQ)) {
           // Passing arg: -fsycl-targets=<targets>.
-          needLibs =
-              TC->getTriple().getSubArch() != llvm::Triple::SPIRSubArch_fpga;
+          needLibs = TC->getTriple().getSubArch() != llvm::Triple::NoSubArch;
           TargetBE = GetTripleIt(A->getValue(0));
           if (TargetBE)
             TargetOpt = A->getValue(0);
diff --git a/clang/test/Driver/sycl-bfloat16-lib.cpp b/clang/test/Driver/sycl-bfloat16-lib.cpp
index 5fe572ebc6925..65283d77e51ef 100755
--- a/clang/test/Driver/sycl-bfloat16-lib.cpp
+++ b/clang/test/Driver/sycl-bfloat16-lib.cpp
@@ -9,9 +9,9 @@
 // RUN: %clangxx -fsycl %s --sysroot=%S/Inputs/SYCL -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=BFLOAT16
 
-// test that fallback bfloat16 libraries are added in JIT mode with generic target
+// test that no bfloat16 libraries are added in JIT mode with generic target
 // RUN: %clangxx -fsycl -fsycl-targets=spir64 %s -### 2>&1 \
-// RUN:   | FileCheck %s -check-prefix=BFLOAT16-FALLBACK
+// RUN:   | FileCheck %s -check-prefix=BFLOAT16
 
 // test that a PVC AOT compilation uses the native library
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device pvc" %s -### 2>&1 \

From 208c09aaf7df7be72a299c5c9831c802d97d8b3f Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Thu, 6 Oct 2022 00:12:10 -0700
Subject: [PATCH 41/63] Changes for AOT sycl-targets switch.

---
 clang/lib/Driver/Driver.cpp                         | 13 ++++++++-----
 .../supported/sycl_ext_oneapi_bfloat16.asciidoc     |  9 +++++++++
 sycl/test/abi/sycl_symbols_linux.dump               |  1 +
 sycl/test/abi/sycl_symbols_windows.dump             |  1 +
 4 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index a724fb15a5321..64541f2a7951e 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -5107,11 +5107,10 @@ class OffloadingActionBuilder final {
 
     // Return whether to use native bfloat16 library.
     bool selectBfloatLibs(const ToolChain *TC, bool &useNative) {
-      bool needLibs = false;
-
       const OptTable &Opts = C.getDriver().getOpts();
       const char *TargetOpt = nullptr;
       const char *DeviceOpt = nullptr;
+      bool needLibs = false;
       for (auto *A : Args) {
         llvm::Triple *TargetBE = nullptr;
 
@@ -5124,8 +5123,12 @@ class OffloadingActionBuilder final {
         };
 
         if (A->getOption().matches(options::OPT_fsycl_targets_EQ)) {
-          // Passing arg: -fsycl-targets=<targets>.
-          needLibs = TC->getTriple().getSubArch() != llvm::Triple::NoSubArch;
+          // When a generic target "spir64" is used with other AOT targets
+          // we use fallback libraries.
+          if (TC->getTriple().getSubArch() == llvm::Triple::NoSubArch)
+            needLibs = DeviceLinkerInputs.size() > 1;
+          else
+            needLibs = true;
           TargetBE = GetTripleIt(A->getValue(0));
           if (TargetBE)
             TargetOpt = A->getValue(0);
@@ -5150,7 +5153,7 @@ class OffloadingActionBuilder final {
         };
       }
       useNative = false;
-      if (needLibs && TC->getTriple().getArch() == llvm::Triple::spir64 &&
+      if (needLibs &&
           TC->getTriple().getSubArch() == llvm::Triple::SPIRSubArch_gen &&
           TargetOpt && DeviceOpt) {
         useNative = strstr(DeviceOpt, "pvc") || strstr(DeviceOpt, "ats");
diff --git a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
index ab86934321bda..9362a613908d3 100644
--- a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
+++ b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
@@ -44,6 +44,15 @@ This extension is written against the SYCL 2020 specification, Revision 5.
 == Status
 
 This extension is implemented and fully supported by DPC++.
+[NOTE]
+Currently, if AOT compilation is done for multiple targets
+using "-fsycl-targets=<A,B,C>" then `bfloat16` support will be native
+or software emulation depending on the target, with the exception of the
+"spir64" target. That is actually a JIT compilation and it will use software
+emulation regardless of the actual runtime execution device.
+In JIT mode where the "-fsycl-targets=" switch is not specified,
+the selection of native or emulation is at runtime,
+depending on the device.
 
 == Overview
 
diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump
index a17fe783f3b60..db8ebb2d377a0 100644
--- a/sycl/test/abi/sycl_symbols_linux.dump
+++ b/sycl/test/abi/sycl_symbols_linux.dump
@@ -4263,6 +4263,7 @@ _ZNK4sycl3_V16device8get_infoINS0_4info6device32atomic_memory_scope_capabilities
 _ZNK4sycl3_V16device8get_infoINS0_4info6device33ext_intel_gpu_subslices_per_sliceEEENS0_6detail19is_device_info_descIT_E11return_typeEv
 _ZNK4sycl3_V16device8get_infoINS0_4info6device33ext_oneapi_max_global_work_groupsEEENS0_6detail19is_device_info_descIT_E11return_typeEv
 _ZNK4sycl3_V16device8get_infoINS0_4info6device33usm_restricted_shared_allocationsEEENS0_6detail19is_device_info_descIT_E11return_typeEv
+_ZNK4sycl3_V16device8get_infoINS0_4info6device34ext_oneapi_bfloat16_math_functionsEEENS0_6detail19is_device_info_descIT_E11return_typeEv
 _ZNK4sycl3_V16device8get_infoINS0_4info6device35ext_intel_gpu_eu_count_per_subsliceEEENS0_6detail19is_device_info_descIT_E11return_typeEv
 _ZNK4sycl3_V16device8get_infoINS0_4info6device38sub_group_independent_forward_progressEEENS0_6detail19is_device_info_descIT_E11return_typeEv
 _ZNK4sycl3_V16device8get_infoINS0_4info6device4nameEEENS0_6detail19is_device_info_descIT_E11return_typeEv
diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump
index fd001e427e986..40549d01be162 100644
--- a/sycl/test/abi/sycl_symbols_windows.dump
+++ b/sycl/test/abi/sycl_symbols_windows.dump
@@ -58,6 +58,7 @@
 ??$get_info@Uext_intel_mem_channel@device@info@_V1@sycl@@@device@_V1@sycl@@QEBA_NXZ
 ??$get_info@Uext_intel_pci_address@device@info@_V1@sycl@@@device@_V1@sycl@@QEBA?AV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@XZ
 ??$get_info@Uext_oneapi_bfloat16@device@info@_V1@sycl@@@device@_V1@sycl@@QEBA_NXZ
+??$get_info@Uext_oneapi_bfloat16_math_functions@device@info@_V1@sycl@@@device@_V1@sycl@@QEBA_NXZ
 ??$get_info@Uext_oneapi_max_global_work_groups@device@info@_V1@sycl@@@device@_V1@sycl@@QEBA_KXZ
 ??$get_info@Uext_oneapi_max_work_groups_1d@device@info@_V1@sycl@@@device@_V1@sycl@@QEBA?AV?$id@$00@12@XZ
 ??$get_info@Uext_oneapi_max_work_groups_2d@device@info@_V1@sycl@@@device@_V1@sycl@@QEBA?AV?$id@$01@12@XZ

From 68308576d98aedf5be2c60e30de66c837e614aff Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Thu, 6 Oct 2022 16:50:51 -0700
Subject: [PATCH 42/63] Corrected aspects queries.

---
 .../sycl_ext_oneapi_bfloat16_math.asciidoc    | 50 +++++++++++++------
 .../sycl_ext_oneapi_bfloat16.asciidoc         | 13 +++++
 sycl/include/sycl/aspects.hpp                 |  1 +
 sycl/source/detail/device_impl.cpp            |  2 +
 4 files changed, 52 insertions(+), 14 deletions(-)

diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math.asciidoc
index fe860015977a3..7bb840551847e 100644
--- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math.asciidoc
+++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math.asciidoc
@@ -24,7 +24,9 @@
 
 Copyright © 2022-2022 Intel Corporation. All rights reserved.
 
-Khronos® is a registered trademark and SYCL™ and SPIR™ are trademarks of The Khronos Group Inc. OpenCL™ is a trademark of Apple Inc. used by permission by Khronos.
+Khronos® is a registered trademark and SYCL™ and SPIR™ are trademarks of
+The Khronos Group Inc. OpenCL™ is a trademark of Apple Inc. used by permission
+by Khronos.
 
 == Contact
 
@@ -34,7 +36,9 @@ https://github.com/intel/llvm/issues
 
 == Dependencies
 
-This extension is written against the SYCL 2020 revision 5 specification. All references below to the "core SYCL specification" or to section numbers in the SYCL specification refer to that revision.
+This extension is written against the SYCL 2020 revision 5 specification.
+All references below to the "core SYCL specification" or to section
+numbers in the SYCL specification refer to that revision.
 
 This extension depends on the following other SYCL extension:
 
@@ -44,14 +48,21 @@ This extension depends on the following other SYCL extension:
 
 This is an experimental extension specification, intended to provide early
 access to features and gather community feedback. Interfaces defined in this
-specification are implemented in DPC\++ but they are not finalized and may change incompatibly in future versions of DPC++ without prior notice.
-Shipping software products should not rely on APIs defined in this specification.
+specification are implemented in DPC\++ but they are not finalized and may
+change incompatibly in future versions of DPC++ without prior notice.
+Shipping software products should not rely on APIs defined in this
+specification.
 
 == Overview
 
-This extension adds `bfloat16` support to the `fma`, `fmin`, `fmax` and `fabs` SYCL floating point math functions. These functions can be used as element wise operations on matrices, supplementing the `bfloat16` support in the sycl_ext_oneapi_matrix extension.
+This extension adds `bfloat16` support to the `fma`, `fmin`, `fmax` and
+`fabs` SYCL floating point math functions. These functions can be used as
+element wise operations on matrices, supplementing the `bfloat16` support
+in the sycl_ext_oneapi_matrix extension.
 
-The descriptions of the `fma`, `fmin`, `fmax` and `fabs` SYCL floating point math functions can be found in the SYCL specification: https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#_math_functions.
+The descriptions of the `fma`, `fmin`, `fmax` and `fabs` SYCL floating point
+math functions can be found in the SYCL specification:
+https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#_math_functions.
 
 == Specification
 
@@ -62,7 +73,8 @@ specification section 6.3.3 "Feature test macros". Therefore, an implementation
 supporting this extension must predefine the macro 
 `SYCL_EXT_ONEAPI_BFLOAT16_MATH` to one of the values defined in the table
 below. Applications can test for the existence of this macro to determine if
-the implementation supports this feature, or applications can test the macro's value to determine which of the extension's APIs the implementation supports.
+the implementation supports this feature, or applications can test the macro's
+value to determine which of the extension's APIs the implementation supports.
  
 [%header,cols="1,5"]
 |===
@@ -77,16 +89,19 @@ the implementation supports this feature, or applications can test the macro's v
 namespace sycl {
 enum class aspect {
   ...
-  ext_oneapi_bfloat16
+  ext_oneapi_bfloat16_math_functions
 }
 }
 ----
 
-If a SYCL device has the `ext_oneapi_bfloat16` aspect, then it supports the `bfloat16` math functions described in the next section.
+If a SYCL device has the `ext_oneapi_bfloat16_math_functions` aspect,
+then it supports the `bfloat16` math functions described in the next section.
 
 === Math Functions
 
-The following functions are only available when `T` is `bfloat16` or `sycl::marray<bfloat16, {N}>`, where `{N}` means any positive value of `size_t` type.
+The following functions are only available when `T` is `bfloat16` or
+`sycl::marray<bfloat16, {N}>`, where `{N}` means any positive value of
+`size_t` type.
 
 ==== fma
 
@@ -100,8 +115,10 @@ T fma(T a, T b, T c);
 
 ===== Description
 
-Returns the correctly rounded floating-point representation of the sum of `c` with the infinitely precise product of `a` and `b`.
-Rounding of intermediate products shall not occur. The mantissa LSB rounds to the nearest even. Subnormal numbers are supported.
+Returns the correctly rounded floating-point representation of the
+sum of `c` with the infinitely precise product of `a` and `b`.
+Rounding of intermediate products shall not occur. The mantissa
+LSB rounds to the nearest even. Subnormal numbers are supported.
 
 ==== fmax
 
@@ -154,6 +171,11 @@ Compute absolute value of a `bfloat16`.
 
 == Issues
 
-1. The CUDA backend does not have a use case that would necessitate support of the `vec` class in bfloat16 math functions, and `marray` would always be preferred over `vec` if `vec` support were to be added in the CUDA backend. For portability reasons, support for the `vec` class can be easily added if other backends require it.
+1. The CUDA backend does not have a use case that would necessitate support
+of the `vec` class in bfloat16 math functions, and `marray` would always be
+preferred over `vec` if `vec` support were to be added in the CUDA backend.
+For portability reasons, support for the `vec` class can be easily added if
+other backends require it.
 
-2. We should decide on a roadmap to extend support of `bfloat16` to other SYCL 2020 math functions.
+2. We should decide on a roadmap to extend support of `bfloat16` to other
+SYCL 2020 math functions.
diff --git a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
index 9362a613908d3..668b803be40e1 100644
--- a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
+++ b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
@@ -91,7 +91,20 @@ value to determine which of the extension’s APIs the implementation supports.
 |Value |Description
 |1     |Initial extension version. Base features are supported.
 |===
+=== Extension to `enum class aspect`
 
+[source]
+----
+namespace sycl {
+enum class aspect {
+  ...
+  ext_oneapi_bfloat16
+}
+}
+----
+
+If a SYCL device has the `ext_oneapi_bfloat16` aspect, then it supports
+`bfloat16` conversions described in the next section.
 
 === New `bfloat16` class
 
diff --git a/sycl/include/sycl/aspects.hpp b/sycl/include/sycl/aspects.hpp
index 418c77b943159..db07cc6e6b351 100644
--- a/sycl/include/sycl/aspects.hpp
+++ b/sycl/include/sycl/aspects.hpp
@@ -52,6 +52,7 @@ enum class __SYCL_TYPE(aspect) aspect {
   ext_oneapi_cuda_async_barrier = 34,
   ext_oneapi_bfloat16 = 35,
   ext_intel_free_memory = 36,
+  ext_oneapi_bfloat16_math_functions = 37,
 };
 
 } // __SYCL_INLINE_VER_NAMESPACE(_V1)
diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp
index b73e1fd46f285..2a1b589be843e 100644
--- a/sycl/source/detail/device_impl.cpp
+++ b/sycl/source/detail/device_impl.cpp
@@ -277,6 +277,8 @@ bool device_impl::has(aspect Aspect) const {
     return has_extension("cl_khr_fp64");
   case aspect::ext_oneapi_bfloat16:
     return get_info<info::device::ext_oneapi_bfloat16>();
+  case aspect::ext_oneapi_bfloat16_math_functions:
+    return get_info<info::device::ext_oneapi_bfloat16_math_functions>();
   case aspect::int64_base_atomics:
     return has_extension("cl_khr_int64_base_atomics");
   case aspect::int64_extended_atomics:

From 10fc9a3072151c35ff99411dcc55cdeb4ce3f448 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Fri, 7 Oct 2022 20:21:28 -0700
Subject: [PATCH 43/63] Change in the way fallback/native libs are selected.

---
 .../program_manager/program_manager.cpp       | 119 +++++++++---------
 1 file changed, 62 insertions(+), 57 deletions(-)

diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp
index f2184ec379782..bfca442f1b197 100644
--- a/sycl/source/detail/program_manager/program_manager.cpp
+++ b/sycl/source/detail/program_manager/program_manager.cpp
@@ -754,54 +754,60 @@ static bool loadDeviceLib(const ContextImplPtr Context, const char *Name,
   return Prog != nullptr;
 }
 
-static const char *getDeviceLibFilename(DeviceLibExt Extension) {
-  switch (Extension) {
-  case DeviceLibExt::cl_intel_devicelib_assert:
-    return "libsycl-fallback-cassert.spv";
-  case DeviceLibExt::cl_intel_devicelib_math:
-    return "libsycl-fallback-cmath.spv";
-  case DeviceLibExt::cl_intel_devicelib_math_fp64:
-    return "libsycl-fallback-cmath-fp64.spv";
-  case DeviceLibExt::cl_intel_devicelib_complex:
-    return "libsycl-fallback-complex.spv";
-  case DeviceLibExt::cl_intel_devicelib_complex_fp64:
-    return "libsycl-fallback-complex-fp64.spv";
-  case DeviceLibExt::cl_intel_devicelib_cstring:
-    return "libsycl-fallback-cstring.spv";
-  case DeviceLibExt::cl_intel_devicelib_imf:
-    return "libsycl-fallback-imf.spv";
-  case DeviceLibExt::cl_intel_devicelib_imf_fp64:
-    return "libsycl-fallback-imf-fp64.spv";
-  case DeviceLibExt::cl_intel_devicelib_bfloat16:
-    return "libsycl-fallback-bfloat16.spv";
-  }
-  throw compile_program_error("Unhandled (new?) device library extension",
-                              PI_ERROR_INVALID_OPERATION);
+static const std::map<DeviceLibExt,
+                      std::pair<const std::string, const std::string>>
+    DeviceLibNames = {
+        {DeviceLibExt::cl_intel_devicelib_assert,
+         {"libsycl-fallback-cassert.spv", ""}},
+        {DeviceLibExt::cl_intel_devicelib_math,
+         {"libsycl-fallback-cmath.spv", ""}},
+        {DeviceLibExt::cl_intel_devicelib_math_fp64,
+         {"libsycl-fallback-cmath-fp64.spv", ""}},
+        {DeviceLibExt::cl_intel_devicelib_complex,
+         {"libsycl-fallback-complex.spv", ""}},
+        {DeviceLibExt::cl_intel_devicelib_complex_fp64,
+         {"libsycl-fallback-complex-fp64.spv", ""}},
+        {DeviceLibExt::cl_intel_devicelib_cstring,
+         {"libsycl-fallback-cstring.spv", ""}},
+        {DeviceLibExt::cl_intel_devicelib_imf,
+         {"libsycl-fallback-imf.spv", ""}},
+        {DeviceLibExt::cl_intel_devicelib_imf_fp64,
+         {"libsycl-fallback-imf-fp64.spv", ""}},
+        {DeviceLibExt::cl_intel_devicelib_bfloat16,
+         {"libsycl-fallback-bfloat16.spv", "libsycl-native-bfloat16.spv"}}};
+
+static const std::string getDeviceLibFilename(DeviceLibExt Extension,
+                                              bool Native) {
+  auto LibPair = DeviceLibNames.find(Extension);
+  std::string Lib;
+  if (LibPair != DeviceLibNames.end())
+    Lib = Native ? LibPair->second.second : LibPair->second.first;
+  if (Lib.empty())
+    throw compile_program_error("Unhandled (new?) device library extension",
+                                PI_ERROR_INVALID_OPERATION);
+  return Lib;
 }
 
-static const char *getDeviceLibExtensionStr(DeviceLibExt Extension) {
-  switch (Extension) {
-  case DeviceLibExt::cl_intel_devicelib_assert:
-    return "cl_intel_devicelib_assert";
-  case DeviceLibExt::cl_intel_devicelib_math:
-    return "cl_intel_devicelib_math";
-  case DeviceLibExt::cl_intel_devicelib_math_fp64:
-    return "cl_intel_devicelib_math_fp64";
-  case DeviceLibExt::cl_intel_devicelib_complex:
-    return "cl_intel_devicelib_complex";
-  case DeviceLibExt::cl_intel_devicelib_complex_fp64:
-    return "cl_intel_devicelib_complex_fp64";
-  case DeviceLibExt::cl_intel_devicelib_cstring:
-    return "cl_intel_devicelib_cstring";
-  case DeviceLibExt::cl_intel_devicelib_imf:
-    return "cl_intel_devicelib_imf";
-  case DeviceLibExt::cl_intel_devicelib_imf_fp64:
-    return "cl_intel_devicelib_imf_fp64";
-  case DeviceLibExt::cl_intel_devicelib_bfloat16:
-    return "cl_intel_bfloat16_conversions";
-  }
-  throw compile_program_error("Unhandled (new?) device library extension",
-                              PI_ERROR_INVALID_OPERATION);
+static const std::map<DeviceLibExt, std::string> DeviceLibExtensionStrs = {
+    {DeviceLibExt::cl_intel_devicelib_assert, "cl_intel_devicelib_assert"},
+    {DeviceLibExt::cl_intel_devicelib_math, "cl_intel_devicelib_math"},
+    {DeviceLibExt::cl_intel_devicelib_math_fp64,
+     "cl_intel_devicelib_math_fp64"},
+    {DeviceLibExt::cl_intel_devicelib_complex, "cl_intel_devicelib_complex"},
+    {DeviceLibExt::cl_intel_devicelib_complex_fp64,
+     "cl_intel_devicelib_complex_fp64"},
+    {DeviceLibExt::cl_intel_devicelib_cstring, "cl_intel_devicelib_cstring"},
+    {DeviceLibExt::cl_intel_devicelib_imf, "cl_intel_devicelib_imf"},
+    {DeviceLibExt::cl_intel_devicelib_imf_fp64, "cl_intel_devicelib_imf_fp64"},
+    {DeviceLibExt::cl_intel_devicelib_bfloat16,
+     "cl_intel_bfloat16_conversions"}};
+
+static const std::string getDeviceLibExtensionStr(DeviceLibExt Extension) {
+  auto Ext = DeviceLibExtensionStrs.find(Extension);
+  if (Ext == DeviceLibExtensionStrs.end())
+    throw compile_program_error("Unhandled (new?) device library extension",
+                                PI_ERROR_INVALID_OPERATION);
+  return Ext->second;
 }
 
 static RT::PiProgram loadDeviceLibFallback(const ContextImplPtr Context,
@@ -809,12 +815,8 @@ static RT::PiProgram loadDeviceLibFallback(const ContextImplPtr Context,
                                            const RT::PiDevice &Device,
                                            bool UseNativeLib) {
 
-  const char *LibFileName = getDeviceLibFilename(Extension);
-  std::string LibFileNameStr(LibFileName);
-  if (UseNativeLib) {
-    LibFileNameStr.replace(8, 8, "native");
-    LibFileName = LibFileNameStr.c_str();
-  }
+  auto LibFileNameStr = getDeviceLibFilename(Extension, UseNativeLib);
+  auto LibFileName = LibFileNameStr.c_str();
 
   auto LockedCache = Context->acquireCachedLibPrograms();
   auto CachedLibPrograms = LockedCache.get();
@@ -998,21 +1000,24 @@ getDeviceLibPrograms(const ContextImplPtr Context, const RT::PiDevice &Device,
       continue;
     }
 
-    const char *ExtStr = getDeviceLibExtensionStr(Ext);
+    auto ExtStr = getDeviceLibExtensionStr(Ext);
+    auto ExtName = ExtStr.c_str();
 
     bool InhibitNativeImpl = false;
     if (const char *Env = getenv("SYCL_DEVICELIB_INHIBIT_NATIVE")) {
-      InhibitNativeImpl = strstr(Env, ExtStr) != nullptr;
+      InhibitNativeImpl = strstr(Env, ExtName) != nullptr;
     }
 
-    bool DeviceSupports = DevExtList.npos != DevExtList.find(ExtStr);
+    bool DeviceSupports = DevExtList.npos != DevExtList.find(ExtName);
     if (!DeviceSupports || InhibitNativeImpl) {
-      Programs.push_back(loadDeviceLibFallback(Context, Ext, Device, false));
+      Programs.push_back(
+          loadDeviceLibFallback(Context, Ext, Device, /*UseNativeLib=*/false));
       FallbackIsLoaded = true;
     } else {
       // bfloat16 needs native library if device supports it
       if (Ext == DeviceLibExt::cl_intel_devicelib_bfloat16) {
-        Programs.push_back(loadDeviceLibFallback(Context, Ext, Device, true));
+        Programs.push_back(
+            loadDeviceLibFallback(Context, Ext, Device, /*UseNativeLib=*/true));
         FallbackIsLoaded = true;
       }
     }

From 437e34aeb2b41fea3729e85aec119f500f4ab5d6 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Mon, 10 Oct 2022 09:54:26 -0700
Subject: [PATCH 44/63] Use const char * instead of std::string for device library names.

---
 .../program_manager/program_manager.cpp       | 45 ++++++++++---------
 1 file changed, 23 insertions(+), 22 deletions(-)

diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp
index bfca442f1b197..4359575e14606 100644
--- a/sycl/source/detail/program_manager/program_manager.cpp
+++ b/sycl/source/detail/program_manager/program_manager.cpp
@@ -754,41 +754,44 @@ static bool loadDeviceLib(const ContextImplPtr Context, const char *Name,
   return Prog != nullptr;
 }
 
-static const std::map<DeviceLibExt,
-                      std::pair<const std::string, const std::string>>
+// For each extension, a pair of library names. The first uses native support,
+// the second emulates functionality in software.
+static const std::map<DeviceLibExt, std::pair<const char *, const char *>>
     DeviceLibNames = {
         {DeviceLibExt::cl_intel_devicelib_assert,
-         {"libsycl-fallback-cassert.spv", ""}},
+         {nullptr, "libsycl-fallback-cassert.spv"}},
         {DeviceLibExt::cl_intel_devicelib_math,
-         {"libsycl-fallback-cmath.spv", ""}},
+         {nullptr, "libsycl-fallback-cmath.spv"}},
         {DeviceLibExt::cl_intel_devicelib_math_fp64,
-         {"libsycl-fallback-cmath-fp64.spv", ""}},
+         {nullptr, "libsycl-fallback-cmath-fp64.spv"}},
         {DeviceLibExt::cl_intel_devicelib_complex,
-         {"libsycl-fallback-complex.spv", ""}},
+         {nullptr, "libsycl-fallback-complex.spv"}},
         {DeviceLibExt::cl_intel_devicelib_complex_fp64,
-         {"libsycl-fallback-complex-fp64.spv", ""}},
+         {nullptr, "libsycl-fallback-complex-fp64.spv"}},
         {DeviceLibExt::cl_intel_devicelib_cstring,
-         {"libsycl-fallback-cstring.spv", ""}},
+         {nullptr, "libsycl-fallback-cstring.spv"}},
         {DeviceLibExt::cl_intel_devicelib_imf,
-         {"libsycl-fallback-imf.spv", ""}},
+         {nullptr, "libsycl-fallback-imf.spv"}},
         {DeviceLibExt::cl_intel_devicelib_imf_fp64,
-         {"libsycl-fallback-imf-fp64.spv", ""}},
+         {nullptr, "libsycl-fallback-imf-fp64.spv"}},
         {DeviceLibExt::cl_intel_devicelib_bfloat16,
-         {"libsycl-fallback-bfloat16.spv", "libsycl-native-bfloat16.spv"}}};
+         {"libsycl-native-bfloat16.spv", "libsycl-fallback-bfloat16.spv"}}};
 
-static const std::string getDeviceLibFilename(DeviceLibExt Extension,
-                                              bool Native) {
+static const char *getDeviceLibFilename(DeviceLibExt Extension, bool Native) {
   auto LibPair = DeviceLibNames.find(Extension);
-  std::string Lib;
+  const char *Lib = nullptr;
   if (LibPair != DeviceLibNames.end())
-    Lib = Native ? LibPair->second.second : LibPair->second.first;
-  if (Lib.empty())
+    Lib = Native ? LibPair->second.first : LibPair->second.second;
+  if (Lib == nullptr)
     throw compile_program_error("Unhandled (new?) device library extension",
                                 PI_ERROR_INVALID_OPERATION);
   return Lib;
 }
 
-static const std::map<DeviceLibExt, std::string> DeviceLibExtensionStrs = {
+// For each extension understood by the SYCL runtime, the string representation
+// of its name. Names with devicelib in them are internal to the runtime. Others
+// are actual OpenCL extensions.
+static const std::map<DeviceLibExt, const char *> DeviceLibExtensionStrs = {
     {DeviceLibExt::cl_intel_devicelib_assert, "cl_intel_devicelib_assert"},
     {DeviceLibExt::cl_intel_devicelib_math, "cl_intel_devicelib_math"},
     {DeviceLibExt::cl_intel_devicelib_math_fp64,
@@ -802,7 +805,7 @@ static const std::map<DeviceLibExt, std::string> DeviceLibExtensionStrs = {
     {DeviceLibExt::cl_intel_devicelib_bfloat16,
      "cl_intel_bfloat16_conversions"}};
 
-static const std::string getDeviceLibExtensionStr(DeviceLibExt Extension) {
+static const char *getDeviceLibExtensionStr(DeviceLibExt Extension) {
   auto Ext = DeviceLibExtensionStrs.find(Extension);
   if (Ext == DeviceLibExtensionStrs.end())
     throw compile_program_error("Unhandled (new?) device library extension",
@@ -815,8 +818,7 @@ static RT::PiProgram loadDeviceLibFallback(const ContextImplPtr Context,
                                            const RT::PiDevice &Device,
                                            bool UseNativeLib) {
 
-  auto LibFileNameStr = getDeviceLibFilename(Extension, UseNativeLib);
-  auto LibFileName = LibFileNameStr.c_str();
+  auto LibFileName = getDeviceLibFilename(Extension, UseNativeLib);
 
   auto LockedCache = Context->acquireCachedLibPrograms();
   auto CachedLibPrograms = LockedCache.get();
@@ -1000,8 +1002,7 @@ getDeviceLibPrograms(const ContextImplPtr Context, const RT::PiDevice &Device,
       continue;
     }
 
-    auto ExtStr = getDeviceLibExtensionStr(Ext);
-    auto ExtName = ExtStr.c_str();
+    auto ExtName = getDeviceLibExtensionStr(Ext);
 
     bool InhibitNativeImpl = false;
     if (const char *Env = getenv("SYCL_DEVICELIB_INHIBIT_NATIVE")) {

From 386353e91f115a8b831bfd1da8daa02518bbe4fe Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Wed, 12 Oct 2022 16:12:32 -0700
Subject: [PATCH 45/63] Replaced bfloat16 aspect with bfloat16_math_functions
 aspect.

---
 .../sycl_ext_oneapi_bfloat16_math.asciidoc    |  9 ++--
 .../sycl_ext_oneapi_bfloat16.asciidoc         | 30 -------------
 sycl/include/sycl/aspects.hpp                 | 43 -------------------
 sycl/include/sycl/detail/pi.h                 |  5 +--
 sycl/include/sycl/feature_test.hpp.in         |  3 +-
 sycl/include/sycl/info/aspects.def            |  2 +-
 sycl/include/sycl/info/device_traits.def      |  2 -
 sycl/plugins/cuda/pi_cuda.cpp                 |  1 -
 sycl/plugins/hip/pi_hip.cpp                   |  1 -
 sycl/plugins/level_zero/pi_level_zero.cpp     |  4 --
 sycl/plugins/opencl/pi_opencl.cpp             |  6 ---
 sycl/source/detail/device_impl.cpp            |  2 -
 sycl/source/detail/device_info.hpp            | 21 ---------
 sycl/test/abi/sycl_symbols_linux.dump         |  1 -
 sycl/test/abi/sycl_symbols_windows.dump       |  1 -
 15 files changed, 9 insertions(+), 122 deletions(-)

diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math.asciidoc
index 7bb840551847e..5eb25fbf77a00 100644
--- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math.asciidoc
+++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math.asciidoc
@@ -71,10 +71,11 @@ https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#_math_
 This extension provides a feature-test macro as described in the core SYCL
 specification section 6.3.3 "Feature test macros". Therefore, an implementation
 supporting this extension must predefine the macro 
-`SYCL_EXT_ONEAPI_BFLOAT16_MATH` to one of the values defined in the table
-below. Applications can test for the existence of this macro to determine if
-the implementation supports this feature, or applications can test the macro's
-value to determine which of the extension's APIs the implementation supports.
+`SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS` to one of the values defined in the
+table below. Applications can test for the existence of this macro to determine
+if the implementation supports this feature, or applications can test the
+macro's value to determine which of the extension's APIs the implementation
+supports.
  
 [%header,cols="1,5"]
 |===
diff --git a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
index 668b803be40e1..cdbfaf8059055 100644
--- a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
+++ b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
@@ -75,36 +75,6 @@ in software.
 
 == Specification
 
-=== Feature test macro
-
-This extension provides a feature-test macro as described in the core SYCL
-specification section 6.3.3 "Feature test macros". Therefore, an implementation
-supporting this extension must predefine the macro
-`SYCL_EXT_ONEAPI_BFLOAT16` to one of the values defined in the table
-below. Applications can test for the existence of this macro to determine if
-the implementation supports this feature, or applications can test the macro’s
-value to determine which of the extension’s APIs the implementation supports.
-
-
-[%header,cols="1,5"]
-|===
-|Value |Description
-|1     |Initial extension version. Base features are supported.
-|===
-=== Extension to `enum class aspect`
-
-[source]
-----
-namespace sycl {
-enum class aspect {
-  ...
-  ext_oneapi_bfloat16
-}
-}
-----
-
-If a SYCL device has the `ext_oneapi_bfloat16` aspect, then it supports
-`bfloat16` conversions described in the next section.
 
 === New `bfloat16` class
 
diff --git a/sycl/include/sycl/aspects.hpp b/sycl/include/sycl/aspects.hpp
index 94c3794f11904..d2e389f4c9f8a 100644
--- a/sycl/include/sycl/aspects.hpp
+++ b/sycl/include/sycl/aspects.hpp
@@ -18,51 +18,8 @@ __SYCL_INLINE_VER_NAMESPACE(_V1) {
 #define __SYCL_ASPECT_DEPRECATED_ALIAS(ASPECT, ID, MESSAGE)                    \
   __SYCL_ASPECT_DEPRECATED(ASPECT, ID, MESSAGE)
 enum class __SYCL_TYPE(aspect) aspect {
-<<<<<<< HEAD
-  host = 0,
-  cpu = 1,
-  gpu = 2,
-  accelerator = 3,
-  custom = 4,
-  fp16 = 5,
-  fp64 = 6,
-  int64_base_atomics __SYCL2020_DEPRECATED("use atomic64 instead") = 7,
-  int64_extended_atomics __SYCL2020_DEPRECATED("use atomic64 instead") = 8,
-  image = 9,
-  online_compiler = 10,
-  online_linker = 11,
-  queue_profiling = 12,
-  usm_device_allocations = 13,
-  usm_host_allocations = 14,
-  usm_shared_allocations = 15,
-  usm_restricted_shared_allocations = 16,
-  usm_system_allocations = 17,
-  usm_system_allocator __SYCL2020_DEPRECATED(
-      "use usm_system_allocations instead") = usm_system_allocations,
-  ext_intel_pci_address = 18,
-  ext_intel_gpu_eu_count = 19,
-  ext_intel_gpu_eu_simd_width = 20,
-  ext_intel_gpu_slices = 21,
-  ext_intel_gpu_subslices_per_slice = 22,
-  ext_intel_gpu_eu_count_per_subslice = 23,
-  ext_intel_max_mem_bandwidth = 24,
-  ext_intel_mem_channel = 25,
-  usm_atomic_host_allocations = 26,
-  usm_atomic_shared_allocations = 27,
-  atomic64 = 28,
-  ext_intel_device_info_uuid = 29,
-  ext_oneapi_srgb = 30,
-  ext_oneapi_native_assert = 31,
-  host_debuggable = 32,
-  ext_intel_gpu_hw_threads_per_eu = 33,
-  ext_oneapi_cuda_async_barrier = 34,
-  ext_oneapi_bfloat16 = 35,
-  ext_intel_free_memory = 36,
-  ext_oneapi_bfloat16_math_functions = 37,
-=======
 #include <sycl/info/aspects.def>
 #include <sycl/info/aspects_deprecated.def>
->>>>>>> a32021ba066ecbf842136a973dd35d700c983cea
 };
 #undef __SYCL_ASPECT_DEPRECATED_ALIAS
 #undef __SYCL_ASPECT_DEPRECATED
diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h
index b1ec2966ff069..d61febc26479a 100644
--- a/sycl/include/sycl/detail/pi.h
+++ b/sycl/include/sycl/detail/pi.h
@@ -281,9 +281,8 @@ typedef enum {
   PI_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES = 0x11000,
   PI_DEVICE_INFO_GPU_HW_THREADS_PER_EU = 0x10112,
   PI_DEVICE_INFO_BACKEND_VERSION = 0x10113,
-  // Return whether bfloat16 conversions and math are supported by device
-  PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16_MATH_FUNCTIONS = 0x1FFFE,
-  PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16 = 0x1FFFF,
+  // Return whether bfloat16 math functions are supported by device
+  PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16_MATH_FUNCTIONS = 0x1FFFF,
   PI_EXT_ONEAPI_DEVICE_INFO_MAX_GLOBAL_WORK_GROUPS = 0x20000,
   PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_1D = 0x20001,
   PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_2D = 0x20002,
diff --git a/sycl/include/sycl/feature_test.hpp.in b/sycl/include/sycl/feature_test.hpp.in
index bd37830769171..e8dd78b5a91a3 100644
--- a/sycl/include/sycl/feature_test.hpp.in
+++ b/sycl/include/sycl/feature_test.hpp.in
@@ -48,7 +48,7 @@ __SYCL_INLINE_VER_NAMESPACE(_V1) {
 #define SYCL_EXT_ONEAPI_SUB_GROUP 1
 #define SYCL_EXT_ONEAPI_PROPERTIES 1
 #define SYCL_EXT_ONEAPI_NATIVE_MATH 1
-#define SYCL_EXT_ONEAPI_BFLOAT16 1
+#define SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS 1
 #define SYCL_EXT_INTEL_DATAFLOW_PIPES 1
 #ifdef __clang__
 #if __has_extension(sycl_extended_atomics)
@@ -67,7 +67,6 @@ __SYCL_INLINE_VER_NAMESPACE(_V1) {
 #cmakedefine01 SYCL_BUILD_PI_CUDA
 #if SYCL_BUILD_PI_CUDA
 #define SYCL_EXT_ONEAPI_BACKEND_CUDA 1
-#define SYCL_EXT_ONEAPI_BFLOAT16_MATH 1
 #endif
 #cmakedefine01 SYCL_BUILD_PI_ESIMD_EMULATOR
 #if SYCL_BUILD_PI_ESIMD_EMULATOR
diff --git a/sycl/include/sycl/info/aspects.def b/sycl/include/sycl/info/aspects.def
index 692728f5afec4..3771bcf388472 100644
--- a/sycl/include/sycl/info/aspects.def
+++ b/sycl/include/sycl/info/aspects.def
@@ -31,6 +31,6 @@ __SYCL_ASPECT(ext_oneapi_native_assert, 31)
 __SYCL_ASPECT(host_debuggable, 32)
 __SYCL_ASPECT(ext_intel_gpu_hw_threads_per_eu, 33)
 __SYCL_ASPECT(ext_oneapi_cuda_async_barrier, 34)
-__SYCL_ASPECT(ext_oneapi_bfloat16, 35)
+__SYCL_ASPECT(ext_oneapi_bfloat16_math_functions, 35)
 __SYCL_ASPECT(ext_intel_free_memory, 36)
 __SYCL_ASPECT(ext_intel_device_id, 37)
diff --git a/sycl/include/sycl/info/device_traits.def b/sycl/include/sycl/info/device_traits.def
index 49e5632e706fe..053026d6dbaa6 100644
--- a/sycl/include/sycl/info/device_traits.def
+++ b/sycl/include/sycl/info/device_traits.def
@@ -193,8 +193,6 @@ __SYCL_PARAM_TRAITS_SPEC(device, ext_oneapi_srgb, bool,
                          PI_DEVICE_INFO_IMAGE_SRGB)
 __SYCL_PARAM_TRAITS_SPEC(device, ext_intel_mem_channel, bool,
                          PI_MEM_PROPERTIES_CHANNEL)
-__SYCL_PARAM_TRAITS_SPEC(device, ext_oneapi_bfloat16, bool,
-                         PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16)
 __SYCL_PARAM_TRAITS_SPEC(device, ext_oneapi_bfloat16_math_functions, bool,
                          PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16_MATH_FUNCTIONS)
 
diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp
index 5a8650ba53257..4de92154b0500 100644
--- a/sycl/plugins/cuda/pi_cuda.cpp
+++ b/sycl/plugins/cuda/pi_cuda.cpp
@@ -1327,7 +1327,6 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
     return getInfo(param_value_size, param_value, param_value_size_ret,
                    capabilities);
   }
-  case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16:
   case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16_MATH_FUNCTIONS: {
     int major = 0;
     sycl::detail::pi::assertion(
diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp
index 1f8e7b3bcda4c..777802561e569 100644
--- a/sycl/plugins/hip/pi_hip.cpp
+++ b/sycl/plugins/hip/pi_hip.cpp
@@ -1828,7 +1828,6 @@ pi_result hip_piDeviceGetInfo(pi_device device, pi_device_info param_name,
   case PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE:
   case PI_DEVICE_INFO_GPU_HW_THREADS_PER_EU:
   case PI_DEVICE_INFO_MAX_MEM_BANDWIDTH:
-  case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16:
   case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16_MATH_FUNCTIONS:
     return PI_ERROR_INVALID_VALUE;
 
diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp
index 22bda2297f429..6a749a4d7dbea 100644
--- a/sycl/plugins/level_zero/pi_level_zero.cpp
+++ b/sycl/plugins/level_zero/pi_level_zero.cpp
@@ -3212,10 +3212,6 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName,
   case PI_DEVICE_INFO_MAX_MEM_BANDWIDTH:
     // currently not supported in level zero runtime
     return PI_ERROR_INVALID_VALUE;
-  case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16: {
-    // bfloat16 conversions are supported on Intel GPUs.
-    return ReturnValue(bool{true});
-  }
   case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16_MATH_FUNCTIONS: {
     // bfloat16 math functions are not yet supported on Intel GPUs.
     return ReturnValue(bool{false});
diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp
index 1bcbbba2e58d5..0bac4ca16969a 100644
--- a/sycl/plugins/opencl/pi_opencl.cpp
+++ b/sycl/plugins/opencl/pi_opencl.cpp
@@ -295,12 +295,6 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName,
     std::memcpy(paramValue, &result, sizeof(cl_bool));
     return PI_SUCCESS;
   }
-  case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16: {
-    // bfloat16 conversions are supported on Intel GPUs.
-    cl_bool result = true;
-    std::memcpy(paramValue, &result, sizeof(cl_bool));
-    return PI_SUCCESS;
-  }
   case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16_MATH_FUNCTIONS: {
     // bfloat16 math functions are not yet supported on Intel GPUs.
     cl_bool result = false;
diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp
index e343a25e21be2..ac78154924090 100644
--- a/sycl/source/detail/device_impl.cpp
+++ b/sycl/source/detail/device_impl.cpp
@@ -275,8 +275,6 @@ bool device_impl::has(aspect Aspect) const {
     return has_extension("cl_khr_fp16");
   case aspect::fp64:
     return has_extension("cl_khr_fp64");
-  case aspect::ext_oneapi_bfloat16:
-    return get_info<info::device::ext_oneapi_bfloat16>();
   case aspect::ext_oneapi_bfloat16_math_functions:
     return get_info<info::device::ext_oneapi_bfloat16_math_functions>();
   case aspect::int64_base_atomics:
diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp
index 3e034142052f3..4fbf8b6c249a6 100644
--- a/sycl/source/detail/device_info.hpp
+++ b/sycl/source/detail/device_info.hpp
@@ -279,22 +279,6 @@ struct get_device_info_impl<std::vector<memory_scope>,
   }
 };
 
-// Specialization for bf16 conversions
-template <>
-struct get_device_info_impl<bool, info::device::ext_oneapi_bfloat16> {
-  static bool get(RT::PiDevice dev, const plugin &Plugin) {
-    bool result = false;
-
-    RT::PiResult Err = Plugin.call_nocheck<PiApiKind::piDeviceGetInfo>(
-        dev, PiInfoCode<info::device::ext_oneapi_bfloat16>::value,
-        sizeof(result), &result, nullptr);
-    if (Err != PI_SUCCESS) {
-      return false;
-    }
-    return result;
-  }
-};
-
 // Specialization for bf16 math functions
 template <>
 struct get_device_info_impl<bool,
@@ -1018,11 +1002,6 @@ get_device_info_host<info::device::atomic_memory_scope_capabilities>() {
           memory_scope::work_group, memory_scope::device, memory_scope::system};
 }
 
-template <>
-inline bool get_device_info_host<info::device::ext_oneapi_bfloat16>() {
-  return true;
-}
-
 template <>
 inline bool
 get_device_info_host<info::device::ext_oneapi_bfloat16_math_functions>() {
diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump
index 40034ec5d3707..9fec091456bb0 100644
--- a/sycl/test/abi/sycl_symbols_linux.dump
+++ b/sycl/test/abi/sycl_symbols_linux.dump
@@ -4201,7 +4201,6 @@ _ZNK4sycl3_V16device8get_infoINS0_4info6device18max_num_sub_groupsEEENS0_6detail
 _ZNK4sycl3_V16device8get_infoINS0_4info6device18max_parameter_sizeEEENS0_6detail19is_device_info_descIT_E11return_typeEv
 _ZNK4sycl3_V16device8get_infoINS0_4info6device18printf_buffer_sizeEEENS0_6detail19is_device_info_descIT_E11return_typeEv
 _ZNK4sycl3_V16device8get_infoINS0_4info6device19built_in_kernel_idsEEENS0_6detail19is_device_info_descIT_E11return_typeEv
-_ZNK4sycl3_V16device8get_infoINS0_4info6device19ext_oneapi_bfloat16EEENS0_6detail19is_device_info_descIT_E11return_typeEv
 _ZNK4sycl3_V16device8get_infoINS0_4info6device19host_unified_memoryEEENS0_6detail19is_device_info_descIT_E11return_typeEv
 _ZNK4sycl3_V16device8get_infoINS0_4info6device19is_linker_availableEEENS0_6detail19is_device_info_descIT_E11return_typeEv
 _ZNK4sycl3_V16device8get_infoINS0_4info6device19max_clock_frequencyEEENS0_6detail19is_device_info_descIT_E11return_typeEv
diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump
index a8c4fea892d24..37f5321374d3a 100644
--- a/sycl/test/abi/sycl_symbols_windows.dump
+++ b/sycl/test/abi/sycl_symbols_windows.dump
@@ -58,7 +58,6 @@
 ??$get_info@Uext_intel_max_mem_bandwidth@device@info@_V1@sycl@@@device@_V1@sycl@@QEBA_KXZ
 ??$get_info@Uext_intel_mem_channel@device@info@_V1@sycl@@@device@_V1@sycl@@QEBA_NXZ
 ??$get_info@Uext_intel_pci_address@device@info@_V1@sycl@@@device@_V1@sycl@@QEBA?AV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@XZ
-??$get_info@Uext_oneapi_bfloat16@device@info@_V1@sycl@@@device@_V1@sycl@@QEBA_NXZ
 ??$get_info@Uext_oneapi_bfloat16_math_functions@device@info@_V1@sycl@@@device@_V1@sycl@@QEBA_NXZ
 ??$get_info@Uext_oneapi_max_global_work_groups@device@info@_V1@sycl@@@device@_V1@sycl@@QEBA_KXZ
 ??$get_info@Uext_oneapi_max_work_groups_1d@device@info@_V1@sycl@@@device@_V1@sycl@@QEBA?AV?$id@$00@12@XZ

From 0f935863b50843c24897c9544c3a11f0a0089c52 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Wed, 12 Oct 2022 20:45:29 -0700
Subject: [PATCH 46/63] Improved devices check in clang driver.

---
 clang/lib/Driver/Driver.cpp                   | 33 +++++++++++++++++--
 ...t_oneapi_bfloat16_math_functions.asciidoc} |  6 ++--
 2 files changed, 34 insertions(+), 5 deletions(-)
 rename sycl/doc/extensions/experimental/{sycl_ext_oneapi_bfloat16_math.asciidoc => sycl_ext_oneapi_bfloat16_math_functions.asciidoc} (96%)
 mode change 100644 => 100755

diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 9e65de9f77e39..1178bfa5d1a31 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -101,6 +101,7 @@
 #include <cstdlib> // ::getenv
 #include <map>
 #include <memory>
+#include <regex>
 #include <utility>
 #if LLVM_ON_UNIX
 #include <unistd.h> // getpid
@@ -5168,11 +5169,39 @@ class OffloadingActionBuilder final {
           continue;
         };
       }
-      useNative = false;
       if (needLibs &&
           TC->getTriple().getSubArch() == llvm::Triple::SPIRSubArch_gen &&
           TargetOpt && DeviceOpt) {
-        useNative = strstr(DeviceOpt, "pvc") || strstr(DeviceOpt, "ats");
+
+        auto checkBF = [=](std::string &Param, size_t Length) {
+          static const std::regex BFFs("pvc.*|ats.*");
+          std::string Dev = Param.substr(0, Length);
+          return std::regex_match(Dev, BFFs);
+        };
+
+        std::string Params{DeviceOpt};
+        size_t DevicesPos = Params.find("-device ");
+        useNative = false;
+        if (DevicesPos != std::string::npos) {
+          useNative = true;
+          Params.erase(0, DevicesPos + 8);
+          do {
+            size_t Pos = Params.find(',');
+            if (Pos != std::string::npos) {
+              // comma found
+              if (Pos > 0) {
+                std::string ADevice = Params.substr(0, Pos);
+                useNative &= checkBF(ADevice, ADevice.size());
+              }
+              Params.erase(0, Pos + 1);
+              if (Params.size() == 0)
+                break;
+            } else {
+              useNative &= checkBF(Params, Params.size());
+              break;
+            }
+          } while (true);
+        }
       }
       return needLibs;
     }
diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math_functions.asciidoc
old mode 100644
new mode 100755
similarity index 96%
rename from sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math.asciidoc
rename to sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math_functions.asciidoc
index 5eb25fbf77a00..41d75660cd94f
--- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math.asciidoc
+++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math_functions.asciidoc
@@ -1,4 +1,4 @@
-= sycl_ext_oneapi_bfloat16_math
+= sycl_ext_oneapi_bfloat16_math_functions
 
 :source-highlighter: coderay
 :coderay-linenums-mode: table
@@ -90,12 +90,12 @@ supports.
 namespace sycl {
 enum class aspect {
   ...
-  ext_oneapi_bfloat16_math_functions
+  ext_oneapi_bfloat16_math_functions
 }
 }
 ----
 
-If a SYCL device has the `ext_oneapi_bfloat16_math_functions` aspect,
+If a SYCL device has the `ext_oneapi_bfloat16_math_functions` aspect,
 then it supports the `bfloat16` math functions described in the next section.
 
 === Math Functions

From d33cb10d019164cc3cc4de39f6bd5f731b6e3b03 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Wed, 12 Oct 2022 20:55:28 -0700
Subject: [PATCH 47/63] Enhanced test for improved bfloat16 target detection.

---
 clang/test/Driver/sycl-bfloat16-lib.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/clang/test/Driver/sycl-bfloat16-lib.cpp b/clang/test/Driver/sycl-bfloat16-lib.cpp
index 65283d77e51ef..99741a75e295d 100755
--- a/clang/test/Driver/sycl-bfloat16-lib.cpp
+++ b/clang/test/Driver/sycl-bfloat16-lib.cpp
@@ -17,6 +17,14 @@
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device pvc" %s -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=BFLOAT16-NATIVE
 
+// test that unless all targets support bfloat16, AOT compilation uses the fallback library
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device pvc,gen9" %s -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=BFLOAT16-FALLBACK
+
+// test that when all targets support bfloat16, AOT compilation uses the native library
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device pvc-sdv,ats-m75" %s -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=BFLOAT16-NATIVE
+
 // test that a gen9 AOT compilation uses the fallback library
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen9" %s -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=BFLOAT16-FALLBACK

From 28992c2b8486a51fa16e58244e252d01b6c61fe7 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Thu, 13 Oct 2022 08:34:51 -0700
Subject: [PATCH 48/63] Updated bfloat16 driver test for windows.

---
 clang/test/Driver/sycl-bfloat16-lib-win.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/clang/test/Driver/sycl-bfloat16-lib-win.cpp b/clang/test/Driver/sycl-bfloat16-lib-win.cpp
index c29eeed8601aa..288454139b633 100755
--- a/clang/test/Driver/sycl-bfloat16-lib-win.cpp
+++ b/clang/test/Driver/sycl-bfloat16-lib-win.cpp
@@ -17,6 +17,14 @@
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device pvc" %s -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=BFLOAT16-NATIVE
 
+// test that unless all targets support bfloat16, AOT compilation uses the fallback library
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device pvc,gen9" %s -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=BFLOAT16-FALLBACK
+
+// test that when all targets support bfloat16, AOT compilation uses the native library
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device pvc-sdv,ats-m75" %s -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=BFLOAT16-NATIVE
+
 // test that a gen9 AOT compilation uses the fallback library
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen9" %s -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=BFLOAT16-FALLBACK

From ec28c8b137157c155ca710a76081ec0b0686b8a8 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Thu, 13 Oct 2022 10:36:30 -0700
Subject: [PATCH 49/63] Use STL for parsing devices.

---
 clang/lib/Driver/Driver.cpp | 25 ++++++-------------------
 1 file changed, 6 insertions(+), 19 deletions(-)

diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 1178bfa5d1a31..b2a5542e913ce 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -102,6 +102,7 @@
 #include <map>
 #include <memory>
 #include <regex>
+#include <sstream>
 #include <utility>
 #if LLVM_ON_UNIX
 #include <unistd.h> // getpid
@@ -5173,9 +5174,8 @@ class OffloadingActionBuilder final {
           TC->getTriple().getSubArch() == llvm::Triple::SPIRSubArch_gen &&
           TargetOpt && DeviceOpt) {
 
-        auto checkBF = [=](std::string &Param, size_t Length) {
+        auto checkBF = [=](std::string &Dev) {
           static const std::regex BFFs("pvc.*|ats.*");
-          std::string Dev = Param.substr(0, Length);
           return std::regex_match(Dev, BFFs);
         };
 
@@ -5184,23 +5184,10 @@ class OffloadingActionBuilder final {
         useNative = false;
         if (DevicesPos != std::string::npos) {
           useNative = true;
-          Params.erase(0, DevicesPos + 8);
-          do {
-            size_t Pos = Params.find(',');
-            if (Pos != std::string::npos) {
-              // comma found
-              if (Pos > 0) {
-                std::string ADevice = Params.substr(0, Pos);
-                useNative &= checkBF(ADevice, ADevice.size());
-              }
-              Params.erase(0, Pos + 1);
-              if (Params.size() == 0)
-                break;
-            } else {
-              useNative &= checkBF(Params, Params.size());
-              break;
-            }
-          } while (true);
+          std::istringstream Devices(Params.substr(DevicesPos + 8));
+          for (std::string S; std::getline(Devices, S, ',');) {
+            useNative &= checkBF(S);
+          }
         }
       }
       return needLibs;

From ec70b20c93e2a910347a5a1e2a3b81636f9d5a9c Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Mon, 24 Oct 2022 10:00:40 -0700
Subject: [PATCH 50/63] Allow spir64 target to be JIT even when combined with
 AOT targets.

---
 clang/lib/Driver/Driver.cpp                            | 10 ++--------
 sycl/source/detail/program_manager/program_manager.cpp |  4 ----
 2 files changed, 2 insertions(+), 12 deletions(-)

diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 672c8489e6b78..185e2511e78d5 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -5141,12 +5141,6 @@ class OffloadingActionBuilder final {
         };
 
         if (A->getOption().matches(options::OPT_fsycl_targets_EQ)) {
-          // When a generic target "spir64" is used with other AOT targets
-          // we use fallback libraries.
-          if (TC->getTriple().getSubArch() == llvm::Triple::NoSubArch)
-            needLibs = DeviceLinkerInputs.size() > 1;
-          else
-            needLibs = true;
           TargetBE = GetTripleIt(A->getValue(0));
           if (TargetBE)
             TargetOpt = A->getValue(0);
@@ -5170,8 +5164,7 @@ class OffloadingActionBuilder final {
           continue;
         };
       }
-      if (needLibs &&
-          TC->getTriple().getSubArch() == llvm::Triple::SPIRSubArch_gen &&
+      if (TC->getTriple().getSubArch() == llvm::Triple::SPIRSubArch_gen &&
           TargetOpt && DeviceOpt) {
 
         auto checkBF = [=](std::string &Dev) {
@@ -5179,6 +5172,7 @@ class OffloadingActionBuilder final {
           return std::regex_match(Dev, BFFs);
         };
 
+        needLibs = true;
         std::string Params{DeviceOpt};
         size_t DevicesPos = Params.find("-device ");
         useNative = false;
diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp
index 17ce5cf4e9c86..58196dc156535 100644
--- a/sycl/source/detail/program_manager/program_manager.cpp
+++ b/sycl/source/detail/program_manager/program_manager.cpp
@@ -1264,10 +1264,6 @@ void ProgramManager::addImages(pi_device_binaries DeviceBinary) {
       StrToKSIdMap &KSIdMap = m_KernelSets[M];
       auto KSIdIt = KSIdMap.find(EntriesB->name);
       if (KSIdIt != KSIdMap.end()) {
-        for (_pi_offload_entry EntriesIt = EntriesB + 1; EntriesIt != EntriesE;
-             ++EntriesIt)
-          assert(KSIdMap[EntriesIt->name] == KSIdIt->second &&
-                 "Kernel sets are not disjoint");
         auto &Imgs = m_DeviceImages[KSIdIt->second];
         assert(Imgs && "Device image vector should have been already created");
 

From 1b86012b23a10dad6d2261b6ddc42d9e71c432f1 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Mon, 24 Oct 2022 10:13:51 -0700
Subject: [PATCH 51/63] Updated documentation.

---
 .../supported/sycl_ext_oneapi_bfloat16.asciidoc        | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
index cdbfaf8059055..cf971bb2ee4a0 100644
--- a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
+++ b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
@@ -44,15 +44,7 @@ This extension is written against the SYCL 2020 specification, Revision 5.
 == Status
 
 This extension is implemented and fully supported by DPC++.
-[NOTE]
-Currently, if AOT compilation is done for multiple targets
-using "-fsycl-targets=<A,B,C>" then `bfloat16` support will be native
-or software emulation depending on the target, with the exception of the
-"spirv" target. That is actually a JIT compilation and it will use software
-emulation regardless of the actual runtime execution device.
-In JIT mode where the "-fsycl-targets=" switch is not specified,
-the selection of native or emulation is at runtime,
-depending on the device.
+
 
 == Overview
 

From 3e1e6812a99f40908b7329cd3dfcc156ea151d76 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Mon, 24 Oct 2022 23:15:26 -0700
Subject: [PATCH 52/63] Modifications for mixed JIT and AOT compilations, added
 tests.

---
 clang/lib/Driver/Driver.cpp                 |  42 +++--
 clang/test/Driver/sycl-bfloat16-lib-win.cpp | 196 ++++++++++++++++++++
 clang/test/Driver/sycl-bfloat16-lib.cpp     | 196 ++++++++++++++++++++
 3 files changed, 416 insertions(+), 18 deletions(-)

diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 185e2511e78d5..6645a74f70168 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -5141,6 +5141,9 @@ class OffloadingActionBuilder final {
         };
 
         if (A->getOption().matches(options::OPT_fsycl_targets_EQ)) {
+          // spir64 target is actually JIT compilation, so we defer selection of
+          // bfloat16 libraries to runtime. For AOT we need libraries.
+          needLibs = TC->getTriple().getSubArch() != llvm::Triple::NoSubArch;
           TargetBE = GetTripleIt(A->getValue(0));
           if (TargetBE)
             TargetOpt = A->getValue(0);
@@ -5164,26 +5167,29 @@ class OffloadingActionBuilder final {
           continue;
         };
       }
-      if (TC->getTriple().getSubArch() == llvm::Triple::SPIRSubArch_gen &&
-          TargetOpt && DeviceOpt) {
-
-        auto checkBF = [=](std::string &Dev) {
-          static const std::regex BFFs("pvc.*|ats.*");
-          return std::regex_match(Dev, BFFs);
-        };
-
-        needLibs = true;
-        std::string Params{DeviceOpt};
-        size_t DevicesPos = Params.find("-device ");
-        useNative = false;
-        if (DevicesPos != std::string::npos) {
-          useNative = true;
-          std::istringstream Devices(Params.substr(DevicesPos + 8));
-          for (std::string S; std::getline(Devices, S, ',');) {
-            useNative &= checkBF(S);
+      useNative = false;
+      if (needLibs)
+        if (TC->getTriple().getSubArch() == llvm::Triple::SPIRSubArch_gen &&
+            TargetOpt && DeviceOpt) {
+
+          auto checkBF = [=](std::string &Dev) {
+            static const std::regex BFFs("pvc.*|ats.*");
+            return std::regex_match(Dev, BFFs);
+          };
+
+          needLibs = true;
+          std::string Params{DeviceOpt};
+          size_t DevicesPos = Params.find("-device ");
+          useNative = false;
+          if (DevicesPos != std::string::npos) {
+            useNative = true;
+            std::istringstream Devices(Params.substr(DevicesPos + 8));
+            for (std::string S; std::getline(Devices, S, ',');) {
+              useNative &= checkBF(S);
+            }
           }
         }
-      }
+
       return needLibs;
     }
 
diff --git a/clang/test/Driver/sycl-bfloat16-lib-win.cpp b/clang/test/Driver/sycl-bfloat16-lib-win.cpp
index 288454139b633..8f76a4ec755e7 100755
--- a/clang/test/Driver/sycl-bfloat16-lib-win.cpp
+++ b/clang/test/Driver/sycl-bfloat16-lib-win.cpp
@@ -33,6 +33,22 @@
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %s -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=BFLOAT16-FALLBACK
 
+// test that a mixed JIT + AOT-PVC compilation uses no libs + native libs
+// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=BFLOAT16-NONE-NATIVE
+
+// test that a mixed JIT + AOT-Gen9 compilation uses no libs + fallback libs
+// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen9" %s -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=BFLOAT16-NONE-FALLBACK
+
+// test that an AOT-CPU + AOT-PVC compilation uses fallback + native libs
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=BFLOAT16-FALLBACK-NATIVE
+
+// test that an AOT-CPU + AOT-Gen9 compilation uses fallback + fallback libs
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen9" %s -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=BFLOAT16-FALLBACK-FALLBACK
+
 // BFLOAT16: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-crt.obj" "-output={{.*}}libsycl-crt-{{.*}}.o" "-unbundle"
 // BFLOAT16-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-complex.obj" "-output={{.*}}libsycl-complex-{{.*}}.o" "-unbundle"
 // BFLOAT16-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-complex-fp64.obj" "-output={{.*}}libsycl-complex-fp64-{{.*}}.o" "-unbundle"
@@ -83,3 +99,183 @@
 // BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-{{spir64_gen-|spir64-}}unknown-unknown" "-input={{.*}}libsycl-fallback-imf.obj" "-output={{.*}}libsycl-fallback-imf-{{.*}}.o" "-unbundle"
 // BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-{{spir64_gen-|spir64-}}unknown-unknown" "-input={{.*}}libsycl-fallback-imf-fp64.obj" "-output={{.*}}libsycl-fallback-imf-fp64-{{.*}}.o" "-unbundle"
 // BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-{{spir64_gen-|spir64-}}unknown-unknown" "-input={{.*}}libsycl-fallback-bfloat16.obj"
+
+// BFLOAT16-NONE-NATIVE: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-crt.obj" "-output={{.*}}libsycl-crt-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-complex.obj" "-output={{.*}}libsycl-complex-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-complex-fp64.obj" "-output={{.*}}libsycl-complex-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-cmath.obj" "-output={{.*}}libsycl-cmath-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-cmath-fp64.obj" "-output={{.*}}libsycl-cmath-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-mscv-math.obj" "-output={{.*}}libsycl-mscv-math-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-imf.obj" "-output={{.*}}libsycl-imf-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-imf-fp64.obj" "-output={{.*}}libsycl-imf-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-cassert.obj" "-output={{.*}}libsycl-fallback-cassert-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-cstring.obj" "-output={{.*}}libsycl-fallback-cstring-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-complex.obj" "-output={{.*}}libsycl-fallback-complex-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath.obj" "-output={{.*}}libsycl-fallback-cmath-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath-fp64.obj" "-output={{.*}}libsycl-fallback-cmath-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-imf.obj" "-output={{.*}}libsycl-fallback-imf-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-imf-fp64.obj" "-output={{.*}}libsycl-fallback-imf-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-itt-user-wrappers.obj" "-output={{.*}}libsycl-itt-user-wrappers-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-itt-compiler-wrappers.obj" "-output={{.*}}libsycl-itt-compiler-wrappers-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-itt-stubs.obj" "-output={{.*}}libsycl-itt-stubs-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: llvm-link{{.*}}
+// BFLOAT16-NONE-NATIVE-NEXT: sycl-post-link{{.*}}
+// BFLOAT16-NONE-NATIVE-NEXT: file-table-tform{{.*}}
+// BFLOAT16-NONE-NATIVE-NEXT: llvm-foreach{{.*}}
+// BFLOAT16-NONE-NATIVE-NEXT: file-table-tform{{.*}}
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-wrapper{{.*}}
+// BFLOAT16-NONE-NATIVE-NEXT: llc{{.*}}
+// BFLOAT16-NONE-NATIVE-NEXT: clang-16{{.*}}
+// BFLOAT16-NONE-NATIVE-NEXT: llvm-link{{.*}}
+// BFLOAT16-NONE-NATIVE: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-crt.obj" "-output={{.*}}libsycl-crt-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-complex.obj" "-output={{.*}}libsycl-complex-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-complex-fp64.obj" "-output={{.*}}libsycl-complex-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-cmath.obj" "-output={{.*}}libsycl-cmath-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-cmath-fp64.obj" "-output={{.*}}libsycl-cmath-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-mscv-math.obj" "-output={{.*}}libsycl-mscv-math-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-imf.obj" "-output={{.*}}libsycl-imf-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-imf-fp64.obj" "-output={{.*}}libsycl-imf-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cassert.obj" "-output={{.*}}libsycl-fallback-cassert-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cstring.obj" "-output={{.*}}libsycl-fallback-cstring-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-complex.obj" "-output={{.*}}libsycl-fallback-complex-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath.obj" "-output={{.*}}libsycl-fallback-cmath-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath-fp64.obj" "-output={{.*}}libsycl-fallback-cmath-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-imf.obj" "-output={{.*}}libsycl-fallback-imf-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-imf-fp64.obj" "-output={{.*}}libsycl-fallback-imf-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-native-bfloat16.obj"
+
+// BFLOAT16-NONE-FALLBACK: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-crt.obj" "-output={{.*}}libsycl-crt-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-complex.obj" "-output={{.*}}libsycl-complex-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-complex-fp64.obj" "-output={{.*}}libsycl-complex-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-cmath.obj" "-output={{.*}}libsycl-cmath-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-cmath-fp64.obj" "-output={{.*}}libsycl-cmath-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-mscv-math.obj" "-output={{.*}}libsycl-mscv-math-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-imf.obj" "-output={{.*}}libsycl-imf-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-imf-fp64.obj" "-output={{.*}}libsycl-imf-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-cassert.obj" "-output={{.*}}libsycl-fallback-cassert-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-cstring.obj" "-output={{.*}}libsycl-fallback-cstring-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-complex.obj" "-output={{.*}}libsycl-fallback-complex-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath.obj" "-output={{.*}}libsycl-fallback-cmath-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath-fp64.obj" "-output={{.*}}libsycl-fallback-cmath-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-imf.obj" "-output={{.*}}libsycl-fallback-imf-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-imf-fp64.obj" "-output={{.*}}libsycl-fallback-imf-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-itt-user-wrappers.obj" "-output={{.*}}libsycl-itt-user-wrappers-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-itt-compiler-wrappers.obj" "-output={{.*}}libsycl-itt-compiler-wrappers-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-itt-stubs.obj" "-output={{.*}}libsycl-itt-stubs-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: llvm-link{{.*}}
+// BFLOAT16-NONE-FALLBACK-NEXT: sycl-post-link{{.*}}
+// BFLOAT16-NONE-FALLBACK-NEXT: file-table-tform{{.*}}
+// BFLOAT16-NONE-FALLBACK-NEXT: llvm-foreach{{.*}}
+// BFLOAT16-NONE-FALLBACK-NEXT: file-table-tform{{.*}}
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-wrapper{{.*}}
+// BFLOAT16-NONE-FALLBACK-NEXT: llc{{.*}}
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-16{{.*}}
+// BFLOAT16-NONE-FALLBACK-NEXT: llvm-link{{.*}}
+// BFLOAT16-NONE-FALLBACK: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-crt.obj" "-output={{.*}}libsycl-crt-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-complex.obj" "-output={{.*}}libsycl-complex-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-complex-fp64.obj" "-output={{.*}}libsycl-complex-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-cmath.obj" "-output={{.*}}libsycl-cmath-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-cmath-fp64.obj" "-output={{.*}}libsycl-cmath-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-mscv-math.obj" "-output={{.*}}libsycl-mscv-math-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-imf.obj" "-output={{.*}}libsycl-imf-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-imf-fp64.obj" "-output={{.*}}libsycl-imf-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cassert.obj" "-output={{.*}}libsycl-fallback-cassert-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cstring.obj" "-output={{.*}}libsycl-fallback-cstring-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-complex.obj" "-output={{.*}}libsycl-fallback-complex-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath.obj" "-output={{.*}}libsycl-fallback-cmath-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath-fp64.obj" "-output={{.*}}libsycl-fallback-cmath-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-imf.obj" "-output={{.*}}libsycl-fallback-imf-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-imf-fp64.obj" "-output={{.*}}libsycl-fallback-imf-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-bfloat16.obj"
+
+// BFLOAT16-FALLBACK-NATIVE: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-crt.obj" "-output={{.*}}libsycl-crt-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-complex.obj" "-output={{.*}}libsycl-complex-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-complex-fp64.obj" "-output={{.*}}libsycl-complex-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-cmath.obj" "-output={{.*}}libsycl-cmath-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-cmath-fp64.obj" "-output={{.*}}libsycl-cmath-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-mscv-math.obj" "-output={{.*}}libsycl-mscv-math-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-imf.obj" "-output={{.*}}libsycl-imf-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-imf-fp64.obj" "-output={{.*}}libsycl-imf-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-fallback-cassert.obj" "-output={{.*}}libsycl-fallback-cassert-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-fallback-cstring.obj" "-output={{.*}}libsycl-fallback-cstring-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-fallback-complex.obj" "-output={{.*}}libsycl-fallback-complex-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath.obj" "-output={{.*}}libsycl-fallback-cmath-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath-fp64.obj" "-output={{.*}}libsycl-fallback-cmath-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-fallback-imf.obj" "-output={{.*}}libsycl-fallback-imf-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-fallback-imf-fp64.obj" "-output={{.*}}libsycl-fallback-imf-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-fallback-bfloat16.obj" "-output={{.*}}libsycl-fallback-bfloat16-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-itt-user-wrappers.obj" "-output={{.*}}libsycl-itt-user-wrappers-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-itt-compiler-wrappers.obj" "-output={{.*}}libsycl-itt-compiler-wrappers-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-itt-stubs.obj" "-output={{.*}}libsycl-itt-stubs-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: llvm-link{{.*}}
+// BFLOAT16-FALLBACK-NATIVE-NEXT: sycl-post-link{{.*}}
+// BFLOAT16-FALLBACK-NATIVE-NEXT: file-table-tform{{.*}}
+// BFLOAT16-FALLBACK-NATIVE-NEXT: llvm-foreach{{.*}}
+// BFLOAT16-FALLBACK-NATIVE-NEXT: llvm-foreach{{.*}}
+// BFLOAT16-FALLBACK-NATIVE-NEXT: file-table-tform{{.*}}
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-wrapper{{.*}}
+// BFLOAT16-FALLBACK-NATIVE-NEXT: llc{{.*}}
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-16{{.*}}
+// BFLOAT16-FALLBACK-NATIVE-NEXT: llvm-link{{.*}}
+// BFLOAT16-FALLBACK-NATIVE: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-crt.obj" "-output={{.*}}libsycl-crt-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-complex.obj" "-output={{.*}}libsycl-complex-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-complex-fp64.obj" "-output={{.*}}libsycl-complex-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-cmath.obj" "-output={{.*}}libsycl-cmath-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-cmath-fp64.obj" "-output={{.*}}libsycl-cmath-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-mscv-math.obj" "-output={{.*}}libsycl-mscv-math-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-imf.obj" "-output={{.*}}libsycl-imf-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-imf-fp64.obj" "-output={{.*}}libsycl-imf-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cassert.obj" "-output={{.*}}libsycl-fallback-cassert-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cstring.obj" "-output={{.*}}libsycl-fallback-cstring-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-complex.obj" "-output={{.*}}libsycl-fallback-complex-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath.obj" "-output={{.*}}libsycl-fallback-cmath-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath-fp64.obj" "-output={{.*}}libsycl-fallback-cmath-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-imf.obj" "-output={{.*}}libsycl-fallback-imf-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-imf-fp64.obj" "-output={{.*}}libsycl-fallback-imf-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-native-bfloat16.obj"
+
+// BFLOAT16-FALLBACK-FALLBACK: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-crt.obj" "-output={{.*}}libsycl-crt-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-complex.obj" "-output={{.*}}libsycl-complex-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-complex-fp64.obj" "-output={{.*}}libsycl-complex-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-cmath.obj" "-output={{.*}}libsycl-cmath-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-cmath-fp64.obj" "-output={{.*}}libsycl-cmath-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-mscv-math.obj" "-output={{.*}}libsycl-mscv-math-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-imf.obj" "-output={{.*}}libsycl-imf-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-imf-fp64.obj" "-output={{.*}}libsycl-imf-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-fallback-cassert.obj" "-output={{.*}}libsycl-fallback-cassert-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-fallback-cstring.obj" "-output={{.*}}libsycl-fallback-cstring-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-fallback-complex.obj" "-output={{.*}}libsycl-fallback-complex-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath.obj" "-output={{.*}}libsycl-fallback-cmath-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath-fp64.obj" "-output={{.*}}libsycl-fallback-cmath-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-fallback-imf.obj" "-output={{.*}}libsycl-fallback-imf-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-fallback-imf-fp64.obj" "-output={{.*}}libsycl-fallback-imf-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-fallback-bfloat16.obj" "-output={{.*}}libsycl-fallback-bfloat16-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-itt-user-wrappers.obj" "-output={{.*}}libsycl-itt-user-wrappers-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-itt-compiler-wrappers.obj" "-output={{.*}}libsycl-itt-compiler-wrappers-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-itt-stubs.obj" "-output={{.*}}libsycl-itt-stubs-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: llvm-link{{.*}}
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: sycl-post-link{{.*}}
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: file-table-tform{{.*}}
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: llvm-foreach{{.*}}
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: llvm-foreach{{.*}}
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: file-table-tform{{.*}}
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-wrapper{{.*}}
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: llc{{.*}}
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-16{{.*}}
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: llvm-link{{.*}}
+// BFLOAT16-FALLBACK-FALLBACK: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-crt.obj" "-output={{.*}}libsycl-crt-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-complex.obj" "-output={{.*}}libsycl-complex-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-complex-fp64.obj" "-output={{.*}}libsycl-complex-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-cmath.obj" "-output={{.*}}libsycl-cmath-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-cmath-fp64.obj" "-output={{.*}}libsycl-cmath-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-mscv-math.obj" "-output={{.*}}libsycl-mscv-math-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-imf.obj" "-output={{.*}}libsycl-imf-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-imf-fp64.obj" "-output={{.*}}libsycl-imf-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cassert.obj" "-output={{.*}}libsycl-fallback-cassert-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cstring.obj" "-output={{.*}}libsycl-fallback-cstring-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-complex.obj" "-output={{.*}}libsycl-fallback-complex-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath.obj" "-output={{.*}}libsycl-fallback-cmath-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath-fp64.obj" "-output={{.*}}libsycl-fallback-cmath-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-imf.obj" "-output={{.*}}libsycl-fallback-imf-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-imf-fp64.obj" "-output={{.*}}libsycl-fallback-imf-fp64-{{.*}}.obj" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-bfloat16.obj"
\ No newline at end of file
diff --git a/clang/test/Driver/sycl-bfloat16-lib.cpp b/clang/test/Driver/sycl-bfloat16-lib.cpp
index 99741a75e295d..9046c4769f207 100755
--- a/clang/test/Driver/sycl-bfloat16-lib.cpp
+++ b/clang/test/Driver/sycl-bfloat16-lib.cpp
@@ -33,6 +33,22 @@
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %s -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=BFLOAT16-FALLBACK
 
+// test that a mixed JIT + AOT-PVC compilation uses no libs + native libs
+// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=BFLOAT16-NONE-NATIVE
+
+// test that a mixed JIT + AOT-Gen9 compilation uses no libs + fallback libs
+// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen9" %s -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=BFLOAT16-NONE-FALLBACK
+
+// test that an AOT-CPU + AOT-PVC compilation uses fallback + native libs
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=BFLOAT16-FALLBACK-NATIVE
+
+// test that an AOT-CPU + AOT-Gen9 compilation uses fallback + fallback libs
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen9" %s -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=BFLOAT16-FALLBACK-FALLBACK
+
 // BFLOAT16: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-crt.o" "-output={{.*}}libsycl-crt-{{.*}}.o" "-unbundle"
 // BFLOAT16-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-complex.o" "-output={{.*}}libsycl-complex-{{.*}}.o" "-unbundle"
 // BFLOAT16-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-complex-fp64.o" "-output={{.*}}libsycl-complex-fp64-{{.*}}.o" "-unbundle"
@@ -83,3 +99,183 @@
 // BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-{{spir64_gen-|spir64-}}unknown-unknown" "-input={{.*}}libsycl-fallback-imf.o" "-output={{.*}}libsycl-fallback-imf-{{.*}}.o" "-unbundle"
 // BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-{{spir64_gen-|spir64-}}unknown-unknown" "-input={{.*}}libsycl-fallback-imf-fp64.o" "-output={{.*}}libsycl-fallback-imf-fp64-{{.*}}.o" "-unbundle"
 // BFLOAT16-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-{{spir64_gen-|spir64-}}unknown-unknown" "-input={{.*}}libsycl-fallback-bfloat16.o"
+
+// BFLOAT16-NONE-NATIVE: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-crt.o" "-output={{.*}}libsycl-crt-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-complex.o" "-output={{.*}}libsycl-complex-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-complex-fp64.o" "-output={{.*}}libsycl-complex-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-cmath.o" "-output={{.*}}libsycl-cmath-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-cmath-fp64.o" "-output={{.*}}libsycl-cmath-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-imf.o" "-output={{.*}}libsycl-imf-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-imf-fp64.o" "-output={{.*}}libsycl-imf-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-cassert.o" "-output={{.*}}libsycl-fallback-cassert-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-cstring.o" "-output={{.*}}libsycl-fallback-cstring-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-complex.o" "-output={{.*}}libsycl-fallback-complex-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-complex-fp64.o" "-output={{.*}}libsycl-fallback-complex-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath.o" "-output={{.*}}libsycl-fallback-cmath-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath-fp64.o" "-output={{.*}}libsycl-fallback-cmath-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-imf.o" "-output={{.*}}libsycl-fallback-imf-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-imf-fp64.o" "-output={{.*}}libsycl-fallback-imf-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-itt-user-wrappers.o" "-output={{.*}}libsycl-itt-user-wrappers-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-itt-compiler-wrappers.o" "-output={{.*}}libsycl-itt-compiler-wrappers-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-itt-stubs.o" "-output={{.*}}libsycl-itt-stubs-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: llvm-link{{.*}}
+// BFLOAT16-NONE-NATIVE-NEXT: sycl-post-link{{.*}}
+// BFLOAT16-NONE-NATIVE-NEXT: file-table-tform{{.*}}
+// BFLOAT16-NONE-NATIVE-NEXT: llvm-foreach{{.*}}
+// BFLOAT16-NONE-NATIVE-NEXT: file-table-tform{{.*}}
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-wrapper{{.*}}
+// BFLOAT16-NONE-NATIVE-NEXT: llc{{.*}}
+// BFLOAT16-NONE-NATIVE-NEXT: clang-16{{.*}}
+// BFLOAT16-NONE-NATIVE-NEXT: llvm-link{{.*}}
+// BFLOAT16-NONE-NATIVE: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-crt.o" "-output={{.*}}libsycl-crt-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-complex.o" "-output={{.*}}libsycl-complex-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-complex-fp64.o" "-output={{.*}}libsycl-complex-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-cmath.o" "-output={{.*}}libsycl-cmath-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-cmath-fp64.o" "-output={{.*}}libsycl-cmath-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-imf.o" "-output={{.*}}libsycl-imf-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-imf-fp64.o" "-output={{.*}}libsycl-imf-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cassert.o" "-output={{.*}}libsycl-fallback-cassert-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cstring.o" "-output={{.*}}libsycl-fallback-cstring-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-complex.o" "-output={{.*}}libsycl-fallback-complex-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-complex-fp64.o" "-output={{.*}}libsycl-fallback-complex-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath.o" "-output={{.*}}libsycl-fallback-cmath-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath-fp64.o" "-output={{.*}}libsycl-fallback-cmath-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-imf.o" "-output={{.*}}libsycl-fallback-imf-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-imf-fp64.o" "-output={{.*}}libsycl-fallback-imf-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-native-bfloat16.o"
+
+// BFLOAT16-NONE-FALLBACK: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-crt.o" "-output={{.*}}libsycl-crt-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-complex.o" "-output={{.*}}libsycl-complex-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-complex-fp64.o" "-output={{.*}}libsycl-complex-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-cmath.o" "-output={{.*}}libsycl-cmath-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-cmath-fp64.o" "-output={{.*}}libsycl-cmath-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-imf.o" "-output={{.*}}libsycl-imf-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-imf-fp64.o" "-output={{.*}}libsycl-imf-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-cassert.o" "-output={{.*}}libsycl-fallback-cassert-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-cstring.o" "-output={{.*}}libsycl-fallback-cstring-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-complex.o" "-output={{.*}}libsycl-fallback-complex-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-complex-fp64.o" "-output={{.*}}libsycl-fallback-complex-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath.o" "-output={{.*}}libsycl-fallback-cmath-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath-fp64.o" "-output={{.*}}libsycl-fallback-cmath-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-imf.o" "-output={{.*}}libsycl-fallback-imf-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-fallback-imf-fp64.o" "-output={{.*}}libsycl-fallback-imf-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-itt-user-wrappers.o" "-output={{.*}}libsycl-itt-user-wrappers-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-itt-compiler-wrappers.o" "-output={{.*}}libsycl-itt-compiler-wrappers-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64-unknown-unknown" "-input={{.*}}libsycl-itt-stubs.o" "-output={{.*}}libsycl-itt-stubs-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: llvm-link{{.*}}
+// BFLOAT16-NONE-FALLBACK-NEXT: sycl-post-link{{.*}}
+// BFLOAT16-NONE-FALLBACK-NEXT: file-table-tform{{.*}}
+// BFLOAT16-NONE-FALLBACK-NEXT: llvm-foreach{{.*}}
+// BFLOAT16-NONE-FALLBACK-NEXT: file-table-tform{{.*}}
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-wrapper{{.*}}
+// BFLOAT16-NONE-FALLBACK-NEXT: llc{{.*}}
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-16{{.*}}
+// BFLOAT16-NONE-FALLBACK-NEXT: llvm-link{{.*}}
+// BFLOAT16-NONE-FALLBACK: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-crt.o" "-output={{.*}}libsycl-crt-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-complex.o" "-output={{.*}}libsycl-complex-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-complex-fp64.o" "-output={{.*}}libsycl-complex-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-cmath.o" "-output={{.*}}libsycl-cmath-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-cmath-fp64.o" "-output={{.*}}libsycl-cmath-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-imf.o" "-output={{.*}}libsycl-imf-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-imf-fp64.o" "-output={{.*}}libsycl-imf-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cassert.o" "-output={{.*}}libsycl-fallback-cassert-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cstring.o" "-output={{.*}}libsycl-fallback-cstring-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-complex.o" "-output={{.*}}libsycl-fallback-complex-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-complex-fp64.o" "-output={{.*}}libsycl-fallback-complex-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath.o" "-output={{.*}}libsycl-fallback-cmath-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath-fp64.o" "-output={{.*}}libsycl-fallback-cmath-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-imf.o" "-output={{.*}}libsycl-fallback-imf-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-imf-fp64.o" "-output={{.*}}libsycl-fallback-imf-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-NONE-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-bfloat16.o"
+
+// BFLOAT16-FALLBACK-NATIVE: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-crt.o" "-output={{.*}}libsycl-crt-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-complex.o" "-output={{.*}}libsycl-complex-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-complex-fp64.o" "-output={{.*}}libsycl-complex-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-cmath.o" "-output={{.*}}libsycl-cmath-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-cmath-fp64.o" "-output={{.*}}libsycl-cmath-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-imf.o" "-output={{.*}}libsycl-imf-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-imf-fp64.o" "-output={{.*}}libsycl-imf-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-fallback-cassert.o" "-output={{.*}}libsycl-fallback-cassert-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-fallback-cstring.o" "-output={{.*}}libsycl-fallback-cstring-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-fallback-complex.o" "-output={{.*}}libsycl-fallback-complex-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-fallback-complex-fp64.o" "-output={{.*}}libsycl-fallback-complex-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath.o" "-output={{.*}}libsycl-fallback-cmath-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath-fp64.o" "-output={{.*}}libsycl-fallback-cmath-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-fallback-imf.o" "-output={{.*}}libsycl-fallback-imf-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-fallback-imf-fp64.o" "-output={{.*}}libsycl-fallback-imf-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-fallback-bfloat16.o" "-output={{.*}}libsycl-fallback-bfloat16-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-itt-user-wrappers.o" "-output={{.*}}libsycl-itt-user-wrappers-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-itt-compiler-wrappers.o" "-output={{.*}}libsycl-itt-compiler-wrappers-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-itt-stubs.o" "-output={{.*}}libsycl-itt-stubs-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: llvm-link{{.*}}
+// BFLOAT16-FALLBACK-NATIVE-NEXT: sycl-post-link{{.*}}
+// BFLOAT16-FALLBACK-NATIVE-NEXT: file-table-tform{{.*}}
+// BFLOAT16-FALLBACK-NATIVE-NEXT: llvm-foreach{{.*}}
+// BFLOAT16-FALLBACK-NATIVE-NEXT: llvm-foreach{{.*}}
+// BFLOAT16-FALLBACK-NATIVE-NEXT: file-table-tform{{.*}}
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-wrapper{{.*}}
+// BFLOAT16-FALLBACK-NATIVE-NEXT: llc{{.*}}
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-16{{.*}}
+// BFLOAT16-FALLBACK-NATIVE-NEXT: llvm-link{{.*}}
+// BFLOAT16-FALLBACK-NATIVE: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-crt.o" "-output={{.*}}libsycl-crt-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-complex.o" "-output={{.*}}libsycl-complex-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-complex-fp64.o" "-output={{.*}}libsycl-complex-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-cmath.o" "-output={{.*}}libsycl-cmath-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-cmath-fp64.o" "-output={{.*}}libsycl-cmath-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-imf.o" "-output={{.*}}libsycl-imf-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-imf-fp64.o" "-output={{.*}}libsycl-imf-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cassert.o" "-output={{.*}}libsycl-fallback-cassert-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cstring.o" "-output={{.*}}libsycl-fallback-cstring-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-complex.o" "-output={{.*}}libsycl-fallback-complex-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-complex-fp64.o" "-output={{.*}}libsycl-fallback-complex-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath.o" "-output={{.*}}libsycl-fallback-cmath-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath-fp64.o" "-output={{.*}}libsycl-fallback-cmath-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-imf.o" "-output={{.*}}libsycl-fallback-imf-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-imf-fp64.o" "-output={{.*}}libsycl-fallback-imf-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-NATIVE-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-native-bfloat16.o"
+
+// BFLOAT16-FALLBACK-FALLBACK: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-crt.o" "-output={{.*}}libsycl-crt-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-complex.o" "-output={{.*}}libsycl-complex-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-complex-fp64.o" "-output={{.*}}libsycl-complex-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-cmath.o" "-output={{.*}}libsycl-cmath-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-cmath-fp64.o" "-output={{.*}}libsycl-cmath-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-imf.o" "-output={{.*}}libsycl-imf-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-imf-fp64.o" "-output={{.*}}libsycl-imf-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-fallback-cassert.o" "-output={{.*}}libsycl-fallback-cassert-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-fallback-cstring.o" "-output={{.*}}libsycl-fallback-cstring-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-fallback-complex.o" "-output={{.*}}libsycl-fallback-complex-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-fallback-complex-fp64.o" "-output={{.*}}libsycl-fallback-complex-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath.o" "-output={{.*}}libsycl-fallback-cmath-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath-fp64.o" "-output={{.*}}libsycl-fallback-cmath-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-fallback-imf.o" "-output={{.*}}libsycl-fallback-imf-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-fallback-imf-fp64.o" "-output={{.*}}libsycl-fallback-imf-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-fallback-bfloat16.o" "-output={{.*}}libsycl-fallback-bfloat16-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-itt-user-wrappers.o" "-output={{.*}}libsycl-itt-user-wrappers-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-itt-compiler-wrappers.o" "-output={{.*}}libsycl-itt-compiler-wrappers-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_x86_64-unknown-unknown" "-input={{.*}}libsycl-itt-stubs.o" "-output={{.*}}libsycl-itt-stubs-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: llvm-link{{.*}}
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: sycl-post-link{{.*}}
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: file-table-tform{{.*}}
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: llvm-foreach{{.*}}
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: llvm-foreach{{.*}}
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: file-table-tform{{.*}}
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-wrapper{{.*}}
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: llc{{.*}}
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-16{{.*}}
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: llvm-link{{.*}}
+// BFLOAT16-FALLBACK-FALLBACK: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-crt.o" "-output={{.*}}libsycl-crt-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-complex.o" "-output={{.*}}libsycl-complex-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-complex-fp64.o" "-output={{.*}}libsycl-complex-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-cmath.o" "-output={{.*}}libsycl-cmath-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-cmath-fp64.o" "-output={{.*}}libsycl-cmath-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-imf.o" "-output={{.*}}libsycl-imf-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-imf-fp64.o" "-output={{.*}}libsycl-imf-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cassert.o" "-output={{.*}}libsycl-fallback-cassert-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cstring.o" "-output={{.*}}libsycl-fallback-cstring-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-complex.o" "-output={{.*}}libsycl-fallback-complex-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-complex-fp64.o" "-output={{.*}}libsycl-fallback-complex-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath.o" "-output={{.*}}libsycl-fallback-cmath-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-cmath-fp64.o" "-output={{.*}}libsycl-fallback-cmath-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-imf.o" "-output={{.*}}libsycl-fallback-imf-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-imf-fp64.o" "-output={{.*}}libsycl-fallback-imf-fp64-{{.*}}.o" "-unbundle"
+// BFLOAT16-FALLBACK-FALLBACK-NEXT: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_gen-unknown-unknown" "-input={{.*}}libsycl-fallback-bfloat16.o"

From 8c633d32578693ae7a32178a77efa0e849e2e050 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Tue, 25 Oct 2022 09:33:36 -0700
Subject: [PATCH 53/63] Corrections to comments.

---
 clang/test/Driver/sycl-bfloat16-lib-win.cpp | 8 ++++----
 clang/test/Driver/sycl-bfloat16-lib.cpp     | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/clang/test/Driver/sycl-bfloat16-lib-win.cpp b/clang/test/Driver/sycl-bfloat16-lib-win.cpp
index 8f76a4ec755e7..ecba163add3ec 100755
--- a/clang/test/Driver/sycl-bfloat16-lib-win.cpp
+++ b/clang/test/Driver/sycl-bfloat16-lib-win.cpp
@@ -33,19 +33,19 @@
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %s -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=BFLOAT16-FALLBACK
 
-// test that a mixed JIT + AOT-Gen9 compilation uses no libs + fallback libs
+// test that a mixed JIT + AOT-PVC compilation uses no libs + native libs
 // RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=BFLOAT16-NONE-NATIVE
 
-// test that a mixed JIT + AOT-PVC compilation uses no libs + native libs
+// test that a mixed JIT + AOT-Gen9 compilation uses no libs + fallback libs
 // RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen9" %s -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=BFLOAT16-NONE-FALLBACK
 
-// test that an AOT-CPU + AOT-Gen9 compilation fallback + fallback libs
+// test that an AOT-CPU + AOT-PVC compilation uses fallback + native libs
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=BFLOAT16-FALLBACK-NATIVE
 
-// test that an AOT-CPU + AOT-PVC compilation uses fallback + native libs
+// test that an AOT-CPU + AOT-Gen9 compilation uses fallback + fallback libs
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen9" %s -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=BFLOAT16-FALLBACK-FALLBACK
 
diff --git a/clang/test/Driver/sycl-bfloat16-lib.cpp b/clang/test/Driver/sycl-bfloat16-lib.cpp
index 9046c4769f207..6cbf7ff8b235f 100755
--- a/clang/test/Driver/sycl-bfloat16-lib.cpp
+++ b/clang/test/Driver/sycl-bfloat16-lib.cpp
@@ -33,19 +33,19 @@
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %s -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=BFLOAT16-FALLBACK
 
-// test that a mixed JIT + AOT-Gen9 compilation uses no libs + fallback libs
+// test that a mixed JIT + AOT-PVC compilation uses no libs + native libs
 // RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=BFLOAT16-NONE-NATIVE
 
-// test that a mixed JIT + AOT-PVC compilation uses no libs + native libs
+// test that a mixed JIT + AOT-Gen9 compilation uses no libs + fallback libs
 // RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen9" %s -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=BFLOAT16-NONE-FALLBACK
 
-// test that an AOT-CPU + AOT-Gen9 compilation fallback + fallback libs
+// test that an AOT-CPU + AOT-PVC compilation uses fallback + native libs
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=BFLOAT16-FALLBACK-NATIVE
 
-// test that an AOT-CPU + AOT-PVC compilation uses fallback + native libs
+// test that an AOT-CPU + AOT-Gen9-win compilation uses fallback + native libs
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen9" %s -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=BFLOAT16-FALLBACK-FALLBACK
 

From 1a59e0338278e8b69e2f35d7f61efb0245b8e5b1 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Tue, 25 Oct 2022 11:02:00 -0700
Subject: [PATCH 54/63] Update to documentation.

---
 clang/test/Driver/sycl-bfloat16-lib.cpp       |  2 +-
 .../sycl_ext_oneapi_bfloat16.asciidoc         | 22 +++++++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/clang/test/Driver/sycl-bfloat16-lib.cpp b/clang/test/Driver/sycl-bfloat16-lib.cpp
index 6cbf7ff8b235f..95f1fc9d24773 100755
--- a/clang/test/Driver/sycl-bfloat16-lib.cpp
+++ b/clang/test/Driver/sycl-bfloat16-lib.cpp
@@ -45,7 +45,7 @@
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=BFLOAT16-FALLBACK-NATIVE
 
-// test that an AOT-CPU + AOT-Gen9-win compilation uses fallback + native libs
+// test that an AOT-CPU + AOT-Gen9 compilation uses fallback + fallback libs
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen9" %s -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=BFLOAT16-FALLBACK-FALLBACK
 
diff --git a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
index cf971bb2ee4a0..d581ee18e3ff1 100644
--- a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
+++ b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
@@ -44,6 +44,28 @@ This extension is written against the SYCL 2020 specification, Revision 5.
 == Status
 
 This extension is implemented and fully supported by DPC++.
+[NOTE]
+====
+The DPC++ compiler has the following limitation when using this extension
+in conjunction with ahead-of-time (AOT) compilation with the `-fsycl-targets`
+compiler option.  When doing AOT compilation for an Intel GPU device via
+`-fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen "-device ..."`,
+the compiler chooses either fallback or native support for bfloat16 according
+to the device(s) specified in `...`.  Native support is used only if all of
+these devices have native bfloat16 support.  As a result, AOT compiling for
+multiple Intel GPU devices could result in the lower-performance fallback
+support even when running on a GPU that has native support.  Therefore, the
+recommendation is to use AOT only when all Intel GPU devices have the same
+type of bfloat16 support (all native support or all fallback support).
+
+There is a similar limitation when AOT compiling for one Intel GPU device and
+running on a different Intel GPU device. In this case, the compiler chooses
+either fallback or native bfloat16 support according to the device(s) specified
+on the command line. If the fallback library had been chosen at AOT compilation
+time, then the binary will run on all Intel GPU devices. If however, the native
+bfloat16 library had been chosen at AOT compilation time then the binary
+will run only on Intel GPU devices that have native bfloat16 support.
+====
 
 
 == Overview

From b2fd6cc088a3b7d524062663838ef2f4b469a8c7 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Tue, 25 Oct 2022 13:43:33 -0700
Subject: [PATCH 55/63] Updated doc.

---
 .../extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc  | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
index d581ee18e3ff1..ff72b3c959f8f 100644
--- a/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
+++ b/sycl/doc/extensions/supported/sycl_ext_oneapi_bfloat16.asciidoc
@@ -61,8 +61,10 @@ type of bfloat16 support (all native support or all fallback support).
 There is a similar limitation when AOT compiling for one Intel GPU device and
 running on a different Intel GPU device. In this case, the compiler chooses
 either fallback or native bfloat16 support according to the device(s) specified
-on the command line. If the fallback library had been chosen at AOT compilation
-time, then the binary will run on all Intel GPU devices. If however, the native
+on the command line. If the fallback library was chosen at AOT compilation
+time, then the binary will run on all Intel GPU devices but you will not
+get the performance benefit of native support even when running on a new
+Intel GPU that has native support. If, however, the native
 bfloat16 library had been chosen at AOT compilation time then the binary
 will run only on Intel GPU devices that have native bfloat16 support.
 ====

From 35b89102870f1fdb434204992f41f0849db777fe Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Thu, 27 Oct 2022 09:52:03 -0700
Subject: [PATCH 56/63] Adjustments to tests.

---
 sycl/test/esimd/dpas.cpp                      |  2 +-
 sycl/test/extensions/bfloat16.cpp             | 32 +++++++++----------
 sycl/test/extensions/bfloat16_host.cpp        |  7 ++--
 sycl/test/matrix/matrix-bfloat16-test-use.cpp |  6 ++--
 sycl/test/matrix/matrix-bfloat16-test.cpp     |  4 +--
 5 files changed, 25 insertions(+), 26 deletions(-)

diff --git a/sycl/test/esimd/dpas.cpp b/sycl/test/esimd/dpas.cpp
index 207886aaa6eec..27307b0ddfce0 100644
--- a/sycl/test/esimd/dpas.cpp
+++ b/sycl/test/esimd/dpas.cpp
@@ -11,7 +11,7 @@ using namespace sycl::ext::intel::esimd;
 namespace old = sycl::ext::intel::experimental::esimd;
 namespace xmx = sycl::ext::intel::esimd::xmx;
 
-using bfloat16 = sycl::ext::oneapi::experimental::bfloat16;
+using bfloat16 = sycl::ext::oneapi::bfloat16;
 using half = sycl::half;
 
 constexpr auto bf16 = xmx::dpas_argument_type::bf16;
diff --git a/sycl/test/extensions/bfloat16.cpp b/sycl/test/extensions/bfloat16.cpp
index 63eecc8b30534..8c4a0bbba3968 100644
--- a/sycl/test/extensions/bfloat16.cpp
+++ b/sycl/test/extensions/bfloat16.cpp
@@ -1,4 +1,4 @@
-// RUN: %clangxx -fsycl-device-only -fsycl-targets=%sycl_triple -S -Xclang -no-enable-noundef-analysis %s -o - | FileCheck %s
+// RUN: %clangxx -fsycl-device-only -fsycl-targets=%sycl_triple -Xsycl-target-backend=%sycl_triple "-device pvc" -S -Xclang -no-enable-noundef-analysis %s -o - | FileCheck %s
 
 // UNSUPPORTED: cuda || hip_amd
 
@@ -13,39 +13,37 @@ SYCL_EXTERNAL void foo(long x, sycl::half y);
 __attribute__((noinline)) float op(float a, float b) {
   // CHECK: define {{.*}} spir_func float @_Z2opff(float [[a:%.*]], float [[b:%.*]])
   bfloat16 A{a};
-  // CHECK: [[A:%.*]] = tail call spir_func zeroext i16 @_Z27__spirv_ConvertFToBF16INTELf(float [[a]])
+  // CHECK: [[A:%.*]] = call spir_func zeroext i16 @__devicelib_ConvertFToBF16INTEL(float addrspace(4)* align 4 dereferenceable(4) [[a]].addr.ascast)
   // CHECK-NOT: fptoui
 
   bfloat16 B{b};
-  // CHECK: [[B:%.*]] = tail call spir_func zeroext i16 @_Z27__spirv_ConvertFToBF16INTELf(float [[b]])
+  // CHECK: [[B:%.*]] = call spir_func zeroext i16 @__devicelib_ConvertFToBF16INTEL(float addrspace(4)* align 4 dereferenceable(4) [[b]].addr.ascast)
   // CHECK-NOT: fptoui
 
   bfloat16 C = A + B;
-  // CHECK: [[A_float:%.*]] = tail call spir_func float @_Z27__spirv_ConvertBF16ToFINTELt(i16 zeroext [[A]])
-  // CHECK: [[B_float:%.*]] = tail call spir_func float @_Z27__spirv_ConvertBF16ToFINTELt(i16 zeroext [[B]])
+  // CHECK: [[A_float:%.*]] = call spir_func float @__devicelib_ConvertBF16ToFINTEL(i16 addrspace(4)* align 2 dereferenceable(2) %value.i)
+  // CHECK: [[B_float:%.*]] = call spir_func float @__devicelib_ConvertBF16ToFINTEL(i16 addrspace(4)* align 2 dereferenceable(2) %value.i7)
   // CHECK: [[Add:%.*]] = fadd float [[A_float]], [[B_float]]
-  // CHECK: [[C:%.*]] = tail call spir_func zeroext i16 @_Z27__spirv_ConvertFToBF16INTELf(float [[Add]])
-  // CHECK-NOT: uitofp
-  // CHECK-NOT: fptoui
-
-  bfloat16 D = bfloat16::from_bits(some_bf16_intrinsic(A.raw(), C.raw()));
-  // CHECK: [[D:%.*]] = tail call spir_func zeroext i16 @_Z19some_bf16_intrinsictt(i16 zeroext [[A]], i16 zeroext [[C]])
+  // CHECK: store float [[Add]], float addrspace(4)* [[Add1:%ref.tmp.ascast.i]], align 4, !tbaa !48, !noalias !55
+  // CHECK: [[C:%.*]] = call spir_func zeroext i16 @__devicelib_ConvertFToBF16INTEL(float addrspace(4)* align 4 dereferenceable(4) [[Add1]])
   // CHECK-NOT: uitofp
   // CHECK-NOT: fptoui
 
   long L = bfloat16(3.14f);
-  // CHECK: [[L_bfloat16:%.*]] = tail call spir_func zeroext i16 @_Z27__spirv_ConvertFToBF16INTELf(float 0x40091EB860000000)
-  // CHECK: [[L_float:%.*]] = tail call spir_func float @_Z27__spirv_ConvertBF16ToFINTELt(i16 zeroext [[L_bfloat16]])
+  // CHECK: [[L:%.*]] = call spir_func zeroext i16 @__devicelib_ConvertFToBF16INTEL(float addrspace(4)* align 4 dereferenceable(4) %ref.tmp1.ascast)
+  // CHECK: store i16 [[L]], i16 addrspace(4)* [[L1:%value.i9]]
+  // CHECK: [[L_float:%.*]] = call spir_func float @__devicelib_ConvertBF16ToFINTEL(i16 addrspace(4)* align 2 dereferenceable(2) [[L1]])
   // CHECK: [[L:%.*]] = fptosi float [[L_float]] to i{{32|64}}
 
   sycl::half H = bfloat16(2.71f);
-  // CHECK: [[H_bfloat16:%.*]] = tail call spir_func zeroext i16 @_Z27__spirv_ConvertFToBF16INTELf(float 0x4005AE1480000000)
-  // CHECK: [[H_float:%.*]] = tail call spir_func float @_Z27__spirv_ConvertBF16ToFINTELt(i16 zeroext [[H_bfloat16]])
+  // CHECK: [[H:%.*]] = call spir_func zeroext i16 @__devicelib_ConvertFToBF16INTEL(float addrspace(4)* align 4 dereferenceable(4) %ref.tmp3.ascast)
+  // CHECK: store i16 [[H]], i16 addrspace(4)* [[H1:%value.i13]]
+  // CHECK: [[H_float:%.*]] = call spir_func float @__devicelib_ConvertBF16ToFINTEL(i16 addrspace(4)* align 2 dereferenceable(2) [[H1]])
   // CHECK: [[H:%.*]] = fptrunc float [[H_float]] to half
   foo(L, H);
 
-  return D;
-  // CHECK: [[RetVal:%.*]] = tail call spir_func float @_Z27__spirv_ConvertBF16ToFINTELt(i16 zeroext [[D]])
+  return A;
+  // CHECK: [[RetVal:%.*]] = call spir_func float @__devicelib_ConvertBF16ToFINTEL(i16 addrspace(4)* align 2 dereferenceable(2) %value.i)
   // CHECK: ret float [[RetVal]]
   // CHECK-NOT: uitofp
   // CHECK-NOT: fptoui
diff --git a/sycl/test/extensions/bfloat16_host.cpp b/sycl/test/extensions/bfloat16_host.cpp
index acd02d2829b40..c18e1b2e24957 100644
--- a/sycl/test/extensions/bfloat16_host.cpp
+++ b/sycl/test/extensions/bfloat16_host.cpp
@@ -8,7 +8,7 @@
 
 // RUN: %clangxx -fsycl %s -o %t.out
 // RUN: %t.out
-#include <sycl/ext/oneapi/experimental/bfloat16.hpp>
+#include <sycl/ext/oneapi/bfloat16.hpp>
 #include <sycl/sycl.hpp>
 
 #include <cmath>
@@ -42,7 +42,8 @@ float bitsToFloatConv(std::string Bits) {
 }
 
 bool check_bf16_from_float(float Val, uint16_t Expected) {
-  uint16_t Result = sycl::ext::oneapi::experimental::bfloat16::from_float(Val);
+  sycl::ext::oneapi::bfloat16 B = Val;
+  uint16_t Result = *reinterpret_cast<uint16_t *>(&B);
   if (Result != Expected) {
     std::cout << "from_float check for Val = " << Val << " failed!\n"
               << "Expected " << Expected << " Got " << Result << "\n";
@@ -52,7 +53,7 @@ bool check_bf16_from_float(float Val, uint16_t Expected) {
 }
 
 bool check_bf16_to_float(uint16_t Val, float Expected) {
-  float Result = sycl::ext::oneapi::experimental::bfloat16::to_float(Val);
+  float Result = *reinterpret_cast<sycl::ext::oneapi::bfloat16 *>(&Val);
   if (Result != Expected) {
     std::cout << "to_float check for Val = " << Val << " failed!\n"
               << "Expected " << Expected << " Got " << Result << "\n";
diff --git a/sycl/test/matrix/matrix-bfloat16-test-use.cpp b/sycl/test/matrix/matrix-bfloat16-test-use.cpp
index 89a295cb23b75..f133b5d5bd9cc 100644
--- a/sycl/test/matrix/matrix-bfloat16-test-use.cpp
+++ b/sycl/test/matrix/matrix-bfloat16-test-use.cpp
@@ -3,7 +3,7 @@
 #include <sycl/sycl.hpp>
 
 using namespace sycl::ext::oneapi::experimental::matrix;
-using bfloat16 = sycl::ext::oneapi::experimental::bfloat16;
+using bfloat16 = sycl::ext::oneapi::bfloat16;
 
 static constexpr auto TILE_SZ = 16;
 static constexpr auto TM = TILE_SZ - 1;
@@ -137,13 +137,13 @@ int main() {
     for (int j = 0; j < MATRIX_K; j++) {
       // Ee create bfloat16 from unsigned short since float-to-bfloat's
       // conversion is not allowed.
-      A[i][j] = bfloat16::from_bits(make_bf16(1.0f * (i + j)));
+      A[i][j] = make_bf16(1.0f * (i + j));
       Aref[i][j] = make_bf16(1.0f * (i + j));
     }
   }
   for (int i = 0; i < MATRIX_K / 2; i++) {
     for (int j = 0; j < MATRIX_N * 2; j++) {
-      B[i][j] = bfloat16::from_bits((make_bf16(2.0f * i + 3.0f * j)));
+      B[i][j] = make_bf16(2.0f * i + 3.0f * j);
       Bref[i][j] = make_bf16(2.0f * i + 3.0f * j);
     }
   }
diff --git a/sycl/test/matrix/matrix-bfloat16-test.cpp b/sycl/test/matrix/matrix-bfloat16-test.cpp
index d2ad0f2a72994..f4a7262b9fd89 100644
--- a/sycl/test/matrix/matrix-bfloat16-test.cpp
+++ b/sycl/test/matrix/matrix-bfloat16-test.cpp
@@ -139,13 +139,13 @@ int main() {
     for (int j = 0; j < MATRIX_K; j++) {
       // Ee create bfloat16 from unsigned short since float-to-bfloat's
       // conversion is not allowed.
-      A[i][j] = bfloat16::from_bits(make_bf16(1.0f * (i + j)));
+      A[i][j] = make_bf16(1.0f * (i + j));
       Aref[i][j] = make_bf16(1.0f * (i + j));
     }
   }
   for (int i = 0; i < MATRIX_K / 2; i++) {
     for (int j = 0; j < MATRIX_N * 2; j++) {
-      B[i][j] = bfloat16::from_bits((make_bf16(2.0f * i + 3.0f * j)));
+      B[i][j] = make_bf16(2.0f * i + 3.0f * j);
       Bref[i][j] = make_bf16(2.0f * i + 3.0f * j);
     }
   }

From a05c8725bb12e5cb739c28757018d6f9cd1ec7dc Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Thu, 27 Oct 2022 13:21:17 -0700
Subject: [PATCH 57/63] Test cleanup.

---
 sycl/test/extensions/bfloat16.cpp | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/sycl/test/extensions/bfloat16.cpp b/sycl/test/extensions/bfloat16.cpp
index 8c4a0bbba3968..f96fa5ace3ae5 100644
--- a/sycl/test/extensions/bfloat16.cpp
+++ b/sycl/test/extensions/bfloat16.cpp
@@ -13,37 +13,37 @@ SYCL_EXTERNAL void foo(long x, sycl::half y);
 __attribute__((noinline)) float op(float a, float b) {
   // CHECK: define {{.*}} spir_func float @_Z2opff(float [[a:%.*]], float [[b:%.*]])
   bfloat16 A{a};
-  // CHECK: [[A:%.*]] = call spir_func zeroext i16 @__devicelib_ConvertFToBF16INTEL(float addrspace(4)* align 4 dereferenceable(4) [[a]].addr.ascast)
+  // CHECK: [[A:%.*]] = call spir_func zeroext i16 @__devicelib_ConvertFToBF16INTEL(float {{.*}} [[a]].addr.ascast)
   // CHECK-NOT: fptoui
 
   bfloat16 B{b};
-  // CHECK: [[B:%.*]] = call spir_func zeroext i16 @__devicelib_ConvertFToBF16INTEL(float addrspace(4)* align 4 dereferenceable(4) [[b]].addr.ascast)
+  // CHECK: [[B:%.*]] = call spir_func zeroext i16 @__devicelib_ConvertFToBF16INTEL(float {{.*}} [[b]].addr.ascast)
   // CHECK-NOT: fptoui
 
   bfloat16 C = A + B;
-  // CHECK: [[A_float:%.*]] = call spir_func float @__devicelib_ConvertBF16ToFINTEL(i16 addrspace(4)* align 2 dereferenceable(2) %value.i)
-  // CHECK: [[B_float:%.*]] = call spir_func float @__devicelib_ConvertBF16ToFINTEL(i16 addrspace(4)* align 2 dereferenceable(2) %value.i7)
+  // CHECK: [[A_float:%.*]] = call spir_func float @__devicelib_ConvertBF16ToFINTEL(i16 {{.*}} %value.i)
+  // CHECK: [[B_float:%.*]] = call spir_func float @__devicelib_ConvertBF16ToFINTEL(i16 {{.*}} %value.i7)
   // CHECK: [[Add:%.*]] = fadd float [[A_float]], [[B_float]]
   // CHECK: store float [[Add]], float addrspace(4)* [[Add1:%ref.tmp.ascast.i]], align 4, !tbaa !48, !noalias !55
-  // CHECK: [[C:%.*]] = call spir_func zeroext i16 @__devicelib_ConvertFToBF16INTEL(float addrspace(4)* align 4 dereferenceable(4) [[Add1]])
+  // CHECK: [[C:%.*]] = call spir_func zeroext i16 @__devicelib_ConvertFToBF16INTEL(float {{.*}} [[Add1]])
   // CHECK-NOT: uitofp
   // CHECK-NOT: fptoui
 
   long L = bfloat16(3.14f);
-  // CHECK: [[L:%.*]] = call spir_func zeroext i16 @__devicelib_ConvertFToBF16INTEL(float addrspace(4)* align 4 dereferenceable(4) %ref.tmp1.ascast)
+  // CHECK: [[L:%.*]] = call spir_func zeroext i16 @__devicelib_ConvertFToBF16INTEL(float {{.*}} %ref.tmp1.ascast)
   // CHECK: store i16 [[L]], i16 addrspace(4)* [[L1:%value.i9]]
-  // CHECK: [[L_float:%.*]] = call spir_func float @__devicelib_ConvertBF16ToFINTEL(i16 addrspace(4)* align 2 dereferenceable(2) [[L1]])
+  // CHECK: [[L_float:%.*]] = call spir_func float @__devicelib_ConvertBF16ToFINTEL(i16 {{.*}} [[L1]])
   // CHECK: [[L:%.*]] = fptosi float [[L_float]] to i{{32|64}}
 
   sycl::half H = bfloat16(2.71f);
-  // CHECK: [[H:%.*]] = call spir_func zeroext i16 @__devicelib_ConvertFToBF16INTEL(float addrspace(4)* align 4 dereferenceable(4) %ref.tmp3.ascast)
+  // CHECK: [[H:%.*]] = call spir_func zeroext i16 @__devicelib_ConvertFToBF16INTEL(float {{.*}} %ref.tmp3.ascast)
   // CHECK: store i16 [[H]], i16 addrspace(4)* [[H1:%value.i13]]
-  // CHECK: [[H_float:%.*]] = call spir_func float @__devicelib_ConvertBF16ToFINTEL(i16 addrspace(4)* align 2 dereferenceable(2) [[H1]])
+  // CHECK: [[H_float:%.*]] = call spir_func float @__devicelib_ConvertBF16ToFINTEL(i16 {{.*}} [[H1]])
   // CHECK: [[H:%.*]] = fptrunc float [[H_float]] to half
   foo(L, H);
 
   return A;
-  // CHECK: [[RetVal:%.*]] = call spir_func float @__devicelib_ConvertBF16ToFINTEL(i16 addrspace(4)* align 2 dereferenceable(2) %value.i)
+  // CHECK: [[RetVal:%.*]] = call spir_func float @__devicelib_ConvertBF16ToFINTEL(i16 {{.*}} %value.i)
   // CHECK: ret float [[RetVal]]
   // CHECK-NOT: uitofp
   // CHECK-NOT: fptoui

From 6d45ed1796d350a39de53ba4c0f6ab4e92a97a30 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Thu, 27 Oct 2022 14:28:47 -0700
Subject: [PATCH 58/63] Adjustments to more tests.

---
 .../properties/properties_kernel.cpp          |  8 ++++---
 .../properties_kernel_device_has.cpp          | 23 ++++++++++---------
 .../properties_kernel_device_has_macro.cpp    | 19 +++++++--------
 3 files changed, 27 insertions(+), 23 deletions(-)

diff --git a/sycl/test/extensions/properties/properties_kernel.cpp b/sycl/test/extensions/properties/properties_kernel.cpp
index 689a236ccf660..4a683b59be004 100644
--- a/sycl/test/extensions/properties/properties_kernel.cpp
+++ b/sycl/test/extensions/properties/properties_kernel.cpp
@@ -25,7 +25,8 @@ using device_has_all =
              aspect::ext_intel_device_info_uuid, aspect::ext_oneapi_srgb,
              aspect::ext_oneapi_native_assert, aspect::host_debuggable,
              aspect::ext_intel_gpu_hw_threads_per_eu,
-             aspect::ext_oneapi_cuda_async_barrier, aspect::ext_oneapi_bfloat16,
+             aspect::ext_oneapi_cuda_async_barrier,
+             aspect::ext_oneapi_bfloat16_math_functions,
              aspect::ext_intel_free_memory, aspect::ext_intel_device_id>);
 
 template <aspect Aspect> inline void singleAspectDeviceHasChecks() {
@@ -117,7 +118,7 @@ int main() {
   singleAspectDeviceHasChecks<aspect::host_debuggable>();
   singleAspectDeviceHasChecks<aspect::ext_intel_gpu_hw_threads_per_eu>();
   singleAspectDeviceHasChecks<aspect::ext_oneapi_cuda_async_barrier>();
-  singleAspectDeviceHasChecks<aspect::ext_oneapi_bfloat16>();
+  singleAspectDeviceHasChecks<aspect::ext_oneapi_bfloat16_math_functions>();
   singleAspectDeviceHasChecks<aspect::ext_intel_free_memory>();
   singleAspectDeviceHasChecks<aspect::ext_intel_device_id>();
 
@@ -171,7 +172,8 @@ int main() {
                 aspect::ext_intel_gpu_hw_threads_per_eu);
   static_assert(device_has_all::value[32] ==
                 aspect::ext_oneapi_cuda_async_barrier);
-  static_assert(device_has_all::value[33] == aspect::ext_oneapi_bfloat16);
+  static_assert(device_has_all::value[33] ==
+                aspect::ext_oneapi_bfloat16_math_functions);
   static_assert(device_has_all::value[34] == aspect::ext_intel_free_memory);
   static_assert(device_has_all::value[35] == aspect::ext_intel_device_id);
 
diff --git a/sycl/test/extensions/properties/properties_kernel_device_has.cpp b/sycl/test/extensions/properties/properties_kernel_device_has.cpp
index e25a910ba5cfb..d41f7d08ecc3a 100644
--- a/sycl/test/extensions/properties/properties_kernel_device_has.cpp
+++ b/sycl/test/extensions/properties/properties_kernel_device_has.cpp
@@ -8,13 +8,14 @@ using namespace sycl;
 using namespace ext::oneapi::experimental;
 
 static constexpr auto device_has_all = device_has<
-    aspect::ext_oneapi_cuda_async_barrier, aspect::ext_oneapi_bfloat16,
-    aspect::custom, aspect::fp16, aspect::fp64, aspect::image,
-    aspect::online_compiler, aspect::online_linker, aspect::queue_profiling,
-    aspect::usm_device_allocations, aspect::usm_restricted_shared_allocations,
-    aspect::usm_system_allocations, aspect::ext_intel_pci_address, aspect::host,
-    aspect::cpu, aspect::gpu, aspect::accelerator,
-    aspect::ext_intel_gpu_eu_count, aspect::ext_intel_gpu_subslices_per_slice,
+    aspect::ext_oneapi_cuda_async_barrier,
+    aspect::ext_oneapi_bfloat16_math_functions, aspect::custom, aspect::fp16,
+    aspect::fp64, aspect::image, aspect::online_compiler, aspect::online_linker,
+    aspect::queue_profiling, aspect::usm_device_allocations,
+    aspect::usm_restricted_shared_allocations, aspect::usm_system_allocations,
+    aspect::ext_intel_pci_address, aspect::host, aspect::cpu, aspect::gpu,
+    aspect::accelerator, aspect::ext_intel_gpu_eu_count,
+    aspect::ext_intel_gpu_subslices_per_slice,
     aspect::ext_intel_gpu_eu_count_per_subslice,
     aspect::ext_intel_max_mem_bandwidth, aspect::ext_intel_mem_channel,
     aspect::usm_atomic_host_allocations, aspect::usm_atomic_shared_allocations,
@@ -131,7 +132,7 @@ int main() {
 }
 
 // CHECK-IR-DAG: !{{[0-9]+}} = !{!"ext_oneapi_cuda_async_barrier", i32 [[ext_oneapi_cuda_async_barrier_ASPECT_MD:[0-9]+]]}
-// CHECK-IR-DAG: !{{[0-9]+}} = !{!"ext_oneapi_bfloat16", i32 [[ext_oneapi_bfloat16_ASPECT_MD:[0-9]+]]}
+// CHECK-IR-DAG: !{{[0-9]+}} = !{!"ext_oneapi_bfloat16_math_functions", i32 [[ext_oneapi_bfloat16_math_functions_ASPECT_MD:[0-9]+]]}
 // CHECK-IR-DAG: !{{[0-9]+}} = !{!"custom", i32 [[custom_ASPECT_MD:[0-9]+]]}
 // CHECK-IR-DAG: !{{[0-9]+}} = !{!"fp16", i32 [[fp16_ASPECT_MD:[0-9]+]]}
 // CHECK-IR-DAG: !{{[0-9]+}} = !{!"fp64", i32 [[fp64_ASPECT_MD:[0-9]+]]}
@@ -167,6 +168,6 @@ int main() {
 // CHECK-IR-DAG: !{{[0-9]+}} = !{!"ext_intel_free_memory", i32 [[ext_intel_free_memory_ASPECT_MD:[0-9]+]]}
 // CHECK-IR-DAG: !{{[0-9]+}} = !{!"ext_intel_device_id", i32 [[ext_intel_device_id_ASPECT_MD:[0-9]+]]}
 
-// CHECK-IR-DAG: attributes #[[DHAttr1]] = { {{.*}}"sycl-device-has"="[[ext_oneapi_cuda_async_barrier_ASPECT_MD]],[[ext_oneapi_bfloat16_ASPECT_MD]],[[custom_ASPECT_MD]],[[fp16_ASPECT_MD]],[[fp64_ASPECT_MD]],[[image_ASPECT_MD]],[[online_compiler_ASPECT_MD]],[[online_linker_ASPECT_MD]],[[queue_profiling_ASPECT_MD]],[[usm_device_allocations_ASPECT_MD]],[[usm_restricted_shared_allocations_ASPECT_MD]],[[usm_system_allocations_ASPECT_MD]],[[ext_intel_pci_address_ASPECT_MD]],[[host_ASPECT_MD]],[[cpu_ASPECT_MD]],[[gpu_ASPECT_MD]],[[accelerator_ASPECT_MD]],[[ext_intel_gpu_eu_count_ASPECT_MD]],[[ext_intel_gpu_subslices_per_slice_ASPECT_MD]],[[ext_intel_gpu_eu_count_per_subslice_ASPECT_MD]],[[ext_intel_max_mem_bandwidth_ASPECT_MD]],[[ext_intel_mem_channel_ASPECT_MD]],[[usm_atomic_host_allocations_ASPECT_MD]],[[usm_atomic_shared_allocations_ASPECT_MD]],[[atomic64_ASPECT_MD]],[[ext_intel_device_info_uuid_ASPECT_MD]],[[ext_oneapi_srgb_ASPECT_MD]],[[ext_intel_gpu_eu_simd_width_ASPECT_MD]],[[ext_intel_gpu_slices_ASPECT_MD]],[[ext_oneapi_native_assert_ASPECT_MD]],[[host_debuggable_ASPECT_MD]],[[ext_intel_gpu_hw_threads_per_eu_ASPECT_MD]],[[usm_host_allocations_ASPECT_MD]],[[usm_shared_allocations_ASPECT_MD]],[[ext_intel_free_memory_ASPECT_MD]],[[ext_intel_device_id_ASPECT_MD]]"
-// CHECK-IR-DAG: attributes #[[DHAttr2]] = { {{.*}}"sycl-device-has"="[[ext_oneapi_cuda_async_barrier_ASPECT_MD]],[[ext_oneapi_bfloat16_ASPECT_MD]],[[custom_ASPECT_MD]],[[fp16_ASPECT_MD]],[[fp64_ASPECT_MD]],[[image_ASPECT_MD]],[[online_compiler_ASPECT_MD]],[[online_linker_ASPECT_MD]],[[queue_profiling_ASPECT_MD]],[[usm_device_allocations_ASPECT_MD]],[[usm_restricted_shared_allocations_ASPECT_MD]],[[usm_system_allocations_ASPECT_MD]],[[ext_intel_pci_address_ASPECT_MD]],[[host_ASPECT_MD]],[[cpu_ASPECT_MD]],[[gpu_ASPECT_MD]],[[accelerator_ASPECT_MD]],[[ext_intel_gpu_eu_count_ASPECT_MD]],[[ext_intel_gpu_subslices_per_slice_ASPECT_MD]],[[ext_intel_gpu_eu_count_per_subslice_ASPECT_MD]],[[ext_intel_max_mem_bandwidth_ASPECT_MD]],[[ext_intel_mem_channel_ASPECT_MD]],[[usm_atomic_host_allocations_ASPECT_MD]],[[usm_atomic_shared_allocations_ASPECT_MD]],[[atomic64_ASPECT_MD]],[[ext_intel_device_info_uuid_ASPECT_MD]],[[ext_oneapi_srgb_ASPECT_MD]],[[ext_intel_gpu_eu_simd_width_ASPECT_MD]],[[ext_intel_gpu_slices_ASPECT_MD]],[[ext_oneapi_native_assert_ASPECT_MD]],[[host_debuggable_ASPECT_MD]],[[ext_intel_gpu_hw_threads_per_eu_ASPECT_MD]],[[usm_host_allocations_ASPECT_MD]],[[usm_shared_allocations_ASPECT_MD]],[[ext_intel_free_memory_ASPECT_MD]],[[ext_intel_device_id_ASPECT_MD]]"
-// CHECK-IR-DAG: attributes #[[DHAttr3]] = { {{.*}}"sycl-device-has"="[[ext_oneapi_cuda_async_barrier_ASPECT_MD]],[[ext_oneapi_bfloat16_ASPECT_MD]],[[custom_ASPECT_MD]],[[fp16_ASPECT_MD]],[[fp64_ASPECT_MD]],[[image_ASPECT_MD]],[[online_compiler_ASPECT_MD]],[[online_linker_ASPECT_MD]],[[queue_profiling_ASPECT_MD]],[[usm_device_allocations_ASPECT_MD]],[[usm_restricted_shared_allocations_ASPECT_MD]],[[usm_system_allocations_ASPECT_MD]],[[ext_intel_pci_address_ASPECT_MD]],[[host_ASPECT_MD]],[[cpu_ASPECT_MD]],[[gpu_ASPECT_MD]],[[accelerator_ASPECT_MD]],[[ext_intel_gpu_eu_count_ASPECT_MD]],[[ext_intel_gpu_subslices_per_slice_ASPECT_MD]],[[ext_intel_gpu_eu_count_per_subslice_ASPECT_MD]],[[ext_intel_max_mem_bandwidth_ASPECT_MD]],[[ext_intel_mem_channel_ASPECT_MD]],[[usm_atomic_host_allocations_ASPECT_MD]],[[usm_atomic_shared_allocations_ASPECT_MD]],[[atomic64_ASPECT_MD]],[[ext_intel_device_info_uuid_ASPECT_MD]],[[ext_oneapi_srgb_ASPECT_MD]],[[ext_intel_gpu_eu_simd_width_ASPECT_MD]],[[ext_intel_gpu_slices_ASPECT_MD]],[[ext_oneapi_native_assert_ASPECT_MD]],[[host_debuggable_ASPECT_MD]],[[ext_intel_gpu_hw_threads_per_eu_ASPECT_MD]],[[usm_host_allocations_ASPECT_MD]],[[usm_shared_allocations_ASPECT_MD]],[[ext_intel_free_memory_ASPECT_MD]],[[ext_intel_device_id_ASPECT_MD]]"
+// CHECK-IR-DAG: attributes #[[DHAttr1]] = { {{.*}}"sycl-device-has"="[[ext_oneapi_cuda_async_barrier_ASPECT_MD]],[[ext_oneapi_bfloat16_math_functions_ASPECT_MD]],[[custom_ASPECT_MD]],[[fp16_ASPECT_MD]],[[fp64_ASPECT_MD]],[[image_ASPECT_MD]],[[online_compiler_ASPECT_MD]],[[online_linker_ASPECT_MD]],[[queue_profiling_ASPECT_MD]],[[usm_device_allocations_ASPECT_MD]],[[usm_restricted_shared_allocations_ASPECT_MD]],[[usm_system_allocations_ASPECT_MD]],[[ext_intel_pci_address_ASPECT_MD]],[[host_ASPECT_MD]],[[cpu_ASPECT_MD]],[[gpu_ASPECT_MD]],[[accelerator_ASPECT_MD]],[[ext_intel_gpu_eu_count_ASPECT_MD]],[[ext_intel_gpu_subslices_per_slice_ASPECT_MD]],[[ext_intel_gpu_eu_count_per_subslice_ASPECT_MD]],[[ext_intel_max_mem_bandwidth_ASPECT_MD]],[[ext_intel_mem_channel_ASPECT_MD]],[[usm_atomic_host_allocations_ASPECT_MD]],[[usm_atomic_shared_allocations_ASPECT_MD]],[[atomic64_ASPECT_MD]],[[ext_intel_device_info_uuid_ASPECT_MD]],[[ext_oneapi_srgb_ASPECT_MD]],[[ext_intel_gpu_eu_simd_width_ASPECT_MD]],[[ext_intel_gpu_slices_ASPECT_MD]],[[ext_oneapi_native_assert_ASPECT_MD]],[[host_debuggable_ASPECT_MD]],[[ext_intel_gpu_hw_threads_per_eu_ASPECT_MD]],[[usm_host_allocations_ASPECT_MD]],[[usm_shared_allocations_ASPECT_MD]],[[ext_intel_free_memory_ASPECT_MD]],[[ext_intel_device_id_ASPECT_MD]]"
+// CHECK-IR-DAG: attributes #[[DHAttr2]] = { {{.*}}"sycl-device-has"="[[ext_oneapi_cuda_async_barrier_ASPECT_MD]],[[ext_oneapi_bfloat16_math_functions_ASPECT_MD]],[[custom_ASPECT_MD]],[[fp16_ASPECT_MD]],[[fp64_ASPECT_MD]],[[image_ASPECT_MD]],[[online_compiler_ASPECT_MD]],[[online_linker_ASPECT_MD]],[[queue_profiling_ASPECT_MD]],[[usm_device_allocations_ASPECT_MD]],[[usm_restricted_shared_allocations_ASPECT_MD]],[[usm_system_allocations_ASPECT_MD]],[[ext_intel_pci_address_ASPECT_MD]],[[host_ASPECT_MD]],[[cpu_ASPECT_MD]],[[gpu_ASPECT_MD]],[[accelerator_ASPECT_MD]],[[ext_intel_gpu_eu_count_ASPECT_MD]],[[ext_intel_gpu_subslices_per_slice_ASPECT_MD]],[[ext_intel_gpu_eu_count_per_subslice_ASPECT_MD]],[[ext_intel_max_mem_bandwidth_ASPECT_MD]],[[ext_intel_mem_channel_ASPECT_MD]],[[usm_atomic_host_allocations_ASPECT_MD]],[[usm_atomic_shared_allocations_ASPECT_MD]],[[atomic64_ASPECT_MD]],[[ext_intel_device_info_uuid_ASPECT_MD]],[[ext_oneapi_srgb_ASPECT_MD]],[[ext_intel_gpu_eu_simd_width_ASPECT_MD]],[[ext_intel_gpu_slices_ASPECT_MD]],[[ext_oneapi_native_assert_ASPECT_MD]],[[host_debuggable_ASPECT_MD]],[[ext_intel_gpu_hw_threads_per_eu_ASPECT_MD]],[[usm_host_allocations_ASPECT_MD]],[[usm_shared_allocations_ASPECT_MD]],[[ext_intel_free_memory_ASPECT_MD]],[[ext_intel_device_id_ASPECT_MD]]"
+// CHECK-IR-DAG: attributes #[[DHAttr3]] = { {{.*}}"sycl-device-has"="[[ext_oneapi_cuda_async_barrier_ASPECT_MD]],[[ext_oneapi_bfloat16_math_functions_ASPECT_MD]],[[custom_ASPECT_MD]],[[fp16_ASPECT_MD]],[[fp64_ASPECT_MD]],[[image_ASPECT_MD]],[[online_compiler_ASPECT_MD]],[[online_linker_ASPECT_MD]],[[queue_profiling_ASPECT_MD]],[[usm_device_allocations_ASPECT_MD]],[[usm_restricted_shared_allocations_ASPECT_MD]],[[usm_system_allocations_ASPECT_MD]],[[ext_intel_pci_address_ASPECT_MD]],[[host_ASPECT_MD]],[[cpu_ASPECT_MD]],[[gpu_ASPECT_MD]],[[accelerator_ASPECT_MD]],[[ext_intel_gpu_eu_count_ASPECT_MD]],[[ext_intel_gpu_subslices_per_slice_ASPECT_MD]],[[ext_intel_gpu_eu_count_per_subslice_ASPECT_MD]],[[ext_intel_max_mem_bandwidth_ASPECT_MD]],[[ext_intel_mem_channel_ASPECT_MD]],[[usm_atomic_host_allocations_ASPECT_MD]],[[usm_atomic_shared_allocations_ASPECT_MD]],[[atomic64_ASPECT_MD]],[[ext_intel_device_info_uuid_ASPECT_MD]],[[ext_oneapi_srgb_ASPECT_MD]],[[ext_intel_gpu_eu_simd_width_ASPECT_MD]],[[ext_intel_gpu_slices_ASPECT_MD]],[[ext_oneapi_native_assert_ASPECT_MD]],[[host_debuggable_ASPECT_MD]],[[ext_intel_gpu_hw_threads_per_eu_ASPECT_MD]],[[usm_host_allocations_ASPECT_MD]],[[usm_shared_allocations_ASPECT_MD]],[[ext_intel_free_memory_ASPECT_MD]],[[ext_intel_device_id_ASPECT_MD]]"
diff --git a/sycl/test/extensions/properties/properties_kernel_device_has_macro.cpp b/sycl/test/extensions/properties/properties_kernel_device_has_macro.cpp
index abb6480c10379..a5ef8e5150782 100644
--- a/sycl/test/extensions/properties/properties_kernel_device_has_macro.cpp
+++ b/sycl/test/extensions/properties/properties_kernel_device_has_macro.cpp
@@ -8,13 +8,14 @@ using namespace sycl;
 using namespace ext::oneapi::experimental;
 
 static constexpr auto device_has_all = device_has<
-    aspect::ext_oneapi_cuda_async_barrier, aspect::ext_oneapi_bfloat16,
-    aspect::custom, aspect::fp16, aspect::fp64, aspect::image,
-    aspect::online_compiler, aspect::online_linker, aspect::queue_profiling,
-    aspect::usm_device_allocations, aspect::usm_restricted_shared_allocations,
-    aspect::usm_system_allocations, aspect::ext_intel_pci_address, aspect::host,
-    aspect::cpu, aspect::gpu, aspect::accelerator,
-    aspect::ext_intel_gpu_eu_count, aspect::ext_intel_gpu_subslices_per_slice,
+    aspect::ext_oneapi_cuda_async_barrier,
+    aspect::ext_oneapi_bfloat16_math_functions, aspect::custom, aspect::fp16,
+    aspect::fp64, aspect::image, aspect::online_compiler, aspect::online_linker,
+    aspect::queue_profiling, aspect::usm_device_allocations,
+    aspect::usm_restricted_shared_allocations, aspect::usm_system_allocations,
+    aspect::ext_intel_pci_address, aspect::host, aspect::cpu, aspect::gpu,
+    aspect::accelerator, aspect::ext_intel_gpu_eu_count,
+    aspect::ext_intel_gpu_subslices_per_slice,
     aspect::ext_intel_gpu_eu_count_per_subslice,
     aspect::ext_intel_max_mem_bandwidth, aspect::ext_intel_mem_channel,
     aspect::usm_atomic_host_allocations, aspect::usm_atomic_shared_allocations,
@@ -55,7 +56,7 @@ int main() {
 }
 
 // CHECK-IR-DAG: !{{[0-9]+}} = !{!"ext_oneapi_cuda_async_barrier", i32 [[ext_oneapi_cuda_async_barrier_ASPECT_MD:[0-9]+]]}
-// CHECK-IR-DAG: !{{[0-9]+}} = !{!"ext_oneapi_bfloat16", i32 [[ext_oneapi_bfloat16_ASPECT_MD:[0-9]+]]}
+// CHECK-IR-DAG: !{{[0-9]+}} = !{!"ext_oneapi_bfloat16_math_functions", i32 [[ext_oneapi_bfloat16_math_functions_ASPECT_MD:[0-9]+]]}
 // CHECK-IR-DAG: !{{[0-9]+}} = !{!"custom", i32 [[custom_ASPECT_MD:[0-9]+]]}
 // CHECK-IR-DAG: !{{[0-9]+}} = !{!"fp16", i32 [[fp16_ASPECT_MD:[0-9]+]]}
 // CHECK-IR-DAG: !{{[0-9]+}} = !{!"fp64", i32 [[fp64_ASPECT_MD:[0-9]+]]}
@@ -91,6 +92,6 @@ int main() {
 // CHECK-IR-DAG: !{{[0-9]+}} = !{!"ext_intel_free_memory", i32 [[ext_intel_free_memory_ASPECT_MD:[0-9]+]]}
 // CHECK-IR-DAG: !{{[0-9]+}} = !{!"ext_intel_device_id", i32 [[ext_intel_device_id_ASPECT_MD:[0-9]+]]}
 
-// CHECK-IR-DAG: attributes #[[DHAttr1]] = { {{.*}}"sycl-device-has"="[[ext_oneapi_cuda_async_barrier_ASPECT_MD]],[[ext_oneapi_bfloat16_ASPECT_MD]],[[custom_ASPECT_MD]],[[fp16_ASPECT_MD]],[[fp64_ASPECT_MD]],[[image_ASPECT_MD]],[[online_compiler_ASPECT_MD]],[[online_linker_ASPECT_MD]],[[queue_profiling_ASPECT_MD]],[[usm_device_allocations_ASPECT_MD]],[[usm_restricted_shared_allocations_ASPECT_MD]],[[usm_system_allocations_ASPECT_MD]],[[ext_intel_pci_address_ASPECT_MD]],[[host_ASPECT_MD]],[[cpu_ASPECT_MD]],[[gpu_ASPECT_MD]],[[accelerator_ASPECT_MD]],[[ext_intel_gpu_eu_count_ASPECT_MD]],[[ext_intel_gpu_subslices_per_slice_ASPECT_MD]],[[ext_intel_gpu_eu_count_per_subslice_ASPECT_MD]],[[ext_intel_max_mem_bandwidth_ASPECT_MD]],[[ext_intel_mem_channel_ASPECT_MD]],[[usm_atomic_host_allocations_ASPECT_MD]],[[usm_atomic_shared_allocations_ASPECT_MD]],[[atomic64_ASPECT_MD]],[[ext_intel_device_info_uuid_ASPECT_MD]],[[ext_oneapi_srgb_ASPECT_MD]],[[ext_intel_gpu_eu_simd_width_ASPECT_MD]],[[ext_intel_gpu_slices_ASPECT_MD]],[[ext_oneapi_native_assert_ASPECT_MD]],[[host_debuggable_ASPECT_MD]],[[ext_intel_gpu_hw_threads_per_eu_ASPECT_MD]],[[usm_host_allocations_ASPECT_MD]],[[usm_shared_allocations_ASPECT_MD]],[[ext_intel_free_memory_ASPECT_MD]],[[ext_intel_device_id_ASPECT_MD]]"
+// CHECK-IR-DAG: attributes #[[DHAttr1]] = { {{.*}}"sycl-device-has"="[[ext_oneapi_cuda_async_barrier_ASPECT_MD]],[[ext_oneapi_bfloat16_math_functions_ASPECT_MD]],[[custom_ASPECT_MD]],[[fp16_ASPECT_MD]],[[fp64_ASPECT_MD]],[[image_ASPECT_MD]],[[online_compiler_ASPECT_MD]],[[online_linker_ASPECT_MD]],[[queue_profiling_ASPECT_MD]],[[usm_device_allocations_ASPECT_MD]],[[usm_restricted_shared_allocations_ASPECT_MD]],[[usm_system_allocations_ASPECT_MD]],[[ext_intel_pci_address_ASPECT_MD]],[[host_ASPECT_MD]],[[cpu_ASPECT_MD]],[[gpu_ASPECT_MD]],[[accelerator_ASPECT_MD]],[[ext_intel_gpu_eu_count_ASPECT_MD]],[[ext_intel_gpu_subslices_per_slice_ASPECT_MD]],[[ext_intel_gpu_eu_count_per_subslice_ASPECT_MD]],[[ext_intel_max_mem_bandwidth_ASPECT_MD]],[[ext_intel_mem_channel_ASPECT_MD]],[[usm_atomic_host_allocations_ASPECT_MD]],[[usm_atomic_shared_allocations_ASPECT_MD]],[[atomic64_ASPECT_MD]],[[ext_intel_device_info_uuid_ASPECT_MD]],[[ext_oneapi_srgb_ASPECT_MD]],[[ext_intel_gpu_eu_simd_width_ASPECT_MD]],[[ext_intel_gpu_slices_ASPECT_MD]],[[ext_oneapi_native_assert_ASPECT_MD]],[[host_debuggable_ASPECT_MD]],[[ext_intel_gpu_hw_threads_per_eu_ASPECT_MD]],[[usm_host_allocations_ASPECT_MD]],[[usm_shared_allocations_ASPECT_MD]],[[ext_intel_free_memory_ASPECT_MD]],[[ext_intel_device_id_ASPECT_MD]]"
 // CHECK-IR-DAG: attributes #[[DHAttr2]] = { {{.*}}"sycl-device-has" {{.*}}
 // CHECK-IR-DAG: attributes #[[DHAttr3]] = { {{.*}}"sycl-device-has"="[[fp16_ASPECT_MD]],[[atomic64_ASPECT_MD]]"

From 077d0fe4ee9a99c11c5d9c1fadb61d9e3e378f2d Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Fri, 28 Oct 2022 08:53:21 -0700
Subject: [PATCH 59/63] Change to tests to ensure AOT components are available.

---
 clang/test/Driver/sycl-bfloat16-lib-win.cpp | 2 ++
 clang/test/Driver/sycl-bfloat16-lib.cpp     | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/clang/test/Driver/sycl-bfloat16-lib-win.cpp b/clang/test/Driver/sycl-bfloat16-lib-win.cpp
index ecba163add3ec..f5f9c8c97d699 100755
--- a/clang/test/Driver/sycl-bfloat16-lib-win.cpp
+++ b/clang/test/Driver/sycl-bfloat16-lib-win.cpp
@@ -3,6 +3,8 @@
 ///
 
 // REQUIRES: windows
+// REQUIRES: opencl-aot, ocloc, cpu, gpu
+// UNSUPPORTED: cuda
 
 /// ###########################################################################
 /// test that no bfloat16 libraries are added in JIT mode
diff --git a/clang/test/Driver/sycl-bfloat16-lib.cpp b/clang/test/Driver/sycl-bfloat16-lib.cpp
index 95f1fc9d24773..54066a3c56940 100755
--- a/clang/test/Driver/sycl-bfloat16-lib.cpp
+++ b/clang/test/Driver/sycl-bfloat16-lib.cpp
@@ -3,6 +3,8 @@
 ///
 
 // UNSUPPORTED: system-windows
+// REQUIRES: opencl-aot, ocloc, cpu, gpu
+// UNSUPPORTED: cuda
 
 /// ###########################################################################
 /// test that no bfloat16 libraries are added in JIT mode

From d7c80eebfd13547891b4d9941a971f9c822b568c Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Mon, 7 Nov 2022 09:43:33 -0800
Subject: [PATCH 60/63] Adjustment to test for new bfloat16 header.

---
 sycl/test/extensions/bfloat16.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sycl/test/extensions/bfloat16.cpp b/sycl/test/extensions/bfloat16.cpp
index f96fa5ace3ae5..36d49294e2ab4 100644
--- a/sycl/test/extensions/bfloat16.cpp
+++ b/sycl/test/extensions/bfloat16.cpp
@@ -1,4 +1,4 @@
-// RUN: %clangxx -fsycl-device-only -fsycl-targets=%sycl_triple -Xsycl-target-backend=%sycl_triple "-device pvc" -S -Xclang -no-enable-noundef-analysis %s -o - | FileCheck %s
+// RUN: %clangxx -fsycl-device-only -fsycl-targets=%sycl_triple -S -Xclang -no-enable-noundef-analysis %s -o - | FileCheck %s
 
 // UNSUPPORTED: cuda || hip_amd
 
@@ -24,7 +24,7 @@ __attribute__((noinline)) float op(float a, float b) {
   // CHECK: [[A_float:%.*]] = call spir_func float @__devicelib_ConvertBF16ToFINTEL(i16 {{.*}} %value.i)
   // CHECK: [[B_float:%.*]] = call spir_func float @__devicelib_ConvertBF16ToFINTEL(i16 {{.*}} %value.i7)
   // CHECK: [[Add:%.*]] = fadd float [[A_float]], [[B_float]]
-  // CHECK: store float [[Add]], float addrspace(4)* [[Add1:%ref.tmp.ascast.i]], align 4, !tbaa !48, !noalias !55
+  // CHECK: store float [[Add]], float addrspace(4)* [[Add1:%ref.tmp.ascast.i]], align 4
   // CHECK: [[C:%.*]] = call spir_func zeroext i16 @__devicelib_ConvertFToBF16INTEL(float {{.*}} [[Add1]])
   // CHECK-NOT: uitofp
   // CHECK-NOT: fptoui

From 6ec2bb9471c96e5fb0202dcad92fa09c3b51dc6c Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Tue, 22 Nov 2022 06:27:15 -0800
Subject: [PATCH 61/63] Changes for indirect accesses.

---
 sycl/test/extensions/bfloat16.cpp | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/sycl/test/extensions/bfloat16.cpp b/sycl/test/extensions/bfloat16.cpp
index 36d49294e2ab4..31c5780e20fc7 100644
--- a/sycl/test/extensions/bfloat16.cpp
+++ b/sycl/test/extensions/bfloat16.cpp
@@ -21,29 +21,33 @@ __attribute__((noinline)) float op(float a, float b) {
   // CHECK-NOT: fptoui
 
   bfloat16 C = A + B;
-  // CHECK: [[A_float:%.*]] = call spir_func float @__devicelib_ConvertBF16ToFINTEL(i16 {{.*}} %value.i)
-  // CHECK: [[B_float:%.*]] = call spir_func float @__devicelib_ConvertBF16ToFINTEL(i16 {{.*}} %value.i7)
+  // CHECK: [[RTCASTI:%ref.tmp.ascast.i]] = addrspacecast float* [[RT:%ref.tmp.i]] to float addrspace(4)*
+  // CHECK: [[A_float:%.*]] = call spir_func float @__devicelib_ConvertBF16ToFINTEL(i16 {{.*}} %1)
+  // CHECK: [[B_float:%.*]] = call spir_func float @__devicelib_ConvertBF16ToFINTEL(i16 {{.*}} %4)
   // CHECK: [[Add:%.*]] = fadd float [[A_float]], [[B_float]]
-  // CHECK: store float [[Add]], float addrspace(4)* [[Add1:%ref.tmp.ascast.i]], align 4
-  // CHECK: [[C:%.*]] = call spir_func zeroext i16 @__devicelib_ConvertFToBF16INTEL(float {{.*}} [[Add1]])
+  // CHECK: store float [[Add]], float* [[RT]], align 4
+  // CHECK: [[C:%.*]] = call spir_func zeroext i16 @__devicelib_ConvertFToBF16INTEL(float {{.*}} [[RTCASTI]])
+
   // CHECK-NOT: uitofp
   // CHECK-NOT: fptoui
 
   long L = bfloat16(3.14f);
   // CHECK: [[L:%.*]] = call spir_func zeroext i16 @__devicelib_ConvertFToBF16INTEL(float {{.*}} %ref.tmp1.ascast)
-  // CHECK: store i16 [[L]], i16 addrspace(4)* [[L1:%value.i9]]
-  // CHECK: [[L_float:%.*]] = call spir_func float @__devicelib_ConvertBF16ToFINTEL(i16 {{.*}} [[L1]])
+  // CHECK: [[P8:%.*]] = addrspacecast i16* [[VI9:%.*]] to i16 addrspace(4)*
+  // CHECK: store i16 [[L]], i16* [[VI9]]
+  // CHECK: [[L_float:%.*]] = call spir_func float @__devicelib_ConvertBF16ToFINTEL(i16 {{.*}} [[P8]])
   // CHECK: [[L:%.*]] = fptosi float [[L_float]] to i{{32|64}}
 
   sycl::half H = bfloat16(2.71f);
   // CHECK: [[H:%.*]] = call spir_func zeroext i16 @__devicelib_ConvertFToBF16INTEL(float {{.*}} %ref.tmp3.ascast)
-  // CHECK: store i16 [[H]], i16 addrspace(4)* [[H1:%value.i13]]
-  // CHECK: [[H_float:%.*]] = call spir_func float @__devicelib_ConvertBF16ToFINTEL(i16 {{.*}} [[H1]])
+  // CHECK: [[P11:%.*]] = addrspacecast i16* [[VI13:%.*]] to i16 addrspace(4)*
+  // CHECK: store i16 [[H]], i16* [[VI13]], align 2
+  // CHECK: [[H_float:%.*]] = call spir_func float @__devicelib_ConvertBF16ToFINTEL(i16 {{.*}} [[P11]])
   // CHECK: [[H:%.*]] = fptrunc float [[H_float]] to half
   foo(L, H);
 
   return A;
-  // CHECK: [[RetVal:%.*]] = call spir_func float @__devicelib_ConvertBF16ToFINTEL(i16 {{.*}} %value.i)
+  // CHECK: [[RetVal:%.*]] = call spir_func float @__devicelib_ConvertBF16ToFINTEL(i16 {{.*}} %2)
   // CHECK: ret float [[RetVal]]
   // CHECK-NOT: uitofp
   // CHECK-NOT: fptoui

From e24e57b57b6f304dab929f8e865b2edaf4a8e224 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Tue, 22 Nov 2022 22:43:27 -0800
Subject: [PATCH 62/63] Fixed conflicts.

---
 libdevice/cmake/modules/SYCLLibdevice.cmake   |  5 +-
 .../sycl-post-link/SYCLDeviceLibReqMask.cpp   | 16 ++---
 .../sycl-post-link/SYCLDeviceLibReqMask.h     |  5 +-
 .../program_manager/program_manager.cpp       | 58 +------------------
 .../program_manager/program_manager.hpp       |  5 +-
 5 files changed, 11 insertions(+), 78 deletions(-)

diff --git a/libdevice/cmake/modules/SYCLLibdevice.cmake b/libdevice/cmake/modules/SYCLLibdevice.cmake
index 4fd49fce5dd5c..080e2e575e540 100644
--- a/libdevice/cmake/modules/SYCLLibdevice.cmake
+++ b/libdevice/cmake/modules/SYCLLibdevice.cmake
@@ -114,11 +114,8 @@ add_devicelib_obj(libsycl-cmath SRC cmath_wrapper.cpp DEP ${cmath_obj_deps})
 add_devicelib_obj(libsycl-cmath-fp64 SRC cmath_wrapper_fp64.cpp DEP ${cmath_obj_deps} )
 add_devicelib_obj(libsycl-imf SRC imf_wrapper.cpp DEP ${imf_obj_deps})
 add_devicelib_obj(libsycl-imf-fp64 SRC imf_wrapper_fp64.cpp DEP ${imf_obj_deps})
-<<<<<<< HEAD
-add_devicelib_obj(libsycl-bfloat16 SRC bfloat16_wrapper.cpp DEP ${cmath_obj_deps} )
-=======
 add_devicelib_obj(libsycl-imf-bf16 SRC imf_wrapper_bf16.cpp DEP ${imf_obj_deps})
->>>>>>> ccd16396310cd2a827c68c0fac1985121fc4a8c7
+add_devicelib_obj(libsycl-bfloat16 SRC bfloat16_wrapper.cpp DEP ${cmath_obj_deps} )
 if(WIN32)
 add_devicelib_obj(libsycl-msvc-math SRC msvc_math.cpp DEP ${cmath_obj_deps})
 endif()
diff --git a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp
index 36ea928e6f611..9b1c80cc88287 100644
--- a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp
+++ b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp
@@ -474,12 +474,6 @@ SYCLDeviceLibFuncMap SDLMap = {
      DeviceLibExt::cl_intel_devicelib_imf_fp64},
     {"__devicelib_imf_longlong_as_double",
      DeviceLibExt::cl_intel_devicelib_imf_fp64},
-<<<<<<< HEAD
-    {"__devicelib_ConvertFToBF16INTEL",
-     DeviceLibExt::cl_intel_devicelib_bfloat16},
-    {"__devicelib_ConvertBF16ToFINTEL",
-     DeviceLibExt::cl_intel_devicelib_bfloat16},
-=======
     {"__devicelib_imf_bfloat162float",
      DeviceLibExt::cl_intel_devicelib_imf_bf16},
     {"__devicelib_imf_float2bfloat16",
@@ -503,7 +497,10 @@ SYCLDeviceLibFuncMap SDLMap = {
     {"__devicelib_imf_floorbf16", DeviceLibExt::cl_intel_devicelib_imf_bf16},
     {"__devicelib_imf_ceilbf16", DeviceLibExt::cl_intel_devicelib_imf_bf16},
     {"__devicelib_imf_truncbf16", DeviceLibExt::cl_intel_devicelib_imf_bf16},
->>>>>>> ccd16396310cd2a827c68c0fac1985121fc4a8c7
+    {"__devicelib_ConvertFToBF16INTEL",
+     DeviceLibExt::cl_intel_devicelib_bfloat16},
+    {"__devicelib_ConvertBF16ToFINTEL",
+     DeviceLibExt::cl_intel_devicelib_bfloat16},
 };
 
 // Each fallback device library corresponds to one bit in "require mask" which
@@ -518,11 +515,8 @@ SYCLDeviceLibFuncMap SDLMap = {
 // fallback-cstring:      0x20
 // fallback-imf:          0x40
 // fallback-imf-fp64:     0x80
-<<<<<<< HEAD
-// fallback-bfloat16:     0x100
-=======
 // fallback-imf-bf16:     0x100
->>>>>>> ccd16396310cd2a827c68c0fac1985121fc4a8c7
+// fallback-bfloat16:     0x200
 uint32_t getDeviceLibBits(const std::string &FuncName) {
   auto DeviceLibFuncIter = SDLMap.find(FuncName);
   return ((DeviceLibFuncIter == SDLMap.end())
diff --git a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.h b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.h
index b5fd6c5849ef6..c9b737e2d053a 100644
--- a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.h
+++ b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.h
@@ -34,11 +34,8 @@ enum class DeviceLibExt : std::uint32_t {
   cl_intel_devicelib_cstring,
   cl_intel_devicelib_imf,
   cl_intel_devicelib_imf_fp64,
-<<<<<<< HEAD
-  cl_intel_devicelib_bfloat16,
-=======
   cl_intel_devicelib_imf_bf16,
->>>>>>> ccd16396310cd2a827c68c0fac1985121fc4a8c7
+  cl_intel_devicelib_bfloat16,
 };
 
 uint32_t getSYCLDeviceLibReqMask(const Module &M);
diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp
index 3ec460e4839f0..7962dc4e85501 100644
--- a/sycl/source/detail/program_manager/program_manager.cpp
+++ b/sycl/source/detail/program_manager/program_manager.cpp
@@ -800,7 +800,6 @@ static bool loadDeviceLib(const ContextImplPtr Context, const char *Name,
   return Prog != nullptr;
 }
 
-<<<<<<< HEAD
 // For each extension, a pair of library names. The first uses native support,
 // the second emulates functionality in software.
 static const std::map<DeviceLibExt, std::pair<const char *, const char *>>
@@ -821,6 +820,8 @@ static const std::map<DeviceLibExt, std::pair<const char *, const char *>>
          {nullptr, "libsycl-fallback-imf.spv"}},
         {DeviceLibExt::cl_intel_devicelib_imf_fp64,
          {nullptr, "libsycl-fallback-imf-fp64.spv"}},
+        {DeviceLibExt::cl_intel_devicelib_imf_bf16,
+         {nullptr, "libsycl-fallback-imf-bf16.spv"}},
         {DeviceLibExt::cl_intel_devicelib_bfloat16,
          {"libsycl-native-bfloat16.spv", "libsycl-fallback-bfloat16.spv"}}};
 
@@ -833,31 +834,6 @@ static const char *getDeviceLibFilename(DeviceLibExt Extension, bool Native) {
     throw compile_program_error("Unhandled (new?) device library extension",
                                 PI_ERROR_INVALID_OPERATION);
   return Lib;
-=======
-static const char *getDeviceLibFilename(DeviceLibExt Extension) {
-  switch (Extension) {
-  case DeviceLibExt::cl_intel_devicelib_assert:
-    return "libsycl-fallback-cassert.spv";
-  case DeviceLibExt::cl_intel_devicelib_math:
-    return "libsycl-fallback-cmath.spv";
-  case DeviceLibExt::cl_intel_devicelib_math_fp64:
-    return "libsycl-fallback-cmath-fp64.spv";
-  case DeviceLibExt::cl_intel_devicelib_complex:
-    return "libsycl-fallback-complex.spv";
-  case DeviceLibExt::cl_intel_devicelib_complex_fp64:
-    return "libsycl-fallback-complex-fp64.spv";
-  case DeviceLibExt::cl_intel_devicelib_cstring:
-    return "libsycl-fallback-cstring.spv";
-  case DeviceLibExt::cl_intel_devicelib_imf:
-    return "libsycl-fallback-imf.spv";
-  case DeviceLibExt::cl_intel_devicelib_imf_fp64:
-    return "libsycl-fallback-imf-fp64.spv";
-  case DeviceLibExt::cl_intel_devicelib_imf_bf16:
-    return "libsycl-fallback-imf-bf16.spv";
-  }
-  throw compile_program_error("Unhandled (new?) device library extension",
-                              PI_ERROR_INVALID_OPERATION);
->>>>>>> ccd16396310cd2a827c68c0fac1985121fc4a8c7
 }
 
 // For each extension understood by the SYCL runtime, the string representation
@@ -878,36 +854,11 @@ static const std::map<DeviceLibExt, const char *> DeviceLibExtensionStrs = {
      "cl_intel_bfloat16_conversions"}};
 
 static const char *getDeviceLibExtensionStr(DeviceLibExt Extension) {
-<<<<<<< HEAD
   auto Ext = DeviceLibExtensionStrs.find(Extension);
   if (Ext == DeviceLibExtensionStrs.end())
     throw compile_program_error("Unhandled (new?) device library extension",
                                 PI_ERROR_INVALID_OPERATION);
   return Ext->second;
-=======
-  switch (Extension) {
-  case DeviceLibExt::cl_intel_devicelib_assert:
-    return "cl_intel_devicelib_assert";
-  case DeviceLibExt::cl_intel_devicelib_math:
-    return "cl_intel_devicelib_math";
-  case DeviceLibExt::cl_intel_devicelib_math_fp64:
-    return "cl_intel_devicelib_math_fp64";
-  case DeviceLibExt::cl_intel_devicelib_complex:
-    return "cl_intel_devicelib_complex";
-  case DeviceLibExt::cl_intel_devicelib_complex_fp64:
-    return "cl_intel_devicelib_complex_fp64";
-  case DeviceLibExt::cl_intel_devicelib_cstring:
-    return "cl_intel_devicelib_cstring";
-  case DeviceLibExt::cl_intel_devicelib_imf:
-    return "cl_intel_devicelib_imf";
-  case DeviceLibExt::cl_intel_devicelib_imf_fp64:
-    return "cl_intel_devicelib_imf_fp64";
-  case DeviceLibExt::cl_intel_devicelib_imf_bf16:
-    return "cl_intel_devicelib_imf_bf16";
-  }
-  throw compile_program_error("Unhandled (new?) device library extension",
-                              PI_ERROR_INVALID_OPERATION);
->>>>>>> ccd16396310cd2a827c68c0fac1985121fc4a8c7
 }
 
 static RT::PiProgram loadDeviceLibFallback(const ContextImplPtr Context,
@@ -1071,11 +1022,8 @@ getDeviceLibPrograms(const ContextImplPtr Context, const RT::PiDevice &Device,
       {DeviceLibExt::cl_intel_devicelib_cstring, false},
       {DeviceLibExt::cl_intel_devicelib_imf, false},
       {DeviceLibExt::cl_intel_devicelib_imf_fp64, false},
-<<<<<<< HEAD
+      {DeviceLibExt::cl_intel_devicelib_imf_bf16, false},
       {DeviceLibExt::cl_intel_devicelib_bfloat16, false}};
-=======
-      {DeviceLibExt::cl_intel_devicelib_imf_bf16, false}};
->>>>>>> ccd16396310cd2a827c68c0fac1985121fc4a8c7
 
   // Disable all devicelib extensions requiring fp64 support if at least
   // one underlying device doesn't support cl_khr_fp64.
diff --git a/sycl/source/detail/program_manager/program_manager.hpp b/sycl/source/detail/program_manager/program_manager.hpp
index 0daa4b453a10c..bfb39d0cec7e6 100644
--- a/sycl/source/detail/program_manager/program_manager.hpp
+++ b/sycl/source/detail/program_manager/program_manager.hpp
@@ -66,11 +66,8 @@ enum class DeviceLibExt : std::uint32_t {
   cl_intel_devicelib_cstring,
   cl_intel_devicelib_imf,
   cl_intel_devicelib_imf_fp64,
-<<<<<<< HEAD
-  cl_intel_devicelib_bfloat16,
-=======
   cl_intel_devicelib_imf_bf16,
->>>>>>> ccd16396310cd2a827c68c0fac1985121fc4a8c7
+  cl_intel_devicelib_bfloat16,
 };
 
 // Provides single loading and building OpenCL programs with unique contexts

From 37b05f03cb91b1ee04efe41bd5895a9476a2e793 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Thu, 24 Nov 2022 22:54:03 -0800
Subject: [PATCH 63/63] Correction to library list.

---
 sycl/source/detail/program_manager/program_manager.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp
index 7962dc4e85501..77f72ff7a8fc2 100644
--- a/sycl/source/detail/program_manager/program_manager.cpp
+++ b/sycl/source/detail/program_manager/program_manager.cpp
@@ -850,6 +850,7 @@ static const std::map<DeviceLibExt, const char *> DeviceLibExtensionStrs = {
     {DeviceLibExt::cl_intel_devicelib_cstring, "cl_intel_devicelib_cstring"},
     {DeviceLibExt::cl_intel_devicelib_imf, "cl_intel_devicelib_imf"},
     {DeviceLibExt::cl_intel_devicelib_imf_fp64, "cl_intel_devicelib_imf_fp64"},
+    {DeviceLibExt::cl_intel_devicelib_imf_bf16, "cl_intel_devicelib_imf_bf16"},
     {DeviceLibExt::cl_intel_devicelib_bfloat16,
      "cl_intel_bfloat16_conversions"}};