From f95f28c1a4f2afb4383e92e7191b277dce3a4033 Mon Sep 17 00:00:00 2001
From: JackAKirk
Date: Thu, 17 Jun 2021 12:35:03 +0100
Subject: [PATCH 01/10] [SYCL][CUDA] atomic_ref.fetch_add used for fp64 (add
 operator) reduction when device has atomic64.

Only the cuda backend currently supports the atomic64 device aspect.

SYCL 2020 introduces the atomic64 aspect, which is required for the use
of atomic_ref.fetch_add with an fp64 operand. These changes allow devices
with the atomic64 aspect to use a specialized reduction for the add
operator that makes use of atomics at the group level. If the atomic64
aspect is not available, the existing default implementation, which does
not use atomic operations, is used.

Signed-off-by: JackAKirk
---
 sycl/include/CL/sycl/ONEAPI/reduction.hpp   | 69 ++++++++++++++++++++-
 sycl/include/CL/sycl/aspects.hpp            |  3 +-
 sycl/include/CL/sycl/detail/pi.h            |  4 +-
 sycl/include/CL/sycl/handler.hpp            | 60 +++++++++++++++++-
 sycl/include/CL/sycl/info/device_traits.def |  1 +
 sycl/include/CL/sycl/info/info_desc.hpp     |  4 +-
 sycl/plugins/cuda/pi_cuda.cpp               | 13 ++++
 sycl/plugins/opencl/pi_opencl.cpp           |  1 +
 sycl/source/detail/device_impl.cpp          |  2 +
 sycl/source/detail/device_info.hpp          | 26 ++++++++
 sycl/test/abi/sycl_symbols_linux.dump       |  1 +
 sycl/test/on-device/basic_tests/aspects.cpp |  3 +
 12 files changed, 182 insertions(+), 5 deletions(-)

diff --git a/sycl/include/CL/sycl/ONEAPI/reduction.hpp b/sycl/include/CL/sycl/ONEAPI/reduction.hpp
index 9f599d72b0d4c..9284f4f6964f0 100644
--- a/sycl/include/CL/sycl/ONEAPI/reduction.hpp
+++ b/sycl/include/CL/sycl/ONEAPI/reduction.hpp
@@ -53,6 +53,22 @@ using IsReduOptForFastAtomicFetch =
       sycl::detail::IsBitAND::value)>;
 #endif
 
+// This type trait is used to detect if the group algorithm reduce() used with
+// operands of the type T and the operation Plus is available
+// for using in reduction. Note that this type trait is a subset of
+// IsReduOptForFastReduce. The macro SYCL_REDUCTION_DETERMINISTIC prohibits
+// using the reduce() algorithm to produce stable results across same type
+// devices.
+template
+using IsReduOptForFastFloatAtomicAdd =
+#ifdef SYCL_REDUCTION_DETERMINISTIC
+    bool_constant;
+#else
+    bool_constant::value &&
+                  sycl::detail::is_sgenfloat::value &&
+                  (sizeof(T) == 4 || sizeof(T) == 8)>;
+#endif
+
 // This type trait is used to detect if the group algorithm reduce() used with
 // operands of the type T and the operation BinaryOperation is available
 // for using in reduction.
@@ -288,6 +304,18 @@ class reducer
+  enable_if_t::type, T>::value &&
+              IsReduOptForFastFloatAtomicAdd::value>
+  atomic_combine(_T *ReduVarPtr) const {
+
+    atomic_ref(
+        *global_ptr(ReduVarPtr)) += MValue;
+  }
+
   T MValue;
 };
 
@@ -330,6 +358,8 @@ class reduction_impl : private reduction_impl_base {
   using local_accessor_type =
       accessor;
 
+  static constexpr bool has_atomic_add_float =
+      IsReduOptForFastFloatAtomicAdd::value;
   static constexpr bool has_fast_atomics =
       IsReduOptForFastAtomicFetch::value;
   static constexpr bool has_fast_reduce =
@@ -636,8 +666,9 @@ class reduction_impl : private reduction_impl_base {
   /// require initialization with identity value, then return user's read-write
   /// accessor. Otherwise, create 1-element global buffer initialized with
   /// identity value and return an accessor to that buffer.
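// --- Illustrative aside (not part of the patch): the pattern the new
// atomic_combine() implements is a group-local reduce followed by a single
// relaxed atomic_ref update per work-group. A minimal user-level sketch,
// assuming a default queue and illustrative names (Atomic64Sketch, Val):

#include <CL/sycl.hpp>

int main() {
  namespace oneapi = cl::sycl::ONEAPI;
  cl::sycl::queue Q;
  constexpr size_t N = 1024, WG = 128;
  double Sum = 0.0;
  {
    cl::sycl::buffer<double> Result(&Sum, 1);
    Q.submit([&](cl::sycl::handler &CGH) {
      auto Out = Result.get_access<cl::sycl::access::mode::read_write>(CGH);
      CGH.parallel_for<class Atomic64Sketch>(
          cl::sycl::nd_range<1>{N, WG}, [=](cl::sycl::nd_item<1> NDIt) {
            double Val = 1.0; // stand-in for the user's computation
            // Reduce within the work-group first...
            double Partial =
                oneapi::reduce(NDIt.get_group(), Val, oneapi::plus<double>());
            // ...then one leader work-item adds the group's partial sum with
            // an atomic add, which for double requires the atomic64 aspect.
            if (NDIt.get_local_linear_id() == 0)
              oneapi::atomic_ref<
                  double, oneapi::memory_order::relaxed,
                  oneapi::memory_scope::device,
                  cl::sycl::access::address_space::global_space>(Out[0]) +=
                  Partial;
          });
    });
  } // buffer destruction copies the result back; Sum == N afterwards
  return Sum == double(N) ? 0 : 1;
}

// One atomic operation per work-group, rather than per work-item, keeps
// contention on the reduction variable low; that is the sense in which the
// commit message says atomics are used "at the group level".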
+ template - std::enable_if_t + std::enable_if_t getReadWriteAccessorToInitializedMem(handler &CGH) { if (!is_usm && !initializeToIdentity()) return *MRWAcc; @@ -1467,6 +1498,42 @@ void reduCGFunc(handler &CGH, KernelType KernelFunc, } } +// Specialization for devices with the atomic64 aspect, which guarantees 64 (and +// 32) bit floating point support for atomic add. +template +std::enable_if_t +reduCGFuncImplAtomic64(handler &CGH, KernelType KernelFunc, + const nd_range &Range, Reduction &, + typename Reduction::rw_accessor_type Out) { + using Name = typename get_reduction_main_kernel_name_t< + KernelName, KernelType, Reduction::is_usm, + Reduction::has_atomic_add_float, + typename Reduction::rw_accessor_type>::name; + CGH.parallel_for(Range, [=](nd_item NDIt) { + // Call user's function. Reducer.MValue gets initialized there. + typename Reduction::reducer_type Reducer; + KernelFunc(NDIt, Reducer); + + typename Reduction::binary_operation BOp; + Reducer.MValue = reduce_over_group(NDIt.get_group(), Reducer.MValue, BOp); + if (NDIt.get_local_linear_id() == 0) { + Reducer.atomic_combine(Reduction::getOutPointer(Out)); + } + }); +} + +// Specialization for devices with the atomic64 aspect, which guarantees 64 (and +// 32) bit floating point support for atomic add. +template +enable_if_t +reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc, + const nd_range &Range, Reduction &Redu) { + + auto Out = Redu.getReadWriteAccessorToInitializedMem(CGH); + reduCGFuncImplAtomic64( + CGH, KernelFunc, Range, Redu, Out); +} + inline void associateReduAccsWithHandlerHelper(handler &) {} template diff --git a/sycl/include/CL/sycl/aspects.hpp b/sycl/include/CL/sycl/aspects.hpp index 34d02856639aa..dabaac4e58709 100644 --- a/sycl/include/CL/sycl/aspects.hpp +++ b/sycl/include/CL/sycl/aspects.hpp @@ -38,7 +38,8 @@ enum class aspect { ext_intel_gpu_subslices_per_slice = 22, ext_intel_gpu_eu_count_per_subslice = 23, ext_intel_max_mem_bandwidth = 24, - ext_intel_mem_channel = 25 + ext_intel_mem_channel = 25, + atomic64 = 26 }; } // namespace sycl diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h index 46f42c32bb704..aae8d9ec79438 100644 --- a/sycl/include/CL/sycl/detail/pi.h +++ b/sycl/include/CL/sycl/detail/pi.h @@ -285,7 +285,9 @@ typedef enum { PI_DEVICE_INFO_GPU_SLICES = 0x10023, PI_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE = 0x10024, PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE = 0x10025, - PI_DEVICE_INFO_MAX_MEM_BANDWIDTH = 0x10026 + PI_DEVICE_INFO_MAX_MEM_BANDWIDTH = 0x10026, + // These are extensions that are currently only implemented for nvidia. + PI_DEVICE_INFO_ATOMIC_64 = 0x10110 } _pi_device_info; typedef enum { diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index b3035c2e548d6..3a5fac80dcf80 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -210,6 +210,11 @@ class reduction_impl; using cl::sycl::detail::enable_if_t; using cl::sycl::detail::queue_impl; +template +enable_if_t +reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc, + const nd_range &Range, Reduction &Redu); + template enable_if_t reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range &Range, @@ -1373,6 +1378,49 @@ class __SYCL_EXPORT handler { } } + /// Implements parallel_for() accepting nd_range \p Range and one reduction + /// object. This version is a specialization for the add operator. 
+ /// It performs runtime checks for device aspect "atomic64"; if found, fast + /// sycl::atomic_ref operations are used to update the reduction at the + /// end of each work-group work. Otherwise the default implementation is + /// used. + // + // If the reduction variable must be initialized with the identity value + // before the kernel run, then an additional working accessor is created, + // initialized with the identity value and used in the kernel. That working + // accessor is then copied to user's accessor or USM pointer after + // the kernel run. + // For USM pointers without initialize_to_identity properties the same scheme + // with working accessor is used as re-using user's USM pointer in the kernel + // would require creation of another variant of user's kernel, which does not + // seem efficient. + template + detail::enable_if_t + parallel_for(nd_range Range, Reduction Redu, + _KERNELFUNCPARAM(KernelFunc)) { + + shared_ptr_class QueueCopy = MQueue; + device D = detail::getDeviceFromHandler(*this); + + if (D.has(aspect::atomic64)) { + + ONEAPI::detail::reduCGFuncAtomic64(*this, KernelFunc, Range, + Redu); + + if (Reduction::is_usm || Redu.initializeToIdentity()) { + this->finalize(); + handler CopyHandler(QueueCopy, MIsHost); + CopyHandler.saveCodeLoc(MCodeLoc); + ONEAPI::detail::reduSaveFinalResultToUserMem(CopyHandler, + Redu); + MLastEvent = CopyHandler.finalize(); + } + } else { + parallel_for_Impl(Range, Redu, KernelFunc); + } + } + /// Defines and invokes a SYCL kernel function for the specified nd_range. /// Performs reduction operation specified in \p Redu. /// @@ -1389,9 +1437,19 @@ class __SYCL_EXPORT handler { /// optimized implementations waiting for their turn of code-review. template - detail::enable_if_t + detail::enable_if_t parallel_for(nd_range Range, Reduction Redu, _KERNELFUNCPARAM(KernelFunc)) { + + parallel_for_Impl(Range, Redu, KernelFunc); + } + + template + detail::enable_if_t + parallel_for_Impl(nd_range Range, Reduction Redu, + KernelType KernelFunc) { // This parallel_for() is lowered to the following sequence: // 1) Call a kernel that a) call user's lambda function and b) performs // one iteration of reduction, storing the partial reductions/sums diff --git a/sycl/include/CL/sycl/info/device_traits.def b/sycl/include/CL/sycl/info/device_traits.def index f3d4dd6b1d1b3..c736fd6ef1cb7 100644 --- a/sycl/include/CL/sycl/info/device_traits.def +++ b/sycl/include/CL/sycl/info/device_traits.def @@ -22,6 +22,7 @@ __SYCL_PARAM_TRAITS_SPEC(device, max_clock_frequency, pi_uint32) __SYCL_PARAM_TRAITS_SPEC(device, address_bits, pi_uint32) __SYCL_PARAM_TRAITS_SPEC(device, max_mem_alloc_size, pi_uint64) __SYCL_PARAM_TRAITS_SPEC(device, image_support, bool) +__SYCL_PARAM_TRAITS_SPEC(device, atomic64, bool) __SYCL_PARAM_TRAITS_SPEC(device, max_read_image_args, pi_uint32) __SYCL_PARAM_TRAITS_SPEC(device, max_write_image_args, pi_uint32) __SYCL_PARAM_TRAITS_SPEC(device, image2d_max_width, size_t) diff --git a/sycl/include/CL/sycl/info/info_desc.hpp b/sycl/include/CL/sycl/info/info_desc.hpp index f221638c9af99..d767acb549b25 100644 --- a/sycl/include/CL/sycl/info/info_desc.hpp +++ b/sycl/include/CL/sycl/info/info_desc.hpp @@ -141,7 +141,9 @@ enum class device : cl_device_info { ext_intel_gpu_eu_count_per_subslice = PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE, ext_intel_max_mem_bandwidth = PI_DEVICE_INFO_MAX_MEM_BANDWIDTH, - ext_intel_mem_channel = PI_MEM_PROPERTIES_CHANNEL + ext_intel_mem_channel = PI_MEM_PROPERTIES_CHANNEL, + // currently only implemented for nvidia + 
atomic64 = PI_DEVICE_INFO_ATOMIC_64 }; enum class device_type : pi_uint64 { diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 9384a0ac10c5a..41cabd28db8a5 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -981,6 +981,19 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name, bool ifp = (major >= 7); return getInfo(param_value_size, param_value, param_value_size_ret, ifp); } + + case PI_DEVICE_INFO_ATOMIC_64: { + int major = 0; + cl::sycl::detail::pi::assertion( + cuDeviceGetAttribute(&major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + device->get()) == CUDA_SUCCESS); + + bool atomic64 = (major >= 6) ? true : false; + return getInfo(param_value_size, param_value, param_value_size_ret, + atomic64); + } + case PI_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { // NVIDIA devices only support one sub-group size (the warp size) int warpSize = 0; diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index 484c7cf77f1b3..a5bfa37d90dd5 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -176,6 +176,7 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, case PI_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE: case PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE: case PI_DEVICE_INFO_MAX_MEM_BANDWIDTH: + case PI_DEVICE_INFO_ATOMIC_64: return PI_INVALID_VALUE; default: diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp index 85305d397987d..f5c1603d401e3 100644 --- a/sycl/source/detail/device_impl.cpp +++ b/sycl/source/detail/device_impl.cpp @@ -239,6 +239,8 @@ bool device_impl::has(aspect Aspect) const { return has_extension("cl_khr_int64_base_atomics"); case aspect::int64_extended_atomics: return has_extension("cl_khr_int64_extended_atomics"); + case aspect::atomic64: + return get_info(); case aspect::image: return get_info(); case aspect::online_compiler: diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp index bf3eb0561ef7a..b4512241c11f0 100644 --- a/sycl/source/detail/device_info.hpp +++ b/sycl/source/detail/device_info.hpp @@ -233,6 +233,28 @@ template <> struct get_device_info { } }; +// Specialization for atomic64 that is necessary because +// PI_DEVICE_INFO_ATOMIC_64 isn't implemented for backend other than cuda. +// TODO the if-statement can be removed when the other backends support +// PI_DEVICE_INFO_ATOMIC_64. 
+template <> struct get_device_info {
+  static bool get(RT::PiDevice dev, const plugin &Plugin) {
+
+    bool result = false;
+
+    platform plt =
+        get_device_info::get(dev, Plugin);
+
+    if (plt.get_backend() == backend::cuda) {
+      Plugin.call(
+          dev, pi::cast(info::device::atomic64),
+          sizeof(result), &result, nullptr);
+    }
+
+    return (result);
+  }
+};
+
 // Specialization for exec_capabilities, OpenCL returns a bitfield
 template <>
 struct get_device_info,
@@ -613,6 +635,10 @@ template <> inline bool get_device_info_host() {
   return true;
 }
 
+template <> inline bool get_device_info_host() {
+  return false;
+}
+
 template <>
 inline cl_uint get_device_info_host() {
   // current value is the required minimum
diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump
index 626008589c654..80f7b2e4b7947 100644
--- a/sycl/test/abi/sycl_symbols_linux.dump
+++ b/sycl/test/abi/sycl_symbols_linux.dump
@@ -4103,6 +4103,7 @@ _ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65571EEENS3_12param_traitsIS4_XT_
 _ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65572EEENS3_12param_traitsIS4_XT_EE11return_typeEv
 _ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65573EEENS3_12param_traitsIS4_XT_EE11return_typeEv
 _ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65574EEENS3_12param_traitsIS4_XT_EE11return_typeEv
+_ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65808EEENS3_12param_traitsIS4_XT_EE11return_typeEv
 _ZNK2cl4sycl6device9getNativeEv
 _ZNK2cl4sycl6kernel11get_contextEv
 _ZNK2cl4sycl6kernel11get_programEv
diff --git a/sycl/test/on-device/basic_tests/aspects.cpp b/sycl/test/on-device/basic_tests/aspects.cpp
index 521914c763ee6..85589a31b4d34 100644
--- a/sycl/test/on-device/basic_tests/aspects.cpp
+++ b/sycl/test/on-device/basic_tests/aspects.cpp
@@ -57,6 +57,9 @@ int main() {
   if (plt.has(aspect::int64_extended_atomics)) {
     std::cout << " extended atomic operations" << std::endl;
   }
+  if (plt.has(aspect::atomic64)) {
+    std::cout << " atomic64" << std::endl;
+  }
   if (plt.has(aspect::image)) {
     std::cout << " images" << std::endl;
   }

From 80c85087f356bb0228ce0d95e5fc681b468e6217 Mon Sep 17 00:00:00 2001
From: JackAKirk
Date: Wed, 23 Jun 2021 10:10:14 +0100
Subject: [PATCH 02/10] cl_khr_fp64 and cl_khr_fp16 extensions are now
 connected to the cuda plugin.

This change ensures that all Reduction tests run for the cuda backend.
The cl_khr_fp64 extension is used for all cuda devices by default, since
sm_XX where XX < 13 has been unsupported by the cuda driver since cuda
8.0.

The test fp16-with-unnamed-lambda.cpp has been deleted because it has a
duplicate in the test suite (in the SYCL/Regression directory). In both
cases the triple is missing on the first line; it needs to be added to
the llvm-test-suite copy to avoid a test failure now that the test is no
longer skipped for the cuda backend.
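As context for the guards this commit unblocks: extension-gated SYCL code
checks the device's extension string, which the plugin change below now
populates. A minimal sketch (device selection and output details assumed,
not taken from the patch):

#include <CL/sycl.hpp>
#include <iostream>

int main() {
  cl::sycl::device D{cl::sycl::default_selector{}};
  // With this commit a CUDA device reports cl_khr_fp16 on sm_53 and newer,
  // so guards like this one stop skipping the body of fp16 tests.
  if (!D.has_extension("cl_khr_fp16")) {
    std::cout << "halfs unsupported, skipping\n";
    return 0;
  }
  cl::sycl::half H = 1.0f; // half arithmetic is safe past the guard
  std::cout << "cl_khr_fp16 available, half = " << float(H) << "\n";
  return 0;
}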
Signed-off-by: JackAKirk --- sycl/plugins/cuda/pi_cuda.cpp | 21 ++++++++- .../regression/fp16-with-unnamed-lambda.cpp | 43 ------------------- 2 files changed, 20 insertions(+), 44 deletions(-) delete mode 100644 sycl/test/regression/fp16-with-unnamed-lambda.cpp diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 41cabd28db8a5..ab0bf29ac2498 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -1371,7 +1371,26 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name, return getInfo(param_value_size, param_value, param_value_size_ret, ""); } case PI_DEVICE_INFO_EXTENSIONS: { - return getInfo(param_value_size, param_value, param_value_size_ret, ""); + + std::string SupportedExtensions = "cl_khr_fp64 "; + int major = 0; + int minor = 0; + + cl::sycl::detail::pi::assertion( + cuDeviceGetAttribute(&major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + device->get()) == CUDA_SUCCESS); + cl::sycl::detail::pi::assertion( + cuDeviceGetAttribute(&minor, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, + device->get()) == CUDA_SUCCESS); + + if ((major >= 6) || ((major == 5) && (minor >= 3))) { + SupportedExtensions += "cl_khr_fp16 "; + } + + return getInfo(param_value_size, param_value, param_value_size_ret, + SupportedExtensions.c_str()); } case PI_DEVICE_INFO_PRINTF_BUFFER_SIZE: { // The minimum value for the FULL profile is 1 MB. diff --git a/sycl/test/regression/fp16-with-unnamed-lambda.cpp b/sycl/test/regression/fp16-with-unnamed-lambda.cpp deleted file mode 100644 index 91f3f69bdc3bd..0000000000000 --- a/sycl/test/regression/fp16-with-unnamed-lambda.cpp +++ /dev/null @@ -1,43 +0,0 @@ -// RUN: %clangxx -fsycl -fsycl-unnamed-lambda %s -o %t.out -// RUN: %RUN_ON_HOST %t.out -#include - -#include - -int main() { - auto AsyncHandler = [](cl::sycl::exception_list EL) { - for (std::exception_ptr const &P : EL) { - try { - std::rethrow_exception(P); - } catch (std::exception const &E) { - std::cerr << "Caught async SYCL exception: " << E.what() << std::endl; - } - } - }; - - cl::sycl::queue Q(AsyncHandler); - - cl::sycl::device D = Q.get_device(); - if (!D.has_extension("cl_khr_fp16")) - return 0; // Skip the test if halfs are not supported - - cl::sycl::buffer Buf(1); - - Q.submit([&](cl::sycl::handler &CGH) { - auto Acc = Buf.get_access(CGH); - CGH.single_task([=]() { - Acc[0] = 1; - }); - }); - - Q.wait_and_throw(); - - auto Acc = Buf.get_access(); - if (1 != Acc[0]) { - std::cerr << "Incorrect result, got: " << Acc[0] - << ", expected: 1" << std::endl; - return 1; - } - - return 0; -} From b1a8a7d2bd10eda5b921389cb753e8eb3d46146f Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Wed, 23 Jun 2021 10:50:24 +0100 Subject: [PATCH 03/10] Revert "cl_khr_fp64 and cl_khr_fp16 extensions are now connected to the cuda plugin." This reverts commit 80c85087f356bb0228ce0d95e5fc681b468e6217. 
---
 sycl/plugins/cuda/pi_cuda.cpp                 | 21 +--------
 .../regression/fp16-with-unnamed-lambda.cpp   | 43 +++++++++++++++++++
 2 files changed, 44 insertions(+), 20 deletions(-)
 create mode 100644 sycl/test/regression/fp16-with-unnamed-lambda.cpp

diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp
index ab0bf29ac2498..41cabd28db8a5 100644
--- a/sycl/plugins/cuda/pi_cuda.cpp
+++ b/sycl/plugins/cuda/pi_cuda.cpp
@@ -1371,26 +1371,7 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
     return getInfo(param_value_size, param_value, param_value_size_ret, "");
   }
   case PI_DEVICE_INFO_EXTENSIONS: {
-
-    std::string SupportedExtensions = "cl_khr_fp64 ";
-    int major = 0;
-    int minor = 0;
-
-    cl::sycl::detail::pi::assertion(
-        cuDeviceGetAttribute(&major,
-                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
-                             device->get()) == CUDA_SUCCESS);
-    cl::sycl::detail::pi::assertion(
-        cuDeviceGetAttribute(&minor,
-                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
-                             device->get()) == CUDA_SUCCESS);
-
-    if ((major >= 6) || ((major == 5) && (minor >= 3))) {
-      SupportedExtensions += "cl_khr_fp16 ";
-    }
-
-    return getInfo(param_value_size, param_value, param_value_size_ret,
-                   SupportedExtensions.c_str());
+    return getInfo(param_value_size, param_value, param_value_size_ret, "");
   }
   case PI_DEVICE_INFO_PRINTF_BUFFER_SIZE: {
     // The minimum value for the FULL profile is 1 MB.
diff --git a/sycl/test/regression/fp16-with-unnamed-lambda.cpp b/sycl/test/regression/fp16-with-unnamed-lambda.cpp
new file mode 100644
index 0000000000000..91f3f69bdc3bd
--- /dev/null
+++ b/sycl/test/regression/fp16-with-unnamed-lambda.cpp
@@ -0,0 +1,43 @@
+// RUN: %clangxx -fsycl -fsycl-unnamed-lambda %s -o %t.out
+// RUN: %RUN_ON_HOST %t.out
+#include <CL/sycl.hpp>
+
+#include <iostream>
+
+int main() {
+  auto AsyncHandler = [](cl::sycl::exception_list EL) {
+    for (std::exception_ptr const &P : EL) {
+      try {
+        std::rethrow_exception(P);
+      } catch (std::exception const &E) {
+        std::cerr << "Caught async SYCL exception: " << E.what() << std::endl;
+      }
+    }
+  };
+
+  cl::sycl::queue Q(AsyncHandler);
+
+  cl::sycl::device D = Q.get_device();
+  if (!D.has_extension("cl_khr_fp16"))
+    return 0; // Skip the test if halfs are not supported
+
+  cl::sycl::buffer<cl::sycl::half> Buf(1);
+
+  Q.submit([&](cl::sycl::handler &CGH) {
+    auto Acc = Buf.get_access<cl::sycl::access::mode::write>(CGH);
+    CGH.single_task([=]() {
+      Acc[0] = 1;
+    });
+  });
+
+  Q.wait_and_throw();
+
+  auto Acc = Buf.get_access<cl::sycl::access::mode::read>();
+  if (1 != Acc[0]) {
+    std::cerr << "Incorrect result, got: " << Acc[0]
+              << ", expected: 1" << std::endl;
+    return 1;
+  }
+
+  return 0;
+}

From 70a2c54595b3f5bae9a8313c9095fcc7e75736b2 Mon Sep 17 00:00:00 2001
From: JackAKirk
Date: Wed, 23 Jun 2021 11:15:39 +0100
Subject: [PATCH 04/10] The cl_khr_fp64 extension is now connected to the cuda
 plugin.

This change allows a skipped fp64 Reduction test in llvm-test-suite
(reduction_nd_ext_double.cpp) to run for the cuda backend. The
cl_khr_fp64 extension is used for all cuda devices by default, since
sm_XX where XX < 13 has been unsupported by the cuda driver since cuda
8.0.
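The shape of the test this unskips is, roughly, a double-typed add
reduction through the ONEAPI interface. A hedged sketch, not the
llvm-test-suite source; the names and the exact reduction overload of
this era are assumptions:

#include <CL/sycl.hpp>

int main() {
  cl::sycl::queue Q;
  constexpr size_t N = 1024, WG = 128;
  double Sum = 0.0;
  {
    cl::sycl::buffer<double> SumBuf(&Sum, 1);
    Q.submit([&](cl::sycl::handler &CGH) {
      auto Acc = SumBuf.get_access<cl::sycl::access::mode::read_write>(CGH);
      // An fp64 '+' reduction: exactly the case the atomic64 fast path of
      // PATCH 01 targets, and the case cl_khr_fp64 must be reported for.
      CGH.parallel_for<class Fp64Redu>(
          cl::sycl::nd_range<1>{N, WG},
          cl::sycl::ONEAPI::reduction(Acc, cl::sycl::ONEAPI::plus<double>()),
          [=](cl::sycl::nd_item<1>, auto &Red) { Red += 1.0; });
    });
  }
  return Sum == double(N) ? 0 : 1;
}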
Signed-off-by: JackAKirk
---
 sycl/plugins/cuda/pi_cuda.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp
index 41cabd28db8a5..aa1f7d6914eb2 100644
--- a/sycl/plugins/cuda/pi_cuda.cpp
+++ b/sycl/plugins/cuda/pi_cuda.cpp
@@ -1371,7 +1371,11 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
     return getInfo(param_value_size, param_value, param_value_size_ret, "");
   }
   case PI_DEVICE_INFO_EXTENSIONS: {
-    return getInfo(param_value_size, param_value, param_value_size_ret, "");
+
+    std::string SupportedExtensions = "cl_khr_fp64 ";
+
+    return getInfo(param_value_size, param_value, param_value_size_ret,
+                   SupportedExtensions.c_str());
   }
   case PI_DEVICE_INFO_PRINTF_BUFFER_SIZE: {
     // The minimum value for the FULL profile is 1 MB.

From ab8a60000ca5b1430a69e3ff22a44c7d1d6a44b1 Mon Sep 17 00:00:00 2001
From: JackAKirk
Date: Thu, 24 Jun 2021 11:37:52 +0100
Subject: [PATCH 05/10] Renamed has_atomic_add_float to
 has_atomic_add_float64.

Renamed has_atomic_add_float to has_atomic_add_float64, since the general
usage that includes float32 is only expected to be temporary.
has_atomic_add_float64 is an alias of IsReduOptForAtomic64Add. Updated
the documentation describing the current temporary usage of fp32 within
IsReduOptForAtomic64Add.

IsReduOptForFastFloatAtomicAdd has been renamed IsReduOptForAtomic64Add
to make clear that this trait should only be used when the device has the
SYCL 2020 atomic64 aspect, consistent with the naming convention used in
other functions that are specializations for the atomic64 aspect.

Signed-off-by: JackAKirk
---
 sycl/include/CL/sycl/ONEAPI/reduction.hpp | 46 +++++++++++++++--------
 sycl/include/CL/sycl/handler.hpp          |  6 +--
 2 files changed, 33 insertions(+), 19 deletions(-)

diff --git a/sycl/include/CL/sycl/ONEAPI/reduction.hpp b/sycl/include/CL/sycl/ONEAPI/reduction.hpp
index 9284f4f6964f0..c6311c05afcdc 100644
--- a/sycl/include/CL/sycl/ONEAPI/reduction.hpp
+++ b/sycl/include/CL/sycl/ONEAPI/reduction.hpp
@@ -53,14 +53,20 @@ using IsReduOptForFastAtomicFetch =
       sycl::detail::IsBitAND::value)>;
 #endif
 
-// This type trait is used to detect if the group algorithm reduce() used with
-// operands of the type T and the operation Plus is available
-// for using in reduction. Note that this type trait is a subset of
+// This type trait is used to detect if the atomic operation BinaryOperation
+// used with operands of the type T is available for using in reduction, in
+// addition to the cases covered by "IsReduOptForFastAtomicFetch", if the device
+// has the atomic64 aspect. This type trait should only be used if the device
+// has the atomic64 aspect. Note that this type trait is currently a subset of
 // IsReduOptForFastReduce. The macro SYCL_REDUCTION_DETERMINISTIC prohibits
-// using the reduce() algorithm to produce stable results across same type
-// devices.
+// using the reduce_over_group() algorithm to produce stable results across same
+// type devices.
+// TODO 32 bit floating point atomics are eventually expected to be supported by
+// the has_fast_atomics specialization. Once the reducer class is updated to
+// replace the deprecated atomic class with atomic_ref, the (sizeof(T) == 4)
+// case should be removed here and replaced in IsReduOptForFastAtomicFetch.
template -using IsReduOptForFastFloatAtomicAdd = +using IsReduOptForAtomic64Add = #ifdef SYCL_REDUCTION_DETERMINISTIC bool_constant; #else @@ -307,7 +313,7 @@ class reducer enable_if_t::type, T>::value && - IsReduOptForFastFloatAtomicAdd::value> + IsReduOptForAtomic64Add::value> atomic_combine(_T *ReduVarPtr) const { atomic_ref; - static constexpr bool has_atomic_add_float = - IsReduOptForFastFloatAtomicAdd::value; + static constexpr bool has_atomic_add_float64 = + IsReduOptForAtomic64Add::value; static constexpr bool has_fast_atomics = IsReduOptForFastAtomicFetch::value; static constexpr bool has_fast_reduce = @@ -667,8 +673,8 @@ class reduction_impl : private reduction_impl_base { /// accessor. Otherwise, create 1-element global buffer initialized with /// identity value and return an accessor to that buffer. - template - std::enable_if_t + template + std::enable_if_t getReadWriteAccessorToInitializedMem(handler &CGH) { if (!is_usm && !initializeToIdentity()) return *MRWAcc; @@ -1499,15 +1505,19 @@ void reduCGFunc(handler &CGH, KernelType KernelFunc, } // Specialization for devices with the atomic64 aspect, which guarantees 64 (and -// 32) bit floating point support for atomic add. +// temporarily 32) bit floating point support for atomic add. +// TODO 32 bit floating point atomics are eventually expected to be supported by +// the has_fast_atomics specialization. Corresponding changes to +// IsReduOptForAtomic64Add, as prescribed in its documentation, should then also +// be made. template -std::enable_if_t +std::enable_if_t reduCGFuncImplAtomic64(handler &CGH, KernelType KernelFunc, const nd_range &Range, Reduction &, typename Reduction::rw_accessor_type Out) { using Name = typename get_reduction_main_kernel_name_t< KernelName, KernelType, Reduction::is_usm, - Reduction::has_atomic_add_float, + Reduction::has_atomic_add_float64, typename Reduction::rw_accessor_type>::name; CGH.parallel_for(Range, [=](nd_item NDIt) { // Call user's function. Reducer.MValue gets initialized there. @@ -1523,9 +1533,13 @@ reduCGFuncImplAtomic64(handler &CGH, KernelType KernelFunc, } // Specialization for devices with the atomic64 aspect, which guarantees 64 (and -// 32) bit floating point support for atomic add. +// temporarily 32) bit floating point support for atomic add. +// TODO 32 bit floating point atomics are eventually expected to be supported by +// the has_fast_atomics specialization. Corresponding changes to +// IsReduOptForAtomic64Add, as prescribed in its documentation, should then also +// be made. template -enable_if_t +enable_if_t reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc, const nd_range &Range, Reduction &Redu) { diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index 3a5fac80dcf80..2ed3b171475dd 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -211,7 +211,7 @@ using cl::sycl::detail::enable_if_t; using cl::sycl::detail::queue_impl; template -enable_if_t +enable_if_t reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc, const nd_range &Range, Reduction &Redu); @@ -1396,7 +1396,7 @@ class __SYCL_EXPORT handler { // seem efficient. 
template - detail::enable_if_t + detail::enable_if_t parallel_for(nd_range Range, Reduction Redu, _KERNELFUNCPARAM(KernelFunc)) { @@ -1438,7 +1438,7 @@ class __SYCL_EXPORT handler { template detail::enable_if_t + !Reduction::has_atomic_add_float64> parallel_for(nd_range Range, Reduction Redu, _KERNELFUNCPARAM(KernelFunc)) { From 49ec72af5861fc4bca7c421293c1557bf7ae684a Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Mon, 28 Jun 2021 14:00:46 +0100 Subject: [PATCH 06/10] PI_DEVICE_INFO_ATOMIC_64 is now checked for all backends when get_info() is called. If the PI returns the PI_INVALID_VALUE error then atomic64 is set false and the user is informed that the atomic64 aspect status is unknown for the device. Signed-off-by: JackAKirk --- sycl/source/detail/device_info.hpp | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp index b4512241c11f0..b6177c917bbfe 100644 --- a/sycl/source/detail/device_info.hpp +++ b/sycl/source/detail/device_info.hpp @@ -234,24 +234,23 @@ template <> struct get_device_info { }; // Specialization for atomic64 that is necessary because -// PI_DEVICE_INFO_ATOMIC_64 isn't implemented for backend other than cuda. -// TODO the if-statement can be removed when the other backends support -// PI_DEVICE_INFO_ATOMIC_64. +// PI_DEVICE_INFO_ATOMIC_64 is currently only implemented for the cuda backend. template <> struct get_device_info { static bool get(RT::PiDevice dev, const plugin &Plugin) { bool result = false; - platform plt = - get_device_info::get(dev, Plugin); - - if (plt.get_backend() == backend::cuda) { - Plugin.call( - dev, pi::cast(info::device::atomic64), - sizeof(result), &result, nullptr); + RT::PiResult Err = Plugin.call_nocheck( + dev, pi::cast(info::device::atomic64), sizeof(result), + &result, nullptr); + if (Err == PI_INVALID_VALUE) { + std::cout + << "The Plugin Interface has returned an error:\n The value of " + "the atomic64 device aspect is unknown, Setting atomic64 " + "= false.\n\n"; } - return (result); + return result; } }; From b4bde0ccd3c4d44aed628d3ea2df338e9565f384 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Tue, 29 Jun 2021 09:42:32 +0100 Subject: [PATCH 07/10] Removed unnecessary screen print out. Plugin.call_nocheck casts result to false if PI_SUCCESS was not returned. Signed-off-by: JackAKirk --- sycl/source/detail/device_info.hpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp index b6177c917bbfe..e718d17eea463 100644 --- a/sycl/source/detail/device_info.hpp +++ b/sycl/source/detail/device_info.hpp @@ -240,15 +240,9 @@ template <> struct get_device_info { bool result = false; - RT::PiResult Err = Plugin.call_nocheck( + Plugin.call_nocheck( dev, pi::cast(info::device::atomic64), sizeof(result), &result, nullptr); - if (Err == PI_INVALID_VALUE) { - std::cout - << "The Plugin Interface has returned an error:\n The value of " - "the atomic64 device aspect is unknown, Setting atomic64 " - "= false.\n\n"; - } return result; } From 6a0f94897f0c3a1751de959553559ce2018ec0af Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Tue, 29 Jun 2021 16:53:59 +0100 Subject: [PATCH 08/10] Return false if Err == PI_INVALID_VALUE. 
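The end state that commits 06 through 09 converge on is a non-throwing
plugin query with a safe default. Schematically, using the runtime's
internal names (the piDeviceGetInfo template argument shown here is an
assumption about the call_nocheck spelling, not quoted from the patch):

// Sketch: tolerate plugins that do not implement PI_DEVICE_INFO_ATOMIC_64.
static bool queryAtomic64(RT::PiDevice Dev, const plugin &Plugin) {
  bool Result = false;
  RT::PiResult Err = Plugin.call_nocheck<PiApiKind::piDeviceGetInfo>(
      Dev, pi::cast<RT::PiDeviceInfo>(info::device::atomic64),
      sizeof(Result), &Result, nullptr);
  // Any failure (e.g. PI_INVALID_VALUE from the OpenCL plugin) is treated
  // as "aspect not supported" rather than as a hard error.
  return (Err == PI_SUCCESS) ? Result : false;
}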
Signed-off-by: JackAKirk --- sycl/source/detail/device_info.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp index e718d17eea463..0bc90f3c05c6d 100644 --- a/sycl/source/detail/device_info.hpp +++ b/sycl/source/detail/device_info.hpp @@ -240,10 +240,12 @@ template <> struct get_device_info { bool result = false; - Plugin.call_nocheck( + RT::PiResult Err = Plugin.call_nocheck( dev, pi::cast(info::device::atomic64), sizeof(result), &result, nullptr); - + if (Err == PI_INVALID_VALUE) { + return false; + } return result; } }; From 24ee26ed66a8fdd096a3b97b77d9ebf949a0bd9a Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Tue, 29 Jun 2021 17:09:08 +0100 Subject: [PATCH 09/10] Swapped (Err == PI_INVALID_VALUE) with (Err != PI_SUCCESS). Signed-off-by: JackAKirk --- sycl/source/detail/device_info.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp index 0bc90f3c05c6d..1ebccdc5b577e 100644 --- a/sycl/source/detail/device_info.hpp +++ b/sycl/source/detail/device_info.hpp @@ -243,7 +243,7 @@ template <> struct get_device_info { RT::PiResult Err = Plugin.call_nocheck( dev, pi::cast(info::device::atomic64), sizeof(result), &result, nullptr); - if (Err == PI_INVALID_VALUE) { + if (Err != PI_SUCCESS) { return false; } return result; From 471ccde14501d29b26ed402d173f1a2394b82f4a Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Wed, 30 Jun 2021 08:21:20 +0300 Subject: [PATCH 10/10] Apply suggestions from code review Remove comments. --- sycl/include/CL/sycl/detail/pi.h | 1 - sycl/include/CL/sycl/info/info_desc.hpp | 1 - 2 files changed, 2 deletions(-) diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h index 49b1c0b67928b..5f2735add28c5 100644 --- a/sycl/include/CL/sycl/detail/pi.h +++ b/sycl/include/CL/sycl/detail/pi.h @@ -286,7 +286,6 @@ typedef enum { PI_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE = 0x10024, PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE = 0x10025, PI_DEVICE_INFO_MAX_MEM_BANDWIDTH = 0x10026, - // These are extensions that are currently only implemented for nvidia. PI_DEVICE_INFO_ATOMIC_64 = 0x10110 } _pi_device_info; diff --git a/sycl/include/CL/sycl/info/info_desc.hpp b/sycl/include/CL/sycl/info/info_desc.hpp index d767acb549b25..ce8037df60420 100644 --- a/sycl/include/CL/sycl/info/info_desc.hpp +++ b/sycl/include/CL/sycl/info/info_desc.hpp @@ -142,7 +142,6 @@ enum class device : cl_device_info { PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE, ext_intel_max_mem_bandwidth = PI_DEVICE_INFO_MAX_MEM_BANDWIDTH, ext_intel_mem_channel = PI_MEM_PROPERTIES_CHANNEL, - // currently only implemented for nvidia atomic64 = PI_DEVICE_INFO_ATOMIC_64 };
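Taken together, the series gives the following user-level check meaning on
the cuda backend: devices of compute capability 6.0 or higher report the
SYCL 2020 atomic64 aspect, and fp64 add reductions submitted to them take
the atomic_ref fast path in handler::parallel_for. A short usage sketch:

#include <CL/sycl.hpp>
#include <iostream>

int main() {
  cl::sycl::device D{cl::sycl::default_selector{}};
  if (D.has(cl::sycl::aspect::atomic64))
    std::cout << "atomic64: fp64 reductions may use atomic_ref.fetch_add\n";
  else
    std::cout << "no atomic64: default (non-atomic) reduction path\n";
  return 0;
}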