From f95f28c1a4f2afb4383e92e7191b277dce3a4033 Mon Sep 17 00:00:00 2001
From: JackAKirk
Date: Thu, 17 Jun 2021 12:35:03 +0100
Subject: [PATCH 01/10] [SYCL][CUDA] atomic_ref.fetch_add used for fp64 (add
 operator) reduction when device has atomic64.

Only the cuda backend currently supports the atomic64 device aspect.

SYCL 2020 introduces the atomic64 aspect, which is required for the use
of atomic_ref.fetch_add with an fp64 operand. These changes allow devices
with the atomic64 aspect to use a specialized reduction for the add
operator that makes use of atomics at the group level. If the atomic64
aspect is not available, the existing default implementation, which does
not use atomic operations, is used.

Signed-off-by: JackAKirk
---
 sycl/include/CL/sycl/ONEAPI/reduction.hpp   | 69 ++++++++++++++++++++-
 sycl/include/CL/sycl/aspects.hpp            |  3 +-
 sycl/include/CL/sycl/detail/pi.h            |  4 +-
 sycl/include/CL/sycl/handler.hpp            | 60 +++++++++++++++++-
 sycl/include/CL/sycl/info/device_traits.def |  1 +
 sycl/include/CL/sycl/info/info_desc.hpp     |  4 +-
 sycl/plugins/cuda/pi_cuda.cpp               | 13 ++++
 sycl/plugins/opencl/pi_opencl.cpp           |  1 +
 sycl/source/detail/device_impl.cpp          |  2 +
 sycl/source/detail/device_info.hpp          | 26 ++++++++
 sycl/test/abi/sycl_symbols_linux.dump       |  1 +
 sycl/test/on-device/basic_tests/aspects.cpp |  3 +
 12 files changed, 182 insertions(+), 5 deletions(-)

diff --git a/sycl/include/CL/sycl/ONEAPI/reduction.hpp b/sycl/include/CL/sycl/ONEAPI/reduction.hpp
index 9f599d72b0d4c..9284f4f6964f0 100644
--- a/sycl/include/CL/sycl/ONEAPI/reduction.hpp
+++ b/sycl/include/CL/sycl/ONEAPI/reduction.hpp
@@ -53,6 +53,22 @@ using IsReduOptForFastAtomicFetch =
       sycl::detail::IsBitAND::value)>;
 #endif
 
+// This type trait is used to detect if the group algorithm reduce() used with
+// operands of the type T and the operation Plus is available
+// for using in reduction. Note that this type trait is a subset of
+// IsReduOptForFastReduce. The macro SYCL_REDUCTION_DETERMINISTIC prohibits
+// using the reduce() algorithm to produce stable results across same type
+// devices.
+template
+using IsReduOptForFastFloatAtomicAdd =
+#ifdef SYCL_REDUCTION_DETERMINISTIC
+    bool_constant;
+#else
+    bool_constant::value &&
+                  sycl::detail::is_sgenfloat::value &&
+                  (sizeof(T) == 4 || sizeof(T) == 8)>;
+#endif
+
 // This type trait is used to detect if the group algorithm reduce() used with
 // operands of the type T and the operation BinaryOperation is available
 // for using in reduction.
@@ -288,6 +304,18 @@ class reducer
+  enable_if_t::type, T>::value &&
+              IsReduOptForFastFloatAtomicAdd::value>
+  atomic_combine(_T *ReduVarPtr) const {
+
+    atomic_ref(
+        *global_ptr(ReduVarPtr)) += MValue;
+  }
+
   T MValue;
 };
 
@@ -330,6 +358,8 @@ class reduction_impl : private reduction_impl_base {
   using local_accessor_type =
       accessor;
 
+  static constexpr bool has_atomic_add_float =
+      IsReduOptForFastFloatAtomicAdd::value;
   static constexpr bool has_fast_atomics =
       IsReduOptForFastAtomicFetch::value;
   static constexpr bool has_fast_reduce =
@@ -636,8 +666,9 @@ class reduction_impl : private reduction_impl_base {
   /// require initialization with identity value, then return user's read-write
   /// accessor. Otherwise, create 1-element global buffer initialized with
   /// identity value and return an accessor to that buffer.
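// --- Illustrative aside (not part of the patch): the pattern the new
// atomic_combine() implements is a group-local reduce followed by a single
// relaxed atomic_ref update per work-group. A minimal user-level sketch,
// assuming a default queue and illustrative names (Atomic64Sketch, Val):

#include <CL/sycl.hpp>

int main() {
  namespace oneapi = cl::sycl::ONEAPI;
  cl::sycl::queue Q;
  constexpr size_t N = 1024, WG = 128;
  double Sum = 0.0;
  {
    cl::sycl::buffer<double> Result(&Sum, 1);
    Q.submit([&](cl::sycl::handler &CGH) {
      auto Out = Result.get_access<cl::sycl::access::mode::read_write>(CGH);
      CGH.parallel_for<class Atomic64Sketch>(
          cl::sycl::nd_range<1>{N, WG}, [=](cl::sycl::nd_item<1> NDIt) {
            double Val = 1.0; // stand-in for the user's computation
            // Reduce within the work-group first...
            double Partial =
                oneapi::reduce(NDIt.get_group(), Val, oneapi::plus<double>());
            // ...then one leader work-item adds the group's partial sum with
            // an atomic add, which for double requires the atomic64 aspect.
            if (NDIt.get_local_linear_id() == 0)
              oneapi::atomic_ref<
                  double, oneapi::memory_order::relaxed,
                  oneapi::memory_scope::device,
                  cl::sycl::access::address_space::global_space>(Out[0]) +=
                  Partial;
          });
    });
  } // buffer destruction copies the result back; Sum == N afterwards
  return Sum == double(N) ? 0 : 1;
}

// One atomic operation per work-group, rather than per work-item, keeps
// contention on the reduction variable low; that is the sense in which the
// commit message says atomics are used "at the group level".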
+ template - std::enable_if_t + std::enable_if_t getReadWriteAccessorToInitializedMem(handler &CGH) { if (!is_usm && !initializeToIdentity()) return *MRWAcc; @@ -1467,6 +1498,42 @@ void reduCGFunc(handler &CGH, KernelType KernelFunc, } } +// Specialization for devices with the atomic64 aspect, which guarantees 64 (and +// 32) bit floating point support for atomic add. +template +std::enable_if_t +reduCGFuncImplAtomic64(handler &CGH, KernelType KernelFunc, + const nd_range &Range, Reduction &, + typename Reduction::rw_accessor_type Out) { + using Name = typename get_reduction_main_kernel_name_t< + KernelName, KernelType, Reduction::is_usm, + Reduction::has_atomic_add_float, + typename Reduction::rw_accessor_type>::name; + CGH.parallel_for(Range, [=](nd_item NDIt) { + // Call user's function. Reducer.MValue gets initialized there. + typename Reduction::reducer_type Reducer; + KernelFunc(NDIt, Reducer); + + typename Reduction::binary_operation BOp; + Reducer.MValue = reduce_over_group(NDIt.get_group(), Reducer.MValue, BOp); + if (NDIt.get_local_linear_id() == 0) { + Reducer.atomic_combine(Reduction::getOutPointer(Out)); + } + }); +} + +// Specialization for devices with the atomic64 aspect, which guarantees 64 (and +// 32) bit floating point support for atomic add. +template +enable_if_t +reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc, + const nd_range &Range, Reduction &Redu) { + + auto Out = Redu.getReadWriteAccessorToInitializedMem(CGH); + reduCGFuncImplAtomic64( + CGH, KernelFunc, Range, Redu, Out); +} + inline void associateReduAccsWithHandlerHelper(handler &) {} template diff --git a/sycl/include/CL/sycl/aspects.hpp b/sycl/include/CL/sycl/aspects.hpp index 34d02856639aa..dabaac4e58709 100644 --- a/sycl/include/CL/sycl/aspects.hpp +++ b/sycl/include/CL/sycl/aspects.hpp @@ -38,7 +38,8 @@ enum class aspect { ext_intel_gpu_subslices_per_slice = 22, ext_intel_gpu_eu_count_per_subslice = 23, ext_intel_max_mem_bandwidth = 24, - ext_intel_mem_channel = 25 + ext_intel_mem_channel = 25, + atomic64 = 26 }; } // namespace sycl diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h index 46f42c32bb704..aae8d9ec79438 100644 --- a/sycl/include/CL/sycl/detail/pi.h +++ b/sycl/include/CL/sycl/detail/pi.h @@ -285,7 +285,9 @@ typedef enum { PI_DEVICE_INFO_GPU_SLICES = 0x10023, PI_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE = 0x10024, PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE = 0x10025, - PI_DEVICE_INFO_MAX_MEM_BANDWIDTH = 0x10026 + PI_DEVICE_INFO_MAX_MEM_BANDWIDTH = 0x10026, + // These are extensions that are currently only implemented for nvidia. + PI_DEVICE_INFO_ATOMIC_64 = 0x10110 } _pi_device_info; typedef enum { diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index b3035c2e548d6..3a5fac80dcf80 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -210,6 +210,11 @@ class reduction_impl; using cl::sycl::detail::enable_if_t; using cl::sycl::detail::queue_impl; +template +enable_if_t +reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc, + const nd_range &Range, Reduction &Redu); + template enable_if_t reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range &Range, @@ -1373,6 +1378,49 @@ class __SYCL_EXPORT handler { } } + /// Implements parallel_for() accepting nd_range \p Range and one reduction + /// object. This version is a specialization for the add operator. 
+ /// It performs runtime checks for device aspect "atomic64"; if found, fast + /// sycl::atomic_ref operations are used to update the reduction at the + /// end of each work-group work. Otherwise the default implementation is + /// used. + // + // If the reduction variable must be initialized with the identity value + // before the kernel run, then an additional working accessor is created, + // initialized with the identity value and used in the kernel. That working + // accessor is then copied to user's accessor or USM pointer after + // the kernel run. + // For USM pointers without initialize_to_identity properties the same scheme + // with working accessor is used as re-using user's USM pointer in the kernel + // would require creation of another variant of user's kernel, which does not + // seem efficient. + template + detail::enable_if_t + parallel_for(nd_range Range, Reduction Redu, + _KERNELFUNCPARAM(KernelFunc)) { + + shared_ptr_class QueueCopy = MQueue; + device D = detail::getDeviceFromHandler(*this); + + if (D.has(aspect::atomic64)) { + + ONEAPI::detail::reduCGFuncAtomic64(*this, KernelFunc, Range, + Redu); + + if (Reduction::is_usm || Redu.initializeToIdentity()) { + this->finalize(); + handler CopyHandler(QueueCopy, MIsHost); + CopyHandler.saveCodeLoc(MCodeLoc); + ONEAPI::detail::reduSaveFinalResultToUserMem(CopyHandler, + Redu); + MLastEvent = CopyHandler.finalize(); + } + } else { + parallel_for_Impl(Range, Redu, KernelFunc); + } + } + /// Defines and invokes a SYCL kernel function for the specified nd_range. /// Performs reduction operation specified in \p Redu. /// @@ -1389,9 +1437,19 @@ class __SYCL_EXPORT handler { /// optimized implementations waiting for their turn of code-review. template - detail::enable_if_t + detail::enable_if_t parallel_for(nd_range Range, Reduction Redu, _KERNELFUNCPARAM(KernelFunc)) { + + parallel_for_Impl(Range, Redu, KernelFunc); + } + + template + detail::enable_if_t + parallel_for_Impl(nd_range Range, Reduction Redu, + KernelType KernelFunc) { // This parallel_for() is lowered to the following sequence: // 1) Call a kernel that a) call user's lambda function and b) performs // one iteration of reduction, storing the partial reductions/sums diff --git a/sycl/include/CL/sycl/info/device_traits.def b/sycl/include/CL/sycl/info/device_traits.def index f3d4dd6b1d1b3..c736fd6ef1cb7 100644 --- a/sycl/include/CL/sycl/info/device_traits.def +++ b/sycl/include/CL/sycl/info/device_traits.def @@ -22,6 +22,7 @@ __SYCL_PARAM_TRAITS_SPEC(device, max_clock_frequency, pi_uint32) __SYCL_PARAM_TRAITS_SPEC(device, address_bits, pi_uint32) __SYCL_PARAM_TRAITS_SPEC(device, max_mem_alloc_size, pi_uint64) __SYCL_PARAM_TRAITS_SPEC(device, image_support, bool) +__SYCL_PARAM_TRAITS_SPEC(device, atomic64, bool) __SYCL_PARAM_TRAITS_SPEC(device, max_read_image_args, pi_uint32) __SYCL_PARAM_TRAITS_SPEC(device, max_write_image_args, pi_uint32) __SYCL_PARAM_TRAITS_SPEC(device, image2d_max_width, size_t) diff --git a/sycl/include/CL/sycl/info/info_desc.hpp b/sycl/include/CL/sycl/info/info_desc.hpp index f221638c9af99..d767acb549b25 100644 --- a/sycl/include/CL/sycl/info/info_desc.hpp +++ b/sycl/include/CL/sycl/info/info_desc.hpp @@ -141,7 +141,9 @@ enum class device : cl_device_info { ext_intel_gpu_eu_count_per_subslice = PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE, ext_intel_max_mem_bandwidth = PI_DEVICE_INFO_MAX_MEM_BANDWIDTH, - ext_intel_mem_channel = PI_MEM_PROPERTIES_CHANNEL + ext_intel_mem_channel = PI_MEM_PROPERTIES_CHANNEL, + // currently only implemented for nvidia + 
atomic64 = PI_DEVICE_INFO_ATOMIC_64 }; enum class device_type : pi_uint64 { diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 9384a0ac10c5a..41cabd28db8a5 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -981,6 +981,19 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name, bool ifp = (major >= 7); return getInfo(param_value_size, param_value, param_value_size_ret, ifp); } + + case PI_DEVICE_INFO_ATOMIC_64: { + int major = 0; + cl::sycl::detail::pi::assertion( + cuDeviceGetAttribute(&major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + device->get()) == CUDA_SUCCESS); + + bool atomic64 = (major >= 6) ? true : false; + return getInfo(param_value_size, param_value, param_value_size_ret, + atomic64); + } + case PI_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { // NVIDIA devices only support one sub-group size (the warp size) int warpSize = 0; diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index 484c7cf77f1b3..a5bfa37d90dd5 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -176,6 +176,7 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, case PI_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE: case PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE: case PI_DEVICE_INFO_MAX_MEM_BANDWIDTH: + case PI_DEVICE_INFO_ATOMIC_64: return PI_INVALID_VALUE; default: diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp index 85305d397987d..f5c1603d401e3 100644 --- a/sycl/source/detail/device_impl.cpp +++ b/sycl/source/detail/device_impl.cpp @@ -239,6 +239,8 @@ bool device_impl::has(aspect Aspect) const { return has_extension("cl_khr_int64_base_atomics"); case aspect::int64_extended_atomics: return has_extension("cl_khr_int64_extended_atomics"); + case aspect::atomic64: + return get_info(); case aspect::image: return get_info(); case aspect::online_compiler: diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp index bf3eb0561ef7a..b4512241c11f0 100644 --- a/sycl/source/detail/device_info.hpp +++ b/sycl/source/detail/device_info.hpp @@ -233,6 +233,28 @@ template <> struct get_device_info { } }; +// Specialization for atomic64 that is necessary because +// PI_DEVICE_INFO_ATOMIC_64 isn't implemented for backend other than cuda. +// TODO the if-statement can be removed when the other backends support +// PI_DEVICE_INFO_ATOMIC_64. 
+template <> struct get_device_info {
+  static bool get(RT::PiDevice dev, const plugin &Plugin) {
+
+    bool result = false;
+
+    platform plt =
+        get_device_info::get(dev, Plugin);
+
+    if (plt.get_backend() == backend::cuda) {
+      Plugin.call(
+          dev, pi::cast(info::device::atomic64),
+          sizeof(result), &result, nullptr);
+    }
+
+    return (result);
+  }
+};
+
 // Specialization for exec_capabilities, OpenCL returns a bitfield
 template <>
 struct get_device_info,
@@ -613,6 +635,10 @@ template <> inline bool get_device_info_host() {
   return true;
 }
 
+template <> inline bool get_device_info_host() {
+  return false;
+}
+
 template <>
 inline cl_uint get_device_info_host() {
   // current value is the required minimum
diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump
index 626008589c654..80f7b2e4b7947 100644
--- a/sycl/test/abi/sycl_symbols_linux.dump
+++ b/sycl/test/abi/sycl_symbols_linux.dump
@@ -4103,6 +4103,7 @@ _ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65571EEENS3_12param_traitsIS4_XT_
 _ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65572EEENS3_12param_traitsIS4_XT_EE11return_typeEv
 _ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65573EEENS3_12param_traitsIS4_XT_EE11return_typeEv
 _ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65574EEENS3_12param_traitsIS4_XT_EE11return_typeEv
+_ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65808EEENS3_12param_traitsIS4_XT_EE11return_typeEv
 _ZNK2cl4sycl6device9getNativeEv
 _ZNK2cl4sycl6kernel11get_contextEv
 _ZNK2cl4sycl6kernel11get_programEv
diff --git a/sycl/test/on-device/basic_tests/aspects.cpp b/sycl/test/on-device/basic_tests/aspects.cpp
index 521914c763ee6..85589a31b4d34 100644
--- a/sycl/test/on-device/basic_tests/aspects.cpp
+++ b/sycl/test/on-device/basic_tests/aspects.cpp
@@ -57,6 +57,9 @@ int main() {
   if (plt.has(aspect::int64_extended_atomics)) {
     std::cout << " extended atomic operations" << std::endl;
   }
+  if (plt.has(aspect::atomic64)) {
+    std::cout << " atomic64" << std::endl;
+  }
   if (plt.has(aspect::image)) {
     std::cout << " images" << std::endl;
   }

From 80c85087f356bb0228ce0d95e5fc681b468e6217 Mon Sep 17 00:00:00 2001
From: JackAKirk
Date: Wed, 23 Jun 2021 10:10:14 +0100
Subject: [PATCH 02/10] cl_khr_fp64 and cl_khr_fp16 extensions are now
 connected to the cuda plugin.

This change ensures that all Reduction tests run for the cuda backend.
The cl_khr_fp64 extension is used for all cuda devices by default, since
sm_XX where XX < 13 has been unsupported by the cuda driver since cuda
8.0.

The test fp16-with-unnamed-lambda.cpp has been deleted because it has a
duplicate in the test suite (in the SYCL/Regression directory). In both
cases the triple is missing on the first line; it needs to be added to
the llvm-test-suite copy to avoid a test failure now that the test is no
longer skipped for the cuda backend.
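As context for the guards this commit unblocks: extension-gated SYCL code
checks the device's extension string, which the plugin change below now
populates. A minimal sketch (device selection and output details assumed,
not taken from the patch):

#include <CL/sycl.hpp>
#include <iostream>

int main() {
  cl::sycl::device D{cl::sycl::default_selector{}};
  // With this commit a CUDA device reports cl_khr_fp16 on sm_53 and newer,
  // so guards like this one stop skipping the body of fp16 tests.
  if (!D.has_extension("cl_khr_fp16")) {
    std::cout << "halfs unsupported, skipping\n";
    return 0;
  }
  cl::sycl::half H = 1.0f; // half arithmetic is safe past the guard
  std::cout << "cl_khr_fp16 available, half = " << float(H) << "\n";
  return 0;
}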
Signed-off-by: JackAKirk --- sycl/plugins/cuda/pi_cuda.cpp | 21 ++++++++- .../regression/fp16-with-unnamed-lambda.cpp | 43 ------------------- 2 files changed, 20 insertions(+), 44 deletions(-) delete mode 100644 sycl/test/regression/fp16-with-unnamed-lambda.cpp diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 41cabd28db8a5..ab0bf29ac2498 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -1371,7 +1371,26 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name, return getInfo(param_value_size, param_value, param_value_size_ret, ""); } case PI_DEVICE_INFO_EXTENSIONS: { - return getInfo(param_value_size, param_value, param_value_size_ret, ""); + + std::string SupportedExtensions = "cl_khr_fp64 "; + int major = 0; + int minor = 0; + + cl::sycl::detail::pi::assertion( + cuDeviceGetAttribute(&major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + device->get()) == CUDA_SUCCESS); + cl::sycl::detail::pi::assertion( + cuDeviceGetAttribute(&minor, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, + device->get()) == CUDA_SUCCESS); + + if ((major >= 6) || ((major == 5) && (minor >= 3))) { + SupportedExtensions += "cl_khr_fp16 "; + } + + return getInfo(param_value_size, param_value, param_value_size_ret, + SupportedExtensions.c_str()); } case PI_DEVICE_INFO_PRINTF_BUFFER_SIZE: { // The minimum value for the FULL profile is 1 MB. diff --git a/sycl/test/regression/fp16-with-unnamed-lambda.cpp b/sycl/test/regression/fp16-with-unnamed-lambda.cpp deleted file mode 100644 index 91f3f69bdc3bd..0000000000000 --- a/sycl/test/regression/fp16-with-unnamed-lambda.cpp +++ /dev/null @@ -1,43 +0,0 @@ -// RUN: %clangxx -fsycl -fsycl-unnamed-lambda %s -o %t.out -// RUN: %RUN_ON_HOST %t.out -#include - -#include - -int main() { - auto AsyncHandler = [](cl::sycl::exception_list EL) { - for (std::exception_ptr const &P : EL) { - try { - std::rethrow_exception(P); - } catch (std::exception const &E) { - std::cerr << "Caught async SYCL exception: " << E.what() << std::endl; - } - } - }; - - cl::sycl::queue Q(AsyncHandler); - - cl::sycl::device D = Q.get_device(); - if (!D.has_extension("cl_khr_fp16")) - return 0; // Skip the test if halfs are not supported - - cl::sycl::buffer Buf(1); - - Q.submit([&](cl::sycl::handler &CGH) { - auto Acc = Buf.get_access(CGH); - CGH.single_task([=]() { - Acc[0] = 1; - }); - }); - - Q.wait_and_throw(); - - auto Acc = Buf.get_access(); - if (1 != Acc[0]) { - std::cerr << "Incorrect result, got: " << Acc[0] - << ", expected: 1" << std::endl; - return 1; - } - - return 0; -} From b1a8a7d2bd10eda5b921389cb753e8eb3d46146f Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Wed, 23 Jun 2021 10:50:24 +0100 Subject: [PATCH 03/10] Revert "cl_khr_fp64 and cl_khr_fp16 extensions are now connected to the cuda plugin." This reverts commit 80c85087f356bb0228ce0d95e5fc681b468e6217. 
---
 sycl/plugins/cuda/pi_cuda.cpp                 | 21 +--------
 .../regression/fp16-with-unnamed-lambda.cpp   | 43 +++++++++++++++++++
 2 files changed, 44 insertions(+), 20 deletions(-)
 create mode 100644 sycl/test/regression/fp16-with-unnamed-lambda.cpp

diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp
index ab0bf29ac2498..41cabd28db8a5 100644
--- a/sycl/plugins/cuda/pi_cuda.cpp
+++ b/sycl/plugins/cuda/pi_cuda.cpp
@@ -1371,26 +1371,7 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
     return getInfo(param_value_size, param_value, param_value_size_ret, "");
   }
   case PI_DEVICE_INFO_EXTENSIONS: {
-
-    std::string SupportedExtensions = "cl_khr_fp64 ";
-    int major = 0;
-    int minor = 0;
-
-    cl::sycl::detail::pi::assertion(
-        cuDeviceGetAttribute(&major,
-                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
-                             device->get()) == CUDA_SUCCESS);
-    cl::sycl::detail::pi::assertion(
-        cuDeviceGetAttribute(&minor,
-                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
-                             device->get()) == CUDA_SUCCESS);
-
-    if ((major >= 6) || ((major == 5) && (minor >= 3))) {
-      SupportedExtensions += "cl_khr_fp16 ";
-    }
-
-    return getInfo(param_value_size, param_value, param_value_size_ret,
-                   SupportedExtensions.c_str());
+    return getInfo(param_value_size, param_value, param_value_size_ret, "");
   }
   case PI_DEVICE_INFO_PRINTF_BUFFER_SIZE: {
     // The minimum value for the FULL profile is 1 MB.
diff --git a/sycl/test/regression/fp16-with-unnamed-lambda.cpp b/sycl/test/regression/fp16-with-unnamed-lambda.cpp
new file mode 100644
index 0000000000000..91f3f69bdc3bd
--- /dev/null
+++ b/sycl/test/regression/fp16-with-unnamed-lambda.cpp
@@ -0,0 +1,43 @@
+// RUN: %clangxx -fsycl -fsycl-unnamed-lambda %s -o %t.out
+// RUN: %RUN_ON_HOST %t.out
+#include <CL/sycl.hpp>
+
+#include <iostream>
+
+int main() {
+  auto AsyncHandler = [](cl::sycl::exception_list EL) {
+    for (std::exception_ptr const &P : EL) {
+      try {
+        std::rethrow_exception(P);
+      } catch (std::exception const &E) {
+        std::cerr << "Caught async SYCL exception: " << E.what() << std::endl;
+      }
+    }
+  };
+
+  cl::sycl::queue Q(AsyncHandler);
+
+  cl::sycl::device D = Q.get_device();
+  if (!D.has_extension("cl_khr_fp16"))
+    return 0; // Skip the test if halfs are not supported
+
+  cl::sycl::buffer<cl::sycl::half> Buf(1);
+
+  Q.submit([&](cl::sycl::handler &CGH) {
+    auto Acc = Buf.get_access<cl::sycl::access::mode::write>(CGH);
+    CGH.single_task([=]() {
+      Acc[0] = 1;
+    });
+  });
+
+  Q.wait_and_throw();
+
+  auto Acc = Buf.get_access<cl::sycl::access::mode::read>();
+  if (1 != Acc[0]) {
+    std::cerr << "Incorrect result, got: " << Acc[0]
+              << ", expected: 1" << std::endl;
+    return 1;
+  }
+
+  return 0;
+}

From 70a2c54595b3f5bae9a8313c9095fcc7e75736b2 Mon Sep 17 00:00:00 2001
From: JackAKirk
Date: Wed, 23 Jun 2021 11:15:39 +0100
Subject: [PATCH 04/10] The cl_khr_fp64 extension is now connected to the cuda
 plugin.

This change allows a skipped fp64 Reduction test in llvm-test-suite
(reduction_nd_ext_double.cpp) to run for the cuda backend. The
cl_khr_fp64 extension is used for all cuda devices by default, since
sm_XX where XX < 13 has been unsupported by the cuda driver since cuda
8.0.
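The shape of the test this unskips is, roughly, a double-typed add
reduction through the ONEAPI interface. A hedged sketch, not the
llvm-test-suite source; the names and the exact reduction overload of
this era are assumptions:

#include <CL/sycl.hpp>

int main() {
  cl::sycl::queue Q;
  constexpr size_t N = 1024, WG = 128;
  double Sum = 0.0;
  {
    cl::sycl::buffer<double> SumBuf(&Sum, 1);
    Q.submit([&](cl::sycl::handler &CGH) {
      auto Acc = SumBuf.get_access<cl::sycl::access::mode::read_write>(CGH);
      // An fp64 '+' reduction: exactly the case the atomic64 fast path of
      // PATCH 01 targets, and the case cl_khr_fp64 must be reported for.
      CGH.parallel_for<class Fp64Redu>(
          cl::sycl::nd_range<1>{N, WG},
          cl::sycl::ONEAPI::reduction(Acc, cl::sycl::ONEAPI::plus<double>()),
          [=](cl::sycl::nd_item<1>, auto &Red) { Red += 1.0; });
    });
  }
  return Sum == double(N) ? 0 : 1;
}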
Signed-off-by: JackAKirk
---
 sycl/plugins/cuda/pi_cuda.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp
index 41cabd28db8a5..aa1f7d6914eb2 100644
--- a/sycl/plugins/cuda/pi_cuda.cpp
+++ b/sycl/plugins/cuda/pi_cuda.cpp
@@ -1371,7 +1371,11 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
     return getInfo(param_value_size, param_value, param_value_size_ret, "");
   }
   case PI_DEVICE_INFO_EXTENSIONS: {
-    return getInfo(param_value_size, param_value, param_value_size_ret, "");
+
+    std::string SupportedExtensions = "cl_khr_fp64 ";
+
+    return getInfo(param_value_size, param_value, param_value_size_ret,
+                   SupportedExtensions.c_str());
   }
   case PI_DEVICE_INFO_PRINTF_BUFFER_SIZE: {
     // The minimum value for the FULL profile is 1 MB.

From ab8a60000ca5b1430a69e3ff22a44c7d1d6a44b1 Mon Sep 17 00:00:00 2001
From: JackAKirk
Date: Thu, 24 Jun 2021 11:37:52 +0100
Subject: [PATCH 05/10] Renamed has_atomic_add_float to
 has_atomic_add_float64.

Renamed has_atomic_add_float to has_atomic_add_float64, since the general
usage that includes float32 is only expected to be temporary.
has_atomic_add_float64 is an alias of IsReduOptForAtomic64Add. Updated
the documentation describing the current temporary usage of fp32 within
IsReduOptForAtomic64Add.

IsReduOptForFastFloatAtomicAdd has been renamed IsReduOptForAtomic64Add
to make clear that this trait should only be used when the device has the
SYCL 2020 atomic64 aspect, consistent with the naming convention used in
other functions that are specializations for the atomic64 aspect.

Signed-off-by: JackAKirk
---
 sycl/include/CL/sycl/ONEAPI/reduction.hpp | 46 +++++++++++++++--------
 sycl/include/CL/sycl/handler.hpp          |  6 +--
 2 files changed, 33 insertions(+), 19 deletions(-)

diff --git a/sycl/include/CL/sycl/ONEAPI/reduction.hpp b/sycl/include/CL/sycl/ONEAPI/reduction.hpp
index 9284f4f6964f0..c6311c05afcdc 100644
--- a/sycl/include/CL/sycl/ONEAPI/reduction.hpp
+++ b/sycl/include/CL/sycl/ONEAPI/reduction.hpp
@@ -53,14 +53,20 @@ using IsReduOptForFastAtomicFetch =
       sycl::detail::IsBitAND::value)>;
 #endif
 
-// This type trait is used to detect if the group algorithm reduce() used with
-// operands of the type T and the operation Plus is available
-// for using in reduction. Note that this type trait is a subset of
+// This type trait is used to detect if the atomic operation BinaryOperation
+// used with operands of the type T is available for using in reduction, in
+// addition to the cases covered by "IsReduOptForFastAtomicFetch", if the device
+// has the atomic64 aspect. This type trait should only be used if the device
+// has the atomic64 aspect. Note that this type trait is currently a subset of
 // IsReduOptForFastReduce. The macro SYCL_REDUCTION_DETERMINISTIC prohibits
-// using the reduce() algorithm to produce stable results across same type
-// devices.
+// using the reduce_over_group() algorithm to produce stable results across same
+// type devices.
+// TODO 32 bit floating point atomics are eventually expected to be supported by
+// the has_fast_atomics specialization. Once the reducer class is updated to
+// replace the deprecated atomic class with atomic_ref, the (sizeof(T) == 4)
+// case should be removed here and replaced in IsReduOptForFastAtomicFetch.
template -using IsReduOptForFastFloatAtomicAdd = +using IsReduOptForAtomic64Add = #ifdef SYCL_REDUCTION_DETERMINISTIC bool_constant; #else @@ -307,7 +313,7 @@ class reducer enable_if_t::type, T>::value && - IsReduOptForFastFloatAtomicAdd::value> + IsReduOptForAtomic64Add::value> atomic_combine(_T *ReduVarPtr) const { atomic_ref; - static constexpr bool has_atomic_add_float = - IsReduOptForFastFloatAtomicAdd::value; + static constexpr bool has_atomic_add_float64 = + IsReduOptForAtomic64Add::value; static constexpr bool has_fast_atomics = IsReduOptForFastAtomicFetch::value; static constexpr bool has_fast_reduce = @@ -667,8 +673,8 @@ class reduction_impl : private reduction_impl_base { /// accessor. Otherwise, create 1-element global buffer initialized with /// identity value and return an accessor to that buffer. - template - std::enable_if_t + template + std::enable_if_t getReadWriteAccessorToInitializedMem(handler &CGH) { if (!is_usm && !initializeToIdentity()) return *MRWAcc; @@ -1499,15 +1505,19 @@ void reduCGFunc(handler &CGH, KernelType KernelFunc, } // Specialization for devices with the atomic64 aspect, which guarantees 64 (and -// 32) bit floating point support for atomic add. +// temporarily 32) bit floating point support for atomic add. +// TODO 32 bit floating point atomics are eventually expected to be supported by +// the has_fast_atomics specialization. Corresponding changes to +// IsReduOptForAtomic64Add, as prescribed in its documentation, should then also +// be made. template -std::enable_if_t +std::enable_if_t reduCGFuncImplAtomic64(handler &CGH, KernelType KernelFunc, const nd_range &Range, Reduction &, typename Reduction::rw_accessor_type Out) { using Name = typename get_reduction_main_kernel_name_t< KernelName, KernelType, Reduction::is_usm, - Reduction::has_atomic_add_float, + Reduction::has_atomic_add_float64, typename Reduction::rw_accessor_type>::name; CGH.parallel_for(Range, [=](nd_item NDIt) { // Call user's function. Reducer.MValue gets initialized there. @@ -1523,9 +1533,13 @@ reduCGFuncImplAtomic64(handler &CGH, KernelType KernelFunc, } // Specialization for devices with the atomic64 aspect, which guarantees 64 (and -// 32) bit floating point support for atomic add. +// temporarily 32) bit floating point support for atomic add. +// TODO 32 bit floating point atomics are eventually expected to be supported by +// the has_fast_atomics specialization. Corresponding changes to +// IsReduOptForAtomic64Add, as prescribed in its documentation, should then also +// be made. template -enable_if_t +enable_if_t reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc, const nd_range &Range, Reduction &Redu) { diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index 3a5fac80dcf80..2ed3b171475dd 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -211,7 +211,7 @@ using cl::sycl::detail::enable_if_t; using cl::sycl::detail::queue_impl; template -enable_if_t +enable_if_t reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc, const nd_range &Range, Reduction &Redu); @@ -1396,7 +1396,7 @@ class __SYCL_EXPORT handler { // seem efficient. 
template - detail::enable_if_t + detail::enable_if_t parallel_for(nd_range Range, Reduction Redu, _KERNELFUNCPARAM(KernelFunc)) { @@ -1438,7 +1438,7 @@ class __SYCL_EXPORT handler { template detail::enable_if_t + !Reduction::has_atomic_add_float64> parallel_for(nd_range Range, Reduction Redu, _KERNELFUNCPARAM(KernelFunc)) { From 49ec72af5861fc4bca7c421293c1557bf7ae684a Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Mon, 28 Jun 2021 14:00:46 +0100 Subject: [PATCH 06/10] PI_DEVICE_INFO_ATOMIC_64 is now checked for all backends when get_info() is called. If the PI returns the PI_INVALID_VALUE error then atomic64 is set false and the user is informed that the atomic64 aspect status is unknown for the device. Signed-off-by: JackAKirk --- sycl/source/detail/device_info.hpp | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp index b4512241c11f0..b6177c917bbfe 100644 --- a/sycl/source/detail/device_info.hpp +++ b/sycl/source/detail/device_info.hpp @@ -234,24 +234,23 @@ template <> struct get_device_info { }; // Specialization for atomic64 that is necessary because -// PI_DEVICE_INFO_ATOMIC_64 isn't implemented for backend other than cuda. -// TODO the if-statement can be removed when the other backends support -// PI_DEVICE_INFO_ATOMIC_64. +// PI_DEVICE_INFO_ATOMIC_64 is currently only implemented for the cuda backend. template <> struct get_device_info { static bool get(RT::PiDevice dev, const plugin &Plugin) { bool result = false; - platform plt = - get_device_info::get(dev, Plugin); - - if (plt.get_backend() == backend::cuda) { - Plugin.call( - dev, pi::cast(info::device::atomic64), - sizeof(result), &result, nullptr); + RT::PiResult Err = Plugin.call_nocheck( + dev, pi::cast(info::device::atomic64), sizeof(result), + &result, nullptr); + if (Err == PI_INVALID_VALUE) { + std::cout + << "The Plugin Interface has returned an error:\n The value of " + "the atomic64 device aspect is unknown, Setting atomic64 " + "= false.\n\n"; } - return (result); + return result; } }; From b4bde0ccd3c4d44aed628d3ea2df338e9565f384 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Tue, 29 Jun 2021 09:42:32 +0100 Subject: [PATCH 07/10] Removed unnecessary screen print out. Plugin.call_nocheck casts result to false if PI_SUCCESS was not returned. Signed-off-by: JackAKirk --- sycl/source/detail/device_info.hpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp index b6177c917bbfe..e718d17eea463 100644 --- a/sycl/source/detail/device_info.hpp +++ b/sycl/source/detail/device_info.hpp @@ -240,15 +240,9 @@ template <> struct get_device_info { bool result = false; - RT::PiResult Err = Plugin.call_nocheck( + Plugin.call_nocheck( dev, pi::cast(info::device::atomic64), sizeof(result), &result, nullptr); - if (Err == PI_INVALID_VALUE) { - std::cout - << "The Plugin Interface has returned an error:\n The value of " - "the atomic64 device aspect is unknown, Setting atomic64 " - "= false.\n\n"; - } return result; } From 6a0f94897f0c3a1751de959553559ce2018ec0af Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Tue, 29 Jun 2021 16:53:59 +0100 Subject: [PATCH 08/10] Return false if Err == PI_INVALID_VALUE. 
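The end state that commits 06 through 09 converge on is a non-throwing
plugin query with a safe default. Schematically, using the runtime's
internal names (the piDeviceGetInfo template argument shown here is an
assumption about the call_nocheck spelling, not quoted from the patch):

// Sketch: tolerate plugins that do not implement PI_DEVICE_INFO_ATOMIC_64.
static bool queryAtomic64(RT::PiDevice Dev, const plugin &Plugin) {
  bool Result = false;
  RT::PiResult Err = Plugin.call_nocheck<PiApiKind::piDeviceGetInfo>(
      Dev, pi::cast<RT::PiDeviceInfo>(info::device::atomic64),
      sizeof(Result), &Result, nullptr);
  // Any failure (e.g. PI_INVALID_VALUE from the OpenCL plugin) is treated
  // as "aspect not supported" rather than as a hard error.
  return (Err == PI_SUCCESS) ? Result : false;
}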
Signed-off-by: JackAKirk --- sycl/source/detail/device_info.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp index e718d17eea463..0bc90f3c05c6d 100644 --- a/sycl/source/detail/device_info.hpp +++ b/sycl/source/detail/device_info.hpp @@ -240,10 +240,12 @@ template <> struct get_device_info { bool result = false; - Plugin.call_nocheck( + RT::PiResult Err = Plugin.call_nocheck( dev, pi::cast(info::device::atomic64), sizeof(result), &result, nullptr); - + if (Err == PI_INVALID_VALUE) { + return false; + } return result; } }; From 24ee26ed66a8fdd096a3b97b77d9ebf949a0bd9a Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Tue, 29 Jun 2021 17:09:08 +0100 Subject: [PATCH 09/10] Swapped (Err == PI_INVALID_VALUE) with (Err != PI_SUCCESS). Signed-off-by: JackAKirk --- sycl/source/detail/device_info.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp index 0bc90f3c05c6d..1ebccdc5b577e 100644 --- a/sycl/source/detail/device_info.hpp +++ b/sycl/source/detail/device_info.hpp @@ -243,7 +243,7 @@ template <> struct get_device_info { RT::PiResult Err = Plugin.call_nocheck( dev, pi::cast(info::device::atomic64), sizeof(result), &result, nullptr); - if (Err == PI_INVALID_VALUE) { + if (Err != PI_SUCCESS) { return false; } return result; From 471ccde14501d29b26ed402d173f1a2394b82f4a Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Wed, 30 Jun 2021 08:21:20 +0300 Subject: [PATCH 10/10] Apply suggestions from code review Remove comments. --- sycl/include/CL/sycl/detail/pi.h | 1 - sycl/include/CL/sycl/info/info_desc.hpp | 1 - 2 files changed, 2 deletions(-) diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h index 49b1c0b67928b..5f2735add28c5 100644 --- a/sycl/include/CL/sycl/detail/pi.h +++ b/sycl/include/CL/sycl/detail/pi.h @@ -286,7 +286,6 @@ typedef enum { PI_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE = 0x10024, PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE = 0x10025, PI_DEVICE_INFO_MAX_MEM_BANDWIDTH = 0x10026, - // These are extensions that are currently only implemented for nvidia. PI_DEVICE_INFO_ATOMIC_64 = 0x10110 } _pi_device_info; diff --git a/sycl/include/CL/sycl/info/info_desc.hpp b/sycl/include/CL/sycl/info/info_desc.hpp index d767acb549b25..ce8037df60420 100644 --- a/sycl/include/CL/sycl/info/info_desc.hpp +++ b/sycl/include/CL/sycl/info/info_desc.hpp @@ -142,7 +142,6 @@ enum class device : cl_device_info { PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE, ext_intel_max_mem_bandwidth = PI_DEVICE_INFO_MAX_MEM_BANDWIDTH, ext_intel_mem_channel = PI_MEM_PROPERTIES_CHANNEL, - // currently only implemented for nvidia atomic64 = PI_DEVICE_INFO_ATOMIC_64 };
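Taken together, the series gives the following user-level check meaning on
the cuda backend: devices of compute capability 6.0 or higher report the
SYCL 2020 atomic64 aspect, and fp64 add reductions submitted to them take
the atomic_ref fast path in handler::parallel_for. A short usage sketch:

#include <CL/sycl.hpp>
#include <iostream>

int main() {
  cl::sycl::device D{cl::sycl::default_selector{}};
  if (D.has(cl::sycl::aspect::atomic64))
    std::cout << "atomic64: fp64 reductions may use atomic_ref.fetch_add\n";
  else
    std::cout << "no atomic64: default (non-atomic) reduction path\n";
  return 0;
}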