diff --git a/sycl/include/CL/sycl/ONEAPI/reduction.hpp b/sycl/include/CL/sycl/ONEAPI/reduction.hpp
index 9f599d72b0d4c..c6311c05afcdc 100644
--- a/sycl/include/CL/sycl/ONEAPI/reduction.hpp
+++ b/sycl/include/CL/sycl/ONEAPI/reduction.hpp
@@ -53,6 +53,28 @@ using IsReduOptForFastAtomicFetch =
                   sycl::detail::IsBitAND<T, BinaryOperation>::value)>;
 #endif
 
+// This type trait is used to detect if the atomic operation BinaryOperation
+// used with operands of the type T is available for use in reduction, in
+// addition to the cases covered by "IsReduOptForFastAtomicFetch", if the
+// device has the atomic64 aspect. This type trait should only be used if the
+// device has the atomic64 aspect. Note that this type trait is currently a
+// subset of IsReduOptForFastReduce. The macro SYCL_REDUCTION_DETERMINISTIC
+// prohibits using the reduce_over_group() algorithm, so that results are
+// stable across devices of the same type.
+// TODO: 32-bit floating-point atomics are eventually expected to be supported
+// by the has_fast_atomics specialization. Once the reducer class is updated to
+// replace the deprecated atomic class with atomic_ref, the (sizeof(T) == 4)
+// case should be removed here and handled in IsReduOptForFastAtomicFetch.
+template <typename T, class BinaryOperation>
+using IsReduOptForAtomic64Add =
+#ifdef SYCL_REDUCTION_DETERMINISTIC
+    bool_constant<false>;
+#else
+    bool_constant<sycl::detail::IsPlus<T, BinaryOperation>::value &&
+                  sycl::detail::is_sgenfloat<T>::value &&
+                  (sizeof(T) == 4 || sizeof(T) == 8)>;
+#endif
+
 // This type trait is used to detect if the group algorithm reduce() used with
 // operands of the type T and the operation BinaryOperation is available
 // for using in reduction.
@@ -288,6 +310,18 @@ class reducer
+  template <typename _T = T, class _BinaryOperation = BinaryOperation>
+  enable_if_t<std::is_same<typename remove_AS<_T>::type, T>::value &&
+              IsReduOptForAtomic64Add<T, _BinaryOperation>::value>
+  atomic_combine(_T *ReduVarPtr) const {
+
+    atomic_ref<T, memory_order::relaxed, memory_scope::device,
+               access::address_space::global_space>(
+        *global_ptr<T>(ReduVarPtr)) += MValue;
+  }
+
   T MValue;
 };
 
@@ -330,6 +364,8 @@ class reduction_impl : private reduction_impl_base {
   using local_accessor_type =
       accessor<T, buffer_dim, access::mode::read_write, access::target::local>;
 
+  static constexpr bool has_atomic_add_float64 =
+      IsReduOptForAtomic64Add<T, BinaryOperation>::value;
   static constexpr bool has_fast_atomics =
       IsReduOptForFastAtomicFetch<T, BinaryOperation>::value;
   static constexpr bool has_fast_reduce =
@@ -636,7 +672,8 @@ class reduction_impl : private reduction_impl_base {
   /// require initialization with identity value, then return user's read-write
   /// accessor. Otherwise, create 1-element global buffer initialized with
   /// identity value and return an accessor to that buffer.
-  template <bool HasFastAtomics = has_fast_atomics>
+
+  template <bool HasFastAtomics = (has_fast_atomics || has_atomic_add_float64)>
   std::enable_if_t<HasFastAtomics, rw_accessor_type>
   getReadWriteAccessorToInitializedMem(handler &CGH) {
     if (!is_usm && !initializeToIdentity())
@@ -1467,6 +1504,50 @@ void reduCGFunc(handler &CGH, KernelType KernelFunc,
   }
 }
 
+// Specialization for devices with the atomic64 aspect, which guarantees 64-bit
+// (and, temporarily, 32-bit) floating-point atomic add support.
+// TODO: 32-bit floating-point atomics are eventually expected to be supported
+// by the has_fast_atomics specialization. Corresponding changes to
+// IsReduOptForAtomic64Add, as prescribed in its documentation, should then
+// also be made.
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
+std::enable_if_t<Reduction::has_atomic_add_float64>
+reduCGFuncImplAtomic64(handler &CGH, KernelType KernelFunc,
+                       const nd_range<Dims> &Range, Reduction &,
+                       typename Reduction::rw_accessor_type Out) {
+  using Name = typename get_reduction_main_kernel_name_t<
+      KernelName, KernelType, Reduction::is_usm,
+      Reduction::has_atomic_add_float64,
+      typename Reduction::rw_accessor_type>::name;
+  CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
+    // Call user's function. Reducer.MValue gets initialized there.
+    typename Reduction::reducer_type Reducer;
+    KernelFunc(NDIt, Reducer);
+
+    typename Reduction::binary_operation BOp;
+    Reducer.MValue = reduce_over_group(NDIt.get_group(), Reducer.MValue, BOp);
+    if (NDIt.get_local_linear_id() == 0) {
+      Reducer.atomic_combine(Reduction::getOutPointer(Out));
+    }
+  });
+}
+
+// Specialization for devices with the atomic64 aspect, which guarantees 64-bit
+// (and, temporarily, 32-bit) floating-point atomic add support.
+// TODO: 32-bit floating-point atomics are eventually expected to be supported
+// by the has_fast_atomics specialization. Corresponding changes to
+// IsReduOptForAtomic64Add, as prescribed in its documentation, should then
+// also be made.
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
+enable_if_t<Reduction::has_atomic_add_float64>
+reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc,
+                   const nd_range<Dims> &Range, Reduction &Redu) {
+
+  auto Out = Redu.getReadWriteAccessorToInitializedMem(CGH);
+  reduCGFuncImplAtomic64<KernelName, KernelType, Dims, Reduction>(
+      CGH, KernelFunc, Range, Redu, Out);
+}
+
 inline void associateReduAccsWithHandlerHelper(handler &) {}
 
 template <typename ReductionT>
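Note on the trait added above: it admits only sum reductions over 32- and 64-bit floating-point operands, and SYCL_REDUCTION_DETERMINISTIC switches it off wholesale. Here is the selection logic reduced to a standalone sketch with plain <type_traits> stand-ins; plus_like/mult_like are hypothetical placeholders for sycl::detail::IsPlus, and std::is_floating_point only approximates is_sgenfloat:

    #include <type_traits>

    // Hypothetical stand-ins for the sycl::detail function-object traits.
    template <typename T> struct plus_like {};
    template <typename T> struct mult_like {};

    template <typename T, class BinaryOperation>
    using IsAtomic64AddCandidate = std::integral_constant<
        bool,
        std::is_same<BinaryOperation, plus_like<T>>::value && // ~IsPlus
            std::is_floating_point<T>::value &&               // ~is_sgenfloat
            (sizeof(T) == 4 || sizeof(T) == 8)>;              // float/double

    // float and double sums qualify; integer sums and float products do not.
    static_assert(IsAtomic64AddCandidate<float, plus_like<float>>::value, "");
    static_assert(IsAtomic64AddCandidate<double, plus_like<double>>::value, "");
    static_assert(!IsAtomic64AddCandidate<int, plus_like<int>>::value, "");
    static_assert(!IsAtomic64AddCandidate<float, mult_like<float>>::value, "");

    int main() { return 0; }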
diff --git a/sycl/include/CL/sycl/aspects.hpp b/sycl/include/CL/sycl/aspects.hpp
index 24513ed3e515d..fbc13e967cc96 100644
--- a/sycl/include/CL/sycl/aspects.hpp
+++ b/sycl/include/CL/sycl/aspects.hpp
@@ -41,6 +41,7 @@ enum class aspect {
   ext_intel_mem_channel = 25,
   usm_atomic_host_allocations = 26,
   usm_atomic_shared_allocations = 27,
+  atomic64 = 28
 };
 
 } // namespace sycl
diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h
index 393b5964568ff..5f2735add28c5 100644
--- a/sycl/include/CL/sycl/detail/pi.h
+++ b/sycl/include/CL/sycl/detail/pi.h
@@ -285,7 +285,8 @@ typedef enum {
   PI_DEVICE_INFO_GPU_SLICES = 0x10023,
   PI_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE = 0x10024,
   PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE = 0x10025,
-  PI_DEVICE_INFO_MAX_MEM_BANDWIDTH = 0x10026
+  PI_DEVICE_INFO_MAX_MEM_BANDWIDTH = 0x10026,
+  PI_DEVICE_INFO_ATOMIC_64 = 0x10110
 } _pi_device_info;
 
 typedef enum {
diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp
index ad9b003c51a3f..352202bc84194 100644
--- a/sycl/include/CL/sycl/handler.hpp
+++ b/sycl/include/CL/sycl/handler.hpp
@@ -210,6 +210,11 @@ class reduction_impl;
 using cl::sycl::detail::enable_if_t;
 using cl::sycl::detail::queue_impl;
 
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
+enable_if_t<Reduction::has_atomic_add_float64>
+reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc,
+                   const nd_range<Dims> &Range, Reduction &Redu);
+
 template <typename KernelName, typename KernelType, int Dims, class Reduction>
 enable_if_t<Reduction::has_fast_atomics>
 reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
@@ -1382,6 +1387,49 @@ class __SYCL_EXPORT handler {
     }
   }
 
+  /// Implements parallel_for() accepting nd_range \p Range and one reduction
+  /// object. This version is a specialization for the add operator.
+  /// It performs a runtime check for the device aspect "atomic64"; if the
+  /// aspect is present, fast sycl::atomic_ref operations are used to update
+  /// the reduction variable at the end of each work-group's work. Otherwise
+  /// the default implementation is used.
+  //
+  // If the reduction variable must be initialized with the identity value
+  // before the kernel runs, then an additional working accessor is created,
+  // initialized with the identity value and used in the kernel. That working
+  // accessor is then copied to the user's accessor or USM pointer after
+  // the kernel runs.
+  // For USM pointers without the initialize_to_identity property the same
+  // working-accessor scheme is used, as re-using the user's USM pointer in
+  // the kernel would require creating another variant of the user's kernel,
+  // which does not seem efficient.
+  template <typename KernelName = detail::auto_name, typename KernelType,
+            int Dims, typename Reduction>
+  detail::enable_if_t<Reduction::has_atomic_add_float64>
+  parallel_for(nd_range<Dims> Range, Reduction Redu,
+               _KERNELFUNCPARAM(KernelFunc)) {
+
+    shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
+    device D = detail::getDeviceFromHandler(*this);
+
+    if (D.has(aspect::atomic64)) {
+
+      ONEAPI::detail::reduCGFuncAtomic64<KernelName>(*this, KernelFunc, Range,
+                                                     Redu);
+
+      if (Reduction::is_usm || Redu.initializeToIdentity()) {
+        this->finalize();
+        handler CopyHandler(QueueCopy, MIsHost);
+        CopyHandler.saveCodeLoc(MCodeLoc);
+        ONEAPI::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
+                                                                 Redu);
+        MLastEvent = CopyHandler.finalize();
+      }
+    } else {
+      parallel_for_Impl<KernelName>(Range, Redu, KernelFunc);
+    }
+  }
+
   /// Defines and invokes a SYCL kernel function for the specified nd_range.
   /// Performs reduction operation specified in \p Redu.
   ///
@@ -1398,9 +1446,19 @@ class __SYCL_EXPORT handler {
   /// optimized implementations waiting for their turn of code-review.
   template <typename KernelName = detail::auto_name, typename KernelType,
             int Dims, typename Reduction>
-  detail::enable_if_t<!Reduction::has_fast_atomics>
+  detail::enable_if_t<!Reduction::has_fast_atomics &&
+                      !Reduction::has_atomic_add_float64>
   parallel_for(nd_range<Dims> Range, Reduction Redu,
               _KERNELFUNCPARAM(KernelFunc)) {
+
+    parallel_for_Impl<KernelName>(Range, Redu, KernelFunc);
+  }
+
+  template <typename KernelName, typename KernelType, int Dims,
+            typename Reduction>
+  detail::enable_if_t<!Reduction::has_fast_atomics>
+  parallel_for_Impl(nd_range<Dims> Range, Reduction Redu,
+                    KernelType KernelFunc) {
     // This parallel_for() is lowered to the following sequence:
     // 1) Call a kernel that a) call user's lambda function and b) performs
     //    one iteration of reduction, storing the partial reductions/sums
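To see the new overload from the user's side: a float "plus" reduction over an nd_range resolves to the has_atomic_add_float64 specialization above, takes the reduCGFuncAtomic64 path when the device reports aspect::atomic64, and falls back to parallel_for_Impl otherwise. A hedged usage sketch against the CL/sycl + ONEAPI API of this era of the tree (the sizes and the work-group size of 64 are arbitrary demo choices, and the unnamed kernel lambdas assume -fsycl-unnamed-lambda):

    #include <CL/sycl.hpp>
    #include <iostream>

    int main() {
      namespace s = cl::sycl;
      constexpr size_t N = 1024;
      float Result = 0.0f;
      {
        s::queue Q;
        s::buffer<float, 1> InBuf{s::range<1>{N}};
        s::buffer<float, 1> OutBuf{&Result, s::range<1>{1}};

        // Fill the input with ones.
        Q.submit([&](s::handler &CGH) {
          auto In = InBuf.get_access<s::access::mode::write>(CGH);
          CGH.parallel_for(s::range<1>{N}, [=](s::id<1> I) { In[I] = 1.0f; });
        });

        // Sum-reduce; on an atomic64 device this uses the atomic_ref path.
        Q.submit([&](s::handler &CGH) {
          auto In = InBuf.get_access<s::access::mode::read>(CGH);
          auto Out = OutBuf.get_access<s::access::mode::read_write>(CGH);
          CGH.parallel_for(
              s::nd_range<1>{s::range<1>{N}, s::range<1>{64}},
              s::ONEAPI::reduction(Out, s::ONEAPI::plus<float>()),
              [=](s::nd_item<1> It, auto &Sum) {
                Sum += In[It.get_global_linear_id()];
              });
        });
      } // buffer destruction writes the result back to Result
      std::cout << "sum = " << Result << "\n"; // expected: 1024
      return 0;
    }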
diff --git a/sycl/include/CL/sycl/info/device_traits.def b/sycl/include/CL/sycl/info/device_traits.def
index f3d4dd6b1d1b3..c736fd6ef1cb7 100644
--- a/sycl/include/CL/sycl/info/device_traits.def
+++ b/sycl/include/CL/sycl/info/device_traits.def
@@ -22,6 +22,7 @@ __SYCL_PARAM_TRAITS_SPEC(device, max_clock_frequency, pi_uint32)
 __SYCL_PARAM_TRAITS_SPEC(device, address_bits, pi_uint32)
 __SYCL_PARAM_TRAITS_SPEC(device, max_mem_alloc_size, pi_uint64)
 __SYCL_PARAM_TRAITS_SPEC(device, image_support, bool)
+__SYCL_PARAM_TRAITS_SPEC(device, atomic64, bool)
 __SYCL_PARAM_TRAITS_SPEC(device, max_read_image_args, pi_uint32)
 __SYCL_PARAM_TRAITS_SPEC(device, max_write_image_args, pi_uint32)
 __SYCL_PARAM_TRAITS_SPEC(device, image2d_max_width, size_t)
diff --git a/sycl/include/CL/sycl/info/info_desc.hpp b/sycl/include/CL/sycl/info/info_desc.hpp
index f221638c9af99..ce8037df60420 100644
--- a/sycl/include/CL/sycl/info/info_desc.hpp
+++ b/sycl/include/CL/sycl/info/info_desc.hpp
@@ -141,7 +141,8 @@ enum class device : cl_device_info {
   ext_intel_gpu_eu_count_per_subslice =
       PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE,
   ext_intel_max_mem_bandwidth = PI_DEVICE_INFO_MAX_MEM_BANDWIDTH,
-  ext_intel_mem_channel = PI_MEM_PROPERTIES_CHANNEL
+  ext_intel_mem_channel = PI_MEM_PROPERTIES_CHANNEL,
+  atomic64 = PI_DEVICE_INFO_ATOMIC_64
 };
 
 enum class device_type : pi_uint64 {
diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp
index 1c8daf6e4b835..ec49dc3f04ce3 100644
--- a/sycl/plugins/cuda/pi_cuda.cpp
+++ b/sycl/plugins/cuda/pi_cuda.cpp
@@ -985,6 +985,19 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
     bool ifp = (major >= 7);
     return getInfo(param_value_size, param_value, param_value_size_ret, ifp);
   }
+
+  case PI_DEVICE_INFO_ATOMIC_64: {
+    int major = 0;
+    cl::sycl::detail::pi::assertion(
+        cuDeviceGetAttribute(&major,
+                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
+                             device->get()) == CUDA_SUCCESS);
+
+    bool atomic64 = (major >= 6);
+    return getInfo(param_value_size, param_value, param_value_size_ret,
+                   atomic64);
+  }
+
   case PI_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: {
     // NVIDIA devices only support one sub-group size (the warp size)
     int warpSize = 0;
@@ -1362,7 +1375,11 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
     return getInfo(param_value_size, param_value, param_value_size_ret, "");
   }
   case PI_DEVICE_INFO_EXTENSIONS: {
-    return getInfo(param_value_size, param_value, param_value_size_ret, "");
+
+    std::string SupportedExtensions = "cl_khr_fp64 ";
+
+    return getInfo(param_value_size, param_value, param_value_size_ret,
+                   SupportedExtensions.c_str());
   }
   case PI_DEVICE_INFO_PRINTF_BUFFER_SIZE: {
     // The minimum value for the FULL profile is 1 MB.
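The "major >= 6" threshold reflects the hardware: native double-precision atomicAdd first appeared with CUDA compute capability 6.0 (Pascal). For reference, the same query the plugin performs, as a standalone CUDA driver API sketch (device ordinal 0 is an arbitrary choice; link against the CUDA driver library):

    #include <cuda.h>
    #include <cstdio>

    int main() {
      if (cuInit(0) != CUDA_SUCCESS)
        return 1;
      CUdevice Dev;
      if (cuDeviceGet(&Dev, 0) != CUDA_SUCCESS)
        return 1;
      int Major = 0;
      // Same attribute the plugin reads in cuda_piDeviceGetInfo above.
      cuDeviceGetAttribute(&Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                           Dev);
      std::printf("compute capability %d.x -> atomic64 %s\n", Major,
                  Major >= 6 ? "supported" : "unsupported");
      return 0;
    }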
diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp
index f1a074b3fc65d..61be57c0fc745 100644
--- a/sycl/plugins/opencl/pi_opencl.cpp
+++ b/sycl/plugins/opencl/pi_opencl.cpp
@@ -178,6 +178,7 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName,
   case PI_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE:
   case PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE:
   case PI_DEVICE_INFO_MAX_MEM_BANDWIDTH:
+  case PI_DEVICE_INFO_ATOMIC_64:
     return PI_INVALID_VALUE;
 
   default:
diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp
index 172c19be83344..1de46ece3ca9e 100644
--- a/sycl/source/detail/device_impl.cpp
+++ b/sycl/source/detail/device_impl.cpp
@@ -239,6 +239,8 @@ bool device_impl::has(aspect Aspect) const {
     return has_extension("cl_khr_int64_base_atomics");
   case aspect::int64_extended_atomics:
     return has_extension("cl_khr_int64_extended_atomics");
+  case aspect::atomic64:
+    return get_info<info::device::atomic64>();
   case aspect::image:
     return get_info<info::device::image_support>();
   case aspect::online_compiler:
diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp
index bf3eb0561ef7a..1ebccdc5b577e 100644
--- a/sycl/source/detail/device_info.hpp
+++ b/sycl/source/detail/device_info.hpp
@@ -233,6 +233,23 @@ template <> struct get_device_info
   }
 };
 
+// Specialization for atomic64 that is necessary because
+// PI_DEVICE_INFO_ATOMIC_64 is currently implemented only for the CUDA backend.
+template <> struct get_device_info<bool, info::device::atomic64> {
+  static bool get(RT::PiDevice dev, const plugin &Plugin) {
+
+    bool result = false;
+
+    RT::PiResult Err = Plugin.call_nocheck<PiApiKind::piDeviceGetInfo>(
+        dev, pi::cast<RT::PiDeviceInfo>(info::device::atomic64),
+        sizeof(result), &result, nullptr);
+    if (Err != PI_SUCCESS) {
+      return false;
+    }
+    return result;
+  }
+};
+
 // Specialization for exec_capabilities, OpenCL returns a bitfield
 template <>
 struct get_device_info<vector_class<info::execution_capability>,
@@ -613,6 +630,10 @@ template <> inline bool get_device_info_host<info::device::image_support>() {
   return true;
 }
 
+template <> inline bool get_device_info_host<info::device::atomic64>() {
+  return false;
+}
+
 template <>
 inline cl_uint get_device_info_host<info::device::max_read_image_args>() {
   // current value is the required minimum
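A note on the call_nocheck pattern in the specialization above: since only the CUDA plugin implements PI_DEVICE_INFO_ATOMIC_64 so far, the query has to tolerate PI_INVALID_VALUE from other backends and map any failure to "aspect absent" rather than an error. The idiom, reduced to a standalone sketch (Status and query_with_default are illustrative names, not SYCL API):

    #include <functional>

    enum class Status { Success, InvalidValue };

    // Ask the backend; an unimplemented query means "feature not supported",
    // not a hard error.
    bool query_with_default(const std::function<Status(bool &)> &QueryBackend) {
      bool Result = false;
      if (QueryBackend(Result) != Status::Success)
        return false;
      return Result;
    }

    int main() {
      // A backend that implements the query and reports support:
      bool A = query_with_default([](bool &Out) {
        Out = true;
        return Status::Success;
      });
      // A backend that rejects the query, like the OpenCL plugin above:
      bool B = query_with_default([](bool &) { return Status::InvalidValue; });
      return (A && !B) ? 0 : 1; // exit code 0 when the idiom behaves as expected
    }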
diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump
index 2016f1ae315c0..0771bed3c0010 100644
--- a/sycl/test/abi/sycl_symbols_linux.dump
+++ b/sycl/test/abi/sycl_symbols_linux.dump
@@ -4127,6 +4127,7 @@ _ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65571EEENS3_12param_traitsIS4_XT_
 _ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65572EEENS3_12param_traitsIS4_XT_EE11return_typeEv
 _ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65573EEENS3_12param_traitsIS4_XT_EE11return_typeEv
 _ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65574EEENS3_12param_traitsIS4_XT_EE11return_typeEv
+_ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65808EEENS3_12param_traitsIS4_XT_EE11return_typeEv
 _ZNK2cl4sycl6device9getNativeEv
 _ZNK2cl4sycl6kernel11get_contextEv
 _ZNK2cl4sycl6kernel11get_programEv
diff --git a/sycl/test/on-device/basic_tests/aspects.cpp b/sycl/test/on-device/basic_tests/aspects.cpp
index 521247708a774..b9083ec4a0158 100644
--- a/sycl/test/on-device/basic_tests/aspects.cpp
+++ b/sycl/test/on-device/basic_tests/aspects.cpp
@@ -57,6 +57,9 @@ int main() {
   if (plt.has(aspect::int64_extended_atomics)) {
     std::cout << "  extended atomic operations" << std::endl;
   }
+  if (plt.has(aspect::atomic64)) {
+    std::cout << "  atomic64" << std::endl;
+  }
   if (plt.has(aspect::image)) {
     std::cout << "  images" << std::endl;
   }
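A closing remark on SYCL_REDUCTION_DETERMINISTIC, which disables the trait introduced in this patch: an atomics-based reduction combines work-group partial sums in whatever order the hardware schedules them, and floating-point addition is not associative, so the same program can produce slightly different sums from run to run. The underlying effect in isolation, as a host-only demo:

    #include <cstdio>

    int main() {
      float a = 1e8f, b = -1e8f, c = 1.0f;
      float left = (a + b) + c;  // (0) + 1 == 1
      float right = a + (b + c); // b + c rounds back to -1e8, so the sum is 0
      std::printf("left=%g right=%g\n", left, right); // prints: left=1 right=0
      return 0;
    }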