diff --git a/sycl/include/CL/sycl.hpp b/sycl/include/CL/sycl.hpp index 27b3d3d6c137f..83beb030b9a49 100644 --- a/sycl/include/CL/sycl.hpp +++ b/sycl/include/CL/sycl.hpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include diff --git a/sycl/include/CL/sycl/intel/functional.hpp b/sycl/include/CL/sycl/intel/functional.hpp index 842adbf96b3be..f54e3777d3375 100644 --- a/sycl/include/CL/sycl/intel/functional.hpp +++ b/sycl/include/CL/sycl/intel/functional.hpp @@ -53,6 +53,9 @@ template <> struct maximum { #endif template using plus = std::plus; +template using bit_or = std::bit_or; +template using bit_xor = std::bit_xor; +template using bit_and = std::bit_and; } // namespace intel diff --git a/sycl/include/CL/sycl/intel/group_algorithm.hpp b/sycl/include/CL/sycl/intel/group_algorithm.hpp index 86304709c7e23..5a11d23812cd7 100644 --- a/sycl/include/CL/sycl/intel/group_algorithm.hpp +++ b/sycl/include/CL/sycl/intel/group_algorithm.hpp @@ -108,8 +108,8 @@ template struct identity> { template Function for_each(Group g, Ptr first, Ptr last, Function f) { #ifdef __SYCL_DEVICE_ONLY__ - ptrdiff_t offset = detail::get_local_linear_id(g); - ptrdiff_t stride = detail::get_local_linear_range(g); + ptrdiff_t offset = sycl::detail::get_local_linear_id(g); + ptrdiff_t stride = sycl::detail::get_local_linear_range(g); for (Ptr p = first + offset; p < last; p += stride) { f(*p); } @@ -137,11 +137,11 @@ using EnableIfIsPointer = cl::sycl::detail::enable_if_t::value, T>; template bool all_of(Group g, bool pred) { - static_assert(detail::is_generic_group::value, + static_assert(sycl::detail::is_generic_group::value, "Group algorithms only support the sycl::group and " "intel::sub_group class."); #ifdef __SYCL_DEVICE_ONLY__ - return detail::spirv::GroupAll(pred); + return sycl::detail::spirv::GroupAll(pred); #else throw runtime_error("Group algorithms are not supported on host device.", PI_INVALID_DEVICE); @@ -150,7 +150,7 @@ template bool all_of(Group g, bool pred) { template bool all_of(Group g, T x, Predicate pred) { - static_assert(detail::is_generic_group::value, + static_assert(sycl::detail::is_generic_group::value, "Group algorithms only support the sycl::group and " "intel::sub_group class."); return all_of(g, pred(x)); @@ -159,14 +159,14 @@ bool all_of(Group g, T x, Predicate pred) { template EnableIfIsPointer all_of(Group g, Ptr first, Ptr last, Predicate pred) { - static_assert(detail::is_generic_group::value, + static_assert(sycl::detail::is_generic_group::value, "Group algorithms only support the sycl::group and " "intel::sub_group class."); #ifdef __SYCL_DEVICE_ONLY__ bool partial = true; - detail::for_each(g, first, last, [&](const typename Ptr::element_type &x) { - partial &= pred(x); - }); + sycl::detail::for_each( + g, first, last, + [&](const typename Ptr::element_type &x) { partial &= pred(x); }); return all_of(g, partial); #else throw runtime_error("Group algorithms are not supported on host device.", @@ -175,11 +175,11 @@ EnableIfIsPointer all_of(Group g, Ptr first, Ptr last, } template bool any_of(Group g, bool pred) { - static_assert(detail::is_generic_group::value, + static_assert(sycl::detail::is_generic_group::value, "Group algorithms only support the sycl::group and " "intel::sub_group class."); #ifdef __SYCL_DEVICE_ONLY__ - return detail::spirv::GroupAny(pred); + return sycl::detail::spirv::GroupAny(pred); #else throw runtime_error("Group algorithms are not supported on host device.", PI_INVALID_DEVICE); @@ -188,7 +188,7 @@ template bool any_of(Group g, bool pred) { template bool any_of(Group g, T x, Predicate pred) { - static_assert(detail::is_generic_group::value, + static_assert(sycl::detail::is_generic_group::value, "Group algorithms only support the sycl::group and " "intel::sub_group class."); return any_of(g, pred(x)); @@ -198,13 +198,13 @@ template EnableIfIsPointer any_of(Group g, Ptr first, Ptr last, Predicate pred) { #ifdef __SYCL_DEVICE_ONLY__ - static_assert(detail::is_generic_group::value, + static_assert(sycl::detail::is_generic_group::value, "Group algorithms only support the sycl::group and " "intel::sub_group class."); bool partial = false; - detail::for_each(g, first, last, [&](const typename Ptr::element_type &x) { - partial |= pred(x); - }); + sycl::detail::for_each( + g, first, last, + [&](const typename Ptr::element_type &x) { partial |= pred(x); }); return any_of(g, partial); #else throw runtime_error("Group algorithms are not supported on host device.", @@ -213,11 +213,11 @@ EnableIfIsPointer any_of(Group g, Ptr first, Ptr last, } template bool none_of(Group g, bool pred) { - static_assert(detail::is_generic_group::value, + static_assert(sycl::detail::is_generic_group::value, "Group algorithms only support the sycl::group and " "intel::sub_group class."); #ifdef __SYCL_DEVICE_ONLY__ - return detail::spirv::GroupAll(!pred); + return sycl::detail::spirv::GroupAll(!pred); #else throw runtime_error("Group algorithms are not supported on host device.", PI_INVALID_DEVICE); @@ -226,7 +226,7 @@ template bool none_of(Group g, bool pred) { template bool none_of(Group g, T x, Predicate pred) { - static_assert(detail::is_generic_group::value, + static_assert(sycl::detail::is_generic_group::value, "Group algorithms only support the sycl::group and " "intel::sub_group class."); return none_of(g, pred(x)); @@ -236,7 +236,7 @@ template EnableIfIsPointer none_of(Group g, Ptr first, Ptr last, Predicate pred) { #ifdef __SYCL_DEVICE_ONLY__ - static_assert(detail::is_generic_group::value, + static_assert(sycl::detail::is_generic_group::value, "Group algorithms only support the sycl::group and " "intel::sub_group class."); return !any_of(g, first, last, pred); @@ -249,11 +249,11 @@ EnableIfIsPointer none_of(Group g, Ptr first, Ptr last, template EnableIfIsScalarArithmetic broadcast(Group g, T x, typename Group::id_type local_id) { - static_assert(detail::is_generic_group::value, + static_assert(sycl::detail::is_generic_group::value, "Group algorithms only support the sycl::group and " "intel::sub_group class."); #ifdef __SYCL_DEVICE_ONLY__ - return detail::spirv::GroupBroadcast(x, local_id); + return sycl::detail::spirv::GroupBroadcast(x, local_id); #else throw runtime_error("Group algorithms are not supported on host device.", PI_INVALID_DEVICE); @@ -263,7 +263,7 @@ EnableIfIsScalarArithmetic broadcast(Group g, T x, template EnableIfIsVectorArithmetic broadcast(Group g, T x, typename Group::id_type local_id) { - static_assert(detail::is_generic_group::value, + static_assert(sycl::detail::is_generic_group::value, "Group algorithms only support the sycl::group and " "intel::sub_group class."); #ifdef __SYCL_DEVICE_ONLY__ @@ -281,12 +281,13 @@ EnableIfIsVectorArithmetic broadcast(Group g, T x, template EnableIfIsScalarArithmetic broadcast(Group g, T x, typename Group::linear_id_type linear_local_id) { - static_assert(detail::is_generic_group::value, + static_assert(sycl::detail::is_generic_group::value, "Group algorithms only support the sycl::group and " "intel::sub_group class."); #ifdef __SYCL_DEVICE_ONLY__ return broadcast( - g, x, detail::linear_id_to_id(g.get_local_range(), linear_local_id)); + g, x, + sycl::detail::linear_id_to_id(g.get_local_range(), linear_local_id)); #else throw runtime_error("Group algorithms are not supported on host device.", PI_INVALID_DEVICE); @@ -296,7 +297,7 @@ broadcast(Group g, T x, typename Group::linear_id_type linear_local_id) { template EnableIfIsVectorArithmetic broadcast(Group g, T x, typename Group::linear_id_type linear_local_id) { - static_assert(detail::is_generic_group::value, + static_assert(sycl::detail::is_generic_group::value, "Group algorithms only support the sycl::group and " "intel::sub_group class."); #ifdef __SYCL_DEVICE_ONLY__ @@ -313,7 +314,7 @@ broadcast(Group g, T x, typename Group::linear_id_type linear_local_id) { template EnableIfIsScalarArithmetic broadcast(Group g, T x) { - static_assert(detail::is_generic_group::value, + static_assert(sycl::detail::is_generic_group::value, "Group algorithms only support the sycl::group and " "intel::sub_group class."); #ifdef __SYCL_DEVICE_ONLY__ @@ -326,7 +327,7 @@ EnableIfIsScalarArithmetic broadcast(Group g, T x) { template EnableIfIsVectorArithmetic broadcast(Group g, T x) { - static_assert(detail::is_generic_group::value, + static_assert(sycl::detail::is_generic_group::value, "Group algorithms only support the sycl::group and " "intel::sub_group class."); #ifdef __SYCL_DEVICE_ONLY__ @@ -343,7 +344,7 @@ EnableIfIsVectorArithmetic broadcast(Group g, T x) { template EnableIfIsScalarArithmetic reduce(Group g, T x, BinaryOperation binary_op) { - static_assert(detail::is_generic_group::value, + static_assert(sycl::detail::is_generic_group::value, "Group algorithms only support the sycl::group and " "intel::sub_group class."); // FIXME: Do not special-case for half precision @@ -353,9 +354,9 @@ EnableIfIsScalarArithmetic reduce(Group g, T x, BinaryOperation binary_op) { std::is_same::value), "Result type of binary_op must match reduction accumulation type."); #ifdef __SYCL_DEVICE_ONLY__ - return detail::calc::value>( - typename detail::GroupOpTag::type(), x, binary_op); + return sycl::detail::calc::value>( + typename sycl::detail::GroupOpTag::type(), x, binary_op); #else throw runtime_error("Group algorithms are not supported on host device.", PI_INVALID_DEVICE); @@ -364,7 +365,7 @@ EnableIfIsScalarArithmetic reduce(Group g, T x, BinaryOperation binary_op) { template EnableIfIsVectorArithmetic reduce(Group g, T x, BinaryOperation binary_op) { - static_assert(detail::is_generic_group::value, + static_assert(sycl::detail::is_generic_group::value, "Group algorithms only support the sycl::group and " "intel::sub_group class."); // FIXME: Do not special-case for half precision @@ -384,7 +385,7 @@ EnableIfIsVectorArithmetic reduce(Group g, T x, BinaryOperation binary_op) { template EnableIfIsScalarArithmetic reduce(Group g, V x, T init, BinaryOperation binary_op) { - static_assert(detail::is_generic_group::value, + static_assert(sycl::detail::is_generic_group::value, "Group algorithms only support the sycl::group and " "intel::sub_group class."); // FIXME: Do not special-case for half precision @@ -404,7 +405,7 @@ EnableIfIsScalarArithmetic reduce(Group g, V x, T init, template EnableIfIsVectorArithmetic reduce(Group g, V x, T init, BinaryOperation binary_op) { - static_assert(detail::is_generic_group::value, + static_assert(sycl::detail::is_generic_group::value, "Group algorithms only support the sycl::group and " "intel::sub_group class."); // FIXME: Do not special-case for half precision @@ -429,7 +430,7 @@ EnableIfIsVectorArithmetic reduce(Group g, V x, T init, template EnableIfIsPointer reduce(Group g, Ptr first, Ptr last, BinaryOperation binary_op) { - static_assert(detail::is_generic_group::value, + static_assert(sycl::detail::is_generic_group::value, "Group algorithms only support the sycl::group and " "intel::sub_group class."); // FIXME: Do not special-case for half precision @@ -441,10 +442,12 @@ reduce(Group g, Ptr first, Ptr last, BinaryOperation binary_op) { "Result type of binary_op must match reduction accumulation type."); #ifdef __SYCL_DEVICE_ONLY__ typename Ptr::element_type partial = - detail::identity::value; - detail::for_each(g, first, last, [&](const typename Ptr::element_type &x) { - partial = binary_op(partial, x); - }); + sycl::detail::identity::value; + sycl::detail::for_each(g, first, last, + [&](const typename Ptr::element_type &x) { + partial = binary_op(partial, x); + }); return reduce(g, partial, binary_op); #else throw runtime_error("Group algorithms are not supported on host device.", @@ -455,7 +458,7 @@ reduce(Group g, Ptr first, Ptr last, BinaryOperation binary_op) { template EnableIfIsPointer reduce(Group g, Ptr first, Ptr last, T init, BinaryOperation binary_op) { - static_assert(detail::is_generic_group::value, + static_assert(sycl::detail::is_generic_group::value, "Group algorithms only support the sycl::group and " "intel::sub_group class."); // FIXME: Do not special-case for half precision @@ -465,11 +468,12 @@ EnableIfIsPointer reduce(Group g, Ptr first, Ptr last, T init, std::is_same::value), "Result type of binary_op must match reduction accumulation type."); #ifdef __SYCL_DEVICE_ONLY__ - T partial = - detail::identity::value; - detail::for_each(g, first, last, [&](const typename Ptr::element_type &x) { - partial = binary_op(partial, x); - }); + T partial = sycl::detail::identity::value; + sycl::detail::for_each(g, first, last, + [&](const typename Ptr::element_type &x) { + partial = binary_op(partial, x); + }); return reduce(g, partial, init, binary_op); #else throw runtime_error("Group algorithms are not supported on host device.", @@ -480,7 +484,7 @@ EnableIfIsPointer reduce(Group g, Ptr first, Ptr last, T init, template EnableIfIsScalarArithmetic exclusive_scan(Group g, T x, BinaryOperation binary_op) { - static_assert(detail::is_generic_group::value, + static_assert(sycl::detail::is_generic_group::value, "Group algorithms only support the sycl::group and " "intel::sub_group class."); // FIXME: Do not special-case for half precision @@ -489,9 +493,9 @@ EnableIfIsScalarArithmetic exclusive_scan(Group g, T x, std::is_same::value), "Result type of binary_op must match scan accumulation type."); #ifdef __SYCL_DEVICE_ONLY__ - return detail::calc::value>( - typename detail::GroupOpTag::type(), x, binary_op); + return sycl::detail::calc::value>( + typename sycl::detail::GroupOpTag::type(), x, binary_op); #else throw runtime_error("Group algorithms are not supported on host device.", PI_INVALID_DEVICE); @@ -501,7 +505,7 @@ EnableIfIsScalarArithmetic exclusive_scan(Group g, T x, template EnableIfIsVectorArithmetic exclusive_scan(Group g, T x, BinaryOperation binary_op) { - static_assert(detail::is_generic_group::value, + static_assert(sycl::detail::is_generic_group::value, "Group algorithms only support the sycl::group and " "intel::sub_group class."); // FIXME: Do not special-case for half precision @@ -521,7 +525,7 @@ EnableIfIsVectorArithmetic exclusive_scan(Group g, T x, template EnableIfIsVectorArithmetic exclusive_scan(Group g, V x, T init, BinaryOperation binary_op) { - static_assert(detail::is_generic_group::value, + static_assert(sycl::detail::is_generic_group::value, "Group algorithms only support the sycl::group and " "intel::sub_group class."); // FIXME: Do not special-case for half precision @@ -541,7 +545,7 @@ EnableIfIsVectorArithmetic exclusive_scan(Group g, V x, T init, template EnableIfIsScalarArithmetic exclusive_scan(Group g, V x, T init, BinaryOperation binary_op) { - static_assert(detail::is_generic_group::value, + static_assert(sycl::detail::is_generic_group::value, "Group algorithms only support the sycl::group and " "intel::sub_group class."); // FIXME: Do not special-case for half precision @@ -551,7 +555,7 @@ EnableIfIsScalarArithmetic exclusive_scan(Group g, V x, T init, "Result type of binary_op must match scan accumulation type."); #ifdef __SYCL_DEVICE_ONLY__ typename Group::linear_id_type local_linear_id = - detail::get_local_linear_id(g); + sycl::detail::get_local_linear_id(g); if (local_linear_id == 0) { x = binary_op(init, x); } @@ -571,7 +575,7 @@ template exclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, T init, BinaryOperation binary_op) { - static_assert(detail::is_generic_group::value, + static_assert(sycl::detail::is_generic_group::value, "Group algorithms only support the sycl::group and " "intel::sub_group class."); // FIXME: Do not special-case for half precision @@ -581,8 +585,8 @@ exclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, T init, std::is_same::value), "Result type of binary_op must match scan accumulation type."); #ifdef __SYCL_DEVICE_ONLY__ - ptrdiff_t offset = detail::get_local_linear_id(g); - ptrdiff_t stride = detail::get_local_linear_range(g); + ptrdiff_t offset = sycl::detail::get_local_linear_id(g); + ptrdiff_t stride = sycl::detail::get_local_linear_range(g); ptrdiff_t N = last - first; auto roundup = [=](const ptrdiff_t &v, const ptrdiff_t &divisor) -> ptrdiff_t { @@ -620,16 +624,16 @@ EnableIfIsPointer exclusive_scan(Group g, InPtr first, (std::is_same::value && std::is_same::value), "Result type of binary_op must match scan accumulation type."); - return exclusive_scan( - g, first, last, result, - detail::identity::value, - binary_op); + return exclusive_scan(g, first, last, result, + sycl::detail::identity::value, + binary_op); } template EnableIfIsVectorArithmetic inclusive_scan(Group g, T x, BinaryOperation binary_op) { - static_assert(detail::is_generic_group::value, + static_assert(sycl::detail::is_generic_group::value, "Group algorithms only support the sycl::group and " "intel::sub_group class."); // FIXME: Do not special-case for half precision @@ -649,7 +653,7 @@ EnableIfIsVectorArithmetic inclusive_scan(Group g, T x, template EnableIfIsScalarArithmetic inclusive_scan(Group g, T x, BinaryOperation binary_op) { - static_assert(detail::is_generic_group::value, + static_assert(sycl::detail::is_generic_group::value, "Group algorithms only support the sycl::group and " "intel::sub_group class."); // FIXME: Do not special-case for half precision @@ -658,9 +662,9 @@ EnableIfIsScalarArithmetic inclusive_scan(Group g, T x, std::is_same::value), "Result type of binary_op must match scan accumulation type."); #ifdef __SYCL_DEVICE_ONLY__ - return detail::calc::value>( - typename detail::GroupOpTag::type(), x, binary_op); + return sycl::detail::calc::value>( + typename sycl::detail::GroupOpTag::type(), x, binary_op); #else throw runtime_error("Group algorithms are not supported on host device.", PI_INVALID_DEVICE); @@ -670,7 +674,7 @@ EnableIfIsScalarArithmetic inclusive_scan(Group g, T x, template EnableIfIsScalarArithmetic inclusive_scan(Group g, V x, BinaryOperation binary_op, T init) { - static_assert(detail::is_generic_group::value, + static_assert(sycl::detail::is_generic_group::value, "Group algorithms only support the sycl::group and " "intel::sub_group class."); // FIXME: Do not special-case for half precision @@ -679,7 +683,7 @@ inclusive_scan(Group g, V x, BinaryOperation binary_op, T init) { std::is_same::value), "Result type of binary_op must match scan accumulation type."); #ifdef __SYCL_DEVICE_ONLY__ - if (detail::get_local_linear_id(g) == 0) { + if (sycl::detail::get_local_linear_id(g) == 0) { x = binary_op(init, x); } return inclusive_scan(g, x, binary_op); @@ -692,7 +696,7 @@ inclusive_scan(Group g, V x, BinaryOperation binary_op, T init) { template EnableIfIsVectorArithmetic inclusive_scan(Group g, V x, BinaryOperation binary_op, T init) { - static_assert(detail::is_generic_group::value, + static_assert(sycl::detail::is_generic_group::value, "Group algorithms only support the sycl::group and " "intel::sub_group class."); // FIXME: Do not special-case for half precision @@ -713,7 +717,7 @@ template inclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, BinaryOperation binary_op, T init) { - static_assert(detail::is_generic_group::value, + static_assert(sycl::detail::is_generic_group::value, "Group algorithms only support the sycl::group and " "intel::sub_group class."); // FIXME: Do not special-case for half precision @@ -723,8 +727,8 @@ inclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, std::is_same::value), "Result type of binary_op must match scan accumulation type."); #ifdef __SYCL_DEVICE_ONLY__ - ptrdiff_t offset = detail::get_local_linear_id(g); - ptrdiff_t stride = detail::get_local_linear_range(g); + ptrdiff_t offset = sycl::detail::get_local_linear_id(g); + ptrdiff_t stride = sycl::detail::get_local_linear_range(g); ptrdiff_t N = last - first; auto roundup = [=](const ptrdiff_t &v, const ptrdiff_t &divisor) -> ptrdiff_t { @@ -762,17 +766,18 @@ EnableIfIsPointer inclusive_scan(Group g, InPtr first, (std::is_same::value && std::is_same::value), "Result type of binary_op must match scan accumulation type."); - return inclusive_scan( - g, first, last, result, binary_op, - detail::identity::value); + return inclusive_scan(g, first, last, result, binary_op, + sycl::detail::identity::value); } template bool leader(Group g) { - static_assert(detail::is_generic_group::value, + static_assert(sycl::detail::is_generic_group::value, "Group algorithms only support the sycl::group and " "intel::sub_group class."); #ifdef __SYCL_DEVICE_ONLY__ - typename Group::linear_id_type linear_id = detail::get_local_linear_id(g); + typename Group::linear_id_type linear_id = + sycl::detail::get_local_linear_id(g); return (linear_id == 0); #else throw runtime_error("Group algorithms are not supported on host device.", diff --git a/sycl/include/CL/sycl/intel/reduction.hpp b/sycl/include/CL/sycl/intel/reduction.hpp new file mode 100644 index 0000000000000..2b0fb264bc94f --- /dev/null +++ b/sycl/include/CL/sycl/intel/reduction.hpp @@ -0,0 +1,388 @@ +//==---------------- reduction.hpp - SYCL reduction ------------*- C++ -*---==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// ===--------------------------------------------------------------------=== // + +#pragma once + +#include + +__SYCL_INLINE_NAMESPACE(cl) { +namespace sycl { +namespace intel { + +namespace detail { + +using cl::sycl::detail::bool_constant; +using cl::sycl::detail::enable_if_t; +using cl::sycl::detail::is_geninteger16bit; +using cl::sycl::detail::is_geninteger32bit; +using cl::sycl::detail::is_geninteger64bit; +using cl::sycl::detail::is_geninteger8bit; +using cl::sycl::detail::remove_AS; + +// Identity = 0 +template +using IsZeroIdentityOp = bool_constant< + ((is_geninteger8bit::value || is_geninteger16bit::value || + is_geninteger32bit::value || is_geninteger64bit::value) && + (std::is_same>::value || + std::is_same>::value || + std::is_same>::value)) || + ((std::is_same::value || std::is_same::value) && + std::is_same>::value)>; + +// Identity = 1 +template +using IsOneIdentityOp = bool_constant< + (is_geninteger8bit::value || is_geninteger16bit::value || + is_geninteger32bit::value || is_geninteger64bit::value || + std::is_same::value || std::is_same::value) && + std::is_same>::value>; + +// Identity = ~0 +template +using IsOnesIdentityOp = bool_constant< + (is_geninteger8bit::value || is_geninteger16bit::value || + is_geninteger32bit::value || is_geninteger64bit::value) && + std::is_same>::value>; + +// Identity = +template +using IsMinimumIdentityOp = bool_constant< + (is_geninteger8bit::value || is_geninteger16bit::value || + is_geninteger32bit::value || is_geninteger64bit::value || + std::is_same::value || std::is_same::value) && + std::is_same>::value>; + +// Identity = +template +using IsMaximumIdentityOp = bool_constant< + (is_geninteger8bit::value || is_geninteger16bit::value || + is_geninteger32bit::value || is_geninteger64bit::value || + std::is_same::value || std::is_same::value) && + std::is_same>::value>; + +template +using IsKnownIdentityOp = + bool_constant::value || + IsOneIdentityOp::value || + IsOnesIdentityOp::value || + IsMinimumIdentityOp::value || + IsMaximumIdentityOp::value>; + +/// Class that is used to represent objects that are passed to user's lambda +/// functions and representing users' reduction variable. +/// The generic version of the class represents those reductions of those +/// types and operations for which the identity value is not known. +template +class reducer { +public: + reducer(const T &Identity) : MValue(Identity), MIdentity(Identity) {} + void combine(const T &Partial) { + BinaryOperation BOp; + MValue = BOp(MValue, Partial); + } + + T getIdentity() const { return MIdentity; } + + T MValue; + +private: + const T MIdentity; +}; + +/// Specialization of the generic class 'reducer'. It is used for reductions +/// of those types and operations for which the identity value is known. +/// +/// It allows to reduce the size of the 'reducer' object by not holding +/// the identity field inside it and allows to add a default constructor. +/// +/// Also, for many types with known identity the operation 'atomic_combine()' +/// is implemented here, which allows to use more efficient version of kernels +/// using those operations, which are based on functionality provided by +/// sycl::atomic class. +/// +/// For example, it is known that 0 is identity for intel::plus operations +/// accepting native scalar types to which scalar 0 is convertible. +/// Also, for int32/64 types the atomic_combine() is lowered to +/// sycl::atomic::fetch_add(). +// +// TODO: More types and ops can be added to here later. +template +class reducer::value>> { +public: + reducer() : MValue(getIdentity()) {} + reducer(const T &Identity) : MValue(getIdentity()) {} + + void combine(const T &Partial) { + BinaryOperation BOp; + MValue = BOp(MValue, Partial); + } + + /// Returns zero as identity for ADD, OR, XOR operations. + template + static enable_if_t::value, _T> + getIdentity() { + return 0; + } + + /// Returns one as identify for MULTIPLY operations. + template + static enable_if_t::value, _T> + getIdentity() { + return 1; + } + + /// Returns bit image consisting of all ones as identity for AND operations. + template + static enable_if_t::value, _T> + getIdentity() { + return ~static_cast<_T>(0); + } + + /// Returns maximal possible value as identity for MIN operations. + template + static enable_if_t::value, _T> + getIdentity() { + return (std::numeric_limits<_T>::max)(); + } + + /// Returns minimal possible value as identity for MAX operations. + template + static enable_if_t::value, _T> + getIdentity() { + return (std::numeric_limits<_T>::min)(); + } + + template + enable_if_t::value && + std::is_same>::value, + reducer &> + operator+=(const _T &Partial) { + combine(Partial); + return *this; + } + + template + enable_if_t::value && + std::is_same>::value, + reducer &> + operator*=(const _T &Partial) { + combine(Partial); + return *this; + } + + template + enable_if_t::value && + std::is_same>::value, + reducer &> + operator|=(const _T &Partial) { + combine(Partial); + return *this; + } + + template + enable_if_t::value && + std::is_same>::value, + reducer &> + operator^=(const _T &Partial) { + combine(Partial); + return *this; + } + + template + enable_if_t::value && + std::is_same>::value, + reducer &> + operator&=(const _T &Partial) { + combine(Partial); + return *this; + } + + /// Atomic ADD operation: *ReduVarPtr += MValue; + template + enable_if_t::type, T>::value && + (is_geninteger32bit::value || is_geninteger64bit::value) && + std::is_same<_BinaryOperation, intel::plus>::value> + atomic_combine(_T *ReduVarPtr) const { + atomic(global_ptr(ReduVarPtr)) + .fetch_add(MValue); + } + + /// Atomic BITWISE OR operation: *ReduVarPtr |= MValue; + template + enable_if_t::type, T>::value && + (is_geninteger32bit::value || is_geninteger64bit::value) && + std::is_same<_BinaryOperation, intel::bit_or>::value> + atomic_combine(_T *ReduVarPtr) const { + atomic(global_ptr(ReduVarPtr)) + .fetch_or(MValue); + } + + /// Atomic BITWISE XOR operation: *ReduVarPtr ^= MValue; + template + enable_if_t::type, T>::value && + (is_geninteger32bit::value || is_geninteger64bit::value) && + std::is_same<_BinaryOperation, intel::bit_xor>::value> + atomic_combine(_T *ReduVarPtr) const { + atomic(global_ptr(ReduVarPtr)) + .fetch_xor(MValue); + } + + /// Atomic BITWISE AND operation: *ReduVarPtr &= MValue; + template + enable_if_t::type, T>::value && + (is_geninteger32bit::value || is_geninteger64bit::value) && + std::is_same<_BinaryOperation, intel::bit_and>::value> + atomic_combine(_T *ReduVarPtr) const { + atomic(global_ptr(ReduVarPtr)) + .fetch_and(MValue); + } + + /// Atomic MIN operation: *ReduVarPtr = intel::minimum(*ReduVarPtr, MValue); + template + enable_if_t::type, T>::value && + (is_geninteger32bit::value || is_geninteger64bit::value) && + std::is_same<_BinaryOperation, intel::minimum>::value> + atomic_combine(_T *ReduVarPtr) const { + atomic(global_ptr(ReduVarPtr)) + .fetch_min(MValue); + } + + /// Atomic MAX operation: *ReduVarPtr = intel::maximum(*ReduVarPtr, MValue); + template + enable_if_t::type, T>::value && + (is_geninteger32bit::value || is_geninteger64bit::value) && + std::is_same<_BinaryOperation, intel::maximum>::value> + atomic_combine(_T *ReduVarPtr) const { + atomic(global_ptr(ReduVarPtr)) + .fetch_max(MValue); + } + + T MValue; +}; + +/// This class encapsulates the reduction variable/accessor, +/// the reduction operator and an optional operator identity. +template +class reduction_impl { +public: + using reducer_type = reducer; + using result_type = T; + using binary_operation = BinaryOperation; + using accessor_type = + accessor; + static constexpr access::mode accessor_mode = AccMode; + static constexpr int accessor_dim = Dims; + static constexpr int buffer_dim = (Dims == 0) ? 1 : Dims; + + // Only scalar (i.e. 0-dim and 1-dim with 1 element) reductions supported now. + // TODO: suport (Dims > 1) and placeholder accessors/reductions. + // TODO: support true 1-Dimensional accessors/reductions (get_count() > 1). + // (get_count() == 1) is checked in the constructor of reduction_impl. + static_assert(Dims <= 1 && IsPlaceholder == access::placeholder::false_t, + "Multi-dimensional and placeholder reductions" + " are not supported yet."); + + /// Returns the statically known identity value. + template + enable_if_t::value, + _T> constexpr getIdentity() { + return reducer_type::getIdentity(); + } + + /// Returns the identity value given by user. + template + enable_if_t::value, _T> + getIdentity() { + return MIdentity; + } + + /// Constructs reduction_impl when the identity value is statically known. + template < + typename _T = T, class _BinaryOperation = BinaryOperation, + enable_if_t::value> * = nullptr> + reduction_impl(accessor_type &Acc) : MAcc(Acc), MIdentity(getIdentity()) { + assert(Acc.get_count() == 1 && + "Only scalar/1-element reductions are supported now."); + } + + /// Constructs reduction_impl when the identity value is statically known, + /// and user still passed the identity value. + template < + typename _T = T, class _BinaryOperation = BinaryOperation, + enable_if_t::value> * = nullptr> + reduction_impl(accessor_type &Acc, const T &Identity) + : MAcc(Acc), MIdentity(Identity) { + assert(Acc.get_count() == 1 && + "Only scalar/1-element reductions are supported now."); + // For operations with known identity value the operator == is defined. + // It is sort of dilemma here: from one point of view - user may set + // such identity that would be enough for his data, i.e. identity=100 for + // min operation if user knows all data elements are less than 100. + // From another point of view - it is the source of unexpected errors, + // when the input data changes. + // Let's be strict for now and emit an error if identity is not proper. + assert(Identity == getIdentity() && "Unexpected Identity parameter value."); + } + + /// Constructs reduction_impl when the identity value is unknown. + template < + typename _T = T, class _BinaryOperation = BinaryOperation, + enable_if_t::value> * = nullptr> + reduction_impl(accessor_type &Acc, const T &Identity) + : MAcc(Acc), MIdentity(Identity) { + assert(Acc.get_count() == 1 && + "Only scalar/1-element reductions are supported now."); + } + + /// User's accessor to where the reduction must be written. + accessor_type MAcc; + +private: + /// Identity of the BinaryOperation. + /// The result of BinaryOperation(X, MIdentity) is equal to X for any X. + const T MIdentity; +}; + +} // namespace detail + +/// Creates and returns an object implementing the reduction functionality. +/// Accepts 3 arguments: the accessor to buffer to where the computed reduction +/// must be stored \param Acc, identity value \param Identity, and the +/// binary operation that must be used in the reduction \param Combiner. +template +detail::reduction_impl +reduction(accessor &Acc, + const T &Identity, BinaryOperation Combiner) { + // The Combiner argument was needed only to define the BinaryOperation param. + return detail::reduction_impl( + Acc, Identity); +} + +/// Creates and returns an object implementing the reduction functionality. +/// Accepts 2 arguments: the accessor to buffer to where the computed reduction +/// must be stored \param Acc and the binary operation that must be used +/// in the reduction \param Combiner. +/// The identity value is not passed to this version as it is statically known. +template +detail::enable_if_t< + detail::IsKnownIdentityOp::value, + detail::reduction_impl> +reduction(accessor &Acc, + BinaryOperation Combiner) { + // The Combiner argument was needed only to define the BinaryOperation param. + return detail::reduction_impl(Acc); +} + +} // namespace intel +} // namespace sycl +} // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/include/CL/sycl/intel/sub_group.hpp b/sycl/include/CL/sycl/intel/sub_group.hpp index 91d1b748e3a41..648fefe66c85e 100644 --- a/sycl/include/CL/sycl/intel/sub_group.hpp +++ b/sycl/include/CL/sycl/intel/sub_group.hpp @@ -36,7 +36,7 @@ namespace sub_group { #define __SYCL_SG_GENERATE_BODY_1ARG(name, SPIRVOperation) \ template T name(T x, id<1> local_id) { \ - using OCLT = detail::ConvertToOpenCLType_t; \ + using OCLT = sycl::detail::ConvertToOpenCLType_t; \ return __spirv_##SPIRVOperation(OCLT(x), local_id.get(0)); \ } @@ -47,7 +47,7 @@ __SYCL_SG_GENERATE_BODY_1ARG(shuffle_xor, SubgroupShuffleXorINTEL) #define __SYCL_SG_GENERATE_BODY_2ARG(name, SPIRVOperation) \ template T name(T A, T B, uint32_t Delta) { \ - using OCLT = detail::ConvertToOpenCLType_t; \ + using OCLT = sycl::detail::ConvertToOpenCLType_t; \ return __spirv_##SPIRVOperation(OCLT(A), OCLT(B), Delta); \ } @@ -77,7 +77,7 @@ template To bit_cast(const From &from) { return __builtin_bit_cast(To, from); #else To to; - detail::memcpy(&to, &from, sizeof(To)); + sycl::detail::memcpy(&to, &from, sizeof(To)); return to; #endif // __has_builtin(__builtin_bit_cast) #endif // __cpp_lib_bit_cast @@ -86,7 +86,8 @@ template To bit_cast(const From &from) { template T load(const multi_ptr src) { using BlockT = SelectBlockT; - using PtrT = detail::ConvertToOpenCLType_t>; + using PtrT = + sycl::detail::ConvertToOpenCLType_t>; BlockT Ret = __spirv_SubgroupBlockReadINTEL(reinterpret_cast(src.get())); @@ -97,8 +98,9 @@ T load(const multi_ptr src) { template vec load(const multi_ptr src) { using BlockT = SelectBlockT; - using VecT = detail::ConvertToOpenCLType_t>; - using PtrT = detail::ConvertToOpenCLType_t>; + using VecT = sycl::detail::ConvertToOpenCLType_t>; + using PtrT = + sycl::detail::ConvertToOpenCLType_t>; VecT Ret = __spirv_SubgroupBlockReadINTEL(reinterpret_cast(src.get())); @@ -109,7 +111,7 @@ vec load(const multi_ptr src) { template void store(multi_ptr dst, const T &x) { using BlockT = SelectBlockT; - using PtrT = detail::ConvertToOpenCLType_t>; + using PtrT = sycl::detail::ConvertToOpenCLType_t>; __spirv_SubgroupBlockWriteINTEL(reinterpret_cast(dst.get()), bit_cast(x)); @@ -118,8 +120,9 @@ void store(multi_ptr dst, const T &x) { template void store(multi_ptr dst, const vec &x) { using BlockT = SelectBlockT; - using VecT = detail::ConvertToOpenCLType_t>; - using PtrT = detail::ConvertToOpenCLType_t>; + using VecT = sycl::detail::ConvertToOpenCLType_t>; + using PtrT = + sycl::detail::ConvertToOpenCLType_t>; __spirv_SubgroupBlockWriteINTEL(reinterpret_cast(dst.get()), bit_cast(x)); @@ -171,22 +174,23 @@ struct sub_group { template using EnableIfIsScalarArithmetic = - detail::enable_if_t::value, T>; + sycl::detail::enable_if_t::value, + T>; /* --- collectives --- */ template __SYCL_EXPORT_DEPRECATED("Use sycl::intel::broadcast instead.") EnableIfIsScalarArithmetic broadcast(T x, id<1> local_id) const { - return detail::spirv::GroupBroadcast(x, local_id); + return sycl::detail::spirv::GroupBroadcast(x, local_id); } template __SYCL_EXPORT_DEPRECATED("Use sycl::intel::reduce instead.") EnableIfIsScalarArithmetic reduce(T x, BinaryOperation op) const { - return detail::calc( - typename detail::GroupOpTag::type(), x, op); + return sycl::detail::calc( + typename sycl::detail::GroupOpTag::type(), x, op); } template @@ -198,9 +202,9 @@ struct sub_group { template __SYCL_EXPORT_DEPRECATED("Use sycl::intel::exclusive_scan instead.") EnableIfIsScalarArithmetic exclusive_scan(T x, BinaryOperation op) const { - return detail::calc( - typename detail::GroupOpTag::type(), x, op); + return sycl::detail::calc( + typename sycl::detail::GroupOpTag::type(), x, op); } template @@ -220,9 +224,9 @@ struct sub_group { template __SYCL_EXPORT_DEPRECATED("Use sycl::intel::inclusive_scan instead.") EnableIfIsScalarArithmetic inclusive_scan(T x, BinaryOperation op) const { - return detail::calc( - typename detail::GroupOpTag::type(), x, op); + return sycl::detail::calc( + typename sycl::detail::GroupOpTag::type(), x, op); } template @@ -239,90 +243,94 @@ struct sub_group { /* indices in [0 , sub_group size) */ template T shuffle(T x, id<1> local_id) const { - return detail::sub_group::shuffle(x, local_id); + return sycl::detail::sub_group::shuffle(x, local_id); } template T shuffle_down(T x, uint32_t delta) const { - return detail::sub_group::shuffle_down(x, x, delta); + return sycl::detail::sub_group::shuffle_down(x, x, delta); } template T shuffle_up(T x, uint32_t delta) const { - return detail::sub_group::shuffle_up(x, x, delta); + return sycl::detail::sub_group::shuffle_up(x, x, delta); } template T shuffle_xor(T x, id<1> value) const { - return detail::sub_group::shuffle_xor(x, value); + return sycl::detail::sub_group::shuffle_xor(x, value); } /* --- two-input shuffles --- */ /* indices in [0 , 2 * sub_group size) */ template T shuffle(T x, T y, id<1> local_id) const { - return detail::sub_group::shuffle_down(x, y, - (local_id - get_local_id()).get(0)); + return sycl::detail::sub_group::shuffle_down( + x, y, (local_id - get_local_id()).get(0)); } template T shuffle_down(T current, T next, uint32_t delta) const { - return detail::sub_group::shuffle_down(current, next, delta); + return sycl::detail::sub_group::shuffle_down(current, next, delta); } template T shuffle_up(T previous, T current, uint32_t delta) const { - return detail::sub_group::shuffle_up(previous, current, delta); + return sycl::detail::sub_group::shuffle_up(previous, current, delta); } /* --- sub_group load/stores --- */ /* these can map to SIMD or block read/write hardware where available */ template - detail::enable_if_t< - detail::sub_group::AcceptableForLoadStore::value, T> + sycl::detail::enable_if_t< + sycl::detail::sub_group::AcceptableForLoadStore::value, T> load(const multi_ptr src) const { - return detail::sub_group::load(src); + return sycl::detail::sub_group::load(src); } template - detail::enable_if_t< - detail::sub_group::AcceptableForLoadStore::value && N != 1, + sycl::detail::enable_if_t< + sycl::detail::sub_group::AcceptableForLoadStore::value && + N != 1, vec> load(const multi_ptr src) const { - return detail::sub_group::load(src); + return sycl::detail::sub_group::load(src); } template - detail::enable_if_t< - detail::sub_group::AcceptableForLoadStore::value && N == 1, + sycl::detail::enable_if_t< + sycl::detail::sub_group::AcceptableForLoadStore::value && + N == 1, vec> load(const multi_ptr src) const { - return detail::sub_group::load(src); + return sycl::detail::sub_group::load(src); } template - detail::enable_if_t< - detail::sub_group::AcceptableForLoadStore::value> + sycl::detail::enable_if_t< + sycl::detail::sub_group::AcceptableForLoadStore::value> store(multi_ptr dst, const T &x) const { - detail::sub_group::store(dst, x); + sycl::detail::sub_group::store(dst, x); } template - detail::enable_if_t< - detail::sub_group::AcceptableForLoadStore::value && N == 1> + sycl::detail::enable_if_t< + sycl::detail::sub_group::AcceptableForLoadStore::value && + N == 1> store(multi_ptr dst, const vec &x) const { store(dst, x); } template - detail::enable_if_t< - detail::sub_group::AcceptableForLoadStore::value && N != 1> + sycl::detail::enable_if_t< + sycl::detail::sub_group::AcceptableForLoadStore::value && + N != 1> store(multi_ptr dst, const vec &x) const { - detail::sub_group::store(dst, x); + sycl::detail::sub_group::store(dst, x); } /* --- synchronization functions --- */ void barrier(access::fence_space accessSpace = access::fence_space::global_and_local) const { - uint32_t flags = detail::getSPIRVMemorySemanticsMask(accessSpace); + uint32_t flags = sycl::detail::getSPIRVMemorySemanticsMask(accessSpace); __spirv_ControlBarrier(__spv::Scope::Subgroup, __spv::Scope::Subgroup, flags); } diff --git a/sycl/test/reduction/reduction_ctor.cpp b/sycl/test/reduction/reduction_ctor.cpp new file mode 100644 index 0000000000000..9e4fb306a95b9 --- /dev/null +++ b/sycl/test/reduction/reduction_ctor.cpp @@ -0,0 +1,175 @@ +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: env SYCL_DEVICE_TYPE=HOST %t.out +//==----------------reduction_ctor.cpp - SYCL reduction basic test ---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// This performs basic checks such as reduction creation, getIdentity() method, +// and the combine() method of the aux class 'reducer'. + +#include +#include + +using namespace cl::sycl; + +template +struct init_data_t { + void initInputData(T IdentityVal, + buffer &InBuf, + T &ExpectedReduValue) { + ExpectedReduValue = IdentityVal; + BinaryOperation Op; + auto In = InBuf.template get_access(); + for (int I = 0; I < N; ++I) { + In[I] = ((I + 1) % 5) + 1; + ExpectedReduValue = Op(ExpectedReduValue, In[I]); + } + } +}; + +template +struct init_data_t, N> { + void initInputData(T IdentityVal, buffer &InBuf, T &ExpectedReduValue) { + ExpectedReduValue = IdentityVal; + std::multiplies Op; + auto In = InBuf.template get_access(); + for (int I = 0; I < N; ++I) { + In[I] = 1 + (((I % 37) == 0) ? 1 : 0); + ExpectedReduValue = Op(ExpectedReduValue, In[I]); + } + } +}; + +template +void test_reducer(Reduction &Redu, T A, T B) { + typename Reduction::reducer_type Reducer; + Reducer.combine(A); + Reducer.combine(B); + + typename Reduction::binary_operation BOp; + T ExpectedValue = BOp(A, B); + assert(ExpectedValue == Reducer.MValue && + "Wrong result of binary operation."); +} + +template +void test_reducer(Reduction &Redu, T Identity, T A, T B) { + typename Reduction::reducer_type Reducer(Identity); + Reducer.combine(A); + Reducer.combine(B); + + typename Reduction::binary_operation BOp; + T ExpectedValue = BOp(A, B); + assert(ExpectedValue == Reducer.MValue && + "Wrong result of binary operation."); +} + +template +class Known; +template +class Unknown; + +template +struct Point { + Point() : X(0), Y(0) {} + Point(T X, T Y) : X(X), Y(Y) {} + Point(T V) : X(V), Y(V) {} + bool operator==(const Point &P) const { + return P.X == X && P.Y == Y; + } + T X; + T Y; +}; + +template +bool operator==(const Point &A, const Point &B) { + return A.X == B.X && A.Y == B.Y; +} + +template +struct PointPlus { + using P = Point; + P operator()(const P &A, const P &B) const { + return P(A.X + B.X, A.Y + B.Y); + } +}; + +template +void testKnown(T Identity, T A, T B) { + + BinaryOperation BOp; + buffer ReduBuf(1); + + queue Q; + Q.submit([&](handler &CGH) { + // Reduction needs a global_buffer accessor as a parameter. + // This accessor is not really used in this test. + accessor + ReduAcc(ReduBuf, CGH); + auto Redu = intel::reduction(ReduAcc, BOp); + assert(Redu.getIdentity() == Identity && + "Failed getIdentity() check()."); + test_reducer(Redu, A, B); + test_reducer(Redu, Identity, A, B); + + // Command group must have at least one task in it. Use an empty one. + CGH.single_task>([=]() {}); + }); +} + +template +void testUnknown(T Identity, T A, T B) { + + BinaryOperation BOp; + buffer ReduBuf(1); + queue Q; + Q.submit([&](handler &CGH) { + // Reduction needs a global_buffer accessor as a parameter. + // This accessor is not really used in this test. + accessor + ReduAcc(ReduBuf, CGH); + auto Redu = intel::reduction(ReduAcc, Identity, BOp); + assert(Redu.getIdentity() == Identity && + "Failed getIdentity() check()."); + test_reducer(Redu, Identity, A, B); + + // Command group must have at least one task in it. Use an empty one. + CGH.single_task>([=]() {}); + }); +} + +template +void testBoth(T Identity, T A, T B) { + testKnown(Identity, A, B); + testKnown(Identity, A, B); + testUnknown(Identity, A, B); + testUnknown(Identity, A, B); +} + +int main() { + // testKnown does not pass identity to reduction ctor. + testBoth>(0, 1, 7); + testBoth>(1, 1, 7); + testBoth>(0, 1, 8); + testBoth>(0, 7, 3); + testBoth>(~0, 7, 3); + testBoth>(std::numeric_limits::max(), 7, 3); + testBoth>(std::numeric_limits::min(), 7, 3); + + testBoth>(0, 1, 7); + testBoth>(1, 1, 7); + testBoth>( + std::numeric_limits::max(), 7, 3); + testBoth>( + std::numeric_limits::min(), 7, 3); + + testUnknown, 0, PointPlus>(Point(0), Point(1), Point(7)); + testUnknown, 1, PointPlus>(Point(0), Point(1), Point(7)); + + std::cout << "Test passed\n"; + return 0; +}