diff --git a/sycl/include/CL/sycl.hpp b/sycl/include/CL/sycl.hpp
index 5f0cad5364997..1e986a4f264e8 100644
--- a/sycl/include/CL/sycl.hpp
+++ b/sycl/include/CL/sycl.hpp
@@ -29,6 +29,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
diff --git a/sycl/include/CL/sycl/ONEAPI/group_algorithm.hpp b/sycl/include/CL/sycl/ONEAPI/group_algorithm.hpp
index 3aaab50b236aa..1dfdf003eaee0 100644
--- a/sycl/include/CL/sycl/ONEAPI/group_algorithm.hpp
+++ b/sycl/include/CL/sycl/ONEAPI/group_algorithm.hpp
@@ -16,137 +16,12 @@
 #include
 #include
 #include
+#include
 #include
 #ifndef __DISABLE_SYCL_ONEAPI_GROUP_ALGORITHMS__
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
-namespace detail {
-
-template size_t get_local_linear_range(Group g);
-template <> inline size_t get_local_linear_range>(group<1> g) {
-  return g.get_local_range(0);
-}
-template <> inline size_t get_local_linear_range>(group<2> g) {
-  return g.get_local_range(0) * g.get_local_range(1);
-}
-template <> inline size_t get_local_linear_range>(group<3> g) {
-  return g.get_local_range(0) * g.get_local_range(1) * g.get_local_range(2);
-}
-template <>
-inline size_t get_local_linear_range(ONEAPI::sub_group g) {
-  return g.get_local_range()[0];
-}
-
-template
-typename Group::linear_id_type get_local_linear_id(Group g);
-
-#ifdef __SYCL_DEVICE_ONLY__
-#define __SYCL_GROUP_GET_LOCAL_LINEAR_ID(D) \
-  template <> \
-  group::linear_id_type get_local_linear_id>(group) { \
-    nd_item it = cl::sycl::detail::Builder::getNDItem(); \
-    return it.get_local_linear_id(); \
-  }
-__SYCL_GROUP_GET_LOCAL_LINEAR_ID(1);
-__SYCL_GROUP_GET_LOCAL_LINEAR_ID(2);
-__SYCL_GROUP_GET_LOCAL_LINEAR_ID(3);
-#undef __SYCL_GROUP_GET_LOCAL_LINEAR_ID
-#endif // __SYCL_DEVICE_ONLY__
-
-template <>
-inline ONEAPI::sub_group::linear_id_type
-get_local_linear_id(ONEAPI::sub_group g) {
-  return g.get_local_id()[0];
-}
-
-template
-id linear_id_to_id(range, size_t linear_id);
-template <> inline id<1> linear_id_to_id(range<1>, size_t linear_id) {
-  return id<1>(linear_id);
-}
-template <> inline id<2> linear_id_to_id(range<2> r, size_t linear_id) {
-  id<2> result;
-  result[0] = linear_id / r[1];
-  result[1] = linear_id % r[1];
-  return result;
-}
-template <> inline id<3> linear_id_to_id(range<3> r, size_t linear_id) {
-  id<3> result;
-  result[0] = linear_id / (r[1] * r[2]);
-  result[1] = (linear_id % (r[1] * r[2])) / r[2];
-  result[2] = linear_id % r[2];
-  return result;
-}
-
-template struct identity {};
-
-template struct identity> {
-  static constexpr T value = 0;
-};
-
-template struct identity> {
-  static constexpr T value = std::numeric_limits::has_infinity
-                                 ? std::numeric_limits::infinity()
-                                 : (std::numeric_limits::max)();
-};
-
-template struct identity> {
-  static constexpr T value =
-      std::numeric_limits::has_infinity
-          ?
static_cast(-std::numeric_limits::infinity()) - : std::numeric_limits::lowest(); -}; - -template struct identity> { - static constexpr T value = static_cast(1); -}; - -template struct identity> { - static constexpr T value = 0; -}; - -template struct identity> { - static constexpr T value = 0; -}; - -template struct identity> { - static constexpr T value = ~static_cast(0); -}; - -template -using native_op_list = - type_list, ONEAPI::bit_or, ONEAPI::bit_xor, - ONEAPI::bit_and, ONEAPI::maximum, ONEAPI::minimum, - ONEAPI::multiplies>; - -template struct is_native_op { - static constexpr bool value = - is_contained>::value || - is_contained>::value; -}; - -template -Function for_each(Group g, Ptr first, Ptr last, Function f) { -#ifdef __SYCL_DEVICE_ONLY__ - ptrdiff_t offset = sycl::detail::get_local_linear_id(g); - ptrdiff_t stride = sycl::detail::get_local_linear_range(g); - for (Ptr p = first + offset; p < last; p += stride) { - f(*p); - } - return f; -#else - (void)g; - (void)first; - (void)last; - (void)f; - throw runtime_error("Group algorithms are not supported on host device.", - PI_INVALID_DEVICE); -#endif -} - -} // namespace detail - namespace ONEAPI { // EnableIf shorthands for algorithms that depend only on type @@ -191,124 +66,82 @@ using EnableIfIsNonNativeOp = cl::sycl::detail::enable_if_t< T>; template -detail::enable_if_t::value, bool> -all_of(Group, bool pred) { -#ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::spirv::GroupAll(pred); -#else - (void)pred; - throw runtime_error("Group algorithms are not supported on host device.", - PI_INVALID_DEVICE); -#endif +__SYCL2020_DEPRECATED("ONEAPI::all_of is deprecated. Use all_of_group instead.") +detail::enable_if_t::value, bool> all_of( + Group g, bool pred) { + return all_of_group(g, pred); } template -detail::enable_if_t::value, bool> -all_of(Group g, T x, Predicate pred) { - return all_of(g, pred(x)); +__SYCL2020_DEPRECATED("ONEAPI::all_of is deprecated. Use all_of_group instead.") +detail::enable_if_t::value, bool> all_of( + Group g, T x, Predicate pred) { + return all_of_group(g, pred(x)); } template +__SYCL2020_DEPRECATED("ONEAPI::all_of is deprecated. Use joint_all_of instead.") detail::enable_if_t<(detail::is_generic_group::value && detail::is_pointer::value), - bool> -all_of(Group g, Ptr first, Ptr last, Predicate pred) { -#ifdef __SYCL_DEVICE_ONLY__ - bool partial = true; - sycl::detail::for_each( - g, first, last, - [&](const typename Ptr::element_type &x) { partial &= pred(x); }); - return all_of(g, partial); -#else - (void)g; - (void)first; - (void)last; - (void)pred; - throw runtime_error("Group algorithms are not supported on host device.", - PI_INVALID_DEVICE); -#endif + bool> all_of(Group g, Ptr first, Ptr last, Predicate pred) { + return joint_all_of(g, first, last, pred); } template -detail::enable_if_t::value, bool> -any_of(Group, bool pred) { -#ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::spirv::GroupAny(pred); -#else - (void)pred; - throw runtime_error("Group algorithms are not supported on host device.", - PI_INVALID_DEVICE); -#endif +__SYCL2020_DEPRECATED("ONEAPI::any_of is deprecated. Use any_of_group instead.") +detail::enable_if_t::value, bool> any_of( + Group g, bool pred) { + return any_of_group(g, pred); } template -detail::enable_if_t::value, bool> -any_of(Group g, T x, Predicate pred) { - return any_of(g, pred(x)); +__SYCL2020_DEPRECATED("ONEAPI::any_of is deprecated. 
Use any_of_group instead.") +detail::enable_if_t::value, bool> any_of( + Group g, T x, Predicate pred) { + return any_of_group(g, pred(x)); } template +__SYCL2020_DEPRECATED("ONEAPI::any_of is deprecated. Use joint_any_of instead.") detail::enable_if_t<(detail::is_generic_group::value && detail::is_pointer::value), - bool> -any_of(Group g, Ptr first, Ptr last, Predicate pred) { -#ifdef __SYCL_DEVICE_ONLY__ - bool partial = false; - sycl::detail::for_each( - g, first, last, - [&](const typename Ptr::element_type &x) { partial |= pred(x); }); - return any_of(g, partial); -#else - (void)g; - (void)first; - (void)last; - (void)pred; - throw runtime_error("Group algorithms are not supported on host device.", - PI_INVALID_DEVICE); -#endif + bool> any_of(Group g, Ptr first, Ptr last, Predicate pred) { + return joint_any_of(g, first, last, pred); } template -detail::enable_if_t::value, bool> -none_of(Group, bool pred) { -#ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::spirv::GroupAll(!pred); -#else - (void)pred; - throw runtime_error("Group algorithms are not supported on host device.", - PI_INVALID_DEVICE); -#endif +__SYCL2020_DEPRECATED( + "ONEAPI::none_of is deprecated. Use none_of_group instead.") +detail::enable_if_t::value, bool> none_of( + Group g, bool pred) { + return none_of_group(g, pred); } template -detail::enable_if_t::value, bool> -none_of(Group g, T x, Predicate pred) { - return none_of(g, pred(x)); +__SYCL2020_DEPRECATED( + "ONEAPI::none_of is deprecated. Use none_of_group instead.") +detail::enable_if_t::value, bool> none_of( + Group g, T x, Predicate pred) { + return none_of_group(g, pred(x)); } template +__SYCL2020_DEPRECATED( + "ONEAPI::none_of is deprecated. Use joint_none_of instead.") detail::enable_if_t<(detail::is_generic_group::value && detail::is_pointer::value), - bool> -none_of(Group g, Ptr first, Ptr last, Predicate pred) { -#ifdef __SYCL_DEVICE_ONLY__ - return !any_of(g, first, last, pred); -#else - (void)g; - (void)first; - (void)last; - (void)pred; - throw runtime_error("Group algorithms are not supported on host device.", - PI_INVALID_DEVICE); -#endif + bool> none_of(Group g, Ptr first, Ptr last, + Predicate pred) { + return joint_none_of(g, first, last, pred); } template +__SYCL2020_DEPRECATED( + "ONEAPI::broadcast is deprecated. Use group_broadcast instead.") detail::enable_if_t<(detail::is_generic_group::value && std::is_trivially_copyable::value && !detail::is_vector_arithmetic::value), - T> -broadcast(Group, T x, typename Group::id_type local_id) { + T> broadcast(Group, T x, typename Group::id_type local_id) { #ifdef __SYCL_DEVICE_ONLY__ return sycl::detail::spirv::GroupBroadcast(x, local_id); #else @@ -320,10 +153,12 @@ broadcast(Group, T x, typename Group::id_type local_id) { } template +__SYCL2020_DEPRECATED( + "ONEAPI::broadcast is deprecated. Use group_broadcast instead.") detail::enable_if_t<(detail::is_generic_group::value && detail::is_vector_arithmetic::value), - T> -broadcast(Group g, T x, typename Group::id_type local_id) { + T> broadcast(Group g, T x, + typename Group::id_type local_id) { #ifdef __SYCL_DEVICE_ONLY__ T result; for (int s = 0; s < x.get_size(); ++s) { @@ -340,11 +175,14 @@ broadcast(Group g, T x, typename Group::id_type local_id) { } template +__SYCL2020_DEPRECATED( + "ONEAPI::broadcast is deprecated. 
Use group_broadcast instead.") detail::enable_if_t<(detail::is_generic_group::value && std::is_trivially_copyable::value && !detail::is_vector_arithmetic::value), - T> -broadcast(Group g, T x, typename Group::linear_id_type linear_local_id) { + T> broadcast(Group g, T x, + typename Group::linear_id_type + linear_local_id) { #ifdef __SYCL_DEVICE_ONLY__ return broadcast( g, x, @@ -359,10 +197,13 @@ broadcast(Group g, T x, typename Group::linear_id_type linear_local_id) { } template +__SYCL2020_DEPRECATED( + "ONEAPI::broadcast is deprecated. Use group_broadcast instead.") detail::enable_if_t<(detail::is_generic_group::value && detail::is_vector_arithmetic::value), - T> -broadcast(Group g, T x, typename Group::linear_id_type linear_local_id) { + T> broadcast(Group g, T x, + typename Group::linear_id_type + linear_local_id) { #ifdef __SYCL_DEVICE_ONLY__ T result; for (int s = 0; s < x.get_size(); ++s) { @@ -379,11 +220,12 @@ broadcast(Group g, T x, typename Group::linear_id_type linear_local_id) { } template +__SYCL2020_DEPRECATED( + "ONEAPI::broadcast is deprecated. Use group_broadcast instead.") detail::enable_if_t<(detail::is_generic_group::value && std::is_trivially_copyable::value && !detail::is_vector_arithmetic::value), - T> -broadcast(Group g, T x) { + T> broadcast(Group g, T x) { #ifdef __SYCL_DEVICE_ONLY__ return broadcast(g, x, 0); #else @@ -395,10 +237,11 @@ broadcast(Group g, T x) { } template +__SYCL2020_DEPRECATED( + "ONEAPI::broadcast is deprecated. Use group_broadcast instead.") detail::enable_if_t<(detail::is_generic_group::value && detail::is_vector_arithmetic::value), - T> -broadcast(Group g, T x) { + T> broadcast(Group g, T x) { #ifdef __SYCL_DEVICE_ONLY__ T result; for (int s = 0; s < x.get_size(); ++s) { @@ -414,54 +257,33 @@ broadcast(Group g, T x) { } template +__SYCL2020_DEPRECATED( + "ONEAPI::reduce is deprecated. Use reduce_over_group instead.") detail::enable_if_t<(detail::is_generic_group::value && detail::is_scalar_arithmetic::value && detail::is_native_op::value), - T> -reduce(Group, T x, BinaryOperation binary_op) { - // FIXME: Do not special-case for half precision - static_assert( - std::is_same::value || - (std::is_same::value && - std::is_same::value), - "Result type of binary_op must match reduction accumulation type."); -#ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::calc::value>( - typename sycl::detail::GroupOpTag::type(), x, binary_op); -#else - throw runtime_error("Group algorithms are not supported on host device.", - PI_INVALID_DEVICE); -#endif + T> reduce(Group g, T x, BinaryOperation binary_op) { + return reduce_over_group(g, x, binary_op); } template +__SYCL2020_DEPRECATED( + "ONEAPI::reduce is deprecated. Use reduce_over_group instead.") detail::enable_if_t<(detail::is_generic_group::value && detail::is_vector_arithmetic::value && detail::is_native_op::value), - T> -reduce(Group g, T x, BinaryOperation binary_op) { - // FIXME: Do not special-case for half precision - static_assert( - std::is_same::value || - (std::is_same::value && - std::is_same::value), - "Result type of binary_op must match reduction accumulation type."); - T result; - for (int s = 0; s < x.get_size(); ++s) { - result[s] = reduce(g, x[s], binary_op); - } - return result; + T> reduce(Group g, T x, BinaryOperation binary_op) { + return reduce_over_group(g, x, binary_op); } template +__SYCL2020_DEPRECATED( + "ONEAPI::reduce is deprecated. 
Use reduce_over_group instead.") detail::enable_if_t<(detail::is_sub_group::value && std::is_trivially_copyable::value && (!detail::is_arithmetic::value || !detail::is_native_op::value)), - T> -reduce(Group g, T x, BinaryOperation op) { + T> reduce(Group g, T x, BinaryOperation op) { T result = x; for (int mask = 1; mask < g.get_max_local_range()[0]; mask *= 2) { T tmp = g.shuffle_xor(result, id<1>(mask)); @@ -473,65 +295,39 @@ reduce(Group g, T x, BinaryOperation op) { } template +__SYCL2020_DEPRECATED( + "ONEAPI::reduce is deprecated. Use reduce_over_group instead.") detail::enable_if_t<(detail::is_generic_group::value && detail::is_scalar_arithmetic::value && detail::is_scalar_arithmetic::value && detail::is_native_op::value && detail::is_native_op::value), - T> -reduce(Group g, V x, T init, BinaryOperation binary_op) { - // FIXME: Do not special-case for half precision - static_assert( - std::is_same::value || - (std::is_same::value && - std::is_same::value), - "Result type of binary_op must match reduction accumulation type."); -#ifdef __SYCL_DEVICE_ONLY__ - return binary_op(init, reduce(g, x, binary_op)); -#else - (void)g; - throw runtime_error("Group algorithms are not supported on host device.", - PI_INVALID_DEVICE); -#endif + T> reduce(Group g, V x, T init, BinaryOperation binary_op) { + return reduce_over_group(g, x, init, binary_op); } template +__SYCL2020_DEPRECATED( + "ONEAPI::reduce is deprecated. Use reduce_over_group instead.") detail::enable_if_t<(detail::is_generic_group::value && detail::is_vector_arithmetic::value && detail::is_vector_arithmetic::value && detail::is_native_op::value && detail::is_native_op::value), - T> -reduce(Group g, V x, T init, BinaryOperation binary_op) { - // FIXME: Do not special-case for half precision - static_assert( - std::is_same::value || - (std::is_same::value && - std::is_same::value), - "Result type of binary_op must match reduction accumulation type."); -#ifdef __SYCL_DEVICE_ONLY__ - T result = init; - for (int s = 0; s < x.get_size(); ++s) { - result[s] = binary_op(init[s], reduce(g, x[s], binary_op)); - } - return result; -#else - (void)g; - throw runtime_error("Group algorithms are not supported on host device.", - PI_INVALID_DEVICE); -#endif + T> reduce(Group g, V x, T init, BinaryOperation binary_op) { + return reduce_over_group(g, x, init, binary_op); } template +__SYCL2020_DEPRECATED( + "ONEAPI::reduce is deprecated. Use reduce_over_group instead.") detail::enable_if_t<(detail::is_sub_group::value && std::is_trivially_copyable::value && std::is_trivially_copyable::value && (!detail::is_arithmetic::value || !detail::is_arithmetic::value || !detail::is_native_op::value)), - T> -reduce(Group g, V x, T init, BinaryOperation op) { + T> reduce(Group g, V x, T init, BinaryOperation op) { T result = x; for (int mask = 1; mask < g.get_max_local_range()[0]; mask *= 2) { T tmp = g.shuffle_xor(result, id<1>(mask)); @@ -543,34 +339,19 @@ reduce(Group g, V x, T init, BinaryOperation op) { } template +__SYCL2020_DEPRECATED("ONEAPI::reduce is deprecated. 
Use joint_reduce instead.") detail::enable_if_t< (detail::is_generic_group::value && detail::is_pointer::value && detail::is_arithmetic::type>::value), - typename detail::remove_pointer::type> -reduce(Group g, Ptr first, Ptr last, BinaryOperation binary_op) { - using T = typename detail::remove_pointer::type; - // FIXME: Do not special-case for half precision - static_assert( - std::is_same::value || - (std::is_same::value && - std::is_same::value), - "Result type of binary_op must match reduction accumulation type."); -#ifdef __SYCL_DEVICE_ONLY__ - typename Ptr::element_type partial = - sycl::detail::identity::value; - sycl::detail::for_each(g, first, last, - [&](const T &x) { partial = binary_op(partial, x); }); - return reduce(g, partial, binary_op); -#else - (void)g; - (void)last; - (void)binary_op; - throw runtime_error("Group algorithms are not supported on host device.", - PI_INVALID_DEVICE); -#endif + typename detail::remove_pointer::type> reduce(Group g, Ptr first, + Ptr last, + BinaryOperation + binary_op) { + return joint_reduce(g, first, last, binary_op); } template +__SYCL2020_DEPRECATED("ONEAPI::reduce is deprecated. Use joint_reduce instead.") detail::enable_if_t< (detail::is_generic_group::value && detail::is_pointer::value && detail::is_arithmetic::type>::value && @@ -578,125 +359,60 @@ detail::enable_if_t< detail::is_native_op::type, BinaryOperation>::value && detail::is_native_op::value), - T> -reduce(Group g, Ptr first, Ptr last, T init, BinaryOperation binary_op) { - // FIXME: Do not special-case for half precision - static_assert( - std::is_same::value || - (std::is_same::value && - std::is_same::value), - "Result type of binary_op must match reduction accumulation type."); -#ifdef __SYCL_DEVICE_ONLY__ - T partial = sycl::detail::identity::value; - sycl::detail::for_each( - g, first, last, [&](const typename detail::remove_pointer::type &x) { - partial = binary_op(partial, x); - }); - return reduce(g, partial, init, binary_op); -#else - (void)g; - (void)last; - throw runtime_error("Group algorithms are not supported on host device.", - PI_INVALID_DEVICE); -#endif + T> reduce(Group g, Ptr first, Ptr last, T init, BinaryOperation binary_op) { + return joint_reduce(g, first, last, init, binary_op); } template +__SYCL2020_DEPRECATED("ONEAPI::exclusive_scan is deprecated. Use " + "exclusive_scan_over_group instead.") detail::enable_if_t<(detail::is_generic_group::value && detail::is_scalar_arithmetic::value && detail::is_native_op::value), - T> -exclusive_scan(Group, T x, BinaryOperation binary_op) { - // FIXME: Do not special-case for half precision - static_assert(std::is_same::value || - (std::is_same::value && - std::is_same::value), - "Result type of binary_op must match scan accumulation type."); -#ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::calc::value>( - typename sycl::detail::GroupOpTag::type(), x, binary_op); -#else - throw runtime_error("Group algorithms are not supported on host device.", - PI_INVALID_DEVICE); -#endif + T> exclusive_scan(Group g, T x, BinaryOperation binary_op) { + return exclusive_scan_over_group(g, x, binary_op); } template +__SYCL2020_DEPRECATED("ONEAPI::exclusive_scan is deprecated. 
Use " + "exclusive_scan_over_group instead.") detail::enable_if_t<(detail::is_generic_group::value && detail::is_vector_arithmetic::value && detail::is_native_op::value), - T> -exclusive_scan(Group g, T x, BinaryOperation binary_op) { - // FIXME: Do not special-case for half precision - static_assert( - std::is_same::value || - (std::is_same::value && - std::is_same::value), - "Result type of binary_op must match scan accumulation type."); - T result; - for (int s = 0; s < x.get_size(); ++s) { - result[s] = exclusive_scan(g, x[s], binary_op); - } - return result; + T> exclusive_scan(Group g, T x, BinaryOperation binary_op) { + return exclusive_scan_over_group(g, x, binary_op); } template +__SYCL2020_DEPRECATED("ONEAPI::exclusive_scan is deprecated. Use " + "exclusive_scan_over_group instead.") detail::enable_if_t<(detail::is_generic_group::value && detail::is_vector_arithmetic::value && detail::is_vector_arithmetic::value && detail::is_native_op::value && detail::is_native_op::value), - T> -exclusive_scan(Group g, V x, T init, BinaryOperation binary_op) { - // FIXME: Do not special-case for half precision - static_assert( - std::is_same::value || - (std::is_same::value && - std::is_same::value), - "Result type of binary_op must match scan accumulation type."); - T result; - for (int s = 0; s < x.get_size(); ++s) { - result[s] = exclusive_scan(g, x[s], init[s], binary_op); - } - return result; + T> exclusive_scan(Group g, V x, T init, + BinaryOperation binary_op) { + return exclusive_scan_over_group(g, x, init, binary_op); } template +__SYCL2020_DEPRECATED("ONEAPI::exclusive_scan is deprecated. Use " + "exclusive_scan_over_group instead.") detail::enable_if_t<(detail::is_generic_group::value && detail::is_scalar_arithmetic::value && detail::is_scalar_arithmetic::value && detail::is_native_op::value && detail::is_native_op::value), - T> -exclusive_scan(Group g, V x, T init, BinaryOperation binary_op) { - // FIXME: Do not special-case for half precision - static_assert(std::is_same::value || - (std::is_same::value && - std::is_same::value), - "Result type of binary_op must match scan accumulation type."); -#ifdef __SYCL_DEVICE_ONLY__ - typename Group::linear_id_type local_linear_id = - sycl::detail::get_local_linear_id(g); - if (local_linear_id == 0) { - x = binary_op(init, x); - } - T scan = exclusive_scan(g, x, binary_op); - if (local_linear_id == 0) { - scan = init; - } - return scan; -#else - (void)g; - throw runtime_error("Group algorithms are not supported on host device.", - PI_INVALID_DEVICE); -#endif + T> exclusive_scan(Group g, V x, T init, + BinaryOperation binary_op) { + return exclusive_scan_over_group(g, x, init, binary_op); } template +__SYCL2020_DEPRECATED( + "ONEAPI::exclusive_scan is deprecated. 
Use joint_exclusive_scan instead.") detail::enable_if_t< (detail::is_generic_group::value && detail::is_pointer::value && detail::is_pointer::value && @@ -706,49 +422,15 @@ detail::enable_if_t< detail::is_native_op::type, BinaryOperation>::value && detail::is_native_op::value), - OutPtr> -exclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, T init, - BinaryOperation binary_op) { - // FIXME: Do not special-case for half precision - static_assert( - std::is_same::value || - (std::is_same::value && - std::is_same::value), - "Result type of binary_op must match scan accumulation type."); -#ifdef __SYCL_DEVICE_ONLY__ - ptrdiff_t offset = sycl::detail::get_local_linear_id(g); - ptrdiff_t stride = sycl::detail::get_local_linear_range(g); - ptrdiff_t N = last - first; - auto roundup = [=](const ptrdiff_t &v, - const ptrdiff_t &divisor) -> ptrdiff_t { - return ((v + divisor - 1) / divisor) * divisor; - }; - typename InPtr::element_type x; - typename OutPtr::element_type carry = init; - for (ptrdiff_t chunk = 0; chunk < roundup(N, stride); chunk += stride) { - ptrdiff_t i = chunk + offset; - if (i < N) { - x = first[i]; - } - typename OutPtr::element_type out = exclusive_scan(g, x, carry, binary_op); - if (i < N) { - result[i] = out; - } - carry = broadcast(g, binary_op(out, x), stride - 1); - } - return result + N; -#else - (void)g; - (void)last; - (void)result; - (void)init; - throw runtime_error("Group algorithms are not supported on host device.", - PI_INVALID_DEVICE); -#endif + OutPtr> exclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, + T init, BinaryOperation binary_op) { + return joint_exclusive_scan(g, first, last, result, init, binary_op); } template +__SYCL2020_DEPRECATED( + "ONEAPI::exclusive_scan is deprecated. Use joint_exclusive_scan instead.") detail::enable_if_t< (detail::is_generic_group::value && detail::is_pointer::value && detail::is_pointer::value && @@ -756,111 +438,61 @@ detail::enable_if_t< typename detail::remove_pointer::type>::value && detail::is_native_op::type, BinaryOperation>::value), - OutPtr> -exclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, - BinaryOperation binary_op) { - // FIXME: Do not special-case for half precision - static_assert( - std::is_same::value || - (std::is_same::value && - std::is_same::value), - "Result type of binary_op must match scan accumulation type."); - return exclusive_scan(g, first, last, result, - sycl::detail::identity::value, - binary_op); + OutPtr> exclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, + BinaryOperation binary_op) { + return joint_exclusive_scan(g, first, last, result, binary_op); } template +__SYCL2020_DEPRECATED("ONEAPI::inclusive_scan is deprecated. Use " + "inclusive_scan_over_group instead.") detail::enable_if_t<(detail::is_generic_group::value && detail::is_vector_arithmetic::value && detail::is_native_op::value), - T> -inclusive_scan(Group g, T x, BinaryOperation binary_op) { - // FIXME: Do not special-case for half precision - static_assert( - std::is_same::value || - (std::is_same::value && - std::is_same::value), - "Result type of binary_op must match scan accumulation type."); - T result; - for (int s = 0; s < x.get_size(); ++s) { - result[s] = inclusive_scan(g, x[s], binary_op); - } - return result; + T> inclusive_scan(Group g, T x, BinaryOperation binary_op) { + return inclusive_scan_over_group(g, x, binary_op); } template +__SYCL2020_DEPRECATED("ONEAPI::inclusive_scan is deprecated. 
Use " + "inclusive_scan_over_group instead.") detail::enable_if_t<(detail::is_generic_group::value && detail::is_scalar_arithmetic::value && detail::is_native_op::value), - T> -inclusive_scan(Group, T x, BinaryOperation binary_op) { - // FIXME: Do not special-case for half precision - static_assert(std::is_same::value || - (std::is_same::value && - std::is_same::value), - "Result type of binary_op must match scan accumulation type."); -#ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::calc::value>( - typename sycl::detail::GroupOpTag::type(), x, binary_op); -#else - throw runtime_error("Group algorithms are not supported on host device.", - PI_INVALID_DEVICE); -#endif + T> inclusive_scan(Group g, T x, BinaryOperation binary_op) { + return inclusive_scan_over_group(g, x, binary_op); } template +__SYCL2020_DEPRECATED("ONEAPI::inclusive_scan is deprecated. Use " + "inclusive_scan_over_group instead.") detail::enable_if_t<(detail::is_generic_group::value && detail::is_scalar_arithmetic::value && detail::is_scalar_arithmetic::value && detail::is_native_op::value && detail::is_native_op::value), - T> -inclusive_scan(Group g, V x, BinaryOperation binary_op, T init) { - // FIXME: Do not special-case for half precision - static_assert(std::is_same::value || - (std::is_same::value && - std::is_same::value), - "Result type of binary_op must match scan accumulation type."); -#ifdef __SYCL_DEVICE_ONLY__ - if (sycl::detail::get_local_linear_id(g) == 0) { - x = binary_op(init, x); - } - return inclusive_scan(g, x, binary_op); -#else - (void)g; - throw runtime_error("Group algorithms are not supported on host device.", - PI_INVALID_DEVICE); -#endif + T> inclusive_scan(Group g, V x, BinaryOperation binary_op, + T init) { + return inclusive_scan_over_group(g, x, binary_op, init); } template +__SYCL2020_DEPRECATED("ONEAPI::inclusive_scan is deprecated. Use " + "inclusive_scan_over_group instead.") detail::enable_if_t<(detail::is_generic_group::value && detail::is_vector_arithmetic::value && detail::is_vector_arithmetic::value && detail::is_native_op::value && detail::is_native_op::value), - T> -inclusive_scan(Group g, V x, BinaryOperation binary_op, T init) { - // FIXME: Do not special-case for half precision - static_assert( - std::is_same::value || - (std::is_same::value && - std::is_same::value), - "Result type of binary_op must match scan accumulation type."); - T result; - for (int s = 0; s < x.get_size(); ++s) { - result[s] = inclusive_scan(g, x[s], binary_op, init[s]); - } - return result; + T> inclusive_scan(Group g, V x, BinaryOperation binary_op, + T init) { + return inclusive_scan_over_group(g, x, binary_op, init); } template +__SYCL2020_DEPRECATED( + "ONEAPI::inclusive_scan is deprecated. 
Use joint_inclusive_scan instead.") detail::enable_if_t< (detail::is_generic_group::value && detail::is_pointer::value && detail::is_pointer::value && @@ -870,48 +502,15 @@ detail::enable_if_t< detail::is_native_op::type, BinaryOperation>::value && detail::is_native_op::value), - OutPtr> -inclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, - BinaryOperation binary_op, T init) { - // FIXME: Do not special-case for half precision - static_assert( - std::is_same::value || - (std::is_same::value && - std::is_same::value), - "Result type of binary_op must match scan accumulation type."); -#ifdef __SYCL_DEVICE_ONLY__ - ptrdiff_t offset = sycl::detail::get_local_linear_id(g); - ptrdiff_t stride = sycl::detail::get_local_linear_range(g); - ptrdiff_t N = last - first; - auto roundup = [=](const ptrdiff_t &v, - const ptrdiff_t &divisor) -> ptrdiff_t { - return ((v + divisor - 1) / divisor) * divisor; - }; - typename InPtr::element_type x; - typename OutPtr::element_type carry = init; - for (ptrdiff_t chunk = 0; chunk < roundup(N, stride); chunk += stride) { - ptrdiff_t i = chunk + offset; - if (i < N) { - x = first[i]; - } - typename OutPtr::element_type out = inclusive_scan(g, x, binary_op, carry); - if (i < N) { - result[i] = out; - } - carry = broadcast(g, out, stride - 1); - } - return result + N; -#else - (void)g; - (void)last; - (void)result; - throw runtime_error("Group algorithms are not supported on host device.", - PI_INVALID_DEVICE); -#endif + OutPtr> inclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, + BinaryOperation binary_op, T init) { + return joint_inclusive_scan(g, first, last, result, binary_op, init); } template +__SYCL2020_DEPRECATED( + "ONEAPI::inclusive_scan is deprecated. Use joint_inclusive_scan instead.") detail::enable_if_t< (detail::is_generic_group::value && detail::is_pointer::value && detail::is_pointer::value && @@ -919,19 +518,9 @@ detail::enable_if_t< typename detail::remove_pointer::type>::value && detail::is_native_op::type, BinaryOperation>::value), - OutPtr> -inclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, - BinaryOperation binary_op) { - // FIXME: Do not special-case for half precision - static_assert( - std::is_same::value || - (std::is_same::value && - std::is_same::value), - "Result type of binary_op must match scan accumulation type."); - return inclusive_scan(g, first, last, result, binary_op, - sycl::detail::identity::value); + OutPtr> inclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, + BinaryOperation binary_op) { + return joint_inclusive_scan(g, first, last, result, binary_op); } template diff --git a/sycl/include/CL/sycl/ONEAPI/reduction.hpp b/sycl/include/CL/sycl/ONEAPI/reduction.hpp index 168bc67ff6eb8..2780254695a08 100644 --- a/sycl/include/CL/sycl/ONEAPI/reduction.hpp +++ b/sycl/include/CL/sycl/ONEAPI/reduction.hpp @@ -15,6 +15,7 @@ #include #include #include +#include #include @@ -45,178 +46,9 @@ __SYCL_EXPORT size_t reduGetMaxWGSize(shared_ptr_class Queue, __SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize, size_t &NWorkGroups); -template -using IsReduPlus = - bool_constant>::value || - std::is_same>::value>; - -template -using IsReduMultiplies = - bool_constant>::value || - std::is_same>::value>; - -template -using IsReduMinimum = - bool_constant>::value || - std::is_same>::value>; - -template -using IsReduMaximum = - bool_constant>::value || - std::is_same>::value>; - -template -using IsReduBitOR = - bool_constant>::value || - std::is_same>::value>; - -template 
-using IsReduBitXOR = - bool_constant>::value || - std::is_same>::value>; - -template -using IsReduBitAND = - bool_constant>::value || - std::is_same>::value>; - -template -using IsReduOptForFastAtomicFetch = - bool_constant::value && - sycl::detail::IsValidAtomicType::value && - (IsReduPlus::value || - IsReduMinimum::value || - IsReduMaximum::value || - IsReduBitOR::value || - IsReduBitXOR::value || - IsReduBitAND::value)>; - -template -using IsReduOptForFastReduce = - bool_constant<((is_sgeninteger::value && - (sizeof(T) == 4 || sizeof(T) == 8)) || - is_sgenfloat::value) && - (IsReduPlus::value || - IsReduMinimum::value || - IsReduMaximum::value)>; - -// Identity = 0 -template -using IsZeroIdentityOp = bool_constant< - (is_sgeninteger::value && (IsReduPlus::value || - IsReduBitOR::value || - IsReduBitXOR::value)) || - (is_sgenfloat::value && IsReduPlus::value)>; - -// Identity = 1 -template -using IsOneIdentityOp = - bool_constant<(is_sgeninteger::value || is_sgenfloat::value) && - IsReduMultiplies::value>; - -// Identity = ~0 -template -using IsOnesIdentityOp = bool_constant::value && - IsReduBitAND::value>; - -// Identity = -template -using IsMinimumIdentityOp = - bool_constant<(is_sgeninteger::value || is_sgenfloat::value) && - IsReduMinimum::value>; - -// Identity = -template -using IsMaximumIdentityOp = - bool_constant<(is_sgeninteger::value || is_sgenfloat::value) && - IsReduMaximum::value>; - -template -using IsKnownIdentityOp = - bool_constant::value || - IsOneIdentityOp::value || - IsOnesIdentityOp::value || - IsMinimumIdentityOp::value || - IsMaximumIdentityOp::value>; -template -struct has_known_identity_impl - : std::integral_constant< - bool, IsKnownIdentityOp::value> {}; - -template -struct known_identity_impl {}; - -/// Returns zero as identity for ADD, OR, XOR operations. -template -struct known_identity_impl::value>::type> { - static constexpr AccumulatorT value = 0; -}; - -template -struct known_identity_impl::value>::type> { - static constexpr half value = -#ifdef __SYCL_DEVICE_ONLY__ - 0; -#else - cl::sycl::detail::host_half_impl::half(static_cast(0)); -#endif -}; - -/// Returns one as identify for MULTIPLY operations. -template -struct known_identity_impl::value>::type> { - static constexpr AccumulatorT value = 1; -}; - -template -struct known_identity_impl::value>::type> { - static constexpr half value = -#ifdef __SYCL_DEVICE_ONLY__ - 1; -#else - cl::sycl::detail::host_half_impl::half(static_cast(0x3C00)); -#endif -}; -/// Returns bit image consisting of all ones as identity for AND operations. -template -struct known_identity_impl::value>::type> { - static constexpr AccumulatorT value = ~static_cast(0); -}; - -/// Returns maximal possible value as identity for MIN operations. -template -struct known_identity_impl::value>::type> { - static constexpr AccumulatorT value = - std::numeric_limits::has_infinity - ? std::numeric_limits::infinity() - : (std::numeric_limits::max)(); -}; -/// Returns minimal possible value as identity for MAX operations. -template -struct known_identity_impl::value>::type> { - static constexpr AccumulatorT value = - std::numeric_limits::has_infinity - ? static_cast( - -std::numeric_limits::infinity()) - : std::numeric_limits::lowest(); -}; /// Class that is used to represent objects that are passed to user's lambda /// functions and representing users' reduction variable. 
@@ -1838,25 +1670,23 @@ reduction(T *VarPtr, BinaryOperation) { return {VarPtr}; } +// ---- has_known_identity template -struct has_known_identity : detail::has_known_identity_impl< - typename std::decay::type, - typename std::decay::type> {}; -#if __cplusplus >= 201703L +struct has_known_identity + : sycl::has_known_identity {}; + template -inline constexpr bool has_known_identity_v = +__SYCL_INLINE_CONSTEXPR bool has_known_identity_v = has_known_identity::value; -#endif +// ---- known_identity template -struct known_identity - : detail::known_identity_impl::type, - typename std::decay::type> {}; -#if __cplusplus >= 201703L +struct known_identity : sycl::known_identity {}; + template -inline constexpr AccumulatorT known_identity_v = +__SYCL_INLINE_CONSTEXPR AccumulatorT known_identity_v = known_identity::value; -#endif + } // namespace ONEAPI } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/include/CL/sycl/detail/type_traits.hpp b/sycl/include/CL/sycl/detail/type_traits.hpp index ee2a3bd9a327d..98989ab9fba2a 100644 --- a/sycl/include/CL/sycl/detail/type_traits.hpp +++ b/sycl/include/CL/sycl/detail/type_traits.hpp @@ -22,6 +22,21 @@ namespace ONEAPI { struct sub_group; } // namespace ONEAPI namespace detail { + +template struct is_group : std::false_type {}; + +template +struct is_group> : std::true_type {}; + +template struct is_sub_group : std::false_type {}; + +template <> struct is_sub_group : std::true_type {}; + +template +struct is_generic_group + : std::integral_constant::value || is_sub_group::value> {}; + namespace half_impl { class half; } @@ -31,6 +46,10 @@ using half = detail::half_impl::half; // Forward declaration template class multi_ptr; +template +__SYCL_INLINE_CONSTEXPR bool is_group_v = + detail::is_group::value || detail::is_sub_group::value; + namespace detail { template struct copy_cv_qualifiers; @@ -319,19 +338,6 @@ template using const_if_const_AS = DataT; #endif -template struct is_group : std::false_type {}; - -template -struct is_group> : std::true_type {}; - -template struct is_sub_group : std::false_type {}; - -template <> struct is_sub_group : std::true_type {}; - -template -struct is_generic_group - : std::integral_constant::value || is_sub_group::value> {}; } // namespace detail } // namespace sycl diff --git a/sycl/include/CL/sycl/group_algorithm.hpp b/sycl/include/CL/sycl/group_algorithm.hpp new file mode 100644 index 0000000000000..c7dd9680eb0f7 --- /dev/null +++ b/sycl/include/CL/sycl/group_algorithm.hpp @@ -0,0 +1,838 @@ +//==----------- group_algorithm.hpp ------------------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +__SYCL_INLINE_NAMESPACE(cl) { +namespace sycl { +namespace detail { + +// ---- linear_id_to_id +template +id linear_id_to_id(range, size_t linear_id); +template <> inline id<1> linear_id_to_id(range<1>, size_t linear_id) { + return id<1>(linear_id); +} +template <> inline id<2> linear_id_to_id(range<2> r, size_t linear_id) { + id<2> result; + result[0] = linear_id / r[1]; + result[1] = linear_id % r[1]; + return result; +} +template <> inline id<3> linear_id_to_id(range<3> r, size_t linear_id) { + id<3> result; + result[0] = linear_id / (r[1] * r[2]); + result[1] = (linear_id % (r[1] * r[2])) / r[2]; + result[2] = linear_id % r[2]; + return result; +} + +// ---- get_local_linear_range +template size_t get_local_linear_range(Group g); +template <> inline size_t get_local_linear_range>(group<1> g) { + return g.get_local_range(0); +} +template <> inline size_t get_local_linear_range>(group<2> g) { + return g.get_local_range(0) * g.get_local_range(1); +} +template <> inline size_t get_local_linear_range>(group<3> g) { + return g.get_local_range(0) * g.get_local_range(1) * g.get_local_range(2); +} +template <> +inline size_t get_local_linear_range(ONEAPI::sub_group g) { + return g.get_local_range()[0]; +} + +// ---- get_local_linear_id +template +typename Group::linear_id_type get_local_linear_id(Group g); + +#ifdef __SYCL_DEVICE_ONLY__ +#define __SYCL_GROUP_GET_LOCAL_LINEAR_ID(D) \ + template <> \ + group::linear_id_type get_local_linear_id>(group) { \ + nd_item it = cl::sycl::detail::Builder::getNDItem(); \ + return it.get_local_linear_id(); \ + } +__SYCL_GROUP_GET_LOCAL_LINEAR_ID(1); +__SYCL_GROUP_GET_LOCAL_LINEAR_ID(2); +__SYCL_GROUP_GET_LOCAL_LINEAR_ID(3); +#undef __SYCL_GROUP_GET_LOCAL_LINEAR_ID +#endif // __SYCL_DEVICE_ONLY__ + +template <> +inline ONEAPI::sub_group::linear_id_type +get_local_linear_id(ONEAPI::sub_group g) { + return g.get_local_id()[0]; +} + +// ---- is_native_op +template +using native_op_list = + type_list, ONEAPI::bit_or, ONEAPI::bit_xor, + ONEAPI::bit_and, ONEAPI::maximum, ONEAPI::minimum, + ONEAPI::multiplies>; + +template struct is_native_op { + static constexpr bool value = + is_contained>::value || + is_contained>::value; +}; + +// ---- for_each +template +Function for_each(Group g, Ptr first, Ptr last, Function f) { +#ifdef __SYCL_DEVICE_ONLY__ + ptrdiff_t offset = sycl::detail::get_local_linear_id(g); + ptrdiff_t stride = sycl::detail::get_local_linear_range(g); + for (Ptr p = first + offset; p < last; p += stride) { + f(*p); + } + return f; +#else + (void)g; + (void)first; + (void)last; + (void)f; + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} +} // namespace detail + +// ---- reduce_over_group +template +detail::enable_if_t<(is_group_v> && + detail::is_scalar_arithmetic::value && + detail::is_native_op::value), + T> +reduce_over_group(Group, T x, BinaryOperation binary_op) { + // FIXME: Do not special-case for half precision + static_assert( + std::is_same::value || + (std::is_same::value && + std::is_same::value), + "Result type of binary_op must match reduction accumulation type."); +#ifdef __SYCL_DEVICE_ONLY__ + return sycl::detail::calc::value>( + typename sycl::detail::GroupOpTag::type(), x, binary_op); +#else + throw 
runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +template +detail::enable_if_t<(is_group_v> && + detail::is_vector_arithmetic::value && + detail::is_native_op::value), + T> +reduce_over_group(Group g, T x, BinaryOperation binary_op) { + // FIXME: Do not special-case for half precision + static_assert( + std::is_same::value || + (std::is_same::value && + std::is_same::value), + "Result type of binary_op must match reduction accumulation type."); + T result; + for (int s = 0; s < x.get_size(); ++s) { + result[s] = reduce_over_group(g, x[s], binary_op); + } + return result; +} + +template +detail::enable_if_t<(is_group_v> && + detail::is_scalar_arithmetic::value && + detail::is_scalar_arithmetic::value && + detail::is_native_op::value && + detail::is_native_op::value), + T> +reduce_over_group(Group g, V x, T init, BinaryOperation binary_op) { + // FIXME: Do not special-case for half precision + static_assert( + std::is_same::value || + (std::is_same::value && + std::is_same::value), + "Result type of binary_op must match reduction accumulation type."); +#ifdef __SYCL_DEVICE_ONLY__ + return binary_op(init, reduce_over_group(g, x, binary_op)); +#else + (void)g; + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +template +detail::enable_if_t<(is_group_v> && + detail::is_vector_arithmetic::value && + detail::is_vector_arithmetic::value && + detail::is_native_op::value && + detail::is_native_op::value), + T> +reduce_over_group(Group g, V x, T init, BinaryOperation binary_op) { + // FIXME: Do not special-case for half precision + static_assert( + std::is_same::value || + (std::is_same::value && + std::is_same::value), + "Result type of binary_op must match reduction accumulation type."); +#ifdef __SYCL_DEVICE_ONLY__ + T result = init; + for (int s = 0; s < x.get_size(); ++s) { + result[s] = binary_op(init[s], reduce_over_group(g, x[s], binary_op)); + } + return result; +#else + (void)g; + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +// ---- joint_reduce +template +detail::enable_if_t< + (is_group_v> && detail::is_pointer::value && + detail::is_arithmetic::type>::value), + typename detail::remove_pointer::type> +joint_reduce(Group g, Ptr first, Ptr last, BinaryOperation binary_op) { + using T = typename detail::remove_pointer::type; + // FIXME: Do not special-case for half precision + static_assert( + std::is_same::value || + (std::is_same::value && + std::is_same::value), + "Result type of binary_op must match reduction accumulation type."); +#ifdef __SYCL_DEVICE_ONLY__ + T partial = sycl::known_identity_v; + sycl::detail::for_each(g, first, last, + [&](const T &x) { partial = binary_op(partial, x); }); + return reduce_over_group(g, partial, binary_op); +#else + (void)g; + (void)last; + (void)binary_op; + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +template +detail::enable_if_t< + (is_group_v> && detail::is_pointer::value && + detail::is_arithmetic::type>::value && + detail::is_arithmetic::value && + detail::is_native_op::type, + BinaryOperation>::value && + detail::is_native_op::value), + T> +joint_reduce(Group g, Ptr first, Ptr last, T init, BinaryOperation binary_op) { + // FIXME: Do not special-case for half precision + static_assert( + std::is_same::value || + (std::is_same::value && + std::is_same::value), + "Result type of binary_op must match 
reduction accumulation type."); +#ifdef __SYCL_DEVICE_ONLY__ + T partial = sycl::known_identity_v; + sycl::detail::for_each( + g, first, last, [&](const typename detail::remove_pointer::type &x) { + partial = binary_op(partial, x); + }); + return reduce_over_group(g, partial, init, binary_op); +#else + (void)g; + (void)last; + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +// ---- any_of_group +template +detail::enable_if_t>, bool> +any_of_group(Group, bool pred) { +#ifdef __SYCL_DEVICE_ONLY__ + return sycl::detail::spirv::GroupAny(pred); +#else + (void)pred; + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +template +detail::enable_if_t, bool> any_of_group(Group g, T x, + Predicate pred) { + return any_of_group(g, pred(x)); +} + +// ---- joint_any_of +template +detail::enable_if_t< + (is_group_v> && detail::is_pointer::value), bool> +joint_any_of(Group g, Ptr first, Ptr last, Predicate pred) { +#ifdef __SYCL_DEVICE_ONLY__ + using T = typename detail::remove_pointer::type; + bool partial = false; + sycl::detail::for_each(g, first, last, [&](T &x) { partial |= pred(x); }); + return any_of_group(g, partial); +#else + (void)g; + (void)first; + (void)last; + (void)pred; + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +// ---- all_of_group +template +detail::enable_if_t>, bool> +all_of_group(Group, bool pred) { +#ifdef __SYCL_DEVICE_ONLY__ + return sycl::detail::spirv::GroupAll(pred); +#else + (void)pred; + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +template +detail::enable_if_t>, bool> +all_of_group(Group g, T x, Predicate pred) { + return all_of_group(g, pred(x)); +} + +// ---- joint_all_of +template +detail::enable_if_t< + (is_group_v> && detail::is_pointer::value), bool> +joint_all_of(Group g, Ptr first, Ptr last, Predicate pred) { +#ifdef __SYCL_DEVICE_ONLY__ + using T = typename detail::remove_pointer::type; + bool partial = true; + sycl::detail::for_each(g, first, last, [&](T &x) { partial &= pred(x); }); + return all_of_group(g, partial); +#else + (void)g; + (void)first; + (void)last; + (void)pred; + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +// ---- none_of_group +template +detail::enable_if_t>, bool> +none_of_group(Group, bool pred) { +#ifdef __SYCL_DEVICE_ONLY__ + return sycl::detail::spirv::GroupAll(!pred); +#else + (void)pred; + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +template +detail::enable_if_t>, bool> +none_of_group(Group g, T x, Predicate pred) { + return none_of_group(g, pred(x)); +} + +// ---- joint_none_of +template +detail::enable_if_t< + (is_group_v> && detail::is_pointer::value), bool> +joint_none_of(Group g, Ptr first, Ptr last, Predicate pred) { +#ifdef __SYCL_DEVICE_ONLY__ + return !joint_any_of(g, first, last, pred); +#else + (void)g; + (void)first; + (void)last; + (void)pred; + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +// ---- shift_group_left +template +detail::enable_if_t<(std::is_same, sub_group>::value && + detail::is_arithmetic::value), + T> +shift_group_left(Group, T x, typename Group::linear_id_type delta = 1) { +#ifdef __SYCL_DEVICE_ONLY__ + return sycl::detail::spirv::SubgroupShuffleDown(x, delta); +#else 
+ (void)x; + (void)delta; + throw runtime_error("Sub-groups are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +// ---- shift_group_right +template +detail::enable_if_t<(std::is_same, sub_group>::value && + detail::is_arithmetic::value), + T> +shift_group_right(Group, T x, typename Group::linear_id_type delta = 1) { +#ifdef __SYCL_DEVICE_ONLY__ + return sycl::detail::spirv::SubgroupShuffleUp(x, delta); +#else + (void)x; + (void)delta; + throw runtime_error("Sub-groups are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +// ---- permute_group_by_xor +template +detail::enable_if_t<(std::is_same, sub_group>::value && + detail::is_arithmetic::value), + T> +permute_group_by_xor(Group, T x, typename Group::linear_id_type mask) { +#ifdef __SYCL_DEVICE_ONLY__ + return sycl::detail::spirv::SubgroupShuffleXor(x, mask); +#else + (void)x; + (void)mask; + throw runtime_error("Sub-groups are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +// ---- select_from_group +template +detail::enable_if_t<(std::is_same, sub_group>::value && + detail::is_arithmetic::value), + T> +select_from_group(Group, T x, typename Group::id_type local_id) { +#ifdef __SYCL_DEVICE_ONLY__ + return sycl::detail::spirv::SubgroupShuffle(x, local_id); +#else + (void)x; + (void)local_id; + throw runtime_error("Sub-groups are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +// ---- group_broadcast +template +detail::enable_if_t<(is_group_v> && + detail::is_scalar_arithmetic::value), + T> +group_broadcast(Group, T x, typename Group::id_type local_id) { +#ifdef __SYCL_DEVICE_ONLY__ + return sycl::detail::spirv::GroupBroadcast(x, local_id); +#else + (void)x; + (void)local_id; + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +template +detail::enable_if_t<(is_group_v> && + detail::is_scalar_arithmetic::value), + T> +group_broadcast(Group g, T x, typename Group::linear_id_type linear_local_id) { +#ifdef __SYCL_DEVICE_ONLY__ + return group_broadcast( + g, x, + sycl::detail::linear_id_to_id(g.get_local_range(), linear_local_id)); +#else + (void)g; + (void)x; + (void)linear_local_id; + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +template +detail::enable_if_t<(is_group_v> && + detail::is_scalar_arithmetic::value), + T> +group_broadcast(Group g, T x) { +#ifdef __SYCL_DEVICE_ONLY__ + return group_broadcast(g, x, 0); +#else + (void)g; + (void)x; + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +// ---- exclusive_scan_over_group +template +detail::enable_if_t<(is_group_v> && + detail::is_scalar_arithmetic::value && + detail::is_native_op::value), + T> +exclusive_scan_over_group(Group, T x, BinaryOperation binary_op) { + // FIXME: Do not special-case for half precision + static_assert(std::is_same::value || + (std::is_same::value && + std::is_same::value), + "Result type of binary_op must match scan accumulation type."); +#ifdef __SYCL_DEVICE_ONLY__ + return sycl::detail::calc::value>( + typename sycl::detail::GroupOpTag::type(), x, binary_op); +#else + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +template +detail::enable_if_t<(is_group_v> && + detail::is_vector_arithmetic::value && + detail::is_native_op::value), + T> +exclusive_scan_over_group(Group g, T x, BinaryOperation binary_op) { + // FIXME: Do not special-case for 
half precision + static_assert( + std::is_same::value || + (std::is_same::value && + std::is_same::value), + "Result type of binary_op must match scan accumulation type."); + T result; + for (int s = 0; s < x.get_size(); ++s) { + result[s] = exclusive_scan_over_group(g, x[s], binary_op); + } + return result; +} + +template +detail::enable_if_t<(is_group_v> && + detail::is_vector_arithmetic::value && + detail::is_vector_arithmetic::value && + detail::is_native_op::value && + detail::is_native_op::value), + T> +exclusive_scan_over_group(Group g, V x, T init, BinaryOperation binary_op) { + // FIXME: Do not special-case for half precision + static_assert( + std::is_same::value || + (std::is_same::value && + std::is_same::value), + "Result type of binary_op must match scan accumulation type."); + T result; + for (int s = 0; s < x.get_size(); ++s) { + result[s] = exclusive_scan_over_group(g, x[s], init[s], binary_op); + } + return result; +} + +template +detail::enable_if_t<(is_group_v> && + detail::is_scalar_arithmetic::value && + detail::is_scalar_arithmetic::value && + detail::is_native_op::value && + detail::is_native_op::value), + T> +exclusive_scan_over_group(Group g, V x, T init, BinaryOperation binary_op) { + // FIXME: Do not special-case for half precision + static_assert(std::is_same::value || + (std::is_same::value && + std::is_same::value), + "Result type of binary_op must match scan accumulation type."); +#ifdef __SYCL_DEVICE_ONLY__ + typename Group::linear_id_type local_linear_id = + sycl::detail::get_local_linear_id(g); + if (local_linear_id == 0) { + x = binary_op(init, x); + } + T scan = exclusive_scan_over_group(g, x, binary_op); + if (local_linear_id == 0) { + scan = init; + } + return scan; +#else + (void)g; + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +// ---- joint_exclusive_scan +template +detail::enable_if_t< + (is_group_v> && detail::is_pointer::value && + detail::is_pointer::value && + detail::is_arithmetic< + typename detail::remove_pointer::type>::value && + detail::is_arithmetic::value && + detail::is_native_op::type, + BinaryOperation>::value && + detail::is_native_op::value), + OutPtr> +joint_exclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, T init, + BinaryOperation binary_op) { + // FIXME: Do not special-case for half precision + static_assert( + std::is_same::value || + (std::is_same::value && + std::is_same::value), + "Result type of binary_op must match scan accumulation type."); +#ifdef __SYCL_DEVICE_ONLY__ + ptrdiff_t offset = sycl::detail::get_local_linear_id(g); + ptrdiff_t stride = sycl::detail::get_local_linear_range(g); + ptrdiff_t N = last - first; + auto roundup = [=](const ptrdiff_t &v, + const ptrdiff_t &divisor) -> ptrdiff_t { + return ((v + divisor - 1) / divisor) * divisor; + }; + typename InPtr::element_type x; + typename OutPtr::element_type carry = init; + for (ptrdiff_t chunk = 0; chunk < roundup(N, stride); chunk += stride) { + ptrdiff_t i = chunk + offset; + if (i < N) { + x = first[i]; + } + typename OutPtr::element_type out = + exclusive_scan_over_group(g, x, carry, binary_op); + if (i < N) { + result[i] = out; + } + carry = group_broadcast(g, binary_op(out, x), stride - 1); + } + return result + N; +#else + (void)g; + (void)last; + (void)result; + (void)init; + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +template +detail::enable_if_t< + (is_group_v> && detail::is_pointer::value && + 
detail::is_pointer::value && + detail::is_arithmetic< + typename detail::remove_pointer::type>::value && + detail::is_native_op::type, + BinaryOperation>::value), + OutPtr> +joint_exclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, + BinaryOperation binary_op) { + // FIXME: Do not special-case for half precision + static_assert( + std::is_same::value || + (std::is_same::value && + std::is_same::value), + "Result type of binary_op must match scan accumulation type."); + return joint_exclusive_scan( + g, first, last, result, + sycl::known_identity_v, + binary_op); +} + +// ---- inclusive_scan_over_group +template +detail::enable_if_t<(is_group_v> && + detail::is_vector_arithmetic::value && + detail::is_native_op::value), + T> +inclusive_scan_over_group(Group g, T x, BinaryOperation binary_op) { + // FIXME: Do not special-case for half precision + static_assert( + std::is_same::value || + (std::is_same::value && + std::is_same::value), + "Result type of binary_op must match scan accumulation type."); + T result; + for (int s = 0; s < x.get_size(); ++s) { + result[s] = inclusive_scan_over_group(g, x[s], binary_op); + } + return result; +} + +template +detail::enable_if_t<(is_group_v> && + detail::is_scalar_arithmetic::value && + detail::is_native_op::value), + T> +inclusive_scan_over_group(Group, T x, BinaryOperation binary_op) { + // FIXME: Do not special-case for half precision + static_assert(std::is_same::value || + (std::is_same::value && + std::is_same::value), + "Result type of binary_op must match scan accumulation type."); +#ifdef __SYCL_DEVICE_ONLY__ + return sycl::detail::calc::value>( + typename sycl::detail::GroupOpTag::type(), x, binary_op); +#else + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +template +detail::enable_if_t<(is_group_v> && + detail::is_scalar_arithmetic::value && + detail::is_scalar_arithmetic::value && + detail::is_native_op::value && + detail::is_native_op::value), + T> +inclusive_scan_over_group(Group g, V x, BinaryOperation binary_op, T init) { + // FIXME: Do not special-case for half precision + static_assert(std::is_same::value || + (std::is_same::value && + std::is_same::value), + "Result type of binary_op must match scan accumulation type."); +#ifdef __SYCL_DEVICE_ONLY__ + if (sycl::detail::get_local_linear_id(g) == 0) { + x = binary_op(init, x); + } + return inclusive_scan_over_group(g, x, binary_op); +#else + (void)g; + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +template +detail::enable_if_t<(is_group_v> && + detail::is_vector_arithmetic::value && + detail::is_vector_arithmetic::value && + detail::is_native_op::value && + detail::is_native_op::value), + T> +inclusive_scan_over_group(Group g, V x, BinaryOperation binary_op, T init) { + // FIXME: Do not special-case for half precision + static_assert( + std::is_same::value || + (std::is_same::value && + std::is_same::value), + "Result type of binary_op must match scan accumulation type."); + T result; + for (int s = 0; s < x.get_size(); ++s) { + result[s] = inclusive_scan_over_group(g, x[s], binary_op, init[s]); + } + return result; +} + +// ---- joint_inclusive_scan +template +detail::enable_if_t< + (is_group_v> && detail::is_pointer::value && + detail::is_pointer::value && + detail::is_arithmetic< + typename detail::remove_pointer::type>::value && + detail::is_arithmetic::value && + detail::is_native_op::type, + BinaryOperation>::value && + 
detail::is_native_op::value), + OutPtr> +joint_inclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, + BinaryOperation binary_op, T init) { + // FIXME: Do not special-case for half precision + static_assert( + std::is_same::value || + (std::is_same::value && + std::is_same::value), + "Result type of binary_op must match scan accumulation type."); +#ifdef __SYCL_DEVICE_ONLY__ + ptrdiff_t offset = sycl::detail::get_local_linear_id(g); + ptrdiff_t stride = sycl::detail::get_local_linear_range(g); + ptrdiff_t N = last - first; + auto roundup = [=](const ptrdiff_t &v, + const ptrdiff_t &divisor) -> ptrdiff_t { + return ((v + divisor - 1) / divisor) * divisor; + }; + typename InPtr::element_type x; + typename OutPtr::element_type carry = init; + for (ptrdiff_t chunk = 0; chunk < roundup(N, stride); chunk += stride) { + ptrdiff_t i = chunk + offset; + if (i < N) { + x = first[i]; + } + typename OutPtr::element_type out = + inclusive_scan_over_group(g, x, binary_op, carry); + if (i < N) { + result[i] = out; + } + carry = group_broadcast(g, out, stride - 1); + } + return result + N; +#else + (void)g; + (void)last; + (void)result; + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +template +detail::enable_if_t< + (is_group_v> && detail::is_pointer::value && + detail::is_pointer::value && + detail::is_arithmetic< + typename detail::remove_pointer::type>::value && + detail::is_native_op::type, + BinaryOperation>::value), + OutPtr> +joint_inclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, + BinaryOperation binary_op) { + // FIXME: Do not special-case for half precision + static_assert( + std::is_same::value || + (std::is_same::value && + std::is_same::value), + "Result type of binary_op must match scan accumulation type."); + return joint_inclusive_scan( + g, first, last, result, binary_op, + sycl::known_identity_v); +} + +} // namespace sycl +} // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/include/CL/sycl/known_identity.hpp b/sycl/include/CL/sycl/known_identity.hpp new file mode 100644 index 0000000000000..99ce8c4e5543a --- /dev/null +++ b/sycl/include/CL/sycl/known_identity.hpp @@ -0,0 +1,218 @@ +//==----------- known_identity.hpp -----------------------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +__SYCL_INLINE_NAMESPACE(cl) { +namespace sycl { +namespace detail { + +using cl::sycl::detail::is_sgeninteger; + +template +using IsReduPlus = + bool_constant>::value || + std::is_same>::value>; + +template +using IsReduMultiplies = + bool_constant>::value || + std::is_same>::value>; + +template +using IsReduMinimum = + bool_constant>::value || + std::is_same>::value>; + +template +using IsReduMaximum = + bool_constant>::value || + std::is_same>::value>; + +template +using IsReduBitOR = + bool_constant>::value || + std::is_same>::value>; + +template +using IsReduBitXOR = + bool_constant>::value || + std::is_same>::value>; + +template +using IsReduBitAND = + bool_constant>::value || + std::is_same>::value>; + +template +using IsReduOptForFastAtomicFetch = + bool_constant::value && + sycl::detail::IsValidAtomicType::value && + (IsReduPlus::value || + IsReduMinimum::value || + IsReduMaximum::value || + IsReduBitOR::value || + IsReduBitXOR::value || + IsReduBitAND::value)>; + +template +using IsReduOptForFastReduce = + bool_constant<((is_sgeninteger::value && + (sizeof(T) == 4 || sizeof(T) == 8)) || + is_sgenfloat::value) && + (IsReduPlus::value || + IsReduMinimum::value || + IsReduMaximum::value)>; + +// Identity = 0 +template +using IsZeroIdentityOp = bool_constant< + (is_sgeninteger::value && (IsReduPlus::value || + IsReduBitOR::value || + IsReduBitXOR::value)) || + (is_sgenfloat::value && IsReduPlus::value)>; + +// Identity = 1 +template +using IsOneIdentityOp = + bool_constant<(is_sgeninteger::value || is_sgenfloat::value) && + IsReduMultiplies::value>; + +// Identity = ~0 +template +using IsOnesIdentityOp = bool_constant::value && + IsReduBitAND::value>; + +// Identity = +template +using IsMinimumIdentityOp = + bool_constant<(is_sgeninteger::value || is_sgenfloat::value) && + IsReduMinimum::value>; + +// Identity = +template +using IsMaximumIdentityOp = + bool_constant<(is_sgeninteger::value || is_sgenfloat::value) && + IsReduMaximum::value>; + +template +using IsKnownIdentityOp = + bool_constant::value || + IsOneIdentityOp::value || + IsOnesIdentityOp::value || + IsMinimumIdentityOp::value || + IsMaximumIdentityOp::value>; + +template +struct has_known_identity_impl + : std::integral_constant< + bool, IsKnownIdentityOp::value> {}; + +template +struct known_identity_impl {}; + +/// Returns zero as identity for ADD, OR, XOR operations. +template +struct known_identity_impl::value>::type> { + static constexpr AccumulatorT value = 0; +}; + +template +struct known_identity_impl::value>::type> { + static constexpr half value = +#ifdef __SYCL_DEVICE_ONLY__ + 0; +#else + cl::sycl::detail::host_half_impl::half(static_cast(0)); +#endif +}; + +/// Returns one as identify for MULTIPLY operations. +template +struct known_identity_impl::value>::type> { + static constexpr AccumulatorT value = 1; +}; + +template +struct known_identity_impl::value>::type> { + static constexpr half value = +#ifdef __SYCL_DEVICE_ONLY__ + 1; +#else + cl::sycl::detail::host_half_impl::half(static_cast(0x3C00)); +#endif +}; + +/// Returns bit image consisting of all ones as identity for AND operations. +template +struct known_identity_impl::value>::type> { + static constexpr AccumulatorT value = ~static_cast(0); +}; + +/// Returns maximal possible value as identity for MIN operations. 
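+/// For example, for float this is +infinity and for int it is
+/// std::numeric_limits<int>::max(); combining either value with any x under
+/// minimum() yields x again, so it is a safe seed for MIN reductions and scans.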
+template +struct known_identity_impl::value>::type> { + static constexpr AccumulatorT value = + std::numeric_limits::has_infinity + ? std::numeric_limits::infinity() + : (std::numeric_limits::max)(); +}; + +/// Returns minimal possible value as identity for MAX operations. +template +struct known_identity_impl::value>::type> { + static constexpr AccumulatorT value = + std::numeric_limits::has_infinity + ? static_cast( + -std::numeric_limits::infinity()) + : std::numeric_limits::lowest(); +}; + +} // namespace detail + +// ---- has_known_identity +template +struct has_known_identity : detail::has_known_identity_impl< + typename std::decay::type, + typename std::decay::type> {}; + +template +__SYCL_INLINE_CONSTEXPR bool has_known_identity_v = + sycl::has_known_identity::value; + +// ---- known_identity +template +struct known_identity + : detail::known_identity_impl::type, + typename std::decay::type> {}; + +template +__SYCL_INLINE_CONSTEXPR AccumulatorT known_identity_v = + sycl::known_identity::value; + +} // namespace sycl +} // __SYCL_INLINE_NAMESPACE(cl) \ No newline at end of file diff --git a/sycl/include/CL/sycl/reduction.hpp b/sycl/include/CL/sycl/reduction.hpp index da8acd9460976..34f592b52dee9 100644 --- a/sycl/include/CL/sycl/reduction.hpp +++ b/sycl/include/CL/sycl/reduction.hpp @@ -8,34 +8,13 @@ #pragma once +#include + #include "CL/sycl/ONEAPI/reduction.hpp" __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { -// Currently, the type traits defined below correspond to SYCL 1.2.1 ONEAPI -// reduction extension. That may be changed later when SYCL 2020 reductions -// are implemented. -template -struct has_known_identity - : ONEAPI::has_known_identity {}; - -#if __cplusplus >= 201703L -template -inline constexpr bool has_known_identity_v = - has_known_identity::value; -#endif - -template -struct known_identity : ONEAPI::known_identity { -}; - -#if __cplusplus >= 201703L -template -inline constexpr AccumulatorT known_identity_v = - known_identity::value; -#endif - /// Constructs a reduction object using the given buffer \p Var, handler \p CGH, /// reduction operation \p Combiner, and optional reduction properties. template diff --git a/sycl/test/on-device/group_algorithms_sycl2020/all_of.cpp b/sycl/test/on-device/group_algorithms_sycl2020/all_of.cpp new file mode 100644 index 0000000000000..9d76fa2aaa48d --- /dev/null +++ b/sycl/test/on-device/group_algorithms_sycl2020/all_of.cpp @@ -0,0 +1,63 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -I . 
-o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include "support.h" +#include +#include +#include +#include +using namespace sycl; + +template class all_of_kernel; + +struct IsEven { + bool operator()(int i) const { return (i % 2) == 0; } +}; + +template +void test(queue q, InputContainer input, OutputContainer output, + Predicate pred) { + typedef class all_of_kernel kernel_name; + size_t N = input.size(); + size_t G = 64; + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + accessor in{in_buf, cgh, sycl::read_only}; + accessor out{out_buf, cgh, sycl::write_only, sycl::no_init}; + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + int lid = it.get_local_id(0); + out[0] = all_of_group(g, pred(in[lid])); + out[1] = all_of_group(g, in[lid], pred); + out[2] = joint_all_of(g, in.get_pointer(), in.get_pointer() + N, pred); + }); + }); + } + bool expected = std::all_of(input.begin(), input.end(), pred); + assert(output[0] == expected); + assert(output[1] == expected); + assert(output[2] == expected); +} + +int main() { + queue q; + if (!isSupportedDevice(q.get_device())) { + std::cout << "Skipping test\n"; + return 0; + } + + constexpr int N = 128; + std::array input; + std::array output; + std::iota(input.begin(), input.end(), 0); + std::fill(output.begin(), output.end(), false); + + test(q, input, output, IsEven()); + + std::cout << "Test passed." << std::endl; +} diff --git a/sycl/test/on-device/group_algorithms_sycl2020/any_of.cpp b/sycl/test/on-device/group_algorithms_sycl2020/any_of.cpp new file mode 100644 index 0000000000000..5dc5922f4391f --- /dev/null +++ b/sycl/test/on-device/group_algorithms_sycl2020/any_of.cpp @@ -0,0 +1,73 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -I . 
-o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include "support.h" +#include +#include +#include +#include +using namespace sycl; + +template class any_of_kernel; + +struct GeZero { + bool operator()(int i) const { return i >= 0; } +}; +struct IsEven { + bool operator()(int i) const { return (i % 2) == 0; } +}; +struct LtZero { + bool operator()(int i) const { return i < 0; } +}; + +template +void test(queue q, InputContainer input, OutputContainer output, + Predicate pred) { + typedef typename InputContainer::value_type InputT; + typedef typename OutputContainer::value_type OutputT; + typedef class any_of_kernel kernel_name; + size_t N = input.size(); + size_t G = 64; + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + accessor in{in_buf, cgh, sycl::read_only}; + accessor out{out_buf, cgh, sycl::write_only, sycl::no_init}; + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + int lid = it.get_local_id(0); + out[0] = any_of_group(g, pred(in[lid])); + out[1] = any_of_group(g, in[lid], pred); + out[2] = joint_any_of(g, in.get_pointer(), in.get_pointer() + N, pred); + }); + }); + } + bool expected = std::any_of(input.begin(), input.end(), pred); + assert(output[0] == expected); + assert(output[1] == expected); + assert(output[2] == expected); +} + +int main() { + queue q; + if (!isSupportedDevice(q.get_device())) { + std::cout << "Skipping test\n"; + return 0; + } + + constexpr int N = 128; + std::array input; + std::array output; + std::iota(input.begin(), input.end(), 0); + std::fill(output.begin(), output.end(), false); + + test(q, input, output, GeZero()); + test(q, input, output, IsEven()); + test(q, input, output, LtZero()); + + std::cout << "Test passed." << std::endl; +} diff --git a/sycl/test/on-device/group_algorithms_sycl2020/exclusive_scan.cpp b/sycl/test/on-device/group_algorithms_sycl2020/exclusive_scan.cpp new file mode 100644 index 0000000000000..7932cb9ccd138 --- /dev/null +++ b/sycl/test/on-device/group_algorithms_sycl2020/exclusive_scan.cpp @@ -0,0 +1,162 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -I . -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// TODO: enable compile+runtime checks for operations defined in SPIR-V 1.3. +// That requires either adding a switch to clang (-spirv-max-version=1.3) or +// raising the spirv version from 1.1. to 1.3 for spirv translator +// unconditionally. Using operators specific for spirv 1.3 and higher with +// -spirv-max-version=1.1 being set by default causes assert/check fails +// in spirv translator. +// RUNx: %clangxx -fsycl -fsycl-targets=%sycl_triple -DSPIRV_1_3 %s -I . 
-o \ + %t13.out + +#include "support.h" +#include +#include +#include +#include +#include +#include +using namespace sycl; + +template +class exclusive_scan_kernel; + +// std::exclusive_scan isn't implemented yet, so use serial implementation +// instead +namespace emu { +template +OutputIterator exclusive_scan(InputIterator first, InputIterator last, + OutputIterator result, T init, + BinaryOperation binary_op) { + T partial = init; + for (InputIterator it = first; it != last; ++it) { + *(result++) = partial; + partial = binary_op(partial, *it); + } + return result; +} +} // namespace emu + +template +void test(queue q, InputContainer input, OutputContainer output, + BinaryOperation binary_op, + typename OutputContainer::value_type identity) { + typedef typename InputContainer::value_type InputT; + typedef typename OutputContainer::value_type OutputT; + typedef class exclusive_scan_kernel kernel_name0; + typedef class exclusive_scan_kernel kernel_name1; + typedef class exclusive_scan_kernel kernel_name2; + typedef class exclusive_scan_kernel kernel_name3; + OutputT init = 42; + size_t N = input.size(); + size_t G = 64; + std::vector expected(N); + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + accessor in{in_buf, cgh, sycl::read_only}; + accessor out{out_buf, cgh, sycl::write_only, sycl::no_init}; + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + int lid = it.get_local_id(0); + out[lid] = exclusive_scan_over_group(g, in[lid], binary_op); + }); + }); + } + emu::exclusive_scan(input.begin(), input.begin() + G, expected.begin(), + identity, binary_op); + assert(std::equal(output.begin(), output.begin() + G, expected.begin())); + + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + accessor in{in_buf, cgh, sycl::read_only}; + accessor out{out_buf, cgh, sycl::write_only, sycl::no_init}; + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + int lid = it.get_local_id(0); + out[lid] = exclusive_scan_over_group(g, in[lid], init, binary_op); + }); + }); + } + emu::exclusive_scan(input.begin(), input.begin() + G, expected.begin(), init, + binary_op); + assert(std::equal(output.begin(), output.begin() + G, expected.begin())); + + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + accessor in{in_buf, cgh, sycl::read_only}; + accessor out{out_buf, cgh, sycl::write_only, sycl::no_init}; + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + joint_exclusive_scan(g, in.get_pointer(), in.get_pointer() + N, + out.get_pointer(), binary_op); + }); + }); + } + emu::exclusive_scan(input.begin(), input.begin() + N, expected.begin(), + identity, binary_op); + assert(std::equal(output.begin(), output.begin() + N, expected.begin())); + + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + accessor in{in_buf, cgh, sycl::read_only}; + accessor out{out_buf, cgh, sycl::write_only, sycl::no_init}; + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + joint_exclusive_scan(g, in.get_pointer(), in.get_pointer() + N, + out.get_pointer(), init, binary_op); + }); + }); + } + emu::exclusive_scan(input.begin(), input.begin() + N, expected.begin(), init, + binary_op); + 
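+  // The device-side block above used joint_exclusive_scan with an explicit
+  // init value, so the serial reference is seeded with the same init rather
+  // than with the operation's identity.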
assert(std::equal(output.begin(), output.begin() + N, expected.begin())); +} + +int main() { + queue q; + if (!isSupportedDevice(q.get_device())) { + std::cout << "Skipping test\n"; + return 0; + } + + constexpr int N = 128; + std::array input; + std::array output; + std::iota(input.begin(), input.end(), 0); + std::fill(output.begin(), output.end(), 0); + + test(q, input, output, std::plus<>(), 0); + test(q, input, output, sycl::ONEAPI::minimum<>(), + std::numeric_limits::max()); + test(q, input, output, sycl::ONEAPI::maximum<>(), + std::numeric_limits::lowest()); + + test(q, input, output, std::plus(), 0); + test(q, input, output, sycl::ONEAPI::minimum(), + std::numeric_limits::max()); + test(q, input, output, sycl::ONEAPI::maximum(), + std::numeric_limits::lowest()); + +#ifdef SPIRV_1_3 + test(q, input, output, multiplies(), + 1); + test(q, input, output, bit_or(), 0); + test(q, input, output, bit_xor(), 0); + test(q, input, output, bit_and(), ~0); +#endif // SPIRV_1_3 + + std::cout << "Test passed." << std::endl; +} diff --git a/sycl/test/on-device/group_algorithms_sycl2020/group_broadcast.cpp b/sycl/test/on-device/group_algorithms_sycl2020/group_broadcast.cpp new file mode 100644 index 0000000000000..2fadb3445626f --- /dev/null +++ b/sycl/test/on-device/group_algorithms_sycl2020/group_broadcast.cpp @@ -0,0 +1,98 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include "support.h" +#include +#include +#include +#include +#include +using namespace sycl; + +template +void test(queue q, InputContainer input, OutputContainer output) { + typedef typename InputContainer::value_type InputT; + typedef typename OutputContainer::value_type OutputT; + size_t N = input.size(); + size_t G = 4; + range<2> R(G, G); + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + accessor in{in_buf, cgh, sycl::read_only}; + accessor out{out_buf, cgh, sycl::write_only, sycl::no_init}; + cgh.parallel_for(nd_range<2>(R, R), [=](nd_item<2> it) { + group<2> g = it.get_group(); + int lid = it.get_local_linear_id(); + out[0] = group_broadcast(g, in[lid]); + out[1] = group_broadcast(g, in[lid], group<2>::id_type(1, 2)); + out[2] = + group_broadcast(g, in[lid], group<2>::linear_id_type(2 * G + 1)); + }); + }); + } + assert(output[0] == input[0]); + assert(output[1] == input[1 * G + 2]); + assert(output[2] == input[2 * G + 1]); +} + +int main() { + queue q; + if (!isSupportedDevice(q.get_device())) { + std::cout << "Skipping test\n"; + return 0; + } + + constexpr int N = 16; + + // Test built-in scalar type + { + std::array input; + std::array output; + std::iota(input.begin(), input.end(), 1); + std::fill(output.begin(), output.end(), false); + test(q, input, output); + } + + // Test pointer type + // { + // std::array input; + // std::array output; + // for (int i = 0; i < N; ++i) { + // input[i] = static_cast(0x0) + i; + // } + // std::fill(output.begin(), output.end(), static_cast(0x0)); + // test(q, input, output); + // } + + // Test user-defined type + // - Use complex as a proxy for this + // - Test float and double to test 64-bit and 128-bit types + // { + // std::array, N> input; + // std::array, 3> output; + // for (int i = 0; i < N; ++i) { + // input[i] = + // std::complex(0, 1) + (float)i * std::complex(2, 2); + // } + // std::fill(output.begin(), output.end(), std::complex(0, 0)); + // test(q, input, 
output); + // } + // { + // std::array, N> input; + // std::array, 3> output; + // for (int i = 0; i < N; ++i) { + // input[i] = + // std::complex(0, 1) + (double)i * std::complex(2, + // 2); + // } + // std::fill(output.begin(), output.end(), std::complex(0, 0)); + // test(q, input, output); + // } + std::cout << "Test passed." << std::endl; +} diff --git a/sycl/test/on-device/group_algorithms_sycl2020/helpers.hpp b/sycl/test/on-device/group_algorithms_sycl2020/helpers.hpp new file mode 100644 index 0000000000000..e4b74966a6c5a --- /dev/null +++ b/sycl/test/on-device/group_algorithms_sycl2020/helpers.hpp @@ -0,0 +1,168 @@ +//==---------- helpers.hpp -*- C++ -*--------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include +#include +#include +#include +#include + +using namespace cl::sycl; + +// ---- utils +template struct utils { + static T1 add_vec(const vec &v); + static bool cmp_vec(const vec &v, const vec &r); + static std::string stringify_vec(const vec &v); +}; +template struct utils { + static T2 add_vec(const vec &v) { return v.s0(); } + static bool cmp_vec(const vec &v, const vec &r) { + return v.s0() == r.s0(); + } + static std::string stringify_vec(const vec &v) { + return std::to_string((T2)v.s0()); + } +}; +template struct utils { + static T2 add_vec(const vec &v) { return v.s0() + v.s1(); } + static bool cmp_vec(const vec &v, const vec &r) { + return v.s0() == r.s0() && v.s1() == r.s1(); + } + static std::string stringify_vec(const vec &v) { + return std::string("(") + std::to_string((T2)v.s0()) + ", " + + std::to_string((T2)v.s1()) + " )"; + } +}; +template struct utils { + static T2 add_vec(const vec &v) { return v.s0() + v.s1() + v.s2(); } + static bool cmp_vec(const vec &v, const vec &r) { + return v.s0() == r.s0() && v.s1() == r.s1() && v.s2() == r.s2(); + } + static std::string stringify_vec(const vec &v) { + return std::string("(") + std::to_string((T2)v.s0()) + ", " + + std::to_string((T2)v.s1()) + ", " + std::to_string((T2)v.s3()) + + " )"; + } +}; +template struct utils { + static T2 add_vec(const vec &v) { + return v.s0() + v.s1() + v.s2() + v.s3(); + } + static bool cmp_vec(const vec &v, const vec &r) { + return v.s0() == r.s0() && v.s1() == r.s1() && v.s2() == r.s2() && + v.s3() == r.s3(); + } + static std::string stringify_vec(const vec &v) { + return std::string("(") + std::to_string((T2)v.s0()) + ", " + + std::to_string((T2)v.s1()) + std::to_string((T2)v.s2()) + ", " + + std::to_string((T2)v.s3()) + " )"; + } +}; +template struct utils { + static T2 add_vec(const vec &v) { + return v.s0() + v.s1() + v.s2() + v.s3() + v.s4() + v.s5() + v.s6() + + v.s7(); + } + static bool cmp_vec(const vec &v, const vec &r) { + return v.s0() == r.s0() && v.s1() == r.s1() && v.s2() == r.s2() && + v.s3() == r.s3() && v.s4() == r.s4() && v.s5() == r.s5() && + v.s6() == r.s6() && v.s7() == r.s7(); + } + static std::string stringify_vec(const vec &v) { + return std::string("(") + std::to_string((T2)v.s0()) + ", " + + std::to_string((T2)v.s1()) + std::to_string((T2)v.s2()) + ", " + + std::to_string((T2)v.s3()) + std::to_string((T2)v.s4()) + ", " + + std::to_string((T2)v.s5()) + std::to_string((T2)v.s6()) + ", " + + std::to_string((T2)v.s7()) + " )"; + } +}; + +template struct utils { + static T2 
add_vec(const vec &v) { + return v.s0() + v.s1() + v.s2() + v.s3() + v.s4() + v.s5() + v.s6() + + v.s7() + v.s8() + v.s9() + v.sA() + v.sB() + v.sC() + v.sD() + + v.sE() + v.sF(); + } + static bool cmp_vec(const vec &v, const vec &r) { + return v.s0() == r.s0() && v.s1() == r.s1() && v.s2() == r.s2() && + v.s3() == r.s3() && v.s4() == r.s4() && v.s5() == r.s5() && + v.s6() == r.s6() && v.s7() == r.s7() && v.s8() == r.s8() && + v.s9() == r.s9() && v.sA() == r.sA() && v.sB() == r.sB() && + v.sC() == r.sC() && v.sD() == r.sD() && v.sE() == r.sE() && + v.sF() == r.sF(); + } + static std::string stringify_vec(const vec &v) { + return std::string("(") + std::to_string((T2)v.s0()) + ", " + + std::to_string((T2)v.s1()) + std::to_string((T2)v.s2()) + ", " + + std::to_string((T2)v.s3()) + std::to_string((T2)v.s4()) + ", " + + std::to_string((T2)v.s5()) + std::to_string((T2)v.s6()) + ", " + + std::to_string((T2)v.s7()) + std::to_string((T2)v.s8()) + ", " + + std::to_string((T2)v.s9()) + std::to_string((T2)v.sA()) + ", " + + std::to_string((T2)v.sB()) + std::to_string((T2)v.sC()) + ", " + + std::to_string((T2)v.sE()) + std::to_string((T2)v.sD()) + ", " + + std::to_string((T2)v.sF()) + " )"; + } +}; + +// ---- exit_if_not_equal +template void exit_if_not_equal(T val, T ref, const char *name) { + if (std::is_floating_point::value) { + auto cmp_val = std::bitset(val); + auto cmp_ref = std::bitset(ref); + if (cmp_val != cmp_ref) { + std::cout << "Unexpected result for " << name << ": " << val << "(" + << cmp_val << ") expected value: " << ref << "(" << cmp_ref + << ")" << std::endl; + exit(1); + } + } else { + if ((val - ref) != 0) { + std::cout << "Unexpected result for " << name << ": " << (long)val + << " expected value: " << (long)ref << std::endl; + exit(1); + } + } +} + +// template +// void exit_if_not_equal(std::complex val, std::complex ref, +// const char *name) { +// std::string Name{name}; +// exit_if_not_equal(val.real(), ref.real(), (Name + ".real()").c_str()); +// exit_if_not_equal(val.imag(), ref.imag(), (Name + ".imag()").c_str()); +// } + +template void exit_if_not_equal(T *val, T *ref, const char *name) { + if ((val - ref) != 0) { + std::cout << "Unexpected result for " << name << ": " << val + << " expected value: " << ref << std::endl; + exit(1); + } +} + +template <> void exit_if_not_equal(half val, half ref, const char *name) { + int16_t cmp_val = reinterpret_cast(val); + int16_t cmp_ref = reinterpret_cast(ref); + if (std::abs(cmp_val - cmp_ref) > 1) { + std::cout << "Unexpected result for " << name << ": " << (float)val + << " expected value: " << (float)ref << std::endl; + exit(1); + } +} + +template +void exit_if_not_equal_vec(vec val, vec ref, const char *name) { + if (!utils::cmp_vec(ref, val)) { + std::cout << "Unexpected result for " << name << ": " + << utils::stringify_vec(val) + << " expected value: " << utils::stringify_vec(ref) + << std::endl; + + exit(1); + } +} \ No newline at end of file diff --git a/sycl/test/on-device/group_algorithms_sycl2020/inclusive_scan.cpp b/sycl/test/on-device/group_algorithms_sycl2020/inclusive_scan.cpp new file mode 100644 index 0000000000000..d084574b58c6c --- /dev/null +++ b/sycl/test/on-device/group_algorithms_sycl2020/inclusive_scan.cpp @@ -0,0 +1,163 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -I . -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// TODO: enable compile+runtime checks for operations defined in SPIR-V 1.3. 
+// That requires either adding a switch to clang (-spirv-max-version=1.3) or +// raising the spirv version from 1.1. to 1.3 for spirv translator +// unconditionally. Using operators specific for spirv 1.3 and higher with +// -spirv-max-version=1.1 being set by default causes assert/check fails +// in spirv translator. +// RUNx: %clangxx -fsycl -fsycl-targets=%sycl_triple -DSPIRV_1_3 %s -I . -o \ + %t13.out + +#include "support.h" +#include +#include +#include +#include +#include +#include +using namespace sycl; + +template +class inclusive_scan_kernel; + +// std::inclusive_scan isn't implemented yet, so use serial implementation +// instead +namespace emu { +template +OutputIterator inclusive_scan(InputIterator first, InputIterator last, + OutputIterator result, BinaryOperation binary_op, + T init) { + T partial = init; + for (InputIterator it = first; it != last; ++it) { + partial = binary_op(partial, *it); + *(result++) = partial; + } + return result; +} +} // namespace emu + +template +void test(queue q, InputContainer input, OutputContainer output, + BinaryOperation binary_op, + typename OutputContainer::value_type identity) { + typedef typename InputContainer::value_type InputT; + typedef typename OutputContainer::value_type OutputT; + typedef class inclusive_scan_kernel kernel_name0; + typedef class inclusive_scan_kernel kernel_name1; + typedef class inclusive_scan_kernel kernel_name2; + typedef class inclusive_scan_kernel kernel_name3; + OutputT init = 42; + size_t N = input.size(); + size_t G = 64; + std::vector expected(N); + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + accessor in{in_buf, cgh, sycl::read_only}; + accessor out{out_buf, cgh, sycl::write_only, sycl::no_init}; + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + int lid = it.get_local_id(0); + out[lid] = inclusive_scan_over_group(g, in[lid], binary_op); + }); + }); + } + emu::inclusive_scan(input.begin(), input.begin() + G, expected.begin(), + binary_op, identity); + assert(std::equal(output.begin(), output.begin() + G, expected.begin())); + + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + accessor in{in_buf, cgh, sycl::read_only}; + accessor out{out_buf, cgh, sycl::write_only, sycl::no_init}; + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + int lid = it.get_local_id(0); + out[lid] = inclusive_scan_over_group(g, in[lid], binary_op, init); + }); + }); + } + emu::inclusive_scan(input.begin(), input.begin() + G, expected.begin(), + binary_op, init); + assert(std::equal(output.begin(), output.begin() + G, expected.begin())); + + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + accessor in{in_buf, cgh, sycl::read_only}; + accessor out{out_buf, cgh, sycl::write_only, sycl::no_init}; + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + joint_inclusive_scan(g, in.get_pointer(), in.get_pointer() + N, + out.get_pointer(), binary_op); + }); + }); + } + emu::inclusive_scan(input.begin(), input.begin() + N, expected.begin(), + binary_op, identity); + assert(std::equal(output.begin(), output.begin() + N, expected.begin())); + + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + accessor in{in_buf, 
cgh, sycl::read_only}; + accessor out{out_buf, cgh, sycl::write_only, sycl::no_init}; + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + joint_inclusive_scan(g, in.get_pointer(), in.get_pointer() + N, + out.get_pointer(), binary_op, init); + }); + }); + } + emu::inclusive_scan(input.begin(), input.begin() + N, expected.begin(), + binary_op, init); + assert(std::equal(output.begin(), output.begin() + N, expected.begin())); +} + +int main() { + queue q; + if (!isSupportedDevice(q.get_device())) { + std::cout << "Skipping test\n"; + return 0; + } + + constexpr int N = 128; + std::array input; + std::array output; + std::iota(input.begin(), input.end(), 0); + std::fill(output.begin(), output.end(), 0); + + test(q, input, output, std::plus<>(), 0); + test(q, input, output, sycl::ONEAPI::minimum<>(), + std::numeric_limits::max()); + test(q, input, output, sycl::ONEAPI::maximum<>(), + std::numeric_limits::lowest()); + + test(q, input, output, std::plus(), 0); + test(q, input, output, sycl::ONEAPI::minimum(), + std::numeric_limits::max()); + test(q, input, output, sycl::ONEAPI::maximum(), + std::numeric_limits::lowest()); + +#ifdef SPIRV_1_3 + test(q, input, output, + multiplies(), 1); + test(q, input, output, bit_or(), 0); + test(q, input, output, bit_xor(), + 0); + test(q, input, output, bit_and(), ~0); +#endif // SPIRV_1_3 + + std::cout << "Test passed." << std::endl; +} diff --git a/sycl/test/on-device/group_algorithms_sycl2020/none_of.cpp b/sycl/test/on-device/group_algorithms_sycl2020/none_of.cpp new file mode 100644 index 0000000000000..010641bc83560 --- /dev/null +++ b/sycl/test/on-device/group_algorithms_sycl2020/none_of.cpp @@ -0,0 +1,71 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -I . -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include "support.h" +#include +#include +#include +#include +using namespace sycl; + +template class none_of_kernel; + +struct GeZero { + bool operator()(int i) const { return i >= 0; } +}; +struct IsEven { + bool operator()(int i) const { return (i % 2) == 0; } +}; +struct LtZero { + bool operator()(int i) const { return i < 0; } +}; + +template +void test(queue q, InputContainer input, OutputContainer output, + Predicate pred) { + typedef class none_of_kernel kernel_name; + size_t N = input.size(); + size_t G = 64; + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + accessor in{in_buf, cgh, sycl::read_only}; + accessor out{out_buf, cgh, sycl::write_only, sycl::no_init}; + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + int lid = it.get_local_id(0); + out[0] = none_of_group(g, pred(in[lid])); + out[1] = none_of_group(g, in[lid], pred); + out[2] = joint_none_of(g, in.get_pointer(), in.get_pointer() + N, pred); + }); + }); + } + bool expected = std::none_of(input.begin(), input.end(), pred); + assert(output[0] == expected); + assert(output[1] == expected); + assert(output[2] == expected); +} + +int main() { + queue q; + if (!isSupportedDevice(q.get_device())) { + std::cout << "Skipping test\n"; + return 0; + } + + constexpr int N = 128; + std::array input; + std::array output; + std::iota(input.begin(), input.end(), 0); + std::fill(output.begin(), output.end(), false); + + test(q, input, output, GeZero()); + test(q, input, output, IsEven()); + test(q, input, output, LtZero()); + + std::cout << "Test passed." 
<< std::endl; +} diff --git a/sycl/test/on-device/group_algorithms_sycl2020/permute_select.cpp b/sycl/test/on-device/group_algorithms_sycl2020/permute_select.cpp new file mode 100644 index 0000000000000..b0fbb35982573 --- /dev/null +++ b/sycl/test/on-device/group_algorithms_sycl2020/permute_select.cpp @@ -0,0 +1,39 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// +//==------------ permute_select.cpp -*- C++ -*-----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "permute_select.hpp" + +int main() { + queue Queue; + if (Queue.get_device().is_host()) { + std::cout << "Skipping test\n"; + return 0; + } + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + std::cout << "Test passed." << std::endl; + return 0; +} diff --git a/sycl/test/on-device/group_algorithms_sycl2020/permute_select.hpp b/sycl/test/on-device/group_algorithms_sycl2020/permute_select.hpp new file mode 100644 index 0000000000000..67b2a05c7020e --- /dev/null +++ b/sycl/test/on-device/group_algorithms_sycl2020/permute_select.hpp @@ -0,0 +1,140 @@ +//==----- permute_select.hpp -*- C++ -*------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "helpers.hpp" +#include +template class sycl_subgr; + +using namespace cl::sycl; + +// TODO remove this workaround when clang will support correct generation of +// half typename in integration header +struct wa_half; + +template +void check(queue &Queue, size_t G = 256, size_t L = 64) { + try { + nd_range<1> NdRange(G, L); + buffer> buf_select(G); + buffer> buf_xor(G); + buffer sgsizebuf(1); + Queue.submit([&](handler &cgh) { + accessor acc_select{buf_select, cgh, sycl::read_write}; + accessor acc_xor{buf_xor, cgh, sycl::read_write}; + accessor sgsizeacc{sgsizebuf, cgh, sycl::read_write}; + cgh.parallel_for>(NdRange, [=](nd_item<1> NdItem) { + ONEAPI::sub_group SG = NdItem.get_sub_group(); + uint32_t wggid = NdItem.get_global_id(0); + uint32_t sgid = SG.get_group_id().get(0); + vec vwggid(wggid), vsgid(sgid); + if (wggid == 0) + sgsizeacc[0] = SG.get_max_local_range()[0]; + + /*GID of middle element in every subgroup*/ + acc_select[NdItem.get_global_id()] = + select_from_group(SG, vwggid, SG.get_max_local_range()[0] / 2); + /* Save GID with SGLID = ( SGLID XOR SGID ) % SGMaxSize */ + acc_xor[NdItem.get_global_id()] = permute_group_by_xor( + SG, vwggid, sgid % SG.get_max_local_range()[0]); + }); + }); + host_accessor acc_select{buf_select, sycl::read_write}; + host_accessor acc_xor{buf_xor, sycl::read_write}; + host_accessor sgsizeacc{sgsizebuf, sycl::read_write}; + + size_t sg_size = sgsizeacc[0]; + int SGid = 0; + int SGLid = 0; + int SGBeginGid = 0; + for (int j = 0; j < G; j++) { + if (j % L % sg_size == 0) { + SGid++; + SGLid = 0; + SGBeginGid = j; + } + if (j % L == 0) { + SGid = 0; + SGLid = 0; + SGBeginGid = j; + } + /*GID of middle element in every subgroup*/ + exit_if_not_equal_vec( + acc_select[j], vec(j / L * L + SGid * sg_size + sg_size / 2), + "select_from_group"); + /* Value GID with SGLID = ( SGLID XOR SGID ) % SGMaxSize */ + exit_if_not_equal_vec(acc_xor[j], + vec(SGBeginGid + (SGLid ^ (SGid % sg_size))), + "permute_group_by_xor"); + SGLid++; + } + } catch (exception e) { + std::cout << "SYCL exception caught: " << e.what(); + exit(1); + } +} + +template void check(queue &Queue, size_t G = 256, size_t L = 64) { + try { + nd_range<1> NdRange(G, L); + buffer buf_select(G); + buffer buf_xor(G); + buffer sgsizebuf(1); + Queue.submit([&](handler &cgh) { + accessor acc_select{buf_select, cgh, sycl::read_write}; + accessor acc_xor{buf_xor, cgh, sycl::read_write}; + accessor sgsizeacc{sgsizebuf, cgh, sycl::read_write}; + cgh.parallel_for>(NdRange, [=](nd_item<1> NdItem) { + ONEAPI::sub_group SG = NdItem.get_sub_group(); + uint32_t wggid = NdItem.get_global_id(0); + uint32_t sgid = SG.get_group_id().get(0); + if (wggid == 0) + sgsizeacc[0] = SG.get_max_local_range()[0]; + + /*GID of middle element in every subgroup*/ + acc_select[NdItem.get_global_id()] = + select_from_group(SG, wggid, SG.get_max_local_range()[0] / 2); + /* Save GID with SGLID = ( SGLID XOR SGID ) % SGMaxSize */ + acc_xor[NdItem.get_global_id()] = + permute_group_by_xor(SG, wggid, sgid % SG.get_max_local_range()[0]); + }); + }); + host_accessor acc_select{buf_select, sycl::read_write}; + host_accessor acc_xor{buf_xor, sycl::read_write}; + host_accessor sgsizeacc{sgsizebuf, sycl::read_write}; + + size_t sg_size = sgsizeacc[0]; + int SGid = 0; + int SGLid = 0; + int SGBeginGid = 0; + for (int j = 0; j < G; j++) { + if (j % L % sg_size == 0) { + SGid++; + SGLid = 
0; + SGBeginGid = j; + } + if (j % L == 0) { + SGid = 0; + SGLid = 0; + SGBeginGid = j; + } + + /*GID of middle element in every subgroup*/ + exit_if_not_equal(acc_select[j], + j / L * L + SGid * sg_size + sg_size / 2, + "select_from_group"); + + /* Value GID with SGLID = ( SGLID XOR SGID ) % SGMaxSize */ + exit_if_not_equal(acc_xor[j], SGBeginGid + (SGLid ^ (SGid % sg_size)), + "permute_group_by_xor"); + SGLid++; + } + } catch (exception e) { + std::cout << "SYCL exception caught: " << e.what(); + exit(1); + } +} diff --git a/sycl/test/on-device/group_algorithms_sycl2020/reduce.cpp b/sycl/test/on-device/group_algorithms_sycl2020/reduce.cpp new file mode 100644 index 0000000000000..0daf7b4158500 --- /dev/null +++ b/sycl/test/on-device/group_algorithms_sycl2020/reduce.cpp @@ -0,0 +1,98 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -I . -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// TODO: enable compile+runtime checks for operations defined in SPIR-V 1.3. +// That requires either adding a switch to clang (-spirv-max-version=1.3) or +// raising the spirv version from 1.1. to 1.3 for spirv translator +// unconditionally. Using operators specific for spirv 1.3 and higher with +// -spirv-max-version=1.1 being set by default causes assert/check fails +// in spirv translator. +// RUNx: %clangxx -fsycl -fsycl-targets=%sycl_triple -DSPIRV_1_3 %s -I . -o \ + %t13.out + +#include "support.h" +#include +#include +#include +#include +#include +using namespace sycl; + +template +void test(queue q, InputContainer input, OutputContainer output, + BinaryOperation binary_op, + typename OutputContainer::value_type identity) { + typedef typename InputContainer::value_type InputT; + typedef typename OutputContainer::value_type OutputT; + OutputT init = 42; + size_t N = input.size(); + size_t G = 64; + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + accessor in{in_buf, cgh, sycl::read_only}; + accessor out{out_buf, cgh, sycl::write_only, sycl::no_init}; + cgh.parallel_for( + nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + int lid = it.get_local_id(0); + out[0] = reduce_over_group(g, in[lid], binary_op); + out[1] = reduce_over_group(g, in[lid], init, binary_op); + out[2] = joint_reduce(g, in.get_pointer(), in.get_pointer() + N, + binary_op); + out[3] = joint_reduce(g, in.get_pointer(), in.get_pointer() + N, + init, binary_op); + }); + }); + } + // std::reduce is not implemented yet, so use std::accumulate instead + assert(output[0] == std::accumulate(input.begin(), input.begin() + G, + identity, binary_op)); + assert(output[1] == + std::accumulate(input.begin(), input.begin() + G, init, binary_op)); + assert(output[2] == + std::accumulate(input.begin(), input.end(), identity, binary_op)); + assert(output[3] == + std::accumulate(input.begin(), input.end(), init, binary_op)); +} + +int main() { + queue q; + if (!isSupportedDevice(q.get_device())) { + std::cout << "Skipping test\n"; + return 0; + } + + constexpr int N = 128; + std::array input; + std::array output; + std::iota(input.begin(), input.end(), 0); + std::fill(output.begin(), output.end(), 0); + + test(q, input, output, std::plus<>(), 0); + test(q, input, output, sycl::ONEAPI::minimum<>(), + std::numeric_limits::max()); + test(q, input, output, sycl::ONEAPI::maximum<>(), + std::numeric_limits::lowest()); + + test(q, input, output, std::plus(), 0); + test(q, 
input, output, sycl::ONEAPI::minimum(), + std::numeric_limits::max()); + test(q, input, output, sycl::ONEAPI::maximum(), + std::numeric_limits::lowest()); + +#ifdef SPIRV_1_3 + test(q, input, output, + multiplies(), 1); + test(q, input, output, bit_or(), 0); + test(q, input, output, bit_xor(), 0); + test(q, input, output, bit_and(), ~0); +#endif // SPIRV_1_3 + + std::cout << "Test passed." << std::endl; +} diff --git a/sycl/test/on-device/group_algorithms_sycl2020/shift_left_right.cpp b/sycl/test/on-device/group_algorithms_sycl2020/shift_left_right.cpp new file mode 100644 index 0000000000000..e3c97cac0e81d --- /dev/null +++ b/sycl/test/on-device/group_algorithms_sycl2020/shift_left_right.cpp @@ -0,0 +1,39 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// +//==------------ shift_left_right.cpp -*- C++ -*----------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "shift_left_right.hpp" + +int main() { + queue Queue; + if (Queue.get_device().is_host()) { + std::cout << "Skipping test\n"; + return 0; + } + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + std::cout << "Test passed." << std::endl; + return 0; +} diff --git a/sycl/test/on-device/group_algorithms_sycl2020/shift_left_right.hpp b/sycl/test/on-device/group_algorithms_sycl2020/shift_left_right.hpp new file mode 100644 index 0000000000000..12ece40f27c49 --- /dev/null +++ b/sycl/test/on-device/group_algorithms_sycl2020/shift_left_right.hpp @@ -0,0 +1,143 @@ +//==------- shift_left_right.hpp -*- C++ -*---------------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "helpers.hpp" +#include +template class sycl_subgr; + +using namespace cl::sycl; + +// TODO remove this workaround when clang will support correct generation of +// half typename in integration header +struct wa_half; + +// ---- check +template +void check(queue &Queue, size_t G = 256, size_t L = 64) { + try { + nd_range<1> NdRange(G, L); + buffer> buf_right(G); + buffer> buf_left(G); + buffer sgsizebuf(1); + Queue.submit([&](handler &cgh) { + accessor acc_right{buf_right, cgh, sycl::read_write}; + accessor acc_left{buf_left, cgh, sycl::read_write}; + accessor sgsizeacc{sgsizebuf, cgh, sycl::read_write}; + + cgh.parallel_for>(NdRange, [=](nd_item<1> NdItem) { + ONEAPI::sub_group SG = NdItem.get_sub_group(); + uint32_t wggid = NdItem.get_global_id(0); + uint32_t sgid = SG.get_group_id().get(0); + vec vwggid(wggid), vsgid(sgid); + if (wggid == 0) + sgsizeacc[0] = SG.get_max_local_range()[0]; + + /* Save GID-SGID */ + acc_right[NdItem.get_global_id()] = shift_group_right(SG, vwggid, sgid); + /* Save GID+SGID */ + acc_left[NdItem.get_global_id()] = shift_group_left(SG, vwggid, sgid); + }); + }); + host_accessor acc_right{buf_right, sycl::read_write}; + host_accessor acc_left{buf_left, sycl::read_write}; + host_accessor sgsizeacc{sgsizebuf, sycl::read_write}; + + size_t sg_size = sgsizeacc[0]; + int SGid = 0; + int SGLid = 0; + int SGBeginGid = 0; + for (int j = 0; j < G; j++) { + if (j % L % sg_size == 0) { + SGid++; + SGLid = 0; + SGBeginGid = j; + } + if (j % L == 0) { + SGid = 0; + SGLid = 0; + SGBeginGid = j; + } + + /* Value GID+SGID for all element except last SGID in SG*/ + if (j % L % sg_size + SGid < sg_size && j % L + SGid < L) { + exit_if_not_equal_vec(acc_left[j], vec(j + SGid % sg_size), + "shift_group_left"); + } + /* Value GID-SGID for all element except first SGID in SG*/ + if (j % L % sg_size >= SGid) { + exit_if_not_equal_vec(acc_right[j], vec(j - SGid % sg_size), + "shift_group_right"); + } + + SGLid++; + } + } catch (exception e) { + std::cout << "SYCL exception caught: " << e.what(); + exit(1); + } +} + +template void check(queue &Queue, size_t G = 256, size_t L = 64) { + try { + nd_range<1> NdRange(G, L); + buffer buf_right(G); + buffer buf_left(G); + buffer sgsizebuf(1); + Queue.submit([&](handler &cgh) { + accessor acc_right{buf_right, cgh, sycl::read_write}; + accessor acc_left{buf_left, cgh, sycl::read_write}; + accessor sgsizeacc{sgsizebuf, cgh, sycl::read_write}; + cgh.parallel_for>(NdRange, [=](nd_item<1> NdItem) { + ONEAPI::sub_group SG = NdItem.get_sub_group(); + uint32_t wggid = NdItem.get_global_id(0); + uint32_t sgid = SG.get_group_id().get(0); + if (wggid == 0) + sgsizeacc[0] = SG.get_max_local_range()[0]; + + /* Save GID-SGID */ + acc_right[NdItem.get_global_id()] = shift_group_right(SG, wggid, sgid); + /* Save GID+SGID */ + acc_left[NdItem.get_global_id()] = shift_group_left(SG, wggid, sgid); + }); + }); + host_accessor acc_right{buf_right, sycl::read_write}; + host_accessor acc_left{buf_left, sycl::read_write}; + host_accessor sgsizeacc{sgsizebuf, sycl::read_write}; + + size_t sg_size = sgsizeacc[0]; + int SGid = 0; + int SGLid = 0; + int SGBeginGid = 0; + for (int j = 0; j < G; j++) { + if (j % L % sg_size == 0) { + SGid++; + SGLid = 0; + SGBeginGid = j; + } + if (j % L == 0) { + SGid = 0; + SGLid = 0; + SGBeginGid = j; + } + + /* Value GID+SGID for all element except last SGID in SG*/ + if (j % L % 
sg_size + SGid < sg_size && j % L + SGid < L) { + exit_if_not_equal(acc_left[j], j + SGid, "shift_group_left"); + } + /* Value GID-SGID for all elements except the first SGID in SG*/ + if (j % L % sg_size >= SGid) { + exit_if_not_equal(acc_right[j], j - SGid, "shift_group_right"); + } + + SGLid++; + } + } catch (exception e) { + std::cout << "SYCL exception caught: " << e.what(); + exit(1); + } +} diff --git a/sycl/test/on-device/group_algorithms_sycl2020/support.h b/sycl/test/on-device/group_algorithms_sycl2020/support.h new file mode 100644 index 0000000000000..fa5118ea27282 --- /dev/null +++ b/sycl/test/on-device/group_algorithms_sycl2020/support.h @@ -0,0 +1,23 @@ +#include <CL/sycl.hpp> +using namespace sycl; + +bool isSupportedDevice(device D) { + std::string PlatformName = D.get_platform().get_info<info::platform::name>(); + if (PlatformName.find("CUDA") != std::string::npos) + return true; + + if (PlatformName.find("Level-Zero") != std::string::npos) + return true; + + if (PlatformName.find("OpenCL") != std::string::npos) { + std::string Version = D.get_info<info::device::version>(); + size_t Offset = Version.find("OpenCL"); + if (Offset == std::string::npos) + return false; + Version = Version.substr(Offset + 7, 3); + if (Version >= std::string("2.0")) + return true; + } + + return false; +}
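For readers new to these interfaces, here is a minimal standalone sketch (editorial, not part of the patch) of how the free functions exercised above are meant to be called from application code. It mirrors reduce.cpp and exclusive_scan.cpp: a single work-group of size G reduces and scans the elements it owns, while joint_reduce walks the whole buffer cooperatively. The kernel name GroupAlgorithmSketch, the std::plus<>() operation, and the sizes are illustrative choices; the <CL/sycl.hpp> umbrella header is assumed, and the program expects a device for which isSupportedDevice (support.h above) would return true.

#include <CL/sycl.hpp>

#include <cassert>
#include <numeric>
#include <vector>

using namespace sycl;

int main() {
  constexpr size_t N = 128; // total number of input elements
  constexpr size_t G = 64;  // work-group size, as in the tests above
  std::vector<int> input(N);
  std::iota(input.begin(), input.end(), 0);
  std::vector<int> output(3, 0);

  queue q;
  {
    buffer<int> in_buf(input.data(), range<1>(N));
    buffer<int> out_buf(output.data(), range<1>(output.size()));
    q.submit([&](handler &cgh) {
      accessor in{in_buf, cgh, sycl::read_only};
      accessor out{out_buf, cgh, sycl::write_only, sycl::no_init};
      cgh.parallel_for<class GroupAlgorithmSketch>(
          nd_range<1>(G, G), [=](nd_item<1> it) {
            group<1> g = it.get_group();
            int lid = it.get_local_id(0);
            // Sum of the G elements owned by this work-group.
            out[0] = reduce_over_group(g, in[lid], std::plus<>());
            // Sum of all N elements, walked cooperatively by the whole group.
            out[1] = joint_reduce(g, in.get_pointer(), in.get_pointer() + N,
                                  std::plus<>());
            // Exclusive prefix sum: work-item lid sees the sum of in[0..lid).
            int prefix = exclusive_scan_over_group(g, in[lid], std::plus<>());
            if (lid == static_cast<int>(G) - 1)
              out[2] = prefix; // sum of the first G-1 elements
          });
    });
  } // buffers are destroyed here, copying the results back into `output`

  assert(output[0] == std::accumulate(input.begin(), input.begin() + G, 0));
  assert(output[1] == std::accumulate(input.begin(), input.end(), 0));
  assert(output[2] == std::accumulate(input.begin(), input.begin() + G - 1, 0));
  return 0;
}

As the header changes above show, the joint scan overloads that take no explicit init seed themselves with sycl::known_identity_v, so omitting the init argument presupposes that has_known_identity holds for the chosen operation and accumulator type.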