diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp index 38bcba20d8a27..a773694f69db7 100644 --- a/sycl/include/CL/__spirv/spirv_ops.hpp +++ b/sycl/include/CL/__spirv/spirv_ops.hpp @@ -191,7 +191,7 @@ extern SYCL_EXTERNAL bool __spirv_GroupAny(__spv::Scope Execution, template extern SYCL_EXTERNAL dataT __spirv_GroupBroadcast(__spv::Scope Execution, dataT Value, - uint32_t LocalId) noexcept; + size_t LocalId) noexcept; template extern SYCL_EXTERNAL dataT diff --git a/sycl/include/CL/sycl.hpp b/sycl/include/CL/sycl.hpp index 4c6e6e8790052..3455e0ab07219 100644 --- a/sycl/include/CL/sycl.hpp +++ b/sycl/include/CL/sycl.hpp @@ -23,22 +23,22 @@ #include #include #include +#include #include #include #include #include #include #include +#include #include #include #include #include #include -#include #include #include #include #include #include #include - diff --git a/sycl/include/CL/sycl/detail/spirv.hpp b/sycl/include/CL/sycl/detail/spirv.hpp new file mode 100644 index 0000000000000..8c5f80f3674b2 --- /dev/null +++ b/sycl/include/CL/sycl/detail/spirv.hpp @@ -0,0 +1,55 @@ +//===-- spirv.hpp - Helpers to generate SPIR-V instructions ----*- C++ -*--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#ifdef __SYCL_DEVICE_ONLY__ +__SYCL_INLINE_NAMESPACE(cl) { +namespace sycl { +namespace detail { +namespace spirv { + +// Broadcast with scalar local index +template <__spv::Scope S, typename T, typename IdT> +detail::enable_if_t::value, T> +GroupBroadcast(T x, IdT local_id) { + using OCLT = detail::ConvertToOpenCLType_t; + using OCLIdT = detail::ConvertToOpenCLType_t; + OCLT ocl_x = detail::convertDataToType(x); + OCLIdT ocl_id = detail::convertDataToType(local_id); + return __spirv_GroupBroadcast(S, ocl_x, ocl_id); +} + +// Broadcast with vector local index +template <__spv::Scope S, typename T, int Dimensions> +T GroupBroadcast(T x, id local_id) { + if (Dimensions == 1) { + return GroupBroadcast(x, local_id[0]); + } + using IdT = vec; + using OCLT = detail::ConvertToOpenCLType_t; + using OCLIdT = detail::ConvertToOpenCLType_t; + IdT vec_id; + for (int i = 0; i < Dimensions; ++i) { + vec_id[i] = local_id[Dimensions - i - 1]; + } + OCLT ocl_x = detail::convertDataToType(x); + OCLIdT ocl_id = detail::convertDataToType(vec_id); + return __spirv_GroupBroadcast(S, ocl_x, ocl_id); +} + +} // namespace spirv +} // namespace detail +} // namespace sycl +} // __SYCL_INLINE_NAMESPACE(cl) +#endif // __SYCL_DEVICE_ONLY__ diff --git a/sycl/include/CL/sycl/detail/type_traits.hpp b/sycl/include/CL/sycl/detail/type_traits.hpp index 4050c2742b66d..6b0bbd255f103 100644 --- a/sycl/include/CL/sycl/detail/type_traits.hpp +++ b/sycl/include/CL/sycl/detail/type_traits.hpp @@ -195,6 +195,14 @@ template struct is_arithmetic : bool_constant::value || is_floating_point::value> {}; +template +struct is_scalar_arithmetic + : bool_constant::value && is_arithmetic::value> {}; + +template +struct is_vector_arithmetic + : bool_constant::value && is_arithmetic::value> {}; + // is_pointer template struct is_pointer_impl : std::false_type {}; diff --git a/sycl/include/CL/sycl/group.hpp b/sycl/include/CL/sycl/group.hpp index 7fc777a53e644..c871ec95bd0bf 100644 --- a/sycl/include/CL/sycl/group.hpp +++ b/sycl/include/CL/sycl/group.hpp @@ -81,25 +81,32 @@ template class private_memory { #endif // #ifdef __SYCL_DEVICE_ONLY__ }; -template class group { +template class group { public: +#ifndef __DISABLE_SYCL_INTEL_GROUP_ALGORITHMS__ + using id_type = id; + using range_type = range; + using linear_id_type = size_t; + static constexpr int dimensions = Dimensions; +#endif // __DISABLE_SYCL_INTEL_GROUP_ALGORITHMS__ + group() = delete; - id get_id() const { return index; } + id get_id() const { return index; } size_t get_id(int dimension) const { return index[dimension]; } - range get_global_range() const { return globalRange; } + range get_global_range() const { return globalRange; } size_t get_global_range(int dimension) const { return globalRange[dimension]; } - range get_local_range() const { return localRange; } + range get_local_range() const { return localRange; } size_t get_local_range(int dimension) const { return localRange[dimension]; } - range get_group_range() const { return groupRange; } + range get_group_range() const { return groupRange; } size_t get_group_range(int dimension) const { return get_group_range()[dimension]; @@ -107,12 +114,12 @@ template class group { size_t operator[](int dimension) const { return index[dimension]; } - template + template typename std::enable_if<(dims == 1), size_t>::type get_linear_id() const { return index[0]; } - template + template typename std::enable_if<(dims == 2), size_t>::type get_linear_id() const { return index[0] * groupRange[1] + index[1]; } @@ -127,7 +134,7 @@ template class group { // size_t get_linear_id()const // Get a linearized version of the work-group id. Calculating a linear // work-group id from a multi-dimensional index follows the equation 4.3. - template + template typename std::enable_if<(dims == 3), size_t>::type get_linear_id() const { return (index[0] * groupRange[1] * groupRange[2]) + (index[1] * groupRange[2]) + index[2]; @@ -139,41 +146,41 @@ template class group { // compilers are expected to optimize when possible detail::workGroupBarrier(); #ifdef __SYCL_DEVICE_ONLY__ - range GlobalSize{ - __spirv::initGlobalSize>()}; - range LocalSize{ - __spirv::initWorkgroupSize>()}; - id GlobalId{ - __spirv::initGlobalInvocationId>()}; - id LocalId{ - __spirv::initLocalInvocationId>()}; + range GlobalSize{ + __spirv::initGlobalSize>()}; + range LocalSize{ + __spirv::initWorkgroupSize>()}; + id GlobalId{ + __spirv::initGlobalInvocationId>()}; + id LocalId{ + __spirv::initLocalInvocationId>()}; // no 'iterate' in the device code variant, because // (1) this code is already invoked by each work item as a part of the // enclosing parallel_for_work_group kernel // (2) the range this pfwi iterates over matches work group size exactly - item GlobalItem = - detail::Builder::createItem(GlobalSize, GlobalId); - item LocalItem = - detail::Builder::createItem(LocalSize, LocalId); - h_item HItem = - detail::Builder::createHItem(GlobalItem, LocalItem); + item GlobalItem = + detail::Builder::createItem(GlobalSize, GlobalId); + item LocalItem = + detail::Builder::createItem(LocalSize, LocalId); + h_item HItem = + detail::Builder::createHItem(GlobalItem, LocalItem); Func(HItem); #else - id GroupStartID = index * localRange; + id GroupStartID = index * localRange; // ... host variant needs explicit 'iterate' because it is serial - detail::NDLoop::iterate( - localRange, [&](const id &LocalID) { - item GlobalItem = - detail::Builder::createItem( + detail::NDLoop::iterate( + localRange, [&](const id &LocalID) { + item GlobalItem = + detail::Builder::createItem( globalRange, GroupStartID + LocalID); - item LocalItem = - detail::Builder::createItem(localRange, + item LocalItem = + detail::Builder::createItem(localRange, LocalID); - h_item HItem = - detail::Builder::createHItem(GlobalItem, LocalItem); + h_item HItem = + detail::Builder::createHItem(GlobalItem, LocalItem); Func(HItem); }); #endif // __SYCL_DEVICE_ONLY__ @@ -185,52 +192,52 @@ template class group { } template - void parallel_for_work_item(range flexibleRange, + void parallel_for_work_item(range flexibleRange, WorkItemFunctionT Func) const { detail::workGroupBarrier(); #ifdef __SYCL_DEVICE_ONLY__ - range GlobalSize{ - __spirv::initGlobalSize>()}; - range LocalSize{ - __spirv::initWorkgroupSize>()}; - id GlobalId{ - __spirv::initGlobalInvocationId>()}; - id LocalId{ - __spirv::initLocalInvocationId>()}; - - item GlobalItem = - detail::Builder::createItem(GlobalSize, GlobalId); - item LocalItem = - detail::Builder::createItem(LocalSize, LocalId); - h_item HItem = detail::Builder::createHItem( + range GlobalSize{ + __spirv::initGlobalSize>()}; + range LocalSize{ + __spirv::initWorkgroupSize>()}; + id GlobalId{ + __spirv::initGlobalInvocationId>()}; + id LocalId{ + __spirv::initLocalInvocationId>()}; + + item GlobalItem = + detail::Builder::createItem(GlobalSize, GlobalId); + item LocalItem = + detail::Builder::createItem(LocalSize, LocalId); + h_item HItem = detail::Builder::createHItem( GlobalItem, LocalItem, flexibleRange); // iterate over flexible range with work group size stride; each item // performs flexibleRange/LocalSize iterations (if the former is divisible // by the latter) - detail::NDLoop::iterate( + detail::NDLoop::iterate( LocalId, LocalSize, flexibleRange, - [&](const id &LogicalLocalID) { + [&](const id &LogicalLocalID) { HItem.setLogicalLocalID(LogicalLocalID); Func(HItem); }); #else - id GroupStartID = index * localRange; + id GroupStartID = index * localRange; - detail::NDLoop::iterate( - localRange, [&](const id &LocalID) { - item GlobalItem = - detail::Builder::createItem( + detail::NDLoop::iterate( + localRange, [&](const id &LocalID) { + item GlobalItem = + detail::Builder::createItem( globalRange, GroupStartID + LocalID); - item LocalItem = - detail::Builder::createItem(localRange, + item LocalItem = + detail::Builder::createItem(localRange, LocalID); - h_item HItem = detail::Builder::createHItem( + h_item HItem = detail::Builder::createHItem( GlobalItem, LocalItem, flexibleRange); - detail::NDLoop::iterate( + detail::NDLoop::iterate( LocalID, localRange, flexibleRange, - [&](const id &LogicalLocalID) { + [&](const id &LogicalLocalID) { HItem.setLogicalLocalID(LogicalLocalID); Func(HItem); }); @@ -311,7 +318,7 @@ template class group { waitForHelper(Events...); } - bool operator==(const group &rhs) const { + bool operator==(const group &rhs) const { bool Result = (rhs.globalRange == globalRange) && (rhs.localRange == localRange) && (rhs.index == index); __SYCL_ASSERT(rhs.groupRange == groupRange && @@ -319,15 +326,15 @@ template class group { return Result; } - bool operator!=(const group &rhs) const { + bool operator!=(const group &rhs) const { return !((*this) == rhs); } private: - range globalRange; - range localRange; - range groupRange; - id index; + range globalRange; + range localRange; + range groupRange; + id index; void waitForHelper() const {} @@ -343,8 +350,8 @@ template class group { protected: friend class detail::Builder; - group(const range &G, const range &L, - const range GroupRange, const id &I) + group(const range &G, const range &L, + const range GroupRange, const id &I) : globalRange(G), localRange(L), groupRange(GroupRange), index(I) { // Make sure local range divides global without remainder: __SYCL_ASSERT(((G % L).size() == 0) && diff --git a/sycl/include/CL/sycl/intel/functional.hpp b/sycl/include/CL/sycl/intel/functional.hpp index 0971d9089205c..018f6d0c2a28b 100644 --- a/sycl/include/CL/sycl/intel/functional.hpp +++ b/sycl/include/CL/sycl/intel/functional.hpp @@ -44,7 +44,8 @@ template <> struct maximum { template auto operator()(T &&lhs, U &&rhs) const -> typename std::common_type::type { - return std::greater<>()(std::forward(lhs), std::forward(rhs)) + return std::greater<>()(std::forward(lhs), + std::forward(rhs)) ? std::forward(lhs) : std::forward(rhs); } @@ -54,5 +55,60 @@ template <> struct maximum { template using plus = std::plus; } // namespace intel + +#ifdef __SYCL_DEVICE_ONLY__ +namespace detail { + +struct GroupOpISigned {}; +struct GroupOpIUnsigned {}; +struct GroupOpFP {}; + +template struct GroupOpTag; + +template +struct GroupOpTag::value>> { + using type = GroupOpISigned; +}; + +template +struct GroupOpTag::value>> { + using type = GroupOpIUnsigned; +}; + +template +struct GroupOpTag::value>> { + using type = GroupOpFP; +}; + +#define __SYCL_CALC_OVERLOAD(GroupTag, SPIRVOperation, BinaryOperation) \ + template \ + static T calc(GroupTag, T x, BinaryOperation op) { \ + using OCLT = detail::ConvertToOpenCLType_t; \ + OCLT Arg = x; \ + OCLT Ret = __spirv_Group##SPIRVOperation(S, O, Arg); \ + return Ret; \ + } + +__SYCL_CALC_OVERLOAD(GroupOpISigned, SMin, intel::minimum) +__SYCL_CALC_OVERLOAD(GroupOpIUnsigned, UMin, intel::minimum) +__SYCL_CALC_OVERLOAD(GroupOpFP, FMin, intel::minimum) +__SYCL_CALC_OVERLOAD(GroupOpISigned, SMax, intel::maximum) +__SYCL_CALC_OVERLOAD(GroupOpIUnsigned, UMax, intel::maximum) +__SYCL_CALC_OVERLOAD(GroupOpFP, FMax, intel::maximum) +__SYCL_CALC_OVERLOAD(GroupOpISigned, IAdd, intel::plus) +__SYCL_CALC_OVERLOAD(GroupOpIUnsigned, IAdd, intel::plus) +__SYCL_CALC_OVERLOAD(GroupOpFP, FAdd, intel::plus) + +#undef __SYCL_CALC_OVERLOAD + +template class BinaryOperation> +static T calc(typename GroupOpTag::type, T x, BinaryOperation) { + return calc(typename GroupOpTag::type(), x, BinaryOperation()); +} + +} // namespace detail +#endif // __SYCL_DEVICE_ONLY__ + } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/include/CL/sycl/intel/group_algorithm.hpp b/sycl/include/CL/sycl/intel/group_algorithm.hpp new file mode 100644 index 0000000000000..ad8fa67313d91 --- /dev/null +++ b/sycl/include/CL/sycl/intel/group_algorithm.hpp @@ -0,0 +1,666 @@ +//==----------- group_algorithm.hpp --- SYCL group algorithm----------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#ifndef __DISABLE_SYCL_INTEL_GROUP_ALGORITHMS__ +__SYCL_INLINE_NAMESPACE(cl) { +namespace sycl { +namespace detail { + +template size_t get_local_linear_range(Group g); +template <> inline size_t get_local_linear_range>(group<1> g) { + return g.get_local_range(0); +} +template <> inline size_t get_local_linear_range>(group<2> g) { + return g.get_local_range(0) * g.get_local_range(1); +} +template <> inline size_t get_local_linear_range>(group<3> g) { + return g.get_local_range(0) * g.get_local_range(1) * g.get_local_range(2); +} + +template +id linear_id_to_id(range, size_t linear_id); +template <> inline id<1> linear_id_to_id(range<1> r, size_t linear_id) { + return id<1>(linear_id); +} +template <> inline id<2> linear_id_to_id(range<2> r, size_t linear_id) { + id<2> result; + result[0] = linear_id / r[1]; + result[1] = linear_id % r[1]; + return result; +} +template <> inline id<3> linear_id_to_id(range<3> r, size_t linear_id) { + id<3> result; + result[0] = linear_id / (r[1] * r[2]); + result[1] = (linear_id % (r[1] * r[2])) / r[2]; + result[2] = linear_id % r[2]; + return result; +} + +template struct is_group : std::false_type {}; + +template +struct is_group> : std::true_type {}; + +template struct identity {}; + +template struct identity> { + static constexpr T value = 0; +}; + +template struct identity> { + static constexpr T value = std::numeric_limits::max(); +}; + +template struct identity> { + static constexpr T value = std::numeric_limits::lowest(); +}; + +template +Function for_each(Group g, Ptr first, Ptr last, Function f) { +#ifdef __SYCL_DEVICE_ONLY__ + nd_item it = + cl::sycl::detail::Builder::getNDItem(); + ptrdiff_t offset = it.get_local_linear_id(); + ptrdiff_t stride = detail::get_local_linear_range(g); + for (Ptr p = first + offset; p < last; p += stride) { + f(*p); + } + return f; +#else + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +} // namespace detail + +namespace intel { + +template +using EnableIfIsScalarArithmetic = cl::sycl::detail::enable_if_t< + cl::sycl::detail::is_scalar_arithmetic::value, T>; + +template +using EnableIfIsVectorArithmetic = cl::sycl::detail::enable_if_t< + cl::sycl::detail::is_vector_arithmetic::value, T>; + +template +using EnableIfIsPointer = + cl::sycl::detail::enable_if_t::value, T>; + +template bool all_of(Group g, bool pred) { + static_assert(detail::is_group::value, + "Group algorithms only support the sycl::group class."); +#ifdef __SYCL_DEVICE_ONLY__ + return __spirv_GroupAll(__spv::Scope::Workgroup, pred); +#else + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +template +bool all_of(Group g, T x, Predicate pred) { + static_assert(detail::is_group::value, + "Group algorithms only support the sycl::group class."); + return all_of(g, pred(x)); +} + +template +EnableIfIsPointer all_of(Group g, Ptr first, Ptr last, + Predicate pred) { +#ifdef __SYCL_DEVICE_ONLY__ + static_assert(detail::is_group::value, + "Group algorithms only support the sycl::group class."); + bool partial = true; + detail::for_each(g, first, last, [&](const typename Ptr::element_type &x) { + partial &= pred(x); + }); + return all_of(g, partial); +#else + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +template bool any_of(Group g, bool pred) { + static_assert(detail::is_group::value, + "Group algorithms only support the sycl::group class."); +#ifdef __SYCL_DEVICE_ONLY__ + return __spirv_GroupAny(__spv::Scope::Workgroup, pred); +#else + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +template +bool any_of(Group g, T x, Predicate pred) { + static_assert(detail::is_group::value, + "Group algorithms only support the sycl::group class."); + return any_of(g, pred(x)); +} + +template +EnableIfIsPointer any_of(Group g, Ptr first, Ptr last, + Predicate pred) { +#ifdef __SYCL_DEVICE_ONLY__ + static_assert(detail::is_group::value, + "Group algorithms only support the sycl::group class."); + bool partial = false; + detail::for_each(g, first, last, [&](const typename Ptr::element_type &x) { + partial |= pred(x); + }); + return any_of(g, partial); +#else + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +template bool none_of(Group g, bool pred) { + static_assert(detail::is_group::value, + "Group algorithms only support the sycl::group class."); +#ifdef __SYCL_DEVICE_ONLY__ + return __spirv_GroupAll(__spv::Scope::Workgroup, not pred); +#else + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +template +bool none_of(Group g, T x, Predicate pred) { + static_assert(detail::is_group::value, + "Group algorithms only support the sycl::group class."); + return none_of(g, pred(x)); +} + +template +EnableIfIsPointer none_of(Group g, Ptr first, Ptr last, + Predicate pred) { +#ifdef __SYCL_DEVICE_ONLY__ + static_assert(detail::is_group::value, + "Group algorithms only support the sycl::group class."); + return not any_of(g, first, last, pred); +#else + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +template +EnableIfIsScalarArithmetic broadcast(Group g, T x, + typename Group::id_type local_id) { + static_assert(detail::is_group::value, + "Group algorithms only support the sycl::group class."); +#ifdef __SYCL_DEVICE_ONLY__ + return detail::spirv::GroupBroadcast<__spv::Scope::Workgroup>(x, local_id); +#else + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +template +EnableIfIsVectorArithmetic broadcast(Group g, T x, + typename Group::id_type local_id) { + static_assert(detail::is_group::value, + "Group algorithms only support the sycl::group class."); +#ifdef __SYCL_DEVICE_ONLY__ + T result; + for (int s = 0; s < x.get_size(); ++s) { + result[s] = broadcast(g, x[s], local_id); + } + return result; +#else + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +template +EnableIfIsScalarArithmetic +broadcast(Group g, T x, typename Group::linear_id_type linear_local_id) { + static_assert(detail::is_group::value, + "Group algorithms only support the sycl::group class."); +#ifdef __SYCL_DEVICE_ONLY__ + return broadcast( + g, x, detail::linear_id_to_id(g.get_local_range(), linear_local_id)); +#else + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +template +EnableIfIsVectorArithmetic +broadcast(Group g, T x, typename Group::linear_id_type linear_local_id) { + static_assert(detail::is_group::value, + "Group algorithms only support the sycl::group class."); +#ifdef __SYCL_DEVICE_ONLY__ + T result; + for (int s = 0; s < x.get_size(); ++s) { + result[s] = broadcast(g, x[s], linear_local_id); + } + return result; +#else + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +template +EnableIfIsScalarArithmetic broadcast(Group g, T x) { + static_assert(detail::is_group::value, + "Group algorithms only support the sycl::group class."); +#ifdef __SYCL_DEVICE_ONLY__ + return broadcast(g, x, 0); +#else + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +template +EnableIfIsVectorArithmetic broadcast(Group g, T x) { + static_assert(detail::is_group::value, + "Group algorithms only support the sycl::group class."); +#ifdef __SYCL_DEVICE_ONLY__ + T result; + for (int s = 0; s < x.get_size(); ++s) { + result[s] = broadcast(g, x[s]); + } + return result; +#else + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +template +EnableIfIsScalarArithmetic reduce(Group g, T x, BinaryOperation binary_op) { + static_assert(detail::is_group::value, + "Group algorithms only support the sycl::group class."); + static_assert( + std::is_same::value, + "Result type of binary_op must match reduction accumulation type."); +#ifdef __SYCL_DEVICE_ONLY__ + return detail::calc( + typename detail::GroupOpTag::type(), x, binary_op); +#else + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +template +EnableIfIsVectorArithmetic reduce(Group g, T x, BinaryOperation binary_op) { + static_assert(detail::is_group::value, + "Group algorithms only support the sycl::group class."); + static_assert( + std::is_same::value, + "Result type of binary_op must match reduction accumulation type."); + T result; + for (int s = 0; s < x.get_size(); ++s) { + result[s] = reduce(g, x[s], binary_op); + } + return result; +} + +template +EnableIfIsScalarArithmetic reduce(Group g, V x, T init, + BinaryOperation binary_op) { + static_assert(detail::is_group::value, + "Group algorithms only support the sycl::group class."); + static_assert( + std::is_same::value, + "Result type of binary_op must match reduction accumulation type."); +#ifdef __SYCL_DEVICE_ONLY__ + return binary_op(init, reduce(g, x, binary_op)); +#else + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +template +EnableIfIsVectorArithmetic reduce(Group g, V x, T init, + BinaryOperation binary_op) { + static_assert(detail::is_group::value, + "Group algorithms only support the sycl::group class."); + static_assert( + std::is_same::value, + "Result type of binary_op must match reduction accumulation type."); +#ifdef __SYCL_DEVICE_ONLY__ + T result = init; + for (int s = 0; s < x.get_size(); ++s) { + result[s] = binary_op(init[s], reduce(g, x[s], binary_op)); + } + return result; +#else + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +template +EnableIfIsPointer +reduce(Group g, Ptr first, Ptr last, BinaryOperation binary_op) { + static_assert(detail::is_group::value, + "Group algorithms only support the sycl::group class."); + static_assert( + std::is_same::value, + "Result type of binary_op must match reduction accumulation type."); +#ifdef __SYCL_DEVICE_ONLY__ + typename Ptr::element_type partial = + detail::identity::value; + detail::for_each(g, first, last, [&](const typename Ptr::element_type &x) { + partial = binary_op(partial, x); + }); + return reduce(g, partial, binary_op); +#else + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +template +EnableIfIsPointer reduce(Group g, Ptr first, Ptr last, T init, + BinaryOperation binary_op) { + static_assert(detail::is_group::value, + "Group algorithms only support the sycl::group class."); + static_assert( + std::is_same::value, + "Result type of binary_op must match reduction accumulation type."); +#ifdef __SYCL_DEVICE_ONLY__ + T partial = + detail::identity::value; + detail::for_each(g, first, last, [&](const typename Ptr::element_type &x) { + partial = binary_op(partial, x); + }); + return reduce(g, partial, init, binary_op); +#else + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +template +EnableIfIsScalarArithmetic exclusive_scan(Group g, T x, + BinaryOperation binary_op) { + static_assert(detail::is_group::value, + "Group algorithms only support the sycl::group class."); + static_assert(std::is_same::value, + "Result type of binary_op must match scan accumulation type."); +#ifdef __SYCL_DEVICE_ONLY__ + return detail::calc( + typename detail::GroupOpTag::type(), x, binary_op); +#else + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +template +EnableIfIsVectorArithmetic exclusive_scan(Group g, T x, + BinaryOperation binary_op) { + static_assert(detail::is_group::value, + "Group algorithms only support the sycl::group class."); + static_assert(std::is_same::value, + "Result type of binary_op must match scan accumulation type."); + T result; + for (int s = 0; s < x.get_size(); ++s) { + result[s] = exclusive_scan(g, x[s], binary_op); + } + return result; +} + +template +EnableIfIsVectorArithmetic exclusive_scan(Group g, V x, T init, + BinaryOperation binary_op) { + static_assert(detail::is_group::value, + "Group algorithms only support the sycl::group class."); + static_assert(std::is_same::value, + "Result type of binary_op must match scan accumulation type."); + T result; + for (int s = 0; s < x.get_size(); ++s) { + result[s] = exclusive_scan(g, x[s], init[s], binary_op); + } + return result; +} + +template +EnableIfIsScalarArithmetic exclusive_scan(Group g, V x, T init, + BinaryOperation binary_op) { + static_assert(detail::is_group::value, + "Group algorithms only support the sycl::group class."); + static_assert(std::is_same::value, + "Result type of binary_op must match scan accumulation type."); +#ifdef __SYCL_DEVICE_ONLY__ + nd_item it = + detail::Builder::getNDItem(); + if (it.get_local_linear_id() == 0) { + x = binary_op(init, x); + } + T scan = exclusive_scan(g, x, binary_op); + if (it.get_local_linear_id() == 0) { + scan = init; + } + return scan; +#else + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +template +EnableIfIsPointer +exclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, T init, + BinaryOperation binary_op) { + static_assert(detail::is_group::value, + "Group algorithms only support the sycl::group class."); + static_assert(std::is_same::value, + "Result type of binary_op must match scan accumulation type."); +#ifdef __SYCL_DEVICE_ONLY__ + nd_item it = + cl::sycl::detail::Builder::getNDItem(); + ptrdiff_t offset = it.get_local_linear_id(); + ptrdiff_t stride = detail::get_local_linear_range(g); + ptrdiff_t N = last - first; + auto roundup = [=](const ptrdiff_t &v, + const ptrdiff_t &divisor) -> ptrdiff_t { + return ((v + divisor - 1) / divisor) * divisor; + }; + typename InPtr::element_type x; + typename OutPtr::element_type carry = init; + for (ptrdiff_t chunk = 0; chunk < roundup(N, stride); chunk += stride) { + ptrdiff_t i = chunk + offset; + if (i < N) { + x = first[i]; + } + typename OutPtr::element_type out = exclusive_scan(g, x, carry, binary_op); + if (i < N) { + result[i] = out; + } + carry = broadcast(g, binary_op(out, x), stride - 1); + } + return result + N; +#else + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +template +EnableIfIsPointer exclusive_scan(Group g, InPtr first, + InPtr last, OutPtr result, + BinaryOperation binary_op) { + static_assert(std::is_same::value, + "Result type of binary_op must match scan accumulation type."); + return exclusive_scan( + g, first, last, result, + detail::identity::value, + binary_op); +} + +template +EnableIfIsVectorArithmetic inclusive_scan(Group g, T x, + BinaryOperation binary_op) { + static_assert(detail::is_group::value, + "Group algorithms only support the sycl::group class."); + static_assert(std::is_same::value, + "Result type of binary_op must match scan accumulation type."); + T result; + for (int s = 0; s < x.get_size(); ++s) { + result[s] = inclusive_scan(g, x[s], binary_op); + } + return result; +} + +template +EnableIfIsScalarArithmetic inclusive_scan(Group g, T x, + BinaryOperation binary_op) { + static_assert(detail::is_group::value, + "Group algorithms only support the sycl::group class."); + static_assert(std::is_same::value, + "Result type of binary_op must match scan accumulation type."); +#ifdef __SYCL_DEVICE_ONLY__ + return detail::calc( + typename detail::GroupOpTag::type(), x, binary_op); +#else + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +template +EnableIfIsScalarArithmetic +inclusive_scan(Group g, V x, BinaryOperation binary_op, T init) { + static_assert(detail::is_group::value, + "Group algorithms only support the sycl::group class."); + static_assert(std::is_same::value, + "Result type of binary_op must match scan accumulation type."); +#ifdef __SYCL_DEVICE_ONLY__ + nd_item it = + detail::Builder::getNDItem(); + if (it.get_local_linear_id() == 0) { + x = binary_op(init, x); + } + return inclusive_scan(g, x, binary_op); +#else + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +template +EnableIfIsVectorArithmetic +inclusive_scan(Group g, V x, BinaryOperation binary_op, T init) { + static_assert(detail::is_group::value, + "Group algorithms only support the sycl::group class."); + static_assert(std::is_same::value, + "Result type of binary_op must match scan accumulation type."); + T result; + for (int s = 0; s < x.get_size(); ++s) { + result[s] = inclusive_scan(g, x[s], binary_op, init[s]); + } + return result; +} + +template +EnableIfIsPointer +inclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, + BinaryOperation binary_op, T init) { + static_assert(detail::is_group::value, + "Group algorithms only support the sycl::group class."); + static_assert(std::is_same::value, + "Result type of binary_op must match scan accumulation type."); +#ifdef __SYCL_DEVICE_ONLY__ + nd_item it = + cl::sycl::detail::Builder::getNDItem(); + ptrdiff_t offset = it.get_local_linear_id(); + ptrdiff_t stride = detail::get_local_linear_range(g); + ptrdiff_t N = last - first; + auto roundup = [=](const ptrdiff_t &v, + const ptrdiff_t &divisor) -> ptrdiff_t { + return ((v + divisor - 1) / divisor) * divisor; + }; + typename InPtr::element_type x; + typename OutPtr::element_type carry = init; + for (ptrdiff_t chunk = 0; chunk < roundup(N, stride); chunk += stride) { + ptrdiff_t i = chunk + offset; + if (i < N) { + x = first[i]; + } + typename OutPtr::element_type out = inclusive_scan(g, x, binary_op, carry); + if (i < N) { + result[i] = out; + } + carry = broadcast(g, out, stride - 1); + } + return result + N; +#else + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +template +EnableIfIsPointer inclusive_scan(Group g, InPtr first, + InPtr last, OutPtr result, + BinaryOperation binary_op) { + static_assert(std::is_same::value, + "Result type of binary_op must match scan accumulation type."); + return inclusive_scan( + g, first, last, result, binary_op, + detail::identity::value); +} + +template bool leader(Group g) { + static_assert(detail::is_group::value, + "Group algorithms only support the sycl::group class."); +#ifdef __SYCL_DEVICE_ONLY__ + nd_item it = + cl::sycl::detail::Builder::getNDItem(); + typename Group::linear_id_type linear_id = it.get_local_linear_id(); + return (linear_id == 0); +#else + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +} // namespace intel +} // namespace sycl +} // __SYCL_INLINE_NAMESPACE(cl) +#endif // __DISABLE_SYCL_INTEL_GROUP_ALGORITHMS__ diff --git a/sycl/include/CL/sycl/intel/sub_group.hpp b/sycl/include/CL/sycl/intel/sub_group.hpp index c8326eaa0728b..12dfb0eb262f7 100644 --- a/sycl/include/CL/sycl/intel/sub_group.hpp +++ b/sycl/include/CL/sycl/intel/sub_group.hpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -33,12 +34,6 @@ namespace detail { namespace sub_group { -template T broadcast(T x, id<1> local_id) { - using OCLT = detail::ConvertToOpenCLType_t; - return __spirv_GroupBroadcast(__spv::Scope::Subgroup, OCLT(x), - local_id.get(0)); -} - #define __SYCL_SG_GENERATE_BODY_1ARG(name, SPIRVOperation) \ template T name(T x, id<1> local_id) { \ using OCLT = detail::ConvertToOpenCLType_t; \ @@ -130,52 +125,6 @@ void store(multi_ptr dst, const vec &x) { bit_cast(x)); } -struct GroupOpISigned {}; struct GroupOpIUnsigned {}; struct GroupOpFP {}; - -template struct GroupOpTag; - -template -struct GroupOpTag::value>> { - using type = GroupOpISigned; -}; - -template -struct GroupOpTag::value>> { - using type = GroupOpIUnsigned; -}; - -template -struct GroupOpTag::value>> { - using type = GroupOpFP; -}; - -#define __SYCL_SG_CALC_OVERLOAD(GroupTag, SPIRVOperation, BinaryOperation) \ - template \ - static T calc(GroupTag, T x, BinaryOperation op) { \ - using OCLT = detail::ConvertToOpenCLType_t; \ - OCLT Arg = x; \ - OCLT Ret = __spirv_Group##SPIRVOperation(__spv::Scope::Subgroup, O, Arg); \ - return Ret; \ - } - -__SYCL_SG_CALC_OVERLOAD(GroupOpISigned, SMin, intel::minimum) -__SYCL_SG_CALC_OVERLOAD(GroupOpIUnsigned, UMin, intel::minimum) -__SYCL_SG_CALC_OVERLOAD(GroupOpFP, FMin, intel::minimum) -__SYCL_SG_CALC_OVERLOAD(GroupOpISigned, SMax, intel::maximum) -__SYCL_SG_CALC_OVERLOAD(GroupOpIUnsigned, UMax, intel::maximum) -__SYCL_SG_CALC_OVERLOAD(GroupOpFP, FMax, intel::maximum) -__SYCL_SG_CALC_OVERLOAD(GroupOpISigned, IAdd, intel::plus) -__SYCL_SG_CALC_OVERLOAD(GroupOpIUnsigned, IAdd, intel::plus) -__SYCL_SG_CALC_OVERLOAD(GroupOpFP, FAdd, intel::plus) - -#undef __SYCL_SG_CALC_OVERLOAD - -template class BinaryOperation> -static T calc(typename GroupOpTag::type, T x, BinaryOperation) { - return calc(typename GroupOpTag::type(), x, BinaryOperation()); -} - } // namespace sub_group } // namespace detail @@ -213,20 +162,21 @@ struct sub_group { } template - using EnableIfIsScalarArithmetic = detail::enable_if_t< - !detail::is_vec::value && detail::is_arithmetic::value, T>; + using EnableIfIsScalarArithmetic = + detail::enable_if_t::value, T>; /* --- collectives --- */ template EnableIfIsScalarArithmetic broadcast(T x, id<1> local_id) const { - return detail::sub_group::broadcast(x, local_id); + return detail::spirv::GroupBroadcast<__spv::Scope::Subgroup>(x, local_id); } template EnableIfIsScalarArithmetic reduce(T x, BinaryOperation op) const { - return detail::sub_group::calc( - typename detail::sub_group::GroupOpTag::type(), x, op); + return detail::calc( + typename detail::GroupOpTag::type(), x, op); } template @@ -236,8 +186,9 @@ struct sub_group { template EnableIfIsScalarArithmetic exclusive_scan(T x, BinaryOperation op) const { - return detail::sub_group::calc( - typename detail::sub_group::GroupOpTag::type(), x, op); + return detail::calc( + typename detail::GroupOpTag::type(), x, op); } template @@ -255,13 +206,14 @@ struct sub_group { template EnableIfIsScalarArithmetic inclusive_scan(T x, BinaryOperation op) const { - return detail::sub_group::calc( - typename detail::sub_group::GroupOpTag::type(), x, op); + return detail::calc( + typename detail::GroupOpTag::type(), x, op); } template EnableIfIsScalarArithmetic inclusive_scan(T x, BinaryOperation op, - T init) const { + T init) const { if (get_local_id().get(0) == 0) { x = op(init, x); } @@ -271,8 +223,7 @@ struct sub_group { /* --- one-input shuffles --- */ /* indices in [0 , sub_group size) */ - template - T shuffle(T x, id<1> local_id) const { + template T shuffle(T x, id<1> local_id) const { return detail::sub_group::shuffle(x, local_id); } @@ -280,21 +231,18 @@ struct sub_group { return detail::sub_group::shuffle_down(x, x, delta); } - template - T shuffle_up(T x, uint32_t delta) const { + template T shuffle_up(T x, uint32_t delta) const { return detail::sub_group::shuffle_up(x, x, delta); } - template - T shuffle_xor(T x, id<1> value) const { + template T shuffle_xor(T x, id<1> value) const { return detail::sub_group::shuffle_xor(x, value); } /* --- two-input shuffles --- */ /* indices in [0 , 2 * sub_group size) */ - template - T shuffle(T x, T y, id<1> local_id) const { + template T shuffle(T x, T y, id<1> local_id) const { return detail::sub_group::shuffle_down(x, y, (local_id - get_local_id()).get(0)); } diff --git a/sycl/test/group-algorithm/all_of.cpp b/sycl/test/group-algorithm/all_of.cpp new file mode 100644 index 0000000000000..a8b4fc4bfff2b --- /dev/null +++ b/sycl/test/group-algorithm/all_of.cpp @@ -0,0 +1,74 @@ +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: env SYCL_DEVICE_TYPE=HOST %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include +#include +#include +#include +using namespace sycl; +using namespace sycl::intel; + +template +class all_of_kernel; + +struct GeZero { + bool operator()(int i) const { return i >= 0; } +}; +struct IsEven { + bool operator()(int i) const { return (i % 2) == 0; } +}; +struct LtZero { + bool operator()(int i) const { return i < 0; } +}; + +template +void test(queue q, InputContainer input, OutputContainer output, + Predicate pred) { + typedef class all_of_kernel kernel_name; + size_t N = input.size(); + size_t G = 16; + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto in = in_buf.get_access(cgh); + auto out = out_buf.get_access(cgh); + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + int lid = it.get_local_id(0); + out[0] = all_of(g, pred(in[lid])); + out[1] = all_of(g, in[lid], pred); + out[2] = all_of(g, in.get_pointer(), in.get_pointer() + N, pred); + }); + }); + } + bool expected = std::all_of(input.begin(), input.end(), pred); + assert(output[0] == expected); + assert(output[1] == expected); + assert(output[2] == expected); +} + +int main() { + queue q; + std::string version = q.get_device().get_info(); + if (version < std::string("2.0")) { + std::cout << "Skipping test\n"; + return 0; + } + + constexpr int N = 32; + std::array input; + std::array output; + std::iota(input.begin(), input.end(), 0); + std::fill(output.begin(), output.end(), false); + + test(q, input, output, GeZero()); + test(q, input, output, IsEven()); + test(q, input, output, LtZero()); + + std::cout << "Test passed." << std::endl; +} diff --git a/sycl/test/group-algorithm/any_of.cpp b/sycl/test/group-algorithm/any_of.cpp new file mode 100644 index 0000000000000..4e5391b5b01be --- /dev/null +++ b/sycl/test/group-algorithm/any_of.cpp @@ -0,0 +1,76 @@ +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: env SYCL_DEVICE_TYPE=HOST %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include +#include +#include +#include +using namespace sycl; +using namespace sycl::intel; + +template +class any_of_kernel; + +struct GeZero { + bool operator()(int i) const { return i >= 0; } +}; +struct IsEven { + bool operator()(int i) const { return (i % 2) == 0; } +}; +struct LtZero { + bool operator()(int i) const { return i < 0; } +}; + +template +void test(queue q, InputContainer input, OutputContainer output, + Predicate pred) { + typedef typename InputContainer::value_type InputT; + typedef typename OutputContainer::value_type OutputT; + typedef class any_of_kernel kernel_name; + size_t N = input.size(); + size_t G = 16; + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto in = in_buf.template get_access(cgh); + auto out = out_buf.template get_access(cgh); + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + int lid = it.get_local_id(0); + out[0] = any_of(g, pred(in[lid])); + out[1] = any_of(g, in[lid], pred); + out[2] = any_of(g, in.get_pointer(), in.get_pointer() + N, pred); + }); + }); + } + bool expected = std::any_of(input.begin(), input.end(), pred); + assert(output[0] == expected); + assert(output[1] == expected); + assert(output[2] == expected); +} + +int main() { + queue q; + std::string version = q.get_device().get_info(); + if (version < std::string("2.0")) { + std::cout << "Skipping test\n"; + return 0; + } + + constexpr int N = 32; + std::array input; + std::array output; + std::iota(input.begin(), input.end(), 0); + std::fill(output.begin(), output.end(), false); + + test(q, input, output, GeZero()); + test(q, input, output, IsEven()); + test(q, input, output, LtZero()); + + std::cout << "Test passed." << std::endl; +} diff --git a/sycl/test/group-algorithm/broadcast.cpp b/sycl/test/group-algorithm/broadcast.cpp new file mode 100644 index 0000000000000..9fcce3b938673 --- /dev/null +++ b/sycl/test/group-algorithm/broadcast.cpp @@ -0,0 +1,62 @@ +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: env SYCL_DEVICE_TYPE=HOST %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include +#include +#include +#include +using namespace sycl; +using namespace sycl::intel; + +class broadcast_kernel; + +template +void test(queue q, InputContainer input, OutputContainer output) { + typedef typename InputContainer::value_type InputT; + typedef typename OutputContainer::value_type OutputT; + typedef class broadcast_kernel kernel_name; + size_t N = input.size(); + size_t G = 4; + range<2> R(G, G); + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto in = in_buf.template get_access(cgh); + auto out = out_buf.template get_access(cgh); + cgh.parallel_for(nd_range<2>(R, R), [=](nd_item<2> it) { + group<2> g = it.get_group(); + int lid = it.get_local_linear_id(); + out[0] = broadcast(g, in[lid]); + out[1] = broadcast(g, in[lid], group<2>::id_type(1, 2)); + out[2] = broadcast(g, in[lid], group<2>::linear_id_type(2 * G + 1)); + }); + }); + } + assert(output[0] == input[0]); + assert(output[1] == input[1 * G + 2]); + assert(output[2] == input[2 * G + 1]); +} + +int main() { + queue q; + std::string version = q.get_device().get_info(); + if (version < std::string("2.0")) { + std::cout << "Skipping test\n"; + return 0; + } + + constexpr int N = 16; + std::array input; + std::array output; + std::iota(input.begin(), input.end(), 1); + std::fill(output.begin(), output.end(), false); + + test(q, input, output); + + std::cout << "Test passed." << std::endl; +} diff --git a/sycl/test/group-algorithm/exclusive_scan.cpp b/sycl/test/group-algorithm/exclusive_scan.cpp new file mode 100644 index 0000000000000..fad4777a7cec1 --- /dev/null +++ b/sycl/test/group-algorithm/exclusive_scan.cpp @@ -0,0 +1,144 @@ +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: env SYCL_DEVICE_TYPE=HOST %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include +#include +#include +#include +#include +#include +using namespace sycl; +using namespace sycl::intel; + +template +class exclusive_scan_kernel; + +// std::exclusive_scan isn't implemented yet, so use serial implementation +// instead +namespace emu { +template +OutputIterator exclusive_scan(InputIterator first, InputIterator last, + OutputIterator result, T init, + BinaryOperation binary_op) { + T partial = init; + for (InputIterator it = first; it != last; ++it) { + *(result++) = partial; + partial = binary_op(partial, *it); + } + return result; +} +} // namespace emu + +template +void test(queue q, InputContainer input, OutputContainer output, + BinaryOperation binary_op, + typename OutputContainer::value_type identity) { + typedef typename InputContainer::value_type InputT; + typedef typename OutputContainer::value_type OutputT; + typedef class exclusive_scan_kernel kernel_name0; + typedef class exclusive_scan_kernel kernel_name1; + typedef class exclusive_scan_kernel kernel_name2; + typedef class exclusive_scan_kernel kernel_name3; + OutputT init = 42; + size_t N = input.size(); + size_t G = 16; + std::vector expected(N); + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto in = in_buf.template get_access(cgh); + auto out = out_buf.template get_access(cgh); + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + int lid = it.get_local_id(0); + out[lid] = exclusive_scan(g, in[lid], binary_op); + }); + }); + } + emu::exclusive_scan(input.begin(), input.begin() + G, expected.begin(), + identity, binary_op); + assert(std::equal(output.begin(), output.begin() + G, expected.begin())); + + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto in = in_buf.template get_access(cgh); + auto out = out_buf.template get_access(cgh); + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + int lid = it.get_local_id(0); + out[lid] = exclusive_scan(g, in[lid], init, binary_op); + }); + }); + } + emu::exclusive_scan(input.begin(), input.begin() + G, expected.begin(), init, + binary_op); + assert(std::equal(output.begin(), output.begin() + G, expected.begin())); + + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto in = in_buf.template get_access(cgh); + auto out = out_buf.template get_access(cgh); + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + exclusive_scan(g, in.get_pointer(), in.get_pointer() + N, + out.get_pointer(), binary_op); + }); + }); + } + emu::exclusive_scan(input.begin(), input.begin() + N, expected.begin(), + identity, binary_op); + assert(std::equal(output.begin(), output.begin() + N, expected.begin())); + + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto in = in_buf.template get_access(cgh); + auto out = out_buf.template get_access(cgh); + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + exclusive_scan(g, in.get_pointer(), in.get_pointer() + N, + out.get_pointer(), init, binary_op); + }); + }); + } + emu::exclusive_scan(input.begin(), input.begin() + N, expected.begin(), init, + binary_op); + assert(std::equal(output.begin(), output.begin() + N, expected.begin())); +} + +int main() { + queue q; + std::string version = q.get_device().get_info(); + if (version < std::string("2.0")) { + std::cout << "Skipping test\n"; + return 0; + } + + constexpr int N = 32; + std::array input; + std::array output; + std::iota(input.begin(), input.end(), 0); + std::fill(output.begin(), output.end(), 0); + +#if __cplusplus >= 201402L + test(q, input, output, plus<>(), 0); + test(q, input, output, minimum<>(), std::numeric_limits::max()); + test(q, input, output, maximum<>(), std::numeric_limits::lowest()); +#endif + test(q, input, output, plus(), 0); + test(q, input, output, minimum(), std::numeric_limits::max()); + test(q, input, output, maximum(), std::numeric_limits::lowest()); + + std::cout << "Test passed." << std::endl; +} diff --git a/sycl/test/group-algorithm/inclusive_scan.cpp b/sycl/test/group-algorithm/inclusive_scan.cpp new file mode 100644 index 0000000000000..54d79f72e5395 --- /dev/null +++ b/sycl/test/group-algorithm/inclusive_scan.cpp @@ -0,0 +1,144 @@ +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: env SYCL_DEVICE_TYPE=HOST %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include +#include +#include +#include +#include +#include +using namespace sycl; +using namespace sycl::intel; + +template +class inclusive_scan_kernel; + +// std::inclusive_scan isn't implemented yet, so use serial implementation +// instead +namespace emu { +template +OutputIterator inclusive_scan(InputIterator first, InputIterator last, + OutputIterator result, BinaryOperation binary_op, + T init) { + T partial = init; + for (InputIterator it = first; it != last; ++it) { + partial = binary_op(partial, *it); + *(result++) = partial; + } + return result; +} +} // namespace emu + +template +void test(queue q, InputContainer input, OutputContainer output, + BinaryOperation binary_op, + typename OutputContainer::value_type identity) { + typedef typename InputContainer::value_type InputT; + typedef typename OutputContainer::value_type OutputT; + typedef class inclusive_scan_kernel kernel_name0; + typedef class inclusive_scan_kernel kernel_name1; + typedef class inclusive_scan_kernel kernel_name2; + typedef class inclusive_scan_kernel kernel_name3; + OutputT init = 42; + size_t N = input.size(); + size_t G = 16; + std::vector expected(N); + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto in = in_buf.template get_access(cgh); + auto out = out_buf.template get_access(cgh); + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + int lid = it.get_local_id(0); + out[lid] = inclusive_scan(g, in[lid], binary_op); + }); + }); + } + emu::inclusive_scan(input.begin(), input.begin() + G, expected.begin(), + binary_op, identity); + assert(std::equal(output.begin(), output.begin() + G, expected.begin())); + + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto in = in_buf.template get_access(cgh); + auto out = out_buf.template get_access(cgh); + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + int lid = it.get_local_id(0); + out[lid] = inclusive_scan(g, in[lid], binary_op, init); + }); + }); + } + emu::inclusive_scan(input.begin(), input.begin() + G, expected.begin(), + binary_op, init); + assert(std::equal(output.begin(), output.begin() + G, expected.begin())); + + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto in = in_buf.template get_access(cgh); + auto out = out_buf.template get_access(cgh); + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + inclusive_scan(g, in.get_pointer(), in.get_pointer() + N, + out.get_pointer(), binary_op); + }); + }); + } + emu::inclusive_scan(input.begin(), input.begin() + N, expected.begin(), + binary_op, identity); + assert(std::equal(output.begin(), output.begin() + N, expected.begin())); + + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto in = in_buf.template get_access(cgh); + auto out = out_buf.template get_access(cgh); + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + inclusive_scan(g, in.get_pointer(), in.get_pointer() + N, + out.get_pointer(), binary_op, init); + }); + }); + } + emu::inclusive_scan(input.begin(), input.begin() + N, expected.begin(), + binary_op, init); + assert(std::equal(output.begin(), output.begin() + N, expected.begin())); +} + +int main() { + queue q; + std::string version = q.get_device().get_info(); + if (version < std::string("2.0")) { + std::cout << "Skipping test\n"; + return 0; + } + + constexpr int N = 32; + std::array input; + std::array output; + std::iota(input.begin(), input.end(), 0); + std::fill(output.begin(), output.end(), 0); + +#if __cplusplus >= 201402L + test(q, input, output, plus<>(), 0); + test(q, input, output, minimum<>(), std::numeric_limits::max()); + test(q, input, output, maximum<>(), std::numeric_limits::lowest()); +#endif + test(q, input, output, plus(), 0); + test(q, input, output, minimum(), std::numeric_limits::max()); + test(q, input, output, maximum(), std::numeric_limits::lowest()); + + std::cout << "Test passed." << std::endl; +} diff --git a/sycl/test/group-algorithm/leader.cpp b/sycl/test/group-algorithm/leader.cpp new file mode 100644 index 0000000000000..3e0bad4706cfc --- /dev/null +++ b/sycl/test/group-algorithm/leader.cpp @@ -0,0 +1,47 @@ +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: env SYCL_DEVICE_TYPE=HOST %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include +#include +using namespace sycl; +using namespace sycl::intel; + +class leader_kernel; + +void test(queue q) { + typedef class leader_kernel kernel_name; + int out = 0; + size_t G = 4; + + range<2> R(G, G); + { + buffer out_buf(&out, 1); + + q.submit([&](handler &cgh) { + auto out = out_buf.template get_access(cgh); + cgh.parallel_for(nd_range<2>(R, R), [=](nd_item<2> it) { + group<2> g = it.get_group(); + if (leader(g)) { + out[0] += 1; + } + }); + }); + } + assert(out == 1); +} + +int main() { + queue q; + std::string version = q.get_device().get_info(); + if (version < std::string("2.0")) { + std::cout << "Skipping test\n"; + return 0; + } + + test(q); + + std::cout << "Test passed." << std::endl; +} diff --git a/sycl/test/group-algorithm/none_of.cpp b/sycl/test/group-algorithm/none_of.cpp new file mode 100644 index 0000000000000..d0ef19b8ed3ea --- /dev/null +++ b/sycl/test/group-algorithm/none_of.cpp @@ -0,0 +1,74 @@ +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: env SYCL_DEVICE_TYPE=HOST %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include +#include +#include +#include +using namespace sycl; +using namespace sycl::intel; + +template +class none_of_kernel; + +struct GeZero { + bool operator()(int i) const { return i >= 0; } +}; +struct IsEven { + bool operator()(int i) const { return (i % 2) == 0; } +}; +struct LtZero { + bool operator()(int i) const { return i < 0; } +}; + +template +void test(queue q, InputContainer input, OutputContainer output, + Predicate pred) { + typedef class none_of_kernel kernel_name; + size_t N = input.size(); + size_t G = 16; + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto in = in_buf.get_access(cgh); + auto out = out_buf.get_access(cgh); + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + int lid = it.get_local_id(0); + out[0] = none_of(g, pred(in[lid])); + out[1] = none_of(g, in[lid], pred); + out[2] = none_of(g, in.get_pointer(), in.get_pointer() + N, pred); + }); + }); + } + bool expected = std::none_of(input.begin(), input.end(), pred); + assert(output[0] == expected); + assert(output[1] == expected); + assert(output[2] == expected); +} + +int main() { + queue q; + std::string version = q.get_device().get_info(); + if (version < std::string("2.0")) { + std::cout << "Skipping test\n"; + return 0; + } + + constexpr int N = 32; + std::array input; + std::array output; + std::iota(input.begin(), input.end(), 0); + std::fill(output.begin(), output.end(), false); + + test(q, input, output, GeZero()); + test(q, input, output, IsEven()); + test(q, input, output, LtZero()); + + std::cout << "Test passed." << std::endl; +} diff --git a/sycl/test/group-algorithm/reduce.cpp b/sycl/test/group-algorithm/reduce.cpp new file mode 100644 index 0000000000000..988c40f245ff7 --- /dev/null +++ b/sycl/test/group-algorithm/reduce.cpp @@ -0,0 +1,82 @@ +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: env SYCL_DEVICE_TYPE=HOST %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include +#include +#include +#include +#include +using namespace sycl; +using namespace sycl::intel; + +template +class reduce_kernel; + +template +void test(queue q, InputContainer input, OutputContainer output, + BinaryOperation binary_op, + typename OutputContainer::value_type identity) { + typedef typename InputContainer::value_type InputT; + typedef typename OutputContainer::value_type OutputT; + typedef class reduce_kernel kernel_name; + OutputT init = 42; + size_t N = input.size(); + size_t G = 16; + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto in = in_buf.template get_access(cgh); + auto out = out_buf.template get_access(cgh); + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + int lid = it.get_local_id(0); + out[0] = reduce(g, in[lid], binary_op); + out[1] = reduce(g, in[lid], init, binary_op); + out[2] = reduce(g, in.get_pointer(), in.get_pointer() + N, binary_op); + out[3] = + reduce(g, in.get_pointer(), in.get_pointer() + N, init, binary_op); + }); + }); + } + // std::reduce is not implemented yet, so use std::accumulate instead + assert(output[0] == std::accumulate(input.begin(), input.begin() + G, + identity, binary_op)); + assert(output[1] == + std::accumulate(input.begin(), input.begin() + G, init, binary_op)); + assert(output[2] == + std::accumulate(input.begin(), input.end(), identity, binary_op)); + assert(output[3] == + std::accumulate(input.begin(), input.end(), init, binary_op)); +} + +int main() { + queue q; + std::string version = q.get_device().get_info(); + if (version < std::string("2.0")) { + std::cout << "Skipping test\n"; + return 0; + } + + constexpr int N = 32; + std::array input; + std::array output; + std::iota(input.begin(), input.end(), 0); + std::fill(output.begin(), output.end(), 0); + +#if __cplusplus >= 201402L + test(q, input, output, plus<>(), 0); + test(q, input, output, minimum<>(), std::numeric_limits::max()); + test(q, input, output, maximum<>(), std::numeric_limits::lowest()); +#endif + test(q, input, output, plus(), 0); + test(q, input, output, minimum(), std::numeric_limits::max()); + test(q, input, output, maximum(), std::numeric_limits::lowest()); + + std::cout << "Test passed." << std::endl; +}