Skip to content

[SYCL] Implement user-defined reduction extension #7587

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions sycl/include/sycl/detail/spirv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,8 @@ EnableIfBitcastBroadcast<T, IdT> GroupBroadcast(T x, IdT local_id) {
}
template <typename Group, typename T, typename IdT>
EnableIfGenericBroadcast<T, IdT> GroupBroadcast(T x, IdT local_id) {
T Result;
// Initialize with x to support type T without default constructor
T Result = x;
char *XBytes = reinterpret_cast<char *>(&x);
char *ResultBytes = reinterpret_cast<char *>(&Result);
auto BroadcastBytes = [=](size_t Offset, size_t Size) {
Expand Down Expand Up @@ -219,7 +220,8 @@ EnableIfGenericBroadcast<T> GroupBroadcast(T x, id<Dimensions> local_id) {
if (Dimensions == 1) {
return GroupBroadcast<Group>(x, local_id[0]);
}
T Result;
// Initialize with x to support type T without default constructor
T Result = x;
char *XBytes = reinterpret_cast<char *>(&x);
char *ResultBytes = reinterpret_cast<char *>(&Result);
auto BroadcastBytes = [=](size_t Offset, size_t Size) {
Expand Down
18 changes: 18 additions & 0 deletions sycl/include/sycl/detail/type_traits.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,18 @@ template <int Dimensions> class group;
namespace ext {
namespace oneapi {
struct sub_group;

namespace experimental {
template <typename Group, std::size_t Extent> class group_with_scratchpad;

namespace detail {
template <typename T> struct is_group_helper : std::false_type {};

template <typename Group, std::size_t Extent>
struct is_group_helper<group_with_scratchpad<Group, Extent>> : std::true_type {
};
} // namespace detail
} // namespace experimental
} // namespace oneapi
} // namespace ext

Expand Down Expand Up @@ -57,6 +69,12 @@ template <class T>
__SYCL_INLINE_CONSTEXPR bool is_group_v =
detail::is_group<T>::value || detail::is_sub_group<T>::value;

namespace ext::oneapi::experimental {
template <class T>
__SYCL_INLINE_CONSTEXPR bool is_group_helper_v =
detail::is_group_helper<std::decay_t<T>>::value;
} // namespace ext::oneapi::experimental

namespace detail {
// Type for Intel device UUID extension.
// For details about this extension, see
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
//==--- user_defined_reductions.hpp -- SYCL ext header file -=--*- C++ -*---==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#pragma once

#include <sycl/detail/defines.hpp>
#include <sycl/group_algorithm.hpp>

namespace sycl {
__SYCL_INLINE_VER_NAMESPACE(_V1) {
namespace ext::oneapi::experimental {

// ---- reduce_over_group
template <typename GroupHelper, typename T, typename BinaryOperation>
sycl::detail::enable_if_t<(is_group_helper_v<GroupHelper>), T>
reduce_over_group(GroupHelper group_helper, T x, BinaryOperation binary_op) {
if constexpr (sycl::detail::is_native_op<T, BinaryOperation>::value) {
return sycl::reduce_over_group(group_helper.get_group(), x, binary_op);
}
#ifdef __SYCL_DEVICE_ONLY__
T *Memory = reinterpret_cast<T *>(group_helper.get_memory().data());
auto g = group_helper.get_group();
Memory[g.get_local_linear_id()] = x;
group_barrier(g);
T result = Memory[0];
if (g.leader()) {
for (int i = 1; i < g.get_local_linear_range(); i++) {
result = binary_op(result, Memory[i]);
}
}
Comment on lines +31 to +35
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I doubt people would be happy with this naive implementation. Might be fine for the initial commit though.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is intentional. Making this through SPIR-V extensions is too long (spec, impl for them). The plan is to re-write the current using these extensions without rush if customers would not be happy with the performance.

group_barrier(g);
return group_broadcast(g, result);
#else
std::ignore = group_helper;
throw runtime_error("Group algorithms are not supported on host.",
PI_ERROR_INVALID_DEVICE);
#endif
}

template <typename GroupHelper, typename V, typename T,
typename BinaryOperation>
sycl::detail::enable_if_t<(is_group_helper_v<GroupHelper>), T>
reduce_over_group(GroupHelper group_helper, V x, T init,
BinaryOperation binary_op) {
if constexpr (sycl::detail::is_native_op<V, BinaryOperation>::value &&
sycl::detail::is_native_op<T, BinaryOperation>::value) {
return sycl::reduce_over_group(group_helper.get_group(), x, init,
binary_op);
}
#ifdef __SYCL_DEVICE_ONLY__
return binary_op(init, reduce_over_group(group_helper, x, binary_op));
#else
std::ignore = group_helper;
throw runtime_error("Group algorithms are not supported on host.",
PI_ERROR_INVALID_DEVICE);
#endif
}

// ---- joint_reduce
template <typename GroupHelper, typename Ptr, typename BinaryOperation>
sycl::detail::enable_if_t<(is_group_helper_v<GroupHelper> &&
sycl::detail::is_pointer<Ptr>::value),
typename std::iterator_traits<Ptr>::value_type>
joint_reduce(GroupHelper group_helper, Ptr first, Ptr last,
BinaryOperation binary_op) {
if constexpr (sycl::detail::is_native_op<
typename std::iterator_traits<Ptr>::value_type,
BinaryOperation>::value) {
return sycl::joint_reduce(group_helper.get_group(), first, last, binary_op);
}
#ifdef __SYCL_DEVICE_ONLY__
// TODO: the complexity is linear and not logarithmic. Something like
// https://github.com/intel/llvm/blob/8ebd912679f27943d8ef6c33a9775347dce6b80d/sycl/include/sycl/reduction.hpp#L1810-L1818
// might be applicable here.
using T = typename std::iterator_traits<Ptr>::value_type;
auto g = group_helper.get_group();
T partial = *(first + g.get_local_linear_id());
Ptr second = first + g.get_local_linear_range();
sycl::detail::for_each(g, second, last,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it work with non-pointer iterators?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably not, SYCL2020 sycl::joint_reduce also uses this approach. User passes sycl::accessor::get_pointer() to this func.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we claim the extension as supported with that implementation though?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At the moment this won't work with non-pointer iterators simply because we have enable_if at the beginning of the function, which requires Ptr to be a pointer type.

I would suggest that we go with the current approach as a first iteration of the extension: we will anyway only claim it as experimental for now.

@dm-vodopyanov, we should raise this question with spec writers, because both extension and core spec do not say whether Ptr can be a generic iterator or is it a mere pointer.

[&](const T &x) { partial = binary_op(partial, x); });
group_barrier(g);
return reduce_over_group(group_helper, partial, binary_op);
#else
std::ignore = group_helper;
std::ignore = first;
std::ignore = last;
std::ignore = binary_op;
throw runtime_error("Group algorithms are not supported on host.",
PI_ERROR_INVALID_DEVICE);
#endif
}

template <typename GroupHelper, typename Ptr, typename T,
typename BinaryOperation>
sycl::detail::enable_if_t<
(is_group_helper_v<GroupHelper> && sycl::detail::is_pointer<Ptr>::value), T>
joint_reduce(GroupHelper group_helper, Ptr first, Ptr last, T init,
BinaryOperation binary_op) {
if constexpr (sycl::detail::is_native_op<T, BinaryOperation>::value) {
return sycl::joint_reduce(group_helper.get_group(), first, last, init,
binary_op);
}
#ifdef __SYCL_DEVICE_ONLY__
return binary_op(init, joint_reduce(group_helper, first, last, binary_op));
#else
std::ignore = group_helper;
std::ignore = last;
throw runtime_error("Group algorithms are not supported on host.",
PI_ERROR_INVALID_DEVICE);
#endif
}
} // namespace ext::oneapi::experimental
} // __SYCL_INLINE_VER_NAMESPACE(_V1)
} // namespace sycl
1 change: 1 addition & 0 deletions sycl/include/sycl/feature_test.hpp.in
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ __SYCL_INLINE_VER_NAMESPACE(_V1) {
#define SYCL_EXT_ONEAPI_BACKEND_LEVEL_ZERO 3
#define SYCL_EXT_ONEAPI_USM_DEVICE_READ_ONLY 1
#define SYCL_EXT_ONEAPI_KERNEL_PROPERTIES 1
#define SYCL_EXT_ONEAPI_USER_DEFINED_REDUCTIONS 1
#cmakedefine01 SYCL_BUILD_PI_CUDA
#if SYCL_BUILD_PI_CUDA
#define SYCL_EXT_ONEAPI_BACKEND_CUDA 1
Expand Down
1 change: 1 addition & 0 deletions sycl/include/sycl/group_algorithm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <CL/__spirv/spirv_ops.hpp>
#include <CL/__spirv/spirv_types.hpp>
#include <CL/__spirv/spirv_vars.hpp>
#include <sycl/builtins.hpp>
#include <sycl/detail/spirv.hpp>
#include <sycl/detail/type_traits.hpp>
#include <sycl/ext/oneapi/experimental/group_sort.hpp>
Expand Down
1 change: 1 addition & 0 deletions sycl/include/sycl/group_barrier.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <sycl/detail/spirv.hpp>
#include <sycl/detail/type_traits.hpp>
#include <sycl/group.hpp>
#include <sycl/sub_group.hpp>

namespace sycl {
__SYCL_INLINE_VER_NAMESPACE(_V1) {
Expand Down