[SYCL] Fix big and non-uniform work-groups handling in reduction kernels #2859

Merged

93 changes: 38 additions & 55 deletions sycl/include/CL/sycl/ONEAPI/reduction.hpp
@@ -619,24 +619,19 @@ struct get_reduction_aux_kernel_name_t {
///
/// Briefly: calls user's lambda, ONEAPI::reduce() + atomic, INT + ADD/MIN/MAX.
template <typename KernelName, typename KernelType, int Dims, class Reduction,
bool UniformWG, typename OutputT>
bool IsPow2WG, typename OutputT>
enable_if_t<Reduction::has_fast_reduce && Reduction::has_fast_atomics>
reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
Reduction &, OutputT Out) {
size_t NWorkItems = Range.get_global_range().size();
using Name = typename get_reduction_main_kernel_name_t<
KernelName, KernelType, Reduction::is_usm, UniformWG, OutputT>::name;
KernelName, KernelType, Reduction::is_usm, IsPow2WG, OutputT>::name;
CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
// Call user's function. Reducer.MValue gets initialized there.
typename Reduction::reducer_type Reducer;
KernelFunc(NDIt, Reducer);

typename Reduction::binary_operation BOp;
typename Reduction::result_type Val =
(UniformWG || NDIt.get_global_linear_id() < NWorkItems)
? Reducer.MValue
: Reducer.getIdentity();
Reducer.MValue = ONEAPI::reduce(NDIt.get_group(), Val, BOp);
Reducer.MValue = ONEAPI::reduce(NDIt.get_group(), Reducer.MValue, BOp);
if (NDIt.get_local_linear_id() == 0)
Reducer.atomic_combine(Reduction::getOutPointer(Out));
});
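For context, here is a minimal self-contained sketch of the pattern this kernel implements: each group reduces its items' values with the group algorithm, then the group leader atomically folds the partial result into the output. It is written against the SYCL 2020 spellings (reduce_over_group, atomic_ref) rather than the ONEAPI::reduce/atomic_combine used in this tree, and the queue setup and sizes are illustrative:

#include <sycl/sycl.hpp>

int main() {
  sycl::queue Q;
  int *Out = sycl::malloc_shared<int>(1, Q);
  *Out = 0;
  // Each work-group reduces its items' values with the group algorithm;
  // the group leader then atomically folds the partial result into *Out.
  Q.parallel_for(sycl::nd_range<1>{256, 64}, [=](sycl::nd_item<1> It) {
     int Val = int(It.get_global_linear_id()); // stand-in for user's lambda
     int Partial =
         sycl::reduce_over_group(It.get_group(), Val, sycl::plus<int>());
     if (It.get_local_linear_id() == 0)
       sycl::atomic_ref<int, sycl::memory_order::relaxed,
                        sycl::memory_scope::device>(Out[0]) += Partial;
   }).wait();
  // *Out == 0 + 1 + ... + 255 == 32640
  sycl::free(Out, Q);
  return 0;
}
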
@@ -651,22 +646,21 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
///
/// Briefly: calls user's lambda, tree-reduction + atomic, INT + AND/OR/XOR.
template <typename KernelName, typename KernelType, int Dims, class Reduction,
bool UniformPow2WG, typename OutputT>
bool IsPow2WG, typename OutputT>
enable_if_t<!Reduction::has_fast_reduce && Reduction::has_fast_atomics>
reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
Reduction &Redu, OutputT Out) {
size_t NWorkItems = Range.get_global_range().size();
size_t WGSize = Range.get_local_range().size();

// Use local memory to reduce elements in work-groups into the zero-th element.
// If WGSize is not a power of two, then WGSize+1 elements are allocated.
// The additional last element is used to catch elements that could
// otherwise be lost in the tree-reduction algorithm used in the kernel.
size_t NLocalElements = WGSize + (UniformPow2WG ? 0 : 1);
size_t NLocalElements = WGSize + (IsPow2WG ? 0 : 1);
auto LocalReds = Redu.getReadWriteLocalAcc(NLocalElements, CGH);

using Name = typename get_reduction_main_kernel_name_t<
KernelName, KernelType, Reduction::is_usm, UniformPow2WG, OutputT>::name;
KernelName, KernelType, Reduction::is_usm, IsPow2WG, OutputT>::name;
CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
// Call user's functions. Reducer.MValue gets initialized there.
typename Reduction::reducer_type Reducer;
@@ -676,12 +670,9 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
size_t LID = NDIt.get_local_linear_id();

// Copy the element to local memory to prepare it for tree-reduction.
typename Reduction::result_type ReduIdentity = Reducer.getIdentity();
LocalReds[LID] = (UniformPow2WG || NDIt.get_global_linear_id() < NWorkItems)
? Reducer.MValue
: ReduIdentity;
if (!UniformPow2WG)
LocalReds[WGSize] = ReduIdentity;
LocalReds[LID] = Reducer.MValue;
if (!IsPow2WG)
LocalReds[WGSize] = Reducer.getIdentity();
NDIt.barrier();

// Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0].
@@ -692,15 +683,15 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
if (LID < CurStep)
LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
else if (!UniformPow2WG && LID == CurStep && (PrevStep & 0x1))
else if (!IsPow2WG && LID == CurStep && (PrevStep & 0x1))
LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
NDIt.barrier();
PrevStep = CurStep;
}

if (LID == 0) {
Reducer.MValue =
UniformPow2WG ? LocalReds[0] : BOp(LocalReds[0], LocalReds[WGSize]);
IsPow2WG ? LocalReds[0] : BOp(LocalReds[0], LocalReds[WGSize]);
Reducer.atomic_combine(Reduction::getOutPointer(Out));
}
});
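A host-side sketch of the tree-reduction loop above, showing how the extra slot LocalReds[WGSize] preserves the odd trailing element when the group size is not a power of two; the sequential inner loop stands in for work-items running in lock-step between barriers, and the data is made up:

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  const size_t WGSize = 6;                            // not a power of two
  std::vector<int> LocalReds = {1, 2, 3, 4, 5, 6, 0}; // WGSize + 1 slots
  auto BOp = [](int A, int B) { return A + B; };

  // On each halving step, the odd trailing element LocalReds[PrevStep - 1]
  // is folded into the extra slot instead of being dropped.
  size_t PrevStep = WGSize;
  for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
    for (size_t LID = 0; LID < WGSize; ++LID) {
      if (LID < CurStep)
        LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
      else if (LID == CurStep && (PrevStep & 0x1))
        LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
    }
    PrevStep = CurStep;
  }
  std::cout << BOp(LocalReds[0], LocalReds[WGSize]) << "\n"; // prints 21
  return 0;
}
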
@@ -712,14 +703,14 @@ enable_if_t<Reduction::has_fast_atomics>
reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
Reduction &Redu, OutputT Out) {

size_t NWorkItems = Range.get_global_range().size();
size_t WGSize = Range.get_local_range().size();
size_t NWorkGroups = Range.get_group_range().size();

bool HasUniformWG = NWorkGroups * WGSize == NWorkItems;
if (!Reduction::has_fast_reduce)
HasUniformWG = HasUniformWG && (WGSize & (WGSize - 1)) == 0;
if (HasUniformWG)
// If the work group size is not a power of 2, then the kernel runs some
// additional code and checks.
// If the reduction has fast reduce, then the kernel does not care whether
// the work group size is a power of 2 or not; assume true for such cases.
bool IsPow2WG = Reduction::has_fast_reduce || ((WGSize & (WGSize - 1)) == 0);
if (IsPow2WG)
reduCGFuncImpl<KernelName, KernelType, Dims, Reduction, true>(
CGH, KernelFunc, Range, Redu, Out);
else
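The dispatch above turns a runtime power-of-two test into a compile-time template parameter, so the pow-2 variant compiles without the extra-element code entirely. A reduced sketch of that pattern, with a hypothetical runKernelSketch standing in for the two reduCGFuncImpl instantiations:

#include <cstddef>
#include <iostream>

// Hypothetical stand-in for the two reduCGFuncImpl specializations.
template <bool IsPow2WG> void runKernelSketch(size_t WGSize) {
  if constexpr (IsPow2WG)
    std::cout << WGSize << ": pow-2 variant, no extra local element\n";
  else
    std::cout << WGSize << ": extra element catches the odd tail\n";
}

int main() {
  for (size_t WGSize : {64, 96}) {
    // A power of two has a single set bit, so x & (x - 1) clears it to zero.
    bool IsPow2WG = (WGSize & (WGSize - 1)) == 0;
    if (IsPow2WG)
      runKernelSketch<true>(WGSize);
    else
      runKernelSketch<false>(WGSize);
  }
  return 0;
}
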
@@ -736,33 +727,28 @@ reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
///
/// Briefly: user's lambda, ONEAPI::reduce(), FP + ADD/MIN/MAX.
template <typename KernelName, typename KernelType, int Dims, class Reduction,
bool UniformWG, typename OutputT>
bool IsPow2WG, typename OutputT>
enable_if_t<Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
Reduction &, OutputT Out) {

size_t NWorkItems = Range.get_global_range().size();
size_t NWorkGroups = Range.get_group_range().size();

// This additional check is needed for the 'read_write' accessor case only.
// It does not slow down the kernel writing to the 'discard_write' accessor as
// the condition seems to be resolved at compile time for 'discard_write'.
bool IsUpdateOfUserVar =
Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1;

using Name = typename get_reduction_main_kernel_name_t<
KernelName, KernelType, Reduction::is_usm, UniformWG, OutputT>::name;
KernelName, KernelType, Reduction::is_usm, IsPow2WG, OutputT>::name;
CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
// Call user's functions. Reducer.MValue gets initialized there.
typename Reduction::reducer_type Reducer;
KernelFunc(NDIt, Reducer);

// Compute the partial sum/reduction for the work-group.
size_t WGID = NDIt.get_group_linear_id();
typename Reduction::result_type PSum =
(UniformWG || (NDIt.get_group_linear_id() < NWorkItems))
? Reducer.MValue
: Reducer.getIdentity();
typename Reduction::result_type PSum = Reducer.MValue;
typename Reduction::binary_operation BOp;
PSum = ONEAPI::reduce(NDIt.get_group(), PSum, BOp);
if (NDIt.get_local_linear_id() == 0) {
@@ -782,11 +768,10 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
///
/// Briefly: user's lambda, tree-reduction, CUSTOM types/ops.
template <typename KernelName, typename KernelType, int Dims, class Reduction,
bool UniformPow2WG, typename OutputT>
bool IsPow2WG, typename OutputT>
enable_if_t<!Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
Reduction &Redu, OutputT Out) {
size_t NWorkItems = Range.get_global_range().size();
size_t WGSize = Range.get_local_range().size();
size_t NWorkGroups = Range.get_group_range().size();

@@ -797,11 +782,11 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
// If WGSize is not a power of two, then WGSize+1 elements are allocated.
// The additional last element is used to catch elements that could
// otherwise be lost in the tree-reduction algorithm.
size_t NumLocalElements = WGSize + (UniformPow2WG ? 0 : 1);
size_t NumLocalElements = WGSize + (IsPow2WG ? 0 : 1);
auto LocalReds = Redu.getReadWriteLocalAcc(NumLocalElements, CGH);
typename Reduction::result_type ReduIdentity = Redu.getIdentity();
using Name = typename get_reduction_main_kernel_name_t<
KernelName, KernelType, Reduction::is_usm, UniformPow2WG, OutputT>::name;
KernelName, KernelType, Reduction::is_usm, IsPow2WG, OutputT>::name;
auto BOp = Redu.getBinaryOperation();
CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
// Call user's functions. Reducer.MValue gets initialized there.
@@ -810,10 +795,9 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,

size_t WGSize = NDIt.get_local_range().size();
size_t LID = NDIt.get_local_linear_id();
size_t GID = NDIt.get_global_linear_id();
// Copy the element to local memory to prepare it for tree-reduction.
LocalReds[LID] = (GID < NWorkItems) ? Reducer.MValue : ReduIdentity;
if (!UniformPow2WG)
LocalReds[LID] = Reducer.MValue;
if (!IsPow2WG)
LocalReds[WGSize] = ReduIdentity;
NDIt.barrier();

@@ -824,7 +808,7 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
if (LID < CurStep)
LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
else if (!UniformPow2WG && LID == CurStep && (PrevStep & 0x1))
else if (!IsPow2WG && LID == CurStep && (PrevStep & 0x1))
LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
NDIt.barrier();
PrevStep = CurStep;
@@ -834,7 +818,7 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
if (LID == 0) {
size_t GrID = NDIt.get_group_linear_id();
typename Reduction::result_type PSum =
UniformPow2WG ? LocalReds[0] : BOp(LocalReds[0], LocalReds[WGSize]);
IsPow2WG ? LocalReds[0] : BOp(LocalReds[0], LocalReds[WGSize]);
if (IsUpdateOfUserVar)
PSum = BOp(*(Reduction::getOutPointer(Out)), PSum);
Reduction::getOutPointer(Out)[GrID] = PSum;
@@ -846,27 +830,25 @@ template <typename KernelName, typename KernelType, int Dims, class Reduction>
enable_if_t<!Reduction::has_fast_atomics>
reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
Reduction &Redu) {
size_t NWorkItems = Range.get_global_range().size();
size_t WGSize = Range.get_local_range().size();
size_t NWorkGroups = Range.get_group_range().size();

// The last work-group may be not fully loaded with work, or the work group
// size may be not power of two. Those two cases considered inefficient
// as they require additional code and checks in the kernel.
bool HasUniformWG = NWorkGroups * WGSize == NWorkItems;
if (!Reduction::has_fast_reduce)
HasUniformWG = HasUniformWG && ((WGSize & (WGSize - 1)) == 0);
// If the work group size is not a power of 2, then the kernel runs some
// additional code and checks.
// If the reduction has fast reduce, then the kernel does not care whether
// the work group size is a power of 2 or not; assume true for such cases.
bool IsPow2WG = Reduction::has_fast_reduce || ((WGSize & (WGSize - 1)) == 0);

if (Reduction::is_usm && NWorkGroups == 1) {
if (HasUniformWG)
if (IsPow2WG)
reduCGFuncImpl<KernelName, KernelType, Dims, Reduction, true>(
CGH, KernelFunc, Range, Redu, Redu.getUSMPointer());
else
reduCGFuncImpl<KernelName, KernelType, Dims, Reduction, false>(
CGH, KernelFunc, Range, Redu, Redu.getUSMPointer());
} else {
auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, CGH);
if (HasUniformWG)
if (IsPow2WG)
reduCGFuncImpl<KernelName, KernelType, Dims, Reduction, true>(
CGH, KernelFunc, Range, Redu, Out);
else
@@ -889,10 +871,10 @@ reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups,
size_t WGSize, Reduction &, InputT In, OutputT Out) {
using Name = typename get_reduction_aux_kernel_name_t<
KernelName, KernelType, Reduction::is_usm, UniformWG, OutputT>::name;

bool IsUpdateOfUserVar =
Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1;
nd_range<1> Range{range<1>(NWorkItems), range<1>(WGSize)};
range<1> GlobalRange = {UniformWG ? NWorkItems : NWorkGroups * WGSize};
nd_range<1> Range{GlobalRange, range<1>(WGSize)};
CGH.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
typename Reduction::binary_operation BOp;
size_t WGID = NDIt.get_group_linear_id();
@@ -936,7 +918,8 @@ reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups,
auto BOp = Redu.getBinaryOperation();
using Name = typename get_reduction_aux_kernel_name_t<
KernelName, KernelType, Reduction::is_usm, UniformPow2WG, OutputT>::name;
nd_range<1> Range{range<1>(NWorkItems), range<1>(WGSize)};
range<1> GlobalRange = {UniformPow2WG ? NWorkItems : NWorkGroups * WGSize};
nd_range<1> Range{GlobalRange, range<1>(WGSize)};
CGH.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
size_t WGSize = NDIt.get_local_range().size();
size_t LID = NDIt.get_local_linear_id();
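The common thread in both aux-kernel fixes above: an nd_range's global size must be divisible by its local size, so when the last work-group is not full the global range is rounded up to NWorkGroups * WGSize and the surplus items contribute the identity. A small arithmetic sketch (the ceil-division for NWorkGroups is an assumption of the sketch, and the numbers are made up):

#include <cstddef>
#include <iostream>

int main() {
  const size_t NWorkItems = 100, WGSize = 32;
  const size_t NWorkGroups = (NWorkItems + WGSize - 1) / WGSize; // 4
  const bool UniformWG = (NWorkGroups * WGSize == NWorkItems);   // false
  // The old code launched {NWorkItems, WGSize}, which is invalid when 100 is
  // not a multiple of 32; the fix pads the global range and masks extra items.
  const size_t GlobalSize = UniformWG ? NWorkItems : NWorkGroups * WGSize;
  std::cout << GlobalSize << "\n"; // 128; items with id >= 100 use identity
  return 0;
}
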
4 changes: 3 additions & 1 deletion sycl/include/CL/sycl/handler.hpp
@@ -1193,7 +1193,9 @@ class __SYCL_EXPORT handler {
size_t MaxWGSize = ONEAPI::detail::reduGetMaxWGSize(MQueue, OneElemSize);
if (Range.get_local_range().size() > MaxWGSize)
throw sycl::runtime_error("The implementation handling parallel_for with"
" reduction requires smaller work group size.",
" reduction requires work group size not bigger"
" than " +
std::to_string(MaxWGSize),
PI_INVALID_WORK_GROUP_SIZE);

// 1. Call the kernel that includes user's lambda function.
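A hedged usage sketch of the improved diagnostic, using the SYCL 2020 sycl::reduction spelling rather than the ONEAPI one from this era of the tree; the local size 1 << 14 is simply a value likely to exceed the device limit, and the runtime_error thrown above derives from the sycl::exception caught here:

#include <iostream>
#include <sycl/sycl.hpp>

int main() {
  sycl::queue Q;
  int *Sum = sycl::malloc_shared<int>(1, Q);
  *Sum = 0;
  try {
    Q.submit([&](sycl::handler &CGH) {
      CGH.parallel_for(sycl::nd_range<1>{1 << 20, 1 << 14},
                       sycl::reduction(Sum, sycl::plus<int>()),
                       [=](sycl::nd_item<1>, auto &S) { S += 1; });
    });
    Q.wait();
  } catch (const sycl::exception &E) {
    // Now reports e.g. "... requires work group size not bigger than 256"
    // instead of only asking for a "smaller work group size".
    std::cerr << E.what() << "\n";
  }
  sycl::free(Sum, Q);
  return 0;
}
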
35 changes: 31 additions & 4 deletions sycl/source/detail/reduction.cpp
@@ -52,18 +52,45 @@ __SYCL_EXPORT size_t
reduGetMaxWGSize(shared_ptr_class<sycl::detail::queue_impl> Queue,
size_t LocalMemBytesPerWorkItem) {
device Dev = Queue->get_device();
size_t WGSize = Dev.get_info<info::device::max_work_group_size>();
size_t MaxWGSize = Dev.get_info<info::device::max_work_group_size>();
size_t WGSizePerMem = MaxWGSize * 2;
size_t WGSize = MaxWGSize;
if (LocalMemBytesPerWorkItem != 0) {
size_t MemSize = Dev.get_info<info::device::local_mem_size>();
size_t WGSizePerMem = MemSize / LocalMemBytesPerWorkItem;
WGSizePerMem = MemSize / LocalMemBytesPerWorkItem;

// If the work group size is not pow of two, then an additional element
// If the work group size is NOT a power of two, then an additional element
// in local memory is needed for the reduction algorithm and thus the real
// work-group size requirement per available memory is stricter.
if ((WGSize & (WGSize - 1)) == 0)
if ((WGSizePerMem & (WGSizePerMem - 1)) != 0)
WGSizePerMem--;
WGSize = (std::min)(WGSizePerMem, WGSize);
}
// TODO: This is a temporary workaround for the bigger problem of detecting
// the maximal usable work-group size. The detection method used above is
// based on the maximal work-group size possible on the device, which is too
// risky as it may return too big a value. Even though it also tries to take
// the memory factor into consideration, it is too rough an estimation. For
// example, if (WGSize * LocalMemBytesPerWorkItem) is equal to local_mem_size,
// then the reduction local accessor takes all available local memory for its
// needs, not leaving any local memory for other kernel needs (barriers,
// builtin calls, etc), which often leads to crashes with CL_OUT_OF_RESOURCES
// errors, or in even worse cases may cause silent writes/clobbers of
// the local memory assigned to one work-group by code in another work-group.
// It seems the only good solution for this work-group detection problem is
// kernel precompilation and querying the kernel properties.
if (WGSize >= 4) {
// Return a twice smaller number, but do that only if the kernel
// is limited by memory, or uses the opencl:cpu backend, which
// surprisingly needs lots of resources to run kernels with reductions
// and often causes CL_OUT_OF_RESOURCES errors even when the reduction
// does not use local accessors.
if (WGSizePerMem < MaxWGSize * 2 ||
(Queue->get_device().is_cpu() &&
Queue->get_device().get_platform().get_backend() == backend::opencl))
WGSize /= 2;
}

return WGSize;
}

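Putting the new pieces together, a host-side sketch of the whole heuristic with the device and backend queries replaced by plain parameters; the function name and the IsOpenCLCpu flag are stand-ins for this sketch, not the real API:

#include <algorithm>
#include <cstddef>

size_t reduGetMaxWGSizeSketch(size_t MaxWGSize, size_t LocalMemSize,
                              size_t LocalMemBytesPerWorkItem,
                              bool IsOpenCLCpu) {
  size_t WGSizePerMem = MaxWGSize * 2; // sentinel: "not limited by memory"
  size_t WGSize = MaxWGSize;
  if (LocalMemBytesPerWorkItem != 0) {
    WGSizePerMem = LocalMemSize / LocalMemBytesPerWorkItem;
    // A non-pow-2 group needs one extra local element, so the memory-based
    // limit is one work-item stricter in that case.
    if ((WGSizePerMem & (WGSizePerMem - 1)) != 0)
      WGSizePerMem--;
    WGSize = std::min(WGSizePerMem, WGSize);
  }
  // Workaround: halve the estimate when memory-limited or on opencl:cpu,
  // both of which tend to hit CL_OUT_OF_RESOURCES with the full estimate.
  if (WGSize >= 4 && (WGSizePerMem < MaxWGSize * 2 || IsOpenCLCpu))
    WGSize /= 2;
  return WGSize;
}

int main() {
  // E.g. 1024 max WG size, 64 KiB local memory, 8 bytes per work-item:
  // memory allows 8192 items, so not memory-limited; a GPU keeps 1024.
  return int(reduGetMaxWGSizeSketch(1024, 64 * 1024, 8, false)) != 1024;
}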