diff --git a/sycl/include/CL/sycl/ONEAPI/reduction.hpp b/sycl/include/CL/sycl/ONEAPI/reduction.hpp
index a11d3d89fb51..a0c415373b44 100644
--- a/sycl/include/CL/sycl/ONEAPI/reduction.hpp
+++ b/sycl/include/CL/sycl/ONEAPI/reduction.hpp
@@ -619,24 +619,19 @@ struct get_reduction_aux_kernel_name_t {
 ///
 /// Briefly: calls user's lambda, ONEAPI::reduce() + atomic, INT + ADD/MIN/MAX.
 template <typename KernelName, typename KernelType, int Dims, class Reduction,
-          bool UniformWG, typename OutputT>
+          bool IsPow2WG, typename OutputT>
 enable_if_t<Reduction::has_fast_reduce && Reduction::has_fast_atomics>
 reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
                Reduction &, OutputT Out) {
-  size_t NWorkItems = Range.get_global_range().size();
   using Name = typename get_reduction_main_kernel_name_t<
-      KernelName, KernelType, Reduction::is_usm, UniformWG, OutputT>::name;
+      KernelName, KernelType, Reduction::is_usm, IsPow2WG, OutputT>::name;
   CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
     // Call user's function. Reducer.MValue gets initialized there.
     typename Reduction::reducer_type Reducer;
     KernelFunc(NDIt, Reducer);
 
     typename Reduction::binary_operation BOp;
-    typename Reduction::result_type Val =
-        (UniformWG || NDIt.get_global_linear_id() < NWorkItems)
-            ? Reducer.MValue
-            : Reducer.getIdentity();
-    Reducer.MValue = ONEAPI::reduce(NDIt.get_group(), Val, BOp);
+    Reducer.MValue = ONEAPI::reduce(NDIt.get_group(), Reducer.MValue, BOp);
     if (NDIt.get_local_linear_id() == 0)
       Reducer.atomic_combine(Reduction::getOutPointer(Out));
   });
@@ -651,22 +646,21 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
 ///
 /// Briefly: calls user's lambda, tree-reduction + atomic, INT + AND/OR/XOR.
 template <typename KernelName, typename KernelType, int Dims, class Reduction,
-          bool UniformPow2WG, typename OutputT>
+          bool IsPow2WG, typename OutputT>
 enable_if_t<!Reduction::has_fast_reduce && Reduction::has_fast_atomics>
 reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
                Reduction &Redu, OutputT Out) {
-  size_t NWorkItems = Range.get_global_range().size();
   size_t WGSize = Range.get_local_range().size();
 
   // Use local memory to reduce elements in work-groups into zero-th element.
   // If WGSize is not power of two, then WGSize+1 elements are allocated.
   // The additional last element is used to catch reduce elements that could
   // otherwise be lost in the tree-reduction algorithm used in the kernel.
-  size_t NLocalElements = WGSize + (UniformPow2WG ? 0 : 1);
+  size_t NLocalElements = WGSize + (IsPow2WG ? 0 : 1);
   auto LocalReds = Redu.getReadWriteLocalAcc(NLocalElements, CGH);
 
   using Name = typename get_reduction_main_kernel_name_t<
-      KernelName, KernelType, Reduction::is_usm, UniformPow2WG, OutputT>::name;
+      KernelName, KernelType, Reduction::is_usm, IsPow2WG, OutputT>::name;
   CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
     // Call user's functions. Reducer.MValue gets initialized there.
     typename Reduction::reducer_type Reducer;
@@ -676,12 +670,9 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
     size_t LID = NDIt.get_local_linear_id();
 
     // Copy the element to local memory to prepare it for tree-reduction.
-    typename Reduction::result_type ReduIdentity = Reducer.getIdentity();
-    LocalReds[LID] = (UniformPow2WG || NDIt.get_global_linear_id() < NWorkItems)
-                         ? Reducer.MValue
-                         : ReduIdentity;
-    if (!UniformPow2WG)
-      LocalReds[WGSize] = ReduIdentity;
+    LocalReds[LID] = Reducer.MValue;
+    if (!IsPow2WG)
+      LocalReds[WGSize] = Reducer.getIdentity();
     NDIt.barrier();
 
     // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0].
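As background for the tree-reduction hunks above (and the loop that follows in
the next hunk): the sketch below is a host-side model of the algorithm, not
part of the patch, and treeReduce is a hypothetical name. The extra trailing
slot plays the role of LocalReds[WGSize]: whenever a reduction step has odd
width, the element at index PrevStep - 1 has no partner, so it is folded into
that slot instead of being dropped.

#include <cstddef>
#include <vector>

// Host-side model of the kernel's tree reduction over one work-group.
// 'Vals' plays the role of LocalReds[0..WGSize-1]; one extra trailing slot
// catches the element that an odd-width step would otherwise lose.
template <typename T, typename BinOp>
T treeReduce(std::vector<T> Vals, T Identity, BinOp BOp) {
  size_t WGSize = Vals.size();
  Vals.push_back(Identity); // the LocalReds[WGSize] slot

  size_t PrevStep = WGSize;
  for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
    for (size_t LID = 0; LID < PrevStep; ++LID) { // work-items in lockstep
      if (LID < CurStep)
        Vals[LID] = BOp(Vals[LID], Vals[LID + CurStep]);
      else if (LID == CurStep && (PrevStep & 0x1))
        // Odd PrevStep: element PrevStep - 1 has no partner; park it.
        Vals[WGSize] = BOp(Vals[WGSize], Vals[PrevStep - 1]);
    }
    PrevStep = CurStep;
  }
  return BOp(Vals[0], Vals[WGSize]);
}

For a power-of-two work-group size every step has even width, the extra slot
never moves off the identity, and the IsPow2WG = true specialization can skip
it entirely.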
@@ -692,7 +683,7 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
     for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
       if (LID < CurStep)
         LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
-      else if (!UniformPow2WG && LID == CurStep && (PrevStep & 0x1))
+      else if (!IsPow2WG && LID == CurStep && (PrevStep & 0x1))
         LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
       NDIt.barrier();
       PrevStep = CurStep;
@@ -700,7 +691,7 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
     if (LID == 0) {
       Reducer.MValue =
-          UniformPow2WG ? LocalReds[0] : BOp(LocalReds[0], LocalReds[WGSize]);
+          IsPow2WG ? LocalReds[0] : BOp(LocalReds[0], LocalReds[WGSize]);
       Reducer.atomic_combine(Reduction::getOutPointer(Out));
     }
   });
@@ -712,14 +703,14 @@
 enable_if_t<Reduction::has_fast_atomics>
 reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
            Reduction &Redu, OutputT Out) {
-  size_t NWorkItems = Range.get_global_range().size();
   size_t WGSize = Range.get_local_range().size();
-  size_t NWorkGroups = Range.get_group_range().size();
 
-  bool HasUniformWG = NWorkGroups * WGSize == NWorkItems;
-  if (!Reduction::has_fast_reduce)
-    HasUniformWG = HasUniformWG && (WGSize & (WGSize - 1)) == 0;
-  if (HasUniformWG)
+  // If the work-group size is not a power of two, then the kernel must run
+  // additional code and checks.
+  // If the reduction has a fast reduce operation, the kernel does not care
+  // whether the work-group size is a power of two; assume true in that case.
+  bool IsPow2WG = Reduction::has_fast_reduce || ((WGSize & (WGSize - 1)) == 0);
+  if (IsPow2WG)
     reduCGFuncImpl<KernelName, KernelType, Dims, Reduction, true>(
         CGH, KernelFunc, Range, Redu, Out);
   else
@@ -736,14 +727,12 @@ reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
 ///
 /// Briefly: user's lambda, ONEAPI:reduce(), FP + ADD/MIN/MAX.
 template <typename KernelName, typename KernelType, int Dims, class Reduction,
-          bool UniformWG, typename OutputT>
+          bool IsPow2WG, typename OutputT>
 enable_if_t<Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
 reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
                Reduction &, OutputT Out) {
-  size_t NWorkItems = Range.get_global_range().size();
   size_t NWorkGroups = Range.get_group_range().size();
-
   // This additional check is needed for 'read_write' accessor case only.
   // It does not slow-down the kernel writing to 'discard_write' accessor as
   // the condition seems to be resolved at compile time for 'discard_write'.
@@ -751,7 +740,7 @@
       Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1;
 
   using Name = typename get_reduction_main_kernel_name_t<
-      KernelName, KernelType, Reduction::is_usm, UniformWG, OutputT>::name;
+      KernelName, KernelType, Reduction::is_usm, IsPow2WG, OutputT>::name;
   CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
     // Call user's functions. Reducer.MValue gets initialized there.
     typename Reduction::reducer_type Reducer;
@@ -759,10 +748,7 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
     KernelFunc(NDIt, Reducer);
     // Compute the partial sum/reduction for the work-group.
     size_t WGID = NDIt.get_group_linear_id();
-    typename Reduction::result_type PSum =
-        (UniformWG || (NDIt.get_group_linear_id() < NWorkItems))
-            ? Reducer.MValue
-            : Reducer.getIdentity();
+    typename Reduction::result_type PSum = Reducer.MValue;
     typename Reduction::binary_operation BOp;
     PSum = ONEAPI::reduce(NDIt.get_group(), PSum, BOp);
     if (NDIt.get_local_linear_id() == 0) {
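The IsPow2WG dispatch above relies on a standard bit trick. A minimal
illustration (isPow2 is a hypothetical name, not from the patch):

#include <cassert>
#include <cstddef>

// A power of two has exactly one set bit, so clearing the lowest set bit
// via X & (X - 1) yields zero precisely for powers of two. Note that the
// expression also reports true for X == 0.
constexpr bool isPow2(size_t X) { return (X & (X - 1)) == 0; }

int main() {
  assert(isPow2(1) && isPow2(2) && isPow2(64) && isPow2(1024));
  assert(!isPow2(3) && !isPow2(6) && !isPow2(1000));
  return 0;
}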
@@ -782,11 +768,10 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
 ///
 /// Briefly: user's lambda, tree-reduction, CUSTOM types/ops.
 template <typename KernelName, typename KernelType, int Dims, class Reduction,
-          bool UniformPow2WG, typename OutputT>
+          bool IsPow2WG, typename OutputT>
 enable_if_t<!Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
 reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
                Reduction &Redu, OutputT Out) {
-  size_t NWorkItems = Range.get_global_range().size();
   size_t WGSize = Range.get_local_range().size();
   size_t NWorkGroups = Range.get_group_range().size();
@@ -797,11 +782,11 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
   // If WGSize is not power of two, then WGSize+1 elements are allocated.
   // The additional last element is used to catch elements that could
   // otherwise be lost in the tree-reduction algorithm.
-  size_t NumLocalElements = WGSize + (UniformPow2WG ? 0 : 1);
+  size_t NumLocalElements = WGSize + (IsPow2WG ? 0 : 1);
   auto LocalReds = Redu.getReadWriteLocalAcc(NumLocalElements, CGH);
   typename Reduction::result_type ReduIdentity = Redu.getIdentity();
   using Name = typename get_reduction_main_kernel_name_t<
-      KernelName, KernelType, Reduction::is_usm, UniformPow2WG, OutputT>::name;
+      KernelName, KernelType, Reduction::is_usm, IsPow2WG, OutputT>::name;
   auto BOp = Redu.getBinaryOperation();
   CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
     // Call user's functions. Reducer.MValue gets initialized there.
@@ -810,10 +795,9 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
     size_t WGSize = NDIt.get_local_range().size();
     size_t LID = NDIt.get_local_linear_id();
-    size_t GID = NDIt.get_global_linear_id();
 
     // Copy the element to local memory to prepare it for tree-reduction.
-    LocalReds[LID] = (GID < NWorkItems) ? Reducer.MValue : ReduIdentity;
-    if (!UniformPow2WG)
+    LocalReds[LID] = Reducer.MValue;
+    if (!IsPow2WG)
       LocalReds[WGSize] = ReduIdentity;
     NDIt.barrier();
@@ -824,7 +808,7 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
     for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
       if (LID < CurStep)
         LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
-      else if (!UniformPow2WG && LID == CurStep && (PrevStep & 0x1))
+      else if (!IsPow2WG && LID == CurStep && (PrevStep & 0x1))
         LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
       NDIt.barrier();
       PrevStep = CurStep;
@@ -834,7 +818,7 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
     if (LID == 0) {
       size_t GrID = NDIt.get_group_linear_id();
       typename Reduction::result_type PSum =
-          UniformPow2WG ? LocalReds[0] : BOp(LocalReds[0], LocalReds[WGSize]);
+          IsPow2WG ? LocalReds[0] : BOp(LocalReds[0], LocalReds[WGSize]);
       if (IsUpdateOfUserVar)
         PSum = BOp(*(Reduction::getOutPointer(Out)), PSum);
       Reduction::getOutPointer(Out)[GrID] = PSum;
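Unlike the fast-atomics kernels, this family writes one partial sum per
work-group, so the runtime launches auxiliary kernels until a single value
remains. The sketch below is a hypothetical host-side model of that
multi-pass scheme, not the actual SYCL implementation:

#include <cstddef>
#include <utility>
#include <vector>

// Each pass reduces every chunk of WGSize values (one "work-group") to a
// single partial sum; passes repeat until one value remains, mirroring the
// main kernel followed by auxiliary-kernel launches.
template <typename T, typename BinOp>
T multiPassReduce(std::vector<T> Vals, T Identity, size_t WGSize, BinOp BOp) {
  while (Vals.size() > 1) {
    size_t NWorkGroups = (Vals.size() + WGSize - 1) / WGSize;
    std::vector<T> Partial(NWorkGroups, Identity);
    for (size_t I = 0; I < Vals.size(); ++I)
      Partial[I / WGSize] = BOp(Partial[I / WGSize], Vals[I]);
    Vals = std::move(Partial); // input of the next (auxiliary) pass
  }
  return Vals.empty() ? Identity : Vals[0];
}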
@@ -846,19 +830,17 @@ template <typename KernelName, typename KernelType, int Dims, class Reduction>
 enable_if_t<!Reduction::has_fast_atomics>
 reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
            Reduction &Redu) {
-  size_t NWorkItems = Range.get_global_range().size();
   size_t WGSize = Range.get_local_range().size();
   size_t NWorkGroups = Range.get_group_range().size();
 
-  // The last work-group may be not fully loaded with work, or the work group
-  // size may be not power of two. Those two cases considered inefficient
-  // as they require additional code and checks in the kernel.
-  bool HasUniformWG = NWorkGroups * WGSize == NWorkItems;
-  if (!Reduction::has_fast_reduce)
-    HasUniformWG = HasUniformWG && ((WGSize & (WGSize - 1)) == 0);
+  // If the work-group size is not a power of two, then the kernel must run
+  // additional code and checks.
+  // If the reduction has a fast reduce operation, the kernel does not care
+  // whether the work-group size is a power of two; assume true in that case.
+  bool IsPow2WG = Reduction::has_fast_reduce || ((WGSize & (WGSize - 1)) == 0);
 
   if (Reduction::is_usm && NWorkGroups == 1) {
-    if (HasUniformWG)
+    if (IsPow2WG)
       reduCGFuncImpl<KernelName, KernelType, Dims, Reduction, true>(
           CGH, KernelFunc, Range, Redu, Redu.getUSMPointer());
     else
@@ -866,7 +848,7 @@ reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
           CGH, KernelFunc, Range, Redu, Redu.getUSMPointer());
   } else {
     auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, CGH);
-    if (HasUniformWG)
+    if (IsPow2WG)
       reduCGFuncImpl<KernelName, KernelType, Dims, Reduction, true>(
           CGH, KernelFunc, Range, Redu, Out);
     else
@@ -889,10 +871,10 @@ reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups,
                   size_t WGSize, Reduction &, InputT In, OutputT Out) {
   using Name = typename get_reduction_aux_kernel_name_t<
       KernelName, KernelType, Reduction::is_usm, UniformWG, OutputT>::name;
-
   bool IsUpdateOfUserVar =
       Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1;
-  nd_range<1> Range{range<1>(NWorkItems), range<1>(WGSize)};
+  range<1> GlobalRange = {UniformWG ? NWorkItems : NWorkGroups * WGSize};
+  nd_range<1> Range{GlobalRange, range<1>(WGSize)};
   CGH.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
     typename Reduction::binary_operation BOp;
     size_t WGID = NDIt.get_group_linear_id();
@@ -936,7 +918,8 @@ reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups,
   auto BOp = Redu.getBinaryOperation();
   using Name = typename get_reduction_aux_kernel_name_t<
       KernelName, KernelType, Reduction::is_usm, UniformPow2WG, OutputT>::name;
-  nd_range<1> Range{range<1>(NWorkItems), range<1>(WGSize)};
+  range<1> GlobalRange = {UniformPow2WG ? NWorkItems : NWorkGroups * WGSize};
+  nd_range<1> Range{GlobalRange, range<1>(WGSize)};
   CGH.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
     size_t WGSize = NDIt.get_local_range().size();
     size_t LID = NDIt.get_local_linear_id();
diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp
index fa678f28593f..50169dbd116e 100644
--- a/sycl/include/CL/sycl/handler.hpp
+++ b/sycl/include/CL/sycl/handler.hpp
@@ -1193,7 +1193,9 @@ class __SYCL_EXPORT handler {
     size_t MaxWGSize = ONEAPI::detail::reduGetMaxWGSize(MQueue, OneElemSize);
     if (Range.get_local_range().size() > MaxWGSize)
       throw sycl::runtime_error("The implementation handling parallel_for with"
-                                " reduction requires smaller work group size.",
+                                " reduction requires work group size not bigger"
+                                " than " +
+                                    std::to_string(MaxWGSize),
                                 PI_INVALID_WORK_GROUP_SIZE);
 
     // 1. Call the kernel that includes user's lambda function.
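The aux-kernel hunks above stop launching a possibly non-uniform range and
instead pad the global size up to a multiple of WGSize, so every work-group is
fully populated; work-items past NWorkItems are then masked inside the kernel.
A small model of the arithmetic (paddedGlobalSize is a hypothetical helper,
not from the patch):

#include <cassert>
#include <cstddef>

// Round the global size up to a multiple of WGSize. The items in the
// padded tail exist only to keep work-groups uniform and must be guarded
// against in the kernel body.
size_t paddedGlobalSize(size_t NWorkItems, size_t WGSize) {
  size_t NWorkGroups = (NWorkItems + WGSize - 1) / WGSize; // ceil division
  return NWorkGroups * WGSize;
}

int main() {
  assert(paddedGlobalSize(100, 32) == 128); // 4 groups of 32, 28 idle items
  assert(paddedGlobalSize(128, 32) == 128); // already uniform
  return 0;
}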
diff --git a/sycl/source/detail/reduction.cpp b/sycl/source/detail/reduction.cpp
index 910342977b13..db24ced823a0 100644
--- a/sycl/source/detail/reduction.cpp
+++ b/sycl/source/detail/reduction.cpp
@@ -52,18 +52,45 @@ __SYCL_EXPORT size_t
 reduGetMaxWGSize(shared_ptr_class<queue_impl> Queue,
                  size_t LocalMemBytesPerWorkItem) {
   device Dev = Queue->get_device();
-  size_t WGSize = Dev.get_info<info::device::max_work_group_size>();
+  size_t MaxWGSize = Dev.get_info<info::device::max_work_group_size>();
+  size_t WGSizePerMem = MaxWGSize * 2;
+  size_t WGSize = MaxWGSize;
   if (LocalMemBytesPerWorkItem != 0) {
     size_t MemSize = Dev.get_info<info::device::local_mem_size>();
-    size_t WGSizePerMem = MemSize / LocalMemBytesPerWorkItem;
+    WGSizePerMem = MemSize / LocalMemBytesPerWorkItem;
 
-    // If the work group size is not pow of two, then an additional element
+    // If the work group size is NOT power of two, then an additional element
     // in local memory is needed for the reduction algorithm and thus the real
     // work-group size requirement per available memory is stricter.
-    if ((WGSize & (WGSize - 1)) == 0)
+    if ((WGSizePerMem & (WGSizePerMem - 1)) != 0)
       WGSizePerMem--;
     WGSize = (std::min)(WGSizePerMem, WGSize);
   }
+  // TODO: This is a temporary workaround for a big problem of detecting
+  // the maximal usable work-group size. The detection method used above,
+  // based on the maximal work-group size possible on the device, is too
+  // risky as it may return too big a value. Even though it also tries to
+  // take the memory factor into consideration, it is too rough an estimate.
+  // For example, if (WGSize * LocalMemBytesPerWorkItem) equals
+  // local_mem_size, then the reduction local accessor takes all available
+  // local memory for its own needs, leaving no local memory for other kernel
+  // needs (barriers, builtin calls, etc), which often leads to crashes with
+  // a CL_OUT_OF_RESOURCES error, or in even worse cases to silent
+  // writes/clobbers of the local memory assigned to one work-group by code
+  // in another work-group. It seems the only good solution for this problem
+  // is kernel precompilation and querying the kernel properties.
+  if (WGSize >= 4) {
+    // Return a halved size, but do that only if the kernel is limited by
+    // memory, or uses the opencl:cpu backend, which surprisingly needs lots
+    // of resources to run kernels with reductions and often fails with a
+    // CL_OUT_OF_RESOURCES error even when the reduction does not use local
+    // accessors.
+    if (WGSizePerMem < MaxWGSize * 2 ||
+        (Queue->get_device().is_cpu() &&
+         Queue->get_device().get_platform().get_backend() == backend::opencl))
+      WGSize /= 2;
+  }
+
   return WGSize;
 }
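Putting the pieces of reduGetMaxWGSize together, the heuristic can be modeled
as a standalone function; this is an illustration only (the function name and
parameters are hypothetical), with the device queries replaced by plain
arguments:

#include <algorithm>
#include <cstddef>

// Cap the device maximum by what local memory permits, reserve one extra
// local element when the memory-derived size is not a power of two, and
// halve the result when the kernel is memory-bound or runs on opencl:cpu.
size_t maxReductionWGSize(size_t MaxWGSize, size_t LocalMemSize,
                          size_t LocalMemBytesPerWorkItem, bool IsOpenCLCpu) {
  size_t WGSizePerMem = MaxWGSize * 2; // sentinel: "not limited by memory"
  size_t WGSize = MaxWGSize;
  if (LocalMemBytesPerWorkItem != 0) {
    WGSizePerMem = LocalMemSize / LocalMemBytesPerWorkItem;
    if ((WGSizePerMem & (WGSizePerMem - 1)) != 0)
      WGSizePerMem--; // non-pow-2 sizes need one extra local element
    WGSize = std::min(WGSizePerMem, WGSize);
  }
  if (WGSize >= 4 && (WGSizePerMem < MaxWGSize * 2 || IsOpenCLCpu))
    WGSize /= 2; // leave headroom for the runtime's own local-memory needs
  return WGSize;
}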
diff --git a/sycl/test/on-device/reduction/reduction_big_data.cpp b/sycl/test/on-device/reduction/reduction_big_data.cpp
new file mode 100644
index 000000000000..cfd6711f36ae
--- /dev/null
+++ b/sycl/test/on-device/reduction/reduction_big_data.cpp
@@ -0,0 +1,109 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+//
+// RUNx: %RUN_ON_HOST %t.out
+// TODO: Enable the test for HOST when it supports ONEAPI::reduce() and
+// barrier()
+
+// This test performs basic checks of parallel_for(nd_range, reduction, func)
+// where the bigger data size and/or non-uniform work-group sizes may cause
+// errors.
+
+#include "reduction_utils.hpp"
+#include <CL/sycl.hpp>
+#include <cassert>
+#include <cstdlib>
+
+using namespace cl::sycl;
+
+template <typename... Ts> class KernelNameGroup;
+
+size_t getSafeMaxWGSize(size_t MaxWGSize, size_t MemSize, size_t OneElemSize) {
+  size_t MaxNumElems = MemSize / OneElemSize;
+  if ((MaxNumElems & (MaxNumElems - 1)) != 0)
+    MaxNumElems--; // Need 1 additional element in mem if not pow of 2
+  return std::min(MaxNumElems / 2, MaxWGSize);
+}
+
+template <typename Name, typename T, class BinaryOperation>
+void test(T Identity) {
+  queue Q;
+  device Device = Q.get_device();
+
+  std::size_t MaxWGSize = Device.get_info<info::device::max_work_group_size>();
+  std::size_t LocalMemSize = Device.get_info<info::device::local_mem_size>();
+  std::cout << "Detected device::max_work_group_size = " << MaxWGSize << "\n";
+  std::cout << "Detected device::local_mem_size = " << LocalMemSize << "\n";
+
+  size_t WGSize = getSafeMaxWGSize(MaxWGSize, LocalMemSize, sizeof(T));
+
+  size_t MaxGlobalMem = 2LL * 1024 * 1024 * 1024; // Don't use more than 2 GB
+  // Limit max global range by mem and also subtract 1 to make it non-uniform.
+  size_t MaxGlobalRange = MaxGlobalMem / sizeof(T) - 1;
+  size_t NWorkItems = std::min(WGSize * MaxWGSize + 1, MaxGlobalRange);
+
+  size_t NWorkGroups = (NWorkItems - 1) / WGSize + 1;
+  range<1> GlobalRange(NWorkGroups * WGSize);
+  range<1> LocalRange(WGSize);
+  nd_range<1> NDRange(GlobalRange, LocalRange);
+  std::cout << "Running the test with: GlobalRange = " << (NWorkGroups * WGSize)
+            << ", LocalRange = " << WGSize << ", NWorkItems = " << NWorkItems
+            << "\n";
+
+  buffer<T, 1> InBuf(NWorkItems);
+  buffer<T, 1> OutBuf(1);
+
+  // Initialize.
+  BinaryOperation BOp;
+  T CorrectOut;
+  initInputData(InBuf, CorrectOut, Identity, BOp, NWorkItems);
+
+  // Compute.
+  Q.submit([&](handler &CGH) {
+    auto In = InBuf.template get_access<access::mode::read>(CGH);
+    accessor<T, 1, access::mode::discard_write, access::target::global_buffer>
+        Out(OutBuf, CGH);
+    CGH.parallel_for<Name>(NDRange, ONEAPI::reduction(Out, Identity, BOp),
+                           [=](nd_item<1> NDIt, auto &Sum) {
+                             if (NDIt.get_global_linear_id() < NWorkItems)
+                               Sum.combine(
+                                   In[NDIt.get_global_linear_id()]);
+                           });
+  });
+
+  // Check correctness.
+  auto Out = OutBuf.template get_access<access::mode::read>();
+  T ComputedOut = *(Out.get_pointer());
+  if (ComputedOut != CorrectOut) {
+    std::cout << "Computed value: " << ComputedOut
+              << ", Expected value: " << CorrectOut << "\n";
+    assert(0 && "Wrong value.");
+  }
+  std::cout << "Test case passed\n\n";
+}
+
+template <typename T> struct BigCustomVec : public CustomVec<T> {
+  BigCustomVec() : CustomVec<T>() {}
+  BigCustomVec(T X, T Y) : CustomVec<T>(X, Y) {}
+  BigCustomVec(T V) : CustomVec<T>(V) {}
+  unsigned char OtherData[512 - sizeof(CustomVec<T>)];
+};
+
+template <class T> struct BigCustomVecPlus {
+  using CV = BigCustomVec<T>;
+  CV operator()(const CV &A, const CV &B) const {
+    return CV(A.X + B.X, A.Y + B.Y);
+  }
+};
+
+int main() {
+  test<KernelNameGroup<class Float>, float, ONEAPI::minimum<float>>(
+      getMinimumFPValue<float>());
+
+  using BCV = BigCustomVec<long long>;
+  test<KernelNameGroup<class BigVec>, BCV, BigCustomVecPlus<long long>>(
+      BCV(0));
+
+  std::cout << "Test passed\n";
+  return 0;
+}
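A note on the test's BigCustomVec: the OtherData member pads each element so
that sizeof(BigCustomVec<long long>) is exactly 512 bytes, which makes the
per-work-item local-memory cost large enough to exercise the memory-limited
path of reduGetMaxWGSize. A minimal standalone model of the padding trick
(Vec2 and Padded are hypothetical names, not from the test):

#include <cstddef>

// Tacking a byte array of size (512 - sizeof(Base)) onto a derived struct
// pads every element to exactly 512 bytes.
template <typename T> struct Vec2 {
  T X, Y;
};

template <typename T> struct Padded : Vec2<T> {
  unsigned char OtherData[512 - sizeof(Vec2<T>)];
};

static_assert(sizeof(Padded<long long>) == 512,
              "each reduction element occupies 512 bytes of local memory");

int main() { return 0; }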