From a9bc1ad309545312fb1167dbe1c0319c30370a1b Mon Sep 17 00:00:00 2001
From: Vyacheslav N Klochkov <vyacheslav.n.klochkov@intel.com>
Date: Thu, 15 Jul 2021 13:53:30 -0700
Subject: [PATCH] [SYCL] Add lit tests for reduction + range (#4101)

Signed-off-by: Vyacheslav N Klochkov <vyacheslav.n.klochkov@intel.com>
---
 SYCL/Reduction/reduction_range_1d_s0_dw.cpp | 56 +++++++++++++
 SYCL/Reduction/reduction_range_1d_s0_rw.cpp | 57 +++++++++++++
 SYCL/Reduction/reduction_range_1d_s1_dw.cpp | 57 +++++++++++++
 SYCL/Reduction/reduction_range_1d_s1_rw.cpp | 57 +++++++++++++
 SYCL/Reduction/reduction_range_2d_s1_dw.cpp | 55 ++++++++++++
 SYCL/Reduction/reduction_range_2d_s1_rw.cpp | 55 ++++++++++++
 SYCL/Reduction/reduction_range_3d_s1_dw.cpp | 68 +++++++++++++++
 SYCL/Reduction/reduction_range_3d_s1_rw.cpp | 68 +++++++++++++++
 SYCL/Reduction/reduction_range_lambda.cpp   | 50 +++++++++++
 SYCL/Reduction/reduction_range_scalar.hpp   | 92 +++++++++++++++++++++
 SYCL/Reduction/reduction_utils.hpp          | 58 ++++++++++++-
 11 files changed, 670 insertions(+), 3 deletions(-)
 create mode 100644 SYCL/Reduction/reduction_range_1d_s0_dw.cpp
 create mode 100644 SYCL/Reduction/reduction_range_1d_s0_rw.cpp
 create mode 100644 SYCL/Reduction/reduction_range_1d_s1_dw.cpp
 create mode 100644 SYCL/Reduction/reduction_range_1d_s1_rw.cpp
 create mode 100644 SYCL/Reduction/reduction_range_2d_s1_dw.cpp
 create mode 100644 SYCL/Reduction/reduction_range_2d_s1_rw.cpp
 create mode 100644 SYCL/Reduction/reduction_range_3d_s1_dw.cpp
 create mode 100644 SYCL/Reduction/reduction_range_3d_s1_rw.cpp
 create mode 100644 SYCL/Reduction/reduction_range_lambda.cpp
 create mode 100644 SYCL/Reduction/reduction_range_scalar.hpp
diff --git a/SYCL/Reduction/reduction_range_1d_s0_dw.cpp b/SYCL/Reduction/reduction_range_1d_s0_dw.cpp
new file mode 100644
index 0000000000..a000c9cfe7
--- /dev/null
+++ b/SYCL/Reduction/reduction_range_1d_s0_dw.cpp
@@ -0,0 +1,56 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// TODO: accelerator may not suport atomics required by the current
+// implementation. Enable testing when implementation is fixed.
+// RUNx: %ACC_RUN_PLACEHOLDER %t.out
+
+// This test performs basic checks of parallel_for(range<1>, reduction, func)
+// with reductions initialized with 0-dimensional discard_write accessor
+// accessing 1 element buffer.
+
+#include "reduction_range_scalar.hpp"
+
+using namespace cl::sycl;
+
+template <typename Name, typename T, class BinaryOperation>
+void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, size_t NWItems) {
+  constexpr access::mode DW = access::mode::discard_write;
+  test<Name, false, DW, T, 0>(Q, Identity, Init, BOp, range<1>{NWItems});
+}
+
+int main() {
+  queue Q;
+  printDeviceInfo(Q);
+  size_t MaxWGSize =
+      Q.get_device().get_info<info::device::max_work_group_size>();
+
+  // Fast-reduce and Fast-atomics. Try various range types/sizes.
+  tests<class A1, int>(Q, 0, 99, std::plus<>{}, 1);
+  tests<class A2, int>(Q, 0, 99, std::plus<>{}, 2);
+  tests<class A3, int64_t>(Q, 0, 99, std::plus<>{}, 7);
+  tests<class A4, int64_t>(Q, 0, 99, std::plus<>{}, 64);
+  tests<class A5, int>(Q, 0, 99, std::plus<>{}, MaxWGSize * 2);
+  tests<class A6, int>(Q, 0, 99, std::plus<>{}, MaxWGSize * 2 + 5);
+
+  // Try various types & ranges.
+  tests<class B1, int>(Q, ~0, ~0, std::bit_and<>{}, 8);
+  tests<class B2, int>(Q, 0, 0x12340000, std::bit_xor<>{}, 16);
+  tests<class B3, int>(Q, 0, 0x3400, std::bit_or<>{}, MaxWGSize * 4);
+  tests<class B4, int>(Q, 1, 2, std::multiplies<>{}, 256);
+  tests<class B5, int>(Q, 1, 3, std::multiplies<>{}, MaxWGSize + 1);
+  tests<class B6, int>(Q, (std::numeric_limits<int>::max)(), -99,
+                       ext::oneapi::minimum<>{}, MaxWGSize * 2);
+  tests<class B7, int>(Q, (std::numeric_limits<int>::min)(), 99,
+                       ext::oneapi::maximum<>{}, 8);
+  tests<class B8, uint64_t>(Q, 1, 99, std::multiplies<>{}, 37);
+
+  // Check with CUSTOM type.
+  using CV = CustomVec<long long>;
+  tests<class C1>(Q, CV(0), CV(99), CustomVecPlus<long long>{}, 64);
+  tests<class C2>(Q, CV(0), CV(99), CustomVecPlus<long long>{}, MaxWGSize * 3);
+
+  std::cout << "Test passed\n";
+  return 0;
+}
diff --git a/SYCL/Reduction/reduction_range_1d_s0_rw.cpp b/SYCL/Reduction/reduction_range_1d_s0_rw.cpp
new file mode 100644
index 0000000000..8e23e9e26a
--- /dev/null
+++ b/SYCL/Reduction/reduction_range_1d_s0_rw.cpp
@@ -0,0 +1,57 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// TODO: accelerator may not suport atomics required by the current
+// implementation. Enable testing when implementation is fixed.
+// RUNx: %ACC_RUN_PLACEHOLDER %t.out
+
+// This test performs basic checks of parallel_for(range<1>, reduction, func)
+// with reductions initialized with 0-dimensional read_write accessor
+// accessing 1 element buffer.
+
+#include "reduction_range_scalar.hpp"
+
+using namespace cl::sycl;
+
+template <typename Name, typename T, class BinaryOperation>
+void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, size_t NWItems) {
+  constexpr access::mode RW = access::mode::read_write;
+  test<Name, false, RW, T, 0>(Q, Identity, Init, BOp, range<1>{NWItems});
+}
+
+int main() {
+  queue Q;
+  printDeviceInfo(Q);
+  size_t MaxWGSize =
+      Q.get_device().get_info<info::device::max_work_group_size>();
+
+  // Fast-reduce and Fast-atomics. Try various range types/sizes.
+  tests<class A1, int>(Q, 0, 99, std::plus<>{}, 1);
+  tests<class A2, int>(Q, 0, 99, std::plus<>{}, 2);
+  tests<class A3, int64_t>(Q, 0, 99, std::plus<>{}, 7);
+  tests<class A4, int64_t>(Q, 0, 99, std::plus<>{}, 64);
+  tests<class A5, int>(Q, 0, 99, std::plus<>{}, MaxWGSize * 2);
+  tests<class A6, int>(Q, 0, 99, std::plus<>{}, MaxWGSize * 2 + 5);
+
+  // Try various types & ranges.
+  tests<class B1, int>(Q, ~0, ~0, std::bit_and<>{}, 8);
+  tests<class B2, int>(Q, 0, 0x12340000, std::bit_xor<>{}, 16);
+  tests<class B3, int>(Q, 0, 0x3400, std::bit_or<>{}, MaxWGSize * 4);
+  tests<class B4, int>(Q, 1, 2, std::multiplies<>{}, 256);
+  tests<class B5, int>(Q, 1, 3, std::multiplies<>{}, MaxWGSize + 1);
+  tests<class B6, int>(Q, (std::numeric_limits<int>::max)(), -99,
+                       ext::oneapi::minimum<>{}, MaxWGSize * 2);
+  tests<class B7, int>(Q, (std::numeric_limits<int>::min)(), 99,
+                       ext::oneapi::maximum<>{}, 8);
+  tests<class B8, float>(Q, 1, 99, std::multiplies<>{}, MaxWGSize);
+
+  // Check with CUSTOM type.
+  tests<class C1>(Q, CustomVec<long long>(0), CustomVec<long long>(99),
+                  CustomVecPlus<long long>{}, 64);
+  tests<class C2>(Q, CustomVec<long long>(0), CustomVec<long long>(99),
+                  CustomVecPlus<long long>{}, MaxWGSize * 3);
+
+  std::cout << "Test passed\n";
+  return 0;
+}
diff --git a/SYCL/Reduction/reduction_range_1d_s1_dw.cpp b/SYCL/Reduction/reduction_range_1d_s1_dw.cpp
new file mode 100644
index 0000000000..e2924a0050
--- /dev/null
+++ b/SYCL/Reduction/reduction_range_1d_s1_dw.cpp
@@ -0,0 +1,57 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// TODO: accelerator may not suport atomics required by the current
+// implementation. Enable testing when implementation is fixed.
+// RUNx: %ACC_RUN_PLACEHOLDER %t.out
+
+#include "reduction_range_scalar.hpp"
+
+// This test performs basic checks of parallel_for(range<1>, reduction, func)
+// with reductions initialized with 1-dimensional discard_write accessor
+// accessing 1 element buffer.
+
+using namespace cl::sycl;
+
+template <typename Name, typename T, class BinaryOperation>
+void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, size_t NWItems) {
+  constexpr access::mode DW = access::mode::discard_write;
+  testBoth<Name, DW, T>(Q, Identity, Init, BOp, range<1>{NWItems});
+}
+
+int main() {
+  queue Q;
+  printDeviceInfo(Q);
+  size_t MaxWGSize =
+      Q.get_device().get_info<info::device::max_work_group_size>();
+
+  // Fast-reduce and Fast-atomics. Try various range types/sizes.
+  tests<class A1, int>(Q, 0, 99, std::plus<>{}, 1);
+  tests<class A2, int>(Q, 0, 99, std::plus<>{}, 2);
+  tests<class A3, int64_t>(Q, 0, 99, std::plus<>{}, 7);
+  tests<class A4, int64_t>(Q, 0, 99, std::plus<>{}, 64);
+  tests<class A5, int>(Q, 0, 99, std::plus<>{}, MaxWGSize * 2);
+  tests<class A6, int>(Q, 0, 99, std::plus<>{}, MaxWGSize * 2 + 5);
+
+  // Try various types & ranges.
+  tests<class B1, int>(Q, ~0, 99, std::bit_and<>{}, 7);
+  tests<class B2, int>(Q, 0, 0xff99, std::bit_xor<>{}, MaxWGSize);
+  tests<class B3, int>(Q, 0, 0xff99, std::bit_or<>{}, 3);
+  tests<class B4, int>(Q, 1, 3, std::multiplies<>{}, 32);
+  tests<class B5, int>(Q, 1, 3, std::multiplies<>{}, MaxWGSize * 4);
+  tests<class B6, int>(Q, (std::numeric_limits<int>::max)(), -99,
+                       ext::oneapi::minimum<>{}, MaxWGSize * 2);
+  tests<class B7, int>(Q, (std::numeric_limits<int>::min)(), 99,
+                       ext::oneapi::maximum<>{}, 8);
+  tests<class B8, float>(Q, 1, 99, std::multiplies<>{}, MaxWGSize);
+
+  // Check with CUSTOM type.
+  tests<class C1>(Q, CustomVec<long long>(0), CustomVec<long long>(99),
+                  CustomVecPlus<long long>{}, 256);
+  tests<class C2>(Q, CustomVec<long long>(0), CustomVec<long long>(99),
+                  CustomVecPlus<long long>{}, MaxWGSize * 3);
+
+  std::cout << "Test passed\n";
+  return 0;
+}
diff --git a/SYCL/Reduction/reduction_range_1d_s1_rw.cpp b/SYCL/Reduction/reduction_range_1d_s1_rw.cpp
new file mode 100644
index 0000000000..e9c55376fa
--- /dev/null
+++ b/SYCL/Reduction/reduction_range_1d_s1_rw.cpp
@@ -0,0 +1,57 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// TODO: accelerator may not suport atomics required by the current
+// implementation. Enable testing when implementation is fixed.
+// RUNx: %ACC_RUN_PLACEHOLDER %t.out
+
+// This test performs basic checks of parallel_for(range<1>, reduction, func)
+// with reductions initialized with 1-dimensional read_write accessor
+// accessing 1 element buffer.
+
+#include "reduction_range_scalar.hpp"
+
+using namespace cl::sycl;
+
+template <typename Name, typename T, class BinaryOperation>
+void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, size_t NWItems) {
+  constexpr access::mode RW = access::mode::read_write;
+  testBoth<Name, RW, T>(Q, Identity, Init, BOp, range<1>{NWItems});
+}
+
+int main() {
+  queue Q;
+  printDeviceInfo(Q);
+  size_t MaxWGSize =
+      Q.get_device().get_info<info::device::max_work_group_size>();
+
+  // Fast-reduce and Fast-atomics. Try various range types/sizes.
+  tests<class A1, int>(Q, 0, 99, std::plus<int>{}, 1);
+  tests<class A2, int>(Q, 0, 99, std::plus<int>{}, 2);
+  tests<class A3, int64_t>(Q, 0, 99, std::plus<>{}, 7);
+  tests<class A4, int64_t>(Q, 0, 99, std::plus<>{}, 64);
+  tests<class A5, int>(Q, 0, 99, std::plus<>{}, MaxWGSize * 2);
+  tests<class A6, int>(Q, 0, 99, std::plus<>{}, MaxWGSize * 2 + 5);
+
+  // Try various types & ranges.
+  tests<class B1, int>(Q, ~0, ~0, std::bit_and<>{}, 8);
+  tests<class B2, int>(Q, 0, 0x12340000, std::bit_xor<>{}, 16);
+  tests<class B3, int>(Q, 0, 0x3400, std::bit_or<>{}, MaxWGSize * 4);
+  tests<class B4, int>(Q, 1, 2, std::multiplies<>{}, 256);
+  tests<class B5, int>(Q, 1, 3, std::multiplies<>{}, MaxWGSize * 4);
+  tests<class B6, int>(Q, (std::numeric_limits<int>::max)(), -99,
+                       ext::oneapi::minimum<>{}, MaxWGSize * 2);
+  tests<class B7, int>(Q, (std::numeric_limits<int>::min)(), 99,
+                       ext::oneapi::maximum<>{}, 8);
+  tests<class B8, float>(Q, 1, 99, std::multiplies<>{}, MaxWGSize);
+
+  // Check with CUSTOM type.
+  tests<class C1>(Q, CustomVec<long long>(0), CustomVec<long long>(99),
+                  CustomVecPlus<long long>{}, 256);
+  tests<class C2>(Q, CustomVec<long long>(0), CustomVec<long long>(99),
+                  CustomVecPlus<long long>{}, MaxWGSize * 3);
+
+  std::cout << "Test passed\n";
+  return 0;
+}
diff --git a/SYCL/Reduction/reduction_range_2d_s1_dw.cpp b/SYCL/Reduction/reduction_range_2d_s1_dw.cpp
new file mode 100644
index 0000000000..0846b0ac0e
--- /dev/null
+++ b/SYCL/Reduction/reduction_range_2d_s1_dw.cpp
@@ -0,0 +1,55 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// TODO: accelerator may not suport atomics required by the current
+// implementation. Enable testing when implementation is fixed.
+// RUNx: %ACC_RUN_PLACEHOLDER %t.out
+
+// This test performs basic checks of parallel_for(range<2>, reduction, func)
+// with reductions initialized with 1-dimensional discard_write accessor
+// accessing 1 element buffer.
+
+#include "reduction_range_scalar.hpp"
+
+using namespace cl::sycl;
+
+template <typename Name, typename T, class BinaryOperation>
+void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, range<2> Range) {
+  constexpr access::mode DW = access::mode::discard_write;
+  testBoth<Name, DW, T>(Q, Identity, Init, BOp, Range);
+}
+
+int main() {
+  queue Q;
+  printDeviceInfo(Q);
+  size_t MaxWGSize =
+      Q.get_device().get_info<info::device::max_work_group_size>();
+
+  tests<class A1, int>(Q, 0, 99, std::plus<>{}, range<2>{1, 1});
+  tests<class A2, int>(Q, 0, 99, std::plus<>{}, range<2>{2, 2});
+  tests<class A3, int>(Q, 0, 99, std::plus<>{}, range<2>{2, 3});
+  tests<class A4, int>(Q, 0, 99, std::plus<>{}, range<2>{MaxWGSize, 1});
+  tests<class A5, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{1, MaxWGSize});
+  tests<class A6, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{2, MaxWGSize * 2});
+  tests<class A7, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{MaxWGSize * 3, 7});
+  tests<class A8, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{3, MaxWGSize * 3});
+
+  tests<class B1, int>(Q, 0, 0x2021ff99, std::bit_xor<>{}, range<2>{3, 3});
+  tests<class B2, int>(Q, ~0, 99, std::bit_and<>{}, range<2>{4, 3});
+  tests<class B3, int>(Q, 0, 99, std::bit_or<>{}, range<2>{2, 2});
+  tests<class B4, uint64_t>(Q, 1, 3, std::multiplies<>{}, range<2>{16, 3});
+  tests<class B5, uint64_t>(Q, 1, 3, std::multiplies<>{},
+                            range<2>{3, MaxWGSize});
+  tests<class B6, int>(Q, (std::numeric_limits<int>::max)(), -99,
+                       ext::oneapi::minimum<>{}, range<2>{8, 3});
+  tests<class B7, int>(Q, (std::numeric_limits<int>::min)(), 99,
+                       ext::oneapi::maximum<>{}, range<2>{3, 3});
+  tests<class B8, float>(Q, 1, 99, std::multiplies<>{}, range<2>{3, 3});
+
+  tests<class C1>(Q, CustomVec<long long>(0), CustomVec<long long>(99),
+                  CustomVecPlus<long long>{}, range<2>{33, MaxWGSize});
+
+  std::cout << "Test passed\n";
+  return 0;
+}
diff --git a/SYCL/Reduction/reduction_range_2d_s1_rw.cpp b/SYCL/Reduction/reduction_range_2d_s1_rw.cpp
new file mode 100644
index 0000000000..56a5a9e7af
--- /dev/null
+++ b/SYCL/Reduction/reduction_range_2d_s1_rw.cpp
@@ -0,0 +1,55 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// TODO: accelerator may not suport atomics required by the current
+// implementation. Enable testing when implementation is fixed.
+// RUNx: %ACC_RUN_PLACEHOLDER %t.out
+
+// This test performs basic checks of parallel_for(range<2>, reduction, func)
+// with reductions initialized with 1-dimensional discard_write accessor
+// accessing 1 element buffer.
+
+#include "reduction_range_scalar.hpp"
+
+using namespace cl::sycl;
+
+template <typename Name, typename T, class BinaryOperation>
+void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, range<2> Range) {
+  constexpr access::mode RW = access::mode::read_write;
+  testBoth<Name, RW, T>(Q, Identity, Init, BOp, Range);
+}
+
+int main() {
+  queue Q;
+  printDeviceInfo(Q);
+  size_t MaxWGSize =
+      Q.get_device().get_info<info::device::max_work_group_size>();
+
+  tests<class A1, int>(Q, 0, 99, std::plus<>{}, range<2>{1, 1});
+  tests<class A2, int>(Q, 0, 99, std::plus<>{}, range<2>{2, 2});
+  tests<class A3, int>(Q, 0, 99, std::plus<>{}, range<2>{2, 3});
+  tests<class A4, int>(Q, 0, 99, std::plus<>{}, range<2>{MaxWGSize, 1});
+  tests<class A5, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{1, MaxWGSize});
+  tests<class A6, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{2, MaxWGSize * 2});
+  tests<class A7, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{MaxWGSize * 3, 7});
+  tests<class A8, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{3, MaxWGSize * 3});
+
+  tests<class B1, int>(Q, 0, 0x2021ff99, std::bit_xor<>{}, range<2>{3, 3});
+  tests<class B2, int>(Q, ~0, 99, std::bit_and<>{}, range<2>{4, 3});
+  tests<class B3, int>(Q, 0, 99, std::bit_or<>{}, range<2>{2, 2});
+  tests<class B4, uint64_t>(Q, 1, 3, std::multiplies<>{}, range<2>{16, 3});
+  tests<class B5, uint64_t>(Q, 1, 3, std::multiplies<>{},
+                            range<2>{3, MaxWGSize});
+  tests<class B6, int>(Q, (std::numeric_limits<int>::max)(), -99,
+                       ext::oneapi::minimum<>{}, range<2>{8, 3});
+  tests<class B7, int>(Q, (std::numeric_limits<int>::min)(), 99,
+                       ext::oneapi::maximum<>{}, range<2>{3, 3});
+  tests<class B8, float>(Q, 1, 99, std::multiplies<>{}, range<2>{3, 3});
+
+  tests<class C1>(Q, CustomVec<long long>(0), CustomVec<long long>(99),
+                  CustomVecPlus<long long>{}, range<2>{33, MaxWGSize});
+
+  std::cout << "Test passed\n";
+  return 0;
+}
diff --git a/SYCL/Reduction/reduction_range_3d_s1_dw.cpp b/SYCL/Reduction/reduction_range_3d_s1_dw.cpp
new file mode 100644
index 0000000000..d0b11a98f5
--- /dev/null
+++ b/SYCL/Reduction/reduction_range_3d_s1_dw.cpp
@@ -0,0 +1,68 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// TODO: accelerator may not suport atomics required by the current
+// implementation. Enable testing when implementation is fixed.
+// RUNx: %ACC_RUN_PLACEHOLDER %t.out
+
+// This test performs basic checks of parallel_for(range<3>, reduction, func)
+// with reductions initialized with 1-dimensional discard_write accessor
+// accessing 1 element buffer.
+
+#include "reduction_range_scalar.hpp"
+
+using namespace cl::sycl;
+
+template <typename Name, typename T, class BinaryOperation>
+void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, range<3> Range) {
+  constexpr access::mode DW = access::mode::discard_write;
+  testBoth<Name, DW, T>(Q, Identity, Init, BOp, Range);
+}
+
+int main() {
+  queue Q;
+  printDeviceInfo(Q);
+  size_t MaxWGSize =
+      Q.get_device().get_info<info::device::max_work_group_size>();
+
+  tests<class A1, int>(Q, 0, 99, std::plus<>{}, range<3>{1, 1, 1});
+  tests<class A2, int>(Q, 0, 99, std::plus<>{}, range<3>{2, 2, 2});
+  tests<class A3, int>(Q, 0, 99, std::plus<>{}, range<3>{2, 3, 4});
+
+  tests<class A4, int64_t>(Q, 0, 99, std::plus<>{},
+                           range<3>{1, 1, MaxWGSize + 1});
+  tests<class A5, int64_t>(Q, 0, 99, std::plus<>{},
+                           range<3>{1, MaxWGSize + 1, 1});
+  tests<class A6, int64_t>(Q, 0, 99, std::plus<>{},
+                           range<3>{MaxWGSize + 1, 1, 1});
+
+  tests<class A7, int64_t>(Q, 0, 99, std::plus<>{},
+                           range<3>{2, 5, MaxWGSize * 2});
+  tests<class A8, int64_t>(Q, 0, 99, std::plus<>{},
+                           range<3>{3, MaxWGSize * 3, 2});
+  tests<class A9, int64_t>(Q, 0, 99, std::plus<>{},
+                           range<3>{MaxWGSize * 3, 8, 4});
+
+  tests<class B1, int>(Q, 0, 0x2021ff99, std::bit_xor<>{},
+                       range<3>{2, MaxWGSize * 3, 3});
+  tests<class B2, int>(Q, ~0, 99, std::bit_and<>{},
+                       range<3>{MaxWGSize * 3, 4, 3});
+  tests<class B3, int>(Q, 0, 99, std::bit_or<>{},
+                       range<3>{2, 2, MaxWGSize * 3});
+  tests<class B4, uint64_t>(Q, 1, 3, std::multiplies<>{}, range<3>{16, 3, 5});
+  tests<class B5, uint64_t>(Q, 1, 3, std::multiplies<>{},
+                            range<3>{2, 3, MaxWGSize});
+  tests<class B6, int>(Q, (std::numeric_limits<int>::max)(), -99,
+                       ext::oneapi::minimum<>{}, range<3>{MaxWGSize, 8, 3});
+  tests<class B7, int>(Q, (std::numeric_limits<int>::min)(), 99,
+                       ext::oneapi::maximum<>{}, range<3>{3, MaxWGSize, 3});
+  tests<class B8, float>(Q, 1, 99, std::multiplies<>{},
+                         range<3>{3, 3, MaxWGSize});
+
+  tests<class C1>(Q, CustomVec<long long>(0), CustomVec<long long>(99),
+                  CustomVecPlus<long long>{}, range<3>{2, 33, MaxWGSize});
+
+  std::cout << "Test passed\n";
+  return 0;
+}
diff --git a/SYCL/Reduction/reduction_range_3d_s1_rw.cpp b/SYCL/Reduction/reduction_range_3d_s1_rw.cpp
new file mode 100644
index 0000000000..963e9b4769
--- /dev/null
+++ b/SYCL/Reduction/reduction_range_3d_s1_rw.cpp
@@ -0,0 +1,68 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// TODO: accelerator may not suport atomics required by the current
+// implementation. Enable testing when implementation is fixed.
+// RUNx: %ACC_RUN_PLACEHOLDER %t.out
+
+// This test performs basic checks of parallel_for(range<3>, reduction, func)
+// with reductions initialized with 1-dimensional discard_write accessor
+// accessing 1 element buffer.
+
+#include "reduction_range_scalar.hpp"
+
+using namespace cl::sycl;
+
+template <typename Name, typename T, class BinaryOperation>
+void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, range<3> Range) {
+  constexpr access::mode RW = access::mode::read_write;
+  testBoth<Name, RW, T>(Q, Identity, Init, BOp, Range);
+}
+
+int main() {
+  queue Q;
+  printDeviceInfo(Q);
+  size_t MaxWGSize =
+      Q.get_device().get_info<info::device::max_work_group_size>();
+
+  tests<class A1, int>(Q, 0, 99, std::plus<>{}, range<3>{1, 1, 1});
+  tests<class A2, int>(Q, 0, 99, std::plus<>{}, range<3>{2, 2, 2});
+  tests<class A3, int>(Q, 0, 99, std::plus<>{}, range<3>{2, 3, 4});
+
+  tests<class A4, int64_t>(Q, 0, 99, std::plus<>{},
+                           range<3>{1, 1, MaxWGSize + 1});
+  tests<class A5, int64_t>(Q, 0, 99, std::plus<>{},
+                           range<3>{1, MaxWGSize + 1, 1});
+  tests<class A6, int64_t>(Q, 0, 99, std::plus<>{},
+                           range<3>{MaxWGSize + 1, 1, 1});
+
+  tests<class A7, int64_t>(Q, 0, 99, std::plus<>{},
+                           range<3>{2, 5, MaxWGSize * 2});
+  tests<class A8, int64_t>(Q, 0, 99, std::plus<>{},
+                           range<3>{3, MaxWGSize * 3, 2});
+  tests<class A9, int64_t>(Q, 0, 99, std::plus<>{},
+                           range<3>{MaxWGSize * 3, 8, 4});
+
+  tests<class B1, int>(Q, 0, 0x2021ff99, std::bit_xor<>{},
+                       range<3>{2, MaxWGSize * 3, 3});
+  tests<class B2, int>(Q, ~0, 99, std::bit_and<>{},
+                       range<3>{MaxWGSize * 3, 4, 3});
+  tests<class B3, int>(Q, 0, 99, std::bit_or<>{},
+                       range<3>{2, 2, MaxWGSize * 3});
+  tests<class B4, uint64_t>(Q, 1, 3, std::multiplies<>{}, range<3>{16, 3, 5});
+  tests<class B5, uint64_t>(Q, 1, 3, std::multiplies<>{},
+                            range<3>{2, 3, MaxWGSize});
+  tests<class B6, int>(Q, (std::numeric_limits<int>::max)(), -99,
+                       ext::oneapi::minimum<>{}, range<3>{MaxWGSize, 8, 3});
+  tests<class B7, int>(Q, (std::numeric_limits<int>::min)(), 99,
+                       ext::oneapi::maximum<>{}, range<3>{3, MaxWGSize, 3});
+  tests<class B8, float>(Q, 1, 99, std::multiplies<>{},
+                         range<3>{3, 3, MaxWGSize});
+
+  tests<class C1>(Q, CustomVec<long long>(0), CustomVec<long long>(99),
+                  CustomVecPlus<long long>{}, range<3>{2, 33, MaxWGSize});
+
+  std::cout << "Test passed\n";
+  return 0;
+}
diff --git a/SYCL/Reduction/reduction_range_lambda.cpp b/SYCL/Reduction/reduction_range_lambda.cpp
new file mode 100644
index 0000000000..ef7a1d46be
--- /dev/null
+++ b/SYCL/Reduction/reduction_range_lambda.cpp
@@ -0,0 +1,50 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// TODO: accelerator may not suport atomics required by the current
+// implementation. Enable testing when implementation is fixed.
+// RUNx: %ACC_RUN_PLACEHOLDER %t.out
+
+// This test performs basic checks of parallel_for(range, reduction, lambda)
+// with reductions initialized with 1-dimensional accessor accessing
+// 1 element buffer.
+
+#include "reduction_range_scalar.hpp"
+
+using namespace cl::sycl;
+
+constexpr access::mode RW = access::mode::read_write;
+constexpr access::mode DW = access::mode::discard_write;
+
+int main() {
+  queue Q;
+  printDeviceInfo(Q);
+  size_t MaxWGSize =
+      Q.get_device().get_info<info::device::max_work_group_size>();
+
+  auto LambdaSum = [](auto x, auto y) { return (x + y); };
+
+  testBoth<class A1, RW, int>(Q, 0, 99, LambdaSum, range<1>{7});
+  testBoth<class A2, DW, int>(Q, 0, 99, LambdaSum, range<1>{7});
+
+  testBoth<class A3, RW, int>(Q, 0, 99, LambdaSum, range<1>{MaxWGSize + 1});
+  testBoth<class A4, DW, int>(Q, 0, 99, LambdaSum, range<1>{MaxWGSize + 1});
+
+  testBoth<class B1, RW, int>(Q, 0, 99, LambdaSum, range<2>{3, 4});
+  testBoth<class B2, DW, int>(Q, 0, 99, LambdaSum, range<2>{3, 4});
+
+  testBoth<class B3, RW, int>(Q, 0, 99, LambdaSum, range<2>{3, MaxWGSize + 1});
+  testBoth<class B4, DW, int>(Q, 0, 99, LambdaSum, range<2>{3, MaxWGSize + 1});
+
+  testBoth<class C1, RW, int>(Q, 0, 99, LambdaSum, range<3>{2, 3, 4});
+  testBoth<class C2, RW, int>(Q, 0, 99, LambdaSum, range<3>{2, 3, 4});
+
+  testBoth<class C3, RW, int>(Q, 0, 99, LambdaSum,
+                              range<3>{2, 3, MaxWGSize + 1});
+  testBoth<class C4, DW, int>(Q, 0, 99, LambdaSum,
+                              range<3>{2, 3, MaxWGSize + 1});
+
+  std::cout << "Test passed\n";
+  return 0;
+}
diff --git a/SYCL/Reduction/reduction_range_scalar.hpp b/SYCL/Reduction/reduction_range_scalar.hpp
new file mode 100644
index 0000000000..7ee833f9d1
--- /dev/null
+++ b/SYCL/Reduction/reduction_range_scalar.hpp
@@ -0,0 +1,92 @@
+// This test performs basic checks of parallel_for(range<Dims>, reduction, func)
+// with reductions initialized with 1-dimensional buffer/accessor
+// accessing a scalar holding the reduction result.
+
+#include "reduction_utils.hpp"
+#include <CL/sycl.hpp>
+#include <cassert>
+
+using namespace cl::sycl;
+
+template <typename T, bool B> class KName;
+
+template <int Dims>
+std::ostream &operator<<(std::ostream &OS, const range<Dims> &Range) {
+  OS << "{" << Range[0];
+  if constexpr (Dims > 1)
+    OS << ", " << Range[1];
+  if constexpr (Dims > 2)
+    OS << ", " << Range[2];
+  OS << "}";
+  return OS;
+}
+
+template <typename Name, bool IsSYCL2020Mode, access::mode Mode, typename T,
+          int AccDim = 1, class BinaryOperation, int Dims>
+void test(queue &Q, T Identity, T Init, BinaryOperation BOp,
+          range<Dims> Range) {
+  std::string StdMode = IsSYCL2020Mode ? "SYCL2020" : "ONEAPI  ";
+  std::cout << "Running the test case: " << StdMode
+            << " {T=" << typeid(T).name()
+            << ", BOp=" << typeid(BinaryOperation).name() << ", Range=" << Range
+            << std::endl;
+
+  // Skip the test for such big arrays now.
+  constexpr size_t TwoGB = 2LL * 1024 * 1024 * 1024;
+  if (Range.size() > TwoGB)
+    return;
+
+  buffer<T, Dims> InBuf(Range);
+  buffer<T, 1> OutBuf(1);
+
+  // Initialize.
+  T CorrectOut;
+  initInputData(InBuf, CorrectOut, Identity, BOp, Range);
+  if constexpr (Mode == access::mode::read_write) {
+    CorrectOut = BOp(CorrectOut, Init);
+  }
+
+  // The value assigned here must be discarded (if IsReadWrite is true).
+  // Verify that it is really discarded and assign some value.
+  (OutBuf.template get_access<access::mode::write>())[0] = Init;
+
+  // Compute.
+  if constexpr (IsSYCL2020Mode) {
+    Q.submit([&](handler &CGH) {
+      auto In = InBuf.template get_access<access::mode::read>(CGH);
+      property_list PropList = getPropertyList<Mode>();
+      auto Redu = sycl::reduction(OutBuf, CGH, Identity, BOp, PropList);
+
+      CGH.parallel_for<Name>(
+          Range, Redu, [=](id<Dims> Id, auto &Sum) { Sum.combine(In[Id]); });
+    });
+  } else {
+    Q.submit([&](handler &CGH) {
+      auto In = InBuf.template get_access<access::mode::read>(CGH);
+      accessor<T, AccDim, Mode, access::target::global_buffer> Out(OutBuf, CGH);
+      auto Redu = ext::oneapi::reduction(Out, Identity, BOp);
+
+      CGH.parallel_for<Name>(
+          Range, Redu, [=](id<Dims> Id, auto &Sum) { Sum.combine(In[Id]); });
+    });
+  }
+
+  // Check correctness.
+  auto Out = OutBuf.template get_access<access::mode::read>();
+  T ComputedOut = *(Out.get_pointer());
+  if (ComputedOut != CorrectOut) {
+    printDeviceInfo(Q, true);
+    std::cerr << "Error: Range = " << Range << ", "
+              << "Computed value: " << ComputedOut
+              << ", Expected value: " << CorrectOut << "\n";
+    assert(0 && "Wrong value.");
+  }
+}
+
+template <typename Name, access::mode Mode, typename T, class BinaryOperation,
+          int Dims>
+void testBoth(queue &Q, T Identity, T Init, BinaryOperation BOp,
+              range<Dims> Range) {
+  test<KName<Name, false>, false, Mode>(Q, Identity, Init, BOp, Range);
+  test<KName<Name, true>, true, Mode>(Q, Identity, Init, BOp, Range);
+}
diff --git a/SYCL/Reduction/reduction_utils.hpp b/SYCL/Reduction/reduction_utils.hpp
index e7044b462f..e51030f1ea 100644
--- a/SYCL/Reduction/reduction_utils.hpp
+++ b/SYCL/Reduction/reduction_utils.hpp
@@ -2,12 +2,13 @@
 
 using namespace cl::sycl;
 
-// Initializes 'InBuf' buffer with pseudo-random values, computes the reduction
-// value for the buffer and writes it to 'ExpectedOut'.
+/// Initializes the buffer<1> \p 'InBuf' buffer with pseudo-random values,
+/// computes the write the reduction value \p 'ExpectedOut'.
 template <typename T, class BinaryOperation>
 void initInputData(buffer<T, 1> &InBuf, T &ExpectedOut, T Identity,
-                   BinaryOperation BOp, size_t N) {
+                   BinaryOperation BOp, range<1> Range) {
   ExpectedOut = Identity;
+  size_t N = Range.size();
   auto In = InBuf.template get_access<access::mode::write>();
   for (int I = 0; I < N; ++I) {
     if (std::is_same<BinaryOperation, std::multiplies<T>>::value)
@@ -18,6 +19,44 @@ void initInputData(buffer<T, 1> &InBuf, T &ExpectedOut, T Identity,
   }
 };
 
+/// Initializes the buffer<2> \p 'InBuf' buffer with pseudo-random values,
+/// computes the write the reduction value \p 'ExpectedOut'.
+template <typename T, class BinaryOperation>
+void initInputData(buffer<T, 2> &InBuf, T &ExpectedOut, T Identity,
+                   BinaryOperation BOp, range<2> Range) {
+  ExpectedOut = Identity;
+  auto In = InBuf.template get_access<access::mode::write>();
+  for (int J = 0; J < Range[0]; ++J) {
+    for (int I = 0; I < Range[1]; ++I) {
+      if (std::is_same<BinaryOperation, std::multiplies<T>>::value)
+        In[J][I] = 1 + ((((I * 2 + J * 3) % 37) == 0) ? 1 : 0);
+      else
+        In[J][I] = ((I + 1 + J) % 5) + 1.1;
+      ExpectedOut = BOp(ExpectedOut, In[J][I]);
+    }
+  }
+};
+
+/// Initializes the buffer<3> \p 'InBuf' buffer with pseudo-random values,
+/// computes the write the reduction value \p 'ExpectedOut'.
+template <typename T, class BinaryOperation>
+void initInputData(buffer<T, 3> &InBuf, T &ExpectedOut, T Identity,
+                   BinaryOperation BOp, range<3> Range) {
+  ExpectedOut = Identity;
+  auto In = InBuf.template get_access<access::mode::write>();
+  for (int K = 0; K < Range[0]; ++K) {
+    for (int J = 0; J < Range[1]; ++J) {
+      for (int I = 0; I < Range[2]; ++I) {
+        if (std::is_same<BinaryOperation, std::multiplies<T>>::value)
+          In[K][J][I] = 1 + ((((I * 2 + J * 3 + K) % 37) == 0) ? 1 : 0);
+        else
+          In[K][J][I] = ((I + 1 + J + K * 3) % 5) + 1.1;
+        ExpectedOut = BOp(ExpectedOut, In[K][J][I]);
+      }
+    }
+  }
+};
+
 // This type is needed only to check that custom types are properly handled
 // in parallel_for() with reduction. For simplicity it needs a default
 // constructor, a constructor with one argument, operators ==, != and
@@ -64,3 +103,16 @@ template <access::mode Mode> property_list getPropertyList() {
     return property_list();
   return property_list(property::reduction::initialize_to_identity{});
 }
+
+void printDeviceInfo(queue &Q, bool ToCERR = false) {
+  device D = Q.get_device();
+  auto Name = D.get_info<sycl::info::device::name>();
+  size_t MaxWGSize = D.get_info<info::device::max_work_group_size>();
+  size_t LocalMemSize = D.get_info<info::device::local_mem_size>();
+  if (ToCERR)
+    std::cout << "Device: " << Name << ", MaxWGSize: " << MaxWGSize
+              << ", LocalMemSize: " << LocalMemSize << std::endl;
+  else
+    std::cerr << "Device: " << Name << ", MaxWGSize: " << MaxWGSize
+              << ", LocalMemSize: " << LocalMemSize << std::endl;
+}