From a9bc1ad309545312fb1167dbe1c0319c30370a1b Mon Sep 17 00:00:00 2001 From: Vyacheslav N Klochkov Date: Thu, 15 Jul 2021 13:53:30 -0700 Subject: [PATCH] [SYCL] Add lit tests for reduction + range (#4101) Signed-off-by: Vyacheslav N Klochkov --- SYCL/Reduction/reduction_range_1d_s0_dw.cpp | 56 +++++++++++++ SYCL/Reduction/reduction_range_1d_s0_rw.cpp | 57 +++++++++++++ SYCL/Reduction/reduction_range_1d_s1_dw.cpp | 57 +++++++++++++ SYCL/Reduction/reduction_range_1d_s1_rw.cpp | 57 +++++++++++++ SYCL/Reduction/reduction_range_2d_s1_dw.cpp | 55 ++++++++++++ SYCL/Reduction/reduction_range_2d_s1_rw.cpp | 55 ++++++++++++ SYCL/Reduction/reduction_range_3d_s1_dw.cpp | 68 +++++++++++++++ SYCL/Reduction/reduction_range_3d_s1_rw.cpp | 68 +++++++++++++++ SYCL/Reduction/reduction_range_lambda.cpp | 50 +++++++++++ SYCL/Reduction/reduction_range_scalar.hpp | 92 +++++++++++++++++++++ SYCL/Reduction/reduction_utils.hpp | 58 ++++++++++++- 11 files changed, 670 insertions(+), 3 deletions(-) create mode 100644 SYCL/Reduction/reduction_range_1d_s0_dw.cpp create mode 100644 SYCL/Reduction/reduction_range_1d_s0_rw.cpp create mode 100644 SYCL/Reduction/reduction_range_1d_s1_dw.cpp create mode 100644 SYCL/Reduction/reduction_range_1d_s1_rw.cpp create mode 100644 SYCL/Reduction/reduction_range_2d_s1_dw.cpp create mode 100644 SYCL/Reduction/reduction_range_2d_s1_rw.cpp create mode 100644 SYCL/Reduction/reduction_range_3d_s1_dw.cpp create mode 100644 SYCL/Reduction/reduction_range_3d_s1_rw.cpp create mode 100644 SYCL/Reduction/reduction_range_lambda.cpp create mode 100644 SYCL/Reduction/reduction_range_scalar.hpp diff --git a/SYCL/Reduction/reduction_range_1d_s0_dw.cpp b/SYCL/Reduction/reduction_range_1d_s0_dw.cpp new file mode 100644 index 0000000000..a000c9cfe7 --- /dev/null +++ b/SYCL/Reduction/reduction_range_1d_s0_dw.cpp @@ -0,0 +1,56 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +// TODO: accelerator may not suport atomics required by the current +// implementation. Enable testing when implementation is fixed. +// RUNx: %ACC_RUN_PLACEHOLDER %t.out + +// This test performs basic checks of parallel_for(range<1>, reduction, func) +// with reductions initialized with 0-dimensional discard_write accessor +// accessing 1 element buffer. + +#include "reduction_range_scalar.hpp" + +using namespace cl::sycl; + +template +void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, size_t NWItems) { + constexpr access::mode DW = access::mode::discard_write; + test(Q, Identity, Init, BOp, range<1>{NWItems}); +} + +int main() { + queue Q; + printDeviceInfo(Q); + size_t MaxWGSize = + Q.get_device().get_info(); + + // Fast-reduce and Fast-atomics. Try various range types/sizes. + tests(Q, 0, 99, std::plus<>{}, 1); + tests(Q, 0, 99, std::plus<>{}, 2); + tests(Q, 0, 99, std::plus<>{}, 7); + tests(Q, 0, 99, std::plus<>{}, 64); + tests(Q, 0, 99, std::plus<>{}, MaxWGSize * 2); + tests(Q, 0, 99, std::plus<>{}, MaxWGSize * 2 + 5); + + // Try various types & ranges. + tests(Q, ~0, ~0, std::bit_and<>{}, 8); + tests(Q, 0, 0x12340000, std::bit_xor<>{}, 16); + tests(Q, 0, 0x3400, std::bit_or<>{}, MaxWGSize * 4); + tests(Q, 1, 2, std::multiplies<>{}, 256); + tests(Q, 1, 3, std::multiplies<>{}, MaxWGSize + 1); + tests(Q, (std::numeric_limits::max)(), -99, + ext::oneapi::minimum<>{}, MaxWGSize * 2); + tests(Q, (std::numeric_limits::min)(), 99, + ext::oneapi::maximum<>{}, 8); + tests(Q, 1, 99, std::multiplies<>{}, 37); + + // Check with CUSTOM type. + using CV = CustomVec; + tests(Q, CV(0), CV(99), CustomVecPlus{}, 64); + tests(Q, CV(0), CV(99), CustomVecPlus{}, MaxWGSize * 3); + + std::cout << "Test passed\n"; + return 0; +} diff --git a/SYCL/Reduction/reduction_range_1d_s0_rw.cpp b/SYCL/Reduction/reduction_range_1d_s0_rw.cpp new file mode 100644 index 0000000000..8e23e9e26a --- /dev/null +++ b/SYCL/Reduction/reduction_range_1d_s0_rw.cpp @@ -0,0 +1,57 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +// TODO: accelerator may not suport atomics required by the current +// implementation. Enable testing when implementation is fixed. +// RUNx: %ACC_RUN_PLACEHOLDER %t.out + +// This test performs basic checks of parallel_for(range<1>, reduction, func) +// with reductions initialized with 0-dimensional read_write accessor +// accessing 1 element buffer. + +#include "reduction_range_scalar.hpp" + +using namespace cl::sycl; + +template +void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, size_t NWItems) { + constexpr access::mode RW = access::mode::read_write; + test(Q, Identity, Init, BOp, range<1>{NWItems}); +} + +int main() { + queue Q; + printDeviceInfo(Q); + size_t MaxWGSize = + Q.get_device().get_info(); + + // Fast-reduce and Fast-atomics. Try various range types/sizes. + tests(Q, 0, 99, std::plus<>{}, 1); + tests(Q, 0, 99, std::plus<>{}, 2); + tests(Q, 0, 99, std::plus<>{}, 7); + tests(Q, 0, 99, std::plus<>{}, 64); + tests(Q, 0, 99, std::plus<>{}, MaxWGSize * 2); + tests(Q, 0, 99, std::plus<>{}, MaxWGSize * 2 + 5); + + // Try various types & ranges. + tests(Q, ~0, ~0, std::bit_and<>{}, 8); + tests(Q, 0, 0x12340000, std::bit_xor<>{}, 16); + tests(Q, 0, 0x3400, std::bit_or<>{}, MaxWGSize * 4); + tests(Q, 1, 2, std::multiplies<>{}, 256); + tests(Q, 1, 3, std::multiplies<>{}, MaxWGSize + 1); + tests(Q, (std::numeric_limits::max)(), -99, + ext::oneapi::minimum<>{}, MaxWGSize * 2); + tests(Q, (std::numeric_limits::min)(), 99, + ext::oneapi::maximum<>{}, 8); + tests(Q, 1, 99, std::multiplies<>{}, MaxWGSize); + + // Check with CUSTOM type. + tests(Q, CustomVec(0), CustomVec(99), + CustomVecPlus{}, 64); + tests(Q, CustomVec(0), CustomVec(99), + CustomVecPlus{}, MaxWGSize * 3); + + std::cout << "Test passed\n"; + return 0; +} diff --git a/SYCL/Reduction/reduction_range_1d_s1_dw.cpp b/SYCL/Reduction/reduction_range_1d_s1_dw.cpp new file mode 100644 index 0000000000..e2924a0050 --- /dev/null +++ b/SYCL/Reduction/reduction_range_1d_s1_dw.cpp @@ -0,0 +1,57 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +// TODO: accelerator may not suport atomics required by the current +// implementation. Enable testing when implementation is fixed. +// RUNx: %ACC_RUN_PLACEHOLDER %t.out + +#include "reduction_range_scalar.hpp" + +// This test performs basic checks of parallel_for(range<1>, reduction, func) +// with reductions initialized with 1-dimensional discard_write accessor +// accessing 1 element buffer. + +using namespace cl::sycl; + +template +void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, size_t NWItems) { + constexpr access::mode DW = access::mode::discard_write; + testBoth(Q, Identity, Init, BOp, range<1>{NWItems}); +} + +int main() { + queue Q; + printDeviceInfo(Q); + size_t MaxWGSize = + Q.get_device().get_info(); + + // Fast-reduce and Fast-atomics. Try various range types/sizes. + tests(Q, 0, 99, std::plus<>{}, 1); + tests(Q, 0, 99, std::plus<>{}, 2); + tests(Q, 0, 99, std::plus<>{}, 7); + tests(Q, 0, 99, std::plus<>{}, 64); + tests(Q, 0, 99, std::plus<>{}, MaxWGSize * 2); + tests(Q, 0, 99, std::plus<>{}, MaxWGSize * 2 + 5); + + // Try various types & ranges. + tests(Q, ~0, 99, std::bit_and<>{}, 7); + tests(Q, 0, 0xff99, std::bit_xor<>{}, MaxWGSize); + tests(Q, 0, 0xff99, std::bit_or<>{}, 3); + tests(Q, 1, 3, std::multiplies<>{}, 32); + tests(Q, 1, 3, std::multiplies<>{}, MaxWGSize * 4); + tests(Q, (std::numeric_limits::max)(), -99, + ext::oneapi::minimum<>{}, MaxWGSize * 2); + tests(Q, (std::numeric_limits::min)(), 99, + ext::oneapi::maximum<>{}, 8); + tests(Q, 1, 99, std::multiplies<>{}, MaxWGSize); + + // Check with CUSTOM type. + tests(Q, CustomVec(0), CustomVec(99), + CustomVecPlus{}, 256); + tests(Q, CustomVec(0), CustomVec(99), + CustomVecPlus{}, MaxWGSize * 3); + + std::cout << "Test passed\n"; + return 0; +} diff --git a/SYCL/Reduction/reduction_range_1d_s1_rw.cpp b/SYCL/Reduction/reduction_range_1d_s1_rw.cpp new file mode 100644 index 0000000000..e9c55376fa --- /dev/null +++ b/SYCL/Reduction/reduction_range_1d_s1_rw.cpp @@ -0,0 +1,57 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +// TODO: accelerator may not suport atomics required by the current +// implementation. Enable testing when implementation is fixed. +// RUNx: %ACC_RUN_PLACEHOLDER %t.out + +// This test performs basic checks of parallel_for(range<1>, reduction, func) +// with reductions initialized with 1-dimensional read_write accessor +// accessing 1 element buffer. + +#include "reduction_range_scalar.hpp" + +using namespace cl::sycl; + +template +void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, size_t NWItems) { + constexpr access::mode RW = access::mode::read_write; + testBoth(Q, Identity, Init, BOp, range<1>{NWItems}); +} + +int main() { + queue Q; + printDeviceInfo(Q); + size_t MaxWGSize = + Q.get_device().get_info(); + + // Fast-reduce and Fast-atomics. Try various range types/sizes. + tests(Q, 0, 99, std::plus{}, 1); + tests(Q, 0, 99, std::plus{}, 2); + tests(Q, 0, 99, std::plus<>{}, 7); + tests(Q, 0, 99, std::plus<>{}, 64); + tests(Q, 0, 99, std::plus<>{}, MaxWGSize * 2); + tests(Q, 0, 99, std::plus<>{}, MaxWGSize * 2 + 5); + + // Try various types & ranges. + tests(Q, ~0, ~0, std::bit_and<>{}, 8); + tests(Q, 0, 0x12340000, std::bit_xor<>{}, 16); + tests(Q, 0, 0x3400, std::bit_or<>{}, MaxWGSize * 4); + tests(Q, 1, 2, std::multiplies<>{}, 256); + tests(Q, 1, 3, std::multiplies<>{}, MaxWGSize * 4); + tests(Q, (std::numeric_limits::max)(), -99, + ext::oneapi::minimum<>{}, MaxWGSize * 2); + tests(Q, (std::numeric_limits::min)(), 99, + ext::oneapi::maximum<>{}, 8); + tests(Q, 1, 99, std::multiplies<>{}, MaxWGSize); + + // Check with CUSTOM type. + tests(Q, CustomVec(0), CustomVec(99), + CustomVecPlus{}, 256); + tests(Q, CustomVec(0), CustomVec(99), + CustomVecPlus{}, MaxWGSize * 3); + + std::cout << "Test passed\n"; + return 0; +} diff --git a/SYCL/Reduction/reduction_range_2d_s1_dw.cpp b/SYCL/Reduction/reduction_range_2d_s1_dw.cpp new file mode 100644 index 0000000000..0846b0ac0e --- /dev/null +++ b/SYCL/Reduction/reduction_range_2d_s1_dw.cpp @@ -0,0 +1,55 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +// TODO: accelerator may not suport atomics required by the current +// implementation. Enable testing when implementation is fixed. +// RUNx: %ACC_RUN_PLACEHOLDER %t.out + +// This test performs basic checks of parallel_for(range<2>, reduction, func) +// with reductions initialized with 1-dimensional discard_write accessor +// accessing 1 element buffer. + +#include "reduction_range_scalar.hpp" + +using namespace cl::sycl; + +template +void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, range<2> Range) { + constexpr access::mode DW = access::mode::discard_write; + testBoth(Q, Identity, Init, BOp, Range); +} + +int main() { + queue Q; + printDeviceInfo(Q); + size_t MaxWGSize = + Q.get_device().get_info(); + + tests(Q, 0, 99, std::plus<>{}, range<2>{1, 1}); + tests(Q, 0, 99, std::plus<>{}, range<2>{2, 2}); + tests(Q, 0, 99, std::plus<>{}, range<2>{2, 3}); + tests(Q, 0, 99, std::plus<>{}, range<2>{MaxWGSize, 1}); + tests(Q, 0, 99, std::plus<>{}, range<2>{1, MaxWGSize}); + tests(Q, 0, 99, std::plus<>{}, range<2>{2, MaxWGSize * 2}); + tests(Q, 0, 99, std::plus<>{}, range<2>{MaxWGSize * 3, 7}); + tests(Q, 0, 99, std::plus<>{}, range<2>{3, MaxWGSize * 3}); + + tests(Q, 0, 0x2021ff99, std::bit_xor<>{}, range<2>{3, 3}); + tests(Q, ~0, 99, std::bit_and<>{}, range<2>{4, 3}); + tests(Q, 0, 99, std::bit_or<>{}, range<2>{2, 2}); + tests(Q, 1, 3, std::multiplies<>{}, range<2>{16, 3}); + tests(Q, 1, 3, std::multiplies<>{}, + range<2>{3, MaxWGSize}); + tests(Q, (std::numeric_limits::max)(), -99, + ext::oneapi::minimum<>{}, range<2>{8, 3}); + tests(Q, (std::numeric_limits::min)(), 99, + ext::oneapi::maximum<>{}, range<2>{3, 3}); + tests(Q, 1, 99, std::multiplies<>{}, range<2>{3, 3}); + + tests(Q, CustomVec(0), CustomVec(99), + CustomVecPlus{}, range<2>{33, MaxWGSize}); + + std::cout << "Test passed\n"; + return 0; +} diff --git a/SYCL/Reduction/reduction_range_2d_s1_rw.cpp b/SYCL/Reduction/reduction_range_2d_s1_rw.cpp new file mode 100644 index 0000000000..56a5a9e7af --- /dev/null +++ b/SYCL/Reduction/reduction_range_2d_s1_rw.cpp @@ -0,0 +1,55 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +// TODO: accelerator may not suport atomics required by the current +// implementation. Enable testing when implementation is fixed. +// RUNx: %ACC_RUN_PLACEHOLDER %t.out + +// This test performs basic checks of parallel_for(range<2>, reduction, func) +// with reductions initialized with 1-dimensional discard_write accessor +// accessing 1 element buffer. + +#include "reduction_range_scalar.hpp" + +using namespace cl::sycl; + +template +void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, range<2> Range) { + constexpr access::mode RW = access::mode::read_write; + testBoth(Q, Identity, Init, BOp, Range); +} + +int main() { + queue Q; + printDeviceInfo(Q); + size_t MaxWGSize = + Q.get_device().get_info(); + + tests(Q, 0, 99, std::plus<>{}, range<2>{1, 1}); + tests(Q, 0, 99, std::plus<>{}, range<2>{2, 2}); + tests(Q, 0, 99, std::plus<>{}, range<2>{2, 3}); + tests(Q, 0, 99, std::plus<>{}, range<2>{MaxWGSize, 1}); + tests(Q, 0, 99, std::plus<>{}, range<2>{1, MaxWGSize}); + tests(Q, 0, 99, std::plus<>{}, range<2>{2, MaxWGSize * 2}); + tests(Q, 0, 99, std::plus<>{}, range<2>{MaxWGSize * 3, 7}); + tests(Q, 0, 99, std::plus<>{}, range<2>{3, MaxWGSize * 3}); + + tests(Q, 0, 0x2021ff99, std::bit_xor<>{}, range<2>{3, 3}); + tests(Q, ~0, 99, std::bit_and<>{}, range<2>{4, 3}); + tests(Q, 0, 99, std::bit_or<>{}, range<2>{2, 2}); + tests(Q, 1, 3, std::multiplies<>{}, range<2>{16, 3}); + tests(Q, 1, 3, std::multiplies<>{}, + range<2>{3, MaxWGSize}); + tests(Q, (std::numeric_limits::max)(), -99, + ext::oneapi::minimum<>{}, range<2>{8, 3}); + tests(Q, (std::numeric_limits::min)(), 99, + ext::oneapi::maximum<>{}, range<2>{3, 3}); + tests(Q, 1, 99, std::multiplies<>{}, range<2>{3, 3}); + + tests(Q, CustomVec(0), CustomVec(99), + CustomVecPlus{}, range<2>{33, MaxWGSize}); + + std::cout << "Test passed\n"; + return 0; +} diff --git a/SYCL/Reduction/reduction_range_3d_s1_dw.cpp b/SYCL/Reduction/reduction_range_3d_s1_dw.cpp new file mode 100644 index 0000000000..d0b11a98f5 --- /dev/null +++ b/SYCL/Reduction/reduction_range_3d_s1_dw.cpp @@ -0,0 +1,68 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +// TODO: accelerator may not suport atomics required by the current +// implementation. Enable testing when implementation is fixed. +// RUNx: %ACC_RUN_PLACEHOLDER %t.out + +// This test performs basic checks of parallel_for(range<3>, reduction, func) +// with reductions initialized with 1-dimensional discard_write accessor +// accessing 1 element buffer. + +#include "reduction_range_scalar.hpp" + +using namespace cl::sycl; + +template +void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, range<3> Range) { + constexpr access::mode DW = access::mode::discard_write; + testBoth(Q, Identity, Init, BOp, Range); +} + +int main() { + queue Q; + printDeviceInfo(Q); + size_t MaxWGSize = + Q.get_device().get_info(); + + tests(Q, 0, 99, std::plus<>{}, range<3>{1, 1, 1}); + tests(Q, 0, 99, std::plus<>{}, range<3>{2, 2, 2}); + tests(Q, 0, 99, std::plus<>{}, range<3>{2, 3, 4}); + + tests(Q, 0, 99, std::plus<>{}, + range<3>{1, 1, MaxWGSize + 1}); + tests(Q, 0, 99, std::plus<>{}, + range<3>{1, MaxWGSize + 1, 1}); + tests(Q, 0, 99, std::plus<>{}, + range<3>{MaxWGSize + 1, 1, 1}); + + tests(Q, 0, 99, std::plus<>{}, + range<3>{2, 5, MaxWGSize * 2}); + tests(Q, 0, 99, std::plus<>{}, + range<3>{3, MaxWGSize * 3, 2}); + tests(Q, 0, 99, std::plus<>{}, + range<3>{MaxWGSize * 3, 8, 4}); + + tests(Q, 0, 0x2021ff99, std::bit_xor<>{}, + range<3>{2, MaxWGSize * 3, 3}); + tests(Q, ~0, 99, std::bit_and<>{}, + range<3>{MaxWGSize * 3, 4, 3}); + tests(Q, 0, 99, std::bit_or<>{}, + range<3>{2, 2, MaxWGSize * 3}); + tests(Q, 1, 3, std::multiplies<>{}, range<3>{16, 3, 5}); + tests(Q, 1, 3, std::multiplies<>{}, + range<3>{2, 3, MaxWGSize}); + tests(Q, (std::numeric_limits::max)(), -99, + ext::oneapi::minimum<>{}, range<3>{MaxWGSize, 8, 3}); + tests(Q, (std::numeric_limits::min)(), 99, + ext::oneapi::maximum<>{}, range<3>{3, MaxWGSize, 3}); + tests(Q, 1, 99, std::multiplies<>{}, + range<3>{3, 3, MaxWGSize}); + + tests(Q, CustomVec(0), CustomVec(99), + CustomVecPlus{}, range<3>{2, 33, MaxWGSize}); + + std::cout << "Test passed\n"; + return 0; +} diff --git a/SYCL/Reduction/reduction_range_3d_s1_rw.cpp b/SYCL/Reduction/reduction_range_3d_s1_rw.cpp new file mode 100644 index 0000000000..963e9b4769 --- /dev/null +++ b/SYCL/Reduction/reduction_range_3d_s1_rw.cpp @@ -0,0 +1,68 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +// TODO: accelerator may not suport atomics required by the current +// implementation. Enable testing when implementation is fixed. +// RUNx: %ACC_RUN_PLACEHOLDER %t.out + +// This test performs basic checks of parallel_for(range<3>, reduction, func) +// with reductions initialized with 1-dimensional discard_write accessor +// accessing 1 element buffer. + +#include "reduction_range_scalar.hpp" + +using namespace cl::sycl; + +template +void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, range<3> Range) { + constexpr access::mode RW = access::mode::read_write; + testBoth(Q, Identity, Init, BOp, Range); +} + +int main() { + queue Q; + printDeviceInfo(Q); + size_t MaxWGSize = + Q.get_device().get_info(); + + tests(Q, 0, 99, std::plus<>{}, range<3>{1, 1, 1}); + tests(Q, 0, 99, std::plus<>{}, range<3>{2, 2, 2}); + tests(Q, 0, 99, std::plus<>{}, range<3>{2, 3, 4}); + + tests(Q, 0, 99, std::plus<>{}, + range<3>{1, 1, MaxWGSize + 1}); + tests(Q, 0, 99, std::plus<>{}, + range<3>{1, MaxWGSize + 1, 1}); + tests(Q, 0, 99, std::plus<>{}, + range<3>{MaxWGSize + 1, 1, 1}); + + tests(Q, 0, 99, std::plus<>{}, + range<3>{2, 5, MaxWGSize * 2}); + tests(Q, 0, 99, std::plus<>{}, + range<3>{3, MaxWGSize * 3, 2}); + tests(Q, 0, 99, std::plus<>{}, + range<3>{MaxWGSize * 3, 8, 4}); + + tests(Q, 0, 0x2021ff99, std::bit_xor<>{}, + range<3>{2, MaxWGSize * 3, 3}); + tests(Q, ~0, 99, std::bit_and<>{}, + range<3>{MaxWGSize * 3, 4, 3}); + tests(Q, 0, 99, std::bit_or<>{}, + range<3>{2, 2, MaxWGSize * 3}); + tests(Q, 1, 3, std::multiplies<>{}, range<3>{16, 3, 5}); + tests(Q, 1, 3, std::multiplies<>{}, + range<3>{2, 3, MaxWGSize}); + tests(Q, (std::numeric_limits::max)(), -99, + ext::oneapi::minimum<>{}, range<3>{MaxWGSize, 8, 3}); + tests(Q, (std::numeric_limits::min)(), 99, + ext::oneapi::maximum<>{}, range<3>{3, MaxWGSize, 3}); + tests(Q, 1, 99, std::multiplies<>{}, + range<3>{3, 3, MaxWGSize}); + + tests(Q, CustomVec(0), CustomVec(99), + CustomVecPlus{}, range<3>{2, 33, MaxWGSize}); + + std::cout << "Test passed\n"; + return 0; +} diff --git a/SYCL/Reduction/reduction_range_lambda.cpp b/SYCL/Reduction/reduction_range_lambda.cpp new file mode 100644 index 0000000000..ef7a1d46be --- /dev/null +++ b/SYCL/Reduction/reduction_range_lambda.cpp @@ -0,0 +1,50 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +// TODO: accelerator may not suport atomics required by the current +// implementation. Enable testing when implementation is fixed. +// RUNx: %ACC_RUN_PLACEHOLDER %t.out + +// This test performs basic checks of parallel_for(range, reduction, lambda) +// with reductions initialized with 1-dimensional accessor accessing +// 1 element buffer. + +#include "reduction_range_scalar.hpp" + +using namespace cl::sycl; + +constexpr access::mode RW = access::mode::read_write; +constexpr access::mode DW = access::mode::discard_write; + +int main() { + queue Q; + printDeviceInfo(Q); + size_t MaxWGSize = + Q.get_device().get_info(); + + auto LambdaSum = [](auto x, auto y) { return (x + y); }; + + testBoth(Q, 0, 99, LambdaSum, range<1>{7}); + testBoth(Q, 0, 99, LambdaSum, range<1>{7}); + + testBoth(Q, 0, 99, LambdaSum, range<1>{MaxWGSize + 1}); + testBoth(Q, 0, 99, LambdaSum, range<1>{MaxWGSize + 1}); + + testBoth(Q, 0, 99, LambdaSum, range<2>{3, 4}); + testBoth(Q, 0, 99, LambdaSum, range<2>{3, 4}); + + testBoth(Q, 0, 99, LambdaSum, range<2>{3, MaxWGSize + 1}); + testBoth(Q, 0, 99, LambdaSum, range<2>{3, MaxWGSize + 1}); + + testBoth(Q, 0, 99, LambdaSum, range<3>{2, 3, 4}); + testBoth(Q, 0, 99, LambdaSum, range<3>{2, 3, 4}); + + testBoth(Q, 0, 99, LambdaSum, + range<3>{2, 3, MaxWGSize + 1}); + testBoth(Q, 0, 99, LambdaSum, + range<3>{2, 3, MaxWGSize + 1}); + + std::cout << "Test passed\n"; + return 0; +} diff --git a/SYCL/Reduction/reduction_range_scalar.hpp b/SYCL/Reduction/reduction_range_scalar.hpp new file mode 100644 index 0000000000..7ee833f9d1 --- /dev/null +++ b/SYCL/Reduction/reduction_range_scalar.hpp @@ -0,0 +1,92 @@ +// This test performs basic checks of parallel_for(range, reduction, func) +// with reductions initialized with 1-dimensional buffer/accessor +// accessing a scalar holding the reduction result. + +#include "reduction_utils.hpp" +#include +#include + +using namespace cl::sycl; + +template class KName; + +template +std::ostream &operator<<(std::ostream &OS, const range &Range) { + OS << "{" << Range[0]; + if constexpr (Dims > 1) + OS << ", " << Range[1]; + if constexpr (Dims > 2) + OS << ", " << Range[2]; + OS << "}"; + return OS; +} + +template +void test(queue &Q, T Identity, T Init, BinaryOperation BOp, + range Range) { + std::string StdMode = IsSYCL2020Mode ? "SYCL2020" : "ONEAPI "; + std::cout << "Running the test case: " << StdMode + << " {T=" << typeid(T).name() + << ", BOp=" << typeid(BinaryOperation).name() << ", Range=" << Range + << std::endl; + + // Skip the test for such big arrays now. + constexpr size_t TwoGB = 2LL * 1024 * 1024 * 1024; + if (Range.size() > TwoGB) + return; + + buffer InBuf(Range); + buffer OutBuf(1); + + // Initialize. + T CorrectOut; + initInputData(InBuf, CorrectOut, Identity, BOp, Range); + if constexpr (Mode == access::mode::read_write) { + CorrectOut = BOp(CorrectOut, Init); + } + + // The value assigned here must be discarded (if IsReadWrite is true). + // Verify that it is really discarded and assign some value. + (OutBuf.template get_access())[0] = Init; + + // Compute. + if constexpr (IsSYCL2020Mode) { + Q.submit([&](handler &CGH) { + auto In = InBuf.template get_access(CGH); + property_list PropList = getPropertyList(); + auto Redu = sycl::reduction(OutBuf, CGH, Identity, BOp, PropList); + + CGH.parallel_for( + Range, Redu, [=](id Id, auto &Sum) { Sum.combine(In[Id]); }); + }); + } else { + Q.submit([&](handler &CGH) { + auto In = InBuf.template get_access(CGH); + accessor Out(OutBuf, CGH); + auto Redu = ext::oneapi::reduction(Out, Identity, BOp); + + CGH.parallel_for( + Range, Redu, [=](id Id, auto &Sum) { Sum.combine(In[Id]); }); + }); + } + + // Check correctness. + auto Out = OutBuf.template get_access(); + T ComputedOut = *(Out.get_pointer()); + if (ComputedOut != CorrectOut) { + printDeviceInfo(Q, true); + std::cerr << "Error: Range = " << Range << ", " + << "Computed value: " << ComputedOut + << ", Expected value: " << CorrectOut << "\n"; + assert(0 && "Wrong value."); + } +} + +template +void testBoth(queue &Q, T Identity, T Init, BinaryOperation BOp, + range Range) { + test, false, Mode>(Q, Identity, Init, BOp, Range); + test, true, Mode>(Q, Identity, Init, BOp, Range); +} diff --git a/SYCL/Reduction/reduction_utils.hpp b/SYCL/Reduction/reduction_utils.hpp index e7044b462f..e51030f1ea 100644 --- a/SYCL/Reduction/reduction_utils.hpp +++ b/SYCL/Reduction/reduction_utils.hpp @@ -2,12 +2,13 @@ using namespace cl::sycl; -// Initializes 'InBuf' buffer with pseudo-random values, computes the reduction -// value for the buffer and writes it to 'ExpectedOut'. +/// Initializes the buffer<1> \p 'InBuf' buffer with pseudo-random values, +/// computes the write the reduction value \p 'ExpectedOut'. template void initInputData(buffer &InBuf, T &ExpectedOut, T Identity, - BinaryOperation BOp, size_t N) { + BinaryOperation BOp, range<1> Range) { ExpectedOut = Identity; + size_t N = Range.size(); auto In = InBuf.template get_access(); for (int I = 0; I < N; ++I) { if (std::is_same>::value) @@ -18,6 +19,44 @@ void initInputData(buffer &InBuf, T &ExpectedOut, T Identity, } }; +/// Initializes the buffer<2> \p 'InBuf' buffer with pseudo-random values, +/// computes the write the reduction value \p 'ExpectedOut'. +template +void initInputData(buffer &InBuf, T &ExpectedOut, T Identity, + BinaryOperation BOp, range<2> Range) { + ExpectedOut = Identity; + auto In = InBuf.template get_access(); + for (int J = 0; J < Range[0]; ++J) { + for (int I = 0; I < Range[1]; ++I) { + if (std::is_same>::value) + In[J][I] = 1 + ((((I * 2 + J * 3) % 37) == 0) ? 1 : 0); + else + In[J][I] = ((I + 1 + J) % 5) + 1.1; + ExpectedOut = BOp(ExpectedOut, In[J][I]); + } + } +}; + +/// Initializes the buffer<3> \p 'InBuf' buffer with pseudo-random values, +/// computes the write the reduction value \p 'ExpectedOut'. +template +void initInputData(buffer &InBuf, T &ExpectedOut, T Identity, + BinaryOperation BOp, range<3> Range) { + ExpectedOut = Identity; + auto In = InBuf.template get_access(); + for (int K = 0; K < Range[0]; ++K) { + for (int J = 0; J < Range[1]; ++J) { + for (int I = 0; I < Range[2]; ++I) { + if (std::is_same>::value) + In[K][J][I] = 1 + ((((I * 2 + J * 3 + K) % 37) == 0) ? 1 : 0); + else + In[K][J][I] = ((I + 1 + J + K * 3) % 5) + 1.1; + ExpectedOut = BOp(ExpectedOut, In[K][J][I]); + } + } + } +}; + // This type is needed only to check that custom types are properly handled // in parallel_for() with reduction. For simplicity it needs a default // constructor, a constructor with one argument, operators ==, != and @@ -64,3 +103,16 @@ template property_list getPropertyList() { return property_list(); return property_list(property::reduction::initialize_to_identity{}); } + +void printDeviceInfo(queue &Q, bool ToCERR = false) { + device D = Q.get_device(); + auto Name = D.get_info(); + size_t MaxWGSize = D.get_info(); + size_t LocalMemSize = D.get_info(); + if (ToCERR) + std::cout << "Device: " << Name << ", MaxWGSize: " << MaxWGSize + << ", LocalMemSize: " << LocalMemSize << std::endl; + else + std::cerr << "Device: " << Name << ", MaxWGSize: " << MaxWGSize + << ", LocalMemSize: " << LocalMemSize << std::endl; +}