From 27674ad6b2ccee68929f50a376f1538018b203ae Mon Sep 17 00:00:00 2001 From: Vyacheslav N Klochkov Date: Mon, 1 Feb 2021 19:52:49 -0800 Subject: [PATCH] [SYCL] Add LIT test for parallel_for() accepting many reductions This test verifies the following change-set in compiler: intel/llvm#3123 Signed-off-by: Vyacheslav N Klochkov --- SYCL/Reduction/reduction_nd_N_vars.cpp | 208 +++++++++++++++++++++++++ 1 file changed, 208 insertions(+) create mode 100644 SYCL/Reduction/reduction_nd_N_vars.cpp diff --git a/SYCL/Reduction/reduction_nd_N_vars.cpp b/SYCL/Reduction/reduction_nd_N_vars.cpp new file mode 100644 index 0000000000..a8d7bd75de --- /dev/null +++ b/SYCL/Reduction/reduction_nd_N_vars.cpp @@ -0,0 +1,208 @@ +// TODO: level_zero reports an internal error for this test. +// UNSUPPORTED: level_zero + +// TODO: Windows implementation of std::tuple is not trivially copiable and +// thus cannot be passed from HOST to DEVICE. Enable the test on Windows when +// SYCL RT gets new type traits having less strict requirements for objects +// being passed to DEVICE. +// UNSUPPORTED: windows + +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// This test checks handling of parallel_for() accepting nd_range and +// two or more reductions. + +#include "reduction_utils.hpp" + +#include + +#include +#include +#include +#include +#include +#include +#include + +template class KernelNameGroup; + +using namespace cl::sycl; + +template +bool cherkResultIsExpected(int TestCaseNum, T Expected, T Computed) { + bool Success; + if (!std::is_floating_point::value) + Success = (Expected == Computed); + else + Success = std::abs((Expected / Computed) - 1) < 0.5; + + if (!Success) + std::cout << TestCaseNum << ": Expected value = " << Expected + << ", Computed value = " << Computed << "\n"; + + return Success; +} + +template +int runTest(T1 IdentityVal1, T1 InitVal1, BinaryOperation1 BOp1, + T2 IdentityVal2, T2 InitVal2, BinaryOperation2 BOp2, + T3 IdentityVal3, T3 InitVal3, BinaryOperation3 BOp3, + T4 IdentityVal4, T3 InitVal4, BinaryOperation4 BOp4, + usm::alloc AllocType4, size_t NWorkItems, size_t WGSize) { + buffer InBuf1(NWorkItems); + buffer InBuf2(NWorkItems); + buffer InBuf3(NWorkItems); + buffer InBuf4(NWorkItems); + buffer OutBuf1(1); + buffer OutBuf2(1); + buffer OutBuf3(1); + + queue Q; + auto Dev = Q.get_device(); + if (AllocType4 == usm::alloc::shared && + !Dev.get_info()) + return 4; + if (AllocType4 == usm::alloc::host && + !Dev.get_info()) + return 4; + if (AllocType4 == usm::alloc::device && + !Dev.get_info()) + return 4; + T4 *Out4 = (T4 *)malloc(sizeof(T4), Dev, Q.get_context(), AllocType4); + if (Out4 == nullptr) + return 4; + + // Initialize the arrays with sentinel values + // and pre-compute the expected result 'CorrectOut'. + T1 CorrectOut1; + T2 CorrectOut2; + T3 CorrectOut3; + T4 CorrectOut4; + initInputData(InBuf1, CorrectOut1, IdentityVal1, BOp1, NWorkItems); + initInputData(InBuf2, CorrectOut2, IdentityVal2, BOp2, NWorkItems); + initInputData(InBuf3, CorrectOut3, IdentityVal3, BOp3, NWorkItems); + initInputData(InBuf4, CorrectOut4, IdentityVal4, BOp4, NWorkItems); + + if (Mode1 == access::mode::read_write) + CorrectOut1 = BOp1(CorrectOut1, InitVal1); + if (Mode2 == access::mode::read_write) + CorrectOut2 = BOp2(CorrectOut2, InitVal2); + if (Mode3 == access::mode::read_write) + CorrectOut3 = BOp3(CorrectOut3, InitVal3); + // 4th reduction is USM and this is read_write. + CorrectOut4 = BOp4(CorrectOut4, InitVal4); + + // Inititialize data. + { + auto Out1 = OutBuf1.template get_access(); + Out1[0] = InitVal1; + auto Out2 = OutBuf2.template get_access(); + Out2[0] = InitVal2; + auto Out3 = OutBuf3.template get_access(); + Out3[0] = InitVal3; + + if (AllocType4 == usm::alloc::device) { + Q.submit([&](handler &CGH) { + CGH.single_task< + KernelNameGroup>( + [=]() { *Out4 = InitVal4; }); + }).wait(); + } else { + *Out4 = InitVal4; + } + } + + // The main code to be tested. + Q.submit([&](handler &CGH) { + auto In1 = InBuf1.template get_access(CGH); + auto In2 = InBuf2.template get_access(CGH); + auto In3 = InBuf3.template get_access(CGH); + auto In4 = InBuf4.template get_access(CGH); + + auto Out1 = OutBuf1.template get_access(CGH); + auto Out2 = OutBuf2.template get_access(CGH); + accessor Out3(OutBuf3, CGH); + + auto Lambda = [=](nd_item<1> NDIt, auto &Sum1, auto &Sum2, auto &Sum3, + auto &Sum4) { + size_t I = NDIt.get_global_id(0); + Sum1.combine(In1[I]); + Sum2.combine(In2[I]); + Sum3.combine(In3[I]); + Sum4.combine(In4[I]); + }; + + auto Redu1 = + ONEAPI::reduction(Out1, IdentityVal1, BOp1); + auto Redu2 = + ONEAPI::reduction(Out2, IdentityVal2, BOp2); + auto Redu3 = + ONEAPI::reduction(Out3, IdentityVal3, BOp3); + auto Redu4 = + ONEAPI::reduction(Out4, IdentityVal4, BOp4); + + auto NDR = nd_range<1>{range<1>(NWorkItems), range<1>{WGSize}}; + CGH.parallel_for(NDR, Redu1, Redu2, Redu3, Redu4, + Lambda); + }).wait(); + + // Check the results and free memory. + int Error = 0; + { + auto Out1 = OutBuf1.template get_access(); + auto Out2 = OutBuf2.template get_access(); + auto Out3 = OutBuf3.template get_access(); + + T4 Out4Val; + if (AllocType4 == usm::alloc::device) { + buffer Buf(&Out4Val, range<1>(1)); + Q.submit([&](handler &CGH) { + auto OutAcc = Buf.template get_access(CGH); + CGH.copy(Out4, OutAcc); + }); + Out4Val = (Buf.template get_access())[0]; + } else { + Out4Val = *Out4; + } + + Error += cherkResultIsExpected(1, CorrectOut1, Out1[0]) ? 0 : 1; + Error += cherkResultIsExpected(2, CorrectOut2, Out2[0]) ? 0 : 1; + Error += cherkResultIsExpected(3, CorrectOut3, Out3[0]) ? 0 : 1; + Error += cherkResultIsExpected(4, CorrectOut4, Out4Val) ? 0 : 1; + free(Out4, Q.get_context()); + } + + if (Error) + std::cerr << "The test failed for nd_range(" << NWorkItems << "," << WGSize + << ")\n\n"; + + return Error; +} + +int main() { + int Error = + runTest( + 0, 1000, std::plus{}, 0, 2000, std::plus<>{}, 0, 4000, + std::bit_or<>{}, 0, 8000, std::bit_xor<>{}, usm::alloc::shared, 16, + 16); + + auto Add = [](auto x, auto y) { return (x + y); }; + Error += runTest( + 0, 1000, std::plus{}, 0, 2000, std::plus<>{}, 0, 4000, Add, 0, + 8000, std::bit_xor{}, usm::alloc::device, 5 * (256 + 1), 5); + + if (!Error) + std::cout << "Test passed\n"; + else + std::cout << Error << " test-cases failed\n"; + return Error; +}