diff --git a/libclc/ptx-nvidiacl/libspirv/group/collectives.cl b/libclc/ptx-nvidiacl/libspirv/group/collectives.cl index fba9ad72d8a52..96e6ba9e20632 100644 --- a/libclc/ptx-nvidiacl/libspirv/group/collectives.cl +++ b/libclc/ptx-nvidiacl/libspirv/group/collectives.cl @@ -264,6 +264,7 @@ __CLC_SUBGROUP_COLLECTIVE(FMax, __CLC_MAX, double, -DBL_MAX) result = OP(sg_x, scratch[sg_id - 1]); \ } \ } \ + __spirv_ControlBarrier(Workgroup, 0, 0); \ return result; \ } diff --git a/sycl/test/on-device/back_to_back_collectives.cpp b/sycl/test/on-device/back_to_back_collectives.cpp new file mode 100644 index 0000000000000..492ca0b6a157e --- /dev/null +++ b/sycl/test/on-device/back_to_back_collectives.cpp @@ -0,0 +1,70 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %RUN_ON_HOST %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include +#include +#include +using namespace cl::sycl; +using namespace cl::sycl::ONEAPI; + +class back_to_back; + +int main() { + queue q; + if (q.get_device().is_host()) { + std::cout << "Skipping test\n"; + return 0; + } + + // Use max work-group size to maximize chance of race + program prog(q.get_context()); + prog.build_with_kernel_type(); + kernel k = prog.get_kernel(); + device d = q.get_device(); + int N = k.get_info(d); + + std::vector Input(N), Sum(N), EScan(N), IScan(N); + std::iota(Input.begin(), Input.end(), 0); + std::fill(Sum.begin(), Sum.end(), 0); + std::fill(EScan.begin(), EScan.end(), 0); + std::fill(IScan.begin(), IScan.end(), 0); + + { + buffer InputBuf(Input.data(), N); + buffer SumBuf(Sum.data(), N); + buffer EScanBuf(EScan.data(), N); + buffer IScanBuf(IScan.data(), N); + q.submit([&](handler &h) { + auto Input = InputBuf.get_access(h); + auto Sum = SumBuf.get_access(h); + auto EScan = EScanBuf.get_access(h); + auto IScan = IScanBuf.get_access(h); + h.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { + size_t i = it.get_global_id(0); + auto g = it.get_group(); + // Loop to increase number of back-to-back calls + for (int r = 0; r < 10; ++r) { + Sum[i] = reduce(g, Input[i], plus<>()); + EScan[i] = exclusive_scan(g, Input[i], plus<>()); + IScan[i] = inclusive_scan(g, Input[i], plus<>()); + } + }); + }); + } + + int sum = 0; + bool passed = true; + for (int i = 0; i < N; ++i) { + passed &= (sum == EScan[i]); + sum += i; + passed &= (sum == IScan[i]); + } + for (int i = 0; i < N; ++i) { + passed &= (sum == Sum[i]); + } + std::cout << "Test passed." << std::endl; + return 0; +}