diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 7406f82331bf4..4f7db7d0f4dca 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -493,10 +493,10 @@ class ExecCGCommand : public Command { detail::CG &getCG() const { return *MCommandGroup; } - // MEmptyCmd one is only employed if this command refers to host-task. - // MEmptyCmd due to unreliable mechanism of lookup for single EmptyCommand - // amongst users of host-task-representing command. This unreliability roots - // in cleanup process. + // MEmptyCmd is only employed if this command refers to host-task. + // The mechanism of lookup for single EmptyCommand amongst users of + // host-task-representing command is unreliable. This unreliability roots in + // the cleanup process. EmptyCommand *MEmptyCmd = nullptr; private: diff --git a/sycl/source/detail/scheduler/graph_processor.cpp b/sycl/source/detail/scheduler/graph_processor.cpp index bc7f813069f39..480072d3a8cd7 100644 --- a/sycl/source/detail/scheduler/graph_processor.cpp +++ b/sycl/source/detail/scheduler/graph_processor.cpp @@ -58,12 +58,6 @@ bool Scheduler::GraphProcessor::enqueueCommand(Command *Cmd, if (!Cmd || Cmd->isSuccessfullyEnqueued()) return true; - // Exit early if the command is blocked and the enqueue type is non-blocking - if (Cmd->isEnqueueBlocked() && !Blocking) { - EnqueueResult = EnqueueResultT(EnqueueResultT::SyclEnqueueBlocked, Cmd); - return false; - } - // Recursively enqueue all the dependencies first and // exit immediately if any of the commands cannot be enqueued. for (DepDesc &Dep : Cmd->MDeps) { @@ -71,6 +65,11 @@ bool Scheduler::GraphProcessor::enqueueCommand(Command *Cmd, return false; } + if (Cmd->isEnqueueBlocked() && !Blocking) { + EnqueueResult = EnqueueResultT(EnqueueResultT::SyclEnqueueBlocked, Cmd); + return false; + } + return Cmd->enqueue(EnqueueResult, Blocking); } diff --git a/sycl/test/host-interop-task/host-task-dependency2.cpp b/sycl/test/host-interop-task/host-task-dependency2.cpp new file mode 100644 index 0000000000000..0d162c9478ed1 --- /dev/null +++ b/sycl/test/host-interop-task/host-task-dependency2.cpp @@ -0,0 +1,97 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out + +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %CPU_RUN_PLACEHOLDER %t.out 10 +// RUN: %GPU_RUN_PLACEHOLDER %t.out 10 +// RUN: %ACC_RUN_PLACEHOLDER %t.out 10 + +#include +#include + +using namespace cl::sycl; +using namespace cl::sycl::access; + +static constexpr size_t BUFFER_SIZE = 1024; + +static auto EH = [](exception_list EL) { + for (const std::exception_ptr &E : EL) { + throw E; + } +}; + +// Host-task depending on another host-task via handler::depends_on() only +// should not hang +void test(size_t Count) { + queue Q(EH); + + static constexpr size_t BufferSize = 10 * 1024; + + buffer B0{range<1>{BufferSize}}; + buffer B1{range<1>{BufferSize}}; + buffer B2{range<1>{BufferSize}}; + buffer B3{range<1>{BufferSize}}; + buffer B4{range<1>{BufferSize}}; + buffer B5{range<1>{BufferSize}}; + + for (size_t Idx = 1; Idx <= Count; ++Idx) { + // This host task should be submitted without hesitation + event E1 = Q.submit([&](handler &CGH) { + std::cout << "Submit 1" << std::endl; + + auto Acc0 = B0.get_access(CGH); + auto Acc1 = B1.get_access(CGH); + auto Acc2 = B2.get_access(CGH); + + CGH.codeplay_host_task([=] { + Acc0[0] = 1 * Idx; + Acc1[0] = 2 * Idx; + Acc2[0] = 3 * Idx; + }); + }); + + // This host task is going to depend on blocked empty node of the first + // host-task (via buffer #2). Still this one should be enqueued. + event E2 = Q.submit([&](handler &CGH) { + std::cout << "Submit 2" << std::endl; + + auto Acc2 = B2.get_access(CGH); + auto Acc3 = B3.get_access(CGH); + + CGH.codeplay_host_task([=] { + Acc2[1] = 1 * Idx; + Acc3[1] = 2 * Idx; + }); + }); + + // This host-task only depends on the second host-task via + // handler::depends_on(). This one should not hang and should be eexecuted + // after host-task #2. + event E3 = Q.submit([&](handler &CGH) { + CGH.depends_on(E2); + + std::cout << "Submit 3" << std::endl; + + auto Acc4 = B4.get_access(CGH); + auto Acc5 = B5.get_access(CGH); + + CGH.codeplay_host_task([=] { + Acc4[2] = 1 * Idx; + Acc5[2] = 2 * Idx; + }); + }); + } + + Q.wait_and_throw(); +} + +int main(int Argc, const char *Argv[]) { + size_t Count = 1; + if (Argc > 1) + Count = std::stoi(Argv[1]); + + test(Count); + return 0; +} diff --git a/sycl/test/host-interop-task/host-task-dependency3.cpp b/sycl/test/host-interop-task/host-task-dependency3.cpp new file mode 100644 index 0000000000000..43a5a05448827 --- /dev/null +++ b/sycl/test/host-interop-task/host-task-dependency3.cpp @@ -0,0 +1,126 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out + +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %CPU_RUN_PLACEHOLDER %t.out 10 +// RUN: %GPU_RUN_PLACEHOLDER %t.out 10 +// RUN: %ACC_RUN_PLACEHOLDER %t.out 10 + +#include +#include +#include +#include + +using namespace cl::sycl; +using namespace cl::sycl::access; + +static constexpr size_t BUFFER_SIZE = 1024; + +static auto EH = [](exception_list EL) { + for (const std::exception_ptr &E : EL) { + throw E; + } +}; + +// Host-task depending on another host-task via handler::depends_on() only +// should not hang. A bit more complicated case with kernels depending on +// host-task being involved. +void test(size_t Count) { + queue Q(EH); + + static constexpr size_t BufferSize = 10 * 1024; + + buffer B0{range<1>{BufferSize}}; + buffer B1{range<1>{BufferSize}}; + buffer B2{range<1>{BufferSize}}; + buffer B3{range<1>{BufferSize}}; + buffer B4{range<1>{BufferSize}}; + buffer B5{range<1>{BufferSize}}; + + using namespace std::chrono_literals; + constexpr auto SleepFor = 1s; + + for (size_t Idx = 1; Idx <= Count; ++Idx) { + // This host task should be submitted without hesitation + Q.submit([&](handler &CGH) { + std::cout << "Submit HT-1" << std::endl; + + auto Acc0 = B0.get_access(CGH); + + CGH.codeplay_host_task([=] { + std::this_thread::sleep_for(SleepFor); + Acc0[0] = 1 * Idx; + }); + }); + + Q.submit([&](handler &CGH) { + std::cout << "Submit Kernel-1" << std::endl; + + auto Acc0 = B0.get_access(CGH); + + CGH.single_task([=] { Acc0[1] = 1 * Idx; }); + }); + + Q.submit([&](handler &CGH) { + std::cout << "Submit Kernel-2" << std::endl; + + auto Acc1 = B1.get_access(CGH); + + CGH.single_task([=] { Acc1[2] = 1 * Idx; }); + }); + + Q.submit([&](handler &CGH) { + std::cout << "Submit HT-2" << std::endl; + + auto Acc2 = B2.get_access(CGH); + + CGH.codeplay_host_task([=] { + std::this_thread::sleep_for(SleepFor); + Acc2[3] = 1 * Idx; + }); + }); + + // This host task is going to depend on blocked empty node of the second + // host-task (via buffer #0). Still this one should be enqueued. + event EHT3 = Q.submit([&](handler &CGH) { + std::cout << "Submit HT-3" << std::endl; + + auto Acc0 = B0.get_access(CGH); + auto Acc1 = B1.get_access(CGH); + auto Acc2 = B2.get_access(CGH); + + CGH.codeplay_host_task([=] { + std::this_thread::sleep_for(SleepFor); + Acc0[4] = 1 * Idx; + Acc1[4] = 2 * Idx; + Acc2[4] = 3 * Idx; + }); + }); + + // This host-task only depends on the third host-task via + // handler::depends_on(). This one should not hang and should be executed + // after host-task #3. + Q.submit([&](handler &CGH) { + std::cout << "Submit HT-4" << std::endl; + + CGH.depends_on(EHT3); + + auto Acc5 = B5.get_access(CGH); + + CGH.codeplay_host_task([=] { Acc5[5] = 1 * Idx; }); + }); + } + + Q.wait_and_throw(); +} + +int main(int Argc, const char *Argv[]) { + size_t Count = 1; + if (Argc > 1) + Count = std::stoi(Argv[1]); + + test(Count); + return 0; +} diff --git a/sycl/test/host-interop-task/host-task-dependency4.cpp b/sycl/test/host-interop-task/host-task-dependency4.cpp new file mode 100644 index 0000000000000..e95edc3c078a1 --- /dev/null +++ b/sycl/test/host-interop-task/host-task-dependency4.cpp @@ -0,0 +1,30 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out + +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include + +cl::sycl::event submit(cl::sycl::queue &Q, cl::sycl::buffer &B) { + return Q.submit([&](cl::sycl::handler &CGH) { + auto A = B.template get_access(CGH); + CGH.codeplay_host_task([=]() { (void)A; }); + }); +} + +int main() { + cl::sycl::queue Q; + int Status = 0; + cl::sycl::buffer A{&Status, 1}; + cl::sycl::vector_class Events; + + Events.push_back(submit(Q, A)); + Events.push_back(submit(Q, A)); + Q.submit([&](sycl::handler &CGH) { + CGH.depends_on(Events); + CGH.codeplay_host_task([&] { printf("all done\n"); }); + }).wait_and_throw(); + + return 0; +} diff --git a/sycl/test/host-interop-task/host-task.cpp b/sycl/test/host-interop-task/host-task.cpp index ca355bcb4b654..85b92e71e756d 100644 --- a/sycl/test/host-interop-task/host-task.cpp +++ b/sycl/test/host-interop-task/host-task.cpp @@ -11,10 +11,6 @@ // RUN: %GPU_RUN_PLACEHOLDER %t.out 3 // RUN: %ACC_RUN_PLACEHOLDER %t.out 3 -// RUNx: %CPU_RUN_PLACEHOLDER %t.out 4 -// RUNx: %GPU_RUN_PLACEHOLDER %t.out 4 -// RUNx: %ACC_RUN_PLACEHOLDER %t.out 4 - #include #include #include @@ -103,7 +99,6 @@ void test3() { std::vector Deps; - using namespace std::chrono_literals; static constexpr size_t Count = 10; auto Start = std::chrono::steady_clock::now(); @@ -146,74 +141,12 @@ void test3() { Q.wait_and_throw(); auto End = std::chrono::steady_clock::now(); + using namespace std::chrono_literals; constexpr auto Threshold = 2s; assert(End - Start < Threshold && "Host tasks were waiting for too long"); } -// Host-task depending on another host-task via handler::depends_on() only -// should not hang -void test4() { - queue Q(EH); - - static constexpr size_t BufferSize = 10 * 1024; - - buffer B0{range<1>{BufferSize}}; - buffer B1{range<1>{BufferSize}}; - buffer B2{range<1>{BufferSize}}; - buffer B3{range<1>{BufferSize}}; - buffer B4{range<1>{BufferSize}}; - buffer B5{range<1>{BufferSize}}; - - // This host task should be submitted without hesitation - event E1 = Q.submit([&](handler &CGH) { - std::cout << "Submit 1" << std::endl; - - auto Acc0 = B0.get_access(CGH); - auto Acc1 = B1.get_access(CGH); - auto Acc2 = B2.get_access(CGH); - - CGH.codeplay_host_task([=] { - Acc0[0] = 1; - Acc1[0] = 2; - Acc2[0] = 3; - }); - }); - - // This host task is going to depend on blocked empty node of the first - // host-task (via buffer #2). Still this one should be enqueued. - event E2 = Q.submit([&](handler &CGH) { - std::cout << "Submit 2" << std::endl; - - auto Acc2 = B2.get_access(CGH); - auto Acc3 = B3.get_access(CGH); - - CGH.codeplay_host_task([=] { - Acc2[1] = 1; - Acc3[1] = 2; - }); - }); - - // This host-task only depends on the second host-task via - // handler::depends_on(). This one should not hang and should be enqueued - // after host-task #2. - event E3 = Q.submit([&](handler &CGH) { - CGH.depends_on(E2); - - std::cout << "Submit 3" << std::endl; - - auto Acc4 = B4.get_access(CGH); - auto Acc5 = B5.get_access(CGH); - - CGH.codeplay_host_task([=] { - Acc4[2] = 1; - Acc5[2] = 2; - }); - }); - - Q.wait_and_throw(); -} - int main(int Argc, const char *Argv[]) { if (Argc < 2) return 1; @@ -230,9 +163,6 @@ int main(int Argc, const char *Argv[]) { case 3: test3(); break; - case 4: - test4(); - break; default: return 1; }