From 3ec79ba9d6f616dfcd278c9b7861616acb555550 Mon Sep 17 00:00:00 2001 From: Alexander Flegontov Date: Thu, 11 Jun 2020 12:49:17 +0300 Subject: [PATCH 01/11] [SYCL] Fix race that occurs when submitting to single queue in parallel Signed-off-by: Alexander Flegontov --- sycl/source/detail/scheduler/scheduler.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index d896dfc6be08b..2ece264f8d458 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -83,7 +83,8 @@ EventImplPtr Scheduler::addCG(std::unique_ptr CommandGroup, } { - std::shared_lock Lock(MGraphLock); + std::unique_lock Lock(MGraphLock, std::defer_lock); + lockSharedTimedMutex(Lock); // TODO: Check if lazy mode. EnqueueResultT Res; From 939dc8185da2045a28dacc283f03762510882a52 Mon Sep 17 00:00:00 2001 From: Alexander Flegontov Date: Thu, 11 Jun 2020 15:10:38 +0300 Subject: [PATCH 02/11] [SYCL] Apply comment from review about NewCmd Signed-off-by: Alexander Flegontov --- sycl/source/detail/scheduler/scheduler.cpp | 25 +++++++++++++--------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index 2ece264f8d458..f72e2a24ab6be 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -63,12 +63,13 @@ void Scheduler::waitForRecordToFinish(MemObjRecord *Record) { EventImplPtr Scheduler::addCG(std::unique_ptr CommandGroup, QueueImplPtr Queue) { - Command *NewCmd = nullptr; + EventImplPtr NewEvent = nullptr; const bool IsKernel = CommandGroup->getType() == CG::KERNEL; { std::unique_lock Lock(MGraphLock, std::defer_lock); lockSharedTimedMutex(Lock); + Command *NewCmd = nullptr; switch (CommandGroup->getType()) { case CG::UPDATE_HOST: NewCmd = MGraphBuilder.addCGUpdateHost(std::move(CommandGroup), @@ -80,23 +81,27 @@ EventImplPtr Scheduler::addCG(std::unique_ptr CommandGroup, default: NewCmd = MGraphBuilder.addCG(std::move(CommandGroup), std::move(Queue)); } + NewEvent = NewCmd->getEvent(); } { std::unique_lock Lock(MGraphLock, std::defer_lock); lockSharedTimedMutex(Lock); - // TODO: Check if lazy mode. - EnqueueResultT Res; - bool Enqueued = GraphProcessor::enqueueCommand(NewCmd, Res); - if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) - throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); - } + Command *Cmd = static_cast(NewEvent->getCommand()); + if (Cmd) { + // TODO: Check if lazy mode. 
+ EnqueueResultT Res; + bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res); + if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) + throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); - if (IsKernel) - ((ExecCGCommand *)NewCmd)->flushStreams(); + if (IsKernel) + ((ExecCGCommand *)Cmd)->flushStreams(); + } + } - return NewCmd->getEvent(); + return NewEvent; } EventImplPtr Scheduler::addCopyBack(Requirement *Req) { From edbbcf4971e3c64d502bd213c5f7395b3736f1a6 Mon Sep 17 00:00:00 2001 From: Alexander Flegontov Date: Thu, 11 Jun 2020 15:33:36 +0300 Subject: [PATCH 03/11] [SYCL] Added distinct lock for enqueue new command Signed-off-by: Alexander Flegontov --- sycl/source/detail/scheduler/scheduler.cpp | 2 +- sycl/source/detail/scheduler/scheduler.hpp | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index f72e2a24ab6be..462562cfa67bf 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -85,7 +85,7 @@ EventImplPtr Scheduler::addCG(std::unique_ptr CommandGroup, } { - std::unique_lock Lock(MGraphLock, std::defer_lock); + std::unique_lock Lock(MGraphLockEnqueue, std::defer_lock); lockSharedTimedMutex(Lock); Command *Cmd = static_cast(NewEvent->getCommand()); diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index b7d580e5d1823..a1ba128e26605 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -698,6 +698,8 @@ class Scheduler { // std::shared_mutex std::shared_timed_mutex MGraphLock; + std::shared_timed_mutex MGraphLockEnqueue; + QueueImplPtr DefaultHostQueue; friend class Command; From f1faabe49c7488b87c4ba2fcfcd02a30db71a1d9 Mon Sep 17 00:00:00 2001 From: Alexander Flegontov Date: Thu, 11 Jun 2020 17:35:42 +0300 Subject: [PATCH 04/11] [SYCL] Apply clang-format Signed-off-by: Alexander Flegontov --- sycl/source/detail/scheduler/scheduler.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index 462562cfa67bf..d43a43a432e55 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -85,7 +85,8 @@ EventImplPtr Scheduler::addCG(std::unique_ptr CommandGroup, } { - std::unique_lock Lock(MGraphLockEnqueue, std::defer_lock); + std::unique_lock Lock(MGraphLockEnqueue, + std::defer_lock); lockSharedTimedMutex(Lock); Command *Cmd = static_cast(NewEvent->getCommand()); From 301a0f1d70d716b91974fa115f029e450f128d13 Mon Sep 17 00:00:00 2001 From: Alexander Flegontov Date: Thu, 11 Jun 2020 22:43:00 +0300 Subject: [PATCH 05/11] [SYCL] Apply a single lock in Scheduler::addCG(), change locking in other places where enqueue takes place Signed-off-by: Alexander Flegontov --- sycl/source/detail/scheduler/commands.cpp | 5 +-- sycl/source/detail/scheduler/scheduler.cpp | 37 ++++++++-------------- sycl/source/detail/scheduler/scheduler.hpp | 2 -- 3 files changed, 17 insertions(+), 27 deletions(-) diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index a58c28f1487bb..4dfa2a06a9972 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -209,10 +209,11 @@ class DispatchHostTask { // process. 
Thus we'll copy deps prior to completing of event and unblocking // of empty command. // Also, it's possible to have record deallocated prior to enqueue process. - // Thus we employ read-lock of graph. { Scheduler &Sched = Scheduler::getInstance(); - std::shared_lock Lock(Sched.MGraphLock); + std::unique_lock Lock(Sched.MGraphLock, + std::defer_lock); + Sched.lockSharedTimedMutex(Lock); std::vector Deps = MThisCmd->MDeps; diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index d43a43a432e55..a91f5663fb1ff 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -63,13 +63,12 @@ void Scheduler::waitForRecordToFinish(MemObjRecord *Record) { EventImplPtr Scheduler::addCG(std::unique_ptr CommandGroup, QueueImplPtr Queue) { - EventImplPtr NewEvent = nullptr; + Command *NewCmd = nullptr; const bool IsKernel = CommandGroup->getType() == CG::KERNEL; { std::unique_lock Lock(MGraphLock, std::defer_lock); lockSharedTimedMutex(Lock); - Command *NewCmd = nullptr; switch (CommandGroup->getType()) { case CG::UPDATE_HOST: NewCmd = MGraphBuilder.addCGUpdateHost(std::move(CommandGroup), @@ -81,28 +80,18 @@ EventImplPtr Scheduler::addCG(std::unique_ptr CommandGroup, default: NewCmd = MGraphBuilder.addCG(std::move(CommandGroup), std::move(Queue)); } - NewEvent = NewCmd->getEvent(); - } - - { - std::unique_lock Lock(MGraphLockEnqueue, - std::defer_lock); - lockSharedTimedMutex(Lock); - Command *Cmd = static_cast(NewEvent->getCommand()); - if (Cmd) { - // TODO: Check if lazy mode. - EnqueueResultT Res; - bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res); - if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) - throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); - - if (IsKernel) - ((ExecCGCommand *)Cmd)->flushStreams(); - } + // TODO: Check if lazy mode. 
+ EnqueueResultT Res; + bool Enqueued = GraphProcessor::enqueueCommand(NewCmd, Res); + if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) + throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); } - return NewEvent; + if (IsKernel) + ((ExecCGCommand *)NewCmd)->flushStreams(); + + return NewCmd->getEvent(); } EventImplPtr Scheduler::addCopyBack(Requirement *Req) { @@ -144,7 +133,8 @@ std::vector Scheduler::getWaitList(EventImplPtr Event) { } void Scheduler::waitForEvent(EventImplPtr Event) { - std::shared_lock Lock(MGraphLock); + std::unique_lock Lock(MGraphLock, std::defer_lock); + lockSharedTimedMutex(Lock); GraphProcessor::waitForEvent(std::move(Event)); } @@ -195,7 +185,8 @@ EventImplPtr Scheduler::addHostAccessor(Requirement *Req) { void Scheduler::releaseHostAccessor(Requirement *Req) { Command *const BlockedCmd = Req->MBlockedCmd; - std::shared_lock Lock(MGraphLock); + std::unique_lock Lock(MGraphLock, std::defer_lock); + lockSharedTimedMutex(Lock); assert(BlockedCmd && "Can't find appropriate command to unblock"); diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index a1ba128e26605..b7d580e5d1823 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -698,8 +698,6 @@ class Scheduler { // std::shared_mutex std::shared_timed_mutex MGraphLock; - std::shared_timed_mutex MGraphLockEnqueue; - QueueImplPtr DefaultHostQueue; friend class Command; From 287026998685052c7900adb774115e6167fcd084 Mon Sep 17 00:00:00 2001 From: Alexander Flegontov Date: Mon, 15 Jun 2020 13:43:31 +0300 Subject: [PATCH 06/11] [SYCL] Fix hang in Lit tests Signed-off-by: Alexander Flegontov --- sycl/source/detail/scheduler/commands.cpp | 5 ++--- sycl/source/detail/scheduler/scheduler.cpp | 3 +-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 4dfa2a06a9972..a58c28f1487bb 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -209,11 +209,10 @@ class DispatchHostTask { // process. Thus we'll copy deps prior to completing of event and unblocking // of empty command. // Also, it's possible to have record deallocated prior to enqueue process. + // Thus we employ read-lock of graph. 
{ Scheduler &Sched = Scheduler::getInstance(); - std::unique_lock Lock(Sched.MGraphLock, - std::defer_lock); - Sched.lockSharedTimedMutex(Lock); + std::shared_lock Lock(Sched.MGraphLock); std::vector Deps = MThisCmd->MDeps; diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index a91f5663fb1ff..83c5ae6601547 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -133,8 +133,7 @@ std::vector Scheduler::getWaitList(EventImplPtr Event) { } void Scheduler::waitForEvent(EventImplPtr Event) { - std::unique_lock Lock(MGraphLock, std::defer_lock); - lockSharedTimedMutex(Lock); + std::shared_lock Lock(MGraphLock); GraphProcessor::waitForEvent(std::move(Event)); } From 2e4b1d38c9c42ef307161cf0f2d6ba5334768a5a Mon Sep 17 00:00:00 2001 From: Alexander Flegontov Date: Mon, 15 Jun 2020 15:01:17 +0300 Subject: [PATCH 07/11] [SYCL] Fix hang in Lit with Cuda Signed-off-by: Alexander Flegontov --- sycl/source/detail/scheduler/scheduler.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index 83c5ae6601547..617d27964f908 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -184,8 +184,7 @@ EventImplPtr Scheduler::addHostAccessor(Requirement *Req) { void Scheduler::releaseHostAccessor(Requirement *Req) { Command *const BlockedCmd = Req->MBlockedCmd; - std::unique_lock Lock(MGraphLock, std::defer_lock); - lockSharedTimedMutex(Lock); + std::shared_lock Lock(MGraphLock); assert(BlockedCmd && "Can't find appropriate command to unblock"); From 0bd1bf9e2fad6cc07696a89d2a76716c786b6de4 Mon Sep 17 00:00:00 2001 From: Alexander Flegontov Date: Mon, 22 Jun 2020 13:16:46 +0300 Subject: [PATCH 08/11] [SYCL] Add per-kernel mutex to fix the race when setting kernel parameters in parallel Signed-off-by: Alexander Flegontov --- sycl/source/detail/kernel_program_cache.hpp | 9 ++++++- sycl/source/detail/program_impl.cpp | 5 ++-- .../program_manager/program_manager.cpp | 26 +++++++++++-------- .../program_manager/program_manager.hpp | 6 ++--- sycl/source/detail/scheduler/commands.cpp | 15 ++++++++--- sycl/source/detail/scheduler/scheduler.cpp | 3 +++ 6 files changed, 43 insertions(+), 21 deletions(-) diff --git a/sycl/source/detail/kernel_program_cache.hpp b/sycl/source/detail/kernel_program_cache.hpp index bc2e74800de8f..24bc38ae7e241 100644 --- a/sycl/source/detail/kernel_program_cache.hpp +++ b/sycl/source/detail/kernel_program_cache.hpp @@ -58,8 +58,15 @@ class KernelProgramCache { using ContextPtr = context_impl *; using PiKernelT = std::remove_pointer::type; + + struct BuildResultKernel : public BuildResult { + std::mutex MKernelMutex; + + BuildResultKernel(PiKernelT *P, int S) : BuildResult(P, S) {} + }; + using PiKernelPtrT = std::atomic; - using KernelWithBuildStateT = BuildResult; + using KernelWithBuildStateT = BuildResultKernel; using KernelByNameT = std::map; using KernelCacheT = std::map; diff --git a/sycl/source/detail/program_impl.cpp b/sycl/source/detail/program_impl.cpp index a423b2fac9f43..ac8276c1bd43f 100644 --- a/sycl/source/detail/program_impl.cpp +++ b/sycl/source/detail/program_impl.cpp @@ -396,8 +396,9 @@ RT::PiKernel program_impl::get_pi_kernel(const string_class &KernelName) const { RT::PiKernel Kernel; if (is_cacheable()) { - Kernel = ProgramManager::getInstance().getOrCreateKernel( - MProgramModuleHandle, get_context(), KernelName, 
this); + std::tie(Kernel, std::ignore) = + ProgramManager::getInstance().getOrCreateKernel( + MProgramModuleHandle, get_context(), KernelName, this); getPlugin().call(Kernel); } else { const detail::plugin &Plugin = getPlugin(); diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index 44376f2236cfb..9eada72a89681 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -167,8 +167,9 @@ RetT *waitUntilBuilt(KernelProgramCache &Cache, /// cache. Accepts nothing. Return pointer to built entity. template -RetT *getOrBuild(KernelProgramCache &KPCache, KeyT &&CacheKey, - AcquireFT &&Acquire, GetCacheFT &&GetCache, BuildFT &&Build) { +KernelProgramCache::BuildResult * +getOrBuild(KernelProgramCache &KPCache, KeyT &&CacheKey, AcquireFT &&Acquire, + GetCacheFT &&GetCache, BuildFT &&Build) { bool InsertionTookPlace; KernelProgramCache::BuildResult *BuildResult; @@ -190,7 +191,7 @@ RetT *getOrBuild(KernelProgramCache &KPCache, KeyT &&CacheKey, RetT *Result = waitUntilBuilt(KPCache, BuildResult); if (Result) - return Result; + return BuildResult; // Previous build is failed. There was no SYCL exception though. // We might try to build once more. @@ -220,7 +221,7 @@ RetT *getOrBuild(KernelProgramCache &KPCache, KeyT &&CacheKey, KPCache.notifyAllBuild(); - return Desired; + return BuildResult; } catch (const exception &Ex) { BuildResult->Error.Msg = Ex.what(); BuildResult->Error.Code = Ex.get_cl_code(); @@ -400,14 +401,15 @@ RT::PiProgram ProgramManager::getBuiltPIProgram(OSModuleHandle M, if (Prg) Prg->stableSerializeSpecConstRegistry(SpecConsts); - return getOrBuild( + auto BuildResult = getOrBuild( Cache, KeyT(std::move(SpecConsts), KSId), AcquireF, GetF, BuildF); + return BuildResult->Ptr.load(); } -RT::PiKernel ProgramManager::getOrCreateKernel(OSModuleHandle M, - const context &Context, - const string_class &KernelName, - const program_impl *Prg) { +std::pair +ProgramManager::getOrCreateKernel(OSModuleHandle M, const context &Context, + const string_class &KernelName, + const program_impl *Prg) { if (DbgProgMgr > 0) { std::cerr << ">>> ProgramManager::getOrCreateKernel(" << M << ", " << getRawSyclObjImpl(Context) << ", " << KernelName << ")\n"; @@ -441,8 +443,10 @@ RT::PiKernel ProgramManager::getOrCreateKernel(OSModuleHandle M, return Result; }; - return getOrBuild(Cache, KernelName, - AcquireF, GetF, BuildF); + auto BuildResult = static_cast( + getOrBuild(Cache, KernelName, AcquireF, + GetF, BuildF)); + return std::make_pair(BuildResult->Ptr.load(), &(BuildResult->MKernelMutex)); } RT::PiProgram diff --git a/sycl/source/detail/program_manager/program_manager.hpp b/sycl/source/detail/program_manager/program_manager.hpp index 2e6933cb3e841..33fbbb23d48af 100644 --- a/sycl/source/detail/program_manager/program_manager.hpp +++ b/sycl/source/detail/program_manager/program_manager.hpp @@ -77,9 +77,9 @@ class ProgramManager { RT::PiProgram getBuiltPIProgram(OSModuleHandle M, const context &Context, const string_class &KernelName, const program_impl *Prg = nullptr); - RT::PiKernel getOrCreateKernel(OSModuleHandle M, const context &Context, - const string_class &KernelName, - const program_impl *Prg); + std::pair + getOrCreateKernel(OSModuleHandle M, const context &Context, + const string_class &KernelName, const program_impl *Prg); RT::PiProgram getPiProgramFromPiKernel(RT::PiKernel Kernel, const ContextImplPtr Context); diff --git 
a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index a58c28f1487bb..af3b06359952c 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -1799,15 +1799,19 @@ cl_int ExecCGCommand::enqueueImp() { sycl::context Context = MQueue->get_context(); const detail::plugin &Plugin = MQueue->getPlugin(); RT::PiKernel Kernel = nullptr; + std::mutex *KernelMutex = nullptr; if (nullptr != ExecKernel->MSyclKernel) { assert(ExecKernel->MSyclKernel->get_info() == Context); Kernel = ExecKernel->MSyclKernel->getHandleRef(); - } else - Kernel = detail::ProgramManager::getInstance().getOrCreateKernel( - ExecKernel->MOSModuleHandle, Context, ExecKernel->MKernelName, - nullptr); + } else { + std::tie(Kernel, KernelMutex) = + detail::ProgramManager::getInstance().getOrCreateKernel( + ExecKernel->MOSModuleHandle, Context, ExecKernel->MKernelName, + nullptr); + KernelMutex->lock(); + } for (ArgDesc &Arg : ExecKernel->MArgs) { switch (Arg.MType) { @@ -1863,6 +1867,9 @@ cl_int ExecCGCommand::enqueueImp() { &NDRDesc.GlobalSize[0], HasLocalSize ? &NDRDesc.LocalSize[0] : nullptr, RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0], &Event); + if (KernelMutex != nullptr) + KernelMutex->unlock(); + if (PI_SUCCESS != Error) { // If we have got non-success error code, let's analyze it to emit nice // exception explaining what was wrong diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index 617d27964f908..8aa1d0bcba62d 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -80,7 +80,10 @@ EventImplPtr Scheduler::addCG(std::unique_ptr CommandGroup, default: NewCmd = MGraphBuilder.addCG(std::move(CommandGroup), std::move(Queue)); } + } + { + std::shared_lock Lock(MGraphLock); // TODO: Check if lazy mode. 
EnqueueResultT Res; bool Enqueued = GraphProcessor::enqueueCommand(NewCmd, Res); From fcb4e6cda3e45672e7a74fcebf946c34097885c4 Mon Sep 17 00:00:00 2001 From: Alexander Flegontov Date: Tue, 23 Jun 2020 13:35:21 +0300 Subject: [PATCH 09/11] [SYCL] Use lock_guard to lock mutex Signed-off-by: Alexander Flegontov --- sycl/source/detail/scheduler/commands.cpp | 109 +++++++++++---------- sycl/source/detail/scheduler/scheduler.cpp | 1 + 2 files changed, 60 insertions(+), 50 deletions(-) diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 8131b997e8f5e..3f9983657ef23 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -1797,7 +1797,6 @@ cl_int ExecCGCommand::enqueueImp() { // Run OpenCL kernel sycl::context Context = MQueue->get_context(); - const detail::plugin &Plugin = MQueue->getPlugin(); RT::PiKernel Kernel = nullptr; std::mutex *KernelMutex = nullptr; @@ -1810,65 +1809,75 @@ cl_int ExecCGCommand::enqueueImp() { detail::ProgramManager::getInstance().getOrCreateKernel( ExecKernel->MOSModuleHandle, Context, ExecKernel->MKernelName, nullptr); - KernelMutex->lock(); } - for (ArgDesc &Arg : ExecKernel->MArgs) { - switch (Arg.MType) { - case kernel_param_kind_t::kind_accessor: { - Requirement *Req = (Requirement *)(Arg.MPtr); - AllocaCommandBase *AllocaCmd = getAllocaForReq(Req); - RT::PiMem MemArg = (RT::PiMem)AllocaCmd->getMemAllocation(); - if (Plugin.getBackend() == backend::opencl) { + auto SetKernelParamsAndLaunch = [this, &ExecKernel, &Kernel, &NDRDesc, + &RawEvents, &Event] { + const detail::plugin &Plugin = MQueue->getPlugin(); + for (ArgDesc &Arg : ExecKernel->MArgs) { + switch (Arg.MType) { + case kernel_param_kind_t::kind_accessor: { + Requirement *Req = (Requirement *)(Arg.MPtr); + AllocaCommandBase *AllocaCmd = getAllocaForReq(Req); + RT::PiMem MemArg = (RT::PiMem)AllocaCmd->getMemAllocation(); + if (Plugin.getBackend() == backend::opencl) { + Plugin.call(Kernel, Arg.MIndex, + sizeof(RT::PiMem), &MemArg); + } else { + Plugin.call(Kernel, Arg.MIndex, + &MemArg); + } + break; + } + case kernel_param_kind_t::kind_std_layout: { + Plugin.call(Kernel, Arg.MIndex, Arg.MSize, + Arg.MPtr); + break; + } + case kernel_param_kind_t::kind_sampler: { + sampler *SamplerPtr = (sampler *)Arg.MPtr; + RT::PiSampler Sampler = + detail::getSyclObjImpl(*SamplerPtr) + ->getOrCreateSampler(MQueue->get_context()); Plugin.call(Kernel, Arg.MIndex, - sizeof(RT::PiMem), &MemArg); - } else { - Plugin.call(Kernel, Arg.MIndex, - &MemArg); + sizeof(cl_sampler), &Sampler); + break; + } + case kernel_param_kind_t::kind_pointer: { + Plugin.call(Kernel, Arg.MIndex, + Arg.MSize, Arg.MPtr); + break; + } } - break; - } - case kernel_param_kind_t::kind_std_layout: { - Plugin.call(Kernel, Arg.MIndex, Arg.MSize, - Arg.MPtr); - break; - } - case kernel_param_kind_t::kind_sampler: { - sampler *SamplerPtr = (sampler *)Arg.MPtr; - RT::PiSampler Sampler = - detail::getSyclObjImpl(*SamplerPtr)->getOrCreateSampler(Context); - Plugin.call(Kernel, Arg.MIndex, - sizeof(cl_sampler), &Sampler); - break; - } - case kernel_param_kind_t::kind_pointer: { - Plugin.call(Kernel, Arg.MIndex, - Arg.MSize, Arg.MPtr); - break; - } } - } - - adjustNDRangePerKernel(NDRDesc, Kernel, - *(detail::getSyclObjImpl(MQueue->get_device()))); - // Some PI Plugins (like OpenCL) require this call to enable USM - // For others, PI will turn this into a NOP. 
- Plugin.call(Kernel, PI_USM_INDIRECT_ACCESS, - sizeof(pi_bool), &PI_TRUE); + adjustNDRangePerKernel(NDRDesc, Kernel, + *(detail::getSyclObjImpl(MQueue->get_device()))); - // Remember this information before the range dimensions are reversed - const bool HasLocalSize = (NDRDesc.LocalSize[0] != 0); + // Some PI Plugins (like OpenCL) require this call to enable USM + // For others, PI will turn this into a NOP. + Plugin.call( + Kernel, PI_USM_INDIRECT_ACCESS, sizeof(pi_bool), &PI_TRUE); - ReverseRangeDimensionsForKernel(NDRDesc); + // Remember this information before the range dimensions are reversed + const bool HasLocalSize = (NDRDesc.LocalSize[0] != 0); - pi_result Error = Plugin.call_nocheck( - MQueue->getHandleRef(), Kernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0], - &NDRDesc.GlobalSize[0], HasLocalSize ? &NDRDesc.LocalSize[0] : nullptr, - RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0], &Event); + ReverseRangeDimensionsForKernel(NDRDesc); + pi_result Error = Plugin.call_nocheck( + MQueue->getHandleRef(), Kernel, NDRDesc.Dims, + &NDRDesc.GlobalOffset[0], &NDRDesc.GlobalSize[0], + HasLocalSize ? &NDRDesc.LocalSize[0] : nullptr, RawEvents.size(), + RawEvents.empty() ? nullptr : &RawEvents[0], &Event); + return Error; + }; - if (KernelMutex != nullptr) - KernelMutex->unlock(); + pi_result Error = PI_SUCCESS; + if (KernelMutex != nullptr) { + std::lock_guard Lock(*KernelMutex); + Error = SetKernelParamsAndLaunch(); + } else { + Error = SetKernelParamsAndLaunch(); + } if (PI_SUCCESS != Error) { // If we have got non-success error code, let's analyze it to emit nice diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index 8aa1d0bcba62d..d896dfc6be08b 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -84,6 +84,7 @@ EventImplPtr Scheduler::addCG(std::unique_ptr CommandGroup, { std::shared_lock Lock(MGraphLock); + // TODO: Check if lazy mode. EnqueueResultT Res; bool Enqueued = GraphProcessor::enqueueCommand(NewCmd, Res); From 0dca88f2588ffe4aa84059e84c3f94512042f308 Mon Sep 17 00:00:00 2001 From: Alexander Flegontov Date: Wed, 24 Jun 2020 14:45:52 +0300 Subject: [PATCH 10/11] [SYCL] Use per-kernel mutex for all cases when the kernel is cacheable. Signed-off-by: Alexander Flegontov --- sycl/source/detail/program_impl.hpp | 6 +++--- sycl/source/detail/scheduler/commands.cpp | 13 +++++++++++++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/sycl/source/detail/program_impl.hpp b/sycl/source/detail/program_impl.hpp index 4205e806a6a91..c15cfc8eee4ff 100644 --- a/sycl/source/detail/program_impl.hpp +++ b/sycl/source/detail/program_impl.hpp @@ -318,6 +318,9 @@ class program_impl { /// Tells whether a specialization constant has been set for this program. bool hasSetSpecConstants() const { return !SpecConstRegistry.empty(); } + /// \return true if caching is allowed for this program. + bool is_cacheable() const { return MProgramAndKernelCachingAllowed; } + private: // Deligating Constructor used in Implementation. program_impl(ContextImplPtr Context, pi_native_handle InteropProgram, @@ -368,9 +371,6 @@ class program_impl { /// \return a vector of devices managed by the plugin. vector_class get_pi_devices() const; - /// \return true if caching is allowed for this program. - bool is_cacheable() const { return MProgramAndKernelCachingAllowed; } - /// \param Options is a string containing OpenCL C build options. 
/// \return true if caching is allowed for this program and build options. static bool is_cacheable_with_options(const string_class &Options) { diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 3f9983657ef23..2fbed5c573a5d 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -1804,6 +1804,18 @@ cl_int ExecCGCommand::enqueueImp() { assert(ExecKernel->MSyclKernel->get_info() == Context); Kernel = ExecKernel->MSyclKernel->getHandleRef(); + + auto SyclProg = detail::getSyclObjImpl( + ExecKernel->MSyclKernel->get_info()); + if (SyclProg->is_cacheable()) { + RT::PiKernel FoundKernel = nullptr; + std::tie(FoundKernel, KernelMutex) = + detail::ProgramManager::getInstance().getOrCreateKernel( + ExecKernel->MOSModuleHandle, + ExecKernel->MSyclKernel->get_info(), + ExecKernel->MKernelName, SyclProg.get()); + assert(FoundKernel == Kernel); + } } else { std::tie(Kernel, KernelMutex) = detail::ProgramManager::getInstance().getOrCreateKernel( @@ -1873,6 +1885,7 @@ cl_int ExecCGCommand::enqueueImp() { pi_result Error = PI_SUCCESS; if (KernelMutex != nullptr) { + // For cacheable kernels, we use per-kernel mutex std::lock_guard Lock(*KernelMutex); Error = SetKernelParamsAndLaunch(); } else { From 12502ae9a1a025a0433fae97647ed586c05905d2 Mon Sep 17 00:00:00 2001 From: Alexander Flegontov Date: Fri, 26 Jun 2020 12:15:11 +0300 Subject: [PATCH 11/11] [SYCL] Refactoring of lambda into a separate function Signed-off-by: Alexander Flegontov --- sycl/source/detail/scheduler/commands.cpp | 125 +++++++++++----------- sycl/source/detail/scheduler/commands.hpp | 5 + 2 files changed, 68 insertions(+), 62 deletions(-) diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 2fbed5c573a5d..d8fddb4b007c7 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -1617,6 +1617,65 @@ static void ReverseRangeDimensionsForKernel(NDRDescT &NDR) { } } +pi_result ExecCGCommand::SetKernelParamsAndLaunch( + CGExecKernel *ExecKernel, RT::PiKernel Kernel, NDRDescT &NDRDesc, + std::vector &RawEvents, RT::PiEvent &Event) { + const detail::plugin &Plugin = MQueue->getPlugin(); + for (ArgDesc &Arg : ExecKernel->MArgs) { + switch (Arg.MType) { + case kernel_param_kind_t::kind_accessor: { + Requirement *Req = (Requirement *)(Arg.MPtr); + AllocaCommandBase *AllocaCmd = getAllocaForReq(Req); + RT::PiMem MemArg = (RT::PiMem)AllocaCmd->getMemAllocation(); + if (Plugin.getBackend() == backend::opencl) { + Plugin.call(Kernel, Arg.MIndex, + sizeof(RT::PiMem), &MemArg); + } else { + Plugin.call(Kernel, Arg.MIndex, + &MemArg); + } + break; + } + case kernel_param_kind_t::kind_std_layout: { + Plugin.call(Kernel, Arg.MIndex, Arg.MSize, + Arg.MPtr); + break; + } + case kernel_param_kind_t::kind_sampler: { + sampler *SamplerPtr = (sampler *)Arg.MPtr; + RT::PiSampler Sampler = detail::getSyclObjImpl(*SamplerPtr) + ->getOrCreateSampler(MQueue->get_context()); + Plugin.call(Kernel, Arg.MIndex, + sizeof(cl_sampler), &Sampler); + break; + } + case kernel_param_kind_t::kind_pointer: { + Plugin.call(Kernel, Arg.MIndex, + Arg.MSize, Arg.MPtr); + break; + } + } + } + + adjustNDRangePerKernel(NDRDesc, Kernel, + *(detail::getSyclObjImpl(MQueue->get_device()))); + + // Some PI Plugins (like OpenCL) require this call to enable USM + // For others, PI will turn this into a NOP. 
+ Plugin.call(Kernel, PI_USM_INDIRECT_ACCESS, + sizeof(pi_bool), &PI_TRUE); + + // Remember this information before the range dimensions are reversed + const bool HasLocalSize = (NDRDesc.LocalSize[0] != 0); + + ReverseRangeDimensionsForKernel(NDRDesc); + pi_result Error = Plugin.call_nocheck( + MQueue->getHandleRef(), Kernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0], + &NDRDesc.GlobalSize[0], HasLocalSize ? &NDRDesc.LocalSize[0] : nullptr, + RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0], &Event); + return Error; +} + // The function initialize accessors and calls lambda. // The function is used as argument to piEnqueueNativeKernel which requires // that the passed function takes one void* argument. @@ -1823,73 +1882,15 @@ cl_int ExecCGCommand::enqueueImp() { nullptr); } - auto SetKernelParamsAndLaunch = [this, &ExecKernel, &Kernel, &NDRDesc, - &RawEvents, &Event] { - const detail::plugin &Plugin = MQueue->getPlugin(); - for (ArgDesc &Arg : ExecKernel->MArgs) { - switch (Arg.MType) { - case kernel_param_kind_t::kind_accessor: { - Requirement *Req = (Requirement *)(Arg.MPtr); - AllocaCommandBase *AllocaCmd = getAllocaForReq(Req); - RT::PiMem MemArg = (RT::PiMem)AllocaCmd->getMemAllocation(); - if (Plugin.getBackend() == backend::opencl) { - Plugin.call(Kernel, Arg.MIndex, - sizeof(RT::PiMem), &MemArg); - } else { - Plugin.call(Kernel, Arg.MIndex, - &MemArg); - } - break; - } - case kernel_param_kind_t::kind_std_layout: { - Plugin.call(Kernel, Arg.MIndex, Arg.MSize, - Arg.MPtr); - break; - } - case kernel_param_kind_t::kind_sampler: { - sampler *SamplerPtr = (sampler *)Arg.MPtr; - RT::PiSampler Sampler = - detail::getSyclObjImpl(*SamplerPtr) - ->getOrCreateSampler(MQueue->get_context()); - Plugin.call(Kernel, Arg.MIndex, - sizeof(cl_sampler), &Sampler); - break; - } - case kernel_param_kind_t::kind_pointer: { - Plugin.call(Kernel, Arg.MIndex, - Arg.MSize, Arg.MPtr); - break; - } - } - } - - adjustNDRangePerKernel(NDRDesc, Kernel, - *(detail::getSyclObjImpl(MQueue->get_device()))); - - // Some PI Plugins (like OpenCL) require this call to enable USM - // For others, PI will turn this into a NOP. - Plugin.call( - Kernel, PI_USM_INDIRECT_ACCESS, sizeof(pi_bool), &PI_TRUE); - - // Remember this information before the range dimensions are reversed - const bool HasLocalSize = (NDRDesc.LocalSize[0] != 0); - - ReverseRangeDimensionsForKernel(NDRDesc); - pi_result Error = Plugin.call_nocheck( - MQueue->getHandleRef(), Kernel, NDRDesc.Dims, - &NDRDesc.GlobalOffset[0], &NDRDesc.GlobalSize[0], - HasLocalSize ? &NDRDesc.LocalSize[0] : nullptr, RawEvents.size(), - RawEvents.empty() ? 
nullptr : &RawEvents[0], &Event); - return Error; - }; - pi_result Error = PI_SUCCESS; if (KernelMutex != nullptr) { // For cacheable kernels, we use per-kernel mutex std::lock_guard Lock(*KernelMutex); - Error = SetKernelParamsAndLaunch(); + Error = SetKernelParamsAndLaunch(ExecKernel, Kernel, NDRDesc, RawEvents, + Event); } else { - Error = SetKernelParamsAndLaunch(); + Error = SetKernelParamsAndLaunch(ExecKernel, Kernel, NDRDesc, RawEvents, + Event); } if (PI_SUCCESS != Error) { diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index b809c1503a397..76542bf4d1fa6 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -482,6 +482,11 @@ class ExecCGCommand : public Command { AllocaCommandBase *getAllocaForReq(Requirement *Req); + pi_result SetKernelParamsAndLaunch(CGExecKernel *ExecKernel, + RT::PiKernel Kernel, NDRDescT &NDRDesc, + std::vector &RawEvents, + RT::PiEvent &Event); + std::unique_ptr MCommandGroup; friend class Command;
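
The net effect of the series above is twofold: Scheduler::addCG() builds the command graph under the graph lock and then enqueues the new command under a read lock, and setting kernel arguments plus launching is serialized by a per-kernel mutex (MKernelMutex in BuildResultKernel) obtained from ProgramManager::getOrCreateKernel(), so concurrent submissions of the same cached kernel no longer race on piKernelSetArg. What follows is a minimal, self-contained C++ sketch of that per-kernel-mutex pattern only; the type and function names (CachedKernel, getOrCreate, setArgsAndLaunch) are placeholders invented for illustration, and this is not the SYCL runtime code.

#include <map>
#include <mutex>
#include <string>
#include <thread>
#include <vector>

// Stand-in for a cached kernel object shared between submitting threads.
struct CachedKernel {
  std::mutex Mutex;      // per-kernel mutex, analogous to MKernelMutex
  std::vector<int> Args; // pretend argument slots (piKernelSetArg analogue)
};

std::map<std::string, CachedKernel> Cache; // kernel cache keyed by name
std::mutex CacheMutex;                     // guards cache lookup/insertion

CachedKernel &getOrCreate(const std::string &Name) {
  std::lock_guard<std::mutex> L(CacheMutex);
  return Cache[Name]; // one entry (and one mutex) per kernel name
}

void setArgsAndLaunch(CachedKernel &K, int Value) {
  // Hold the per-kernel mutex across "set args" and "launch" so two threads
  // cannot interleave argument updates on the same shared kernel object.
  std::lock_guard<std::mutex> L(K.Mutex);
  K.Args.assign(4, Value); // argument setting happens under the lock
  // ... the launch call would run here, still under K.Mutex
}

int main() {
  std::vector<std::thread> Threads;
  for (int I = 0; I < 8; ++I)
    Threads.emplace_back([I] { setArgsAndLaunch(getOrCreate("saxpy"), I); });
  for (std::thread &T : Threads)
    T.join();
  return 0;
}

Holding one mutex per cached kernel, rather than a single global lock, mirrors the trade-off the later patches make: submissions of different kernels still proceed in parallel, while the argument-setting/launch window of any one kernel is made atomic.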