Skip to content

[SYCL] Build proper barrier deps if host task is involved in pipeline #13094

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 26 commits into from
May 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
9f15ae2
Revert "[SYCL] Fix unused variable warnings (#12523)"
KseniyaTikhomirova Mar 15, 2024
706fc20
Extend deps structure
KseniyaTikhomirova Mar 15, 2024
3ab87d6
Adapt changes to current code base. Buils is failing due to producesP…
KseniyaTikhomirova Mar 18, 2024
ce69929
Fix build
KseniyaTikhomirova Mar 19, 2024
280ccb4
fix braces
KseniyaTikhomirova Mar 21, 2024
e6fd2ad
Fix recursive mutex lock
KseniyaTikhomirova Mar 21, 2024
c79e215
fix clang-format
KseniyaTikhomirova Mar 21, 2024
5625070
update linux symbols
KseniyaTikhomirova Mar 21, 2024
4d6fb63
Merge branch 'sycl' into barrier_rebase
KseniyaTikhomirova Mar 21, 2024
9080bcc
Merge branch 'sycl' into barrier_rebase
KseniyaTikhomirova Mar 28, 2024
69c53e9
Draft
KseniyaTikhomirova Apr 2, 2024
2640855
fix idea
KseniyaTikhomirova Apr 9, 2024
6454bc4
Fix logic
KseniyaTikhomirova Apr 12, 2024
81e295a
fix clang format
KseniyaTikhomirova Apr 12, 2024
8c5dd95
Merge branch 'sycl' into barrier_rebase
KseniyaTikhomirova Apr 12, 2024
45914f9
Cleanup
KseniyaTikhomirova Apr 12, 2024
2b47556
Add unittests
KseniyaTikhomirova Apr 15, 2024
1646766
add test for barrier with wait list
KseniyaTikhomirova Apr 15, 2024
7590a5d
fix issue
KseniyaTikhomirova Apr 15, 2024
2bee189
FIx clang-format
KseniyaTikhomirova Apr 16, 2024
d249da7
Merge branch 'sycl' into barrier_rebase
KseniyaTikhomirova Apr 16, 2024
df17b54
Update Win symbols
KseniyaTikhomirova Apr 18, 2024
aed5f81
Fix seg fault
KseniyaTikhomirova Apr 19, 2024
a9eb8a3
fix comments, part1
KseniyaTikhomirova May 6, 2024
166ffd4
Merge branch 'sycl' into barrier_rebase
KseniyaTikhomirova May 6, 2024
cfb2ef0
fix comments, part 2
KseniyaTikhomirova May 6, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 12 additions & 8 deletions sycl/include/sycl/handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1558,8 +1558,7 @@ class __SYCL_EXPORT handler {
nullptr,
ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
#endif
__SYCL_KERNEL_ATTR__ void
kernel_single_task(_KERNELFUNCPARAM(KernelFunc)) {
__SYCL_KERNEL_ATTR__ void kernel_single_task(_KERNELFUNCPARAM(KernelFunc)) {
#ifdef __SYCL_DEVICE_ONLY__
KernelFunc();
#else
Expand All @@ -1577,8 +1576,8 @@ class __SYCL_EXPORT handler {
nullptr,
ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
#endif
__SYCL_KERNEL_ATTR__ void
kernel_single_task(_KERNELFUNCPARAM(KernelFunc), kernel_handler KH) {
__SYCL_KERNEL_ATTR__ void kernel_single_task(_KERNELFUNCPARAM(KernelFunc),
kernel_handler KH) {
#ifdef __SYCL_DEVICE_ONLY__
KernelFunc(KH);
#else
Expand All @@ -1596,8 +1595,7 @@ class __SYCL_EXPORT handler {
ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
#endif
__SYCL_KERNEL_ATTR__ void
kernel_parallel_for(_KERNELFUNCPARAM(KernelFunc)) {
__SYCL_KERNEL_ATTR__ void kernel_parallel_for(_KERNELFUNCPARAM(KernelFunc)) {
#ifdef __SYCL_DEVICE_ONLY__
KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()));
#else
Expand All @@ -1614,8 +1612,8 @@ class __SYCL_EXPORT handler {
ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
#endif
__SYCL_KERNEL_ATTR__ void
kernel_parallel_for(_KERNELFUNCPARAM(KernelFunc), kernel_handler KH) {
__SYCL_KERNEL_ATTR__ void kernel_parallel_for(_KERNELFUNCPARAM(KernelFunc),
kernel_handler KH) {
#ifdef __SYCL_DEVICE_ONLY__
KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()), KH);
#else
Expand Down Expand Up @@ -3652,6 +3650,12 @@ class __SYCL_EXPORT handler {

// Set that an ND Range was used during a call to parallel_for
void setNDRangeUsed(bool Value);

protected:
/// Registers event dependencies in this command group.
void depends_on(const detail::EventImplPtr &Event);
/// Registers event dependencies in this command group.
void depends_on(const std::vector<detail::EventImplPtr> &Events);
};
} // namespace _V1
} // namespace sycl
9 changes: 9 additions & 0 deletions sycl/source/detail/event_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,11 @@ class event_impl {
/// \return true if this event is complete.
bool isCompleted();

/// Checks if associated command is enqueued
///
/// \return true if command passed enqueue
bool isEnqueued() const noexcept { return MIsEnqueued; };

void attachEventToComplete(const EventImplPtr &Event) {
std::lock_guard<std::mutex> Lock(MMutex);
MPostCompleteEvents.push_back(Event);
Expand Down Expand Up @@ -338,6 +343,8 @@ class event_impl {
return MPostCompleteEvents;
}

void setEnqueued() { MIsEnqueued = true; }

protected:
// When instrumentation is enabled emits trace event for event wait begin and
// returns the telemetry event generated for the wait
Expand Down Expand Up @@ -404,6 +411,8 @@ class event_impl {
friend std::vector<sycl::detail::pi::PiEvent>
getOrWaitEvents(std::vector<sycl::event> DepEvents,
std::shared_ptr<sycl::detail::context_impl> Context);

std::atomic_bool MIsEnqueued{false};
};

} // namespace detail
Expand Down
55 changes: 44 additions & 11 deletions sycl/source/detail/queue_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,8 @@ queue_impl::getExtendDependencyList(const std::vector<event> &DepEvents,
return DepEvents;

QueueLock.lock();
EventImplPtr ExtraEvent =
MGraph.expired() ? MLastEventPtr : MGraphLastEventPtr;
EventImplPtr ExtraEvent = MGraph.expired() ? MDefaultGraphDeps.LastEventPtr
: MExtGraphDeps.LastEventPtr;
std::optional<event> ExternalEvent = popExternalEvent();

if (!ExternalEvent && !ExtraEvent)
Expand Down Expand Up @@ -271,11 +271,11 @@ event queue_impl::getLastEvent() {
std::lock_guard<std::mutex> Lock{MMutex};
if (MDiscardEvents)
return createDiscardedEvent();
if (!MGraph.expired() && MGraphLastEventPtr)
return detail::createSyclObjFromImpl<event>(MGraphLastEventPtr);
if (!MLastEventPtr)
MLastEventPtr = std::make_shared<event_impl>(std::nullopt);
return detail::createSyclObjFromImpl<event>(MLastEventPtr);
if (!MGraph.expired() && MExtGraphDeps.LastEventPtr)
return detail::createSyclObjFromImpl<event>(MExtGraphDeps.LastEventPtr);
if (!MDefaultGraphDeps.LastEventPtr)
MDefaultGraphDeps.LastEventPtr = std::make_shared<event_impl>(std::nullopt);
return detail::createSyclObjFromImpl<event>(MDefaultGraphDeps.LastEventPtr);
}

void queue_impl::addEvent(const event &Event) {
Expand Down Expand Up @@ -376,8 +376,8 @@ event queue_impl::submitMemOpHelper(const std::shared_ptr<queue_impl> &Self,
return MDiscardEvents ? createDiscardedEvent() : event();

if (isInOrder()) {
auto &EventToStoreIn =
MGraph.expired() ? MLastEventPtr : MGraphLastEventPtr;
auto &EventToStoreIn = MGraph.expired() ? MDefaultGraphDeps.LastEventPtr
: MExtGraphDeps.LastEventPtr;
EventToStoreIn = EventImpl;
}
// Track only if we won't be able to handle it with piQueueFinish.
Expand Down Expand Up @@ -581,8 +581,9 @@ bool queue_impl::ext_oneapi_empty() const {
// the status of the last event.
if (isInOrder() && !MDiscardEvents) {
std::lock_guard<std::mutex> Lock(MMutex);
return !MLastEventPtr ||
MLastEventPtr->get_info<info::event::command_execution_status>() ==
return !MDefaultGraphDeps.LastEventPtr ||
MDefaultGraphDeps.LastEventPtr
->get_info<info::event::command_execution_status>() ==
info::event_command_status::complete;
}

Expand Down Expand Up @@ -625,6 +626,38 @@ event queue_impl::discard_or_return(const event &Event) {
return createDiscardedEvent();
}

void queue_impl::revisitUnenqueuedCommandsState(
const EventImplPtr &CompletedHostTask) {
if (MIsInorder)
return;
auto tryToCleanup = [](DependencyTrackingItems &Deps) {
if (Deps.LastBarrier && Deps.LastBarrier->isEnqueued()) {
Deps.LastBarrier = nullptr;
Deps.UnenqueuedCmdEvents.clear();
} else {
if (Deps.UnenqueuedCmdEvents.empty())
return;
Deps.UnenqueuedCmdEvents.erase(
std::remove_if(
Deps.UnenqueuedCmdEvents.begin(), Deps.UnenqueuedCmdEvents.end(),
[](const EventImplPtr &CommandEvent) {
return (CommandEvent->is_host() ? CommandEvent->isCompleted()
: CommandEvent->isEnqueued());
}),
Deps.UnenqueuedCmdEvents.end());
}
};
std::lock_guard<std::mutex> Lock{MMutex};
// Barrier enqueue could be significantly postponed due to host task
// dependency if any. No guarantee that it will happen while same graph deps
// are still recording.
if (auto Graph = CompletedHostTask->getCommandGraph()) {
if (Graph == getCommandGraph())
tryToCleanup(MExtGraphDeps);
} else
tryToCleanup(MDefaultGraphDeps);
}

} // namespace detail
} // namespace _V1
} // namespace sycl
71 changes: 55 additions & 16 deletions sycl/source/detail/queue_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -718,7 +718,7 @@ class queue_impl {
std::shared_ptr<ext::oneapi::experimental::detail::graph_impl> Graph) {
std::lock_guard<std::mutex> Lock(MMutex);
MGraph = Graph;
MGraphLastEventPtr = nullptr;
MExtGraphDeps.LastEventPtr = nullptr;
}

std::shared_ptr<ext::oneapi::experimental::detail::graph_impl>
Expand All @@ -745,14 +745,27 @@ class queue_impl {
std::vector<event> &MutableVec,
std::unique_lock<std::mutex> &QueueLock);

// Helps to manage host tasks presence in scenario with barrier usage.
// Approach that tracks almost all tasks to provide barrier sync for both pi
// tasks and host tasks is applicable for out of order queues only. No-op
// for in order ones.
void tryToResetEnqueuedBarrierDep(const EventImplPtr &EnqueuedBarrierEvent);
Comment on lines +748 to +752
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems to be a dead code.


// Called on host task completion that could block some kernels from enqueue.
// Approach that tracks almost all tasks to provide barrier sync for both pi
// tasks and host tasks is applicable for out of order queues only. Not neede
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

neede -> needed

// for in order ones.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Considering that the backend is not aware of host tasks, how in-order property is provided for host tasks?

void revisitUnenqueuedCommandsState(const EventImplPtr &CompletedHostTask);

protected:
event discard_or_return(const event &Event);
// Hook to the scheduler to clean up any fusion command held on destruction.
void cleanup_fusion_cmd();

// template is needed for proper unit testing
template <typename HandlerType = handler>
void finalizeHandler(HandlerType &Handler, event &EventRet) {
void finalizeHandler(HandlerType &Handler, const CG::CGTYPE &Type,
event &EventRet) {
if (MIsInorder) {
// Accessing and changing of an event isn't atomic operation.
// Hence, here is the lock for thread-safety.
Expand All @@ -764,11 +777,11 @@ class queue_impl {
// by a host task. This dependency allows to build the enqueue order in
// the RT but will not be passed to the backend. See getPIEvents in
// Command.
auto &EventToBuildDeps =
MGraph.expired() ? MLastEventPtr : MGraphLastEventPtr;

auto &EventToBuildDeps = MGraph.expired() ? MDefaultGraphDeps.LastEventPtr
: MExtGraphDeps.LastEventPtr;
if (EventToBuildDeps)
Handler.depends_on(
createSyclObjFromImpl<sycl::event>(EventToBuildDeps));
Handler.depends_on(EventToBuildDeps);

// If there is an external event set, add it as a dependency and clear it.
// We do not need to hold the lock as MLastEventMtx will ensure the last
Expand All @@ -779,8 +792,31 @@ class queue_impl {

EventRet = Handler.finalize();
EventToBuildDeps = getSyclObjImpl(EventRet);
} else
} else {
// The following code supports barrier synchronization if host task is
// involved in the scenario. Native barriers cannot handle host task
// dependency so in the case where some commands were not enqueued
// (blocked), we track them to prevent barrier from being enqueued
// earlier.
std::lock_guard<std::mutex> Lock{MMutex};
auto &Deps = MGraph.expired() ? MDefaultGraphDeps : MExtGraphDeps;
if (Type == CG::Barrier && !Deps.UnenqueuedCmdEvents.empty()) {
Handler.depends_on(Deps.UnenqueuedCmdEvents);
}
if (Deps.LastBarrier)
Handler.depends_on(Deps.LastBarrier);
EventRet = Handler.finalize();
EventImplPtr EventRetImpl = getSyclObjImpl(EventRet);
if (Type == CG::CodeplayHostTask)
Deps.UnenqueuedCmdEvents.push_back(EventRetImpl);
else if (!EventRetImpl->isEnqueued()) {
if (Type == CG::Barrier || Type == CG::BarrierWaitlist) {
Deps.LastBarrier = EventRetImpl;
Deps.UnenqueuedCmdEvents.clear();
} else
Deps.UnenqueuedCmdEvents.push_back(EventRetImpl);
}
}
}

/// Performs command group submission to the queue.
Expand Down Expand Up @@ -836,11 +872,11 @@ class queue_impl {
KernelUsesAssert = !(Handler.MKernel && Handler.MKernel->isInterop()) &&
ProgramManager::getInstance().kernelUsesAssert(
Handler.MKernelName.c_str());
finalizeHandler(Handler, Event);
finalizeHandler(Handler, Type, Event);

(*PostProcess)(IsKernel, KernelUsesAssert, Event);
} else
finalizeHandler(Handler, Event);
finalizeHandler(Handler, Type, Event);

addEvent(Event);
return Event;
Expand Down Expand Up @@ -924,13 +960,16 @@ class queue_impl {
/// need to emulate it with multiple native in-order queues.
bool MEmulateOOO = false;

// This event is employed for enhanced dependency tracking with in-order queue
// Access to the event should be guarded with MMutex
EventImplPtr MLastEventPtr;
// Same as above but for graph begin-end recording cycle.
// Track deps within graph commands separately.
// Protected by common queue object mutex MMutex.
EventImplPtr MGraphLastEventPtr;
// Access should be guarded with MMutex
struct DependencyTrackingItems {
// This event is employed for enhanced dependency tracking with in-order
// queue
EventImplPtr LastEventPtr;
// The following two items are employed for proper out of order enqueue
// ordering
std::vector<EventImplPtr> UnenqueuedCmdEvents;
EventImplPtr LastBarrier;
} MDefaultGraphDeps, MExtGraphDeps;

const bool MIsInorder;

Expand Down
9 changes: 4 additions & 5 deletions sycl/source/detail/scheduler/commands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ std::vector<sycl::detail::pi::PiEvent> Command::getPiEventsBlocking(
continue;
// In this path nullptr native event means that the command has not been
// enqueued. It may happen if async enqueue in a host task is involved.
if (EventImpl->getHandleRef() == nullptr) {
if (!EventImpl->isEnqueued()) {
if (!EventImpl->getCommand() ||
!static_cast<Command *>(EventImpl->getCommand())->producesPiEvent())
continue;
Expand Down Expand Up @@ -883,6 +883,7 @@ bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking,
EnqueueResult =
EnqueueResultT(EnqueueResultT::SyclEnqueueFailed, this, Res);
else {
MEvent->setEnqueued();
if (MShouldCompleteEventIfPossible &&
(MEvent->is_host() || MEvent->getHandleRef() == nullptr))
MEvent->setComplete();
Expand Down Expand Up @@ -1784,8 +1785,7 @@ void EmptyCommand::printDot(std::ostream &Stream) const {
Stream << "\"" << this << "\" [style=filled, fillcolor=\"#8d8f29\", label=\"";

Stream << "ID = " << this << "\\n";
Stream << "EMPTY NODE"
<< "\\n";
Stream << "EMPTY NODE" << "\\n";

Stream << "\"];" << std::endl;

Expand Down Expand Up @@ -3472,8 +3472,7 @@ void UpdateCommandBufferCommand::printDot(std::ostream &Stream) const {
Stream << "\"" << this << "\" [style=filled, fillcolor=\"#8d8f29\", label=\"";

Stream << "ID = " << this << "\\n";
Stream << "CommandBuffer Command Update"
<< "\\n";
Stream << "CommandBuffer Command Update" << "\\n";

Stream << "\"];" << std::endl;

Expand Down
7 changes: 5 additions & 2 deletions sycl/source/detail/scheduler/scheduler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -478,6 +478,8 @@ void Scheduler::NotifyHostTaskCompletion(Command *Cmd) {
// Thus we employ read-lock of graph.

std::vector<Command *> ToCleanUp;
auto CmdEvent = Cmd->getEvent();
auto QueueImpl = Cmd->getQueue();
{
ReadLockT Lock = acquireReadLock();

Expand All @@ -487,14 +489,15 @@ void Scheduler::NotifyHostTaskCompletion(Command *Cmd) {
ToCleanUp.push_back(Cmd);
Cmd->MMarkedForCleanup = true;
}

{
std::lock_guard<std::mutex> Guard(Cmd->MBlockedUsersMutex);
// update self-event status
Cmd->getEvent()->setComplete();
CmdEvent->setComplete();
}
Scheduler::enqueueUnblockedCommands(Cmd->MBlockedUsers, Lock, ToCleanUp);
}
QueueImpl->revisitUnenqueuedCommandsState(CmdEvent);

cleanupCommands(ToCleanUp);
}

Expand Down
Loading