diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp
index b7af801f2e141..4edc5f05f79ee 100644
--- a/sycl/include/sycl/handler.hpp
+++ b/sycl/include/sycl/handler.hpp
@@ -79,6 +79,10 @@ __SYCL_INLINE_VER_NAMESPACE(_V1) {
 class handler;
 template class buffer;
+
+namespace ext::oneapi::experimental::detail {
+class graph_impl;
+}
 
 namespace detail {
 class handler_impl;
@@ -372,6 +376,14 @@ class __SYCL_EXPORT handler {
           std::shared_ptr PrimaryQueue,
           std::shared_ptr SecondaryQueue, bool IsHost);
 
+  /// Constructs SYCL handler from Graph.
+  ///
+  /// The handler will add the command-group as a node to the graph rather than
+  /// enqueueing it straight away.
+  ///
+  /// \param Graph is a SYCL command_graph
+  handler(std::shared_ptr Graph);
+
   /// Stores copy of Arg passed to the MArgsStorage.
   template >>
@@ -2528,6 +2540,8 @@ class __SYCL_EXPORT handler {
 private:
   std::shared_ptr MImpl;
   std::shared_ptr MQueue;
+  std::shared_ptr MGraph;
+
   /// The storage for the arguments passed.
   /// We need to store a copy of values that are passed explicitly through
   /// set_arg, require and so on, because we need them to be alive after
@@ -2611,6 +2625,8 @@ class __SYCL_EXPORT handler {
   friend class ::MockHandler;
   friend class detail::queue_impl;
 
+  friend class ext::oneapi::experimental::detail::graph_impl;
+
   bool DisableRangeRounding();
   bool RangeRoundingTrace();
diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp
index 1af071d84ef0a..337486d6096fc 100644
--- a/sycl/source/detail/graph_impl.cpp
+++ b/sycl/source/detail/graph_impl.cpp
@@ -8,6 +8,7 @@
 #include
 #include
+#include
 #include
 
 namespace sycl {
@@ -61,76 +62,94 @@ void graph_impl::remove_root(const std::shared_ptr &Root) {
 // Recursive check if a graph node or its successors contains a given kernel
 // argument.
 //
-// @param[in] arg The kernel argument to check for.
-// @param[in] currentNode The current graph node being checked.
-// @param[in,out] deps The unique list of dependencies which have been
+// @param[in] Arg The kernel argument to check for.
+// @param[in] CurrentNode The current graph node being checked.
+// @param[in,out] Deps The unique list of dependencies which have been
 // identified for this arg.
-// @param[in] dereferencePtr if true arg comes direct from the handler in which
-// case it will need to be deferenced to check actual value.
 //
 // @returns True if a dependency was added in this node or any of its
 // successors.
 bool check_for_arg(const sycl::detail::ArgDesc &Arg,
                    const std::shared_ptr &CurrentNode,
-                   std::set> &Deps,
-                   bool DereferencePtr = false) {
+                   std::set> &Deps) {
   bool SuccessorAddedDep = false;
   for (auto &Successor : CurrentNode->MSuccessors) {
-    SuccessorAddedDep |= check_for_arg(Arg, Successor, Deps, DereferencePtr);
+    SuccessorAddedDep |= check_for_arg(Arg, Successor, Deps);
   }
-  if (Deps.find(CurrentNode) == Deps.end() &&
-      CurrentNode->has_arg(Arg, DereferencePtr) && !SuccessorAddedDep) {
+  if (Deps.find(CurrentNode) == Deps.end() && CurrentNode->has_arg(Arg) &&
+      !SuccessorAddedDep) {
     Deps.insert(CurrentNode);
     return true;
   }
   return SuccessorAddedDep;
 }
 
-template
 std::shared_ptr
-graph_impl::add(const std::shared_ptr &impl, T cgf,
-                const std::vector &args,
-                const std::vector> &dep) {
-  std::shared_ptr nodeImpl =
-      std::make_shared(impl, cgf, args);
+graph_impl::add(const std::shared_ptr &Impl,
+                std::function CGF,
+                const std::vector &Args,
+                const std::vector> &Dep) {
+  sycl::handler Handler{Impl};
+  CGF(Handler);
+
+  return this->add(Impl, Handler.MKernel, Handler.MNDRDesc,
+                   Handler.MOSModuleHandle, Handler.MKernelName,
+                   Handler.MAccStorage, Handler.MLocalAccStorage,
+                   Handler.MRequirements, Handler.MArgs, {});
+}
+
+std::shared_ptr graph_impl::add(
+    const std::shared_ptr &Impl,
+    std::shared_ptr Kernel,
+    sycl::detail::NDRDescT NDRDesc, sycl::detail::OSModuleHandle OSModuleHandle,
+    std::string KernelName,
+    const std::vector &AccStorage,
+    const std::vector &LocalAccStorage,
+    const std::vector &Requirements,
+    const std::vector &Args,
+    const std::vector> &Dep) {
+  const std::shared_ptr &NodeImpl = std::make_shared(
+      Impl, Kernel, NDRDesc, OSModuleHandle, KernelName, AccStorage,
+      LocalAccStorage, Requirements, Args);
   // Copy deps so we can modify them
-  auto deps = dep;
+  auto Deps = Dep;
   // A unique set of dependencies obtained by checking kernel arguments
-  std::set> uniqueDeps;
-  for (auto &arg : args) {
-    if (arg.MType != sycl::detail::kernel_param_kind_t::kind_pointer) {
+  // for accessors
+  std::set> UniqueDeps;
+  for (auto &Arg : Args) {
+    if (Arg.MType != sycl::detail::kernel_param_kind_t::kind_accessor) {
       continue;
     }
     // Look through the graph for nodes which share this argument
-    for (auto nodePtr : MRoots) {
-      check_for_arg(arg, nodePtr, uniqueDeps, true);
+    for (auto NodePtr : MRoots) {
+      check_for_arg(Arg, NodePtr, UniqueDeps);
     }
   }
 
-  // Add any deps determined from arguments into the dependency list
-  deps.insert(deps.end(), uniqueDeps.begin(), uniqueDeps.end());
-  if (!deps.empty()) {
-    for (auto n : deps) {
-      n->register_successor(nodeImpl); // register successor
-      this->remove_root(nodeImpl);     // remove receiver from root node
+  // Add any deps determined from accessor arguments into the dependency list
+  Deps.insert(Deps.end(), UniqueDeps.begin(), UniqueDeps.end());
+  if (!Deps.empty()) {
+    for (auto N : Deps) {
+      N->register_successor(NodeImpl); // register successor
+      this->remove_root(NodeImpl);     // remove receiver from root node
                                        // list
     }
   } else {
-    this->add_root(nodeImpl);
+    this->add_root(NodeImpl);
   }
-  return nodeImpl;
+  return NodeImpl;
 }
 
 bool graph_impl::clear_queues() {
-  bool anyQueuesCleared = false;
-  for (auto &q : MRecordingQueues) {
-    q->setCommandGraph(nullptr);
-    anyQueuesCleared = true;
+  bool AnyQueuesCleared = false;
+  for (auto &Queue : MRecordingQueues) {
+    Queue->setCommandGraph(nullptr);
+    AnyQueuesCleared = true;
   }
   MRecordingQueues.clear();
-  return anyQueuesCleared;
+  return AnyQueuesCleared;
 }
 
 void node_impl::exec(const std::shared_ptr &Queue
@@ -139,7 +158,33 @@ void node_impl::exec(const std::shared_ptr &Queue
   for (auto Sender : MPredecessors)
     Deps.push_back(Sender->get_event());
 
-  MEvent = Queue->submit(wrapper{MBody, Deps}, Queue _CODELOCFW(CodeLoc));
+  // Enqueue kernel here instead of submit
+
+  std::vector RawEvents;
+  pi_event *OutEvent = nullptr;
+  auto NewEvent = std::make_shared(Queue);
+  NewEvent->setContextImpl(Queue->getContextImplPtr());
+  NewEvent->setStateIncomplete();
+  OutEvent = &NewEvent->getHandleRef();
+  pi_result Res =
+      Queue->getPlugin().call_nocheck(
+          sycl::detail::getSyclObjImpl(Queue->get_context())->getHandleRef(),
+          OutEvent);
+  if (Res != pi_result::PI_SUCCESS) {
+    throw sycl::exception(errc::event,
+                          "Failed to create event for node submission");
+  }
+
+  pi_int32 Result = enqueueImpKernel(
+      Queue, MNDRDesc, MArgs, /* KernelBundleImpPtr */ nullptr, MKernel,
+      MKernelName, MOSModuleHandle, RawEvents, OutEvent, nullptr);
+  if (Result != pi_result::PI_SUCCESS) {
+    throw sycl::exception(errc::kernel, "Error enqueuing graph node kernel");
+  }
+  sycl::event QueueEvent =
+      sycl::detail::createSyclObjFromImpl(NewEvent);
+  Queue->addEvent(QueueEvent);
+  MEvent = QueueEvent;
 }
 
 } // namespace detail
diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp
index a35b9858fa228..f913cae633954 100644
--- a/sycl/source/detail/graph_impl.hpp
+++ b/sycl/source/detail/graph_impl.hpp
@@ -9,9 +9,13 @@
 #pragma once
 
 #include
+#include
 #include
 #include
+#include
+
+#include
 #include
 #include
 #include
@@ -48,9 +52,24 @@ struct node_impl {
   std::vector> MSuccessors;
   std::vector> MPredecessors;
 
-  std::function MBody;
-
+  /// Kernel to be executed by this node
+  std::shared_ptr MKernel;
+  /// Description of the kernel global and local sizes as well as offset
+  sycl::detail::NDRDescT MNDRDesc;
+  /// Module handle for the kernel to be executed.
+  sycl::detail::OSModuleHandle MOSModuleHandle =
+      sycl::detail::OSUtil::ExeModuleHandle;
+  /// Kernel name inside the module
+  std::string MKernelName;
+  std::vector MAccStorage;
+  std::vector MLocalAccStorage;
+  std::vector MRequirements;
+
+  /// Store arg descriptors for the kernel arguments
   std::vector MArgs;
+  // We need to store local copies of the values pointed to by MArgs since they
+  // may go out of scope before execution.
+ std::vector> MArgStorage; void exec(const std::shared_ptr &Queue _CODELOCPARAM(&CodeLoc)); @@ -66,17 +85,30 @@ struct node_impl { sycl::event get_event(void) const { return MEvent; } - template - node_impl(const std::shared_ptr &Graph, T CGF, - const std::vector &Args) - : MScheduled(false), MGraph(Graph), MBody(CGF), MArgs(Args) { + node_impl( + const std::shared_ptr &Graph, + std::shared_ptr Kernel, + sycl::detail::NDRDescT NDRDesc, + sycl::detail::OSModuleHandle OSModuleHandle, std::string KernelName, + const std::vector &AccStorage, + const std::vector &LocalAccStorage, + const std::vector &Requirements, + const std::vector &args) + : MScheduled(false), MGraph(Graph), MKernel(Kernel), MNDRDesc(NDRDesc), + MOSModuleHandle(OSModuleHandle), MKernelName(KernelName), + MAccStorage(AccStorage), MLocalAccStorage(LocalAccStorage), + MRequirements(Requirements), MArgs(args), MArgStorage() { + + // Need to copy the arg values to node local storage so that they don't go + // out of scope before execution for (size_t i = 0; i < MArgs.size(); i++) { - if (MArgs[i].MType == sycl::detail::kernel_param_kind_t::kind_pointer) { - // Make sure we are storing the actual USM pointer for comparison - // purposes, note we couldn't actually submit using these copies of the - // args if subsequent code expects a void**. - MArgs[i].MPtr = *(void **)(MArgs[i].MPtr); - } + auto &CurrentArg = MArgs[i]; + MArgStorage.emplace_back(CurrentArg.MSize); + auto StoragePtr = MArgStorage.back().data(); + if (CurrentArg.MPtr) + std::memcpy(StoragePtr, CurrentArg.MPtr, CurrentArg.MSize); + // Set the arg descriptor to point to the new storage + CurrentArg.MPtr = StoragePtr; } } @@ -90,13 +122,14 @@ struct node_impl { Schedule.push_front(std::shared_ptr(this)); } - bool has_arg(const sycl::detail::ArgDesc &Arg, bool DereferencePtr = false) { + bool has_arg(const sycl::detail::ArgDesc &Arg) { for (auto &NodeArg : MArgs) { if (Arg.MType == NodeArg.MType && Arg.MSize == NodeArg.MSize) { - // Args coming directly from the handler will need to be dereferenced - // since they are actually void** - void *IncomingPtr = DereferencePtr ? 
*(void **)Arg.MPtr : Arg.MPtr; - if (IncomingPtr == NodeArg.MPtr) { + // Args are actually void** so we need to dereference them to compare + // actual values + void *IncomingPtr = *static_cast(Arg.MPtr); + void *ArgPtr = *static_cast(NodeArg.MPtr); + if (IncomingPtr == ArgPtr) { return true; } } @@ -119,9 +152,20 @@ struct graph_impl { void add_root(const std::shared_ptr &); void remove_root(const std::shared_ptr &); - template std::shared_ptr - add(const std::shared_ptr &Impl, T CGF, + add(const std::shared_ptr &Impl, + std::shared_ptr Kernel, + sycl::detail::NDRDescT NDRDesc, + sycl::detail::OSModuleHandle OSModuleHandle, std::string KernelName, + const std::vector &AccStorage, + const std::vector &LocalAccStorage, + const std::vector &Requirements, + const std::vector &Args, + const std::vector> &Dep = {}); + + std::shared_ptr + add(const std::shared_ptr &Impl, + std::function CGF, const std::vector &Args, const std::vector> &Dep = {}); diff --git a/sycl/source/detail/handler_impl.hpp b/sycl/source/detail/handler_impl.hpp index baa9276fe4069..5f883b224f206 100644 --- a/sycl/source/detail/handler_impl.hpp +++ b/sycl/source/detail/handler_impl.hpp @@ -29,6 +29,8 @@ class handler_impl { : MSubmissionPrimaryQueue(std::move(SubmissionPrimaryQueue)), MSubmissionSecondaryQueue(std::move(SubmissionSecondaryQueue)){}; + handler_impl() = default; + void setStateExplicitKernelBundle() { if (MSubmissionState == HandlerSubmissionState::SPEC_CONST_SET_STATE) throw sycl::exception( diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 0c34a24275346..ba67dcfe482d9 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -550,34 +550,27 @@ class queue_impl { handler Handler(Self, PrimaryQueue, SecondaryQueue, MHostQueue); Handler.saveCodeLoc(Loc); CGF(Handler); - if (auto graphImpl = Self->getCommandGraph(); graphImpl != nullptr) { - // Pass the args obtained by the handler to the graph to use in - // determining edges between this node and previously submitted nodes. - graphImpl->add(graphImpl, CGF, Handler.MArgs, {}); - } else { - // Scheduler will later omit events, that are not required to execute - // tasks. Host and interop tasks, however, are not submitted to low-level - // runtimes and require separate dependency management. - const CG::CGTYPE Type = Handler.getType(); + // Scheduler will later omit events, that are not required to execute + // tasks. Host and interop tasks, however, are not submitted to low-level + // runtimes and require separate dependency management. 
+ const CG::CGTYPE Type = Handler.getType(); - if (PostProcess) { - bool IsKernel = Type == CG::Kernel; - bool KernelUsesAssert = false; + if (PostProcess) { + bool IsKernel = Type == CG::Kernel; + bool KernelUsesAssert = false; - if (IsKernel) - // Kernel only uses assert if it's non interop one - KernelUsesAssert = - !(Handler.MKernel && Handler.MKernel->isInterop()) && - ProgramManager::getInstance().kernelUsesAssert( - Handler.MOSModuleHandle, Handler.MKernelName); + if (IsKernel) + // Kernel only uses assert if it's non interop one + KernelUsesAssert = !(Handler.MKernel && Handler.MKernel->isInterop()) && + ProgramManager::getInstance().kernelUsesAssert( + Handler.MOSModuleHandle, Handler.MKernelName); - finalizeHandler(Handler, Type, Event); + finalizeHandler(Handler, Type, Event); - (*PostProcess)(IsKernel, KernelUsesAssert, Event); - } else - finalizeHandler(Handler, Type, Event); - } + (*PostProcess)(IsKernel, KernelUsesAssert, Event); + } else + finalizeHandler(Handler, Type, Event); addEvent(Event); return Event; @@ -664,6 +657,8 @@ class queue_impl { // commands to this queue. Used by subgraphs to determine if they are part of // a larger command graph submission. bool MIsGraphSubmitting = false; + + friend class sycl::ext::oneapi::experimental::detail::node_impl; }; } // namespace detail diff --git a/sycl/source/detail/reduction.cpp b/sycl/source/detail/reduction.cpp index 5134f9b51996c..47ba843243cda 100644 --- a/sycl/source/detail/reduction.cpp +++ b/sycl/source/detail/reduction.cpp @@ -52,6 +52,16 @@ __SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize, // with the given queue. __SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups( std::shared_ptr Queue) { + // TODO: Graphs extension explicit API uses a handler with no queue attached, + // so return some value here. In the future we should have access to the + // device so can remove this. + // + // The 8 value was chosen as the hardcoded value as it is the returned + // value for sycl::info::device::max_compute_units on + // Intel HD Graphics devices used as a L0 backend during development. + if (Queue == nullptr) { + return 8; + } device Dev = Queue->get_device(); uint32_t NumThreads = Dev.get_info(); // TODO: The heuristics here require additional tuning for various devices @@ -104,6 +114,16 @@ reduGetMaxWGSize(std::shared_ptr Queue, __SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr &Queue, size_t LocalMemBytesPerWorkItem) { + // TODO: Graphs extension explicit API uses a handler with a null queue to + // process CGFs, in future we should have access to the device so we can + // correctly calculate this. + // + // The 32 value was chosen as the hardcoded value as it is the returned + // value for SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE on + // Intel HD Graphics devices used as a L0 backend during development. + if (Queue == nullptr) { + return 32; + } device Dev = Queue->get_device(); // The maximum WGSize returned by CPU devices is very large and does not diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index fbd42f6c2563d..e0546e5066453 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -40,6 +40,10 @@ handler::handler(std::shared_ptr Queue, std::move(SecondaryQueue))), MQueue(std::move(Queue)), MIsHost(IsHost) {} +handler::handler( + std::shared_ptr Graph) + : MImpl(std::make_shared()), MGraph(Graph) {} + // Sets the submission state to indicate that an explicit kernel bundle has been // set. 
Throws a sycl::exception with errc::invalid if the current state // indicates that a specialization constant has been set. @@ -93,6 +97,14 @@ event handler::finalize() { if (MIsFinalized) return MLastEvent; MIsFinalized = true; + if (auto GraphImpl = MQueue->getCommandGraph(); GraphImpl != nullptr) { + // Extract relevant data from the handler and pass to graph to create a new + // node representing this command group. + GraphImpl->add(GraphImpl, MKernel, MNDRDesc, MOSModuleHandle, MKernelName, + MAccStorage, MLocalAccStorage, MRequirements, MArgs, {}); + return detail::createSyclObjFromImpl( + std::make_shared()); + } std::shared_ptr KernelBundleImpPtr = nullptr; // If there were uses of set_specialization_constant build the kernel_bundle diff --git a/sycl/test/graph/graph-explicit-dotp-buffer.cpp b/sycl/test/graph/graph-explicit-dotp-buffer.cpp new file mode 100644 index 0000000000000..0b795714f98dd --- /dev/null +++ b/sycl/test/graph/graph-explicit-dotp-buffer.cpp @@ -0,0 +1,108 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +#include + +#include + +const size_t n = 10; + +float host_gold_result() { + float alpha = 1.0f; + float beta = 2.0f; + float gamma = 3.0f; + + float sum = 0.0f; + + for (size_t i = 0; i < n; ++i) { + sum += (alpha * 1.0f + beta * 2.0f) * (gamma * 3.0f + beta * 2.0f); + } + + return sum; +} + +int main() { + float alpha = 1.0f; + float beta = 2.0f; + float gamma = 3.0f; + + sycl::property_list properties{ + sycl::property::queue::in_order{}, + sycl::ext::oneapi::property::queue::lazy_execution{}}; + + sycl::queue q{sycl::gpu_selector_v, properties}; + + sycl::ext::oneapi::experimental::command_graph g; + + float dotpData = 0.f; + std::vector xData(n); + std::vector yData(n); + std::vector zData(n); + + { + sycl::buffer dotpBuf(&dotpData, sycl::range<1>(1)); + + sycl::buffer xBuf(xData); + sycl::buffer yBuf(yData); + sycl::buffer zBuf(zData); + + /* init data on the device */ + auto n_i = g.add([&](sycl::handler &h) { + auto x = xBuf.get_access(h); + auto y = yBuf.get_access(h); + auto z = zBuf.get_access(h); + h.parallel_for(n, [=](sycl::id<1> it) { + const size_t i = it[0]; + x[i] = 1.0f; + y[i] = 2.0f; + z[i] = 3.0f; + }); + }); + + auto node_a = g.add([&](sycl::handler &h) { + auto x = xBuf.get_access(h); + auto y = yBuf.get_access(h); + h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { + const size_t i = it[0]; + x[i] = alpha * x[i] + beta * y[i]; + }); + }); + + auto node_b = g.add([&](sycl::handler &h) { + auto y = yBuf.get_access(h); + auto z = zBuf.get_access(h); + h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { + const size_t i = it[0]; + z[i] = gamma * z[i] + beta * y[i]; + }); + }); + + auto node_c = g.add([&](sycl::handler &h) { + auto dotp = dotpBuf.get_access(h); + auto x = xBuf.get_access(h); + auto z = zBuf.get_access(h); +#ifdef TEST_GRAPH_REDUCTIONS + h.parallel_for(sycl::range<1>{n}, + sycl::reduction(dotpBuf, h, 0.0f, std::plus()), + [=](sycl::id<1> it, auto &sum) { + const size_t i = it[0]; + sum += x[i] * z[i]; + }); +#else + h.single_task([=]() { + // Doing a manual reduction here because reduction objects cause issues + // with graphs. 
+ for (size_t j = 0; j < n; j++) { + dotp[0] += x[j] * z[j]; + } + }); +#endif + }); + + auto executable_graph = g.finalize(q.get_context()); + + // Using shortcut for executing a graph of commands + q.ext_oneapi_graph(executable_graph).wait(); + } + + assert(dotpData == host_gold_result()); + return 0; +} diff --git a/sycl/test/graph/graph-explicit-subgraph.cpp b/sycl/test/graph/graph-explicit-subgraph.cpp index 160418a9ae012..6f20c34541a0e 100644 --- a/sycl/test/graph/graph-explicit-subgraph.cpp +++ b/sycl/test/graph/graph-explicit-subgraph.cpp @@ -74,12 +74,22 @@ int main() { auto node_c = g.add( [&](sycl::handler &h) { +#ifdef TEST_GRAPH_REDUCTIONS h.parallel_for(sycl::range<1>{n}, sycl::reduction(dotp, 0.0f, std::plus()), [=](sycl::id<1> it, auto &sum) { const size_t i = it[0]; sum += x[i] * z[i]; }); +#else + h.single_task([=]() { + // Doing a manual reduction here because reduction objects cause + // issues with graphs. + for (size_t j = 0; j < n; j++) { + dotp[0] += x[j] * z[j]; + } + }); +#endif }, {node_sub}); diff --git a/sycl/test/graph/graph-record-dotp-buffer.cpp b/sycl/test/graph/graph-record-dotp-buffer.cpp index 0e49e86cf1cad..75764428e617f 100644 --- a/sycl/test/graph/graph-record-dotp-buffer.cpp +++ b/sycl/test/graph/graph-record-dotp-buffer.cpp @@ -83,18 +83,23 @@ int main() { auto dotp = dotpBuf.get_access(h); auto x = xBuf.get_access(h); auto z = zBuf.get_access(h); - h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { - const size_t i = it[0]; +#ifdef TEST_GRAPH_REDUCTIONS + h.parallel_for(sycl::range<1>{n}, + sycl::reduction(dotpBuf, h, 0.0f, std::plus()), + [=](sycl::id<1> it, auto &sum) { + const size_t i = it[0]; + sum += x[i] * z[i]; + }); +#else + h.single_task([=]() { // Doing a manual reduction here because reduction objects cause issues // with graphs. - if (i == 0) { - for (size_t j = 0; j < n; j++) { - dotp[0] += x[j] * z[j]; - } + for (size_t j = 0; j < n; j++) { + dotp[0] += x[j] * z[j]; } }); +#endif }); - g.end_recording(); auto exec_graph = g.finalize(q.get_context()); @@ -109,4 +114,4 @@ int main() { } return 0; -} \ No newline at end of file +} diff --git a/sycl/test/graph/graph-record-dotp.cpp b/sycl/test/graph/graph-record-dotp.cpp index 538383cebc0de..a7627a449c603 100644 --- a/sycl/test/graph/graph-record-dotp.cpp +++ b/sycl/test/graph/graph-record-dotp.cpp @@ -67,16 +67,21 @@ int main() { }); q.submit([&](sycl::handler &h) { - h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { - const size_t i = it[0]; +#ifdef TEST_GRAPH_REDUCTIONS + h.parallel_for(sycl::range<1>{n}, sycl::reduction(dotp, 0.0f, std::plus()), + [=](sycl::id<1> it, auto &sum) { + const size_t i = it[0]; + sum += x[i] * z[i]; + }); +#else + h.single_task([=]() { // Doing a manual reduction here because reduction objects cause issues // with graphs. 
- if (i == 0) { - for (size_t j = 0; j < n; j++) { - dotp[0] += x[j] * z[j]; - } + for (size_t j = 0; j < n; j++) { + dotp[0] += x[j] * z[j]; } }); +#endif }); g.end_recording(); @@ -99,4 +104,4 @@ int main() { std::cout << "done.\n"; return 0; -} \ No newline at end of file +} diff --git a/sycl/test/graph/graph-record-temp-scope.cpp b/sycl/test/graph/graph-record-temp-scope.cpp new file mode 100644 index 0000000000000..b4d660ccaec0f --- /dev/null +++ b/sycl/test/graph/graph-record-temp-scope.cpp @@ -0,0 +1,49 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out + +#include + +#include + +const size_t n = 10; +const float expectedValue = 42.0f; + +void run_some_kernel(sycl::queue q, float *data) { + // data is captured by ref here but will have gone out of scope when the + // CGF is later run when the graph is executed. + q.submit([&](sycl::handler &h) { + h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> idx) { + size_t i = idx; + data[i] = expectedValue; + }); + }); +} + +int main() { + + sycl::property_list properties{ + sycl::property::queue::in_order(), + sycl::ext::oneapi::property::queue::lazy_execution{}}; + + sycl::queue q{sycl::default_selector_v, properties}; + + sycl::ext::oneapi::experimental::command_graph g; + + float *arr = sycl::malloc_shared(n, q); + + g.begin_recording(q); + run_some_kernel(q, arr); + g.end_recording(q); + + auto exec_graph = g.finalize(q.get_context()); + + q.submit([&](sycl::handler &h) { h.ext_oneapi_graph(exec_graph); }); + + // Verify results + for (size_t i = 0; i < n; i++) { + assert(arr[i] == expectedValue); + } + + sycl::free(arr, q.get_context()); + + return 0; +}
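
Note (not part of the patch): the sketch below summarises the explicit-API flow that the new tests above exercise — add nodes to a command_graph, pass an explicit dependency list, finalize against a context, and submit via the ext_oneapi_graph queue shortcut. The API names (command_graph, add, finalize, ext_oneapi_graph, lazy_execution) are taken from those tests; the USM kernels and the node names are illustrative only, and the explicit dependency edge is needed here because, with this patch, only accessor arguments create implicit edges.

// Minimal usage sketch, assuming the extension interface shown in the tests.
#include <sycl/sycl.hpp>

int main() {
  sycl::property_list properties{
      sycl::property::queue::in_order{},
      sycl::ext::oneapi::property::queue::lazy_execution{}};
  sycl::queue q{sycl::default_selector_v, properties};

  sycl::ext::oneapi::experimental::command_graph g;

  constexpr size_t n = 10;
  float *data = sycl::malloc_shared<float>(n, q);

  // Node 1: initialise the USM allocation.
  auto init = g.add([&](sycl::handler &h) {
    h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> i) { data[i] = 1.0f; });
  });

  // Node 2: depends on node 1 via the explicit dependency list; USM pointer
  // arguments do not create graph edges automatically.
  g.add(
      [&](sycl::handler &h) {
        h.parallel_for(sycl::range<1>{n},
                       [=](sycl::id<1> i) { data[i] += 2.0f; });
      },
      {init});

  auto exec_graph = g.finalize(q.get_context());
  q.ext_oneapi_graph(exec_graph).wait();

  sycl::free(data, q.get_context());
  return 0;
}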