Skip to content

[SYCL] Update graph constructor/finalize to current spec #140

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Apr 19, 2023
7 changes: 3 additions & 4 deletions sycl/include/sycl/ext/oneapi/experimental/graph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ class __SYCL_EXPORT node {
template <graph_state State = graph_state::modifiable>
class __SYCL_EXPORT command_graph {
public:
command_graph(const property_list &propList = {});
command_graph(const context &syclContext, const device &syclDevice,
const property_list &propList = {});

// Adding empty node with [0..n] predecessors:
node add(const std::vector<node> &dep = {}) { return add_impl(dep); }
Expand All @@ -67,8 +68,7 @@ class __SYCL_EXPORT command_graph {
void make_edge(node sender, node receiver);

command_graph<graph_state::executable>
finalize(const sycl::context &syclContext,
const property_list &propList = {}) const;
finalize(const property_list &propList = {}) const;

/// Change the state of a queue to be recording and associate this graph with
/// it.
Expand Down Expand Up @@ -138,7 +138,6 @@ template <> class __SYCL_EXPORT command_graph<graph_state::executable> {
void finalize_impl();

int MTag;
const sycl::context &MCtx;
std::shared_ptr<detail::exec_graph_impl> impl;
};
} // namespace experimental
Expand Down
25 changes: 13 additions & 12 deletions sycl/source/detail/graph_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -241,13 +241,12 @@ void exec_graph_impl::find_real_deps(std::vector<pi_ext_sync_point> &Deps,
}
}

void exec_graph_impl::create_pi_command_buffers(sycl::device D,
const sycl::context &Ctx) {
void exec_graph_impl::create_pi_command_buffers(sycl::device D) {
// TODO: Only a single command-buffer is created per graph here; non-trivial
// graphs will need to be split across multiple command-buffers.
pi_ext_command_buffer OutCommandBuffer;
pi_ext_command_buffer_desc Desc{};
auto ContextImpl = sycl::detail::getSyclObjImpl(Ctx);
auto ContextImpl = sycl::detail::getSyclObjImpl(MContext);
const sycl::detail::plugin &Plugin = ContextImpl->getPlugin();
auto DeviceImpl = sycl::detail::getSyclObjImpl(D);
pi_result Res =
Expand Down Expand Up @@ -284,13 +283,13 @@ void exec_graph_impl::create_pi_command_buffers(sycl::device D,
Node->MKernelName);
}

auto SetFunc = [&Plugin, &PiKernel, &Ctx](sycl::detail::ArgDesc &Arg,
auto SetFunc = [&Plugin, &PiKernel, this](sycl::detail::ArgDesc &Arg,
size_t NextTrueIndex) {
sycl::detail::SetArgBasedOnType(
Plugin, PiKernel,
nullptr /* TODO: Handle spec constants and pass device image here */,
nullptr /* TODO: Pass getMemAllocation function for buffers */, Ctx,
false, Arg, NextTrueIndex);
nullptr /* TODO: Pass getMemAllocation function for buffers */,
this->MContext, false, Arg, NextTrueIndex);
};
std::vector<sycl::detail::ArgDesc> Args;
sycl::detail::applyFuncOnFilteredArgs(EliminatedArgMask, Node->MArgs,
Expand Down Expand Up @@ -421,8 +420,9 @@ sycl::event exec_graph_impl::enqueue(

template <>
command_graph<graph_state::modifiable>::command_graph(
const sycl::context &syclContext, const sycl::device &syclDevice,
const sycl::property_list &)
: impl(std::make_shared<detail::graph_impl>()) {}
: impl(std::make_shared<detail::graph_impl>(syclContext, syclDevice)) {}

template <>
node command_graph<graph_state::modifiable>::add_impl(
Expand Down Expand Up @@ -465,8 +465,9 @@ void command_graph<graph_state::modifiable>::make_edge(node Sender,
template <>
command_graph<graph_state::executable>
command_graph<graph_state::modifiable>::finalize(
const sycl::context &CTX, const sycl::property_list &) const {
return command_graph<graph_state::executable>{this->impl, CTX};
const sycl::property_list &) const {
return command_graph<graph_state::executable>{this->impl,
this->impl->get_context()};
}

template <>
Expand Down Expand Up @@ -531,7 +532,7 @@ bool command_graph<graph_state::modifiable>::end_recording(

command_graph<graph_state::executable>::command_graph(
const std::shared_ptr<detail::graph_impl> &Graph, const sycl::context &Ctx)
: MTag(rand()), MCtx(Ctx),
: MTag(rand()),
impl(std::make_shared<detail::exec_graph_impl>(Ctx, Graph)) {
finalize_impl(); // Create backend representation for executable graph
}
Expand All @@ -540,8 +541,8 @@ void command_graph<graph_state::executable>::finalize_impl() {
// Create PI command-buffers for each device in the finalized context
impl->schedule();
#if SYCL_EXT_ONEAPI_GRAPH
for (auto device : MCtx.get_devices()) {
impl->create_pi_command_buffers(device, MCtx);
for (auto device : impl->get_context().get_devices()) {
impl->create_pi_command_buffers(device);
}
#endif
}
Expand Down
21 changes: 16 additions & 5 deletions sycl/source/detail/graph_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -127,9 +127,10 @@ struct node_impl {
};

struct graph_impl {
std::set<std::shared_ptr<node_impl>> MRoots;

std::shared_ptr<graph_impl> MParent;
graph_impl(const sycl::context &syclContext, const sycl::device &syclDevice)
: MContext(syclContext), MDevice(syclDevice), MRecordingQueues(),
MEventsMap() {}

void add_root(const std::shared_ptr<node_impl> &);
void remove_root(const std::shared_ptr<node_impl> &);
Expand All @@ -155,8 +156,6 @@ struct graph_impl {
std::shared_ptr<node_impl>
add(const std::vector<std::shared_ptr<node_impl>> &Dep = {});

graph_impl() = default;

/// Add a queue to the set of queues which are currently recording to this
/// graph.
void
Expand Down Expand Up @@ -199,8 +198,18 @@ struct graph_impl {
/// an empty node is used to schedule dependencies on this sub graph.
std::shared_ptr<node_impl>
add_subgraph_nodes(const std::list<std::shared_ptr<node_impl>> &NodeList);
sycl::context get_context() const { return MContext; }

std::set<std::shared_ptr<node_impl>> MRoots;
std::shared_ptr<graph_impl> MParent;

private:
// Context associated with this graph.
sycl::context MContext;
// Device associated with this graph. All graph nodes will execute on this
// device.
sycl::device MDevice;
// Unique set of queues which are currently recording to this graph.
std::set<std::shared_ptr<sycl::detail::queue_impl>> MRecordingQueues;
// Map of events to their associated recorded nodes.
std::unordered_map<std::shared_ptr<sycl::detail::event_impl>,
Expand All @@ -224,7 +233,9 @@ class exec_graph_impl {
sycl::event exec(const std::shared_ptr<sycl::detail::queue_impl> &);
/// Turns our internal graph representation into PI command-buffers for a
/// device
void create_pi_command_buffers(sycl::device D, const sycl::context &Ctx);
void create_pi_command_buffers(sycl::device D);

sycl::context get_context() const { return MContext; }

const std::list<std::shared_ptr<node_impl>> &get_schedule() const {
return MSchedule;
Expand Down
5 changes: 3 additions & 2 deletions sycl/test/graph/graph-explicit-dotp-buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ int main() {

sycl::queue q{sycl::gpu_selector_v};

sycl::ext::oneapi::experimental::command_graph g;
sycl::ext::oneapi::experimental::command_graph g{q.get_context(),
q.get_device()};

float dotpData = 0.f;
std::vector<float> xData(n);
Expand Down Expand Up @@ -93,7 +94,7 @@ int main() {
#endif
});

auto executable_graph = g.finalize(q.get_context());
auto executable_graph = g.finalize();

// Using shortcut for executing a graph of commands
q.ext_oneapi_graph(executable_graph).wait();
Expand Down
5 changes: 3 additions & 2 deletions sycl/test/graph/graph-explicit-dotp-device-mem.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ int main() {

sycl::queue q{sycl::gpu_selector_v};

sycl::ext::oneapi::experimental::command_graph g;
sycl::ext::oneapi::experimental::command_graph g{q.get_context(),
q.get_device()};

float *dotp = sycl::malloc_device<float>(1, q);

Expand Down Expand Up @@ -83,7 +84,7 @@ int main() {
},
{node_a, node_b});

auto executable_graph = g.finalize(q.get_context());
auto executable_graph = g.finalize();

// Using shortcut for executing a graph of commands
q.ext_oneapi_graph(executable_graph).wait();
Expand Down
5 changes: 3 additions & 2 deletions sycl/test/graph/graph-explicit-dotp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ int main() {

sycl::queue q{sycl::gpu_selector_v};

sycl::ext::oneapi::experimental::command_graph g;
sycl::ext::oneapi::experimental::command_graph g{q.get_context(),
q.get_device()};

float *dotp = sycl::malloc_shared<float>(1, q);

Expand Down Expand Up @@ -83,7 +84,7 @@ int main() {
},
{node_a, node_b});

auto executable_graph = g.finalize(q.get_context());
auto executable_graph = g.finalize();

// Using shortcut for executing a graph of commands
q.ext_oneapi_graph(executable_graph).wait();
Expand Down
5 changes: 3 additions & 2 deletions sycl/test/graph/graph-explicit-empty.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ int main() {

sycl::queue q{sycl::gpu_selector_v};

sycl::ext::oneapi::experimental::command_graph g;
sycl::ext::oneapi::experimental::command_graph g{q.get_context(),
q.get_device()};

const size_t n = 10;
float *arr = sycl::malloc_device<float>(n, q);
Expand Down Expand Up @@ -35,7 +36,7 @@ int main() {
},
{empty2});

auto executable_graph = g.finalize(q.get_context());
auto executable_graph = g.finalize();

q.submit([&](sycl::handler &h) {
h.ext_oneapi_graph(executable_graph);
Expand Down
7 changes: 4 additions & 3 deletions sycl/test/graph/graph-explicit-multiple-exec-graphs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ int main() {

sycl::queue q{sycl::gpu_selector_v};

sycl::ext::oneapi::experimental::command_graph g;
sycl::ext::oneapi::experimental::command_graph g{q.get_context(),
q.get_device()};

float *dotp = sycl::malloc_shared<float>(1, q);

Expand Down Expand Up @@ -73,14 +74,14 @@ int main() {
},
{node_a, node_b});

auto executable_graph = g.finalize(q.get_context());
auto executable_graph = g.finalize();

// Add an extra node for the second executable graph which modifies the output
auto node_d =
g.add([&](sycl::handler &h) { h.single_task([=]() { dotp[0] += 1; }); },
{node_c});

auto executable_graph_2 = g.finalize(q.get_context());
auto executable_graph_2 = g.finalize();

// Using shortcut for executing a graph of commands
q.ext_oneapi_graph(executable_graph).wait();
Expand Down
5 changes: 3 additions & 2 deletions sycl/test/graph/graph-explicit-node-ordering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ int main() {

sycl::queue q{sycl::gpu_selector_v};

sycl::ext::oneapi::experimental::command_graph g;
sycl::ext::oneapi::experimental::command_graph g{q.get_context(),
q.get_device()};

const size_t n = 10;
float *x = sycl::malloc_shared<float>(n, q);
Expand Down Expand Up @@ -36,7 +37,7 @@ int main() {
g.make_edge(init, mult);
g.make_edge(mult, add);

auto executable_graph = g.finalize(q.get_context());
auto executable_graph = g.finalize();

q.submit([&](sycl::handler &h) {
h.ext_oneapi_graph(executable_graph);
Expand Down
5 changes: 3 additions & 2 deletions sycl/test/graph/graph-explicit-queue-shortcuts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ int main() {

// Test passing empty property list, which is the default
sycl::property_list empty_properties;
sycl::ext::oneapi::experimental::command_graph g(empty_properties);
sycl::ext::oneapi::experimental::command_graph g(
q.get_context(), q.get_device(), empty_properties);

const size_t n = 10;
float *arr = sycl::malloc_shared<float>(n, q);
Expand All @@ -21,7 +22,7 @@ int main() {
});
});

auto executable_graph = g.finalize(q.get_context(), empty_properties);
auto executable_graph = g.finalize(empty_properties);

auto e1 = q.ext_oneapi_graph(executable_graph);
auto e2 = q.ext_oneapi_graph(executable_graph, e1);
Expand Down
5 changes: 3 additions & 2 deletions sycl/test/graph/graph-explicit-reduction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
int main() {
sycl::queue q{sycl::gpu_selector_v};

sycl::ext::oneapi::experimental::command_graph g;
sycl::ext::oneapi::experimental::command_graph g{q.get_context(),
q.get_device()};

const size_t n = 10;
float *input = sycl::malloc_shared<float>(n, q);
Expand All @@ -21,7 +22,7 @@ int main() {
[=](sycl::id<1> idx, auto &sum) { sum += input[idx]; });
});

auto executable_graph = g.finalize(q.get_context());
auto executable_graph = g.finalize();
q.ext_oneapi_graph(executable_graph).wait();

assert(*output == 45);
Expand Down
5 changes: 3 additions & 2 deletions sycl/test/graph/graph-explicit-repeated-exec.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ int main() {

sycl::queue q{sycl::gpu_selector_v};

sycl::ext::oneapi::experimental::command_graph g;
sycl::ext::oneapi::experimental::command_graph g{q.get_context(),
q.get_device()};

const size_t n = 10;
float *arr = sycl::malloc_shared<float>(n, q);
Expand All @@ -26,7 +27,7 @@ int main() {
assert(arr[i] == 0);
}

auto executable_graph = g.finalize(q.get_context());
auto executable_graph = g.finalize();

for (int i = 0; i < n; i++) {
assert(arr[i] == 0);
Expand Down
5 changes: 3 additions & 2 deletions sycl/test/graph/graph-explicit-saxpy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ int main() {

sycl::queue q{sycl::gpu_selector_v};

sycl::ext::oneapi::experimental::command_graph g;
sycl::ext::oneapi::experimental::command_graph g{q.get_context(),
q.get_device()};

const size_t n = 1000;
const float a = 3.0f;
Expand All @@ -31,7 +32,7 @@ int main() {

g.make_edge(init, compute);

auto executable_graph = g.finalize(q.get_context());
auto executable_graph = g.finalize();

q.submit([&](sycl::handler &h) {
h.ext_oneapi_graph(executable_graph);
Expand Down
5 changes: 3 additions & 2 deletions sycl/test/graph/graph-explicit-single-node.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ int main() {

sycl::queue q{sycl::gpu_selector_v};

sycl::ext::oneapi::experimental::command_graph g;
sycl::ext::oneapi::experimental::command_graph g{q.get_context(),
q.get_device()};

const size_t n = 10;
float *arr = sycl::malloc_shared<float>(n, q);
Expand All @@ -27,7 +28,7 @@ int main() {
assert(arr[i] == 0);
}

auto executable_graph = g.finalize(q.get_context());
auto executable_graph = g.finalize();

for (int i = 0; i < n; i++) {
assert(arr[i] == 0);
Expand Down
Loading