Skip to content

Repro for reduction fail #31

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 30, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
149 changes: 76 additions & 73 deletions sycl/test/graph/graph-explicit-dotp.cpp
Original file line number Diff line number Diff line change
@@ -1,94 +1,97 @@
#include <iostream>
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
#include <CL/sycl.hpp>
#include <iostream>

#include <sycl/ext/oneapi/experimental/graph.hpp>

const size_t n = 10;

/// Computes, on the host, the reference value for the dot product that the
/// device graph in this file is checked against.
///
/// Every one of the `n` elements contributes the same term,
/// (alpha * 1 + beta * 2) * (gamma * 3 + beta * 2), mirroring the device
/// kernels: x is initialised to 1, y to 2, z to 3, then
/// x = alpha * x + beta * y and z = gamma * z + beta * y, and finally
/// sum(x[i] * z[i]).
///
/// @return The accumulated sum over `n` elements.
float host_gold_result() {
  const float alpha = 1.0f;
  const float beta = 2.0f;
  const float gamma = 3.0f;

  float sum = 0.0f;

  // Accumulate element by element, in the same order as the original loop, so
  // the float result is bit-for-bit what the test has always compared against.
  for (size_t i = 0; i < n; ++i) {
    sum += (alpha * 1.0f + beta * 2.0f) * (gamma * 3.0f + beta * 2.0f);
  }

  return sum;
}

int main() {
float alpha = 1.0f;
float beta = 2.0f;
float gamma = 3.0f;

float *x , *y, *z;

sycl::property_list properties{
sycl::property::queue::in_order(),
sycl::ext::oneapi::property::queue::lazy_execution{}
};

sycl::queue q{sycl::gpu_selector_v, properties};

sycl::ext::oneapi::experimental::command_graph g;

float *dotp = sycl::malloc_shared<float>(1, q);

x = sycl::malloc_shared<float>(n, q);
y = sycl::malloc_shared<float>(n, q);
z = sycl::malloc_shared<float>(n, q);

/* init data on the device */
auto n_i = g.add([&](sycl::handler &h) {
h.parallel_for(n, [=](sycl::id<1> it){
const size_t i = it[0];
x[i] = 1.0f;
y[i] = 2.0f;
z[i] = 3.0f;
});
float alpha = 1.0f;
float beta = 2.0f;
float gamma = 3.0f;

sycl::property_list properties{
sycl::property::queue::in_order{},
sycl::ext::oneapi::property::queue::lazy_execution{}};

sycl::queue q{sycl::gpu_selector_v, properties};

sycl::ext::oneapi::experimental::command_graph g;

float *dotp = sycl::malloc_shared<float>(1, q);

float *x = sycl::malloc_shared<float>(n, q);
float *y = sycl::malloc_shared<float>(n, q);
float *z = sycl::malloc_shared<float>(n, q);

/* init data on the device */
auto n_i = g.add([&](sycl::handler &h) {
h.parallel_for(n, [=](sycl::id<1> it) {
const size_t i = it[0];
x[i] = 1.0f;
y[i] = 2.0f;
z[i] = 3.0f;
});
});

auto node_a = g.add([&](sycl::handler &h) {
auto node_a = g.add(
[&](sycl::handler &h) {
h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) {
const size_t i = it[0];
x[i] = alpha * x[i] + beta * y[i];
const size_t i = it[0];
x[i] = alpha * x[i] + beta * y[i];
});
}, {n_i});

auto node_b = g.add([&](sycl::handler &h) {
},
{n_i});

auto node_b = g.add(
[&](sycl::handler &h) {
h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) {
const size_t i = it[0];
z[i] = gamma * z[i] + beta * y[i];
const size_t i = it[0];
z[i] = gamma * z[i] + beta * y[i];
});
}, {n_i});
},
{n_i});

auto node_c = g.add([&](sycl::handler &h) {
auto node_c = g.add(
[&](sycl::handler &h) {
h.parallel_for(sycl::range<1>{n},
sycl::reduction(dotp, 0.0f, std::plus()),
[=](sycl::id<1> it, auto &sum) {
const size_t i = it[0];
sum += x[i] * z[i];
const size_t i = it[0];
sum += x[i] * z[i];
});
}, {node_a, node_b});

auto exec_graph = g.finalize(q.get_context());

exec_graph.exec_and_wait(q);

if (*dotp != host_gold_result()) {
std::cout << "Error unexpected result!\n";
}

sycl::free(dotp, q);
sycl::free(x, q);
sycl::free(y, q);
sycl::free(z, q);

std::cout << "done.\n";

return 0;
}
},
{node_a, node_b});

auto exec_graph = g.finalize(q.get_context());

exec_graph.exec_and_wait(q);

if (*dotp != host_gold_result()) {
std::cout << "Error unexpected result!\n";
}

sycl::free(dotp, q);
sycl::free(x, q);
sycl::free(y, q);
sycl::free(z, q);

std::cout << "done.\n";

return 0;
}
37 changes: 37 additions & 0 deletions sycl/test/graph/graph-explicit-reduction.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
#include <CL/sycl.hpp>
#include <iostream>

#include <sycl/ext/oneapi/experimental/graph.hpp>

/// Minimal reproducer: submits a single sum-reduction kernel directly to a
/// queue created with the in_order and lazy_execution properties, then waits
/// on the returned event.
///
/// The command_graph object is constructed but no nodes are added to it in
/// this function. NOTE(review): presumably its mere presence is part of the
/// repro; confirm before removing.
///
/// @return Always 0. The reduced value is not checked here.
int main() {
  sycl::property_list properties{
      sycl::property::queue::in_order{},
      sycl::ext::oneapi::property::queue::lazy_execution{}};

  sycl::queue q{sycl::gpu_selector_v, properties};

  sycl::ext::oneapi::experimental::command_graph g;

  const size_t n = 10;
  float *input = sycl::malloc_shared<float>(n, q);
  float *output = sycl::malloc_shared<float>(1, q);
  for (size_t i = 0; i < n; i++) {
    input[i] = static_cast<float>(i);
  }
  // A SYCL 2020 reduction folds into the value already stored in the
  // reduction variable unless initialize_to_identity is requested, and
  // malloc_shared does not zero memory. Seed the accumulator so the kernel
  // does not start from an indeterminate value.
  *output = 0.0f;

  auto e = q.submit([&](sycl::handler &h) {
    // 0.0f is the identity for std::plus, not the initial value of *output.
    h.parallel_for(sycl::range<1>{n},
                   sycl::reduction(output, 0.0f, std::plus()),
                   [=](sycl::id<1> idx, auto &sum) { sum += input[idx]; });
  });

  e.wait();

  sycl::free(input, q);
  sycl::free(output, q);

  std::cout << "done\n";

  return 0;
}
73 changes: 36 additions & 37 deletions sycl/test/graph/graph-explicit-simple.cpp
Original file line number Diff line number Diff line change
@@ -1,43 +1,42 @@
#include <iostream>
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
#include <CL/sycl.hpp>
#include <iostream>

#include <sycl/ext/oneapi/experimental/graph.hpp>

const size_t n = 10;

/// Smallest explicit-graph example: adds one kernel node that writes 1 to every
/// element of a shared array, finalizes the graph, and executes it on the
/// queue.
///
/// arr[0] is sampled at three points (after add, after finalize, after
/// execution) into otherwise unused locals.
/// NOTE(review): these look like debugger probes for checking that nothing
/// runs before exec_and_wait(); the first two read memory that has not been
/// written yet, so their values are indeterminate. Confirm intent.
///
/// @return Always 0.
int main() {
  sycl::property_list properties{
      sycl::property::queue::in_order{},
      sycl::ext::oneapi::property::queue::lazy_execution{}};

  sycl::queue q{sycl::gpu_selector_v, properties};

  sycl::ext::oneapi::experimental::command_graph g;

  const size_t n = 10;
  float *arr = sycl::malloc_shared<float>(n, q);

  g.add([&](sycl::handler &h) {
    h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> idx) {
      size_t i = idx;
      arr[i] = 1;
    });
  });

  auto result_before_exec1 = arr[0];

  auto exec_graph = g.finalize(q.get_context());

  auto result_before_exec2 = arr[0];

  exec_graph.exec_and_wait(q);

  auto result = arr[0];

  // Release the USM allocation; only the post-change side of the diff did so.
  sycl::free(arr, q);

  std::cout << "done.\n";

  return 0;
}