diff --git a/SYCL/Scheduler/AllocaCleanup.cpp b/SYCL/Scheduler/AllocaCleanup.cpp new file mode 100644 index 0000000000..acd8b75c91 --- /dev/null +++ b/SYCL/Scheduler/AllocaCleanup.cpp @@ -0,0 +1,97 @@ +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: env SYCL_PI_TRACE=2 %GPU_RUN_PLACEHOLDER %t.out 2>&1 %GPU_CHECK_PLACEHOLDER + +#include +#include +#include +#include +#include +#include "CL/sycl.hpp" + +template +void cross_product (sycl::queue& q, + int n, int p, + sycl::buffer data_buf, + sycl::buffer w_buf, + sycl::buffer mean_buf, + sycl::buffer cross_product_buf) +{ + std::cout << '.'; + sycl::buffer helper_buf(n * p, sycl::no_init); // props + + q.submit([&] (auto &h) { + sycl::accessor data_acc(data_buf, h, sycl::read_only); + sycl::accessor helper_acc(helper_buf, h, sycl::write_only); + sycl::accessor mean_acc(mean_buf, h, sycl::read_only); + h.parallel_for(sycl::range<1>(n * p), [=] (sycl::item<1> item) { + int ij = item.get_id(); + helper_acc[ij] = data_acc[ij] - mean_acc[ij / n]; + }); + }); + q.wait(); + + std::vector> sub_helper; // props + for (int i = 0; i < p; i++) { + sub_helper.push_back(sycl::buffer(helper_buf, i*n, n)); + } + + auto policy = oneapi::dpl::execution::make_device_policy(q); + auto w_begin = oneapi::dpl::begin(w_buf); + for (int i = 0; i < p; i++) { + auto helper_i = oneapi::dpl::begin(sub_helper[i]); + + for (int j = i + 1; j < p; j++) { + auto helper_j = oneapi::dpl::begin(sub_helper[j]); + auto zip_begin = oneapi::dpl::make_zip_iterator(w_begin, helper_i, helper_j); + RealType sum = std::transform_reduce(policy, zip_begin, zip_begin + n, RealType{0}, std::plus<>(), + [](auto x) { return std::get<0>(x) * std::get<1>(x) * std::get<2>(x); }); + { + sycl::host_accessor h_cross_product(cross_product_buf); + h_cross_product[i*p+j] = sum; + h_cross_product[i+j*p] = sum; + } + } + auto zip_begin = oneapi::dpl::make_zip_iterator(w_begin, helper_i); + RealType sum = std::transform_reduce(policy, zip_begin, zip_begin + n, RealType{0}, std::plus<>(), + [](auto x) { return std::get<0>(x) * std::get<1>(x) * std::get<1>(x); }); + { + sycl::host_accessor h_cross_product(cross_product_buf); + h_cross_product[i*p+i] = sum; + } + } + q.wait(); +} + +int main() { + const int p = 30; // dimensions + const int n = 100; // observations + const int n_runs = 20; + std::vector data(n * p), w(n, 1), not_mean(p, 1); + // data[i, j] = data[i * n + j] + for (auto& x : data) x = float(std::rand()) / float(RAND_MAX); + + sycl::buffer data_buf(data.data(), data.size()); + sycl::buffer w_buf(w.data(), w.size()); + sycl::buffer cross_product_buf(p * p); + sycl::buffer not_mean_buf(not_mean.data(), not_mean.size()); + + sycl::queue q(sycl::gpu_selector{}); + + std::cout << "Running on: " << q.get_device().get_info() << std::endl; + + std::cout << "Start of parallel computing "; + + for (int i = 0; i < n_runs; i++) { + cross_product(q, n, p, data_buf, w_buf, not_mean_buf, cross_product_buf); + q.wait(); + } + + std::cout << "SUCCESS" << std::endl; + + return 0; +} + +// CHECK:---> piQueueRelease +// CHECK:---> piContextRelease +// CHECK:---> piKernelRelease +// CHECK:---> piProgramRelease