-
Notifications
You must be signed in to change notification settings - Fork 795
Closed
Description
When I use the reduction function, it raises a 'cl::sycl::nd_range_error' when the input array is >= 409600 bytes.
Below is the detailed error log:
terminate called after throwing an instance of 'cl::sycl::nd_range_error'
what(): Global_work_size not evenly divisible by local_work_size. Non-uniform work-groups are not allowed by default.
Underlying OpenCL 2.x implementation supports this feature and to enable it, build device program with -cl-std=CL2.0 -54 (CL_INVALID_WORK_GROUP_SIZE)
Aborted (core dumped)
However, my global_work_size is exactly num_work_group * work_group_size, so it is evenly divisible by the local work size.
Below is the code:
#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdio>
#include <iostream>
#include <random>

#include <CL/sycl.hpp>
// Stringifies its argument: GET_VARIABLE_NAME(foo) expands to the literal ("foo").
#define GET_VARIABLE_NAME(Variable) (#Variable)

// Prints "<prop>: <value>" to std::cout, where <value> is the result of
// dev.get_info<cl::sycl::info::device::prop>().
//
// The body is wrapped in do { } while (0) and carries no trailing semicolon, so
// the expansion is exactly one statement and stays correct inside an unbraced
// if/else. `dev` is parenthesized so that any expression may be passed.
#define SHOW_DEVICE_INFO(dev, prop)                                          \
  do {                                                                       \
    std::cout << GET_VARIABLE_NAME(prop) << ": "                             \
              << (dev).get_info<cl::sycl::info::device::prop>() << std::endl; \
  } while (0)
// Writes a short description of `dev` to std::cout: the name of the platform
// it belongs to, then its name, vendor and driver version (one per line),
// followed by an empty line. Always returns 0.
int device_info(cl::sycl::device dev) {
  const cl::sycl::platform owning_platform = dev.get_platform();

  std::cout << "platform: "
            << owning_platform.get_info<cl::sycl::info::platform::name>()
            << std::endl;

  // Each line below prints "<property>: <value>" for one device descriptor.
  SHOW_DEVICE_INFO(dev, name);
  SHOW_DEVICE_INFO(dev, vendor);
  SHOW_DEVICE_INFO(dev, driver_version);

  std::cout << std::endl;
  return 0;
}
// Overwrites arr[0 .. nelems) with values drawn from a
// std::uniform_real_distribution<T> constructed with the bounds (0.f, 1.f).
// A std::mt19937 engine, seeded once per call from std::random_device,
// supplies the randomness. Nothing is written when nelems is 0.
template <typename T>
void RandomInit(T *arr, size_t nelems) {
  std::random_device seed_source;
  std::mt19937 engine(seed_source());
  std::uniform_real_distribution<T> unit_dist(0.f, 1.f);

  // Generator invoked once per output element, in order.
  const auto next_value = [&engine, &unit_dist] { return unit_dist(engine); };
  std::generate_n(arr, nelems, next_value);
}
// Sums the first `nelem` elements of `in` on the device behind `q` and returns
// the total as a float.
//
//   q     - queue the reduction kernel is submitted to.
//   in    - indexable input, captured by value into the kernel; it must be
//           readable on the device (the caller in this file passes a pointer
//           obtained from sycl::aligned_alloc_device).
//   nelem - number of elements to add up; 0 returns 0.0f without launching
//           a kernel.
//
// Also prints the local and global range sizes to stdout before submitting.
//
// NOTE(review): the global range built here is always an exact multiple of the
// local range, so an nd_range_error about non-divisible sizes is presumably
// raised from inside the reduction implementation rather than by this launch
// configuration - confirm against the runtime version in use.
template <typename IN_T>
float reduce_algorithm(cl::sycl::queue &q, IN_T in, size_t nelem) {
  // An empty input would otherwise produce a zero-sized global range.
  if (nelem == 0) return 0.0f;

  constexpr size_t kGroupSize = 256;
  // Round up so every element is covered by a work-item; the surplus
  // work-items of the last group are masked out inside the kernel.
  const size_t num_groups = (nelem + kGroupSize - 1) / kGroupSize;
  const cl::sycl::range<1> local(kGroupSize);
  const cl::sycl::range<1> global(num_groups * kGroupSize);

  std::array<float, 1> result{0.0f};
  {
    // Scope the buffer so that its destructor runs - and the reduced value is
    // written back into `result` - before the return statement below.
    cl::sycl::buffer<float> res_buf(result.data(), 1);
    std::printf("local: %zu, glocal: %zu\n", local.size(), global.size());
    q.submit([&](cl::sycl::handler &cgh) {
      auto res_acc = res_buf.template get_access<cl::sycl::access::mode::read_write>(cgh);
      cgh.parallel_for(
          cl::sycl::nd_range<1>(global, local),
          cl::sycl::ONEAPI::reduction(res_acc, 0.0f, cl::sycl::ONEAPI::plus<float>()),
          [=](cl::sycl::nd_item<1> it, auto &sum) {
            // Keep the index as size_t: no narrowing to int and no
            // signed/unsigned comparison against nelem.
            const size_t gid = it.get_global_id(0);
            if (gid < nelem) sum += in[gid];
          });
    });
  }
  return result[0];
}
// Reproducer entry point: prints information about the default device, fills
// a host USM buffer of 100 * 1024 floats with random values, copies it to a
// device USM buffer and sums it with reduce_algorithm.
// Returns 0 on success and 1 if either USM allocation fails.
int main(int argc, char **argv) {
  cl::sycl::queue q;
  device_info(q.get_device());

  const size_t nelems = 100 * 1024;
  const size_t nbytes = sizeof(float) * nelems;

  // The allocators return void*, for which static_cast is the right cast.
  auto *p = static_cast<float *>(sycl::aligned_alloc_host(4096, nbytes, q));
  auto *in = static_cast<float *>(sycl::aligned_alloc_device(4096, nbytes, q));
  if (p == nullptr || in == nullptr) {
    std::cerr << "USM allocation failed" << std::endl;
    if (p != nullptr) sycl::free(p, q);
    if (in != nullptr) sycl::free(in, q);
    return 1;
  }

  RandomInit(p, nelems);

  // memcpy only enqueues the copy. Wait for it, because nothing else orders
  // the reduction kernel after the copy and it could read `in` too early.
  q.memcpy(in, p, nbytes).wait();

  [[maybe_unused]] const float sum = reduce_algorithm<float *>(q, in, nelems);

  // Release both USM allocations; they were previously leaked.
  sycl::free(in, q);
  sycl::free(p, q);
  return 0;
}
Metadata
Metadata
Assignees
Labels
No labels