Reduction function throws an instance of 'cl::sycl::nd_range_error'

When I use the reduction function, it throws a 'cl::sycl::nd_range_error' when the input array is >= 409600 bytes (102400 floats).
Below is the detailed error log:
```shell
terminate called after throwing an instance of 'cl::sycl::nd_range_error'                                                                                                             
 what():  Global_work_size not evenly divisible by local_work_size. Non-uniform work-groups are not allowed by default.  
 Underlying OpenCL 2.x implementation supports this feature and to enable it, build device program with -cl-std=CL2.0 -54 (CL_INVALID_WORK_GROUP_SIZE)  
 Aborted (core dumped)
```
However, my global_work_size is an exact multiple of the local size: global_work_size = num_work_group * work_group_size.
Below is the code:
```c++
#include <algorithm>
#include <iostream>
#include <random>
#include <CL/sycl.hpp>

// Turns the tokens passed as `Variable` into a string literal.
#define GET_VARIABLE_NAME(Variable) (#Variable)

// Prints "<prop>: <value>" to std::cout, where <value> is the result of
// dev.get_info<cl::sycl::info::device::prop>().
// The body is wrapped in do { ... } while (0) so that the macro behaves as a
// single statement: callers write `SHOW_DEVICE_INFO(dev, name);` and it stays
// correct inside an unbraced if/else. `dev` is parenthesized so an arbitrary
// expression can be passed.
#define SHOW_DEVICE_INFO(dev, prop)                                           \
  do {                                                                        \
    std::cout << GET_VARIABLE_NAME(prop) << ": "                              \
              << (dev).get_info<cl::sycl::info::device::prop>() << std::endl; \
  } while (0)

// Writes a short description of `dev` to std::cout: the name of the platform
// it belongs to, followed by the device name, vendor and driver version, and
// a trailing blank line. Always returns 0.
int device_info(cl::sycl::device dev) {
  std::cout << "platform: "
            << dev.get_platform().get_info<cl::sycl::info::platform::name>()
            << std::endl;

  // One "<property>: <value>" line per queried device property.
  SHOW_DEVICE_INFO(dev, name);
  SHOW_DEVICE_INFO(dev, vendor);
  SHOW_DEVICE_INFO(dev, driver_version);

  std::cout << std::endl;
  return 0;
}

// Fills arr[0 .. nelems) with pseudo-random values drawn from a uniform real
// distribution constructed with bounds 0 and 1.
// The generator is a std::mt19937 seeded from std::random_device, so the
// contents differ from run to run. `T` must be a floating-point type accepted
// by std::uniform_real_distribution.
template <typename T>
void RandomInit(T *arr, size_t nelems) {
  std::random_device seed_source;
  std::mt19937 engine(seed_source());
  std::uniform_real_distribution<T> unit_dist(0.f, 1.f);

  // Elements are written front to back, one draw per element.
  T *const last = arr + nelems;
  for (T *cur = arr; cur != last; ++cur) {
    *cur = unit_dist(engine);
  }
}

// Sums the first `nelem` elements of `in` on the device behind `q` using the
// ONEAPI reduction extension, and returns the sum as a float.
//
//   q     - queue the reduction kernel is submitted to.
//   in    - indexable input (e.g. a USM pointer) readable from the device;
//           it is captured by value into the kernel.
//   nelem - number of elements to sum. When 0 the function returns 0.0f
//           without submitting any work.
//
// The kernel is launched over a 1-D nd_range whose local size is 256 and whose
// global size is `nelem` rounded up to the next multiple of 256; work-items
// past the end of the input contribute nothing.
template <typename IN_T>
float reduce_algorithm(cl::sycl::queue &q, IN_T in, size_t nelem) {
  // An empty input would otherwise produce a zero-sized nd_range.
  if (nelem == 0) return 0.0f;

  constexpr size_t kGroupSize = 256;
  const size_t num_groups = (nelem + kGroupSize - 1) / kGroupSize;
  const cl::sycl::range<1> local(kGroupSize);
  const cl::sycl::range<1> global(num_groups * kGroupSize);

  float result = 0.0f;
  {
    // The buffer wraps `result`; leaving this scope destroys the buffer, which
    // waits for the kernel and writes the reduced value back to `result`.
    cl::sycl::buffer<float> res_buf(&result, 1);
    printf("local: %zu, glocal: %zu\n", local.size(), global.size());
    q.submit([&](cl::sycl::handler &cgh) {
      auto res_acc = res_buf.template get_access<cl::sycl::access::mode::read_write>(cgh);
      cgh.parallel_for(cl::sycl::nd_range<1>(global, local),
          cl::sycl::ONEAPI::reduction(res_acc, 0.0f, cl::sycl::ONEAPI::plus<float>()),
          [=](cl::sycl::nd_item<1> it, auto &partial) {
            // Keep the index as size_t: narrowing it to int would truncate
            // large ids and make the bounds check a signed/unsigned compare.
            const size_t gid = it.get_global_id(0);
            if (gid < nelem) {
              partial += in[gid];
            }
          });
    });
  }
  return result;
}

// Reproducer entry point: prints the selected device, fills a host USM buffer
// with 100 * 1024 random floats, copies it to device USM, reduces it on the
// device and prints the resulting sum.
// Returns 0 on success and 1 if a USM allocation fails.
int main(int argc, char **argv) {
  cl::sycl::queue q;
  device_info(q.get_device());
  const size_t nelems = 100 * 1024;
  const size_t nbytes = sizeof(float) * nelems;

  // The USM allocators return void*, so static_cast is sufficient.
  auto *p = static_cast<float *>(sycl::aligned_alloc_host(4096, nbytes, q));
  auto *in = static_cast<float *>(sycl::aligned_alloc_device(4096, nbytes, q));
  if (p == nullptr || in == nullptr) {
    std::cerr << "USM allocation failed" << std::endl;
    if (p != nullptr) sycl::free(p, q);
    if (in != nullptr) sycl::free(in, q);
    return 1;
  }

  RandomInit(p, nelems);
  // USM copies carry no implicit dependencies on a default (out-of-order)
  // queue, so wait for the copy before the kernel reads `in`.
  q.memcpy(in, p, nbytes).wait();
  const float sum = reduce_algorithm<float *>(q, in, nelems);
  std::cout << "sum: " << sum << std::endl;

  // reduce_algorithm has already waited for its kernel (via the result
  // buffer's destruction), so the allocations are no longer in use.
  sycl::free(in, q);
  sycl::free(p, q);
  return 0;
}
```

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Reduction function throws an instance of 'cl::sycl::nd_range_error' #3138

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Reduction function throws an instance of 'cl::sycl::nd_range_error' #3138

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions