Accessing iGPU device data allocated across classes with USM

I have a number of classes that use the same `cl::sycl::context` to allocate device-side memory; let's call these `WorkerClass`s. I then have a number of kernels -- in a separate source file -- that need to access the device data from the `WorkerClass`s. The device data are typically `struct`s and/or pointers allocated on the device via USM.

What I've noticed is that if I remove the `WorkerClass`s from the code -- that is, I allocate all device memory in a `main` function -- the code executes as expected. For example,
```
// Compile:
// clang++ \
// -g -fsycl \
// -o kernel_args kernel_args.cc

#include <CL/sycl.hpp>
#include <iostream>

static const unsigned int MAX_FLOATS = 10;

struct MyStructA {
  int* a;
  float* a_floats;
};

struct MyStructB {
  int* b;
  float* b_floats;
  MyStructA* ms_A;
};

class TestKernel {
 public:
  TestKernel() = delete;
  TestKernel(MyStructB* ms)
      : b_(ms->b), b_floats_(ms->b_floats), ms_A_(ms->ms_A) {}

  void operator()(cl::sycl::id<1> idx) {
    unsigned int id = (int)idx[0];
    b_floats_[id] += 0.5;
    *ms_A_->a = 100;
  }

 private:
  int* b_;
  float* b_floats_;
  MyStructA* ms_A_;
};

int main() {
  // Catch asynchronous exceptions
  auto exception_handler = [](cl::sycl::exception_list exceptions) {
    for (std::exception_ptr const& e : exceptions) {
      try {
        std::rethrow_exception(e);
      } catch (cl::sycl::exception const& e) {
        std::cout << "Caught asynchronous SYCL exception during generation:\n"
                  << e.what() << std::endl;
      }
    }
  };
  // Initialize device, queue and context
  cl::sycl::device dev = cl::sycl::device(cl::sycl::default_selector());
  cl::sycl::queue queue = cl::sycl::queue(dev, exception_handler);
  cl::sycl::context ctx = queue.get_context();

  // Name of the device to run on
  std::string dev_name =
      queue.get_device().get_info<cl::sycl::info::device::name>();
  std::cout << "Using device \"" << dev_name << "\"" << std::endl;
  // Ensure device can handle USM device allocations.
  if (!queue.get_device()
           .get_info<cl::sycl::info::device::usm_device_allocations>()) {
    std::cout << "ERROR :: device \"" << dev_name
              << "\" does not support usm_device_allocations!" << std::endl;
    return -1;
  }

  // Instantiate a MyStructA
  MyStructA ms_A{0, nullptr};
  int* a_int = (int*)malloc_device(sizeof(int), dev, ctx);
  float* a_floats = (float*)malloc_device(MAX_FLOATS * sizeof(float), dev, ctx);
  // Host-side float data to copy
  int* host_a = (int*)malloc(sizeof(int));
  float* host_array = new float[MAX_FLOATS];
  for (unsigned int i = 0; i < MAX_FLOATS; i++) {
    host_array[i] = i * 1.0;
  }

  queue
      .submit([&](cl::sycl::handler& h) {
        // Copy host data to device
        h.memcpy(a_floats, &host_array[0], MAX_FLOATS * sizeof(float));
      })
      .wait_and_throw();
  queue
      .submit([&](cl::sycl::handler& h) {
        // Copy host data to device
        h.memcpy(a_int, &host_a[0], sizeof(int));
      })
      .wait_and_throw();
  // Assign MyStructA pointers
  ms_A.a = a_int;
  ms_A.a_floats = a_floats;
  MyStructA* ms_A_dev = (MyStructA*)malloc_device(sizeof(ms_A), dev, ctx);
  queue
      .submit([&](cl::sycl::handler& h) {
        // Copy host data to device
        h.memcpy(ms_A_dev, &ms_A, sizeof(ms_A));
      })
      .wait_and_throw();

  // Instantiate a MyStructB
  MyStructB ms_B{0, nullptr, nullptr};
  int* b_int = (int*)malloc_device(sizeof(int), dev, ctx);
  float* b_floats = (float*)malloc_device(MAX_FLOATS * sizeof(float), dev, ctx);
  queue
      .submit([&](cl::sycl::handler& h) {
        // Copy host data to device
        h.memcpy(b_floats, &host_array[0], MAX_FLOATS * sizeof(float));
      })
      .wait_and_throw();
  // Assign MyStructB pointers
  ms_B.b = b_int;
  ms_B.b_floats = b_floats;
  ms_B.ms_A = ms_A_dev;

  queue.submit([&](cl::sycl::handler& h) {
    TestKernel kernel(&ms_B);
    h.parallel_for<class foo>(cl::sycl::range<1>{MAX_FLOATS}, kernel);
  });
  queue.wait();

  // Copy back to host
  queue
      .submit([&](cl::sycl::handler& h) {
        // Copy host data to device
        h.memcpy(host_array, &ms_B.b_floats[0], MAX_FLOATS * sizeof(float));
      })
      .wait_and_throw();
  queue
      .submit([&](cl::sycl::handler& h) {
        // Copy host data to device
        h.memcpy(host_a, &ms_B.ms_A->a[0], sizeof(float));
      })
      .wait_and_throw();

  free(host_array);
  free(host_a);
  cl::sycl::free(a_int, ctx);
  cl::sycl::free(a_floats, ctx);
  cl::sycl::free(ms_A_dev, ctx);
  cl::sycl::free(b_int, ctx);
  cl::sycl::free(b_floats, ctx);
  return 0;
}
```
On the other hand, with the `WorkerClass`s taking care of the memory allocations, e.g.,
```
// Compile:
// clang++ \
// -g -fsycl \
// -o kernel_args kernel_args.cc

#include <CL/sycl.hpp>
#include <iostream>

static const unsigned int MAX_FLOATS = 10;

struct MyStructA {
  int* a;
  float* a_floats;
};

struct MyStructB {
  int* b;
  float* b_floats;
  MyStructA* ms_A;
};

class WorkerClass {
 public:
  WorkerClass() {}
  ~WorkerClass() {}

  bool Init() {
    // Initialize device, queue and context
    cl::sycl::device dev = cl::sycl::device(cl::sycl::default_selector());
    cl::sycl::queue queue = cl::sycl::queue(dev);
    ctx_ = new cl::sycl::context(queue.get_context());

    // Name of the device to run on
    std::string dev_name =
        queue.get_device().get_info<cl::sycl::info::device::name>();
    std::cout << "Using device \"" << dev_name << "\"" << std::endl;
    // Ensure device can handle USM device allocations.
    if (!queue.get_device()
             .get_info<cl::sycl::info::device::usm_device_allocations>()) {
      std::cout << "ERROR :: device \"" << dev_name
                << "\" does not support usm_device_allocations!" << std::endl;
      return false;
    }

    // Instantiate a MyStructA
    sA_ = {0, nullptr};
    int* a_int = (int*)malloc_device(sizeof(int), dev, *ctx_);
    float* a_floats =
        (float*)malloc_device(MAX_FLOATS * sizeof(float), dev, *ctx_);
    // Host-side float data to copy
    int* host_a = (int*)malloc(sizeof(int));
    float* host_array = new float[MAX_FLOATS];
    for (unsigned int i = 0; i < MAX_FLOATS; i++) {
      host_array[i] = i * 1.0;
    }

    queue
        .submit([&](cl::sycl::handler& h) {
          // Copy host data to device
          h.memcpy(a_floats, &host_array[0], MAX_FLOATS * sizeof(float));
        })
        .wait_and_throw();
    queue
        .submit([&](cl::sycl::handler& h) {
          // Copy host data to device
          h.memcpy(a_int, &host_a[0], sizeof(int));
        })
        .wait_and_throw();
    // Assign MyStructA pointers
    sA_.a = a_int;
    sA_.a_floats = a_floats;
    MyStructA* sA_dev = (MyStructA*)malloc_device(sizeof(sA_), dev, *ctx_);
    queue
        .submit(
            [&](cl::sycl::handler& h) { h.memcpy(sA_dev, &sA_, sizeof(sA_)); })
        .wait_and_throw();

    // Instantiate a MyStructB
    sB_ = {0, nullptr, nullptr};
    int* b_host = (int*)malloc(sizeof(int));
    int* b_int = (int*)malloc_device(sizeof(int), dev, *ctx_);
    float* b_floats =
        (float*)malloc_device(MAX_FLOATS * sizeof(float), dev, *ctx_);
    queue
        .submit([&](cl::sycl::handler& h) {
          h.memcpy(b_floats, &host_array[0], MAX_FLOATS * sizeof(float));
        })
        .wait_and_throw();
    // Assign MyStructB pointers
    sB_.b = b_int;
    sB_.b_floats = b_floats;
    sB_.ms_A = sA_dev;

    return true;
  }

  cl::sycl::context* GetContext() { return ctx_; }

  MyStructB* sB() { return &sB_; }

 private:
  MyStructA sA_;
  MyStructB sB_;
  cl::sycl::context* ctx_;
};

class TestKernel {
 public:
  TestKernel() = delete;
  TestKernel(MyStructB* sB) {
    b_floats_ = sB->b_floats;
    sA_ = sB->ms_A;
  }

  void operator()(cl::sycl::id<1> idx) {
    unsigned int id = (int)idx[0];
    b_floats_[id] += 0.5;
    if (id == 0) {
        *sA_->a = 100;
    }
  }

 private:
  float* b_floats_;
  MyStructA* sA_;
};

int main() {
  // Catch asynchronous exceptions
  auto exception_handler = [](cl::sycl::exception_list exceptions) {
    for (std::exception_ptr const& e : exceptions) {
      try {
        std::rethrow_exception(e);
      } catch (cl::sycl::exception const& e) {
        std::cout << "Caught asynchronous SYCL exception during generation:\n"
                  << e.what() << std::endl;
      }
    }
  };
  WorkerClass* wc = new WorkerClass();
  wc->Init();

  // Get context from WorkerClass
  cl::sycl::context* ctx = wc->GetContext();
  cl::sycl::device dev = ctx->get_devices()[0];
  cl::sycl::queue queue = cl::sycl::queue(dev);

  MyStructB* sB = wc->sB();

  queue.submit([&](cl::sycl::handler& h) {
    TestKernel kernel(sB);
    h.parallel_for<class foo>(cl::sycl::range<1>{MAX_FLOATS}, kernel);
  });
  queue.wait();

  return 0;
}
```
I get,
```
terminate called after throwing an instance of 'cl::sycl::runtime_error'
  what():  OpenCL API failed. OpenCL API returns: -50 (CL_INVALID_ARG_VALUE) -50 (CL_INVALID_ARG_VALUE)
Aborted
```
I've seen a related post to access device-side memory [1] but does not use kernel function objects as I would like to. I'd like to note that I can access device memory with `TestKernel` constructor-local variables, e.g.,
```
  TestKernel(MyStructB* sB) {
    float* b_floats = sB->b_floats; // Works
    // b_floats_ = sB->b_floats;      // CL_INVALID_ARG_VALUE
    // sA_ = sB->ms_A;                   // CL_INVALID_ARG_VALUE
  }
```
but not member variables. The error does not occur when using the "host" device so I'm lead to believe this is something to do with memory management on the device.

Unfortunately, I'm not sure of the exact version of llvm, only that it was built on 2020-07-23.

Thanks in advance.

[1] https://community.intel.com/t5/Intel-oneAPI-Data-Parallel-C/Error-50-CL-INVALID-ARG-VALUE-for-Intel-iGPU/td-p/1158406

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Accessing iGPU device data allocated across classes with USM #2335

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Accessing iGPU device data allocated across classes with USM #2335

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions