-
Notifications
You must be signed in to change notification settings - Fork 795
Description
I have a number of classes that use the same cl::sycl::context
to allocate device-side memory; let's call these WorkerClass
s. I then have a number of kernels -- in a separate source file -- that need to access the device data from the WorkerClass
s. The device data are typically struct
s and/or pointers allocated on the device via USM.
What I've noticed is that if I remove the WorkerClass
s from the code -- that is, I allocate all device memory in a main
function -- the code executes as expected. For example,
// Compile:
// clang++ \
// -g -fsycl \
// -o kernel_args kernel_args.cc
#include <CL/sycl.hpp>
#include <iostream>
static const unsigned int MAX_FLOATS = 10;
struct MyStructA {
int* a;
float* a_floats;
};
struct MyStructB {
int* b;
float* b_floats;
MyStructA* ms_A;
};
class TestKernel {
public:
TestKernel() = delete;
TestKernel(MyStructB* ms)
: b_(ms->b), b_floats_(ms->b_floats), ms_A_(ms->ms_A) {}
void operator()(cl::sycl::id<1> idx) {
unsigned int id = (int)idx[0];
b_floats_[id] += 0.5;
*ms_A_->a = 100;
}
private:
int* b_;
float* b_floats_;
MyStructA* ms_A_;
};
int main() {
// Catch asynchronous exceptions
auto exception_handler = [](cl::sycl::exception_list exceptions) {
for (std::exception_ptr const& e : exceptions) {
try {
std::rethrow_exception(e);
} catch (cl::sycl::exception const& e) {
std::cout << "Caught asynchronous SYCL exception during generation:\n"
<< e.what() << std::endl;
}
}
};
// Initialize device, queue and context
cl::sycl::device dev = cl::sycl::device(cl::sycl::default_selector());
cl::sycl::queue queue = cl::sycl::queue(dev, exception_handler);
cl::sycl::context ctx = queue.get_context();
// Name of the device to run on
std::string dev_name =
queue.get_device().get_info<cl::sycl::info::device::name>();
std::cout << "Using device \"" << dev_name << "\"" << std::endl;
// Ensure device can handle USM device allocations.
if (!queue.get_device()
.get_info<cl::sycl::info::device::usm_device_allocations>()) {
std::cout << "ERROR :: device \"" << dev_name
<< "\" does not support usm_device_allocations!" << std::endl;
return -1;
}
// Instantiate a MyStructA
MyStructA ms_A{0, nullptr};
int* a_int = (int*)malloc_device(sizeof(int), dev, ctx);
float* a_floats = (float*)malloc_device(MAX_FLOATS * sizeof(float), dev, ctx);
// Host-side float data to copy
int* host_a = (int*)malloc(sizeof(int));
float* host_array = new float[MAX_FLOATS];
for (unsigned int i = 0; i < MAX_FLOATS; i++) {
host_array[i] = i * 1.0;
}
queue
.submit([&](cl::sycl::handler& h) {
// Copy host data to device
h.memcpy(a_floats, &host_array[0], MAX_FLOATS * sizeof(float));
})
.wait_and_throw();
queue
.submit([&](cl::sycl::handler& h) {
// Copy host data to device
h.memcpy(a_int, &host_a[0], sizeof(int));
})
.wait_and_throw();
// Assign MyStructA pointers
ms_A.a = a_int;
ms_A.a_floats = a_floats;
MyStructA* ms_A_dev = (MyStructA*)malloc_device(sizeof(ms_A), dev, ctx);
queue
.submit([&](cl::sycl::handler& h) {
// Copy host data to device
h.memcpy(ms_A_dev, &ms_A, sizeof(ms_A));
})
.wait_and_throw();
// Instantiate a MyStructB
MyStructB ms_B{0, nullptr, nullptr};
int* b_int = (int*)malloc_device(sizeof(int), dev, ctx);
float* b_floats = (float*)malloc_device(MAX_FLOATS * sizeof(float), dev, ctx);
queue
.submit([&](cl::sycl::handler& h) {
// Copy host data to device
h.memcpy(b_floats, &host_array[0], MAX_FLOATS * sizeof(float));
})
.wait_and_throw();
// Assign MyStructB pointers
ms_B.b = b_int;
ms_B.b_floats = b_floats;
ms_B.ms_A = ms_A_dev;
queue.submit([&](cl::sycl::handler& h) {
TestKernel kernel(&ms_B);
h.parallel_for<class foo>(cl::sycl::range<1>{MAX_FLOATS}, kernel);
});
queue.wait();
// Copy back to host
queue
.submit([&](cl::sycl::handler& h) {
// Copy host data to device
h.memcpy(host_array, &ms_B.b_floats[0], MAX_FLOATS * sizeof(float));
})
.wait_and_throw();
queue
.submit([&](cl::sycl::handler& h) {
// Copy host data to device
h.memcpy(host_a, &ms_B.ms_A->a[0], sizeof(float));
})
.wait_and_throw();
free(host_array);
free(host_a);
cl::sycl::free(a_int, ctx);
cl::sycl::free(a_floats, ctx);
cl::sycl::free(ms_A_dev, ctx);
cl::sycl::free(b_int, ctx);
cl::sycl::free(b_floats, ctx);
return 0;
}
On the other hand, with the WorkerClass
s taking care of the memory allocations, e.g.,
// Compile:
// clang++ \
// -g -fsycl \
// -o kernel_args kernel_args.cc
#include <CL/sycl.hpp>
#include <iostream>
static const unsigned int MAX_FLOATS = 10;
struct MyStructA {
int* a;
float* a_floats;
};
struct MyStructB {
int* b;
float* b_floats;
MyStructA* ms_A;
};
class WorkerClass {
public:
WorkerClass() {}
~WorkerClass() {}
bool Init() {
// Initialize device, queue and context
cl::sycl::device dev = cl::sycl::device(cl::sycl::default_selector());
cl::sycl::queue queue = cl::sycl::queue(dev);
ctx_ = new cl::sycl::context(queue.get_context());
// Name of the device to run on
std::string dev_name =
queue.get_device().get_info<cl::sycl::info::device::name>();
std::cout << "Using device \"" << dev_name << "\"" << std::endl;
// Ensure device can handle USM device allocations.
if (!queue.get_device()
.get_info<cl::sycl::info::device::usm_device_allocations>()) {
std::cout << "ERROR :: device \"" << dev_name
<< "\" does not support usm_device_allocations!" << std::endl;
return false;
}
// Instantiate a MyStructA
sA_ = {0, nullptr};
int* a_int = (int*)malloc_device(sizeof(int), dev, *ctx_);
float* a_floats =
(float*)malloc_device(MAX_FLOATS * sizeof(float), dev, *ctx_);
// Host-side float data to copy
int* host_a = (int*)malloc(sizeof(int));
float* host_array = new float[MAX_FLOATS];
for (unsigned int i = 0; i < MAX_FLOATS; i++) {
host_array[i] = i * 1.0;
}
queue
.submit([&](cl::sycl::handler& h) {
// Copy host data to device
h.memcpy(a_floats, &host_array[0], MAX_FLOATS * sizeof(float));
})
.wait_and_throw();
queue
.submit([&](cl::sycl::handler& h) {
// Copy host data to device
h.memcpy(a_int, &host_a[0], sizeof(int));
})
.wait_and_throw();
// Assign MyStructA pointers
sA_.a = a_int;
sA_.a_floats = a_floats;
MyStructA* sA_dev = (MyStructA*)malloc_device(sizeof(sA_), dev, *ctx_);
queue
.submit(
[&](cl::sycl::handler& h) { h.memcpy(sA_dev, &sA_, sizeof(sA_)); })
.wait_and_throw();
// Instantiate a MyStructB
sB_ = {0, nullptr, nullptr};
int* b_host = (int*)malloc(sizeof(int));
int* b_int = (int*)malloc_device(sizeof(int), dev, *ctx_);
float* b_floats =
(float*)malloc_device(MAX_FLOATS * sizeof(float), dev, *ctx_);
queue
.submit([&](cl::sycl::handler& h) {
h.memcpy(b_floats, &host_array[0], MAX_FLOATS * sizeof(float));
})
.wait_and_throw();
// Assign MyStructB pointers
sB_.b = b_int;
sB_.b_floats = b_floats;
sB_.ms_A = sA_dev;
return true;
}
cl::sycl::context* GetContext() { return ctx_; }
MyStructB* sB() { return &sB_; }
private:
MyStructA sA_;
MyStructB sB_;
cl::sycl::context* ctx_;
};
class TestKernel {
public:
TestKernel() = delete;
TestKernel(MyStructB* sB) {
b_floats_ = sB->b_floats;
sA_ = sB->ms_A;
}
void operator()(cl::sycl::id<1> idx) {
unsigned int id = (int)idx[0];
b_floats_[id] += 0.5;
if (id == 0) {
*sA_->a = 100;
}
}
private:
float* b_floats_;
MyStructA* sA_;
};
int main() {
// Catch asynchronous exceptions
auto exception_handler = [](cl::sycl::exception_list exceptions) {
for (std::exception_ptr const& e : exceptions) {
try {
std::rethrow_exception(e);
} catch (cl::sycl::exception const& e) {
std::cout << "Caught asynchronous SYCL exception during generation:\n"
<< e.what() << std::endl;
}
}
};
WorkerClass* wc = new WorkerClass();
wc->Init();
// Get context from WorkerClass
cl::sycl::context* ctx = wc->GetContext();
cl::sycl::device dev = ctx->get_devices()[0];
cl::sycl::queue queue = cl::sycl::queue(dev);
MyStructB* sB = wc->sB();
queue.submit([&](cl::sycl::handler& h) {
TestKernel kernel(sB);
h.parallel_for<class foo>(cl::sycl::range<1>{MAX_FLOATS}, kernel);
});
queue.wait();
return 0;
}
I get,
terminate called after throwing an instance of 'cl::sycl::runtime_error'
what(): OpenCL API failed. OpenCL API returns: -50 (CL_INVALID_ARG_VALUE) -50 (CL_INVALID_ARG_VALUE)
Aborted
I've seen a related post to access device-side memory [1] but does not use kernel function objects as I would like to. I'd like to note that I can access device memory with TestKernel
constructor-local variables, e.g.,
TestKernel(MyStructB* sB) {
float* b_floats = sB->b_floats; // Works
// b_floats_ = sB->b_floats; // CL_INVALID_ARG_VALUE
// sA_ = sB->ms_A; // CL_INVALID_ARG_VALUE
}
but not member variables. The error does not occur when using the "host" device so I'm lead to believe this is something to do with memory management on the device.
Unfortunately, I'm not sure of the exact version of llvm, only that it was built on 2020-07-23.
Thanks in advance.