Skip to content

Change memory object USM allocation ownership, and make execution asynchronous #1705

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 25 commits into from
Jun 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
a51973c
Change _Memory object's memory ownership model
oleksandr-pavlyk Apr 26, 2024
dc77858
Deploy using shared pointers from keep_args_alive
oleksandr-pavlyk Apr 27, 2024
8a53212
Adding test for Memory_GetOpaquePointer CAPI function
oleksandr-pavlyk Apr 27, 2024
cf1cad0
Added tests for UsmNDArray_GetUSMData C-API function
oleksandr-pavlyk Apr 29, 2024
9df2b72
Fixed typo in the comment
oleksandr-pavlyk Apr 29, 2024
c85571b
Fixed unclosed quote in error pragma
oleksandr-pavlyk May 2, 2024
08b6dd0
Fixed memory leak introduced in new methods of usm_ndarray
oleksandr-pavlyk May 2, 2024
96cd26e
Create sequential order manager
oleksandr-pavlyk May 7, 2024
b78698e
Transition tensor to use SequentialOrderManager
oleksandr-pavlyk May 8, 2024
9116e73
Wrap call to sycl::free in try/catch
oleksandr-pavlyk May 8, 2024
be401a6
Fixed missing dependency events
oleksandr-pavlyk May 9, 2024
106c8de
Add queue synchronization points in special methods for conversion to…
oleksandr-pavlyk May 9, 2024
a5a481e
Use manager per queue in tensor implementation
oleksandr-pavlyk May 14, 2024
3f0f935
Adds clear method to SequentialOrderManager
oleksandr-pavlyk May 15, 2024
bdb2f75
Replaced use of synchronizing __sycl_usm_array_interface__ atribute
oleksandr-pavlyk May 17, 2024
1930287
Fix pre-commit
oleksandr-pavlyk Jun 4, 2024
7d3e228
Extend test symmetrically to improve coverage
oleksandr-pavlyk Jun 4, 2024
dd28026
Add tests for order manager
oleksandr-pavlyk Jun 4, 2024
71c5b84
Improve coverage of concat_axis_None
oleksandr-pavlyk Jun 4, 2024
a4b510c
Fix for the bug found by test added in previous commit
oleksandr-pavlyk Jun 4, 2024
61f8c54
Add docs for two new C-API functions added in this branch for 0.18
oleksandr-pavlyk Jun 5, 2024
f6a1f06
Add synchronization points in example
oleksandr-pavlyk Jun 5, 2024
759c1d0
Added wait call to cppclass queue and cppclass even in sycl.pxd
oleksandr-pavlyk Jun 5, 2024
7bc1ef8
Fixed issues found by @ndgrigorian
oleksandr-pavlyk Jun 5, 2024
f4e3b6f
Add comments for usm_ndarray::get_usm_data method
oleksandr-pavlyk Jun 5, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion docs/doc_sources/api_reference/dpctl_capi.rst
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,16 @@ API for :c:struct:`Py_MemoryObject`
:param nbytes: The size of allocation in bytes
:param QRef: instance of :c:struct:`PySyclQueueRef` corresponding
to ``sycl::queue`` to be associated with this allocation
:param owner: Python object instance whose deleter triggers freeing of this USM allocation
:param owner: Python object instance whose deleter triggers freeing of this USM allocation. Specify `owner=None`
to pass ownership to created Python memory object, which will use ``sycl::free(ptr, sycl_queue)`` for
deallocation.

.. c:function:: void * Memory_GetOpaquePointer(struct Py_MemoryObject *o)

:param o: Input object
:returns: Returns opaque pointer to `std::shared_ptr<void>` which manages the USM allocation,
or a `nullptr` if the USM allocation represented by `o` is not managed by the smart
pointer.

API for :c:struct:`PyUSMArrayObject`
------------------------------------
Expand Down Expand Up @@ -221,6 +230,11 @@ API for :c:struct:`PyUSMArrayObject`
:returns: Offset of zero multi-index array element from the beginning of
the USM allocation.

.. c:function:: PyObject * UsmNDArray_GetUSMData(struct PyUSMArrayObject *arr)

:param arr: Input object
:returns: Python memory object underlying input array `arr`.

.. c:function:: void UsmNDArray_SetWritableFlag(struct PyUSMArrayObject *arr, int flag)

:param arr: Input object
Expand Down
8 changes: 4 additions & 4 deletions dpctl/_sycl_queue.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,7 @@ cdef DPCTLSyclEventRef _memcpy_impl(
cdef unsigned char[::1] dst_host_buf = None

if isinstance(src, _Memory):
c_src_ptr = <void*>(<_Memory>src).memory_ptr
c_src_ptr = <void*>(<_Memory>src).get_data_ptr()
elif _is_buffer(src):
src_host_buf = src
c_src_ptr = <void *>&src_host_buf[0]
Expand All @@ -354,7 +354,7 @@ cdef DPCTLSyclEventRef _memcpy_impl(
)

if isinstance(dst, _Memory):
c_dst_ptr = <void*>(<_Memory>dst).memory_ptr
c_dst_ptr = <void*>(<_Memory>dst).get_data_ptr()
elif _is_buffer(dst):
dst_host_buf = dst
c_dst_ptr = <void *>&dst_host_buf[0]
Expand Down Expand Up @@ -1265,7 +1265,7 @@ cdef class SyclQueue(_SyclQueue):
cdef DPCTLSyclEventRef ERef = NULL

if isinstance(mem, _Memory):
ptr = <void*>(<_Memory>mem).memory_ptr
ptr = <void*>(<_Memory>mem).get_data_ptr()
else:
raise TypeError("Parameter `mem` should have type _Memory")

Expand All @@ -1285,7 +1285,7 @@ cdef class SyclQueue(_SyclQueue):
cdef DPCTLSyclEventRef ERef = NULL

if isinstance(mem, _Memory):
ptr = <void*>(<_Memory>mem).memory_ptr
ptr = <void*>(<_Memory>mem).get_data_ptr()
else:
raise TypeError("Parameter `mem` should have type _Memory")

Expand Down
223 changes: 203 additions & 20 deletions dpctl/apis/include/dpctl4pybind11.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ class dpctl_capi

// memory
DPCTLSyclUSMRef (*Memory_GetUsmPointer_)(Py_MemoryObject *);
void *(*Memory_GetOpaquePointer_)(Py_MemoryObject *);
DPCTLSyclContextRef (*Memory_GetContextRef_)(Py_MemoryObject *);
DPCTLSyclQueueRef (*Memory_GetQueueRef_)(Py_MemoryObject *);
size_t (*Memory_GetNumBytes_)(Py_MemoryObject *);
Expand All @@ -115,6 +116,7 @@ class dpctl_capi
int (*UsmNDArray_GetFlags_)(PyUSMArrayObject *);
DPCTLSyclQueueRef (*UsmNDArray_GetQueueRef_)(PyUSMArrayObject *);
py::ssize_t (*UsmNDArray_GetOffset_)(PyUSMArrayObject *);
PyObject *(*UsmNDArray_GetUSMData_)(PyUSMArrayObject *);
void (*UsmNDArray_SetWritableFlag_)(PyUSMArrayObject *, int);
PyObject *(*UsmNDArray_MakeSimpleFromMemory_)(int,
const py::ssize_t *,
Expand Down Expand Up @@ -233,15 +235,16 @@ class dpctl_capi
SyclContext_Make_(nullptr), SyclEvent_GetEventRef_(nullptr),
SyclEvent_Make_(nullptr), SyclQueue_GetQueueRef_(nullptr),
SyclQueue_Make_(nullptr), Memory_GetUsmPointer_(nullptr),
Memory_GetContextRef_(nullptr), Memory_GetQueueRef_(nullptr),
Memory_GetNumBytes_(nullptr), Memory_Make_(nullptr),
SyclKernel_GetKernelRef_(nullptr), SyclKernel_Make_(nullptr),
SyclProgram_GetKernelBundleRef_(nullptr), SyclProgram_Make_(nullptr),
UsmNDArray_GetData_(nullptr), UsmNDArray_GetNDim_(nullptr),
UsmNDArray_GetShape_(nullptr), UsmNDArray_GetStrides_(nullptr),
UsmNDArray_GetTypenum_(nullptr), UsmNDArray_GetElementSize_(nullptr),
UsmNDArray_GetFlags_(nullptr), UsmNDArray_GetQueueRef_(nullptr),
UsmNDArray_GetOffset_(nullptr), UsmNDArray_SetWritableFlag_(nullptr),
Memory_GetOpaquePointer_(nullptr), Memory_GetContextRef_(nullptr),
Memory_GetQueueRef_(nullptr), Memory_GetNumBytes_(nullptr),
Memory_Make_(nullptr), SyclKernel_GetKernelRef_(nullptr),
SyclKernel_Make_(nullptr), SyclProgram_GetKernelBundleRef_(nullptr),
SyclProgram_Make_(nullptr), UsmNDArray_GetData_(nullptr),
UsmNDArray_GetNDim_(nullptr), UsmNDArray_GetShape_(nullptr),
UsmNDArray_GetStrides_(nullptr), UsmNDArray_GetTypenum_(nullptr),
UsmNDArray_GetElementSize_(nullptr), UsmNDArray_GetFlags_(nullptr),
UsmNDArray_GetQueueRef_(nullptr), UsmNDArray_GetOffset_(nullptr),
UsmNDArray_GetUSMData_(nullptr), UsmNDArray_SetWritableFlag_(nullptr),
UsmNDArray_MakeSimpleFromMemory_(nullptr),
UsmNDArray_MakeSimpleFromPtr_(nullptr),
UsmNDArray_MakeFromPtr_(nullptr), USM_ARRAY_C_CONTIGUOUS_(0),
Expand Down Expand Up @@ -299,6 +302,7 @@ class dpctl_capi

// dpctl.memory API
this->Memory_GetUsmPointer_ = Memory_GetUsmPointer;
this->Memory_GetOpaquePointer_ = Memory_GetOpaquePointer;
this->Memory_GetContextRef_ = Memory_GetContextRef;
this->Memory_GetQueueRef_ = Memory_GetQueueRef;
this->Memory_GetNumBytes_ = Memory_GetNumBytes;
Expand All @@ -320,6 +324,7 @@ class dpctl_capi
this->UsmNDArray_GetFlags_ = UsmNDArray_GetFlags;
this->UsmNDArray_GetQueueRef_ = UsmNDArray_GetQueueRef;
this->UsmNDArray_GetOffset_ = UsmNDArray_GetOffset;
this->UsmNDArray_GetUSMData_ = UsmNDArray_GetUSMData;
this->UsmNDArray_SetWritableFlag_ = UsmNDArray_SetWritableFlag;
this->UsmNDArray_MakeSimpleFromMemory_ =
UsmNDArray_MakeSimpleFromMemory;
Expand Down Expand Up @@ -779,6 +784,33 @@ class usm_memory : public py::object
return api.Memory_GetNumBytes_(mem_obj);
}

bool is_managed_by_smart_ptr() const
{
auto const &api = ::dpctl::detail::dpctl_capi::get();
Py_MemoryObject *mem_obj = reinterpret_cast<Py_MemoryObject *>(m_ptr);
const void *opaque_ptr = api.Memory_GetOpaquePointer_(mem_obj);

return bool(opaque_ptr);
}

const std::shared_ptr<void> &get_smart_ptr_owner() const
{
auto const &api = ::dpctl::detail::dpctl_capi::get();
Py_MemoryObject *mem_obj = reinterpret_cast<Py_MemoryObject *>(m_ptr);
void *opaque_ptr = api.Memory_GetOpaquePointer_(mem_obj);

if (opaque_ptr) {
auto shptr_ptr =
reinterpret_cast<std::shared_ptr<void> *>(opaque_ptr);
return *shptr_ptr;
}
else {
throw std::runtime_error(
"Memory object does not have smart pointer "
"managing lifetime of USM allocation");
}
}

protected:
static PyObject *as_usm_memory(PyObject *o)
{
Expand Down Expand Up @@ -1065,6 +1097,71 @@ class usm_ndarray : public py::object
return static_cast<bool>(flags & api.USM_ARRAY_WRITABLE_);
}

/*! @brief Get usm_data property of array */
py::object get_usm_data() const
{
PyUSMArrayObject *raw_ar = usm_array_ptr();

auto const &api = ::dpctl::detail::dpctl_capi::get();
// UsmNDArray_GetUSMData_ gives a new reference
PyObject *usm_data = api.UsmNDArray_GetUSMData_(raw_ar);

// pass reference ownership to py::object
return py::reinterpret_steal<py::object>(usm_data);
}

bool is_managed_by_smart_ptr() const
{
PyUSMArrayObject *raw_ar = usm_array_ptr();

auto const &api = ::dpctl::detail::dpctl_capi::get();
PyObject *usm_data = api.UsmNDArray_GetUSMData_(raw_ar);

if (!PyObject_TypeCheck(usm_data, api.Py_MemoryType_)) {
Py_DECREF(usm_data);
return false;
}

Py_MemoryObject *mem_obj =
reinterpret_cast<Py_MemoryObject *>(usm_data);
const void *opaque_ptr = api.Memory_GetOpaquePointer_(mem_obj);

Py_DECREF(usm_data);
return bool(opaque_ptr);
}

const std::shared_ptr<void> &get_smart_ptr_owner() const
{
PyUSMArrayObject *raw_ar = usm_array_ptr();

auto const &api = ::dpctl::detail::dpctl_capi::get();

PyObject *usm_data = api.UsmNDArray_GetUSMData_(raw_ar);

if (!PyObject_TypeCheck(usm_data, api.Py_MemoryType_)) {
Py_DECREF(usm_data);
throw std::runtime_error(
"usm_ndarray object does not have Memory object "
"managing lifetime of USM allocation");
}

Py_MemoryObject *mem_obj =
reinterpret_cast<Py_MemoryObject *>(usm_data);
void *opaque_ptr = api.Memory_GetOpaquePointer_(mem_obj);
Py_DECREF(usm_data);

if (opaque_ptr) {
auto shptr_ptr =
reinterpret_cast<std::shared_ptr<void> *>(opaque_ptr);
return *shptr_ptr;
}
else {
throw std::runtime_error(
"Memory object underlying usm_ndarray does not have "
"smart pointer managing lifetime of USM allocation");
}
}

private:
PyUSMArrayObject *usm_array_ptr() const
{
Expand All @@ -1077,26 +1174,112 @@ class usm_ndarray : public py::object
namespace utils
{

namespace detail
{

struct ManagedMemory
{

static bool is_usm_managed_by_shared_ptr(const py::object &h)
{
if (py::isinstance<dpctl::memory::usm_memory>(h)) {
const auto &usm_memory_inst =
py::cast<dpctl::memory::usm_memory>(h);
return usm_memory_inst.is_managed_by_smart_ptr();
}
else if (py::isinstance<dpctl::tensor::usm_ndarray>(h)) {
const auto &usm_array_inst =
py::cast<dpctl::tensor::usm_ndarray>(h);
return usm_array_inst.is_managed_by_smart_ptr();
}

return false;
}

static const std::shared_ptr<void> &extract_shared_ptr(const py::object &h)
{
if (py::isinstance<dpctl::memory::usm_memory>(h)) {
const auto &usm_memory_inst =
py::cast<dpctl::memory::usm_memory>(h);
return usm_memory_inst.get_smart_ptr_owner();
}
else if (py::isinstance<dpctl::tensor::usm_ndarray>(h)) {
const auto &usm_array_inst =
py::cast<dpctl::tensor::usm_ndarray>(h);
return usm_array_inst.get_smart_ptr_owner();
}

throw std::runtime_error(
"Attempted extraction of shared_ptr on an unrecognized type");
}
};

} // end of namespace detail

template <std::size_t num>
sycl::event keep_args_alive(sycl::queue &q,
const py::object (&py_objs)[num],
const std::vector<sycl::event> &depends = {})
{
sycl::event host_task_ev = q.submit([&](sycl::handler &cgh) {
cgh.depends_on(depends);
std::array<std::shared_ptr<py::handle>, num> shp_arr;
for (std::size_t i = 0; i < num; ++i) {
shp_arr[i] = std::make_shared<py::handle>(py_objs[i]);
shp_arr[i]->inc_ref();
std::size_t n_objects_held = 0;
std::array<std::shared_ptr<py::handle>, num> shp_arr{};

std::size_t n_usm_owners_held = 0;
std::array<std::shared_ptr<void>, num> shp_usm{};

for (std::size_t i = 0; i < num; ++i) {
const auto &py_obj_i = py_objs[i];
if (detail::ManagedMemory::is_usm_managed_by_shared_ptr(py_obj_i)) {
const auto &shp =
detail::ManagedMemory::extract_shared_ptr(py_obj_i);
shp_usm[n_usm_owners_held] = shp;
++n_usm_owners_held;
}
cgh.host_task([shp_arr = std::move(shp_arr)]() {
py::gil_scoped_acquire acquire;
else {
shp_arr[n_objects_held] = std::make_shared<py::handle>(py_obj_i);
shp_arr[n_objects_held]->inc_ref();
++n_objects_held;
}
}

for (std::size_t i = 0; i < num; ++i) {
shp_arr[i]->dec_ref();
bool use_depends = true;
sycl::event host_task_ev;

if (n_usm_owners_held > 0) {
host_task_ev = q.submit([&](sycl::handler &cgh) {
if (use_depends) {
cgh.depends_on(depends);
use_depends = false;
}
else {
cgh.depends_on(host_task_ev);
}
cgh.host_task([shp_usm = std::move(shp_usm)]() {
// no body, but shared pointers are captured in
// the lambda, ensuring that USM allocation is
// kept alive
});
});
}

if (n_objects_held > 0) {
host_task_ev = q.submit([&](sycl::handler &cgh) {
if (use_depends) {
cgh.depends_on(depends);
use_depends = false;
}
else {
cgh.depends_on(host_task_ev);
}
cgh.host_task([n_objects_held, shp_arr = std::move(shp_arr)]() {
py::gil_scoped_acquire acquire;

for (std::size_t i = 0; i < n_objects_held; ++i) {
shp_arr[i]->dec_ref();
}
});
});
});
}

return host_task_ev;
}
Expand Down
12 changes: 6 additions & 6 deletions dpctl/memory/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@

file(GLOB _cython_sources *.pyx)
foreach(_cy_file ${_cython_sources})
get_filename_component(_trgt ${_cy_file} NAME_WLE)
build_dpctl_ext(${_trgt} ${_cy_file} "dpctl/memory")
target_link_libraries(DpctlCAPI INTERFACE ${_trgt}_headers)
endforeach()
set(_cy_file ${CMAKE_CURRENT_SOURCE_DIR}/_memory.pyx)
get_filename_component(_trgt ${_cy_file} NAME_WLE)
build_dpctl_ext(${_trgt} ${_cy_file} "dpctl/memory" SYCL)
# _memory include _opaque_smart_ptr.hpp
target_include_directories(${_trgt} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
target_link_libraries(DpctlCAPI INTERFACE ${_trgt}_headers)
5 changes: 4 additions & 1 deletion dpctl/memory/_memory.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ cdef DPCTLSyclQueueRef get_queue_ref_from_ptr_and_syclobj(


cdef public api class _Memory [object Py_MemoryObject, type Py_MemoryType]:
cdef DPCTLSyclUSMRef memory_ptr
cdef DPCTLSyclUSMRef _memory_ptr
cdef void* _opaque_ptr
cdef Py_ssize_t nbytes
cdef SyclQueue queue
cdef object refobj
Expand All @@ -50,6 +51,8 @@ cdef public api class _Memory [object Py_MemoryObject, type Py_MemoryType]:
cpdef memset(self, unsigned short val=*)

cpdef bytes tobytes(self)
cdef DPCTLSyclUSMRef get_data_ptr(self)
cdef void * get_opaque_ptr(self)

@staticmethod
cdef SyclDevice get_pointer_device(
Expand Down
Loading