diff --git a/docs/doc_sources/api_reference/dpctl_capi.rst b/docs/doc_sources/api_reference/dpctl_capi.rst index 0d69cf808d..382859efd8 100644 --- a/docs/doc_sources/api_reference/dpctl_capi.rst +++ b/docs/doc_sources/api_reference/dpctl_capi.rst @@ -152,7 +152,16 @@ API for :c:struct:`Py_MemoryObject` :param nbytes: The size of allocation in bytes :param QRef: instance of :c:struct:`PySyclQueueRef` corresponding to ``sycl::queue`` to be associated with this allocation - :param owner: Python object instance whose deleter triggers freeing of this USM allocation + :param owner: Python object instance whose deleter triggers freeing of this USM allocation. Specify `owner=None` + to pass ownership to the created Python memory object, which will use ``sycl::free(ptr, sycl_queue)`` for + deallocation. + +.. c:function:: void * Memory_GetOpaquePointer(struct Py_MemoryObject *o) + + :param o: Input object + :returns: Returns an opaque pointer to the `std::shared_ptr` which manages the USM allocation, + or `nullptr` if the USM allocation represented by `o` is not managed by the smart + pointer. API for :c:struct:`PyUSMArrayObject` ------------------------------------ @@ -221,6 +230,11 @@ API for :c:struct:`PyUSMArrayObject` :returns: Offset of zero multi-index array element from the beginning of the USM allocation. +.. c:function:: PyObject * UsmNDArray_GetUSMData(struct PyUSMArrayObject *arr) + + :param arr: Input object + :returns: Python memory object underlying the input array `arr`. + .. c:function:: void UsmNDArray_SetWritableFlag(struct PyUSMArrayObject *arr, int flag) :param arr: Input object diff --git a/dpctl/_sycl_queue.pyx b/dpctl/_sycl_queue.pyx index e85e35c29a..083ca3c6dc 100644 --- a/dpctl/_sycl_queue.pyx +++ b/dpctl/_sycl_queue.pyx @@ -342,7 +342,7 @@ cdef DPCTLSyclEventRef _memcpy_impl( cdef unsigned char[::1] dst_host_buf = None if isinstance(src, _Memory): - c_src_ptr = (<_Memory>src).memory_ptr + c_src_ptr = (<_Memory>src).get_data_ptr() elif _is_buffer(src): src_host_buf = src c_src_ptr = &src_host_buf[0] @@ -354,7 +354,7 @@ ) if isinstance(dst, _Memory): - c_dst_ptr = (<_Memory>dst).memory_ptr + c_dst_ptr = (<_Memory>dst).get_data_ptr() elif _is_buffer(dst): dst_host_buf = dst c_dst_ptr = &dst_host_buf[0] @@ -1265,7 +1265,7 @@ cdef class SyclQueue(_SyclQueue): cdef DPCTLSyclEventRef ERef = NULL if isinstance(mem, _Memory): - ptr = (<_Memory>mem).memory_ptr + ptr = (<_Memory>mem).get_data_ptr() else: raise TypeError("Parameter `mem` should have type _Memory") @@ -1285,7 +1285,7 @@ cdef DPCTLSyclEventRef ERef = NULL if isinstance(mem, _Memory): - ptr = (<_Memory>mem).memory_ptr + ptr = (<_Memory>mem).get_data_ptr() else: raise TypeError("Parameter `mem` should have type _Memory") diff --git a/dpctl/apis/include/dpctl4pybind11.hpp b/dpctl/apis/include/dpctl4pybind11.hpp index 4e0cbe1986..8fb435b355 100644 --- a/dpctl/apis/include/dpctl4pybind11.hpp +++ b/dpctl/apis/include/dpctl4pybind11.hpp @@ -89,6 +89,7 @@ class dpctl_capi // memory DPCTLSyclUSMRef (*Memory_GetUsmPointer_)(Py_MemoryObject *); + void *(*Memory_GetOpaquePointer_)(Py_MemoryObject *); DPCTLSyclContextRef (*Memory_GetContextRef_)(Py_MemoryObject *); DPCTLSyclQueueRef (*Memory_GetQueueRef_)(Py_MemoryObject *); size_t (*Memory_GetNumBytes_)(Py_MemoryObject *); @@ -115,6 +116,7 @@ class dpctl_capi int (*UsmNDArray_GetFlags_)(PyUSMArrayObject *); DPCTLSyclQueueRef (*UsmNDArray_GetQueueRef_)(PyUSMArrayObject *); py::ssize_t 
(*UsmNDArray_GetOffset_)(PyUSMArrayObject *); + PyObject *(*UsmNDArray_GetUSMData_)(PyUSMArrayObject *); void (*UsmNDArray_SetWritableFlag_)(PyUSMArrayObject *, int); PyObject *(*UsmNDArray_MakeSimpleFromMemory_)(int, const py::ssize_t *, @@ -233,15 +235,16 @@ class dpctl_capi SyclContext_Make_(nullptr), SyclEvent_GetEventRef_(nullptr), SyclEvent_Make_(nullptr), SyclQueue_GetQueueRef_(nullptr), SyclQueue_Make_(nullptr), Memory_GetUsmPointer_(nullptr), - Memory_GetContextRef_(nullptr), Memory_GetQueueRef_(nullptr), - Memory_GetNumBytes_(nullptr), Memory_Make_(nullptr), - SyclKernel_GetKernelRef_(nullptr), SyclKernel_Make_(nullptr), - SyclProgram_GetKernelBundleRef_(nullptr), SyclProgram_Make_(nullptr), - UsmNDArray_GetData_(nullptr), UsmNDArray_GetNDim_(nullptr), - UsmNDArray_GetShape_(nullptr), UsmNDArray_GetStrides_(nullptr), - UsmNDArray_GetTypenum_(nullptr), UsmNDArray_GetElementSize_(nullptr), - UsmNDArray_GetFlags_(nullptr), UsmNDArray_GetQueueRef_(nullptr), - UsmNDArray_GetOffset_(nullptr), UsmNDArray_SetWritableFlag_(nullptr), + Memory_GetOpaquePointer_(nullptr), Memory_GetContextRef_(nullptr), + Memory_GetQueueRef_(nullptr), Memory_GetNumBytes_(nullptr), + Memory_Make_(nullptr), SyclKernel_GetKernelRef_(nullptr), + SyclKernel_Make_(nullptr), SyclProgram_GetKernelBundleRef_(nullptr), + SyclProgram_Make_(nullptr), UsmNDArray_GetData_(nullptr), + UsmNDArray_GetNDim_(nullptr), UsmNDArray_GetShape_(nullptr), + UsmNDArray_GetStrides_(nullptr), UsmNDArray_GetTypenum_(nullptr), + UsmNDArray_GetElementSize_(nullptr), UsmNDArray_GetFlags_(nullptr), + UsmNDArray_GetQueueRef_(nullptr), UsmNDArray_GetOffset_(nullptr), + UsmNDArray_GetUSMData_(nullptr), UsmNDArray_SetWritableFlag_(nullptr), UsmNDArray_MakeSimpleFromMemory_(nullptr), UsmNDArray_MakeSimpleFromPtr_(nullptr), UsmNDArray_MakeFromPtr_(nullptr), USM_ARRAY_C_CONTIGUOUS_(0), @@ -299,6 +302,7 @@ class dpctl_capi // dpctl.memory API this->Memory_GetUsmPointer_ = Memory_GetUsmPointer; + this->Memory_GetOpaquePointer_ = Memory_GetOpaquePointer; this->Memory_GetContextRef_ = Memory_GetContextRef; this->Memory_GetQueueRef_ = Memory_GetQueueRef; this->Memory_GetNumBytes_ = Memory_GetNumBytes; @@ -320,6 +324,7 @@ class dpctl_capi this->UsmNDArray_GetFlags_ = UsmNDArray_GetFlags; this->UsmNDArray_GetQueueRef_ = UsmNDArray_GetQueueRef; this->UsmNDArray_GetOffset_ = UsmNDArray_GetOffset; + this->UsmNDArray_GetUSMData_ = UsmNDArray_GetUSMData; this->UsmNDArray_SetWritableFlag_ = UsmNDArray_SetWritableFlag; this->UsmNDArray_MakeSimpleFromMemory_ = UsmNDArray_MakeSimpleFromMemory; @@ -779,6 +784,33 @@ class usm_memory : public py::object return api.Memory_GetNumBytes_(mem_obj); } + bool is_managed_by_smart_ptr() const + { + auto const &api = ::dpctl::detail::dpctl_capi::get(); + Py_MemoryObject *mem_obj = reinterpret_cast(m_ptr); + const void *opaque_ptr = api.Memory_GetOpaquePointer_(mem_obj); + + return bool(opaque_ptr); + } + + const std::shared_ptr &get_smart_ptr_owner() const + { + auto const &api = ::dpctl::detail::dpctl_capi::get(); + Py_MemoryObject *mem_obj = reinterpret_cast(m_ptr); + void *opaque_ptr = api.Memory_GetOpaquePointer_(mem_obj); + + if (opaque_ptr) { + auto shptr_ptr = + reinterpret_cast *>(opaque_ptr); + return *shptr_ptr; + } + else { + throw std::runtime_error( + "Memory object does not have smart pointer " + "managing lifetime of USM allocation"); + } + } + protected: static PyObject *as_usm_memory(PyObject *o) { @@ -1065,6 +1097,71 @@ class usm_ndarray : public py::object return static_cast(flags & api.USM_ARRAY_WRITABLE_); 
} + /*! @brief Get usm_data property of array */ + py::object get_usm_data() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + + auto const &api = ::dpctl::detail::dpctl_capi::get(); + // UsmNDArray_GetUSMData_ gives a new reference + PyObject *usm_data = api.UsmNDArray_GetUSMData_(raw_ar); + + // pass reference ownership to py::object + return py::reinterpret_steal(usm_data); + } + + bool is_managed_by_smart_ptr() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + + auto const &api = ::dpctl::detail::dpctl_capi::get(); + PyObject *usm_data = api.UsmNDArray_GetUSMData_(raw_ar); + + if (!PyObject_TypeCheck(usm_data, api.Py_MemoryType_)) { + Py_DECREF(usm_data); + return false; + } + + Py_MemoryObject *mem_obj = + reinterpret_cast(usm_data); + const void *opaque_ptr = api.Memory_GetOpaquePointer_(mem_obj); + + Py_DECREF(usm_data); + return bool(opaque_ptr); + } + + const std::shared_ptr &get_smart_ptr_owner() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + + auto const &api = ::dpctl::detail::dpctl_capi::get(); + + PyObject *usm_data = api.UsmNDArray_GetUSMData_(raw_ar); + + if (!PyObject_TypeCheck(usm_data, api.Py_MemoryType_)) { + Py_DECREF(usm_data); + throw std::runtime_error( + "usm_ndarray object does not have Memory object " + "managing lifetime of USM allocation"); + } + + Py_MemoryObject *mem_obj = + reinterpret_cast(usm_data); + void *opaque_ptr = api.Memory_GetOpaquePointer_(mem_obj); + Py_DECREF(usm_data); + + if (opaque_ptr) { + auto shptr_ptr = + reinterpret_cast *>(opaque_ptr); + return *shptr_ptr; + } + else { + throw std::runtime_error( + "Memory object underlying usm_ndarray does not have " + "smart pointer managing lifetime of USM allocation"); + } + } + private: PyUSMArrayObject *usm_array_ptr() const { @@ -1077,26 +1174,112 @@ class usm_ndarray : public py::object namespace utils { +namespace detail +{ + +struct ManagedMemory +{ + + static bool is_usm_managed_by_shared_ptr(const py::object &h) + { + if (py::isinstance(h)) { + const auto &usm_memory_inst = + py::cast(h); + return usm_memory_inst.is_managed_by_smart_ptr(); + } + else if (py::isinstance(h)) { + const auto &usm_array_inst = + py::cast(h); + return usm_array_inst.is_managed_by_smart_ptr(); + } + + return false; + } + + static const std::shared_ptr &extract_shared_ptr(const py::object &h) + { + if (py::isinstance(h)) { + const auto &usm_memory_inst = + py::cast(h); + return usm_memory_inst.get_smart_ptr_owner(); + } + else if (py::isinstance(h)) { + const auto &usm_array_inst = + py::cast(h); + return usm_array_inst.get_smart_ptr_owner(); + } + + throw std::runtime_error( + "Attempted extraction of shared_ptr on an unrecognized type"); + } +}; + +} // end of namespace detail + template sycl::event keep_args_alive(sycl::queue &q, const py::object (&py_objs)[num], const std::vector &depends = {}) { - sycl::event host_task_ev = q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - std::array, num> shp_arr; - for (std::size_t i = 0; i < num; ++i) { - shp_arr[i] = std::make_shared(py_objs[i]); - shp_arr[i]->inc_ref(); + std::size_t n_objects_held = 0; + std::array, num> shp_arr{}; + + std::size_t n_usm_owners_held = 0; + std::array, num> shp_usm{}; + + for (std::size_t i = 0; i < num; ++i) { + const auto &py_obj_i = py_objs[i]; + if (detail::ManagedMemory::is_usm_managed_by_shared_ptr(py_obj_i)) { + const auto &shp = + detail::ManagedMemory::extract_shared_ptr(py_obj_i); + shp_usm[n_usm_owners_held] = shp; + ++n_usm_owners_held; } - cgh.host_task([shp_arr = std::move(shp_arr)]() { 
- py::gil_scoped_acquire acquire; + else { + shp_arr[n_objects_held] = std::make_shared(py_obj_i); + shp_arr[n_objects_held]->inc_ref(); + ++n_objects_held; + } + } - for (std::size_t i = 0; i < num; ++i) { - shp_arr[i]->dec_ref(); + bool use_depends = true; + sycl::event host_task_ev; + + if (n_usm_owners_held > 0) { + host_task_ev = q.submit([&](sycl::handler &cgh) { + if (use_depends) { + cgh.depends_on(depends); + use_depends = false; } + else { + cgh.depends_on(host_task_ev); + } + cgh.host_task([shp_usm = std::move(shp_usm)]() { + // no body, but shared pointers are captured in + // the lambda, ensuring that USM allocation is + // kept alive + }); + }); + } + + if (n_objects_held > 0) { + host_task_ev = q.submit([&](sycl::handler &cgh) { + if (use_depends) { + cgh.depends_on(depends); + use_depends = false; + } + else { + cgh.depends_on(host_task_ev); + } + cgh.host_task([n_objects_held, shp_arr = std::move(shp_arr)]() { + py::gil_scoped_acquire acquire; + + for (std::size_t i = 0; i < n_objects_held; ++i) { + shp_arr[i]->dec_ref(); + } + }); }); - }); + } return host_task_ev; } diff --git a/dpctl/memory/CMakeLists.txt b/dpctl/memory/CMakeLists.txt index 68f57ce35c..cccc30b505 100644 --- a/dpctl/memory/CMakeLists.txt +++ b/dpctl/memory/CMakeLists.txt @@ -1,7 +1,7 @@ -file(GLOB _cython_sources *.pyx) -foreach(_cy_file ${_cython_sources}) - get_filename_component(_trgt ${_cy_file} NAME_WLE) - build_dpctl_ext(${_trgt} ${_cy_file} "dpctl/memory") - target_link_libraries(DpctlCAPI INTERFACE ${_trgt}_headers) -endforeach() +set(_cy_file ${CMAKE_CURRENT_SOURCE_DIR}/_memory.pyx) +get_filename_component(_trgt ${_cy_file} NAME_WLE) +build_dpctl_ext(${_trgt} ${_cy_file} "dpctl/memory" SYCL) +# _memory include _opaque_smart_ptr.hpp +target_include_directories(${_trgt} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) +target_link_libraries(DpctlCAPI INTERFACE ${_trgt}_headers) diff --git a/dpctl/memory/_memory.pxd b/dpctl/memory/_memory.pxd index db8c18bd9c..4b3c6bb188 100644 --- a/dpctl/memory/_memory.pxd +++ b/dpctl/memory/_memory.pxd @@ -33,7 +33,8 @@ cdef DPCTLSyclQueueRef get_queue_ref_from_ptr_and_syclobj( cdef public api class _Memory [object Py_MemoryObject, type Py_MemoryType]: - cdef DPCTLSyclUSMRef memory_ptr + cdef DPCTLSyclUSMRef _memory_ptr + cdef void* _opaque_ptr cdef Py_ssize_t nbytes cdef SyclQueue queue cdef object refobj @@ -50,6 +51,8 @@ cdef public api class _Memory [object Py_MemoryObject, type Py_MemoryType]: cpdef memset(self, unsigned short val=*) cpdef bytes tobytes(self) + cdef DPCTLSyclUSMRef get_data_ptr(self) + cdef void * get_opaque_ptr(self) @staticmethod cdef SyclDevice get_pointer_device( diff --git a/dpctl/memory/_memory.pyx b/dpctl/memory/_memory.pyx index df5d360e2a..fce65984e2 100644 --- a/dpctl/memory/_memory.pyx +++ b/dpctl/memory/_memory.pyx @@ -78,6 +78,12 @@ __all__ = [ include "_sycl_usm_array_interface_utils.pxi" +cdef extern from "_opaque_smart_ptr.hpp": + void * OpaqueSmartPtr_Make(void *, DPCTLSyclQueueRef) nogil + void * OpaqueSmartPtr_Copy(void *) nogil + void OpaqueSmartPtr_Delete(void *) nogil + void * OpaqueSmartPtr_Get(void *) nogil + class USMAllocationError(Exception): """ An exception raised when Universal Shared Memory (USM) allocation @@ -152,7 +158,8 @@ cdef class _Memory: MemoryUSMShared, MemoryUSMDevice, MemoryUSMHost """ cdef _cinit_empty(self): - self.memory_ptr = NULL + self._memory_ptr = NULL + self._opaque_ptr = NULL self.nbytes = 0 self.queue = None self.refobj = None @@ -198,7 +205,8 @@ cdef class _Memory: ) if (p): - self.memory_ptr = p + 
self._memory_ptr = p + self._opaque_ptr = OpaqueSmartPtr_Make(p, QRef) self.nbytes = nbytes self.queue = queue else: @@ -214,18 +222,22 @@ cdef class _Memory: cdef _Memory other_mem if isinstance(other, _Memory): other_mem = <_Memory> other - self.memory_ptr = other_mem.memory_ptr self.nbytes = other_mem.nbytes self.queue = other_mem.queue - if other_mem.refobj is None: - self.refobj = other + if other_mem._opaque_ptr is NULL: + self._memory_ptr = other_mem._memory_ptr + self._opaque_ptr = NULL + self.refobj = other.reference_obj else: - self.refobj = other_mem.refobj + self._memory_ptr = other_mem._memory_ptr + self._opaque_ptr = OpaqueSmartPtr_Copy(other_mem._opaque_ptr) + self.refobj = None elif hasattr(other, '__sycl_usm_array_interface__'): other_iface = other.__sycl_usm_array_interface__ if isinstance(other_iface, dict): other_buf = _USMBufferData.from_sycl_usm_ary_iface(other_iface) - self.memory_ptr = other_buf.p + self._opaque_ptr = NULL + self._memory_ptr = other_buf.p self.nbytes = other_buf.nbytes self.queue = other_buf.queue self.refobj = other @@ -241,23 +253,25 @@ cdef class _Memory: ) def __dealloc__(self): - if (self.refobj is None): - if self.memory_ptr: - if (type(self.queue) is SyclQueue): - DPCTLfree_with_queue( - self.memory_ptr, self.queue.get_queue_ref() - ) + if not (self._opaque_ptr is NULL): + OpaqueSmartPtr_Delete(self._opaque_ptr) self._cinit_empty() + cdef DPCTLSyclUSMRef get_data_ptr(self): + return self._memory_ptr + + cdef void* get_opaque_ptr(self): + return self._opaque_ptr + cdef _getbuffer(self, Py_buffer *buffer, int flags): # memory_ptr is Ref which is pointer to SYCL type. For USM it is void*. cdef SyclContext ctx = self._context cdef _usm_type UsmTy = DPCTLUSM_GetPointerType( - self.memory_ptr, ctx.get_context_ref() + self._memory_ptr, ctx.get_context_ref() ) if UsmTy == _usm_type._USM_DEVICE: raise ValueError("USM Device memory is not host accessible") - buffer.buf = self.memory_ptr + buffer.buf = self._memory_ptr buffer.format = 'B' # byte buffer.internal = NULL # see References buffer.itemsize = 1 @@ -285,7 +299,7 @@ cdef class _Memory: represented as Python integer. """ def __get__(self): - return (self.memory_ptr) + return (self._memory_ptr) property _context: """:class:`dpctl.SyclContext` the USM pointer is bound to. 
""" @@ -333,7 +347,7 @@ cdef class _Memory: .format( self.get_usm_type(), self.nbytes, - hex((self.memory_ptr)) + hex((self._memory_ptr)) ) ) @@ -377,7 +391,7 @@ cdef class _Memory: """ def __get__(self): cdef dict iface = { - "data": ((self.memory_ptr), + "data": ((self._memory_ptr), True), # bool(self.writable)), "shape": (self.nbytes,), "strides": None, @@ -402,18 +416,18 @@ cdef class _Memory: if syclobj is None: ctx = self._context return _Memory.get_pointer_type( - self.memory_ptr, ctx + self._memory_ptr, ctx ).decode("UTF-8") elif isinstance(syclobj, SyclContext): ctx = (syclobj) return _Memory.get_pointer_type( - self.memory_ptr, ctx + self._memory_ptr, ctx ).decode("UTF-8") elif isinstance(syclobj, SyclQueue): q = (syclobj) ctx = q.get_sycl_context() return _Memory.get_pointer_type( - self.memory_ptr, ctx + self._memory_ptr, ctx ).decode("UTF-8") raise TypeError( "syclobj keyword can be either None, or an instance of " @@ -435,18 +449,18 @@ cdef class _Memory: if syclobj is None: ctx = self._context return _Memory.get_pointer_type_enum( - self.memory_ptr, ctx + self._memory_ptr, ctx ) elif isinstance(syclobj, SyclContext): ctx = (syclobj) return _Memory.get_pointer_type_enum( - self.memory_ptr, ctx + self._memory_ptr, ctx ) elif isinstance(syclobj, SyclQueue): q = (syclobj) ctx = q.get_sycl_context() return _Memory.get_pointer_type_enum( - self.memory_ptr, ctx + self._memory_ptr, ctx ) raise TypeError( "syclobj keyword can be either None, or an instance of " @@ -475,8 +489,8 @@ cdef class _Memory: # call kernel to copy from ERef = DPCTLQueue_Memcpy( self.queue.get_queue_ref(), - &host_buf[0], # destination - self.memory_ptr, # source + &host_buf[0], # destination + self._memory_ptr, # source self.nbytes ) with nogil: DPCTLEvent_Wait(ERef) @@ -500,8 +514,8 @@ cdef class _Memory: # call kernel to copy from ERef = DPCTLQueue_Memcpy( self.queue.get_queue_ref(), - self.memory_ptr, # destination - &host_buf[0], # source + self._memory_ptr, # destination + &host_buf[0], # source buf_len ) with nogil: DPCTLEvent_Wait(ERef) @@ -542,7 +556,7 @@ cdef class _Memory: if (same_contexts): ERef = DPCTLQueue_Memcpy( this_queue.get_queue_ref(), - self.memory_ptr, + self._memory_ptr, src_buf.p, src_buf.nbytes ) @@ -550,8 +564,8 @@ cdef class _Memory: DPCTLEvent_Delete(ERef) else: copy_via_host( - self.memory_ptr, this_queue, # dest - src_buf.p, src_queue, # src + self._memory_ptr, this_queue, # dest + src_buf.p, src_queue, # src src_buf.nbytes ) else: @@ -565,7 +579,7 @@ cdef class _Memory: ERef = DPCTLQueue_Memset( self.queue.get_queue_ref(), - self.memory_ptr, # destination + self._memory_ptr, # destination val, self.nbytes) @@ -703,20 +717,29 @@ cdef class _Memory: res = _Memory.__new__(_Memory) _mem = <_Memory> res _mem._cinit_empty() - _mem.memory_ptr = USMRef _mem.nbytes = nbytes QRef_copy = DPCTLQueue_Copy(QRef) if QRef_copy is NULL: raise ValueError("Referenced queue could not be copied.") try: - _mem.queue = SyclQueue._create(QRef_copy) # consumes the copy + # _create steals ownership of QRef_copy + _mem.queue = SyclQueue._create(QRef_copy) except dpctl.SyclQueueCreationError as sqce: raise ValueError( "SyclQueue object could not be created from " "copy of referenced queue" ) from sqce - _mem.refobj = memory_owner - return mem_ty(res) + if memory_owner is None: + _mem._memory_ptr = USMRef + # assume ownership of USM allocation via smart pointer + _mem._opaque_ptr = OpaqueSmartPtr_Make(USMRef, QRef) + _mem.refobj = None + else: + _mem._memory_ptr = USMRef + _mem._opaque_ptr = NULL + 
_mem.refobj = memory_owner + _out = mem_ty(_mem) + return _out cdef class MemoryUSMShared(_Memory): @@ -908,10 +931,13 @@ def as_usm_memory(obj): format(obj) ) +cdef api void * Memory_GetOpaquePointer(_Memory obj): + "Opaque pointer value" + return obj.get_opaque_ptr() cdef api DPCTLSyclUSMRef Memory_GetUsmPointer(_Memory obj): "Pointer of USM allocation" - return obj.memory_ptr + return obj.get_data_ptr() cdef api DPCTLSyclContextRef Memory_GetContextRef(_Memory obj): "Context reference to which USM allocation is bound" diff --git a/dpctl/memory/_opaque_smart_ptr.hpp b/dpctl/memory/_opaque_smart_ptr.hpp new file mode 100644 index 0000000000..a9eb1eafb7 --- /dev/null +++ b/dpctl/memory/_opaque_smart_ptr.hpp @@ -0,0 +1,115 @@ +//===--- _opaque_smart_ptr.hpp --------===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file implements working with shared_ptr with USM deleter +/// disguised as an opaque pointer. +/// +//===----------------------------------------------------------------------===// + +#pragma once + +#ifndef __cplusplus +#error "C++ is required to compile this file" +#endif + +#include "syclinterface/dpctl_sycl_type_casters.hpp" +#include "syclinterface/dpctl_sycl_types.h" +#include <sycl/sycl.hpp> +#include <memory> + +#include <exception> +#include <iostream> + +namespace +{ + +class USMDeleter +{ +public: + USMDeleter() = delete; + USMDeleter(const USMDeleter &) = default; + USMDeleter(USMDeleter &&) = default; + USMDeleter(const ::sycl::queue &queue) : _context(queue.get_context()) {} + USMDeleter(const ::sycl::context &context) : _context(context) {} + template <typename T> void operator()(T *ptr) const + { + try { + ::sycl::free(ptr, _context); + } catch (const std::exception &e) { + std::cout << "Call to sycl::free caught an exception: " << e.what() + << std::endl; + // std::terminate(); + } + } + +private: + ::sycl::context _context; +}; + +} // end of anonymous namespace + +void *OpaqueSmartPtr_Make(void *usm_ptr, const sycl::queue &q) +{ + USMDeleter _deleter(q); + auto sptr = new std::shared_ptr<void>(usm_ptr, std::move(_deleter)); + + return reinterpret_cast<void *>(sptr); +} + +void *OpaqueSmartPtr_Make(void *usm_ptr, DPCTLSyclQueueRef QRef) +{ + sycl::queue *q_ptr = dpctl::syclinterface::unwrap<sycl::queue>(QRef); + + // make a copy of queue + sycl::queue q{*q_ptr}; + + void *res = OpaqueSmartPtr_Make(usm_ptr, q); + + return res; +} + +void OpaqueSmartPtr_Delete(void *opaque_ptr) +{ + auto sptr = reinterpret_cast<std::shared_ptr<void> *>(opaque_ptr); + + delete sptr; +} + +void *OpaqueSmartPtr_Copy(void *opaque_ptr) +{ + auto sptr = reinterpret_cast<std::shared_ptr<void> *>(opaque_ptr); + auto copied_sptr = new std::shared_ptr<void>(*sptr); + + return reinterpret_cast<void *>(copied_sptr); +} + +long OpaqueSmartPtr_UseCount(void *opaque_ptr) +{ + auto sptr = reinterpret_cast<std::shared_ptr<void> *>(opaque_ptr); + return sptr->use_count(); +} + +void *OpaqueSmartPtr_Get(void *opaque_ptr) +{ + auto sptr = reinterpret_cast<std::shared_ptr<void> 
*>(opaque_ptr); + + return sptr->get(); +} diff --git a/dpctl/sycl.pxd b/dpctl/sycl.pxd index f1ffe9cf48..45c6d652b2 100644 --- a/dpctl/sycl.pxd +++ b/dpctl/sycl.pxd @@ -22,6 +22,7 @@ from . cimport _backend as dpctl_backend cdef extern from "sycl/sycl.hpp" namespace "sycl": cdef cppclass queue "sycl::queue": + void wait() nogil pass cdef cppclass device "sycl::device": @@ -31,6 +32,7 @@ cdef extern from "sycl/sycl.hpp" namespace "sycl": pass cdef cppclass event "sycl::event": + void wait() nogil pass cdef cppclass kernel "sycl::kernel": diff --git a/dpctl/tensor/_accumulation.py b/dpctl/tensor/_accumulation.py index 64ab2ea8c8..4605b10b63 100644 --- a/dpctl/tensor/_accumulation.py +++ b/dpctl/tensor/_accumulation.py @@ -25,7 +25,7 @@ _default_accumulation_dtype_fp_types, _to_device_supported_dtype, ) -from dpctl.utils import ExecutionPlacementError +from dpctl.utils import ExecutionPlacementError, SequentialOrderManager def _accumulate_common( @@ -125,7 +125,9 @@ def _accumulate_common( if a1 != nd: out = dpt.permute_dims(out, perm) - host_tasks_list = [] + final_ev = dpctl.SyclEvent() + _manager = SequentialOrderManager[q] + depends = _manager.submitted_events if implemented_types: if not include_initial: ht_e, acc_ev = _accumulate_fn( @@ -133,32 +135,32 @@ def _accumulate_common( trailing_dims_to_accumulate=1, dst=out, sycl_queue=q, + depends=depends, ) else: ht_e, acc_ev = _accumulate_include_initial_fn( - src=arr, - dst=out, - sycl_queue=q, + src=arr, dst=out, sycl_queue=q, depends=depends ) - host_tasks_list.append(ht_e) + _manager.add_event_pair(ht_e, acc_ev) if not (orig_out is None or out is orig_out): # Copy the out data from temporary buffer to original memory - ht_e_cpy, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_e_cpy, acc_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=q, depends=[acc_ev] ) - host_tasks_list.append(ht_e_cpy) + _manager.add_event_pair(ht_e_cpy, acc_ev) out = orig_out + final_ev = acc_ev else: if _dtype_supported(res_dt, res_dt): tmp = dpt.empty( arr.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q ) ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( - src=arr, dst=tmp, sycl_queue=q + src=arr, dst=tmp, sycl_queue=q, depends=depends ) - host_tasks_list.append(ht_e_cpy) + _manager.add_event_pair(ht_e_cpy, cpy_e) if not include_initial: - ht_e, acc_ev = _accumulate_fn( + ht_e, final_ev = _accumulate_fn( src=tmp, trailing_dims_to_accumulate=1, dst=out, @@ -166,26 +168,27 @@ def _accumulate_common( depends=[cpy_e], ) else: - ht_e, acc_ev = _accumulate_include_initial_fn( + ht_e, final_ev = _accumulate_include_initial_fn( src=tmp, dst=out, sycl_queue=q, depends=[cpy_e], ) + _manager.add_event_pair(ht_e, final_ev) else: buf_dt = _default_accumulation_type_fn(inp_dt, q) tmp = dpt.empty( arr.shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q ) ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( - src=arr, dst=tmp, sycl_queue=q + src=arr, dst=tmp, sycl_queue=q, depends=depends ) + _manager.add_event_pair(ht_e_cpy, cpy_e) tmp_res = dpt.empty( res_sh, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q ) if a1 != nd: tmp_res = dpt.permute_dims(tmp_res, perm) - host_tasks_list.append(ht_e_cpy) if not include_initial: ht_e, a_e = _accumulate_fn( src=tmp, @@ -201,18 +204,17 @@ def _accumulate_common( sycl_queue=q, depends=[cpy_e], ) - host_tasks_list.append(ht_e) - ht_e_cpy2, _ = ti._copy_usm_ndarray_into_usm_ndarray( + _manager.add_event_pair(ht_e, a_e) + ht_e_cpy2, final_ev = ti._copy_usm_ndarray_into_usm_ndarray( 
src=tmp_res, dst=out, sycl_queue=q, depends=[a_e] ) - host_tasks_list.append(ht_e_cpy2) + _manager.add_event_pair(ht_e_cpy2, final_ev) if appended_axis: out = dpt.squeeze(out) if a1 != nd: inv_perm = sorted(range(nd), key=lambda d: perm[d]) out = dpt.permute_dims(out, inv_perm) - dpctl.SyclEvent.wait_for(host_tasks_list) return out diff --git a/dpctl/tensor/_clip.py b/dpctl/tensor/_clip.py index dc54e46bb0..c6e86ce6d0 100644 --- a/dpctl/tensor/_clip.py +++ b/dpctl/tensor/_clip.py @@ -31,7 +31,7 @@ ) from dpctl.tensor._manipulation_functions import _broadcast_shape_impl from dpctl.tensor._type_utils import _can_cast, _to_device_supported_dtype -from dpctl.utils import ExecutionPlacementError +from dpctl.utils import ExecutionPlacementError, SequentialOrderManager from ._type_utils import ( WeakComplexType, @@ -299,18 +299,21 @@ def _clip_none(x, val, out, order, _binary_fn): x = dpt.broadcast_to(x, res_shape) if val_ary.shape != res_shape: val_ary = dpt.broadcast_to(val_ary, res_shape) + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events ht_binary_ev, binary_ev = _binary_fn( - src1=x, src2=val_ary, dst=out, sycl_queue=exec_q + src1=x, src2=val_ary, dst=out, sycl_queue=exec_q, depends=dep_evs ) + _manager.add_event_pair(ht_binary_ev, binary_ev) if not (orig_out is None or orig_out is out): # Copy the out data from temporary buffer to original memory - ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_copy_out_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=exec_q, depends=[binary_ev], ) - ht_copy_out_ev.wait() + _manager.add_event_pair(ht_copy_out_ev, copy_ev) out = orig_out ht_binary_ev.wait() return out @@ -319,9 +322,12 @@ def _clip_none(x, val, out, order, _binary_fn): buf = _empty_like_orderK(val_ary, res_dt) else: buf = dpt.empty_like(val_ary, dtype=res_dt, order=order) + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=val_ary, dst=buf, sycl_queue=exec_q + src=val_ary, dst=buf, sycl_queue=exec_q, depends=dep_evs ) + _manager.add_event_pair(ht_copy_ev, copy_ev) if out is None: if order == "K": out = _empty_like_pair_orderK( @@ -346,18 +352,17 @@ def _clip_none(x, val, out, order, _binary_fn): sycl_queue=exec_q, depends=[copy_ev], ) + _manager.add_event_pair(ht_binary_ev, binary_ev) if not (orig_out is None or orig_out is out): # Copy the out data from temporary buffer to original memory - ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=exec_q, depends=[binary_ev], ) - ht_copy_out_ev.wait() + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) out = orig_out - ht_copy_ev.wait() - ht_binary_ev.wait() return out @@ -444,20 +449,22 @@ def clip(x, /, min=None, max=None, out=None, order="K"): else: out = dpt.empty_like(x, order=order) + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=x, dst=out, sycl_queue=exec_q + src=x, dst=out, sycl_queue=exec_q, depends=dep_evs ) + _manager.add_event_pair(ht_copy_ev, copy_ev) if not (orig_out is None or orig_out is out): # Copy the out data from temporary buffer to original memory - ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=exec_q, depends=[copy_ev], ) - 
ht_copy_out_ev.wait() + _manager.add_event_pair(ht_copy_ev, cpy_ev) out = orig_out - ht_copy_ev.wait() return out elif max is None: return _clip_none(x, min, out, order, tei._maximum) @@ -665,20 +672,27 @@ def clip(x, /, min=None, max=None, out=None, order="K"): a_min = dpt.broadcast_to(a_min, res_shape) if a_max.shape != res_shape: a_max = dpt.broadcast_to(a_max, res_shape) + _manager = SequentialOrderManager[exec_q] + dep_ev = _manager.submitted_events ht_binary_ev, binary_ev = ti._clip( - src=x, min=a_min, max=a_max, dst=out, sycl_queue=exec_q + src=x, + min=a_min, + max=a_max, + dst=out, + sycl_queue=exec_q, + depends=dep_ev, ) + _manager.add_event_pair(ht_binary_ev, binary_ev) if not (orig_out is None or orig_out is out): # Copy the out data from temporary buffer to original memory - ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=exec_q, depends=[binary_ev], ) - ht_copy_out_ev.wait() + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) out = orig_out - ht_binary_ev.wait() return out elif buf1_dt is None: @@ -686,9 +700,12 @@ def clip(x, /, min=None, max=None, out=None, order="K"): buf2 = _empty_like_orderK(a_max, buf2_dt) else: buf2 = dpt.empty_like(a_max, dtype=buf2_dt, order=order) + _manager = SequentialOrderManager[exec_q] + dep_ev = _manager.submitted_events ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=a_max, dst=buf2, sycl_queue=exec_q + src=a_max, dst=buf2, sycl_queue=exec_q, depends=dep_ev ) + _manager.add_event_pair(ht_copy_ev, copy_ev) if out is None: if order == "K": out = _empty_like_triple_orderK( @@ -721,18 +738,17 @@ def clip(x, /, min=None, max=None, out=None, order="K"): sycl_queue=exec_q, depends=[copy_ev], ) + _manager.add_event_pair(ht_binary_ev, binary_ev) if not (orig_out is None or orig_out is out): # Copy the out data from temporary buffer to original memory - ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=exec_q, depends=[binary_ev], ) - ht_copy_out_ev.wait() + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) out = orig_out - ht_copy_ev.wait() - ht_binary_ev.wait() return out elif buf2_dt is None: @@ -740,9 +756,12 @@ def clip(x, /, min=None, max=None, out=None, order="K"): buf1 = _empty_like_orderK(a_min, buf1_dt) else: buf1 = dpt.empty_like(a_min, dtype=buf1_dt, order=order) + _manager = SequentialOrderManager[exec_q] + dep_ev = _manager.submitted_events ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=a_min, dst=buf1, sycl_queue=exec_q + src=a_min, dst=buf1, sycl_queue=exec_q, depends=dep_ev ) + _manager.add_event_pair(ht_copy_ev, copy_ev) if out is None: if order == "K": out = _empty_like_triple_orderK( @@ -775,18 +794,17 @@ def clip(x, /, min=None, max=None, out=None, order="K"): sycl_queue=exec_q, depends=[copy_ev], ) + _manager.add_event_pair(ht_binary_ev, binary_ev) if not (orig_out is None or orig_out is out): # Copy the out data from temporary buffer to original memory - ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=exec_q, depends=[binary_ev], ) - ht_copy_out_ev.wait() + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) out = orig_out - ht_copy_ev.wait() - ht_binary_ev.wait() return out if order == "K": @@ -806,16 +824,21 @@ def clip(x, /, min=None, max=None, out=None, order="K"): buf1 
= _empty_like_orderK(a_min, buf1_dt) else: buf1 = dpt.empty_like(a_min, dtype=buf1_dt, order=order) + + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=a_min, dst=buf1, sycl_queue=exec_q + src=a_min, dst=buf1, sycl_queue=exec_q, depends=dep_evs ) + _manager.add_event_pair(ht_copy1_ev, copy1_ev) if order == "K": buf2 = _empty_like_orderK(a_max, buf2_dt) else: buf2 = dpt.empty_like(a_max, dtype=buf2_dt, order=order) ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=a_max, dst=buf2, sycl_queue=exec_q + src=a_max, dst=buf2, sycl_queue=exec_q, depends=dep_evs ) + _manager.add_event_pair(ht_copy2_ev, copy2_ev) if out is None: if order == "K": out = _empty_like_triple_orderK( @@ -833,7 +856,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"): x = dpt.broadcast_to(x, res_shape) buf1 = dpt.broadcast_to(buf1, res_shape) buf2 = dpt.broadcast_to(buf2, res_shape) - ht_, _ = ti._clip( + ht_, clip_ev = ti._clip( src=x, min=buf1, max=buf2, @@ -841,5 +864,5 @@ def clip(x, /, min=None, max=None, out=None, order="K"): sycl_queue=exec_q, depends=[copy1_ev, copy2_ev], ) - dpctl.SyclEvent.wait_for([ht_copy1_ev, ht_copy2_ev, ht_]) + _manager.add_event_pair(ht_, clip_ev) return out diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py index bfd182e778..d8e15846eb 100644 --- a/dpctl/tensor/_copy_utils.py +++ b/dpctl/tensor/_copy_utils.py @@ -33,19 +33,22 @@ ":class:`dpctl.tensor.usm_ndarray`." ) -int32_t_max = 2147483648 +int32_t_max = 1 + np.iinfo(np.int32).max def _copy_to_numpy(ary): if not isinstance(ary, dpt.usm_ndarray): raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(ary)}") nb = ary.usm_data.nbytes - hh = dpm.MemoryUSMHost(nb, queue=ary.sycl_queue) - hh.copy_from_device(ary.usm_data) + q = ary.sycl_queue + hh = dpm.MemoryUSMHost(nb, queue=q) h = np.ndarray(nb, dtype="u1", buffer=hh).view(ary.dtype) itsz = ary.itemsize strides_bytes = tuple(si * itsz for si in ary.strides) - offset = ary.__sycl_usm_array_interface__.get("offset", 0) * itsz + offset = ary._element_offset * itsz + # ensure that content of ary.usm_data is final + q.wait() + hh.copy_from_device(ary.usm_data) return np.ndarray( ary.shape, dtype=ary.dtype, @@ -103,8 +106,11 @@ def _copy_from_numpy_into(dst, np_ary): src_ary = src_ary.astype(np.float32) elif src_ary_dt_c == "D": src_ary = src_ary.astype(np.complex64) + _manager = dpctl.utils.SequentialOrderManager[copy_q] + dep_ev = _manager.submitted_events + # synchronizing call ti._copy_numpy_ndarray_into_usm_ndarray( - src=src_ary, dst=dst, sycl_queue=copy_q + src=src_ary, dst=dst, sycl_queue=copy_q, depends=dep_ev ) @@ -203,14 +209,16 @@ def _copy_overlapping(dst, src): order="C", buffer_ctor_kwargs={"queue": q}, ) + _manager = dpctl.utils.SequentialOrderManager[q] + dep_evs = _manager.submitted_events hcp1, cp1 = ti._copy_usm_ndarray_into_usm_ndarray( - src=src, dst=tmp, sycl_queue=q + src=src, dst=tmp, sycl_queue=q, depends=dep_evs ) - hcp2, _ = ti._copy_usm_ndarray_into_usm_ndarray( + _manager.add_event_pair(hcp1, cp1) + hcp2, cp2 = ti._copy_usm_ndarray_into_usm_ndarray( src=tmp, dst=dst, sycl_queue=q, depends=[cp1] ) - hcp2.wait() - hcp1.wait() + _manager.add_event_pair(hcp2, cp2) def _copy_same_shape(dst, src): @@ -225,10 +233,13 @@ def _copy_same_shape(dst, src): _copy_overlapping(src=src, dst=dst) return - hev, _ = ti._copy_usm_ndarray_into_usm_ndarray( - src=src, dst=dst, sycl_queue=dst.sycl_queue + copy_q = dst.sycl_queue + 
_manager = dpctl.utils.SequentialOrderManager[copy_q] + dep_evs = _manager.submitted_events + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=src, dst=dst, sycl_queue=copy_q, depends=dep_evs ) - hev.wait() + _manager.add_event_pair(hev, cpy_ev) if hasattr(np, "broadcast_shapes"): @@ -634,7 +645,7 @@ def astype( target_dtype, d.has_aspect_fp16, d.has_aspect_fp64 ): raise ValueError( - f"Requested dtype `{target_dtype}` is not supported by the " + f"Requested dtype '{target_dtype}' is not supported by the " "target device" ) usm_ary = usm_ary.to_device(device) @@ -715,22 +726,27 @@ def _extract_impl(ary, ary_mask, axis=0): cumsum_dt = dpt.int32 if mask_nelems < int32_t_max else dpt.int64 cumsum = dpt.empty(mask_nelems, dtype=cumsum_dt, device=ary_mask.device) exec_q = cumsum.sycl_queue - mask_count = ti.mask_positions(ary_mask, cumsum, sycl_queue=exec_q) + _manager = dpctl.utils.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + mask_count = ti.mask_positions( + ary_mask, cumsum, sycl_queue=exec_q, depends=dep_evs + ) dst_shape = ary.shape[:pp] + (mask_count,) + ary.shape[pp + mask_nd :] dst = dpt.empty( dst_shape, dtype=ary.dtype, usm_type=ary.usm_type, device=ary.device ) if dst.size == 0: return dst - hev, _ = ti._extract( + hev, ev = ti._extract( src=ary, cumsum=cumsum, axis_start=pp, axis_end=pp + mask_nd, dst=dst, sycl_queue=exec_q, + depends=dep_evs, ) - hev.wait() + _manager.add_event_pair(hev, ev) return dst @@ -746,7 +762,11 @@ def _nonzero_impl(ary): cumsum = dpt.empty( mask_nelems, dtype=cumsum_dt, sycl_queue=exec_q, order="C" ) - mask_count = ti.mask_positions(ary, cumsum, sycl_queue=exec_q) + _manager = dpctl.utils.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + mask_count = ti.mask_positions( + ary, cumsum, sycl_queue=exec_q, depends=dep_evs + ) indexes_dt = ti.default_device_index_type(exec_q.sycl_device) indexes = dpt.empty( (ary.ndim, mask_count), @@ -755,9 +775,9 @@ def _nonzero_impl(ary): sycl_queue=exec_q, order="C", ) - hev, _ = ti._nonzero(cumsum, indexes, ary.shape, exec_q) + hev, nz_ev = ti._nonzero(cumsum, indexes, ary.shape, exec_q) res = tuple(indexes[i, :] for i in range(ary.ndim)) - hev.wait() + _manager.add_event_pair(hev, nz_ev) return res @@ -819,10 +839,18 @@ def _take_multi_index(ary, inds, p): res = dpt.empty( res_shape, dtype=ary.dtype, usm_type=res_usm_type, sycl_queue=exec_q ) - hev, _ = ti._take( - src=ary, ind=inds, dst=res, axis_start=p, mode=0, sycl_queue=exec_q + _manager = dpctl.utils.SequentialOrderManager[exec_q] + dep_ev = _manager.submitted_events + hev, take_ev = ti._take( + src=ary, + ind=inds, + dst=res, + axis_start=p, + mode=0, + sycl_queue=exec_q, + depends=dep_ev, ) - hev.wait() + _manager.add_event_pair(hev, take_ev) return res @@ -864,7 +892,11 @@ def _place_impl(ary, ary_mask, vals, axis=0): cumsum_dt = dpt.int32 if mask_nelems < int32_t_max else dpt.int64 cumsum = dpt.empty(mask_nelems, dtype=cumsum_dt, device=ary_mask.device) exec_q = cumsum.sycl_queue - mask_count = ti.mask_positions(ary_mask, cumsum, sycl_queue=exec_q) + _manager = dpctl.utils.SequentialOrderManager[exec_q] + dep_ev = _manager.submitted_events + mask_count = ti.mask_positions( + ary_mask, cumsum, sycl_queue=exec_q, depends=dep_ev + ) expected_vals_shape = ( ary.shape[:pp] + (mask_count,) + ary.shape[pp + mask_nd :] ) @@ -873,15 +905,17 @@ def _place_impl(ary, ary_mask, vals, axis=0): else: rhs = dpt.astype(vals, ary.dtype) rhs = dpt.broadcast_to(rhs, expected_vals_shape) - hev, _ = ti._place( + dep_ev = 
_manager.submitted_events + hev, pl_ev = ti._place( dst=ary, cumsum=cumsum, axis_start=pp, axis_end=pp + mask_nd, rhs=rhs, sycl_queue=exec_q, + depends=dep_ev, ) - hev.wait() + _manager.add_event_pair(hev, pl_ev) return @@ -958,8 +992,16 @@ def _put_multi_index(ary, inds, p, vals): else: rhs = dpt.astype(vals, ary.dtype) rhs = dpt.broadcast_to(rhs, expected_vals_shape) - hev, _ = ti._put( - dst=ary, ind=inds, val=rhs, axis_start=p, mode=0, sycl_queue=exec_q + _manager = dpctl.utils.SequentialOrderManager[exec_q] + dep_ev = _manager.submitted_events + hev, put_ev = ti._put( + dst=ary, + ind=inds, + val=rhs, + axis_start=p, + mode=0, + sycl_queue=exec_q, + depends=dep_ev, ) - hev.wait() + _manager.add_event_pair(hev, put_ev) return diff --git a/dpctl/tensor/_ctors.py b/dpctl/tensor/_ctors.py index e61a73c7f3..fb400178f9 100644 --- a/dpctl/tensor/_ctors.py +++ b/dpctl/tensor/_ctors.py @@ -159,10 +159,12 @@ def _asarray_from_usm_ndarray( ) eq = dpctl.utils.get_execution_queue([usm_ndary.sycl_queue, copy_q]) if eq is not None: - hev, _ = ti._copy_usm_ndarray_into_usm_ndarray( - src=usm_ndary, dst=res, sycl_queue=eq + _manager = dpctl.utils.SequentialOrderManager[eq] + dep_evs = _manager.submitted_events + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=usm_ndary, dst=res, sycl_queue=eq, depends=dep_evs ) - hev.wait() + _manager.add_event_pair(hev, cpy_ev) else: tmp = dpt.asnumpy(usm_ndary) res[...] = tmp @@ -311,25 +313,27 @@ def _usm_types_walker(o, usm_types_list): raise TypeError -def _device_copy_walker(seq_o, res, events): +def _device_copy_walker(seq_o, res, _manager): if isinstance(seq_o, dpt.usm_ndarray): exec_q = res.sycl_queue - ht_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( - src=seq_o, dst=res, sycl_queue=exec_q + deps = _manager.submitted_events + ht_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=seq_o, dst=res, sycl_queue=exec_q, depends=deps ) - events.append(ht_ev) + _manager.add_event_pair(ht_ev, cpy_ev) return if hasattr(seq_o, "__sycl_usm_array_interface__"): usm_ar = _usm_ndarray_from_suai(seq_o) exec_q = res.sycl_queue - ht_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( - src=usm_ar, dst=res, sycl_queue=exec_q + deps = _manager.submitted_events + ht_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=usm_ar, dst=res, sycl_queue=exec_q, depends=deps ) - events.append(ht_ev) + _manager.add_event_pair(ht_ev, cpy_ev) return if isinstance(seq_o, (list, tuple)): for i, el in enumerate(seq_o): - _device_copy_walker(el, res[i], events) + _device_copy_walker(el, res[i], _manager) return raise TypeError @@ -411,9 +415,8 @@ def _asarray_from_seq( sycl_queue=alloc_q, order=order, ) - ht_events = [] - _device_copy_walker(seq_obj, res, ht_events) - dpctl.SyclEvent.wait_for(ht_events) + _manager = dpctl.utils.SequentialOrderManager[exec_q] + _device_copy_walker(seq_obj, res, _manager) return res else: res = dpt.empty( @@ -851,8 +854,10 @@ def arange( else: _step = sc_ty(1) _start = _first - hev, _ = ti._linspace_step(_start, _step, res, sycl_queue) - hev.wait() + _manager = dpctl.utils.SequentialOrderManager[sycl_queue] + # populating newly allocated array, no task dependencies + hev, lin_ev = ti._linspace_step(_start, _step, res, sycl_queue) + _manager.add_event_pair(hev, lin_ev) if is_bool: res_out = dpt.usm_ndarray( (sh,), @@ -861,8 +866,11 @@ def arange( order="C", buffer_ctor_kwargs={"queue": sycl_queue}, ) - res_out[:] = res - res = res_out + hev_cpy, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=res, dst=res_out, sycl_queue=sycl_queue, 
depends=[lin_ev] + ) + _manager.add_event_pair(hev_cpy, cpy_ev) + return res_out return res @@ -927,6 +935,7 @@ def zeros( order=order, buffer_ctor_kwargs={"queue": sycl_queue}, ) + # FIXME: replace with asynchronous call to ti res.usm_data.memset() return res @@ -992,8 +1001,10 @@ def ones( order=order, buffer_ctor_kwargs={"queue": sycl_queue}, ) - hev, _ = ti._full_usm_ndarray(1, res, sycl_queue) - hev.wait() + _manager = dpctl.utils.SequentialOrderManager[sycl_queue] + # populating new allocation, no dependent events + hev, full_ev = ti._full_usm_ndarray(1, res, sycl_queue) + _manager.add_event_pair(hev, full_ev) return res @@ -1089,8 +1100,10 @@ def full( elif fill_value_type is int and np.issubdtype(dtype, np.integer): fill_value = _to_scalar(fill_value, dtype) - hev, _ = ti._full_usm_ndarray(fill_value, res, sycl_queue) - hev.wait() + _manager = dpctl.utils.SequentialOrderManager[sycl_queue] + # populating new allocation, no dependent events + hev, full_ev = ti._full_usm_ndarray(fill_value, res, sycl_queue) + _manager.add_event_pair(hev, full_ev) return res @@ -1467,10 +1480,11 @@ def linspace( start = float(start) stop = float(stop) res = dpt.empty(num, dtype=dt, usm_type=usm_type, sycl_queue=sycl_queue) - hev, _ = ti._linspace_affine( + _manager = dpctl.utils.SequentialOrderManager[sycl_queue] + hev, la_ev = ti._linspace_affine( start, stop, dst=res, include_endpoint=endpoint, sycl_queue=sycl_queue ) - hev.wait() + _manager.add_event_pair(hev, la_ev) return res if int_dt is None else dpt.astype(res, int_dt) @@ -1564,8 +1578,9 @@ def eye( buffer_ctor_kwargs={"queue": sycl_queue}, ) if n_rows != 0 and n_cols != 0: - hev, _ = ti._eye(k, dst=res, sycl_queue=sycl_queue) - hev.wait() + _manager = dpctl.utils.SequentialOrderManager[sycl_queue] + hev, eye_ev = ti._eye(k, dst=res, sycl_queue=sycl_queue) + _manager.add_event_pair(hev, eye_ev) return res @@ -1615,10 +1630,12 @@ def tril(x, /, *, k=0): usm_type=x.usm_type, sycl_queue=q, ) - hev, _ = ti._copy_usm_ndarray_into_usm_ndarray( - src=x, dst=res, sycl_queue=q + _manager = dpctl.utils.SequentialOrderManager[q] + dep_evs = _manager.submitted_events + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x, dst=res, sycl_queue=q, depends=dep_evs ) - hev.wait() + _manager.add_event_pair(hev, cpy_ev) elif k < -shape[nd - 2]: res = dpt.zeros( x.shape, @@ -1635,8 +1652,12 @@ def tril(x, /, *, k=0): usm_type=x.usm_type, sycl_queue=q, ) - hev, _ = ti._tril(src=x, dst=res, k=k, sycl_queue=q) - hev.wait() + _manager = dpctl.utils.SequentialOrderManager[q] + dep_evs = _manager.submitted_events + hev, tril_ev = ti._tril( + src=x, dst=res, k=k, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(hev, tril_ev) return res @@ -1695,10 +1716,12 @@ def triu(x, /, *, k=0): usm_type=x.usm_type, sycl_queue=q, ) - hev, _ = ti._copy_usm_ndarray_into_usm_ndarray( - src=x, dst=res, sycl_queue=q + _manager = dpctl.utils.SequentialOrderManager[q] + dep_evs = _manager.submitted_events + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x, dst=res, sycl_queue=q, depends=dep_evs ) - hev.wait() + _manager.add_event_pair(hev, cpy_ev) else: res = dpt.empty( x.shape, @@ -1707,8 +1730,12 @@ def triu(x, /, *, k=0): usm_type=x.usm_type, sycl_queue=q, ) - hev, _ = ti._triu(src=x, dst=res, k=k, sycl_queue=q) - hev.wait() + _manager = dpctl.utils.SequentialOrderManager[q] + dep_evs = _manager.submitted_events + hev, triu_ev = ti._triu( + src=x, dst=res, k=k, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(hev, triu_ev) return res diff --git 
a/dpctl/tensor/_elementwise_common.py b/dpctl/tensor/_elementwise_common.py index 32da1fbb02..6b38444902 100644 --- a/dpctl/tensor/_elementwise_common.py +++ b/dpctl/tensor/_elementwise_common.py @@ -24,7 +24,7 @@ import dpctl.tensor._tensor_impl as ti from dpctl.tensor._manipulation_functions import _broadcast_shape_impl from dpctl.tensor._usmarray import _is_object_with_buffer_protocol as _is_buffer -from dpctl.utils import ExecutionPlacementError +from dpctl.utils import ExecutionPlacementError, SequentialOrderManager from ._copy_utils import _empty_like_orderK, _empty_like_pair_orderK from ._type_utils import ( @@ -236,6 +236,7 @@ def __call__(self, x, /, *, out=None, order="K"): ) exec_q = x.sycl_queue + _manager = SequentialOrderManager[exec_q] if buf_dt is None: if out is None: if order == "K": @@ -245,17 +246,20 @@ def __call__(self, x, /, *, out=None, order="K"): order = "F" if x.flags.f_contiguous else "C" out = dpt.empty_like(x, dtype=res_dt, order=order) - ht_unary_ev, unary_ev = self.unary_fn_(x, out, sycl_queue=exec_q) + dep_evs = _manager.submitted_events + ht_unary_ev, unary_ev = self.unary_fn_( + x, out, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_unary_ev, unary_ev) if not (orig_out is None or orig_out is out): # Copy the out data from temporary buffer to original memory - ht_copy_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_copy_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=exec_q, depends=[unary_ev] ) - ht_copy_ev.wait() + _manager.add_event_pair(ht_copy_ev, cpy_ev) out = orig_out - ht_unary_ev.wait() return out if order == "K": @@ -265,18 +269,21 @@ def __call__(self, x, /, *, out=None, order="K"): order = "F" if x.flags.f_contiguous else "C" buf = dpt.empty_like(x, dtype=buf_dt, order=order) + dep_evs = _manager.submitted_events ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=x, dst=buf, sycl_queue=exec_q + src=x, dst=buf, sycl_queue=exec_q, depends=dep_evs ) + _manager.add_event_pair(ht_copy_ev, copy_ev) if out is None: if order == "K": out = _empty_like_orderK(buf, res_dt) else: out = dpt.empty_like(buf, dtype=res_dt, order=order) - ht, _ = self.unary_fn_(buf, out, sycl_queue=exec_q, depends=[copy_ev]) - ht_copy_ev.wait() - ht.wait() + ht, uf_ev = self.unary_fn_( + buf, out, sycl_queue=exec_q, depends=[copy_ev] + ) + _manager.add_event_pair(ht, uf_ev) return out @@ -625,6 +632,7 @@ def __call__(self, o1, o2, /, *, out=None, order="K"): ) orig_out = out + _manager = SequentialOrderManager[exec_q] if out is not None: if not isinstance(out, dpt.usm_ndarray): raise TypeError( @@ -676,28 +684,36 @@ def __call__(self, o1, o2, /, *, out=None, order="K"): if buf2_dt is None: if src2.shape != res_shape: src2 = dpt.broadcast_to(src2, res_shape) - ht_, _ = self.binary_inplace_fn_( - lhs=o1, rhs=src2, sycl_queue=exec_q + dep_evs = _manager.submitted_events + ht_, comp_ev = self.binary_inplace_fn_( + lhs=o1, + rhs=src2, + sycl_queue=exec_q, + depends=dep_evs, ) - ht_.wait() + _manager.add_event_pair(ht_, comp_ev) else: buf2 = dpt.empty_like(src2, dtype=buf2_dt) + dep_evs = _manager.submitted_events ( ht_copy_ev, copy_ev, ) = ti._copy_usm_ndarray_into_usm_ndarray( - src=src2, dst=buf2, sycl_queue=exec_q + src=src2, + dst=buf2, + sycl_queue=exec_q, + depends=dep_evs, ) + _manager.add_event_pair(ht_copy_ev, copy_ev) buf2 = dpt.broadcast_to(buf2, res_shape) - ht_, _ = self.binary_inplace_fn_( + ht_, bf_ev = self.binary_inplace_fn_( lhs=o1, rhs=buf2, sycl_queue=exec_q, depends=[copy_ev], ) - 
ht_copy_ev.wait() - ht_.wait() + _manager.add_event_pair(ht_, bf_ev) return out @@ -751,29 +767,36 @@ def __call__(self, o1, o2, /, *, out=None, order="K"): src1 = dpt.broadcast_to(src1, res_shape) if src2.shape != res_shape: src2 = dpt.broadcast_to(src2, res_shape) + deps_ev = _manager.submitted_events ht_binary_ev, binary_ev = self.binary_fn_( - src1=src1, src2=src2, dst=out, sycl_queue=exec_q + src1=src1, + src2=src2, + dst=out, + sycl_queue=exec_q, + depends=deps_ev, ) + _manager.add_event_pair(ht_binary_ev, binary_ev) if not (orig_out is None or orig_out is out): # Copy the out data from temporary buffer to original memory - ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=exec_q, depends=[binary_ev], ) - ht_copy_out_ev.wait() + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) out = orig_out - ht_binary_ev.wait() return out elif buf1_dt is None: if order == "K": buf2 = _empty_like_orderK(src2, buf2_dt) else: buf2 = dpt.empty_like(src2, dtype=buf2_dt, order=order) + dep_evs = _manager.submitted_events ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=src2, dst=buf2, sycl_queue=exec_q + src=src2, dst=buf2, sycl_queue=exec_q, depends=dep_evs ) + _manager.add_event_pair(ht_copy_ev, copy_ev) if out is None: if order == "K": out = _empty_like_pair_orderK( @@ -798,27 +821,28 @@ def __call__(self, o1, o2, /, *, out=None, order="K"): sycl_queue=exec_q, depends=[copy_ev], ) + _manager.add_event_pair(ht_binary_ev, binary_ev) if not (orig_out is None or orig_out is out): # Copy the out data from temporary buffer to original memory - ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=exec_q, depends=[binary_ev], ) - ht_copy_out_ev.wait() + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) out = orig_out - ht_copy_ev.wait() - ht_binary_ev.wait() return out elif buf2_dt is None: if order == "K": buf1 = _empty_like_orderK(src1, buf1_dt) else: buf1 = dpt.empty_like(src1, dtype=buf1_dt, order=order) + dep_evs = _manager.submitted_events ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=src1, dst=buf1, sycl_queue=exec_q + src=src1, dst=buf1, sycl_queue=exec_q, depends=dep_evs ) + _manager.add_event_pair(ht_copy_ev, copy_ev) if out is None: if order == "K": out = _empty_like_pair_orderK( @@ -843,18 +867,17 @@ def __call__(self, o1, o2, /, *, out=None, order="K"): sycl_queue=exec_q, depends=[copy_ev], ) + _manager.add_event_pair(ht_binary_ev, binary_ev) if not (orig_out is None or orig_out is out): # Copy the out data from temporary buffer to original memory - ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=exec_q, depends=[binary_ev], ) - ht_copy_out_ev.wait() + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) out = orig_out - ht_copy_ev.wait() - ht_binary_ev.wait() return out if order == "K": @@ -866,16 +889,19 @@ def __call__(self, o1, o2, /, *, out=None, order="K"): buf1 = _empty_like_orderK(src1, buf1_dt) else: buf1 = dpt.empty_like(src1, dtype=buf1_dt, order=order) + dep_evs = _manager.submitted_events ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=src1, dst=buf1, sycl_queue=exec_q + src=src1, dst=buf1, sycl_queue=exec_q, depends=dep_evs ) + _manager.add_event_pair(ht_copy1_ev, copy1_ev) if order == "K": buf2 = 
_empty_like_orderK(src2, buf2_dt) else: buf2 = dpt.empty_like(src2, dtype=buf2_dt, order=order) ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=src2, dst=buf2, sycl_queue=exec_q + src=src2, dst=buf2, sycl_queue=exec_q, depends=dep_evs ) + _manager.add_event_pair(ht_copy2_ev, copy2_ev) if out is None: if order == "K": out = _empty_like_pair_orderK( @@ -892,12 +918,12 @@ def __call__(self, o1, o2, /, *, out=None, order="K"): buf1 = dpt.broadcast_to(buf1, res_shape) buf2 = dpt.broadcast_to(buf2, res_shape) - ht_, _ = self.binary_fn_( + ht_, bf_ev = self.binary_fn_( src1=buf1, src2=buf2, dst=out, sycl_queue=exec_q, depends=[copy1_ev, copy2_ev], ) - dpctl.SyclEvent.wait_for([ht_copy1_ev, ht_copy2_ev, ht_]) + _manager.add_event_pair(ht_, bf_ev) return out diff --git a/dpctl/tensor/_indexing_functions.py b/dpctl/tensor/_indexing_functions.py index 04cbd1bc8d..b70d50c1df 100644 --- a/dpctl/tensor/_indexing_functions.py +++ b/dpctl/tensor/_indexing_functions.py @@ -21,6 +21,7 @@ import dpctl import dpctl.tensor as dpt import dpctl.tensor._tensor_impl as ti +import dpctl.utils from ._copy_utils import _extract_impl, _nonzero_impl @@ -120,8 +121,12 @@ def take(x, indices, /, *, axis=None, mode="wrap"): res_shape, dtype=x.dtype, usm_type=res_usm_type, sycl_queue=exec_q ) - hev, _ = ti._take(x, (indices,), res, axis, mode, sycl_queue=exec_q) - hev.wait() + _manager = dpctl.utils.SequentialOrderManager[exec_q] + deps_ev = _manager.submitted_events + hev, take_ev = ti._take( + x, (indices,), res, axis, mode, sycl_queue=exec_q, depends=deps_ev + ) + _manager.add_event_pair(hev, take_ev) return res @@ -273,8 +278,12 @@ def put_vec_duplicates(vec, ind, vals): rhs = dpt.astype(vals, x.dtype) rhs = dpt.broadcast_to(rhs, val_shape) - hev, _ = ti._put(x, (indices,), rhs, axis, mode, sycl_queue=exec_q) - hev.wait() + _manager = dpctl.utils.SequentialOrderManager[exec_q] + deps_ev = _manager.submitted_events + hev, put_ev = ti._put( + x, (indices,), rhs, axis, mode, sycl_queue=exec_q, depends=deps_ev + ) + _manager.add_event_pair(hev, put_ev) def extract(condition, arr): @@ -366,7 +375,11 @@ def place(arr, mask, vals): if arr.shape != mask.shape or vals.ndim != 1: raise ValueError("Array sizes are not as required") cumsum = dpt.empty(mask.size, dtype="i8", sycl_queue=exec_q) - nz_count = ti.mask_positions(mask, cumsum, sycl_queue=exec_q) + _manager = dpctl.utils.SequentialOrderManager[exec_q] + deps_ev = _manager.submitted_events + nz_count = ti.mask_positions( + mask, cumsum, sycl_queue=exec_q, depends=deps_ev + ) if nz_count == 0: return if vals.size == 0: @@ -375,7 +388,7 @@ def place(arr, mask, vals): rhs = vals else: rhs = dpt.astype(vals, arr.dtype) - hev, _ = ti._place( + hev, pl_ev = ti._place( dst=arr, cumsum=cumsum, axis_start=0, @@ -383,7 +396,7 @@ def place(arr, mask, vals): rhs=rhs, sycl_queue=exec_q, ) - hev.wait() + _manager.add_event_pair(hev, pl_ev) def nonzero(arr): diff --git a/dpctl/tensor/_linear_algebra_functions.py b/dpctl/tensor/_linear_algebra_functions.py index e4155ab3e1..4d72aa28c2 100644 --- a/dpctl/tensor/_linear_algebra_functions.py +++ b/dpctl/tensor/_linear_algebra_functions.py @@ -30,7 +30,7 @@ _find_buf_dtype2, _to_device_supported_dtype, ) -from dpctl.utils import ExecutionPlacementError +from dpctl.utils import ExecutionPlacementError, SequentialOrderManager def matrix_transpose(x): @@ -189,6 +189,7 @@ def tensordot(x1, x2, axes=2): "supported types according to the casting rule ''safe''." 
) + _manager = SequentialOrderManager[exec_q] if buf1_dt is None and buf2_dt is None: out = dpt.empty( res_shape, @@ -197,7 +198,8 @@ def tensordot(x1, x2, axes=2): sycl_queue=exec_q, order="C", ) - ht_dot_ev, _ = tli._dot( + dep_evs = _manager.submitted_events + ht_dot_ev, dot_ev = tli._dot( x1=arr1, x2=arr2, batch_dims=0, @@ -206,16 +208,20 @@ def tensordot(x1, x2, axes=2): inner_dims=n_axes1, dst=out, sycl_queue=exec_q, + depends=dep_evs, ) - ht_dot_ev.wait() + _manager.add_event_pair(ht_dot_ev, dot_ev) return out elif buf1_dt is None: buf2 = _empty_like_orderK(arr2, buf2_dt) + + dep_evs = _manager.submitted_events ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=arr2, dst=buf2, sycl_queue=exec_q + src=arr2, dst=buf2, sycl_queue=exec_q, depends=dep_evs ) + _manager.add_event_pair(ht_copy_ev, copy_ev) out = dpt.empty( res_shape, dtype=res_dt, @@ -223,7 +229,7 @@ def tensordot(x1, x2, axes=2): sycl_queue=exec_q, order="C", ) - ht_dot_ev, _ = tli._dot( + ht_dot_ev, dot_ev = tli._dot( x1=arr1, x2=buf2, batch_dims=0, @@ -234,16 +240,17 @@ def tensordot(x1, x2, axes=2): sycl_queue=exec_q, depends=[copy_ev], ) - ht_copy_ev.wait() - ht_dot_ev.wait() + _manager.add_event_pair(ht_dot_ev, dot_ev) return out elif buf2_dt is None: buf1 = _empty_like_orderK(arr1, buf1_dt) + dep_evs = _manager.submitted_events ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=arr1, dst=buf1, sycl_queue=exec_q + src=arr1, dst=buf1, sycl_queue=exec_q, depends=dep_evs ) + _manager.add_event_pair(ht_copy_ev, copy_ev) out = dpt.empty( res_shape, dtype=res_dt, @@ -251,7 +258,7 @@ def tensordot(x1, x2, axes=2): sycl_queue=exec_q, order="C", ) - ht_dot_ev, _ = tli._dot( + ht_dot_ev, dot_ev = tli._dot( x1=buf1, x2=arr2, batch_dims=0, @@ -262,19 +269,21 @@ def tensordot(x1, x2, axes=2): sycl_queue=exec_q, depends=[copy_ev], ) - ht_copy_ev.wait() - ht_dot_ev.wait() + _manager.add_event_pair(ht_dot_ev, dot_ev) return out buf1 = _empty_like_orderK(arr1, buf1_dt) + deps_ev = _manager.submitted_events ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=arr1, dst=buf1, sycl_queue=exec_q + src=arr1, dst=buf1, sycl_queue=exec_q, depends=deps_ev ) + _manager.add_event_pair(ht_copy1_ev, copy1_ev) buf2 = _empty_like_orderK(arr2, buf2_dt) ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=arr2, dst=buf2, sycl_queue=exec_q + src=arr2, dst=buf2, sycl_queue=exec_q, depends=deps_ev ) + _manager.add_event_pair(ht_copy2_ev, copy2_ev) out = dpt.empty( res_shape, dtype=res_dt, @@ -282,7 +291,7 @@ def tensordot(x1, x2, axes=2): sycl_queue=exec_q, order="C", ) - ht_, _ = tli._dot( + ht_, dot_ev = tli._dot( x1=buf1, x2=buf2, batch_dims=0, @@ -293,7 +302,7 @@ def tensordot(x1, x2, axes=2): sycl_queue=exec_q, depends=[copy1_ev, copy2_ev], ) - dpctl.SyclEvent.wait_for([ht_copy1_ev, ht_copy2_ev, ht_]) + _manager.add_event_pair(ht_, dot_ev) return out @@ -399,18 +408,15 @@ def vecdot(x1, x2, axis=-1): "supported types according to the casting rule ''safe''." 
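
The `tensordot` changes above chain a dtype-promoting copy into the follow-up kernel through the copy's computation event rather than through a host-side wait. A hedged sketch of that chaining, reusing the `_copy_usm_ndarray_into_usm_ndarray` and `_sum_over_axis` signatures that appear in this patch; the f4-to-f8 promotion, shapes, and fp64 device support are assumptions made for illustration:

```python
import dpctl
import dpctl.tensor as dpt
import dpctl.tensor._tensor_impl as ti
import dpctl.tensor._tensor_reductions_impl as tri
from dpctl.utils import SequentialOrderManager

q = dpctl.SyclQueue()
x = dpt.ones((8, 16), dtype="f4", sycl_queue=q)
buf = dpt.empty_like(x, dtype="f8")          # promotion buffer
res = dpt.empty((8,), dtype="f8", sycl_queue=q)

_manager = SequentialOrderManager[q]
dep_evs = _manager.submitted_events
ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
    src=x, dst=buf, sycl_queue=q, depends=dep_evs
)
_manager.add_event_pair(ht_copy_ev, copy_ev)
# the reduction only needs the copy it reads from, not every prior event
ht_red_ev, red_ev = tri._sum_over_axis(
    src=buf, trailing_dims_to_reduce=1, dst=res, sycl_queue=q, depends=[copy_ev]
)
_manager.add_event_pair(ht_red_ev, red_ev)
```
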
) - ht_list = [] - deps = [] + _manager = SequentialOrderManager[exec_q] if buf1_dt is None and buf2_dt is None: if x1.dtype.kind == "c": x1_tmp = _empty_like_orderK(x1, x1.dtype) + dep_evs = _manager.submitted_events ht_conj_ev, conj_ev = tei._conj( - src=x1, - dst=x1_tmp, - sycl_queue=exec_q, + src=x1, dst=x1_tmp, sycl_queue=exec_q, depends=dep_evs ) - ht_list.append(ht_conj_ev) - deps.append(conj_ev) + _manager.add_event_pair(ht_conj_ev, conj_ev) x1 = x1_tmp if x1.shape != broadcast_sh: x1 = dpt.broadcast_to(x1, broadcast_sh) @@ -425,7 +431,8 @@ def vecdot(x1, x2, axis=-1): sycl_queue=exec_q, order="C", ) - ht_dot_ev, _ = tli._dot( + dep_evs = _manager.submitted_events + ht_dot_ev, dot_ev = tli._dot( x1=x1, x2=x2, batch_dims=len(res_sh), @@ -434,28 +441,26 @@ def vecdot(x1, x2, axis=-1): inner_dims=1, dst=out, sycl_queue=exec_q, - depends=deps, + depends=dep_evs, ) - ht_list.append(ht_dot_ev) - dpctl.SyclEvent.wait_for(ht_list) - + _manager.add_event_pair(ht_dot_ev, dot_ev) return dpt.reshape(out, res_sh) elif buf1_dt is None: if x1.dtype.kind == "c": x1_tmp = _empty_like_orderK(x1, x1.dtype) + deps_ev = _manager.submitted_events ht_conj_ev, conj_e = tei._conj( - src=x1, dst=x1_tmp, sycl_queue=exec_q + src=x1, dst=x1_tmp, sycl_queue=exec_q, depends=deps_ev ) - ht_list.append(ht_conj_ev) - deps.append(conj_e) + _manager.add_event_pair(ht_conj_ev, conj_e) x1 = x1_tmp buf2 = _empty_like_orderK(x2, buf2_dt) + deps_ev = _manager.submitted_events ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=x2, dst=buf2, sycl_queue=exec_q + src=x2, dst=buf2, sycl_queue=exec_q, depends=deps_ev ) - ht_list.append(ht_copy_ev) - deps.append(copy_ev) + _manager.add_event_pair(ht_copy_ev, copy_ev) if x1.shape != broadcast_sh: x1 = dpt.broadcast_to(x1, broadcast_sh) if buf2.shape != broadcast_sh: @@ -469,7 +474,7 @@ def vecdot(x1, x2, axis=-1): sycl_queue=exec_q, order="C", ) - ht_dot_ev, _ = tli._dot( + ht_dot_ev, dot_ev = tli._dot( x1=x1, x2=buf2, batch_dims=len(res_sh), @@ -478,26 +483,23 @@ def vecdot(x1, x2, axis=-1): inner_dims=1, dst=out, sycl_queue=exec_q, - depends=deps, + depends=[copy_ev], ) - ht_list.append(ht_dot_ev) - dpctl.SyclEvent.wait_for(ht_list) - + _manager.add_event_pair(ht_dot_ev, dot_ev) return dpt.reshape(out, res_sh) elif buf2_dt is None: buf1 = _empty_like_orderK(x1, buf1_dt) + deps_ev = _manager.submitted_events ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=x1, dst=buf1, sycl_queue=exec_q + src=x1, dst=buf1, sycl_queue=exec_q, depends=deps_ev ) - ht_list.append(ht_copy_ev) - deps.append(copy_ev) + _manager.add_event_pair(ht_copy_ev, copy_ev) if buf1.dtype.kind == "c": ht_conj_ev, conj_ev = tei._conj( src=buf1, dst=buf1, sycl_queue=exec_q, depends=[copy_ev] ) - ht_list.append(ht_conj_ev) - deps.append(conj_ev) + _manager.add_event_pair(ht_conj_ev, conj_ev) if buf1.shape != broadcast_sh: buf1 = dpt.broadcast_to(buf1, broadcast_sh) if x2.shape != broadcast_sh: @@ -511,7 +513,8 @@ def vecdot(x1, x2, axis=-1): sycl_queue=exec_q, order="C", ) - ht_dot_ev, _ = tli._dot( + deps_ev = _manager.submitted_events + ht_dot_ev, dot_ev = tli._dot( x1=buf1, x2=x2, batch_dims=len(res_sh), @@ -520,31 +523,27 @@ def vecdot(x1, x2, axis=-1): inner_dims=1, dst=out, sycl_queue=exec_q, - depends=deps, + depends=deps_ev, ) - ht_list.append(ht_dot_ev) - dpctl.SyclEvent.wait_for(ht_list) - + _manager.add_event_pair(ht_dot_ev, dot_ev) return dpt.reshape(out, res_sh) buf1 = _empty_like_orderK(x1, buf1_dt) + deps_ev = _manager.submitted_events ht_copy1_ev, copy1_ev = 
ti._copy_usm_ndarray_into_usm_ndarray( - src=x1, dst=buf1, sycl_queue=exec_q + src=x1, dst=buf1, sycl_queue=exec_q, depends=deps_ev ) - ht_list.append(ht_copy1_ev) - deps.append(copy1_ev) + _manager.add_event_pair(ht_copy1_ev, copy1_ev) if buf1.dtype.kind == "c": ht_conj_ev, conj_ev = tei._conj( src=buf1, dst=buf1, sycl_queue=exec_q, depends=[copy1_ev] ) - ht_list.append(ht_conj_ev) - deps.append(conj_ev) + _manager.add_event_pair(ht_conj_ev, conj_ev) buf2 = _empty_like_orderK(x2, buf2_dt) ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=x2, dst=buf2, sycl_queue=exec_q + src=x2, dst=buf2, sycl_queue=exec_q, depends=deps_ev ) - ht_list.append(ht_copy2_ev) - deps.append(copy2_ev) + _manager.add_event_pair(ht_copy2_ev, copy2_ev) if buf1.shape != broadcast_sh: buf1 = dpt.broadcast_to(buf1, broadcast_sh) if buf2.shape != broadcast_sh: @@ -558,7 +557,8 @@ def vecdot(x1, x2, axis=-1): sycl_queue=exec_q, order="C", ) - ht_dot_ev, _ = tli._dot( + deps_ev = _manager.submitted_events + ht_dot_ev, dot_ev = tli._dot( x1=buf1, x2=buf2, batch_dims=len(res_sh), @@ -567,11 +567,9 @@ def vecdot(x1, x2, axis=-1): inner_dims=1, dst=out, sycl_queue=exec_q, - depends=deps, + depends=deps_ev, ) - ht_list.append(ht_dot_ev) - dpctl.SyclEvent.wait_for(ht_list) - + _manager.add_event_pair(ht_dot_ev, dot_ev) return out @@ -793,6 +791,7 @@ def matmul(x1, x2, out=None, dtype=None, order="K"): else "C" ) + _manager = SequentialOrderManager[exec_q] if buf1_dt is None and buf2_dt is None: if out is None: if order == "K": @@ -811,6 +810,7 @@ def matmul(x1, x2, out=None, dtype=None, order="K"): x1 = dpt.broadcast_to(x1, x1_broadcast_shape) if x2.shape != x2_broadcast_shape: x2 = dpt.broadcast_to(x2, x2_broadcast_shape) + deps_evs = _manager.submitted_events ht_dot_ev, dot_ev = tli._dot( x1=x1, x2=x2, @@ -820,18 +820,19 @@ def matmul(x1, x2, out=None, dtype=None, order="K"): inner_dims=1, dst=out, sycl_queue=exec_q, + depends=deps_evs, ) + _manager.add_event_pair(ht_dot_ev, dot_ev) if not (orig_out is None or orig_out is out): # Copy the out data from temporary buffer to original memory - ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=exec_q, depends=[dot_ev], ) - ht_copy_out_ev.wait() + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) out = orig_out - ht_dot_ev.wait() if appended_axes: out = dpt.squeeze(out, tuple(appended_axes)) return out @@ -840,9 +841,11 @@ def matmul(x1, x2, out=None, dtype=None, order="K"): buf2 = _empty_like_orderK(x2, buf2_dt) else: buf2 = dpt.empty_like(x2, dtype=buf2_dt, order=order) + deps_evs = _manager.submitted_events ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=x2, dst=buf2, sycl_queue=exec_q + src=x2, dst=buf2, sycl_queue=exec_q, depends=deps_evs ) + _manager.add_event_pair(ht_copy_ev, copy_ev) if out is None: if order == "K": out = _empty_like_pair_orderK( @@ -872,18 +875,17 @@ def matmul(x1, x2, out=None, dtype=None, order="K"): sycl_queue=exec_q, depends=[copy_ev], ) + _manager.add_event_pair(ht_dot_ev, dot_ev) if not (orig_out is None or orig_out is out): # Copy the out data from temporary buffer to original memory - ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=exec_q, depends=[dot_ev], ) - ht_copy_out_ev.wait() + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) out = orig_out - ht_copy_ev.wait() - ht_dot_ev.wait() if 
appended_axes: out = dpt.squeeze(out, tuple(appended_axes)) return out @@ -893,9 +895,11 @@ def matmul(x1, x2, out=None, dtype=None, order="K"): buf1 = _empty_like_orderK(x1, buf1_dt) else: buf1 = dpt.empty_like(x1, dtype=buf1_dt, order=order) + deps_ev = _manager.submitted_events ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=x1, dst=buf1, sycl_queue=exec_q + src=x1, dst=buf1, sycl_queue=exec_q, depends=deps_ev ) + _manager.add_event_pair(ht_copy_ev, copy_ev) if out is None: if order == "K": out = _empty_like_pair_orderK( @@ -925,18 +929,17 @@ def matmul(x1, x2, out=None, dtype=None, order="K"): sycl_queue=exec_q, depends=[copy_ev], ) + _manager.add_event_pair(ht_dot_ev, dot_ev) if not (orig_out is None or orig_out is out): # Copy the out data from temporary buffer to original memory - ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=exec_q, depends=[dot_ev], ) - ht_copy_out_ev.wait() + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) out = orig_out - ht_copy_ev.wait() - ht_dot_ev.wait() if appended_axes: out = dpt.squeeze(out, tuple(appended_axes)) return out @@ -950,16 +953,19 @@ def matmul(x1, x2, out=None, dtype=None, order="K"): buf1 = _empty_like_orderK(x1, buf1_dt) else: buf1 = dpt.empty_like(x1, dtype=buf1_dt, order=order) + deps_ev = _manager.submitted_events ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=x1, dst=buf1, sycl_queue=exec_q + src=x1, dst=buf1, sycl_queue=exec_q, depends=deps_ev ) + _manager.add_event_pair(ht_copy1_ev, copy1_ev) if order == "K": buf2 = _empty_like_orderK(x2, buf2_dt) else: buf2 = dpt.empty_like(x2, dtype=buf2_dt, order=order) ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=x2, dst=buf2, sycl_queue=exec_q + src=x2, dst=buf2, sycl_queue=exec_q, depends=deps_ev ) + _manager.add_event_pair(ht_copy2_ev, copy2_ev) if out is None: if order == "K": out = _empty_like_pair_orderK( @@ -978,7 +984,7 @@ def matmul(x1, x2, out=None, dtype=None, order="K"): buf1 = dpt.broadcast_to(buf1, x1_broadcast_shape) if buf2.shape != x2_broadcast_shape: buf2 = dpt.broadcast_to(buf2, x2_broadcast_shape) - ht_, _ = tli._dot( + ht_, dot_ev = tli._dot( x1=buf1, x2=buf2, batch_dims=len(res_shape[:-2]), @@ -989,7 +995,7 @@ def matmul(x1, x2, out=None, dtype=None, order="K"): sycl_queue=exec_q, depends=[copy1_ev, copy2_ev], ) - dpctl.SyclEvent.wait_for([ht_copy1_ev, ht_copy2_ev, ht_]) + _manager.add_event_pair(ht_, dot_ev) if appended_axes: out = dpt.squeeze(out, tuple(appended_axes)) return out diff --git a/dpctl/tensor/_manipulation_functions.py b/dpctl/tensor/_manipulation_functions.py index 50b7c25fed..af44104288 100644 --- a/dpctl/tensor/_manipulation_functions.py +++ b/dpctl/tensor/_manipulation_functions.py @@ -116,7 +116,7 @@ def permute_dims(X, /, axes): dtype=X.dtype, buffer=X, strides=newstrides, - offset=X.__sycl_usm_array_interface__.get("offset", 0), + offset=X._element_offset, ) @@ -244,7 +244,7 @@ def broadcast_to(X, /, shape): dtype=X.dtype, buffer=X, strides=new_sts, - offset=X.__sycl_usm_array_interface__.get("offset", 0), + offset=X._element_offset, ) @@ -343,15 +343,22 @@ def roll(X, /, shift, *, axis=None): """ if not isinstance(X, dpt.usm_ndarray): raise TypeError(f"Expected usm_ndarray type, got {type(X)}.") + exec_q = X.sycl_queue + _manager = dputils.SequentialOrderManager[exec_q] if axis is None: shift = operator.index(shift) + dep_evs = _manager.submitted_events res = dpt.empty( - 
X.shape, dtype=X.dtype, usm_type=X.usm_type, sycl_queue=X.sycl_queue + X.shape, dtype=X.dtype, usm_type=X.usm_type, sycl_queue=exec_q ) - hev, _ = ti._copy_usm_ndarray_for_roll_1d( - src=X, dst=res, shift=shift, sycl_queue=X.sycl_queue + hev, roll_ev = ti._copy_usm_ndarray_for_roll_1d( + src=X, + dst=res, + shift=shift, + sycl_queue=exec_q, + depends=dep_evs, ) - hev.wait() + _manager.add_event_pair(hev, roll_ev) return res axis = normalize_axis_tuple(axis, X.ndim, allow_duplicate=True) broadcasted = np.broadcast(shift, axis) @@ -363,14 +370,14 @@ def roll(X, /, shift, *, axis=None): for sh, ax in broadcasted: shifts[ax] += sh - exec_q = X.sycl_queue res = dpt.empty( X.shape, dtype=X.dtype, usm_type=X.usm_type, sycl_queue=exec_q ) - ht_e, _ = ti._copy_usm_ndarray_for_roll_nd( - src=X, dst=res, shifts=shifts, sycl_queue=exec_q + dep_evs = _manager.submitted_events + ht_e, roll_ev = ti._copy_usm_ndarray_for_roll_nd( + src=X, dst=res, shifts=shifts, sycl_queue=exec_q, depends=dep_evs ) - ht_e.wait() + _manager.add_event_pair(ht_e, roll_ev) return res @@ -439,31 +446,46 @@ def _concat_axis_None(arrays): res_shape, dtype=res_dtype, usm_type=res_usm_type, sycl_queue=exec_q ) - hev_list = [] fill_start = 0 + _manager = dputils.SequentialOrderManager[exec_q] + deps = _manager.submitted_events for array in arrays: fill_end = fill_start + array.size if array.flags.c_contiguous: - hev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=dpt.reshape(array, -1), dst=res[fill_start:fill_end], sycl_queue=exec_q, + depends=deps, ) + _manager.add_event_pair(hev, cpy_ev) else: src_ = array # _copy_usm_ndarray_for_reshape requires src and dst to have # the same data type if not array.dtype == res_dtype: - src_ = dpt.astype(src_, res_dtype) - hev, _ = ti._copy_usm_ndarray_for_reshape( - src=src_, - dst=res[fill_start:fill_end], - sycl_queue=exec_q, - ) + src2_ = dpt.empty_like(src_, dtype=res_dtype) + ht_copy_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=src_, dst=src2_, sycl_queue=exec_q, depends=deps + ) + _manager.add_event_pair(ht_copy_ev, cpy_ev) + hev, reshape_copy_ev = ti._copy_usm_ndarray_for_reshape( + src=src2_, + dst=res[fill_start:fill_end], + sycl_queue=exec_q, + depends=[cpy_ev], + ) + _manager.add_event_pair(hev, reshape_copy_ev) + else: + hev, cpy_ev = ti._copy_usm_ndarray_for_reshape( + src=src_, + dst=res[fill_start:fill_end], + sycl_queue=exec_q, + depends=deps, + ) + _manager.add_event_pair(hev, cpy_ev) fill_start = fill_end - hev_list.append(hev) - dpctl.SyclEvent.wait_for(hev_list) return res @@ -516,7 +538,8 @@ def concat(arrays, /, *, axis=0): res_shape, dtype=res_dtype, usm_type=res_usm_type, sycl_queue=exec_q ) - hev_list = [] + _manager = dputils.SequentialOrderManager[exec_q] + deps = _manager.submitted_events fill_start = 0 for i in range(n): fill_end = fill_start + arrays[i].shape[axis] @@ -524,13 +547,14 @@ def concat(arrays, /, *, axis=0): np.s_[fill_start:fill_end] if j == axis else np.s_[:] for j in range(X0.ndim) ) - hev, _ = ti._copy_usm_ndarray_into_usm_ndarray( - src=arrays[i], dst=res[c_shapes_copy], sycl_queue=exec_q + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arrays[i], + dst=res[c_shapes_copy], + sycl_queue=exec_q, + depends=deps, ) + _manager.add_event_pair(hev, cpy_ev) fill_start = fill_end - hev_list.append(hev) - - dpctl.SyclEvent.wait_for(hev_list) return res @@ -581,17 +605,17 @@ def stack(arrays, /, *, axis=0): res_shape, dtype=res_dtype, usm_type=res_usm_type, sycl_queue=exec_q ) - 
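
A condensed, illustrative version of the `roll()` flow above: allocate the result, submit the roll-copy kernel ordered after previously submitted work on the queue, and record the event pair instead of calling `hev.wait()`. The input values and shift are arbitrary; only the call signatures come from the patch:

```python
import dpctl
import dpctl.tensor as dpt
import dpctl.tensor._tensor_impl as ti
import dpctl.utils as dputils

q = dpctl.SyclQueue()
x = dpt.arange(10, dtype="i4", sycl_queue=q)

_manager = dputils.SequentialOrderManager[q]
dep_evs = _manager.submitted_events
res = dpt.empty(x.shape, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=q)
hev, roll_ev = ti._copy_usm_ndarray_for_roll_1d(
    src=x, dst=res, shift=3, sycl_queue=q, depends=dep_evs
)
_manager.add_event_pair(hev, roll_ev)   # no host-side wait needed here
```
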
hev_list = [] + _manager = dputils.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events for i in range(n): c_shapes_copy = tuple( i if j == axis else np.s_[:] for j in range(res_ndim) ) - hev, _ = ti._copy_usm_ndarray_into_usm_ndarray( - src=arrays[i], dst=res[c_shapes_copy], sycl_queue=exec_q + _dst = res[c_shapes_copy] + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arrays[i], dst=_dst, sycl_queue=exec_q, depends=dep_evs ) - hev_list.append(hev) - - dpctl.SyclEvent.wait_for(hev_list) + _manager.add_event_pair(hev, cpy_ev) return res @@ -793,8 +817,8 @@ def repeat(x, repeats, /, *, axis=None): dpctl.utils.validate_usm_type(usm_type, allow_none=False) if not dpt.can_cast(repeats.dtype, dpt.int64, casting="same_kind"): raise TypeError( - f"`repeats` data type `{repeats.dtype}` cannot be cast to " - "`int64` according to the casting rule ''safe.''" + f"'repeats' data type {repeats.dtype} cannot be cast to " + "'int64' according to the casting rule ''safe.''" ) if repeats.size == 1: scalar = True @@ -805,11 +829,11 @@ def repeat(x, repeats, /, *, axis=None): else: if repeats.size != axis_size: raise ValueError( - "`repeats` array must be broadcastable to the size of " + "'repeats' array must be broadcastable to the size of " "the repeated axis" ) if not dpt.all(repeats >= 0): - raise ValueError("`repeats` elements must be positive") + raise ValueError("'repeats' elements must be positive") elif isinstance(repeats, (tuple, list, range)): usm_type = x.usm_type @@ -838,6 +862,8 @@ def repeat(x, repeats, /, *, axis=None): f"got {type(repeats)}" ) + _manager = dputils.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events if scalar: res_axis_size = repeats * axis_size if axis is not None: @@ -848,14 +874,15 @@ def repeat(x, repeats, /, *, axis=None): res_shape, dtype=x.dtype, usm_type=usm_type, sycl_queue=exec_q ) if res_axis_size > 0: - ht_rep_ev, _ = ti._repeat_by_scalar( + ht_rep_ev, rep_ev = ti._repeat_by_scalar( src=x, dst=res, reps=repeats, axis=axis, sycl_queue=exec_q, + depends=dep_evs, ) - ht_rep_ev.wait() + _manager.add_event_pair(ht_rep_ev, rep_ev) else: if repeats.dtype != dpt.int64: rep_buf = dpt.empty( @@ -865,8 +892,9 @@ def repeat(x, repeats, /, *, axis=None): sycl_queue=exec_q, ) ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=repeats, dst=rep_buf, sycl_queue=exec_q + src=repeats, dst=rep_buf, sycl_queue=exec_q, depends=dep_evs ) + _manager.add_event_pair(ht_copy_ev, copy_ev) cumsum = dpt.empty( (axis_size,), dtype=dpt.int64, @@ -890,7 +918,7 @@ def repeat(x, repeats, /, *, axis=None): sycl_queue=exec_q, ) if res_axis_size > 0: - ht_rep_ev, _ = ti._repeat_by_sequence( + ht_rep_ev, rep_ev = ti._repeat_by_sequence( src=x, dst=res, reps=rep_buf, @@ -898,8 +926,7 @@ def repeat(x, repeats, /, *, axis=None): axis=axis, sycl_queue=exec_q, ) - ht_rep_ev.wait() - ht_copy_ev.wait() + _manager.add_event_pair(ht_rep_ev, rep_ev) else: cumsum = dpt.empty( (axis_size,), @@ -907,7 +934,9 @@ def repeat(x, repeats, /, *, axis=None): usm_type=usm_type, sycl_queue=exec_q, ) - res_axis_size = ti._cumsum_1d(repeats, cumsum, sycl_queue=exec_q) + res_axis_size = ti._cumsum_1d( + repeats, cumsum, sycl_queue=exec_q, depends=dep_evs + ) if axis is not None: res_shape = ( x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :] @@ -921,7 +950,7 @@ def repeat(x, repeats, /, *, axis=None): sycl_queue=exec_q, ) if res_axis_size > 0: - ht_rep_ev, _ = ti._repeat_by_sequence( + ht_rep_ev, rep_ev = ti._repeat_by_sequence( src=x, dst=res, reps=repeats, @@ 
-929,7 +958,7 @@ def repeat(x, repeats, /, *, axis=None): axis=axis, sycl_queue=exec_q, ) - ht_rep_ev.wait() + _manager.add_event_pair(ht_rep_ev, rep_ev) return res @@ -1021,8 +1050,10 @@ def tile(x, repetitions, /): broadcast_sh, ) # copy broadcast input into flat array - hev, _ = ti._copy_usm_ndarray_for_reshape( - src=x, dst=res, sycl_queue=exec_q + _manager = dputils.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + hev, cp_ev = ti._copy_usm_ndarray_for_reshape( + src=x, dst=res, sycl_queue=exec_q, depends=dep_evs ) - hev.wait() + _manager.add_event_pair(hev, cp_ev) return dpt.reshape(res, res_shape) diff --git a/dpctl/tensor/_print.py b/dpctl/tensor/_print.py index 450f46c0f1..77bd9d178e 100644 --- a/dpctl/tensor/_print.py +++ b/dpctl/tensor/_print.py @@ -23,6 +23,7 @@ import dpctl import dpctl.tensor as dpt import dpctl.tensor._tensor_impl as ti +import dpctl.utils __doc__ = "Print functions for :class:`dpctl.tensor.usm_ndarray`." @@ -244,11 +245,12 @@ def _nd_corners(arr_in, edge_items): for i in range(arr_in.ndim) ) + exec_q = arr_in.sycl_queue arr_out = dpt.empty( res_shape, dtype=arr_in.dtype, usm_type=arr_in.usm_type, - sycl_queue=arr_in.sycl_queue, + sycl_queue=exec_q, ) blocks = [] @@ -263,10 +265,15 @@ def _nd_corners(arr_in, edge_items): else: blocks.append((np.s_[:],)) + _manager = dpctl.utils.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events hev_list = [] for slc in itertools.product(*blocks): hev, _ = ti._copy_usm_ndarray_into_usm_ndarray( - src=arr_in[slc], dst=arr_out[slc], sycl_queue=arr_in.sycl_queue + src=arr_in[slc], + dst=arr_out[slc], + sycl_queue=exec_q, + depends=dep_evs, ) hev_list.append(hev) diff --git a/dpctl/tensor/_reduction.py b/dpctl/tensor/_reduction.py index b918152467..afd5f4cf9b 100644 --- a/dpctl/tensor/_reduction.py +++ b/dpctl/tensor/_reduction.py @@ -20,7 +20,7 @@ import dpctl.tensor as dpt import dpctl.tensor._tensor_impl as ti import dpctl.tensor._tensor_reductions_impl as tri -from dpctl.utils import ExecutionPlacementError +from dpctl.utils import ExecutionPlacementError, SequentialOrderManager from ._type_utils import ( _default_accumulation_dtype, @@ -108,31 +108,35 @@ def _reduction_over_axis( res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q ) - host_tasks_list = [] + _manager = SequentialOrderManager[q] + dep_evs = _manager.submitted_events if red_nd == 0: ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( - src=arr, dst=out, sycl_queue=q + src=arr, dst=out, sycl_queue=q, depends=dep_evs ) - host_tasks_list.append(ht_e_cpy) + _manager.add_event_pair(ht_e_cpy, cpy_e) if not (orig_out is None or orig_out is out): - ht_e_cpy2, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=q, depends=[cpy_e] ) - host_tasks_list.append(ht_e_cpy2) + _manager.add_event_pair(ht_e_cpy2, cpy2_e) out = orig_out - dpctl.SyclEvent.wait_for(host_tasks_list) return out if implemented_types: ht_e, red_e = _reduction_fn( - src=arr, trailing_dims_to_reduce=red_nd, dst=out, sycl_queue=q + src=arr, + trailing_dims_to_reduce=red_nd, + dst=out, + sycl_queue=q, + depends=dep_evs, ) - host_tasks_list.append(ht_e) + _manager.add_event_pair(ht_e, red_e) if not (orig_out is None or orig_out is out): - ht_e_cpy, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=q, depends=[red_e] ) - host_tasks_list.append(ht_e_cpy) + _manager.add_event_pair(ht_e_cpy, 
cpy_e) out = orig_out else: if _dtype_supported(res_dt, res_dt, res_usm_type, q): @@ -140,29 +144,29 @@ def _reduction_over_axis( arr.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q ) ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( - src=arr, dst=tmp, sycl_queue=q + src=arr, dst=tmp, sycl_queue=q, depends=dep_evs ) - host_tasks_list.append(ht_e_cpy) - ht_e_red, _ = _reduction_fn( + _manager.add_event_pair(ht_e_cpy, cpy_e) + ht_e_red, red_ev = _reduction_fn( src=tmp, trailing_dims_to_reduce=red_nd, dst=out, sycl_queue=q, depends=[cpy_e], ) - host_tasks_list.append(ht_e_red) + _manager.add_event_pair(ht_e_red, red_ev) else: buf_dt = _default_reduction_type_fn(inp_dt, q) tmp = dpt.empty( arr.shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q ) ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( - src=arr, dst=tmp, sycl_queue=q + src=arr, dst=tmp, sycl_queue=q, depends=dep_evs ) + _manager.add_event_pair(ht_e_cpy, cpy_e) tmp_res = dpt.empty( res_shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q ) - host_tasks_list.append(ht_e_cpy) ht_e_red, r_e = _reduction_fn( src=tmp, trailing_dims_to_reduce=red_nd, @@ -170,18 +174,16 @@ def _reduction_over_axis( sycl_queue=q, depends=[cpy_e], ) - host_tasks_list.append(ht_e_red) - ht_e_cpy2, _ = ti._copy_usm_ndarray_into_usm_ndarray( + _manager.add_event_pair(ht_e_red, r_e) + ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray( src=tmp_res, dst=out, sycl_queue=q, depends=[r_e] ) - host_tasks_list.append(ht_e_cpy2) + _manager.add_event_pair(ht_e_cpy2, cpy2_e) if keepdims: res_shape = res_shape + (1,) * red_nd inv_perm = sorted(range(nd), key=lambda d: perm[d]) out = dpt.permute_dims(dpt.reshape(out, res_shape), inv_perm) - dpctl.SyclEvent.wait_for(host_tasks_list) - return out @@ -498,19 +500,19 @@ def _comparison_over_axis(x, axis, keepdims, out, _reduction_fn): res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=exec_q ) - host_tasks_list = [] + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events if red_nd == 0: ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( - src=x_tmp, dst=out, sycl_queue=exec_q + src=x_tmp, dst=out, sycl_queue=exec_q, depends=dep_evs ) - host_tasks_list.append(ht_e_cpy) + _manager.add_event_pair(ht_e_cpy, cpy_e) if not (orig_out is None or orig_out is out): - ht_e_cpy2, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=exec_q, depends=[cpy_e] ) - host_tasks_list.append(ht_e_cpy2) + _manager.add_event_pair(ht_e_cpy2, cpy2_e) out = orig_out - dpctl.SyclEvent.wait_for(host_tasks_list) return out hev, red_ev = _reduction_fn( @@ -518,20 +520,20 @@ def _comparison_over_axis(x, axis, keepdims, out, _reduction_fn): trailing_dims_to_reduce=red_nd, dst=out, sycl_queue=exec_q, + depends=dep_evs, ) - host_tasks_list.append(hev) + _manager.add_event_pair(hev, red_ev) if not (orig_out is None or orig_out is out): - ht_e_cpy2, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=exec_q, depends=[red_ev] ) - host_tasks_list.append(ht_e_cpy2) + _manager.add_event_pair(ht_e_cpy2, cpy2_e) out = orig_out if keepdims: res_shape = res_shape + (1,) * red_nd inv_perm = sorted(range(nd), key=lambda d: perm[d]) out = dpt.permute_dims(dpt.reshape(out, res_shape), inv_perm) - dpctl.SyclEvent.wait_for(host_tasks_list) return out @@ -667,33 +669,34 @@ def _search_over_axis(x, axis, keepdims, out, 
_reduction_fn): res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=exec_q ) + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events if red_nd == 0: - ht_e_fill, _ = ti._full_usm_ndarray( - fill_value=0, dst=out, sycl_queue=exec_q + ht_e_fill, fill_ev = ti._full_usm_ndarray( + fill_value=0, dst=out, sycl_queue=exec_q, depends=dep_evs ) - ht_e_fill.wait() + _manager.add_event_pair(ht_e_fill, fill_ev) return out - host_tasks_list = [] hev, red_ev = _reduction_fn( src=x_tmp, trailing_dims_to_reduce=red_nd, dst=out, sycl_queue=exec_q, + depends=dep_evs, ) - host_tasks_list.append(hev) + _manager.add_event_pair(hev, red_ev) if not (orig_out is None or orig_out is out): - ht_e_cpy2, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=exec_q, depends=[red_ev] ) - host_tasks_list.append(ht_e_cpy2) + _manager.add_event_pair(ht_e_cpy2, cpy2_e) out = orig_out if keepdims: res_shape = res_shape + (1,) * red_nd inv_perm = sorted(range(nd), key=lambda d: perm[d]) out = dpt.permute_dims(dpt.reshape(out, res_shape), inv_perm) - dpctl.SyclEvent.wait_for(host_tasks_list) return out diff --git a/dpctl/tensor/_reshape.py b/dpctl/tensor/_reshape.py index 575c79c115..eb9b7ffcfe 100644 --- a/dpctl/tensor/_reshape.py +++ b/dpctl/tensor/_reshape.py @@ -18,6 +18,7 @@ import numpy as np import dpctl.tensor as dpt +import dpctl.utils from dpctl.tensor._tensor_impl import ( _copy_usm_ndarray_for_reshape, _ravel_multi_index, @@ -157,22 +158,25 @@ def reshape(X, /, shape, *, order="C", copy=None): copy_q = X.sycl_queue if copy_required or (copy is True): # must perform a copy + copy_q = X.sycl_queue flat_res = dpt.usm_ndarray( (X.size,), dtype=X.dtype, buffer=X.usm_type, buffer_ctor_kwargs={"queue": copy_q}, ) + _manager = dpctl.utils.SequentialOrderManager[copy_q] + dep_evs = _manager.submitted_events if order == "C": - hev, _ = _copy_usm_ndarray_for_reshape( - src=X, dst=flat_res, sycl_queue=copy_q + hev, r_e = _copy_usm_ndarray_for_reshape( + src=X, dst=flat_res, sycl_queue=copy_q, depends=dep_evs ) else: X_t = dpt.permute_dims(X, range(X.ndim - 1, -1, -1)) - hev, _ = _copy_usm_ndarray_for_reshape( - src=X_t, dst=flat_res, sycl_queue=copy_q + hev, r_e = _copy_usm_ndarray_for_reshape( + src=X_t, dst=flat_res, sycl_queue=copy_q, depends=dep_evs ) - hev.wait() + _manager.add_event_pair(hev, r_e) return dpt.usm_ndarray( tuple(shape), dtype=X.dtype, buffer=flat_res, order=order ) diff --git a/dpctl/tensor/_search_functions.py b/dpctl/tensor/_search_functions.py index 94982d0c82..c0fdfb7861 100644 --- a/dpctl/tensor/_search_functions.py +++ b/dpctl/tensor/_search_functions.py @@ -18,7 +18,7 @@ import dpctl.tensor as dpt import dpctl.tensor._tensor_impl as ti from dpctl.tensor._manipulation_functions import _broadcast_shapes -from dpctl.utils import ExecutionPlacementError +from dpctl.utils import ExecutionPlacementError, SequentialOrderManager from ._copy_utils import _empty_like_orderK, _empty_like_triple_orderK from ._type_utils import _all_data_types, _can_cast @@ -198,19 +198,18 @@ def where(condition, x1, x2, /, *, order="K", out=None): sycl_queue=exec_q, ) - deps = [] - wait_list = [] + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events if x1_dtype != out_dtype: if order == "K": _x1 = _empty_like_orderK(x1, out_dtype) else: _x1 = dpt.empty_like(x1, dtype=out_dtype, order=order) ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=x1, dst=_x1, 
sycl_queue=exec_q + src=x1, dst=_x1, sycl_queue=exec_q, depends=dep_evs ) x1 = _x1 - deps.append(copy1_ev) - wait_list.append(ht_copy1_ev) + _manager.add_event_pair(ht_copy1_ev, copy1_ev) if x2_dtype != out_dtype: if order == "K": @@ -218,11 +217,10 @@ def where(condition, x1, x2, /, *, order="K", out=None): else: _x2 = dpt.empty_like(x2, dtype=out_dtype, order=order) ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=x2, dst=_x2, sycl_queue=exec_q + src=x2, dst=_x2, sycl_queue=exec_q, depends=dep_evs ) x2 = _x2 - deps.append(copy2_ev) - wait_list.append(ht_copy2_ev) + _manager.add_event_pair(ht_copy2_ev, copy2_ev) if out is None: if order == "K": @@ -242,25 +240,25 @@ def where(condition, x1, x2, /, *, order="K", out=None): x1 = dpt.broadcast_to(x1, res_shape) x2 = dpt.broadcast_to(x2, res_shape) + dep_evs = _manager.submitted_events hev, where_ev = ti._where( condition=condition, x1=x1, x2=x2, dst=out, sycl_queue=exec_q, - depends=deps, + depends=dep_evs, ) + _manager.add_event_pair(hev, where_ev) if not (orig_out is None or orig_out is out): # Copy the out data from temporary buffer to original memory - ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=exec_q, depends=[where_ev], ) - ht_copy_out_ev.wait() + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) out = orig_out - dpctl.SyclEvent.wait_for(wait_list) - hev.wait() return out diff --git a/dpctl/tensor/_searchsorted.py b/dpctl/tensor/_searchsorted.py index d9408e072e..131759b5ce 100644 --- a/dpctl/tensor/_searchsorted.py +++ b/dpctl/tensor/_searchsorted.py @@ -87,7 +87,8 @@ def searchsorted( x1_dt = x1.dtype x2_dt = x2.dtype - host_evs = [] + _manager = du.SequentialOrderManager[q] + dep_evs = _manager.submitted_events ev = dpctl.SyclEvent() if sorter is not None: if not isdtype(sorter.dtype, "integral"): @@ -110,38 +111,28 @@ def searchsorted( axis, wrap_out_of_bound_indices_mode, sycl_queue=q, - depends=[ - ev, - ], + depends=dep_evs, ) x1 = res - host_evs.append(ht_ev) + _manager.add_event_pair(ht_ev, ev) if x1_dt != x2_dt: dt = result_type(x1, x2) if x1_dt != dt: x1_buf = _empty_like_orderK(x1, dt) + dep_evs = _manager.submitted_events ht_ev, ev = ti_copy( - src=x1, - dst=x1_buf, - sycl_queue=q, - depends=[ - ev, - ], + src=x1, dst=x1_buf, sycl_queue=q, depends=dep_evs ) - host_evs.append(ht_ev) + _manager.add_event_pair(ht_ev, ev) x1 = x1_buf if x2_dt != dt: x2_buf = _empty_like_orderK(x2, dt) + dep_evs = _manager.submitted_events ht_ev, ev = ti_copy( - src=x2, - dst=x2_buf, - sycl_queue=q, - depends=[ - ev, - ], + src=x2, dst=x2_buf, sycl_queue=q, depends=dep_evs ) - host_evs.append(ht_ev) + _manager.add_event_pair(ht_ev, ev) x2 = x2_buf dst_usm_type = du.get_coerced_usm_type([x1.usm_type, x2.usm_type]) @@ -149,28 +140,18 @@ def searchsorted( dst = _empty_like_orderK(x2, index_dt, usm_type=dst_usm_type) + dep_evs = _manager.submitted_events if side == "left": - ht_ev, _ = _searchsorted_left( + ht_ev, s_ev = _searchsorted_left( hay=x1, needles=x2, positions=dst, sycl_queue=q, - depends=[ - ev, - ], + depends=dep_evs, ) else: - ht_ev, _ = _searchsorted_right( - hay=x1, - needles=x2, - positions=dst, - sycl_queue=q, - depends=[ - ev, - ], + ht_ev, s_ev = _searchsorted_right( + hay=x1, needles=x2, positions=dst, sycl_queue=q, depends=dep_evs ) - - host_evs.append(ht_ev) - dpctl.SyclEvent.wait_for(host_evs) - + _manager.add_event_pair(ht_ev, s_ev) return dst diff --git a/dpctl/tensor/_set_functions.py 
b/dpctl/tensor/_set_functions.py index 81023a0827..2e2df751a9 100644 --- a/dpctl/tensor/_set_functions.py +++ b/dpctl/tensor/_set_functions.py @@ -16,8 +16,8 @@ from typing import NamedTuple -import dpctl import dpctl.tensor as dpt +import dpctl.utils as du from ._tensor_elementwise_impl import _not_equal, _subtract from ._tensor_impl import ( @@ -87,18 +87,23 @@ def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray: if fx.size == 0: return fx s = dpt.empty_like(fx, order="C") - host_tasks = [] + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events if fx.flags.c_contiguous: ht_ev, sort_ev = _sort_ascending( - src=fx, trailing_dims_to_sort=1, dst=s, sycl_queue=exec_q + src=fx, + trailing_dims_to_sort=1, + dst=s, + sycl_queue=exec_q, + depends=dep_evs, ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, sort_ev) else: tmp = dpt.empty_like(fx, order="C") ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( - src=fx, dst=tmp, sycl_queue=exec_q + src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, copy_ev) ht_ev, sort_ev = _sort_ascending( src=tmp, trailing_dims_to_sort=1, @@ -106,7 +111,7 @@ def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray: sycl_queue=exec_q, depends=[copy_ev], ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, sort_ev) unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) ht_ev, uneq_ev = _not_equal( src1=s[:-1], @@ -115,23 +120,23 @@ def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray: sycl_queue=exec_q, depends=[sort_ev], ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, uneq_ev) + # writing into new allocation, no dependencies ht_ev, one_ev = _full_usm_ndarray( fill_value=True, dst=unique_mask[0], sycl_queue=exec_q ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, one_ev) cumsum = dpt.empty(s.shape, dtype=dpt.int64, sycl_queue=exec_q) # synchronizing call n_uniques = mask_positions( unique_mask, cumsum, sycl_queue=exec_q, depends=[one_ev, uneq_ev] ) if n_uniques == fx.size: - dpctl.SyclEvent.wait_for(host_tasks) return s unique_vals = dpt.empty( n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q ) - ht_ev, _ = _extract( + ht_ev, ex_e = _extract( src=s, cumsum=cumsum, axis_start=0, @@ -139,8 +144,7 @@ def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray: dst=unique_vals, sycl_queue=exec_q, ) - host_tasks.append(ht_ev) - dpctl.SyclEvent.wait_for(host_tasks) + _manager.add_event_pair(ht_ev, ex_e) return unique_vals @@ -178,18 +182,24 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult: if fx.size == 0: return UniqueCountsResult(fx, dpt.empty_like(fx, dtype=ind_dt)) s = dpt.empty_like(fx, order="C") - host_tasks = [] + + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events if fx.flags.c_contiguous: ht_ev, sort_ev = _sort_ascending( - src=fx, trailing_dims_to_sort=1, dst=s, sycl_queue=exec_q + src=fx, + trailing_dims_to_sort=1, + dst=s, + sycl_queue=exec_q, + depends=dep_evs, ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, sort_ev) else: tmp = dpt.empty_like(fx, order="C") ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( - src=fx, dst=tmp, sycl_queue=exec_q + src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, copy_ev) ht_ev, sort_ev = _sort_ascending( src=tmp, dst=s, @@ -197,7 +207,7 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult: sycl_queue=exec_q, 
depends=[copy_ev], ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, sort_ev) unique_mask = dpt.empty(s.shape, dtype="?", sycl_queue=exec_q) ht_ev, uneq_ev = _not_equal( src1=s[:-1], @@ -206,18 +216,18 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult: sycl_queue=exec_q, depends=[sort_ev], ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, uneq_ev) + # no dependency, since we write into new allocation ht_ev, one_ev = _full_usm_ndarray( fill_value=True, dst=unique_mask[0], sycl_queue=exec_q ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, one_ev) cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q) # synchronizing call n_uniques = mask_positions( unique_mask, cumsum, sycl_queue=exec_q, depends=[one_ev, uneq_ev] ) if n_uniques == fx.size: - dpctl.SyclEvent.wait_for(host_tasks) return UniqueCountsResult( s, dpt.ones( @@ -228,7 +238,7 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult: n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q ) # populate unique values - ht_ev, _ = _extract( + ht_ev, ex_e = _extract( src=s, cumsum=cumsum, axis_start=0, @@ -236,13 +246,14 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult: dst=unique_vals, sycl_queue=exec_q, ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, ex_e) unique_counts = dpt.empty( n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q ) idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) + # writing into new allocation, no dependency ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, id_ev) ht_ev, extr_ev = _extract( src=idx, cumsum=cumsum, @@ -252,21 +263,21 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult: sycl_queue=exec_q, depends=[id_ev], ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, extr_ev) + # no dependency, writing into disjoint segmenent of new allocation ht_ev, set_ev = _full_usm_ndarray( x.size, dst=unique_counts[-1], sycl_queue=exec_q ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, set_ev) _counts = dpt.empty_like(unique_counts[1:]) - ht_ev, _ = _subtract( + ht_ev, sub_ev = _subtract( src1=unique_counts[1:], src2=unique_counts[:-1], dst=_counts, sycl_queue=exec_q, depends=[set_ev, extr_ev], ) - host_tasks.append(ht_ev) - dpctl.SyclEvent.wait_for(host_tasks) + _manager.add_event_pair(ht_ev, sub_ev) return UniqueCountsResult(unique_vals, _counts) @@ -305,18 +316,24 @@ def unique_inverse(x): unsorting_ids = dpt.empty_like(sorting_ids, dtype=ind_dt, order="C") if fx.size == 0: return UniqueInverseResult(fx, dpt.reshape(unsorting_ids, x.shape)) - host_tasks = [] + + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events if fx.flags.c_contiguous: ht_ev, sort_ev = _argsort_ascending( - src=fx, trailing_dims_to_sort=1, dst=sorting_ids, sycl_queue=exec_q + src=fx, + trailing_dims_to_sort=1, + dst=sorting_ids, + sycl_queue=exec_q, + depends=dep_evs, ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, sort_ev) else: tmp = dpt.empty_like(fx, order="C") ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( - src=fx, dst=tmp, sycl_queue=exec_q + src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, copy_ev) ht_ev, sort_ev = _argsort_ascending( src=tmp, trailing_dims_to_sort=1, @@ -324,15 +341,15 @@ def unique_inverse(x): sycl_queue=exec_q, depends=[copy_ev], ) - host_tasks.append(ht_ev) - 
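
The comments added above distinguish kernels that must depend on earlier submissions from fills into freshly created allocations, which need no dependencies but are still registered so later kernels order after them. A small sketch of that case; the mask array and fill value are illustrative:

```python
import dpctl
import dpctl.tensor as dpt
import dpctl.tensor._tensor_impl as ti
import dpctl.utils as du

q = dpctl.SyclQueue()
_manager = du.SequentialOrderManager[q]

# new allocation: nothing submitted earlier can touch it, so depends= is omitted
mask = dpt.empty(16, dtype="?", sycl_queue=q)
ht_ev, fill_ev = ti._full_usm_ndarray(fill_value=True, dst=mask, sycl_queue=q)
# still recorded, so kernels submitted later on q are ordered after the fill
_manager.add_event_pair(ht_ev, fill_ev)
```
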
ht_ev, _ = _argsort_ascending( + _manager.add_event_pair(ht_ev, sort_ev) + ht_ev, argsort_ev = _argsort_ascending( src=sorting_ids, trailing_dims_to_sort=1, dst=unsorting_ids, sycl_queue=exec_q, depends=[sort_ev], ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, argsort_ev) s = dpt.empty_like(fx) # s = fx[sorting_ids] ht_ev, take_ev = _take( @@ -344,7 +361,7 @@ def unique_inverse(x): sycl_queue=exec_q, depends=[sort_ev], ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, take_ev) unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) ht_ev, uneq_ev = _not_equal( src1=s[:-1], @@ -353,18 +370,18 @@ def unique_inverse(x): sycl_queue=exec_q, depends=[take_ev], ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, uneq_ev) + # no dependency ht_ev, one_ev = _full_usm_ndarray( fill_value=True, dst=unique_mask[0], sycl_queue=exec_q ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, one_ev) cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q) # synchronizing call n_uniques = mask_positions( unique_mask, cumsum, sycl_queue=exec_q, depends=[uneq_ev, one_ev] ) if n_uniques == fx.size: - dpctl.SyclEvent.wait_for(host_tasks) return UniqueInverseResult(s, dpt.reshape(unsorting_ids, x.shape)) unique_vals = dpt.empty( n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q @@ -377,13 +394,13 @@ def unique_inverse(x): dst=unique_vals, sycl_queue=exec_q, ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, uv_ev) cum_unique_counts = dpt.empty( n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q ) idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, id_ev) ht_ev, extr_ev = _extract( src=idx, cumsum=cumsum, @@ -393,24 +410,24 @@ def unique_inverse(x): sycl_queue=exec_q, depends=[id_ev], ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, extr_ev) ht_ev, set_ev = _full_usm_ndarray( x.size, dst=cum_unique_counts[-1], sycl_queue=exec_q ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, set_ev) _counts = dpt.empty_like(cum_unique_counts[1:]) - ht_ev, _ = _subtract( + ht_ev, sub_ev = _subtract( src1=cum_unique_counts[1:], src2=cum_unique_counts[:-1], dst=_counts, sycl_queue=exec_q, depends=[set_ev, extr_ev], ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, sub_ev) inv_dt = dpt.int64 if x.size > dpt.iinfo(dpt.int32).max else dpt.int32 inv = dpt.empty_like(x, dtype=inv_dt, order="C") - ht_ev, _ = _searchsorted_left( + ht_ev, ssl_ev = _searchsorted_left( hay=unique_vals, needles=x, positions=inv, @@ -419,9 +436,8 @@ def unique_inverse(x): uv_ev, ], ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, ssl_ev) - dpctl.SyclEvent.wait_for(host_tasks) return UniqueInverseResult(unique_vals, inv) @@ -477,18 +493,23 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: dpt.reshape(unsorting_ids, x.shape), dpt.empty_like(fx, dtype=ind_dt), ) - host_tasks = [] + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events if fx.flags.c_contiguous: ht_ev, sort_ev = _argsort_ascending( - src=fx, trailing_dims_to_sort=1, dst=sorting_ids, sycl_queue=exec_q + src=fx, + trailing_dims_to_sort=1, + dst=sorting_ids, + sycl_queue=exec_q, + depends=dep_evs, ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, sort_ev) else: tmp = dpt.empty_like(fx, order="C") ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( - src=fx, 
dst=tmp, sycl_queue=exec_q + src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, copy_ev) ht_ev, sort_ev = _argsort_ascending( src=tmp, trailing_dims_to_sort=1, @@ -496,15 +517,15 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: sycl_queue=exec_q, depends=[copy_ev], ) - host_tasks.append(ht_ev) - ht_ev, _ = _argsort_ascending( + _manager.add_event_pair(ht_ev, sort_ev) + ht_ev, args_ev = _argsort_ascending( src=sorting_ids, trailing_dims_to_sort=1, dst=unsorting_ids, sycl_queue=exec_q, depends=[sort_ev], ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, args_ev) s = dpt.empty_like(fx) # s = fx[sorting_ids] ht_ev, take_ev = _take( @@ -516,7 +537,7 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: sycl_queue=exec_q, depends=[sort_ev], ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, take_ev) unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) ht_ev, uneq_ev = _not_equal( src1=s[:-1], @@ -525,18 +546,17 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: sycl_queue=exec_q, depends=[take_ev], ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, uneq_ev) ht_ev, one_ev = _full_usm_ndarray( fill_value=True, dst=unique_mask[0], sycl_queue=exec_q ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, one_ev) cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q) # synchronizing call n_uniques = mask_positions( unique_mask, cumsum, sycl_queue=exec_q, depends=[uneq_ev, one_ev] ) if n_uniques == fx.size: - dpctl.SyclEvent.wait_for(host_tasks) _counts = dpt.ones( n_uniques, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q ) @@ -557,13 +577,13 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: dst=unique_vals, sycl_queue=exec_q, ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, uv_ev) cum_unique_counts = dpt.empty( n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q ) idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, id_ev) ht_ev, extr_ev = _extract( src=idx, cumsum=cumsum, @@ -573,11 +593,11 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: sycl_queue=exec_q, depends=[id_ev], ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, extr_ev) ht_ev, set_ev = _full_usm_ndarray( x.size, dst=cum_unique_counts[-1], sycl_queue=exec_q ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, set_ev) _counts = dpt.empty_like(cum_unique_counts[1:]) ht_ev, sub_ev = _subtract( src1=cum_unique_counts[1:], @@ -586,11 +606,11 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: sycl_queue=exec_q, depends=[set_ev, extr_ev], ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, sub_ev) inv_dt = dpt.int64 if x.size > dpt.iinfo(dpt.int32).max else dpt.int32 inv = dpt.empty_like(x, dtype=inv_dt, order="C") - ht_ev, _ = _searchsorted_left( + ht_ev, ssl_ev = _searchsorted_left( hay=unique_vals, needles=x, positions=inv, @@ -599,9 +619,7 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: uv_ev, ], ) - host_tasks.append(ht_ev) - - dpctl.SyclEvent.wait_for(host_tasks) + _manager.add_event_pair(ht_ev, ssl_ev) return UniqueAllResult( unique_vals, sorting_ids[cum_unique_counts[:-1]], diff --git a/dpctl/tensor/_sorting.py b/dpctl/tensor/_sorting.py index 3d34ebf0a5..28ec42a085 100644 --- a/dpctl/tensor/_sorting.py +++ b/dpctl/tensor/_sorting.py @@ -16,9 +16,9 @@ 
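
`mask_positions` remains a synchronizing call in the hunks above because it returns a host-side count that Python needs before it can size the output allocation. A sketch of that interaction, with an arbitrary input; the boolean mask is produced by a public elementwise call, which is assumed to be ordered through the same per-queue manager:

```python
import dpctl
import dpctl.tensor as dpt
import dpctl.tensor._tensor_impl as ti
import dpctl.utils as du

q = dpctl.SyclQueue()
x = dpt.arange(-5, 5, dtype="i4", sycl_queue=q)
mask = dpt.not_equal(x, 0)           # elementwise call, ordered via the manager

_manager = du.SequentialOrderManager[q]
dep_evs = _manager.submitted_events
cumsum = dpt.empty(mask.size, dtype="i8", sycl_queue=q)
# returns an int on the host, so it waits on dep_evs before returning
nz_count = ti.mask_positions(mask, cumsum, sycl_queue=q, depends=dep_evs)
out = dpt.empty(nz_count, dtype=x.dtype, sycl_queue=q)
# subsequent extraction kernels (as in the hunks above) consume `cumsum`
# and fill `out`, again registering their event pairs with the manager
```
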
from numpy.core.numeric import normalize_axis_index -import dpctl import dpctl.tensor as dpt import dpctl.tensor._tensor_impl as ti +import dpctl.utils as du from ._tensor_sorting_impl import ( _argsort_ascending, @@ -76,33 +76,37 @@ def sort(x, /, *, axis=-1, descending=False, stable=True): ] arr = dpt.permute_dims(x, perm) exec_q = x.sycl_queue - host_tasks_list = [] + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events impl_fn = _sort_descending if descending else _sort_ascending if arr.flags.c_contiguous: res = dpt.empty_like(arr, order="C") - ht_ev, _ = impl_fn( - src=arr, trailing_dims_to_sort=1, dst=res, sycl_queue=exec_q + ht_ev, impl_ev = impl_fn( + src=arr, + trailing_dims_to_sort=1, + dst=res, + sycl_queue=exec_q, + depends=dep_evs, ) - host_tasks_list.append(ht_ev) + _manager.add_event_pair(ht_ev, impl_ev) else: tmp = dpt.empty_like(arr, order="C") ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=arr, dst=tmp, sycl_queue=exec_q + src=arr, dst=tmp, sycl_queue=exec_q, depends=dep_evs ) - host_tasks_list.append(ht_ev) + _manager.add_event_pair(ht_ev, copy_ev) res = dpt.empty_like(arr, order="C") - ht_ev, _ = impl_fn( + ht_ev, impl_ev = impl_fn( src=tmp, trailing_dims_to_sort=1, dst=res, sycl_queue=exec_q, depends=[copy_ev], ) - host_tasks_list.append(ht_ev) + _manager.add_event_pair(ht_ev, impl_ev) if a1 != nd: inv_perm = sorted(range(nd), key=lambda d: perm[d]) res = dpt.permute_dims(res, inv_perm) - dpctl.SyclEvent.wait_for(host_tasks_list) return res @@ -155,32 +159,36 @@ def argsort(x, axis=-1, descending=False, stable=True): ] arr = dpt.permute_dims(x, perm) exec_q = x.sycl_queue - host_tasks_list = [] + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events impl_fn = _argsort_descending if descending else _argsort_ascending index_dt = ti.default_device_index_type(exec_q) if arr.flags.c_contiguous: res = dpt.empty_like(arr, dtype=index_dt, order="C") - ht_ev, _ = impl_fn( - src=arr, trailing_dims_to_sort=1, dst=res, sycl_queue=exec_q + ht_ev, impl_ev = impl_fn( + src=arr, + trailing_dims_to_sort=1, + dst=res, + sycl_queue=exec_q, + depends=dep_evs, ) - host_tasks_list.append(ht_ev) + _manager.add_event_pair(ht_ev, impl_ev) else: tmp = dpt.empty_like(arr, order="C") ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=arr, dst=tmp, sycl_queue=exec_q + src=arr, dst=tmp, sycl_queue=exec_q, depends=dep_evs ) - host_tasks_list.append(ht_ev) + _manager.add_event_pair(ht_ev, copy_ev) res = dpt.empty_like(arr, dtype=index_dt, order="C") - ht_ev, _ = impl_fn( + ht_ev, impl_ev = impl_fn( src=tmp, trailing_dims_to_sort=1, dst=res, sycl_queue=exec_q, depends=[copy_ev], ) - host_tasks_list.append(ht_ev) + _manager.add_event_pair(ht_ev, impl_ev) if a1 != nd: inv_perm = sorted(range(nd), key=lambda d: perm[d]) res = dpt.permute_dims(res, inv_perm) - dpctl.SyclEvent.wait_for(host_tasks_list) return res diff --git a/dpctl/tensor/_statistical_functions.py b/dpctl/tensor/_statistical_functions.py index 367f7960e0..6fc2066b06 100644 --- a/dpctl/tensor/_statistical_functions.py +++ b/dpctl/tensor/_statistical_functions.py @@ -16,11 +16,11 @@ from numpy.core.numeric import normalize_axis_tuple -import dpctl import dpctl.tensor as dpt import dpctl.tensor._tensor_elementwise_impl as tei import dpctl.tensor._tensor_impl as ti import dpctl.tensor._tensor_reductions_impl as tri +import dpctl.utils as du def _var_impl(x, axis, correction, keepdims): @@ -48,15 +48,14 @@ def _var_impl(x, axis, correction, keepdims): ) 
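
The new `sort()` path above copies a non-contiguous input into a C-contiguous temporary and makes the sort kernel depend on that copy, registering both pairs with the manager. A condensed sketch using the private entry points named in the hunk; the strided input is an arbitrary example:

```python
import dpctl
import dpctl.tensor as dpt
import dpctl.tensor._tensor_impl as ti
import dpctl.utils as du
from dpctl.tensor._tensor_sorting_impl import _sort_ascending

q = dpctl.SyclQueue()
x = dpt.arange(32, dtype="f4", sycl_queue=q)[::-2]   # strided, not C-contiguous

_manager = du.SequentialOrderManager[q]
dep_evs = _manager.submitted_events
tmp = dpt.empty_like(x, order="C")
ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
    src=x, dst=tmp, sycl_queue=q, depends=dep_evs
)
_manager.add_event_pair(ht_ev, copy_ev)
res = dpt.empty_like(x, order="C")
ht_ev, sort_ev = _sort_ascending(
    src=tmp, trailing_dims_to_sort=1, dst=res, sycl_queue=q, depends=[copy_ev]
)
_manager.add_event_pair(ht_ev, sort_ev)
```
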
res_usm_type = x.usm_type - deps = [] - host_tasks_list = [] + _manager = du.SequentialOrderManager[q] + dep_evs = _manager.submitted_events if inp_dt != res_dt: buf = dpt.empty_like(x, dtype=res_dt) ht_e_buf, c_e1 = ti._copy_usm_ndarray_into_usm_ndarray( - src=x, dst=buf, sycl_queue=q + src=x, dst=buf, sycl_queue=q, depends=dep_evs ) - deps.append(c_e1) - host_tasks_list.append(ht_e_buf) + _manager.add_event_pair(ht_e_buf, c_e1) else: buf = x # calculate mean @@ -65,11 +64,11 @@ def _var_impl(x, axis, correction, keepdims): # use keepdims=True path for later broadcasting if red_nd == 0: mean_ary = dpt.empty_like(buf) + dep_evs = _manager.submitted_events ht_e1, c_e2 = ti._copy_usm_ndarray_into_usm_ndarray( - src=buf, dst=mean_ary, sycl_queue=q + src=buf, dst=mean_ary, sycl_queue=q, depends=dep_evs ) - deps.append(c_e2) - host_tasks_list.append(ht_e1) + _manager.add_event_pair(ht_e1, c_e2) else: mean_ary = dpt.empty( res_shape, @@ -77,15 +76,15 @@ def _var_impl(x, axis, correction, keepdims): usm_type=res_usm_type, sycl_queue=q, ) + dep_evs = _manager.submitted_events ht_e1, r_e1 = tri._sum_over_axis( src=buf2, trailing_dims_to_reduce=red_nd, dst=mean_ary, sycl_queue=q, - depends=deps, + depends=dep_evs, ) - host_tasks_list.append(ht_e1) - deps.append(r_e1) + _manager.add_event_pair(ht_e1, r_e1) mean_ary_shape = res_shape + (1,) * red_nd inv_perm = sorted(range(nd), key=lambda d: perm[d]) @@ -99,10 +98,11 @@ def _var_impl(x, axis, correction, keepdims): ) if nelems_ary.shape != mean_ary_shape: nelems_ary = dpt.broadcast_to(nelems_ary, mean_ary_shape) + dep_evs = _manager.submitted_events ht_e2, d_e1 = tei._divide_inplace( - lhs=mean_ary, rhs=nelems_ary, sycl_queue=q, depends=deps + lhs=mean_ary, rhs=nelems_ary, sycl_queue=q, depends=dep_evs ) - host_tasks_list.append(ht_e2) + _manager.add_event_pair(ht_e2, d_e1) # subtract mean from original array to get deviations dev_ary = dpt.empty_like(buf) if mean_ary_shape != buf.shape: @@ -110,18 +110,17 @@ def _var_impl(x, axis, correction, keepdims): ht_e4, su_e = tei._subtract( src1=buf, src2=mean_ary, dst=dev_ary, sycl_queue=q, depends=[d_e1] ) - host_tasks_list.append(ht_e4) + _manager.add_event_pair(ht_e4, su_e) # square deviations ht_e5, sq_e = tei._square( src=dev_ary, dst=dev_ary, sycl_queue=q, depends=[su_e] ) - host_tasks_list.append(ht_e5) - deps2 = [] + _manager.add_event_pair(ht_e5, sq_e) + # take sum of squared deviations dev_ary2 = dpt.permute_dims(dev_ary, perm) if red_nd == 0: res = dev_ary - deps2.append(sq_e) else: res = dpt.empty( res_shape, @@ -136,8 +135,7 @@ def _var_impl(x, axis, correction, keepdims): sycl_queue=q, depends=[sq_e], ) - host_tasks_list.append(ht_e6) - deps2.append(r_e2) + _manager.add_event_pair(ht_e6, r_e2) if keepdims: res_shape = res_shape + (1,) * red_nd @@ -154,11 +152,12 @@ def _var_impl(x, axis, correction, keepdims): # divide in-place again if div_ary.shape != res_shape: div_ary = dpt.broadcast_to(div_ary, res.shape) + dep_evs = _manager.submitted_events ht_e7, d_e2 = tei._divide_inplace( - lhs=res, rhs=div_ary, sycl_queue=q, depends=deps2 + lhs=res, rhs=div_ary, sycl_queue=q, depends=dep_evs ) - host_tasks_list.append(ht_e7) - return res, [d_e2], host_tasks_list + _manager.add_event_pair(ht_e7, d_e2) + return res, [d_e2] def mean(x, axis=None, keepdims=False): @@ -221,25 +220,28 @@ def mean(x, axis=None, keepdims=False): if sum_nd == 0: return dpt.astype(x, res_dt, copy=True) - s_e = [] - host_tasks_list = [] + _manager = du.SequentialOrderManager[q] + dep_evs = _manager.submitted_events if 
tri._sum_over_axis_dtype_supported(inp_dt, res_dt, res_usm_type, q): res = dpt.empty( res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q ) ht_e1, r_e = tri._sum_over_axis( - src=arr2, trailing_dims_to_reduce=sum_nd, dst=res, sycl_queue=q + src=arr2, + trailing_dims_to_reduce=sum_nd, + dst=res, + sycl_queue=q, + depends=dep_evs, ) - host_tasks_list.append(ht_e1) - s_e.append(r_e) + _manager.add_event_pair(ht_e1, r_e) else: tmp = dpt.empty( arr2.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q ) ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( - src=arr2, dst=tmp, sycl_queue=q + src=arr2, dst=tmp, sycl_queue=q, depends=dep_evs ) - host_tasks_list.append(ht_e_cpy) + _manager.add_event_pair(ht_e_cpy, cpy_e) res = dpt.empty( res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q ) @@ -250,8 +252,7 @@ def mean(x, axis=None, keepdims=False): sycl_queue=q, depends=[cpy_e], ) - host_tasks_list.append(ht_e_red) - s_e.append(r_e) + _manager.add_event_pair(ht_e_red, r_e) if keepdims: res_shape = res_shape + (1,) * sum_nd @@ -266,11 +267,11 @@ def mean(x, axis=None, keepdims=False): ) if nelems_arr.shape != res_shape: nelems_arr = dpt.broadcast_to(nelems_arr, res_shape) - ht_e2, _ = tei._divide_inplace( - lhs=res, rhs=nelems_arr, sycl_queue=q, depends=s_e + dep_evs = _manager.submitted_events + ht_e2, div_e = tei._divide_inplace( + lhs=res, rhs=nelems_arr, sycl_queue=q, depends=dep_evs ) - host_tasks_list.append(ht_e2) - dpctl.SyclEvent.wait_for(host_tasks_list) + _manager.add_event_pair(ht_e2, div_e) return res @@ -321,8 +322,7 @@ def var(x, axis=None, correction=0.0, keepdims=False): if x.dtype.kind == "c": raise ValueError("`var` does not support complex types") - res, _, host_tasks_list = _var_impl(x, axis, correction, keepdims) - dpctl.SyclEvent.wait_for(host_tasks_list) + res, _ = _var_impl(x, axis, correction, keepdims) return res @@ -374,10 +374,11 @@ def std(x, axis=None, correction=0.0, keepdims=False): if x.dtype.kind == "c": raise ValueError("`std` does not support complex types") - res, deps, host_tasks_list = _var_impl(x, axis, correction, keepdims) - ht_ev, _ = tei._sqrt( - src=res, dst=res, sycl_queue=res.sycl_queue, depends=deps + exec_q = x.sycl_queue + _manager = du.SequentialOrderManager[exec_q] + res, deps = _var_impl(x, axis, correction, keepdims) + ht_ev, sqrt_ev = tei._sqrt( + src=res, dst=res, sycl_queue=exec_q, depends=deps ) - host_tasks_list.append(ht_ev) - dpctl.SyclEvent.wait_for(host_tasks_list) + _manager.add_event_pair(ht_ev, sqrt_ev) return res diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx index 0c28380222..443d8184a2 100644 --- a/dpctl/tensor/_usmarray.pyx +++ b/dpctl/tensor/_usmarray.pyx @@ -25,6 +25,9 @@ import numpy as np import dpctl import dpctl.memory as dpmem +from .._backend cimport DPCTLSyclUSMRef +from .._sycl_device_factory cimport _cached_default_device + from ._data_types import bool as dpt_bool from ._device import Device from ._print import usm_ndarray_repr, usm_ndarray_str @@ -78,6 +81,7 @@ cdef class InternalUSMArrayError(Exception): cdef object _as_zero_dim_ndarray(object usm_ary): "Convert size-1 array to NumPy 0d array" mem_view = dpmem.as_usm_memory(usm_ary) + usm_ary.sycl_queue.wait() host_buf = mem_view.copy_to_host() view = host_buf.view(usm_ary.dtype) view.shape = tuple() @@ -514,6 +518,8 @@ cdef class usm_ndarray: "byte_offset is not a multiple of item_size.") elem_offset = byte_offset // item_size ary_iface['offset'] = elem_offset + # must wait for content of the memory to finalize + 
self.sycl_queue.wait() return ary_iface @property @@ -952,10 +958,10 @@ cdef class usm_ndarray: arr_buf = self.usm_data QRef = ( d.sycl_queue).get_queue_ref() view_buffer = c_dpmem._Memory.create_from_usm_pointer_size_qref( - arr_buf.memory_ptr, + arr_buf.get_data_ptr(), arr_buf.nbytes, QRef, - memory_owner = arr_buf + memory_owner=arr_buf ) res = usm_ndarray( self.shape, @@ -1609,6 +1615,11 @@ cdef api Py_ssize_t UsmNDArray_GetOffset(usm_ndarray arr): return arr.get_offset() +cdef api object UsmNDArray_GetUSMData(usm_ndarray arr): + """Get USM data object underlying the array""" + return arr.get_base() + + cdef api void UsmNDArray_SetWritableFlag(usm_ndarray arr, int flag): """Set/unset USM_ARRAY_WRITABLE in the given array `arr`.""" arr._set_writable_flag(flag) diff --git a/dpctl/tensor/_utility_functions.py b/dpctl/tensor/_utility_functions.py index 2a9c5923bf..709c1dc046 100644 --- a/dpctl/tensor/_utility_functions.py +++ b/dpctl/tensor/_utility_functions.py @@ -1,9 +1,9 @@ from numpy.core.numeric import normalize_axis_tuple -import dpctl import dpctl.tensor as dpt import dpctl.tensor._tensor_impl as ti import dpctl.tensor._tensor_reductions_impl as tri +import dpctl.utils as du def _boolean_reduction(x, axis, keepdims, func): @@ -35,7 +35,8 @@ def _boolean_reduction(x, axis, keepdims, func): exec_q = x.sycl_queue res_usm_type = x.usm_type - wait_list = [] + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events # always allocate the temporary as # int32 and usm-device to ensure that atomic updates # are supported @@ -50,8 +51,9 @@ def _boolean_reduction(x, axis, keepdims, func): trailing_dims_to_reduce=red_nd, dst=res_tmp, sycl_queue=exec_q, + depends=dep_evs, ) - wait_list.append(hev0) + _manager.add_event_pair(hev0, ev0) # copy to boolean result array res = dpt.empty( @@ -60,16 +62,15 @@ def _boolean_reduction(x, axis, keepdims, func): usm_type=res_usm_type, sycl_queue=exec_q, ) - hev1, _ = ti._copy_usm_ndarray_into_usm_ndarray( + hev1, ev1 = ti._copy_usm_ndarray_into_usm_ndarray( src=res_tmp, dst=res, sycl_queue=exec_q, depends=[ev0] ) - wait_list.append(hev1) + _manager.add_event_pair(hev1, ev1) if keepdims: res_shape = res_shape + (1,) * red_nd inv_perm = sorted(range(nd), key=lambda d: perm[d]) res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm) - dpctl.SyclEvent.wait_for(wait_list) return res diff --git a/dpctl/tests/test_sycl_usm.py b/dpctl/tests/test_sycl_usm.py index a1cf98960b..eca71ac66c 100644 --- a/dpctl/tests/test_sycl_usm.py +++ b/dpctl/tests/test_sycl_usm.py @@ -152,7 +152,7 @@ def test_zero_copy(): mobj = _create_memory() mobj2 = type(mobj)(mobj) - assert mobj2.reference_obj is mobj + assert mobj2.reference_obj is None mobj_data = mobj.__sycl_usm_array_interface__["data"] mobj2_data = mobj2.__sycl_usm_array_interface__["data"] assert mobj_data == mobj2_data @@ -545,6 +545,7 @@ def test_cpython_api(memory_ctor): mod = sys.modules[mobj.__class__.__module__] # get capsules storing function pointers mem_ptr_fn_cap = mod.__pyx_capi__["Memory_GetUsmPointer"] + mem_opaque_ptr_fn_cap = mod.__pyx_capi__["Memory_GetOpaquePointer"] mem_q_ref_fn_cap = mod.__pyx_capi__["Memory_GetQueueRef"] mem_ctx_ref_fn_cap = mod.__pyx_capi__["Memory_GetContextRef"] mem_nby_fn_cap = mod.__pyx_capi__["Memory_GetNumBytes"] @@ -556,6 +557,9 @@ def test_cpython_api(memory_ctor): mem_ptr_fn_ptr = cap_ptr_fn( mem_ptr_fn_cap, b"DPCTLSyclUSMRef (struct Py_MemoryObject *)" ) + mem_opaque_ptr_fn_ptr = cap_ptr_fn( + mem_opaque_ptr_fn_cap, b"void *(struct 
Py_MemoryObject *)" + ) mem_ctx_ref_fn_ptr = cap_ptr_fn( mem_ctx_ref_fn_cap, b"DPCTLSyclContextRef (struct Py_MemoryObject *)" ) @@ -571,6 +575,7 @@ def test_cpython_api(memory_ctor): ) callable_maker = ctypes.PYFUNCTYPE(ctypes.c_void_p, ctypes.py_object) get_ptr_fn = callable_maker(mem_ptr_fn_ptr) + get_opaque_ptr_fn = callable_maker(mem_opaque_ptr_fn_ptr) get_ctx_ref_fn = callable_maker(mem_ctx_ref_fn_ptr) get_q_ref_fn = callable_maker(mem_q_ref_fn_ptr) get_nby_fn = callable_maker(mem_nby_fn_ptr) @@ -586,6 +591,8 @@ def test_cpython_api(memory_ctor): capi_ptr = get_ptr_fn(mobj) direct_ptr = mobj._pointer assert capi_ptr == direct_ptr + capi_opaque_ptr = get_opaque_ptr_fn(mobj) + assert capi_opaque_ptr != 0 capi_ctx_ref = get_ctx_ref_fn(mobj) direct_ctx_ref = mobj._context.addressof_ref() assert capi_ctx_ref == direct_ctx_ref diff --git a/dpctl/tests/test_usm_ndarray_ctor.py b/dpctl/tests/test_usm_ndarray_ctor.py index 9a47c1f184..33a95af849 100644 --- a/dpctl/tests/test_usm_ndarray_ctor.py +++ b/dpctl/tests/test_usm_ndarray_ctor.py @@ -709,6 +709,25 @@ def test_pyx_capi_get_offset(): assert offset == X.__sycl_usm_array_interface__["offset"] +def test_pyx_capi_get_usmdata(): + try: + X = dpt.usm_ndarray(17, dtype="u2")[1::2] + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + get_usmdata_fn = _pyx_capi_fnptr_to_callable( + X, + "UsmNDArray_GetUSMData", + b"PyObject *(struct PyUSMArrayObject *)", + fn_restype=ctypes.py_object, + fn_argtypes=(ctypes.py_object,), + ) + capi_usm_data = get_usmdata_fn(X) + assert isinstance(capi_usm_data, dpm._memory._Memory) + assert capi_usm_data.nbytes == X.usm_data.nbytes + assert capi_usm_data._pointer == X.usm_data._pointer + assert capi_usm_data.sycl_queue == X.usm_data.sycl_queue + + def test_pyx_capi_get_queue_ref(): try: X = dpt.usm_ndarray(17, dtype="i2")[1::2] diff --git a/dpctl/tests/test_usm_ndarray_manipulation.py b/dpctl/tests/test_usm_ndarray_manipulation.py index 1e9c30d9cf..b64f68cbb8 100644 --- a/dpctl/tests/test_usm_ndarray_manipulation.py +++ b/dpctl/tests/test_usm_ndarray_manipulation.py @@ -698,6 +698,14 @@ def test_concat_different_dtype(): assert XY.shape == (5, 2) assert XY.sycl_queue == q + X1 = dpt.arange(10, dtype="i2", sycl_queue=q) + Y1 = dpt.arange(5, dtype="i4", sycl_queue=q) + + XY1 = dpt.concat([X1[::2], Y1[::-1]], axis=None) + assert XY1.shape == (10,) + assert XY1.sycl_queue == q + assert XY1.dtype == Y1.dtype + def test_concat_incorrect_ndim(): q = get_queue_or_skip() diff --git a/dpctl/tests/test_usm_ndarray_print.py b/dpctl/tests/test_usm_ndarray_print.py index 06b7ad6672..9e15fa3310 100644 --- a/dpctl/tests/test_usm_ndarray_print.py +++ b/dpctl/tests/test_usm_ndarray_print.py @@ -260,8 +260,8 @@ def test_usm_ndarray_str_separator(self): def test_print_repr(self): q = get_queue_or_skip() - x = dpt.asarray(0, dtype="int64", sycl_queue=q) - assert repr(x) == "usm_ndarray(0)" + x = dpt.asarray(3, dtype="int64", sycl_queue=q) + assert repr(x) == "usm_ndarray(3)" x = dpt.asarray([np.nan, np.inf], sycl_queue=q) if x.sycl_device.has_aspect_fp64: @@ -281,7 +281,10 @@ def test_print_repr(self): ) x = dpt.arange(4, dtype="i4", sycl_queue=q) - assert repr(x) == "usm_ndarray([0, 1, 2, 3], dtype=int32)" + x.sycl_queue.wait() + r = repr(x) + print(r) + assert r == "usm_ndarray([0, 1, 2, 3], dtype=int32)" dpt.set_print_options(linewidth=1) np.testing.assert_equal( diff --git a/dpctl/tests/test_usm_ndarray_searchsorted.py b/dpctl/tests/test_usm_ndarray_searchsorted.py index 0e65fcc235..47dd97bc87 100644 
--- a/dpctl/tests/test_usm_ndarray_searchsorted.py
+++ b/dpctl/tests/test_usm_ndarray_searchsorted.py
@@ -267,11 +267,14 @@ def test_searchsorted_coerce():
     x1_i4 = dpt.arange(5, dtype="i4")
     x1_i8 = dpt.arange(5, dtype="i8")
+    x2_i4 = dpt.arange(5, dtype="i4")
     x2_i8 = dpt.arange(5, dtype="i8")
 
     p1 = dpt.searchsorted(x1_i4, x2_i8)
     p2 = dpt.searchsorted(x1_i8, x2_i8)
+    p3 = dpt.searchsorted(x1_i8, x2_i4)
 
     assert dpt.all(p1 == p2)
+    assert dpt.all(p2 == p3)
 
 
 def test_searchsorted_validation():
diff --git a/dpctl/tests/test_utils.py b/dpctl/tests/test_utils.py
index 4966f42b55..77735fdd47 100644
--- a/dpctl/tests/test_utils.py
+++ b/dpctl/tests/test_utils.py
@@ -149,3 +149,31 @@ def test_intel_device_info():
         test = descriptor_name in allowed_names
         err_msg = f"Key '{descriptor_name}' is not recognized"
         assert test, err_msg
+
+
+def test_order_manager():
+    try:
+        q = dpctl.SyclQueue()
+    except dpctl.SyclQueueCreationError:
+        pytest.skip("Queue could not be created for default-selected device")
+    _som = dpctl.utils.SequentialOrderManager
+    _mngr = _som[q]
+    assert isinstance(_mngr.num_host_task_events, int)
+    assert isinstance(_mngr.num_submitted_events, int)
+    assert isinstance(_mngr.submitted_events, list)
+    assert isinstance(_mngr.host_task_events, list)
+    _mngr.add_event_pair(dpctl.SyclEvent(), dpctl.SyclEvent())
+    _mngr.add_event_pair([dpctl.SyclEvent()], dpctl.SyclEvent())
+    _mngr.add_event_pair(dpctl.SyclEvent(), [dpctl.SyclEvent()])
+    _mngr.wait()
+    cpy = _mngr.__copy__()
+    _som.clear()
+    del cpy
+
+    try:
+        _passed = False
+        _som[None]
+    except TypeError:
+        _passed = True
+    finally:
+        assert _passed
diff --git a/dpctl/utils/CMakeLists.txt b/dpctl/utils/CMakeLists.txt
index 04a9eb2a4d..2a8494032a 100644
--- a/dpctl/utils/CMakeLists.txt
+++ b/dpctl/utils/CMakeLists.txt
@@ -36,3 +36,37 @@ if(_dpctl_sycl_targets)
 endif()
 target_link_libraries(${python_module_name} PRIVATE DpctlCAPI)
 install(TARGETS ${python_module_name} DESTINATION "dpctl/utils")
+
+
+set(python_module_name _seq_order_keeper)
+set(_module_src ${CMAKE_CURRENT_SOURCE_DIR}/src/order_keeper.cpp)
+pybind11_add_module(${python_module_name} MODULE
+    ${_module_src}
+)
+target_include_directories(${python_module_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src)
+add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src})
+if(DPCTL_GENERATE_COVERAGE)
+    if(DPCTL_GENERATE_COVERAGE_FOR_PYBIND11_EXTENSIONS)
+        target_compile_options(${python_module_name}
+            PRIVATE -fprofile-instr-generate -fcoverage-mapping
+        )
+    endif()
+    target_link_options(${python_module_name}
+        PRIVATE -fprofile-instr-generate -fcoverage-mapping
+    )
+endif()
+if(_dpctl_sycl_targets)
+    # make fat binary
+    target_compile_options(
+        ${python_module_name}
+        PRIVATE
+        -fsycl-targets=${_dpctl_sycl_targets}
+    )
+    target_link_options(
+        ${python_module_name}
+        PRIVATE
+        -fsycl-targets=${_dpctl_sycl_targets}
+    )
+endif()
+target_link_libraries(${python_module_name} PRIVATE DpctlCAPI)
+install(TARGETS ${python_module_name} DESTINATION "dpctl/utils")
diff --git a/dpctl/utils/__init__.py b/dpctl/utils/__init__.py
index e0c14a263d..04a4efa251 100644
--- a/dpctl/utils/__init__.py
+++ b/dpctl/utils/__init__.py
@@ -18,106 +18,15 @@
 A collection of utility functions.
""" -from .._sycl_device import SyclDevice from ._compute_follows_data import ( ExecutionPlacementError, get_coerced_usm_type, get_execution_queue, validate_usm_type, ) -from ._device_queries import ( - intel_device_info_device_id, - intel_device_info_free_memory, - intel_device_info_gpu_eu_count, - intel_device_info_gpu_eu_count_per_subslice, - intel_device_info_gpu_eu_simd_width, - intel_device_info_gpu_hw_threads_per_eu, - intel_device_info_gpu_slices, - intel_device_info_gpu_subslices_per_slice, - intel_device_info_max_mem_bandwidth, - intel_device_info_memory_bus_width, - intel_device_info_memory_clock_rate, -) +from ._intel_device_info import intel_device_info from ._onetrace_context import onetrace_enabled - - -def intel_device_info(dev, /): - """intel_device_info(sycl_device) - - For Intel(R) GPU devices returns a dictionary - with device architectural details, and an empty - dictionary otherwise. The dictionary contains - the following keys: - - device_id: - 32-bits device PCI identifier - gpu_eu_count: - Total number of execution units - gpu_hw_threads_per_eu: - Number of thread contexts in EU - gpu_eu_simd_width: - Physical SIMD width of EU - gpu_slices: - Total number of slices - gpu_subslices_per_slice: - Number of sub-slices per slice - gpu_eu_count_per_subslice: - Number of EUs in subslice - max_mem_bandwidth: - Maximum memory bandwidth in bytes/second - free_memory: - Global memory available on the device in units of bytes - - Unsupported descriptors are omitted from the dictionary. - - Descriptors other than the PCI identifier are supported only - for :class:`.SyclDevices` with Level-Zero backend. - - .. note:: - Environment variable ``ZES_ENABLE_SYSMAN`` may need to be set - to ``1`` for the ``"free_memory"`` key to be reported. - """ - if not isinstance(dev, SyclDevice): - raise TypeError(f"Expected dpctl.SyclDevice, got {type(dev)}") - dev_id = intel_device_info_device_id(dev) - if dev_id: - res = { - "device_id": dev_id, - } - if dev.has_aspect_gpu: - eu_count = intel_device_info_gpu_eu_count(dev) - if eu_count: - res["gpu_eu_count"] = eu_count - hw_threads = intel_device_info_gpu_hw_threads_per_eu(dev) - if hw_threads: - res["gpu_hw_threads_per_eu"] = hw_threads - simd_w = intel_device_info_gpu_eu_simd_width(dev) - if simd_w: - res["gpu_eu_simd_width"] = simd_w - n_slices = intel_device_info_gpu_slices(dev) - if n_slices: - res["gpu_slices"] = n_slices - n_subslices = intel_device_info_gpu_subslices_per_slice(dev) - if n_subslices: - res["gpu_subslices_per_slice"] = n_subslices - n_eu_per_subslice = intel_device_info_gpu_eu_count_per_subslice(dev) - if n_eu_per_subslice: - res["gpu_eu_count_per_subslice"] = n_eu_per_subslice - bw = intel_device_info_max_mem_bandwidth(dev) - if bw: - res["max_mem_bandwidth"] = bw - fm = intel_device_info_free_memory(dev) - if fm: - res["free_memory"] = fm - mcr = intel_device_info_memory_clock_rate(dev) - if mcr: - res["memory_clock_rate"] = mcr - mbw = intel_device_info_memory_bus_width(dev) - if mbw: - res["memory_bus_width"] = mbw - return res - return dict() - +from ._order_manager import SequentialOrderManager __all__ = [ "get_execution_queue", @@ -126,4 +35,5 @@ def intel_device_info(dev, /): "onetrace_enabled", "intel_device_info", "ExecutionPlacementError", + "SequentialOrderManager", ] diff --git a/dpctl/utils/_intel_device_info.py b/dpctl/utils/_intel_device_info.py new file mode 100644 index 0000000000..5c83a05261 --- /dev/null +++ b/dpctl/utils/_intel_device_info.py @@ -0,0 +1,113 @@ +# Data Parallel Control (dpctl) +# +# 
Copyright 2020-2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .._sycl_device import SyclDevice +from ._device_queries import ( + intel_device_info_device_id, + intel_device_info_free_memory, + intel_device_info_gpu_eu_count, + intel_device_info_gpu_eu_count_per_subslice, + intel_device_info_gpu_eu_simd_width, + intel_device_info_gpu_hw_threads_per_eu, + intel_device_info_gpu_slices, + intel_device_info_gpu_subslices_per_slice, + intel_device_info_max_mem_bandwidth, + intel_device_info_memory_bus_width, + intel_device_info_memory_clock_rate, +) + + +def intel_device_info(dev, /): + """intel_device_info(sycl_device) + + For Intel(R) GPU devices returns a dictionary + with device architectural details, and an empty + dictionary otherwise. The dictionary contains + the following keys: + + device_id: + 32-bits device PCI identifier + gpu_eu_count: + Total number of execution units + gpu_hw_threads_per_eu: + Number of thread contexts in EU + gpu_eu_simd_width: + Physical SIMD width of EU + gpu_slices: + Total number of slices + gpu_subslices_per_slice: + Number of sub-slices per slice + gpu_eu_count_per_subslice: + Number of EUs in subslice + max_mem_bandwidth: + Maximum memory bandwidth in bytes/second + free_memory: + Global memory available on the device in units of bytes + + Unsupported descriptors are omitted from the dictionary. + + Descriptors other than the PCI identifier are supported only + for :class:`.SyclDevices` with Level-Zero backend. + + .. note:: + Environment variable ``ZES_ENABLE_SYSMAN`` may need to be set + to ``1`` for the ``"free_memory"`` key to be reported. 
+ """ + if not isinstance(dev, SyclDevice): + raise TypeError(f"Expected dpctl.SyclDevice, got {type(dev)}") + dev_id = intel_device_info_device_id(dev) + if dev_id: + res = { + "device_id": dev_id, + } + if dev.has_aspect_gpu: + eu_count = intel_device_info_gpu_eu_count(dev) + if eu_count: + res["gpu_eu_count"] = eu_count + hw_threads = intel_device_info_gpu_hw_threads_per_eu(dev) + if hw_threads: + res["gpu_hw_threads_per_eu"] = hw_threads + simd_w = intel_device_info_gpu_eu_simd_width(dev) + if simd_w: + res["gpu_eu_simd_width"] = simd_w + n_slices = intel_device_info_gpu_slices(dev) + if n_slices: + res["gpu_slices"] = n_slices + n_subslices = intel_device_info_gpu_subslices_per_slice(dev) + if n_subslices: + res["gpu_subslices_per_slice"] = n_subslices + n_eu_per_subslice = intel_device_info_gpu_eu_count_per_subslice(dev) + if n_eu_per_subslice: + res["gpu_eu_count_per_subslice"] = n_eu_per_subslice + bw = intel_device_info_max_mem_bandwidth(dev) + if bw: + res["max_mem_bandwidth"] = bw + fm = intel_device_info_free_memory(dev) + if fm: + res["free_memory"] = fm + mcr = intel_device_info_memory_clock_rate(dev) + if mcr: + res["memory_clock_rate"] = mcr + mbw = intel_device_info_memory_bus_width(dev) + if mbw: + res["memory_bus_width"] = mbw + return res + return dict() + + +__all__ = [ + "intel_device_info", +] diff --git a/dpctl/utils/_order_manager.py b/dpctl/utils/_order_manager.py new file mode 100644 index 0000000000..8938b17365 --- /dev/null +++ b/dpctl/utils/_order_manager.py @@ -0,0 +1,94 @@ +from collections import defaultdict +from contextvars import ContextVar + +from .._sycl_event import SyclEvent +from .._sycl_queue import SyclQueue +from ._seq_order_keeper import _OrderManager + + +class _SequentialOrderManager: + """ + Class to orchestrate default sequential order + of the tasks offloaded from Python. 
+ """ + + def __init__(self): + self._state = _OrderManager(16) + + def __dealloc__(self): + _local = self._state + SyclEvent.wait_for(_local.get_submitted_events()) + SyclEvent.wait_for(_local.get_host_task_events()) + + def add_event_pair(self, host_task_ev, comp_ev): + _local = self._state + if isinstance(host_task_ev, SyclEvent) and isinstance( + comp_ev, SyclEvent + ): + _local.add_to_both_events(host_task_ev, comp_ev) + else: + if not isinstance(host_task_ev, (list, tuple)): + host_task_ev = (host_task_ev,) + if not isinstance(comp_ev, (list, tuple)): + comp_ev = (comp_ev,) + _local.add_vector_to_both_events(host_task_ev, comp_ev) + + @property + def num_host_task_events(self): + _local = self._state + return _local.get_num_host_task_events() + + @property + def num_submitted_events(self): + _local = self._state + return _local.get_num_submitted_events() + + @property + def host_task_events(self): + _local = self._state + return _local.get_host_task_events() + + @property + def submitted_events(self): + _local = self._state + return _local.get_submitted_events() + + def wait(self): + _local = self._state + return _local.wait() + + def __copy__(self): + res = _SequentialOrderManager.__new__(_SequentialOrderManager) + res._state = _OrderManager(self._state) + return res + + +class SyclQueueToOrderManagerMap: + """Utility class to ensure sequential ordering of offloaded + tasks issued by dpctl.tensor functions""" + + def __init__(self): + self._map = ContextVar( + "global_order_manager_map", + default=defaultdict(_SequentialOrderManager), + ) + + def __getitem__(self, q: SyclQueue) -> _SequentialOrderManager: + """Get order manager for given SyclQueue""" + _local = self._map.get() + if not isinstance(q, SyclQueue): + raise TypeError(f"Expected `dpctl.SyclQueue`, got {type(q)}") + if q in _local: + return _local[q] + else: + v = _local[q] + _local[q] = v + return v + + def clear(self): + """Clear content of internal dictionary""" + _local = self._map.get() + _local.clear() + + +SequentialOrderManager = SyclQueueToOrderManagerMap() diff --git a/dpctl/utils/src/order_keeper.cpp b/dpctl/utils/src/order_keeper.cpp new file mode 100644 index 0000000000..f54b21780c --- /dev/null +++ b/dpctl/utils/src/order_keeper.cpp @@ -0,0 +1,29 @@ +#include "dpctl4pybind11.hpp" +#include +#include + +#include "sequential_order_keeper.hpp" +#include + +PYBIND11_MODULE(_seq_order_keeper, m) +{ + py::class_(m, "_OrderManager") + .def(py::init()) + .def(py::init<>()) + .def(py::init()) + .def("get_num_submitted_events", + &SequentialOrder::get_num_submitted_events) + .def("get_num_host_task_events", + &SequentialOrder::get_num_host_task_events) + .def("get_submitted_events", &SequentialOrder::get_submitted_events) + .def("get_host_task_events", &SequentialOrder::get_host_task_events) + .def("add_to_both_events", &SequentialOrder::add_to_both_events) + .def("add_vector_to_both_events", + &SequentialOrder::add_vector_to_both_events) + .def("add_to_host_task_events", + &SequentialOrder::add_to_host_task_events) + .def("add_to_submitted_events", + &SequentialOrder::add_to_submitted_events) + .def("wait", &SequentialOrder::wait, + py::call_guard()); +} diff --git a/dpctl/utils/src/sequential_order_keeper.hpp b/dpctl/utils/src/sequential_order_keeper.hpp new file mode 100644 index 0000000000..0acc4a8b87 --- /dev/null +++ b/dpctl/utils/src/sequential_order_keeper.hpp @@ -0,0 +1,181 @@ +#pragma once +#include + +#include +#include + +namespace +{ +bool is_event_complete(const sycl::event &e) +{ + constexpr auto 
exec_complete = sycl::info::event_command_status::complete;
+
+    const auto status =
+        e.get_info<sycl::info::event::command_execution_status>();
+    return (status == exec_complete);
+}
+} // namespace
+
+class SequentialOrder
+{
+private:
+    std::vector<sycl::event> host_task_events;
+    std::vector<sycl::event> submitted_events;
+
+    void prune_complete()
+    {
+        const auto &ht_it =
+            std::remove_if(host_task_events.begin(), host_task_events.end(),
+                           is_event_complete);
+        host_task_events.erase(ht_it, host_task_events.end());
+
+        const auto &sub_it =
+            std::remove_if(submitted_events.begin(), submitted_events.end(),
+                           is_event_complete);
+        submitted_events.erase(sub_it, submitted_events.end());
+    }
+
+public:
+    SequentialOrder() : host_task_events{}, submitted_events{} {}
+    SequentialOrder(size_t n) : host_task_events{}, submitted_events{}
+    {
+        host_task_events.reserve(n);
+        submitted_events.reserve(n);
+    }
+
+    SequentialOrder(const SequentialOrder &other)
+        : host_task_events(other.host_task_events),
+          submitted_events(other.submitted_events)
+    {
+        prune_complete();
+    }
+    SequentialOrder(SequentialOrder &&other)
+        : host_task_events{}, submitted_events{}
+    {
+        host_task_events = std::move(other.host_task_events);
+        submitted_events = std::move(other.submitted_events);
+        prune_complete();
+    }
+
+    SequentialOrder &operator=(const SequentialOrder &other)
+    {
+        host_task_events = other.host_task_events;
+        submitted_events = other.submitted_events;
+
+        prune_complete();
+        return *this;
+    }
+
+    SequentialOrder &operator=(SequentialOrder &&other)
+    {
+        if (this != &other) {
+            host_task_events = std::move(other.host_task_events);
+            submitted_events = std::move(other.submitted_events);
+            prune_complete();
+        }
+        return *this;
+    }
+
+    size_t get_num_submitted_events() const
+    {
+        return submitted_events.size();
+    }
+
+    const std::vector<sycl::event> &get_host_task_events()
+    {
+        prune_complete();
+        return host_task_events;
+    }
+
+    /*
+    const std::vector<sycl::event> & get_host_task_events() const {
+        return host_task_events;
+    }
+    */
+
+    size_t get_num_host_task_events() const
+    {
+        return host_task_events.size();
+    }
+
+    const std::vector<sycl::event> &get_submitted_events()
+    {
+        prune_complete();
+        return submitted_events;
+    }
+
+    /*
+    const std::vector<sycl::event> & get_submitted_events() const {
+        return submitted_events;
+    }
+    */
+
+    void add_to_both_events(const sycl::event &ht_ev,
+                            const sycl::event &comp_ev)
+    {
+        prune_complete();
+        if (!is_event_complete(ht_ev))
+            host_task_events.push_back(ht_ev);
+        if (!is_event_complete(comp_ev))
+            submitted_events.push_back(comp_ev);
+    }
+
+    void add_vector_to_both_events(const std::vector<sycl::event> &ht_evs,
+                                   const std::vector<sycl::event> &comp_evs)
+    {
+        prune_complete();
+        for (const auto &e : ht_evs) {
+            if (!is_event_complete(e))
+                host_task_events.push_back(e);
+        }
+        for (const auto &e : comp_evs) {
+            if (!is_event_complete(e))
+                submitted_events.push_back(e);
+        }
+    }
+
+    void add_to_host_task_events(const sycl::event &ht_ev)
+    {
+        prune_complete();
+        if (!is_event_complete(ht_ev)) {
+            host_task_events.push_back(ht_ev);
+        }
+    }
+
+    void add_to_submitted_events(const sycl::event &comp_ev)
+    {
+        prune_complete();
+        if (!is_event_complete(comp_ev)) {
+            submitted_events.push_back(comp_ev);
+        }
+    }
+
+    template <std::size_t num>
+    void add_list_to_host_task_events(const sycl::event (&ht_events)[num])
+    {
+        prune_complete();
+        for (size_t i = 0; i < num; ++i) {
+            const auto &e = ht_events[i];
+            if (!is_event_complete(e))
+                host_task_events.push_back(e);
+        }
+    }
+
+    template <std::size_t num>
+    void add_list_to_submitted_events(const sycl::event (&comp_events)[num])
+    {
+        prune_complete();
+        for (size_t i = 0; i < num; ++i) {
+            
const auto &e = comp_events[i]; + if (!is_event_complete(e)) + submitted_events.push_back(e); + } + } + + void wait() + { + sycl::event::wait(submitted_events); + sycl::event::wait(host_task_events); + prune_complete(); + } +}; diff --git a/examples/cython/usm_memory/blackscholes/blackscholes.pyx b/examples/cython/usm_memory/blackscholes/blackscholes.pyx index b7d37993f3..6d2d2bbbe5 100644 --- a/examples/cython/usm_memory/blackscholes/blackscholes.pyx +++ b/examples/cython/usm_memory/blackscholes/blackscholes.pyx @@ -106,11 +106,15 @@ def black_scholes_price(c_dpt.usm_ndarray option_params_arr): call_put_prices = dpt.empty((n_opts, 2), dtype='d', sycl_queue=q) dp1 = option_params_arr.get_data() dp2 = call_put_prices.get_data() + # ensure content of dp1 and dp2 is no longer worked on + exec_q_ptr[0].wait() cpp_blackscholes[double](exec_q_ptr[0], n_opts, dp1, dp2) elif (typenum_ == c_dpt.UAR_FLOAT): call_put_prices = dpt.empty((n_opts, 2), dtype='f', sycl_queue=q) fp1 = option_params_arr.get_data() fp2 = call_put_prices.get_data() + # ensure content of fp1 and fp2 is no longer worked on + exec_q_ptr[0].wait() cpp_blackscholes[float](exec_q_ptr[0], n_opts, fp1, fp2) else: raise ValueError("Unsupported data-type") @@ -196,11 +200,13 @@ def populate_params( if (typenum_ == c_dpt.UAR_DOUBLE): dp = option_params_arr.get_data() + exec_q_ptr[0].wait() cpp_populate_params[double]( exec_q_ptr[0], n_opts, dp, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, seed ) elif (typenum_ == c_dpt.UAR_FLOAT): fp = option_params_arr.get_data() + exec_q_ptr[0].wait() cpp_populate_params[float]( exec_q_ptr[0], n_opts, fp, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, seed ) diff --git a/examples/cython/usm_memory/src/sycl_blackscholes.hpp b/examples/cython/usm_memory/src/sycl_blackscholes.hpp index db7b1e3e31..376b0f54c5 100644 --- a/examples/cython/usm_memory/src/sycl_blackscholes.hpp +++ b/examples/cython/usm_memory/src/sycl_blackscholes.hpp @@ -47,7 +47,7 @@ constexpr int CALL = 0; constexpr int PUT = 1; template -void cpp_blackscholes(sycl::queue q, size_t n_opts, T *params, T *callput) +void cpp_blackscholes(sycl::queue &q, size_t n_opts, T *params, T *callput) { using data_t = T; @@ -57,8 +57,7 @@ void cpp_blackscholes(sycl::queue q, size_t n_opts, T *params, T *callput) data_t half = one / two; cgh.parallel_for>( - sycl::range<1>(n_opts), - [=](sycl::id<1> idx) + sycl::range<1>(n_opts), [=](sycl::id<1> idx) { const size_t i = n_params * idx[0]; const data_t opt_price = params[i + PRICE]; @@ -106,7 +105,7 @@ void cpp_blackscholes(sycl::queue q, size_t n_opts, T *params, T *callput) const size_t callput_i = n_prices * idx[0]; callput[callput_i + CALL] = call_price; callput[callput_i + PUT] = put_price; - }); + }); }); e.wait_and_throw(); diff --git a/examples/pybind11/external_usm_allocation/external_usm_allocation/_usm_alloc_example.cpp b/examples/pybind11/external_usm_allocation/external_usm_allocation/_usm_alloc_example.cpp index 11f2d2c531..2625cd4ba4 100644 --- a/examples/pybind11/external_usm_allocation/external_usm_allocation/_usm_alloc_example.cpp +++ b/examples/pybind11/external_usm_allocation/external_usm_allocation/_usm_alloc_example.cpp @@ -114,6 +114,9 @@ py::dict construct_sua_iface(DMatrix &m) iface["typestr"] = "|f8"; iface["syclobj"] = syclobj; + // ensure that content of array is flushed out + m.get_queue().wait(); + return iface; } diff --git a/examples/pybind11/onemkl_gemv/solve.py b/examples/pybind11/onemkl_gemv/solve.py index 8823097613..6c350ab801 100644 --- a/examples/pybind11/onemkl_gemv/solve.py 
+++ b/examples/pybind11/onemkl_gemv/solve.py @@ -127,12 +127,14 @@ def cg_solve(A, b): converged is False if solver has not converged, or the iteration number """ exec_queue = A.sycl_queue + exec_queue.wait() + x = dpt.zeros_like(b) Ap = dpt.empty_like(x) all_host_tasks = [] - r = dpt.copy(b) # synchronous copy - p = dpt.copy(b) # synchronous copy + r = dpt.copy(b) + p = dpt.copy(b) rsold = sycl_gemm.norm_squared_blocking(exec_queue, r) if rsold < 1e-20: diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp index 10eb794aba..1ebf5e7bff 100644 --- a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp +++ b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp @@ -41,7 +41,7 @@ namespace py = pybind11; using dpctl::utils::keep_args_alive; std::pair -py_gemv(sycl::queue q, +py_gemv(sycl::queue &q, dpctl::tensor::usm_ndarray matrix, dpctl::tensor::usm_ndarray vector, dpctl::tensor::usm_ndarray result, diff --git a/examples/pybind11/use_dpctl_sycl_kernel/tests/test_user_kernel.py b/examples/pybind11/use_dpctl_sycl_kernel/tests/test_user_kernel.py index 085885810f..ad6bfb90f7 100644 --- a/examples/pybind11/use_dpctl_sycl_kernel/tests/test_user_kernel.py +++ b/examples/pybind11/use_dpctl_sycl_kernel/tests/test_user_kernel.py @@ -64,6 +64,7 @@ def test_kernel_submit_through_extension(): x = dpt.arange(0, stop=13, step=1, dtype="i4", sycl_queue=q) y = dpt.zeros_like(x) + q.wait() uk.submit_custom_kernel(q, krn, x, y, []) assert np.array_equal(dpt.asnumpy(y), np.arange(0, 26, step=2, dtype="i4")) diff --git a/examples/pybind11/use_dpctl_sycl_kernel/use_kernel/_example.cpp b/examples/pybind11/use_dpctl_sycl_kernel/use_kernel/_example.cpp index 98a5414e9f..92d0066db2 100644 --- a/examples/pybind11/use_dpctl_sycl_kernel/use_kernel/_example.cpp +++ b/examples/pybind11/use_dpctl_sycl_kernel/use_kernel/_example.cpp @@ -35,8 +35,8 @@ namespace py = pybind11; -void submit_custom_kernel(sycl::queue q, - sycl::kernel krn, +void submit_custom_kernel(sycl::queue &q, + sycl::kernel &krn, dpctl::tensor::usm_ndarray x, dpctl::tensor::usm_ndarray y, const std::vector &depends = {})
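
The call sites converted in this patch all follow the same three-step pattern: look up the per-queue manager, pass its `submitted_events` as `depends=`, then register the returned (host-task event, computation event) pair. A minimal sketch of that pattern, kept separate from the patch itself, is given below; the helper name `_copy_into` is illustrative only, while `dpctl.utils.SequentialOrderManager`, `submitted_events`, `add_event_pair`, `wait` and `ti._copy_usm_ndarray_into_usm_ndarray` are the interfaces exercised by this diff.

    # Minimal sketch of the SequentialOrderManager usage pattern introduced
    # by this patch; `_copy_into` is an illustrative helper, not dpctl API.
    import dpctl
    import dpctl.tensor as dpt
    import dpctl.tensor._tensor_impl as ti
    import dpctl.utils as du


    def _copy_into(dst, src):
        """Asynchronous copy ordered through the per-queue manager."""
        q = dst.sycl_queue
        _manager = du.SequentialOrderManager[q]
        # tasks already submitted on this queue become dependencies
        dep_evs = _manager.submitted_events
        ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
            src=src, dst=dst, sycl_queue=q, depends=dep_evs
        )
        # record the pair so subsequent calls order themselves after it
        _manager.add_event_pair(ht_ev, copy_ev)
        return dst


    q = dpctl.SyclQueue()
    x = dpt.arange(10, dtype="i4", sycl_queue=q)
    y = dpt.zeros_like(x)
    _copy_into(y, x)
    # synchronize before the host reads the result
    du.SequentialOrderManager[q].wait()

Compared with the removed `host_tasks_list` bookkeeping and trailing `dpctl.SyclEvent.wait_for(...)` calls, the manager keeps both event lists per queue and prunes completed events on access, so producers only read `submitted_events` and call `add_event_pair`, and consumers that need host-visible data call `wait()` on the same manager.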