From a51973c7704470e5a53c2893ddc4d830b932d005 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 25 Apr 2024 20:39:13 -0500 Subject: [PATCH 01/25] Change _Memory object's memory ownership model The _Memory object acquires additional field void * _opaque_ptr. This pointer is the pointer to std::shared_ptr with a deleter class to delete USM allocations. The pointer is opaque to retain C-API of Py_MemoryObject. If _opaque_ptr is NULL, refobj field is assumed to be responsible for USM deallocation. api Memory_GetOpaquePointer is added to access the opaque pointer. A test in test_sycl_usm is modified to reflect factual changes in the behavior of _Memory type. --- dpctl/_sycl_queue.pyx | 8 +-- dpctl/memory/CMakeLists.txt | 12 ++-- dpctl/memory/_memory.pxd | 5 +- dpctl/memory/_memory.pyx | 100 +++++++++++++++++---------- dpctl/memory/_opaque_smart_ptr.hpp | 106 +++++++++++++++++++++++++++++ dpctl/tensor/_usmarray.pyx | 7 +- dpctl/tests/test_sycl_usm.py | 2 +- 7 files changed, 189 insertions(+), 51 deletions(-) create mode 100644 dpctl/memory/_opaque_smart_ptr.hpp diff --git a/dpctl/_sycl_queue.pyx b/dpctl/_sycl_queue.pyx index e85e35c29a..083ca3c6dc 100644 --- a/dpctl/_sycl_queue.pyx +++ b/dpctl/_sycl_queue.pyx @@ -342,7 +342,7 @@ cdef DPCTLSyclEventRef _memcpy_impl( cdef unsigned char[::1] dst_host_buf = None if isinstance(src, _Memory): - c_src_ptr = (<_Memory>src).memory_ptr + c_src_ptr = (<_Memory>src).get_data_ptr() elif _is_buffer(src): src_host_buf = src c_src_ptr = &src_host_buf[0] @@ -354,7 +354,7 @@ cdef DPCTLSyclEventRef _memcpy_impl( ) if isinstance(dst, _Memory): - c_dst_ptr = (<_Memory>dst).memory_ptr + c_dst_ptr = (<_Memory>dst).get_data_ptr() elif _is_buffer(dst): dst_host_buf = dst c_dst_ptr = &dst_host_buf[0] @@ -1265,7 +1265,7 @@ cdef class SyclQueue(_SyclQueue): cdef DPCTLSyclEventRef ERef = NULL if isinstance(mem, _Memory): - ptr = (<_Memory>mem).memory_ptr + ptr = (<_Memory>mem).get_data_ptr() else: raise TypeError("Parameter `mem` should have type _Memory") @@ -1285,7 +1285,7 @@ cdef class SyclQueue(_SyclQueue): cdef DPCTLSyclEventRef ERef = NULL if isinstance(mem, _Memory): - ptr = (<_Memory>mem).memory_ptr + ptr = (<_Memory>mem).get_data_ptr() else: raise TypeError("Parameter `mem` should have type _Memory") diff --git a/dpctl/memory/CMakeLists.txt b/dpctl/memory/CMakeLists.txt index 68f57ce35c..cccc30b505 100644 --- a/dpctl/memory/CMakeLists.txt +++ b/dpctl/memory/CMakeLists.txt @@ -1,7 +1,7 @@ -file(GLOB _cython_sources *.pyx) -foreach(_cy_file ${_cython_sources}) - get_filename_component(_trgt ${_cy_file} NAME_WLE) - build_dpctl_ext(${_trgt} ${_cy_file} "dpctl/memory") - target_link_libraries(DpctlCAPI INTERFACE ${_trgt}_headers) -endforeach() +set(_cy_file ${CMAKE_CURRENT_SOURCE_DIR}/_memory.pyx) +get_filename_component(_trgt ${_cy_file} NAME_WLE) +build_dpctl_ext(${_trgt} ${_cy_file} "dpctl/memory" SYCL) +# _memory include _opaque_smart_ptr.hpp +target_include_directories(${_trgt} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) +target_link_libraries(DpctlCAPI INTERFACE ${_trgt}_headers) diff --git a/dpctl/memory/_memory.pxd b/dpctl/memory/_memory.pxd index db8c18bd9c..4b3c6bb188 100644 --- a/dpctl/memory/_memory.pxd +++ b/dpctl/memory/_memory.pxd @@ -33,7 +33,8 @@ cdef DPCTLSyclQueueRef get_queue_ref_from_ptr_and_syclobj( cdef public api class _Memory [object Py_MemoryObject, type Py_MemoryType]: - cdef DPCTLSyclUSMRef memory_ptr + cdef DPCTLSyclUSMRef _memory_ptr + cdef void* _opaque_ptr cdef Py_ssize_t nbytes cdef SyclQueue queue cdef object refobj @@ -50,6 +51,8 @@ 
cdef public api class _Memory [object Py_MemoryObject, type Py_MemoryType]: cpdef memset(self, unsigned short val=*) cpdef bytes tobytes(self) + cdef DPCTLSyclUSMRef get_data_ptr(self) + cdef void * get_opaque_ptr(self) @staticmethod cdef SyclDevice get_pointer_device( diff --git a/dpctl/memory/_memory.pyx b/dpctl/memory/_memory.pyx index df5d360e2a..fce65984e2 100644 --- a/dpctl/memory/_memory.pyx +++ b/dpctl/memory/_memory.pyx @@ -78,6 +78,12 @@ __all__ = [ include "_sycl_usm_array_interface_utils.pxi" +cdef extern from "_opaque_smart_ptr.hpp": + void * OpaqueSmartPtr_Make(void *, DPCTLSyclQueueRef) nogil + void * OpaqueSmartPtr_Copy(void *) nogil + void OpaqueSmartPtr_Delete(void *) nogil + void * OpaqueSmartPtr_Get(void *) nogil + class USMAllocationError(Exception): """ An exception raised when Universal Shared Memory (USM) allocation @@ -152,7 +158,8 @@ cdef class _Memory: MemoryUSMShared, MemoryUSMDevice, MemoryUSMHost """ cdef _cinit_empty(self): - self.memory_ptr = NULL + self._memory_ptr = NULL + self._opaque_ptr = NULL self.nbytes = 0 self.queue = None self.refobj = None @@ -198,7 +205,8 @@ cdef class _Memory: ) if (p): - self.memory_ptr = p + self._memory_ptr = p + self._opaque_ptr = OpaqueSmartPtr_Make(p, QRef) self.nbytes = nbytes self.queue = queue else: @@ -214,18 +222,22 @@ cdef class _Memory: cdef _Memory other_mem if isinstance(other, _Memory): other_mem = <_Memory> other - self.memory_ptr = other_mem.memory_ptr self.nbytes = other_mem.nbytes self.queue = other_mem.queue - if other_mem.refobj is None: - self.refobj = other + if other_mem._opaque_ptr is NULL: + self._memory_ptr = other_mem._memory_ptr + self._opaque_ptr = NULL + self.refobj = other.reference_obj else: - self.refobj = other_mem.refobj + self._memory_ptr = other_mem._memory_ptr + self._opaque_ptr = OpaqueSmartPtr_Copy(other_mem._opaque_ptr) + self.refobj = None elif hasattr(other, '__sycl_usm_array_interface__'): other_iface = other.__sycl_usm_array_interface__ if isinstance(other_iface, dict): other_buf = _USMBufferData.from_sycl_usm_ary_iface(other_iface) - self.memory_ptr = other_buf.p + self._opaque_ptr = NULL + self._memory_ptr = other_buf.p self.nbytes = other_buf.nbytes self.queue = other_buf.queue self.refobj = other @@ -241,23 +253,25 @@ cdef class _Memory: ) def __dealloc__(self): - if (self.refobj is None): - if self.memory_ptr: - if (type(self.queue) is SyclQueue): - DPCTLfree_with_queue( - self.memory_ptr, self.queue.get_queue_ref() - ) + if not (self._opaque_ptr is NULL): + OpaqueSmartPtr_Delete(self._opaque_ptr) self._cinit_empty() + cdef DPCTLSyclUSMRef get_data_ptr(self): + return self._memory_ptr + + cdef void* get_opaque_ptr(self): + return self._opaque_ptr + cdef _getbuffer(self, Py_buffer *buffer, int flags): # memory_ptr is Ref which is pointer to SYCL type. For USM it is void*. cdef SyclContext ctx = self._context cdef _usm_type UsmTy = DPCTLUSM_GetPointerType( - self.memory_ptr, ctx.get_context_ref() + self._memory_ptr, ctx.get_context_ref() ) if UsmTy == _usm_type._USM_DEVICE: raise ValueError("USM Device memory is not host accessible") - buffer.buf = self.memory_ptr + buffer.buf = self._memory_ptr buffer.format = 'B' # byte buffer.internal = NULL # see References buffer.itemsize = 1 @@ -285,7 +299,7 @@ cdef class _Memory: represented as Python integer. """ def __get__(self): - return (self.memory_ptr) + return (self._memory_ptr) property _context: """:class:`dpctl.SyclContext` the USM pointer is bound to. 
""" @@ -333,7 +347,7 @@ cdef class _Memory: .format( self.get_usm_type(), self.nbytes, - hex((self.memory_ptr)) + hex((self._memory_ptr)) ) ) @@ -377,7 +391,7 @@ cdef class _Memory: """ def __get__(self): cdef dict iface = { - "data": ((self.memory_ptr), + "data": ((self._memory_ptr), True), # bool(self.writable)), "shape": (self.nbytes,), "strides": None, @@ -402,18 +416,18 @@ cdef class _Memory: if syclobj is None: ctx = self._context return _Memory.get_pointer_type( - self.memory_ptr, ctx + self._memory_ptr, ctx ).decode("UTF-8") elif isinstance(syclobj, SyclContext): ctx = (syclobj) return _Memory.get_pointer_type( - self.memory_ptr, ctx + self._memory_ptr, ctx ).decode("UTF-8") elif isinstance(syclobj, SyclQueue): q = (syclobj) ctx = q.get_sycl_context() return _Memory.get_pointer_type( - self.memory_ptr, ctx + self._memory_ptr, ctx ).decode("UTF-8") raise TypeError( "syclobj keyword can be either None, or an instance of " @@ -435,18 +449,18 @@ cdef class _Memory: if syclobj is None: ctx = self._context return _Memory.get_pointer_type_enum( - self.memory_ptr, ctx + self._memory_ptr, ctx ) elif isinstance(syclobj, SyclContext): ctx = (syclobj) return _Memory.get_pointer_type_enum( - self.memory_ptr, ctx + self._memory_ptr, ctx ) elif isinstance(syclobj, SyclQueue): q = (syclobj) ctx = q.get_sycl_context() return _Memory.get_pointer_type_enum( - self.memory_ptr, ctx + self._memory_ptr, ctx ) raise TypeError( "syclobj keyword can be either None, or an instance of " @@ -475,8 +489,8 @@ cdef class _Memory: # call kernel to copy from ERef = DPCTLQueue_Memcpy( self.queue.get_queue_ref(), - &host_buf[0], # destination - self.memory_ptr, # source + &host_buf[0], # destination + self._memory_ptr, # source self.nbytes ) with nogil: DPCTLEvent_Wait(ERef) @@ -500,8 +514,8 @@ cdef class _Memory: # call kernel to copy from ERef = DPCTLQueue_Memcpy( self.queue.get_queue_ref(), - self.memory_ptr, # destination - &host_buf[0], # source + self._memory_ptr, # destination + &host_buf[0], # source buf_len ) with nogil: DPCTLEvent_Wait(ERef) @@ -542,7 +556,7 @@ cdef class _Memory: if (same_contexts): ERef = DPCTLQueue_Memcpy( this_queue.get_queue_ref(), - self.memory_ptr, + self._memory_ptr, src_buf.p, src_buf.nbytes ) @@ -550,8 +564,8 @@ cdef class _Memory: DPCTLEvent_Delete(ERef) else: copy_via_host( - self.memory_ptr, this_queue, # dest - src_buf.p, src_queue, # src + self._memory_ptr, this_queue, # dest + src_buf.p, src_queue, # src src_buf.nbytes ) else: @@ -565,7 +579,7 @@ cdef class _Memory: ERef = DPCTLQueue_Memset( self.queue.get_queue_ref(), - self.memory_ptr, # destination + self._memory_ptr, # destination val, self.nbytes) @@ -703,20 +717,29 @@ cdef class _Memory: res = _Memory.__new__(_Memory) _mem = <_Memory> res _mem._cinit_empty() - _mem.memory_ptr = USMRef _mem.nbytes = nbytes QRef_copy = DPCTLQueue_Copy(QRef) if QRef_copy is NULL: raise ValueError("Referenced queue could not be copied.") try: - _mem.queue = SyclQueue._create(QRef_copy) # consumes the copy + # _create steals ownership of QRef_copy + _mem.queue = SyclQueue._create(QRef_copy) except dpctl.SyclQueueCreationError as sqce: raise ValueError( "SyclQueue object could not be created from " "copy of referenced queue" ) from sqce - _mem.refobj = memory_owner - return mem_ty(res) + if memory_owner is None: + _mem._memory_ptr = USMRef + # assume ownership of USM allocation via smart pointer + _mem._opaque_ptr = OpaqueSmartPtr_Make(USMRef, QRef) + _mem.refobj = None + else: + _mem._memory_ptr = USMRef + _mem._opaque_ptr = NULL + 
_mem.refobj = memory_owner + _out = mem_ty(_mem) + return _out cdef class MemoryUSMShared(_Memory): @@ -908,10 +931,13 @@ def as_usm_memory(obj): format(obj) ) +cdef api void * Memory_GetOpaquePointer(_Memory obj): + "Opaque pointer value" + return obj.get_opaque_ptr() cdef api DPCTLSyclUSMRef Memory_GetUsmPointer(_Memory obj): "Pointer of USM allocation" - return obj.memory_ptr + return obj.get_data_ptr() cdef api DPCTLSyclContextRef Memory_GetContextRef(_Memory obj): "Context reference to which USM allocation is bound" diff --git a/dpctl/memory/_opaque_smart_ptr.hpp b/dpctl/memory/_opaque_smart_ptr.hpp new file mode 100644 index 0000000000..197115d098 --- /dev/null +++ b/dpctl/memory/_opaque_smart_ptr.hpp @@ -0,0 +1,106 @@ +//===--- _opaque_smart_ptr.hpp --------===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file implements working with shared_ptr with USM deleted +/// disguided as an opaque pointer. +/// +//===----------------------------------------------------------------------===// + +#pragma once + +#ifndef __cplusplus +#error "C++ is required to compile this file +#endif + +#include "syclinterface/dpctl_sycl_type_casters.hpp" +#include "syclinterface/dpctl_sycl_types.h" +#include +#include + +namespace +{ + +class USMDeleter +{ +public: + USMDeleter() = delete; + USMDeleter(const USMDeleter &) = default; + USMDeleter(USMDeleter &&) = default; + USMDeleter(const ::sycl::queue &queue) : _context(queue.get_context()) {} + USMDeleter(const ::sycl::context &context) : _context(context) {} + template void operator()(T *ptr) const + { + ::sycl::free(ptr, _context); + } + +private: + ::sycl::context _context; +}; + +} // end of anonymous namespace + +void *OpaqueSmartPtr_Make(void *usm_ptr, const sycl::queue &q) +{ + USMDeleter _deleter(q); + auto sptr = new std::shared_ptr(usm_ptr, std::move(_deleter)); + + return reinterpret_cast(sptr); +} + +void *OpaqueSmartPtr_Make(void *usm_ptr, DPCTLSyclQueueRef QRef) +{ + sycl::queue *q_ptr = dpctl::syclinterface::unwrap(QRef); + + // make a copy of queue + sycl::queue q{*q_ptr}; + + void *res = OpaqueSmartPtr_Make(usm_ptr, q); + + return res; +} + +void OpaqueSmartPtr_Delete(void *opaque_ptr) +{ + auto sptr = reinterpret_cast *>(opaque_ptr); + + delete sptr; +} + +void *OpaqueSmartPtr_Copy(void *opaque_ptr) +{ + auto sptr = reinterpret_cast *>(opaque_ptr); + auto copied_sptr = new std::shared_ptr(*sptr); + + return reinterpret_cast(copied_sptr); +} + +long OpaqueSmartPtr_UseCount(void *opaque_ptr) +{ + auto sptr = reinterpret_cast *>(opaque_ptr); + return sptr->use_count(); +} + +void *OpaqueSmartPtr_Get(void *opaque_ptr) +{ + auto sptr = reinterpret_cast *>(opaque_ptr); + + return sptr->get(); +} diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx index 0c28380222..6256e0c30f 100644 --- a/dpctl/tensor/_usmarray.pyx +++ 
b/dpctl/tensor/_usmarray.pyx @@ -25,6 +25,9 @@ import numpy as np import dpctl import dpctl.memory as dpmem +from .._backend cimport DPCTLSyclUSMRef +from .._sycl_device_factory cimport _cached_default_device + from ._data_types import bool as dpt_bool from ._device import Device from ._print import usm_ndarray_repr, usm_ndarray_str @@ -952,10 +955,10 @@ cdef class usm_ndarray: arr_buf = self.usm_data QRef = ( d.sycl_queue).get_queue_ref() view_buffer = c_dpmem._Memory.create_from_usm_pointer_size_qref( - arr_buf.memory_ptr, + arr_buf.get_data_ptr(), arr_buf.nbytes, QRef, - memory_owner = arr_buf + memory_owner=arr_buf ) res = usm_ndarray( self.shape, diff --git a/dpctl/tests/test_sycl_usm.py b/dpctl/tests/test_sycl_usm.py index a1cf98960b..0870c9d58c 100644 --- a/dpctl/tests/test_sycl_usm.py +++ b/dpctl/tests/test_sycl_usm.py @@ -152,7 +152,7 @@ def test_zero_copy(): mobj = _create_memory() mobj2 = type(mobj)(mobj) - assert mobj2.reference_obj is mobj + assert mobj2.reference_obj is None mobj_data = mobj.__sycl_usm_array_interface__["data"] mobj2_data = mobj2.__sycl_usm_array_interface__["data"] assert mobj_data == mobj2_data From dc77858967dd652a5c7cd3d91332762944804679 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sat, 27 Apr 2024 15:51:37 -0500 Subject: [PATCH 02/25] Deploy using shared pointers from keep_args_alive --- dpctl/apis/include/dpctl4pybind11.hpp | 210 +++++++++++++++++++++++--- dpctl/tensor/_usmarray.pyx | 5 + 2 files changed, 195 insertions(+), 20 deletions(-) diff --git a/dpctl/apis/include/dpctl4pybind11.hpp b/dpctl/apis/include/dpctl4pybind11.hpp index 4e0cbe1986..749ec7fce5 100644 --- a/dpctl/apis/include/dpctl4pybind11.hpp +++ b/dpctl/apis/include/dpctl4pybind11.hpp @@ -89,6 +89,7 @@ class dpctl_capi // memory DPCTLSyclUSMRef (*Memory_GetUsmPointer_)(Py_MemoryObject *); + void *(*Memory_GetOpaquePointer_)(Py_MemoryObject *); DPCTLSyclContextRef (*Memory_GetContextRef_)(Py_MemoryObject *); DPCTLSyclQueueRef (*Memory_GetQueueRef_)(Py_MemoryObject *); size_t (*Memory_GetNumBytes_)(Py_MemoryObject *); @@ -115,6 +116,7 @@ class dpctl_capi int (*UsmNDArray_GetFlags_)(PyUSMArrayObject *); DPCTLSyclQueueRef (*UsmNDArray_GetQueueRef_)(PyUSMArrayObject *); py::ssize_t (*UsmNDArray_GetOffset_)(PyUSMArrayObject *); + PyObject *(*UsmNDArray_GetUSMData_)(PyUSMArrayObject *); void (*UsmNDArray_SetWritableFlag_)(PyUSMArrayObject *, int); PyObject *(*UsmNDArray_MakeSimpleFromMemory_)(int, const py::ssize_t *, @@ -233,15 +235,16 @@ class dpctl_capi SyclContext_Make_(nullptr), SyclEvent_GetEventRef_(nullptr), SyclEvent_Make_(nullptr), SyclQueue_GetQueueRef_(nullptr), SyclQueue_Make_(nullptr), Memory_GetUsmPointer_(nullptr), - Memory_GetContextRef_(nullptr), Memory_GetQueueRef_(nullptr), - Memory_GetNumBytes_(nullptr), Memory_Make_(nullptr), - SyclKernel_GetKernelRef_(nullptr), SyclKernel_Make_(nullptr), - SyclProgram_GetKernelBundleRef_(nullptr), SyclProgram_Make_(nullptr), - UsmNDArray_GetData_(nullptr), UsmNDArray_GetNDim_(nullptr), - UsmNDArray_GetShape_(nullptr), UsmNDArray_GetStrides_(nullptr), - UsmNDArray_GetTypenum_(nullptr), UsmNDArray_GetElementSize_(nullptr), - UsmNDArray_GetFlags_(nullptr), UsmNDArray_GetQueueRef_(nullptr), - UsmNDArray_GetOffset_(nullptr), UsmNDArray_SetWritableFlag_(nullptr), + Memory_GetOpaquePointer_(nullptr), Memory_GetContextRef_(nullptr), + Memory_GetQueueRef_(nullptr), Memory_GetNumBytes_(nullptr), + Memory_Make_(nullptr), SyclKernel_GetKernelRef_(nullptr), + SyclKernel_Make_(nullptr), SyclProgram_GetKernelBundleRef_(nullptr), + 
SyclProgram_Make_(nullptr), UsmNDArray_GetData_(nullptr), + UsmNDArray_GetNDim_(nullptr), UsmNDArray_GetShape_(nullptr), + UsmNDArray_GetStrides_(nullptr), UsmNDArray_GetTypenum_(nullptr), + UsmNDArray_GetElementSize_(nullptr), UsmNDArray_GetFlags_(nullptr), + UsmNDArray_GetQueueRef_(nullptr), UsmNDArray_GetOffset_(nullptr), + UsmNDArray_GetUSMData_(nullptr), UsmNDArray_SetWritableFlag_(nullptr), UsmNDArray_MakeSimpleFromMemory_(nullptr), UsmNDArray_MakeSimpleFromPtr_(nullptr), UsmNDArray_MakeFromPtr_(nullptr), USM_ARRAY_C_CONTIGUOUS_(0), @@ -299,6 +302,7 @@ class dpctl_capi // dpctl.memory API this->Memory_GetUsmPointer_ = Memory_GetUsmPointer; + this->Memory_GetOpaquePointer_ = Memory_GetOpaquePointer; this->Memory_GetContextRef_ = Memory_GetContextRef; this->Memory_GetQueueRef_ = Memory_GetQueueRef; this->Memory_GetNumBytes_ = Memory_GetNumBytes; @@ -320,6 +324,7 @@ class dpctl_capi this->UsmNDArray_GetFlags_ = UsmNDArray_GetFlags; this->UsmNDArray_GetQueueRef_ = UsmNDArray_GetQueueRef; this->UsmNDArray_GetOffset_ = UsmNDArray_GetOffset; + this->UsmNDArray_GetUSMData_ = UsmNDArray_GetUSMData; this->UsmNDArray_SetWritableFlag_ = UsmNDArray_SetWritableFlag; this->UsmNDArray_MakeSimpleFromMemory_ = UsmNDArray_MakeSimpleFromMemory; @@ -779,6 +784,33 @@ class usm_memory : public py::object return api.Memory_GetNumBytes_(mem_obj); } + bool is_managed_by_smart_ptr() const + { + auto const &api = ::dpctl::detail::dpctl_capi::get(); + Py_MemoryObject *mem_obj = reinterpret_cast(m_ptr); + const void *opaque_ptr = api.Memory_GetOpaquePointer_(mem_obj); + + return bool(opaque_ptr); + } + + std::shared_ptr get_smart_ptr_owner() const + { + auto const &api = ::dpctl::detail::dpctl_capi::get(); + Py_MemoryObject *mem_obj = reinterpret_cast(m_ptr); + void *opaque_ptr = api.Memory_GetOpaquePointer_(mem_obj); + + if (opaque_ptr) { + auto shptr_ptr = + reinterpret_cast *>(opaque_ptr); + return *shptr_ptr; + } + else { + throw std::runtime_error( + "Memory object does not have smart pointer " + "managing lifetime of USM allocation"); + } + } + protected: static PyObject *as_usm_memory(PyObject *o) { @@ -1065,6 +1097,63 @@ class usm_ndarray : public py::object return static_cast(flags & api.USM_ARRAY_WRITABLE_); } + py::object get_usm_data() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + + auto const &api = ::dpctl::detail::dpctl_capi::get(); + PyObject *usm_data = api.UsmNDArray_GetUSMData_(raw_ar); + + return py::reinterpret_steal(usm_data); + } + + bool is_managed_by_smart_ptr() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + + auto const &api = ::dpctl::detail::dpctl_capi::get(); + PyObject *usm_data = api.UsmNDArray_GetUSMData_(raw_ar); + + if (!PyObject_TypeCheck(usm_data, api.Py_MemoryType_)) + return false; + + Py_MemoryObject *mem_obj = + reinterpret_cast(usm_data); + const void *opaque_ptr = api.Memory_GetOpaquePointer_(mem_obj); + + return bool(opaque_ptr); + } + + std::shared_ptr get_smart_ptr_owner() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + + auto const &api = ::dpctl::detail::dpctl_capi::get(); + + PyObject *usm_data = api.UsmNDArray_GetUSMData_(raw_ar); + + if (!PyObject_TypeCheck(usm_data, api.Py_MemoryType_)) { + throw std::runtime_error( + "usm_ndarray object does not have Memory object " + "managing lifetime of USM allocation"); + } + + Py_MemoryObject *mem_obj = + reinterpret_cast(usm_data); + void *opaque_ptr = api.Memory_GetOpaquePointer_(mem_obj); + + if (opaque_ptr) { + auto shptr_ptr = + reinterpret_cast *>(opaque_ptr); + return *shptr_ptr; + } 
+ else { + throw std::runtime_error( + "Memory object underlying usm_ndarray does not have " + "smart pointer managing lifetime of USM allocation"); + } + } + private: PyUSMArrayObject *usm_array_ptr() const { @@ -1077,26 +1166,107 @@ class usm_ndarray : public py::object namespace utils { +namespace detail +{ + +struct ManagedMemory +{ + + static bool is_usm_managed_by_shared_ptr(const py::handle &h) + { + if (py::isinstance(h)) { + auto usm_memory_inst = py::cast(h); + return usm_memory_inst.is_managed_by_smart_ptr(); + } + else if (py::isinstance(h)) { + auto usm_array_inst = py::cast(h); + return usm_array_inst.is_managed_by_smart_ptr(); + } + + return false; + } + + static std::shared_ptr extract_shared_ptr(const py::handle &h) + { + if (py::isinstance(h)) { + auto usm_memory_inst = py::cast(h); + return usm_memory_inst.get_smart_ptr_owner(); + } + else if (py::isinstance(h)) { + auto usm_array_inst = py::cast(h); + return usm_array_inst.get_smart_ptr_owner(); + } + + throw std::runtime_error( + "Attempted extraction of shared_ptr on an unrecognized type"); + } +}; + +} // end of namespace detail + template sycl::event keep_args_alive(sycl::queue &q, const py::object (&py_objs)[num], const std::vector &depends = {}) { - sycl::event host_task_ev = q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - std::array, num> shp_arr; - for (std::size_t i = 0; i < num; ++i) { - shp_arr[i] = std::make_shared(py_objs[i]); - shp_arr[i]->inc_ref(); + std::size_t n_objects_held = 0; + std::array, num> shp_arr{}; + + std::size_t n_usm_owners_held = 0; + std::array, num> shp_usm{}; + + for (std::size_t i = 0; i < num; ++i) { + auto py_obj_i = py_objs[i]; + if (detail::ManagedMemory::is_usm_managed_by_shared_ptr(py_obj_i)) { + shp_usm[n_usm_owners_held] = + detail::ManagedMemory::extract_shared_ptr(py_obj_i); + ++n_usm_owners_held; } - cgh.host_task([shp_arr = std::move(shp_arr)]() { - py::gil_scoped_acquire acquire; + else { + shp_arr[n_objects_held] = std::make_shared(py_obj_i); + shp_arr[n_objects_held]->inc_ref(); + ++n_objects_held; + } + } + + bool use_depends = true; + sycl::event host_task_ev; + + if (n_usm_owners_held > 0) { + host_task_ev = q.submit([&](sycl::handler &cgh) { + if (use_depends) { + cgh.depends_on(depends); + use_depends = false; + } + else { + cgh.depends_on(host_task_ev); + } + cgh.host_task([shp_usm = std::move(shp_usm)]() { + // no body, but shared pointers are captured in + // the lamba, ensuring that USM allocation is + // kept alive + }); + }); + } - for (std::size_t i = 0; i < num; ++i) { - shp_arr[i]->dec_ref(); + if (n_objects_held > 0) { + host_task_ev = q.submit([&](sycl::handler &cgh) { + if (use_depends) { + cgh.depends_on(depends); + use_depends = false; } + else { + cgh.depends_on(host_task_ev); + } + cgh.host_task([n_objects_held, shp_arr = std::move(shp_arr)]() { + py::gil_scoped_acquire acquire; + + for (std::size_t i = 0; i < n_objects_held; ++i) { + shp_arr[i]->dec_ref(); + } + }); }); - }); + } return host_task_ev; } diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx index 6256e0c30f..800b7c823a 100644 --- a/dpctl/tensor/_usmarray.pyx +++ b/dpctl/tensor/_usmarray.pyx @@ -1612,6 +1612,11 @@ cdef api Py_ssize_t UsmNDArray_GetOffset(usm_ndarray arr): return arr.get_offset() +cdef api object UsmNDArray_GetUSMData(usm_ndarray arr): + """Get USM data object underlying the array""" + return arr.get_base() + + cdef api void UsmNDArray_SetWritableFlag(usm_ndarray arr, int flag): """Set/unset USM_ARRAY_WRITABLE in the given array 
`arr`.""" arr._set_writable_flag(flag) From 8a532126a5c5a6c718dfbf04e2b0fa4a1af34676 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sat, 27 Apr 2024 15:52:50 -0500 Subject: [PATCH 03/25] Adding test for Memory_GetOpaquePointer CAPI function --- dpctl/tests/test_sycl_usm.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/dpctl/tests/test_sycl_usm.py b/dpctl/tests/test_sycl_usm.py index 0870c9d58c..eca71ac66c 100644 --- a/dpctl/tests/test_sycl_usm.py +++ b/dpctl/tests/test_sycl_usm.py @@ -545,6 +545,7 @@ def test_cpython_api(memory_ctor): mod = sys.modules[mobj.__class__.__module__] # get capsules storing function pointers mem_ptr_fn_cap = mod.__pyx_capi__["Memory_GetUsmPointer"] + mem_opaque_ptr_fn_cap = mod.__pyx_capi__["Memory_GetOpaquePointer"] mem_q_ref_fn_cap = mod.__pyx_capi__["Memory_GetQueueRef"] mem_ctx_ref_fn_cap = mod.__pyx_capi__["Memory_GetContextRef"] mem_nby_fn_cap = mod.__pyx_capi__["Memory_GetNumBytes"] @@ -556,6 +557,9 @@ def test_cpython_api(memory_ctor): mem_ptr_fn_ptr = cap_ptr_fn( mem_ptr_fn_cap, b"DPCTLSyclUSMRef (struct Py_MemoryObject *)" ) + mem_opaque_ptr_fn_ptr = cap_ptr_fn( + mem_opaque_ptr_fn_cap, b"void *(struct Py_MemoryObject *)" + ) mem_ctx_ref_fn_ptr = cap_ptr_fn( mem_ctx_ref_fn_cap, b"DPCTLSyclContextRef (struct Py_MemoryObject *)" ) @@ -571,6 +575,7 @@ def test_cpython_api(memory_ctor): ) callable_maker = ctypes.PYFUNCTYPE(ctypes.c_void_p, ctypes.py_object) get_ptr_fn = callable_maker(mem_ptr_fn_ptr) + get_opaque_ptr_fn = callable_maker(mem_opaque_ptr_fn_ptr) get_ctx_ref_fn = callable_maker(mem_ctx_ref_fn_ptr) get_q_ref_fn = callable_maker(mem_q_ref_fn_ptr) get_nby_fn = callable_maker(mem_nby_fn_ptr) @@ -586,6 +591,8 @@ def test_cpython_api(memory_ctor): capi_ptr = get_ptr_fn(mobj) direct_ptr = mobj._pointer assert capi_ptr == direct_ptr + capi_opaque_ptr = get_opaque_ptr_fn(mobj) + assert capi_opaque_ptr != 0 capi_ctx_ref = get_ctx_ref_fn(mobj) direct_ctx_ref = mobj._context.addressof_ref() assert capi_ctx_ref == direct_ctx_ref From cf1cad06ca1b07a77ae8a6918336418d2ebeb56b Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 29 Apr 2024 05:37:09 -0500 Subject: [PATCH 04/25] Added tests for UsmNDArray_GetUSMData C-API function --- dpctl/tests/test_usm_ndarray_ctor.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/dpctl/tests/test_usm_ndarray_ctor.py b/dpctl/tests/test_usm_ndarray_ctor.py index 9a47c1f184..33a95af849 100644 --- a/dpctl/tests/test_usm_ndarray_ctor.py +++ b/dpctl/tests/test_usm_ndarray_ctor.py @@ -709,6 +709,25 @@ def test_pyx_capi_get_offset(): assert offset == X.__sycl_usm_array_interface__["offset"] +def test_pyx_capi_get_usmdata(): + try: + X = dpt.usm_ndarray(17, dtype="u2")[1::2] + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + get_usmdata_fn = _pyx_capi_fnptr_to_callable( + X, + "UsmNDArray_GetUSMData", + b"PyObject *(struct PyUSMArrayObject *)", + fn_restype=ctypes.py_object, + fn_argtypes=(ctypes.py_object,), + ) + capi_usm_data = get_usmdata_fn(X) + assert isinstance(capi_usm_data, dpm._memory._Memory) + assert capi_usm_data.nbytes == X.usm_data.nbytes + assert capi_usm_data._pointer == X.usm_data._pointer + assert capi_usm_data.sycl_queue == X.usm_data.sycl_queue + + def test_pyx_capi_get_queue_ref(): try: X = dpt.usm_ndarray(17, dtype="i2")[1::2] From 9df2b721af06fd0389d31a844bacf73aab60f88f Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 29 Apr 2024 06:15:26 -0500 Subject: [PATCH 05/25] Fixed typo in the comment --- 
dpctl/apis/include/dpctl4pybind11.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpctl/apis/include/dpctl4pybind11.hpp b/dpctl/apis/include/dpctl4pybind11.hpp index 749ec7fce5..b476b79955 100644 --- a/dpctl/apis/include/dpctl4pybind11.hpp +++ b/dpctl/apis/include/dpctl4pybind11.hpp @@ -1243,7 +1243,7 @@ sycl::event keep_args_alive(sycl::queue &q, } cgh.host_task([shp_usm = std::move(shp_usm)]() { // no body, but shared pointers are captured in - // the lamba, ensuring that USM allocation is + // the lambda, ensuring that USM allocation is // kept alive }); }); From c85571b6063a913690dbcd2ea4abc3ef946bf6ac Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 1 May 2024 19:58:19 -0700 Subject: [PATCH 06/25] Fixed unclosed quote in error pragma --- dpctl/memory/_opaque_smart_ptr.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpctl/memory/_opaque_smart_ptr.hpp b/dpctl/memory/_opaque_smart_ptr.hpp index 197115d098..c79be4f8d4 100644 --- a/dpctl/memory/_opaque_smart_ptr.hpp +++ b/dpctl/memory/_opaque_smart_ptr.hpp @@ -27,7 +27,7 @@ #pragma once #ifndef __cplusplus -#error "C++ is required to compile this file +#error "C++ is required to compile this file" #endif #include "syclinterface/dpctl_sycl_type_casters.hpp" From 08b6dd067871e46169e055398539830c467ec662 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 1 May 2024 20:03:08 -0700 Subject: [PATCH 07/25] Fixed memory leak introduced in new methods of usm_ndarray --- dpctl/apis/include/dpctl4pybind11.hpp | 32 ++++++++++++++++++--------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/dpctl/apis/include/dpctl4pybind11.hpp b/dpctl/apis/include/dpctl4pybind11.hpp index b476b79955..0c2b00af0f 100644 --- a/dpctl/apis/include/dpctl4pybind11.hpp +++ b/dpctl/apis/include/dpctl4pybind11.hpp @@ -793,7 +793,7 @@ class usm_memory : public py::object return bool(opaque_ptr); } - std::shared_ptr get_smart_ptr_owner() const + const std::shared_ptr &get_smart_ptr_owner() const { auto const &api = ::dpctl::detail::dpctl_capi::get(); Py_MemoryObject *mem_obj = reinterpret_cast(m_ptr); @@ -1114,17 +1114,20 @@ class usm_ndarray : public py::object auto const &api = ::dpctl::detail::dpctl_capi::get(); PyObject *usm_data = api.UsmNDArray_GetUSMData_(raw_ar); - if (!PyObject_TypeCheck(usm_data, api.Py_MemoryType_)) + if (!PyObject_TypeCheck(usm_data, api.Py_MemoryType_)) { + Py_DECREF(usm_data); return false; + } Py_MemoryObject *mem_obj = reinterpret_cast(usm_data); const void *opaque_ptr = api.Memory_GetOpaquePointer_(mem_obj); + Py_DECREF(usm_data); return bool(opaque_ptr); } - std::shared_ptr get_smart_ptr_owner() const + const std::shared_ptr &get_smart_ptr_owner() const { PyUSMArrayObject *raw_ar = usm_array_ptr(); @@ -1133,6 +1136,7 @@ class usm_ndarray : public py::object PyObject *usm_data = api.UsmNDArray_GetUSMData_(raw_ar); if (!PyObject_TypeCheck(usm_data, api.Py_MemoryType_)) { + Py_DECREF(usm_data); throw std::runtime_error( "usm_ndarray object does not have Memory object " "managing lifetime of USM allocation"); @@ -1141,6 +1145,7 @@ class usm_ndarray : public py::object Py_MemoryObject *mem_obj = reinterpret_cast(usm_data); void *opaque_ptr = api.Memory_GetOpaquePointer_(mem_obj); + Py_DECREF(usm_data); if (opaque_ptr) { auto shptr_ptr = @@ -1172,28 +1177,32 @@ namespace detail struct ManagedMemory { - static bool is_usm_managed_by_shared_ptr(const py::handle &h) + static bool is_usm_managed_by_shared_ptr(const py::object &h) { if (py::isinstance(h)) { - auto usm_memory_inst 
= py::cast(h); + const auto &usm_memory_inst = + py::cast(h); return usm_memory_inst.is_managed_by_smart_ptr(); } else if (py::isinstance(h)) { - auto usm_array_inst = py::cast(h); + const auto &usm_array_inst = + py::cast(h); return usm_array_inst.is_managed_by_smart_ptr(); } return false; } - static std::shared_ptr extract_shared_ptr(const py::handle &h) + static const std::shared_ptr &extract_shared_ptr(const py::object &h) { if (py::isinstance(h)) { - auto usm_memory_inst = py::cast(h); + const auto &usm_memory_inst = + py::cast(h); return usm_memory_inst.get_smart_ptr_owner(); } else if (py::isinstance(h)) { - auto usm_array_inst = py::cast(h); + const auto &usm_array_inst = + py::cast(h); return usm_array_inst.get_smart_ptr_owner(); } @@ -1216,10 +1225,11 @@ sycl::event keep_args_alive(sycl::queue &q, std::array, num> shp_usm{}; for (std::size_t i = 0; i < num; ++i) { - auto py_obj_i = py_objs[i]; + const auto &py_obj_i = py_objs[i]; if (detail::ManagedMemory::is_usm_managed_by_shared_ptr(py_obj_i)) { - shp_usm[n_usm_owners_held] = + const auto &shp = detail::ManagedMemory::extract_shared_ptr(py_obj_i); + shp_usm[n_usm_owners_held] = shp; ++n_usm_owners_held; } else { From 96cd26e24be396be9e484fadf3e28f6979109efe Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 6 May 2024 20:52:00 -0500 Subject: [PATCH 08/25] Create sequential order manager The manager stores a list of events for tasks submitted so far, as well as list of events for host_tasks submitted so far. Every opperation of the manager prunes lists of completed events. `dpctl.utils.SequentialOrderManager` class instance is Python API to work with this class. Every offloading operation calls `.submitted_events` property to get dependent events, and adds computational event and host_task event to the manager using `.add_to_both_events(ht_ev, comp_ev)` method. The destructor of manager synchronizes on outstanding events. 
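
For illustration, a typical offloading call follows the pattern below
(a minimal sketch mirroring the usage adopted by the later patches in this
series; `src`, `dst` and the queue `q` are placeholder names):

    import dpctl.tensor._tensor_impl as ti
    from dpctl.utils import SequentialOrderManager

    _manager = SequentialOrderManager
    # events the new task must depend on to preserve sequential order
    dep_evs = _manager.submitted_events
    ht_ev, comp_ev = ti._copy_usm_ndarray_into_usm_ndarray(
        src=src, dst=dst, sycl_queue=q, depends=dep_evs
    )
    # record the host-task and computation events for subsequent tasks
    _manager.add_event_pair(ht_ev, comp_ev)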
--- dpctl/utils/CMakeLists.txt | 34 ++++ dpctl/utils/__init__.py | 66 +++++++ dpctl/utils/src/order_keeper.cpp | 29 ++++ dpctl/utils/src/sequential_order_keeper.hpp | 181 ++++++++++++++++++++ 4 files changed, 310 insertions(+) create mode 100644 dpctl/utils/src/order_keeper.cpp create mode 100644 dpctl/utils/src/sequential_order_keeper.hpp diff --git a/dpctl/utils/CMakeLists.txt b/dpctl/utils/CMakeLists.txt index 04a9eb2a4d..2a8494032a 100644 --- a/dpctl/utils/CMakeLists.txt +++ b/dpctl/utils/CMakeLists.txt @@ -36,3 +36,37 @@ if(_dpctl_sycl_targets) endif() target_link_libraries(${python_module_name} PRIVATE DpctlCAPI) install(TARGETS ${python_module_name} DESTINATION "dpctl/utils") + + +set(python_module_name _seq_order_keeper) +set(_module_src ${CMAKE_CURRENT_SOURCE_DIR}/src/order_keeper.cpp) +pybind11_add_module(${python_module_name} MODULE + ${_module_src} +) +target_include_directories(${python_module_name} PRIVATE ${CUMAKE_CURRENT_SOURCE_DIR}/src) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src}) +if(DPCTL_GENERATE_COVERAGE) + if(DPCTL_GENERATE_COVERAGE_FOR_PYBIND11_EXTENSIONS) + target_compile_options(${python_module_name} + PRIVATE -fprofile-instr-generate -fcoverage-mapping + ) + endif() + target_link_options(${python_module_name} + PRIVATE -fprofile-instr-generate -fcoverage-mapping + ) +endif() +if(_dpctl_sycl_targets) + # make fat binary + target_compile_options( + ${python_module_name} + PRIVATE + -fsycl-targets=${_dpctl_sycl_targets} + ) + target_link_options( + ${python_module_name} + PRIVATE + -fsycl-targets=${_dpctl_sycl_targets} + ) +endif() +target_link_libraries(${python_module_name} PRIVATE DpctlCAPI) +install(TARGETS ${python_module_name} DESTINATION "dpctl/utils") diff --git a/dpctl/utils/__init__.py b/dpctl/utils/__init__.py index e0c14a263d..598ce1cce1 100644 --- a/dpctl/utils/__init__.py +++ b/dpctl/utils/__init__.py @@ -18,7 +18,10 @@ A collection of utility functions. """ +from contextvars import ContextVar + from .._sycl_device import SyclDevice +from .._sycl_event import SyclEvent from ._compute_follows_data import ( ExecutionPlacementError, get_coerced_usm_type, @@ -39,6 +42,7 @@ intel_device_info_memory_clock_rate, ) from ._onetrace_context import onetrace_enabled +from ._seq_order_keeper import _OrderManager def intel_device_info(dev, /): @@ -119,6 +123,67 @@ def intel_device_info(dev, /): return dict() +class _SequentialOrderManager: + """ + Class to orchestrate default sequential order + of the tasks offloaded from Python. 
+ """ + + def __init__(self): + self._state = ContextVar("_seq_order_keeper", default=_OrderManager(16)) + + def __dealloc__(self): + _local = self._state.get() + SyclEvent.wait_for(_local.get_submitted_events()) + SyclEvent.wait_for(_local.get_host_task_events()) + + def __repr__(self): + return "" + + def __str__(self): + return "" + + def add_event_pair(self, host_task_ev, comp_ev): + _local = self._state.get() + if isinstance(host_task_ev, SyclEvent) and isinstance( + comp_ev, SyclEvent + ): + _local.add_to_both_events(host_task_ev, comp_ev) + else: + if not isinstance(host_task_ev, (list, tuple)): + host_task_ev = (host_task_ev,) + if not isinstance(comp_ev, (list, tuple)): + comp_ev = (comp_ev,) + _local.add_vector_to_both_events(host_task_ev, comp_ev) + + @property + def num_host_task_events(self): + _local = self._state.get() + return _local.get_num_host_task_events() + + @property + def num_submitted_events(self): + _local = self._state.get() + return _local.get_num_submitted_events() + + @property + def host_task_events(self): + _local = self._state.get() + return _local.get_host_task_events() + + @property + def submitted_events(self): + _local = self._state.get() + return _local.get_submitted_events() + + def wait(self): + _local = self._state.get() + return _local.wait() + + +SequentialOrderManager = _SequentialOrderManager() +SequentialOrderManager.__name__ = "SequentialOrderManager" + __all__ = [ "get_execution_queue", "get_coerced_usm_type", @@ -126,4 +191,5 @@ def intel_device_info(dev, /): "onetrace_enabled", "intel_device_info", "ExecutionPlacementError", + "SequentialOrderManager", ] diff --git a/dpctl/utils/src/order_keeper.cpp b/dpctl/utils/src/order_keeper.cpp new file mode 100644 index 0000000000..f54b21780c --- /dev/null +++ b/dpctl/utils/src/order_keeper.cpp @@ -0,0 +1,29 @@ +#include "dpctl4pybind11.hpp" +#include +#include + +#include "sequential_order_keeper.hpp" +#include + +PYBIND11_MODULE(_seq_order_keeper, m) +{ + py::class_(m, "_OrderManager") + .def(py::init()) + .def(py::init<>()) + .def(py::init()) + .def("get_num_submitted_events", + &SequentialOrder::get_num_submitted_events) + .def("get_num_host_task_events", + &SequentialOrder::get_num_host_task_events) + .def("get_submitted_events", &SequentialOrder::get_submitted_events) + .def("get_host_task_events", &SequentialOrder::get_host_task_events) + .def("add_to_both_events", &SequentialOrder::add_to_both_events) + .def("add_vector_to_both_events", + &SequentialOrder::add_vector_to_both_events) + .def("add_to_host_task_events", + &SequentialOrder::add_to_host_task_events) + .def("add_to_submitted_events", + &SequentialOrder::add_to_submitted_events) + .def("wait", &SequentialOrder::wait, + py::call_guard()); +} diff --git a/dpctl/utils/src/sequential_order_keeper.hpp b/dpctl/utils/src/sequential_order_keeper.hpp new file mode 100644 index 0000000000..0acc4a8b87 --- /dev/null +++ b/dpctl/utils/src/sequential_order_keeper.hpp @@ -0,0 +1,181 @@ +#pragma once +#include + +#include +#include + +namespace +{ +bool is_event_complete(const sycl::event &e) +{ + constexpr auto exec_complete = sycl::info::event_command_status::complete; + + const auto status = + e.get_info(); + return (status == exec_complete); +} +} // namespace + +class SequentialOrder +{ +private: + std::vector host_task_events; + std::vector submitted_events; + + void prune_complete() + { + const auto &ht_it = + std::remove_if(host_task_events.begin(), host_task_events.end(), + is_event_complete); + host_task_events.erase(ht_it, 
host_task_events.end()); + + const auto &sub_it = + std::remove_if(submitted_events.begin(), submitted_events.end(), + is_event_complete); + submitted_events.erase(sub_it, submitted_events.end()); + } + +public: + SequentialOrder() : host_task_events{}, submitted_events{} {} + SequentialOrder(size_t n) : host_task_events{}, submitted_events{} + { + host_task_events.reserve(n); + submitted_events.reserve(n); + } + + SequentialOrder(const SequentialOrder &other) + : host_task_events(other.host_task_events), + submitted_events(other.submitted_events) + { + prune_complete(); + } + SequentialOrder(SequentialOrder &&other) + : host_task_events{}, submitted_events{} + { + host_task_events = std::move(other.host_task_events); + submitted_events = std::move(other.submitted_events); + prune_complete(); + } + + SequentialOrder &operator=(const SequentialOrder &other) + { + host_task_events = other.host_task_events; + submitted_events = other.submitted_events; + + prune_complete(); + return *this; + } + + SequentialOrder &operator=(SequentialOrder &&other) + { + if (this != &other) { + host_task_events = std::move(other.host_task_events); + submitted_events = std::move(other.submitted_events); + prune_complete(); + } + return *this; + } + + size_t get_num_submitted_events() const + { + return submitted_events.size(); + } + + const std::vector &get_host_task_events() + { + prune_complete(); + return host_task_events; + } + + /* + const std::vector & get_host_task_events() const { + return host_task_events; + } + */ + + size_t get_num_host_task_events() const + { + return host_task_events.size(); + } + + const std::vector &get_submitted_events() + { + prune_complete(); + return submitted_events; + } + + /* + const std::vector & get_submitted_events() const { + return submitted_events; + } + */ + + void add_to_both_events(const sycl::event &ht_ev, + const sycl::event &comp_ev) + { + prune_complete(); + if (!is_event_complete(ht_ev)) + host_task_events.push_back(ht_ev); + if (!is_event_complete(comp_ev)) + submitted_events.push_back(comp_ev); + } + + void add_vector_to_both_events(const std::vector &ht_evs, + const std::vector &comp_evs) + { + prune_complete(); + for (const auto &e : ht_evs) { + if (!is_event_complete(e)) + host_task_events.push_back(e); + } + for (const auto &e : comp_evs) { + if (!is_event_complete(e)) + submitted_events.push_back(e); + } + } + + void add_to_host_task_events(const sycl::event &ht_ev) + { + prune_complete(); + if (!is_event_complete(ht_ev)) { + host_task_events.push_back(ht_ev); + } + } + + void add_to_submitted_events(const sycl::event &comp_ev) + { + prune_complete(); + if (!is_event_complete(comp_ev)) { + submitted_events.push_back(comp_ev); + } + } + + template + void add_list_to_host_task_events(const sycl::event (&ht_events)[num]) + { + prune_complete(); + for (size_t i = 0; i < num; ++i) { + const auto &e = ht_events[i]; + if (!is_event_complete(e)) + host_task_events.push_back(e); + } + } + + template + void add_list_to_submitted_events(const sycl::event (&comp_events)[num]) + { + prune_complete(); + for (size_t i = 0; i < num; ++i) { + const auto &e = comp_events[i]; + if (!is_event_complete(e)) + submitted_events.push_back(e); + } + } + + void wait() + { + sycl::event::wait(submitted_events); + sycl::event::wait(host_task_events); + prune_complete(); + } +}; From b78698ed51719f004fd2b9e1df990c276c98620b Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 8 May 2024 10:39:01 -0500 Subject: [PATCH 09/25] Transition tensor to use SequentialOrderManager 
Remove pervasive use of SyclEvent.wait in favor of using SequentialOrderManager to maintain sequential order semantics via ordering of submitted tasks using events. --- dpctl/tensor/_accumulation.py | 38 ++--- dpctl/tensor/_clip.py | 85 +++++++----- dpctl/tensor/_copy_utils.py | 90 ++++++++---- dpctl/tensor/_ctors.py | 87 +++++++----- dpctl/tensor/_elementwise_common.py | 94 ++++++++----- dpctl/tensor/_indexing_functions.py | 27 +++- dpctl/tensor/_linear_algebra_functions.py | 162 +++++++++++----------- dpctl/tensor/_manipulation_functions.py | 111 +++++++++------ dpctl/tensor/_print.py | 9 +- dpctl/tensor/_reduction.py | 83 +++++------ dpctl/tensor/_reshape.py | 14 +- dpctl/tensor/_search_functions.py | 26 ++-- dpctl/tensor/_searchsorted.py | 51 +++---- dpctl/tensor/_set_functions.py | 162 ++++++++++++---------- dpctl/tensor/_sorting.py | 46 +++--- dpctl/tensor/_statistical_functions.py | 84 +++++------ dpctl/tensor/_usmarray.pyx | 2 + dpctl/tensor/_utility_functions.py | 13 +- 18 files changed, 683 insertions(+), 501 deletions(-) diff --git a/dpctl/tensor/_accumulation.py b/dpctl/tensor/_accumulation.py index 64ab2ea8c8..be81d5d333 100644 --- a/dpctl/tensor/_accumulation.py +++ b/dpctl/tensor/_accumulation.py @@ -25,7 +25,7 @@ _default_accumulation_dtype_fp_types, _to_device_supported_dtype, ) -from dpctl.utils import ExecutionPlacementError +from dpctl.utils import ExecutionPlacementError, SequentialOrderManager def _accumulate_common( @@ -125,7 +125,9 @@ def _accumulate_common( if a1 != nd: out = dpt.permute_dims(out, perm) - host_tasks_list = [] + final_ev = dpctl.SyclEvent() + _manager = SequentialOrderManager + depends = _manager.submitted_events if implemented_types: if not include_initial: ht_e, acc_ev = _accumulate_fn( @@ -133,32 +135,32 @@ def _accumulate_common( trailing_dims_to_accumulate=1, dst=out, sycl_queue=q, + depends=depends, ) else: ht_e, acc_ev = _accumulate_include_initial_fn( - src=arr, - dst=out, - sycl_queue=q, + src=arr, dst=out, sycl_queue=q, depends=depends ) - host_tasks_list.append(ht_e) + _manager.add_event_pair(ht_e, acc_ev) if not (orig_out is None or out is orig_out): # Copy the out data from temporary buffer to original memory - ht_e_cpy, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_e_cpy, acc_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=q, depends=[acc_ev] ) - host_tasks_list.append(ht_e_cpy) + _manager.add_event_pair(ht_e_cpy, acc_ev) out = orig_out + final_ev = acc_ev else: if _dtype_supported(res_dt, res_dt): tmp = dpt.empty( arr.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q ) ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( - src=arr, dst=tmp, sycl_queue=q + src=arr, dst=tmp, sycl_queue=q, depends=depends ) - host_tasks_list.append(ht_e_cpy) + _manager.add_event_pair(ht_e_cpy, cpy_e) if not include_initial: - ht_e, acc_ev = _accumulate_fn( + ht_e, final_ev = _accumulate_fn( src=tmp, trailing_dims_to_accumulate=1, dst=out, @@ -166,26 +168,27 @@ def _accumulate_common( depends=[cpy_e], ) else: - ht_e, acc_ev = _accumulate_include_initial_fn( + ht_e, final_ev = _accumulate_include_initial_fn( src=tmp, dst=out, sycl_queue=q, depends=[cpy_e], ) + _manager.add_event_pair(ht_e, final_ev) else: buf_dt = _default_accumulation_type_fn(inp_dt, q) tmp = dpt.empty( arr.shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q ) ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( - src=arr, dst=tmp, sycl_queue=q + src=arr, dst=tmp, sycl_queue=q, depends=depends ) + _manager.add_event_pair(ht_e_cpy, cpy_e) 
tmp_res = dpt.empty( res_sh, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q ) if a1 != nd: tmp_res = dpt.permute_dims(tmp_res, perm) - host_tasks_list.append(ht_e_cpy) if not include_initial: ht_e, a_e = _accumulate_fn( src=tmp, @@ -201,18 +204,17 @@ def _accumulate_common( sycl_queue=q, depends=[cpy_e], ) - host_tasks_list.append(ht_e) - ht_e_cpy2, _ = ti._copy_usm_ndarray_into_usm_ndarray( + _manager.add_event_pair(ht_e, a_e) + ht_e_cpy2, final_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=tmp_res, dst=out, sycl_queue=q, depends=[a_e] ) - host_tasks_list.append(ht_e_cpy2) + _manager.add_event_pair(ht_e_cpy2, final_ev) if appended_axis: out = dpt.squeeze(out) if a1 != nd: inv_perm = sorted(range(nd), key=lambda d: perm[d]) out = dpt.permute_dims(out, inv_perm) - dpctl.SyclEvent.wait_for(host_tasks_list) return out diff --git a/dpctl/tensor/_clip.py b/dpctl/tensor/_clip.py index dc54e46bb0..43bbb29aec 100644 --- a/dpctl/tensor/_clip.py +++ b/dpctl/tensor/_clip.py @@ -31,7 +31,7 @@ ) from dpctl.tensor._manipulation_functions import _broadcast_shape_impl from dpctl.tensor._type_utils import _can_cast, _to_device_supported_dtype -from dpctl.utils import ExecutionPlacementError +from dpctl.utils import ExecutionPlacementError, SequentialOrderManager from ._type_utils import ( WeakComplexType, @@ -299,18 +299,21 @@ def _clip_none(x, val, out, order, _binary_fn): x = dpt.broadcast_to(x, res_shape) if val_ary.shape != res_shape: val_ary = dpt.broadcast_to(val_ary, res_shape) + _manager = SequentialOrderManager + dep_evs = _manager.submitted_events ht_binary_ev, binary_ev = _binary_fn( - src1=x, src2=val_ary, dst=out, sycl_queue=exec_q + src1=x, src2=val_ary, dst=out, sycl_queue=exec_q, depends=dep_evs ) + _manager.add_event_pair(ht_binary_ev, binary_ev) if not (orig_out is None or orig_out is out): # Copy the out data from temporary buffer to original memory - ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_copy_out_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=exec_q, depends=[binary_ev], ) - ht_copy_out_ev.wait() + _manager.add_event_pair(ht_copy_out_ev, copy_ev) out = orig_out ht_binary_ev.wait() return out @@ -319,9 +322,12 @@ def _clip_none(x, val, out, order, _binary_fn): buf = _empty_like_orderK(val_ary, res_dt) else: buf = dpt.empty_like(val_ary, dtype=res_dt, order=order) + _manager = SequentialOrderManager + dep_evs = _manager.submitted_events ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=val_ary, dst=buf, sycl_queue=exec_q + src=val_ary, dst=buf, sycl_queue=exec_q, depends=dep_evs ) + _manager.add_event_pair(ht_copy_ev, copy_ev) if out is None: if order == "K": out = _empty_like_pair_orderK( @@ -346,18 +352,17 @@ def _clip_none(x, val, out, order, _binary_fn): sycl_queue=exec_q, depends=[copy_ev], ) + _manager.add_event_pair(ht_binary_ev, binary_ev) if not (orig_out is None or orig_out is out): # Copy the out data from temporary buffer to original memory - ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=exec_q, depends=[binary_ev], ) - ht_copy_out_ev.wait() + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) out = orig_out - ht_copy_ev.wait() - ht_binary_ev.wait() return out @@ -444,20 +449,22 @@ def clip(x, /, min=None, max=None, out=None, order="K"): else: out = dpt.empty_like(x, order=order) + _manager = SequentialOrderManager + dep_evs = _manager.submitted_events ht_copy_ev, copy_ev = 
ti._copy_usm_ndarray_into_usm_ndarray( - src=x, dst=out, sycl_queue=exec_q + src=x, dst=out, sycl_queue=exec_q, depends=dep_evs ) + _manager.add_event_pair(ht_copy_ev, copy_ev) if not (orig_out is None or orig_out is out): # Copy the out data from temporary buffer to original memory - ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=exec_q, depends=[copy_ev], ) - ht_copy_out_ev.wait() + _manager.add_event_pair(ht_copy_ev, cpy_ev) out = orig_out - ht_copy_ev.wait() return out elif max is None: return _clip_none(x, min, out, order, tei._maximum) @@ -665,20 +672,27 @@ def clip(x, /, min=None, max=None, out=None, order="K"): a_min = dpt.broadcast_to(a_min, res_shape) if a_max.shape != res_shape: a_max = dpt.broadcast_to(a_max, res_shape) + _manager = SequentialOrderManager + dep_ev = _manager.submitted_events ht_binary_ev, binary_ev = ti._clip( - src=x, min=a_min, max=a_max, dst=out, sycl_queue=exec_q + src=x, + min=a_min, + max=a_max, + dst=out, + sycl_queue=exec_q, + depends=dep_ev, ) + _manager.add_event_pair(ht_binary_ev, binary_ev) if not (orig_out is None or orig_out is out): # Copy the out data from temporary buffer to original memory - ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=exec_q, depends=[binary_ev], ) - ht_copy_out_ev.wait() + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) out = orig_out - ht_binary_ev.wait() return out elif buf1_dt is None: @@ -686,9 +700,12 @@ def clip(x, /, min=None, max=None, out=None, order="K"): buf2 = _empty_like_orderK(a_max, buf2_dt) else: buf2 = dpt.empty_like(a_max, dtype=buf2_dt, order=order) + _manager = SequentialOrderManager + dep_ev = _manager.submitted_events ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=a_max, dst=buf2, sycl_queue=exec_q + src=a_max, dst=buf2, sycl_queue=exec_q, depends=dep_ev ) + _manager.add_event_pair(ht_copy_ev, copy_ev) if out is None: if order == "K": out = _empty_like_triple_orderK( @@ -721,18 +738,17 @@ def clip(x, /, min=None, max=None, out=None, order="K"): sycl_queue=exec_q, depends=[copy_ev], ) + _manager.add_event_pair(ht_binary_ev, binary_ev) if not (orig_out is None or orig_out is out): # Copy the out data from temporary buffer to original memory - ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=exec_q, depends=[binary_ev], ) - ht_copy_out_ev.wait() + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) out = orig_out - ht_copy_ev.wait() - ht_binary_ev.wait() return out elif buf2_dt is None: @@ -740,9 +756,12 @@ def clip(x, /, min=None, max=None, out=None, order="K"): buf1 = _empty_like_orderK(a_min, buf1_dt) else: buf1 = dpt.empty_like(a_min, dtype=buf1_dt, order=order) + _manager = SequentialOrderManager + dep_ev = _manager.submitted_events ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=a_min, dst=buf1, sycl_queue=exec_q + src=a_min, dst=buf1, sycl_queue=exec_q, depends=dep_ev ) + _manager.add_event_pair(ht_copy_ev, copy_ev) if out is None: if order == "K": out = _empty_like_triple_orderK( @@ -775,18 +794,17 @@ def clip(x, /, min=None, max=None, out=None, order="K"): sycl_queue=exec_q, depends=[copy_ev], ) + _manager.add_event_pair(ht_binary_ev, binary_ev) if not (orig_out is None or orig_out is out): # Copy the out data from temporary buffer 
to original memory - ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=exec_q, depends=[binary_ev], ) - ht_copy_out_ev.wait() + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) out = orig_out - ht_copy_ev.wait() - ht_binary_ev.wait() return out if order == "K": @@ -806,16 +824,21 @@ def clip(x, /, min=None, max=None, out=None, order="K"): buf1 = _empty_like_orderK(a_min, buf1_dt) else: buf1 = dpt.empty_like(a_min, dtype=buf1_dt, order=order) + + _manager = SequentialOrderManager + dep_evs = _manager.submitted_events ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=a_min, dst=buf1, sycl_queue=exec_q + src=a_min, dst=buf1, sycl_queue=exec_q, depends=dep_evs ) + _manager.add_event_pair(ht_copy1_ev, copy1_ev) if order == "K": buf2 = _empty_like_orderK(a_max, buf2_dt) else: buf2 = dpt.empty_like(a_max, dtype=buf2_dt, order=order) ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=a_max, dst=buf2, sycl_queue=exec_q + src=a_max, dst=buf2, sycl_queue=exec_q, depends=dep_evs ) + _manager.add_event_pair(ht_copy2_ev, copy2_ev) if out is None: if order == "K": out = _empty_like_triple_orderK( @@ -833,7 +856,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"): x = dpt.broadcast_to(x, res_shape) buf1 = dpt.broadcast_to(buf1, res_shape) buf2 = dpt.broadcast_to(buf2, res_shape) - ht_, _ = ti._clip( + ht_, clip_ev = ti._clip( src=x, min=buf1, max=buf2, @@ -841,5 +864,5 @@ def clip(x, /, min=None, max=None, out=None, order="K"): sycl_queue=exec_q, depends=[copy1_ev, copy2_ev], ) - dpctl.SyclEvent.wait_for([ht_copy1_ev, ht_copy2_ev, ht_]) + _manager.add_event_pair(ht_, clip_ev) return out diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py index bfd182e778..5d74bd0939 100644 --- a/dpctl/tensor/_copy_utils.py +++ b/dpctl/tensor/_copy_utils.py @@ -33,7 +33,7 @@ ":class:`dpctl.tensor.usm_ndarray`." 
) -int32_t_max = 2147483648 +int32_t_max = 1 + np.iinfo(np.int32).max def _copy_to_numpy(ary): @@ -41,11 +41,13 @@ def _copy_to_numpy(ary): raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(ary)}") nb = ary.usm_data.nbytes hh = dpm.MemoryUSMHost(nb, queue=ary.sycl_queue) - hh.copy_from_device(ary.usm_data) h = np.ndarray(nb, dtype="u1", buffer=hh).view(ary.dtype) itsz = ary.itemsize strides_bytes = tuple(si * itsz for si in ary.strides) offset = ary.__sycl_usm_array_interface__.get("offset", 0) * itsz + # ensure that content of ary.usm_data is final + dpctl.utils.SequentialOrderManager.wait() + hh.copy_from_device(ary.usm_data) return np.ndarray( ary.shape, dtype=ary.dtype, @@ -103,8 +105,11 @@ def _copy_from_numpy_into(dst, np_ary): src_ary = src_ary.astype(np.float32) elif src_ary_dt_c == "D": src_ary = src_ary.astype(np.complex64) + _manager = dpctl.utils.SequentialOrderManager + dep_ev = _manager.submitted_events + # synchronizing call ti._copy_numpy_ndarray_into_usm_ndarray( - src=src_ary, dst=dst, sycl_queue=copy_q + src=src_ary, dst=dst, sycl_queue=copy_q, depends=dep_ev ) @@ -203,14 +208,16 @@ def _copy_overlapping(dst, src): order="C", buffer_ctor_kwargs={"queue": q}, ) + _manager = dpctl.utils.SequentialOrderManager + dep_evs = _manager.submitted_events hcp1, cp1 = ti._copy_usm_ndarray_into_usm_ndarray( - src=src, dst=tmp, sycl_queue=q + src=src, dst=tmp, sycl_queue=q, depends=dep_evs ) - hcp2, _ = ti._copy_usm_ndarray_into_usm_ndarray( + _manager.add_event_pair(hcp1, cp1) + hcp2, cp2 = ti._copy_usm_ndarray_into_usm_ndarray( src=tmp, dst=dst, sycl_queue=q, depends=[cp1] ) - hcp2.wait() - hcp1.wait() + _manager.add_event_pair(hcp2, cp2) def _copy_same_shape(dst, src): @@ -225,10 +232,12 @@ def _copy_same_shape(dst, src): _copy_overlapping(src=src, dst=dst) return - hev, _ = ti._copy_usm_ndarray_into_usm_ndarray( - src=src, dst=dst, sycl_queue=dst.sycl_queue + _manager = dpctl.utils.SequentialOrderManager + dep_evs = _manager.submitted_events + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=src, dst=dst, sycl_queue=dst.sycl_queue, depends=dep_evs ) - hev.wait() + _manager.add_event_pair(hev, cpy_ev) if hasattr(np, "broadcast_shapes"): @@ -715,22 +724,27 @@ def _extract_impl(ary, ary_mask, axis=0): cumsum_dt = dpt.int32 if mask_nelems < int32_t_max else dpt.int64 cumsum = dpt.empty(mask_nelems, dtype=cumsum_dt, device=ary_mask.device) exec_q = cumsum.sycl_queue - mask_count = ti.mask_positions(ary_mask, cumsum, sycl_queue=exec_q) + _manager = dpctl.utils.SequentialOrderManager + dep_evs = _manager.submitted_events + mask_count = ti.mask_positions( + ary_mask, cumsum, sycl_queue=exec_q, depends=dep_evs + ) dst_shape = ary.shape[:pp] + (mask_count,) + ary.shape[pp + mask_nd :] dst = dpt.empty( dst_shape, dtype=ary.dtype, usm_type=ary.usm_type, device=ary.device ) if dst.size == 0: return dst - hev, _ = ti._extract( + hev, ev = ti._extract( src=ary, cumsum=cumsum, axis_start=pp, axis_end=pp + mask_nd, dst=dst, sycl_queue=exec_q, + depends=dep_evs, ) - hev.wait() + _manager.add_event_pair(hev, ev) return dst @@ -746,7 +760,11 @@ def _nonzero_impl(ary): cumsum = dpt.empty( mask_nelems, dtype=cumsum_dt, sycl_queue=exec_q, order="C" ) - mask_count = ti.mask_positions(ary, cumsum, sycl_queue=exec_q) + _manager = dpctl.utils.SequentialOrderManager + dep_evs = _manager.submitted_events + mask_count = ti.mask_positions( + ary, cumsum, sycl_queue=exec_q, depends=dep_evs + ) indexes_dt = ti.default_device_index_type(exec_q.sycl_device) indexes = dpt.empty( (ary.ndim, 
mask_count), @@ -755,9 +773,9 @@ def _nonzero_impl(ary): sycl_queue=exec_q, order="C", ) - hev, _ = ti._nonzero(cumsum, indexes, ary.shape, exec_q) + hev, nz_ev = ti._nonzero(cumsum, indexes, ary.shape, exec_q) res = tuple(indexes[i, :] for i in range(ary.ndim)) - hev.wait() + _manager.add_event_pair(hev, nz_ev) return res @@ -819,10 +837,18 @@ def _take_multi_index(ary, inds, p): res = dpt.empty( res_shape, dtype=ary.dtype, usm_type=res_usm_type, sycl_queue=exec_q ) - hev, _ = ti._take( - src=ary, ind=inds, dst=res, axis_start=p, mode=0, sycl_queue=exec_q + _manager = dpctl.utils.SequentialOrderManager + dep_ev = _manager.submitted_events + hev, take_ev = ti._take( + src=ary, + ind=inds, + dst=res, + axis_start=p, + mode=0, + sycl_queue=exec_q, + depends=dep_ev, ) - hev.wait() + _manager.add_event_pair(hev, take_ev) return res @@ -864,7 +890,11 @@ def _place_impl(ary, ary_mask, vals, axis=0): cumsum_dt = dpt.int32 if mask_nelems < int32_t_max else dpt.int64 cumsum = dpt.empty(mask_nelems, dtype=cumsum_dt, device=ary_mask.device) exec_q = cumsum.sycl_queue - mask_count = ti.mask_positions(ary_mask, cumsum, sycl_queue=exec_q) + _manager = dpctl.utils.SequentialOrderManager + dep_ev = _manager.submitted_events + mask_count = ti.mask_positions( + ary_mask, cumsum, sycl_queue=exec_q, depends=dep_ev + ) expected_vals_shape = ( ary.shape[:pp] + (mask_count,) + ary.shape[pp + mask_nd :] ) @@ -873,15 +903,17 @@ def _place_impl(ary, ary_mask, vals, axis=0): else: rhs = dpt.astype(vals, ary.dtype) rhs = dpt.broadcast_to(rhs, expected_vals_shape) - hev, _ = ti._place( + dep_ev = _manager.submitted_events + hev, pl_ev = ti._place( dst=ary, cumsum=cumsum, axis_start=pp, axis_end=pp + mask_nd, rhs=rhs, sycl_queue=exec_q, + depends=dep_ev, ) - hev.wait() + _manager.add_event_pair(hev, pl_ev) return @@ -958,8 +990,16 @@ def _put_multi_index(ary, inds, p, vals): else: rhs = dpt.astype(vals, ary.dtype) rhs = dpt.broadcast_to(rhs, expected_vals_shape) - hev, _ = ti._put( - dst=ary, ind=inds, val=rhs, axis_start=p, mode=0, sycl_queue=exec_q + _manager = dpctl.utils.SequentialOrderManager + dep_ev = _manager.submitted_events + hev, put_ev = ti._put( + dst=ary, + ind=inds, + val=rhs, + axis_start=p, + mode=0, + sycl_queue=exec_q, + depends=dep_ev, ) - hev.wait() + _manager.add_event_pair(hev, put_ev) return diff --git a/dpctl/tensor/_ctors.py b/dpctl/tensor/_ctors.py index e61a73c7f3..740253b691 100644 --- a/dpctl/tensor/_ctors.py +++ b/dpctl/tensor/_ctors.py @@ -159,10 +159,12 @@ def _asarray_from_usm_ndarray( ) eq = dpctl.utils.get_execution_queue([usm_ndary.sycl_queue, copy_q]) if eq is not None: - hev, _ = ti._copy_usm_ndarray_into_usm_ndarray( - src=usm_ndary, dst=res, sycl_queue=eq + _manager = dpctl.utils.SequentialOrderManager + dep_evs = _manager.submitted_events + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=usm_ndary, dst=res, sycl_queue=eq, depends=dep_evs ) - hev.wait() + _manager.add_event_pair(hev, cpy_ev) else: tmp = dpt.asnumpy(usm_ndary) res[...] 
= tmp @@ -311,25 +313,27 @@ def _usm_types_walker(o, usm_types_list): raise TypeError -def _device_copy_walker(seq_o, res, events): +def _device_copy_walker(seq_o, res, _manager): if isinstance(seq_o, dpt.usm_ndarray): exec_q = res.sycl_queue - ht_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( - src=seq_o, dst=res, sycl_queue=exec_q + deps = _manager.submitted_events + ht_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=seq_o, dst=res, sycl_queue=exec_q, depends=deps ) - events.append(ht_ev) + _manager.add_event_pair(ht_ev, cpy_ev) return if hasattr(seq_o, "__sycl_usm_array_interface__"): usm_ar = _usm_ndarray_from_suai(seq_o) exec_q = res.sycl_queue - ht_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( - src=usm_ar, dst=res, sycl_queue=exec_q + deps = _manager.submitted_events + ht_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=usm_ar, dst=res, sycl_queue=exec_q, depends=deps ) - events.append(ht_ev) + _manager.add_event_pair(ht_ev, cpy_ev) return if isinstance(seq_o, (list, tuple)): for i, el in enumerate(seq_o): - _device_copy_walker(el, res[i], events) + _device_copy_walker(el, res[i], _manager) return raise TypeError @@ -411,9 +415,8 @@ def _asarray_from_seq( sycl_queue=alloc_q, order=order, ) - ht_events = [] - _device_copy_walker(seq_obj, res, ht_events) - dpctl.SyclEvent.wait_for(ht_events) + _manager = dpctl.utils.SequentialOrderManager + _device_copy_walker(seq_obj, res, _manager) return res else: res = dpt.empty( @@ -851,8 +854,10 @@ def arange( else: _step = sc_ty(1) _start = _first - hev, _ = ti._linspace_step(_start, _step, res, sycl_queue) - hev.wait() + _manager = dpctl.utils.SequentialOrderManager + # populating newly allocated array, no task dependencies + hev, lin_ev = ti._linspace_step(_start, _step, res, sycl_queue) + _manager.add_event_pair(hev, lin_ev) if is_bool: res_out = dpt.usm_ndarray( (sh,), @@ -861,8 +866,11 @@ def arange( order="C", buffer_ctor_kwargs={"queue": sycl_queue}, ) - res_out[:] = res - res = res_out + hev_cpy, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=res, dst=res_out, sycl_queue=sycl_queue, depends=[lin_ev] + ) + _manager.add_event_pair(hev_cpy, cpy_ev) + return res_out return res @@ -927,6 +935,7 @@ def zeros( order=order, buffer_ctor_kwargs={"queue": sycl_queue}, ) + # FIXME: replace with asynchronous call to ti res.usm_data.memset() return res @@ -992,8 +1001,10 @@ def ones( order=order, buffer_ctor_kwargs={"queue": sycl_queue}, ) - hev, _ = ti._full_usm_ndarray(1, res, sycl_queue) - hev.wait() + _manager = dpctl.utils.SequentialOrderManager + # populating new allocation, no dependent events + hev, full_ev = ti._full_usm_ndarray(1, res, sycl_queue) + _manager.add_event_pair(hev, full_ev) return res @@ -1089,8 +1100,10 @@ def full( elif fill_value_type is int and np.issubdtype(dtype, np.integer): fill_value = _to_scalar(fill_value, dtype) - hev, _ = ti._full_usm_ndarray(fill_value, res, sycl_queue) - hev.wait() + _manager = dpctl.utils.SequentialOrderManager + # populating new allocation, no dependent events + hev, full_ev = ti._full_usm_ndarray(fill_value, res, sycl_queue) + _manager.add_event_pair(hev, full_ev) return res @@ -1467,10 +1480,11 @@ def linspace( start = float(start) stop = float(stop) res = dpt.empty(num, dtype=dt, usm_type=usm_type, sycl_queue=sycl_queue) - hev, _ = ti._linspace_affine( + _manager = dpctl.utils.SequentialOrderManager + hev, la_ev = ti._linspace_affine( start, stop, dst=res, include_endpoint=endpoint, sycl_queue=sycl_queue ) - hev.wait() + _manager.add_event_pair(hev, la_ev) return 
res if int_dt is None else dpt.astype(res, int_dt) @@ -1564,8 +1578,9 @@ def eye( buffer_ctor_kwargs={"queue": sycl_queue}, ) if n_rows != 0 and n_cols != 0: - hev, _ = ti._eye(k, dst=res, sycl_queue=sycl_queue) - hev.wait() + _manager = dpctl.utils.SequentialOrderManager + hev, eye_ev = ti._eye(k, dst=res, sycl_queue=sycl_queue) + _manager.add_event_pair(hev, eye_ev) return res @@ -1615,10 +1630,11 @@ def tril(x, /, *, k=0): usm_type=x.usm_type, sycl_queue=q, ) - hev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + _manager = dpctl.utils.SequentialOrderManager + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=x, dst=res, sycl_queue=q ) - hev.wait() + _manager.add_event_pair(hev, cpy_ev) elif k < -shape[nd - 2]: res = dpt.zeros( x.shape, @@ -1635,8 +1651,9 @@ def tril(x, /, *, k=0): usm_type=x.usm_type, sycl_queue=q, ) - hev, _ = ti._tril(src=x, dst=res, k=k, sycl_queue=q) - hev.wait() + _manager = dpctl.utils.SequentialOrderManager + hev, tril_ev = ti._tril(src=x, dst=res, k=k, sycl_queue=q) + _manager.add_event_pair(hev, tril_ev) return res @@ -1695,10 +1712,11 @@ def triu(x, /, *, k=0): usm_type=x.usm_type, sycl_queue=q, ) - hev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + _manager = dpctl.utils.SequentialOrderManager + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=x, dst=res, sycl_queue=q ) - hev.wait() + _manager.add_event_pair(hev, cpy_ev) else: res = dpt.empty( x.shape, @@ -1707,8 +1725,9 @@ def triu(x, /, *, k=0): usm_type=x.usm_type, sycl_queue=q, ) - hev, _ = ti._triu(src=x, dst=res, k=k, sycl_queue=q) - hev.wait() + _manager = dpctl.utils.SequentialOrderManager + hev, triu_ev = ti._triu(src=x, dst=res, k=k, sycl_queue=q) + _manager.add_event_pair(hev, triu_ev) return res diff --git a/dpctl/tensor/_elementwise_common.py b/dpctl/tensor/_elementwise_common.py index 32da1fbb02..25e485ee17 100644 --- a/dpctl/tensor/_elementwise_common.py +++ b/dpctl/tensor/_elementwise_common.py @@ -24,7 +24,7 @@ import dpctl.tensor._tensor_impl as ti from dpctl.tensor._manipulation_functions import _broadcast_shape_impl from dpctl.tensor._usmarray import _is_object_with_buffer_protocol as _is_buffer -from dpctl.utils import ExecutionPlacementError +from dpctl.utils import ExecutionPlacementError, SequentialOrderManager from ._copy_utils import _empty_like_orderK, _empty_like_pair_orderK from ._type_utils import ( @@ -236,6 +236,7 @@ def __call__(self, x, /, *, out=None, order="K"): ) exec_q = x.sycl_queue + _manager = SequentialOrderManager if buf_dt is None: if out is None: if order == "K": @@ -245,17 +246,20 @@ def __call__(self, x, /, *, out=None, order="K"): order = "F" if x.flags.f_contiguous else "C" out = dpt.empty_like(x, dtype=res_dt, order=order) - ht_unary_ev, unary_ev = self.unary_fn_(x, out, sycl_queue=exec_q) + dep_evs = _manager.submitted_events + ht_unary_ev, unary_ev = self.unary_fn_( + x, out, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_unary_ev, unary_ev) if not (orig_out is None or orig_out is out): # Copy the out data from temporary buffer to original memory - ht_copy_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_copy_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=exec_q, depends=[unary_ev] ) - ht_copy_ev.wait() + _manager.add_event_pair(ht_copy_ev, cpy_ev) out = orig_out - ht_unary_ev.wait() return out if order == "K": @@ -265,18 +269,21 @@ def __call__(self, x, /, *, out=None, order="K"): order = "F" if x.flags.f_contiguous else "C" buf = dpt.empty_like(x, dtype=buf_dt, order=order) + 
dep_evs = _manager.submitted_events ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=x, dst=buf, sycl_queue=exec_q + src=x, dst=buf, sycl_queue=exec_q, depends=dep_evs ) + _manager.add_event_pair(ht_copy_ev, copy_ev) if out is None: if order == "K": out = _empty_like_orderK(buf, res_dt) else: out = dpt.empty_like(buf, dtype=res_dt, order=order) - ht, _ = self.unary_fn_(buf, out, sycl_queue=exec_q, depends=[copy_ev]) - ht_copy_ev.wait() - ht.wait() + ht, uf_ev = self.unary_fn_( + buf, out, sycl_queue=exec_q, depends=[copy_ev] + ) + _manager.add_event_pair(ht, uf_ev) return out @@ -625,6 +632,7 @@ def __call__(self, o1, o2, /, *, out=None, order="K"): ) orig_out = out + _manager = SequentialOrderManager if out is not None: if not isinstance(out, dpt.usm_ndarray): raise TypeError( @@ -676,28 +684,36 @@ def __call__(self, o1, o2, /, *, out=None, order="K"): if buf2_dt is None: if src2.shape != res_shape: src2 = dpt.broadcast_to(src2, res_shape) - ht_, _ = self.binary_inplace_fn_( - lhs=o1, rhs=src2, sycl_queue=exec_q + dep_evs = _manager.submitted_events + ht_, comp_ev = self.binary_inplace_fn_( + lhs=o1, + rhs=src2, + sycl_queue=exec_q, + depends=dep_evs, ) - ht_.wait() + _manager.add_event_pair(ht_, comp_ev) else: buf2 = dpt.empty_like(src2, dtype=buf2_dt) + dep_evs = _manager.submitted_events ( ht_copy_ev, copy_ev, ) = ti._copy_usm_ndarray_into_usm_ndarray( - src=src2, dst=buf2, sycl_queue=exec_q + src=src2, + dst=buf2, + sycl_queue=exec_q, + depends=dep_evs, ) + _manager.add_event_pair(ht_copy_ev, copy_ev) buf2 = dpt.broadcast_to(buf2, res_shape) - ht_, _ = self.binary_inplace_fn_( + ht_, bf_ev = self.binary_inplace_fn_( lhs=o1, rhs=buf2, sycl_queue=exec_q, depends=[copy_ev], ) - ht_copy_ev.wait() - ht_.wait() + _manager.add_event_pair(ht_, bf_ev) return out @@ -751,29 +767,36 @@ def __call__(self, o1, o2, /, *, out=None, order="K"): src1 = dpt.broadcast_to(src1, res_shape) if src2.shape != res_shape: src2 = dpt.broadcast_to(src2, res_shape) + deps_ev = _manager.submitted_events ht_binary_ev, binary_ev = self.binary_fn_( - src1=src1, src2=src2, dst=out, sycl_queue=exec_q + src1=src1, + src2=src2, + dst=out, + sycl_queue=exec_q, + depends=deps_ev, ) + _manager.add_event_pair(ht_binary_ev, binary_ev) if not (orig_out is None or orig_out is out): # Copy the out data from temporary buffer to original memory - ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=exec_q, depends=[binary_ev], ) - ht_copy_out_ev.wait() + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) out = orig_out - ht_binary_ev.wait() return out elif buf1_dt is None: if order == "K": buf2 = _empty_like_orderK(src2, buf2_dt) else: buf2 = dpt.empty_like(src2, dtype=buf2_dt, order=order) + dep_evs = _manager.submitted_events ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=src2, dst=buf2, sycl_queue=exec_q + src=src2, dst=buf2, sycl_queue=exec_q, depends=dep_evs ) + _manager.add_event_pair(ht_copy_ev, copy_ev) if out is None: if order == "K": out = _empty_like_pair_orderK( @@ -798,27 +821,28 @@ def __call__(self, o1, o2, /, *, out=None, order="K"): sycl_queue=exec_q, depends=[copy_ev], ) + _manager.add_event_pair(ht_binary_ev, binary_ev) if not (orig_out is None or orig_out is out): # Copy the out data from temporary buffer to original memory - ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, 
sycl_queue=exec_q, depends=[binary_ev], ) - ht_copy_out_ev.wait() + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) out = orig_out - ht_copy_ev.wait() - ht_binary_ev.wait() return out elif buf2_dt is None: if order == "K": buf1 = _empty_like_orderK(src1, buf1_dt) else: buf1 = dpt.empty_like(src1, dtype=buf1_dt, order=order) + dep_evs = _manager.submitted_events ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=src1, dst=buf1, sycl_queue=exec_q + src=src1, dst=buf1, sycl_queue=exec_q, depends=dep_evs ) + _manager.add_event_pair(ht_copy_ev, copy_ev) if out is None: if order == "K": out = _empty_like_pair_orderK( @@ -843,18 +867,17 @@ def __call__(self, o1, o2, /, *, out=None, order="K"): sycl_queue=exec_q, depends=[copy_ev], ) + _manager.add_event_pair(ht_binary_ev, binary_ev) if not (orig_out is None or orig_out is out): # Copy the out data from temporary buffer to original memory - ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=exec_q, depends=[binary_ev], ) - ht_copy_out_ev.wait() + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) out = orig_out - ht_copy_ev.wait() - ht_binary_ev.wait() return out if order == "K": @@ -866,16 +889,19 @@ def __call__(self, o1, o2, /, *, out=None, order="K"): buf1 = _empty_like_orderK(src1, buf1_dt) else: buf1 = dpt.empty_like(src1, dtype=buf1_dt, order=order) + dep_evs = _manager.submitted_events ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=src1, dst=buf1, sycl_queue=exec_q + src=src1, dst=buf1, sycl_queue=exec_q, depends=dep_evs ) + _manager.add_event_pair(ht_copy1_ev, copy1_ev) if order == "K": buf2 = _empty_like_orderK(src2, buf2_dt) else: buf2 = dpt.empty_like(src2, dtype=buf2_dt, order=order) ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=src2, dst=buf2, sycl_queue=exec_q + src=src2, dst=buf2, sycl_queue=exec_q, depends=dep_evs ) + _manager.add_event_pair(ht_copy2_ev, copy2_ev) if out is None: if order == "K": out = _empty_like_pair_orderK( @@ -892,12 +918,12 @@ def __call__(self, o1, o2, /, *, out=None, order="K"): buf1 = dpt.broadcast_to(buf1, res_shape) buf2 = dpt.broadcast_to(buf2, res_shape) - ht_, _ = self.binary_fn_( + ht_, bf_ev = self.binary_fn_( src1=buf1, src2=buf2, dst=out, sycl_queue=exec_q, depends=[copy1_ev, copy2_ev], ) - dpctl.SyclEvent.wait_for([ht_copy1_ev, ht_copy2_ev, ht_]) + _manager.add_event_pair(ht_, bf_ev) return out diff --git a/dpctl/tensor/_indexing_functions.py b/dpctl/tensor/_indexing_functions.py index 04cbd1bc8d..54b6deea3f 100644 --- a/dpctl/tensor/_indexing_functions.py +++ b/dpctl/tensor/_indexing_functions.py @@ -21,6 +21,7 @@ import dpctl import dpctl.tensor as dpt import dpctl.tensor._tensor_impl as ti +import dpctl.utils from ._copy_utils import _extract_impl, _nonzero_impl @@ -120,8 +121,12 @@ def take(x, indices, /, *, axis=None, mode="wrap"): res_shape, dtype=x.dtype, usm_type=res_usm_type, sycl_queue=exec_q ) - hev, _ = ti._take(x, (indices,), res, axis, mode, sycl_queue=exec_q) - hev.wait() + _manager = dpctl.utils.SequentialOrderManager + deps_ev = _manager.submitted_events + hev, take_ev = ti._take( + x, (indices,), res, axis, mode, sycl_queue=exec_q, depends=deps_ev + ) + _manager.add_event_pair(hev, take_ev) return res @@ -273,8 +278,12 @@ def put_vec_duplicates(vec, ind, vals): rhs = dpt.astype(vals, x.dtype) rhs = dpt.broadcast_to(rhs, val_shape) - hev, _ = ti._put(x, (indices,), rhs, axis, mode, sycl_queue=exec_q) - hev.wait() + 
_manager = dpctl.utils.SequentialOrderManager + deps_ev = _manager.submitted_events + hev, put_ev = ti._put( + x, (indices,), rhs, axis, mode, sycl_queue=exec_q, depends=deps_ev + ) + _manager.add_event_pair(hev, put_ev) def extract(condition, arr): @@ -366,7 +375,11 @@ def place(arr, mask, vals): if arr.shape != mask.shape or vals.ndim != 1: raise ValueError("Array sizes are not as required") cumsum = dpt.empty(mask.size, dtype="i8", sycl_queue=exec_q) - nz_count = ti.mask_positions(mask, cumsum, sycl_queue=exec_q) + _manager = dpctl.utils.SequentialOrderManager + deps_ev = _manager.submitted_events + nz_count = ti.mask_positions( + mask, cumsum, sycl_queue=exec_q, depends=deps_ev + ) if nz_count == 0: return if vals.size == 0: @@ -375,7 +388,7 @@ def place(arr, mask, vals): rhs = vals else: rhs = dpt.astype(vals, arr.dtype) - hev, _ = ti._place( + hev, pl_ev = ti._place( dst=arr, cumsum=cumsum, axis_start=0, @@ -383,7 +396,7 @@ def place(arr, mask, vals): rhs=rhs, sycl_queue=exec_q, ) - hev.wait() + _manager.add_event_pair(hev, pl_ev) def nonzero(arr): diff --git a/dpctl/tensor/_linear_algebra_functions.py b/dpctl/tensor/_linear_algebra_functions.py index e4155ab3e1..ac0f04fb99 100644 --- a/dpctl/tensor/_linear_algebra_functions.py +++ b/dpctl/tensor/_linear_algebra_functions.py @@ -30,7 +30,7 @@ _find_buf_dtype2, _to_device_supported_dtype, ) -from dpctl.utils import ExecutionPlacementError +from dpctl.utils import ExecutionPlacementError, SequentialOrderManager def matrix_transpose(x): @@ -189,6 +189,7 @@ def tensordot(x1, x2, axes=2): "supported types according to the casting rule ''safe''." ) + _manager = SequentialOrderManager if buf1_dt is None and buf2_dt is None: out = dpt.empty( res_shape, @@ -197,7 +198,8 @@ def tensordot(x1, x2, axes=2): sycl_queue=exec_q, order="C", ) - ht_dot_ev, _ = tli._dot( + dep_evs = _manager.submitted_events + ht_dot_ev, dot_ev = tli._dot( x1=arr1, x2=arr2, batch_dims=0, @@ -206,16 +208,20 @@ def tensordot(x1, x2, axes=2): inner_dims=n_axes1, dst=out, sycl_queue=exec_q, + depends=dep_evs, ) - ht_dot_ev.wait() + _manager.add_event_pair(ht_dot_ev, dot_ev) return out elif buf1_dt is None: buf2 = _empty_like_orderK(arr2, buf2_dt) + + dep_evs = _manager.submitted_events ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=arr2, dst=buf2, sycl_queue=exec_q + src=arr2, dst=buf2, sycl_queue=exec_q, depends=dep_evs ) + _manager.add_event_pair(ht_copy_ev, copy_ev) out = dpt.empty( res_shape, dtype=res_dt, @@ -223,7 +229,7 @@ def tensordot(x1, x2, axes=2): sycl_queue=exec_q, order="C", ) - ht_dot_ev, _ = tli._dot( + ht_dot_ev, dot_ev = tli._dot( x1=arr1, x2=buf2, batch_dims=0, @@ -234,16 +240,17 @@ def tensordot(x1, x2, axes=2): sycl_queue=exec_q, depends=[copy_ev], ) - ht_copy_ev.wait() - ht_dot_ev.wait() + _manager.add_event_pair(ht_dot_ev, dot_ev) return out elif buf2_dt is None: buf1 = _empty_like_orderK(arr1, buf1_dt) + dep_evs = _manager.submitted_events ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=arr1, dst=buf1, sycl_queue=exec_q + src=arr1, dst=buf1, sycl_queue=exec_q, depends=dep_evs ) + _manager.add_event_pair(ht_copy_ev, copy_ev) out = dpt.empty( res_shape, dtype=res_dt, @@ -251,7 +258,7 @@ def tensordot(x1, x2, axes=2): sycl_queue=exec_q, order="C", ) - ht_dot_ev, _ = tli._dot( + ht_dot_ev, dot_ev = tli._dot( x1=buf1, x2=arr2, batch_dims=0, @@ -262,19 +269,21 @@ def tensordot(x1, x2, axes=2): sycl_queue=exec_q, depends=[copy_ev], ) - ht_copy_ev.wait() - ht_dot_ev.wait() + _manager.add_event_pair(ht_dot_ev, dot_ev) 
return out buf1 = _empty_like_orderK(arr1, buf1_dt) + deps_ev = _manager.submitted_events ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=arr1, dst=buf1, sycl_queue=exec_q + src=arr1, dst=buf1, sycl_queue=exec_q, depends=deps_ev ) + _manager.add_event_pair(ht_copy1_ev, copy1_ev) buf2 = _empty_like_orderK(arr2, buf2_dt) ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=arr2, dst=buf2, sycl_queue=exec_q + src=arr2, dst=buf2, sycl_queue=exec_q, depends=deps_ev ) + _manager.add_event_pair(ht_copy2_ev, copy2_ev) out = dpt.empty( res_shape, dtype=res_dt, @@ -282,7 +291,7 @@ def tensordot(x1, x2, axes=2): sycl_queue=exec_q, order="C", ) - ht_, _ = tli._dot( + ht_, dot_ev = tli._dot( x1=buf1, x2=buf2, batch_dims=0, @@ -293,7 +302,7 @@ def tensordot(x1, x2, axes=2): sycl_queue=exec_q, depends=[copy1_ev, copy2_ev], ) - dpctl.SyclEvent.wait_for([ht_copy1_ev, ht_copy2_ev, ht_]) + _manager.add_event_pair(ht_, dot_ev) return out @@ -399,18 +408,15 @@ def vecdot(x1, x2, axis=-1): "supported types according to the casting rule ''safe''." ) - ht_list = [] - deps = [] + _manager = SequentialOrderManager if buf1_dt is None and buf2_dt is None: if x1.dtype.kind == "c": x1_tmp = _empty_like_orderK(x1, x1.dtype) + dep_evs = _manager.submitted_events ht_conj_ev, conj_ev = tei._conj( - src=x1, - dst=x1_tmp, - sycl_queue=exec_q, + src=x1, dst=x1_tmp, sycl_queue=exec_q, depends=dep_evs ) - ht_list.append(ht_conj_ev) - deps.append(conj_ev) + _manager.add_event_pair(ht_conj_ev, conj_ev) x1 = x1_tmp if x1.shape != broadcast_sh: x1 = dpt.broadcast_to(x1, broadcast_sh) @@ -425,7 +431,8 @@ def vecdot(x1, x2, axis=-1): sycl_queue=exec_q, order="C", ) - ht_dot_ev, _ = tli._dot( + dep_evs = _manager.submitted_events + ht_dot_ev, dot_ev = tli._dot( x1=x1, x2=x2, batch_dims=len(res_sh), @@ -434,28 +441,26 @@ def vecdot(x1, x2, axis=-1): inner_dims=1, dst=out, sycl_queue=exec_q, - depends=deps, + depends=dep_evs, ) - ht_list.append(ht_dot_ev) - dpctl.SyclEvent.wait_for(ht_list) - + _manager.add_event_pair(ht_dot_ev, dot_ev) return dpt.reshape(out, res_sh) elif buf1_dt is None: if x1.dtype.kind == "c": x1_tmp = _empty_like_orderK(x1, x1.dtype) + deps_ev = _manager.submitted_events ht_conj_ev, conj_e = tei._conj( - src=x1, dst=x1_tmp, sycl_queue=exec_q + src=x1, dst=x1_tmp, sycl_queue=exec_q, depends=deps_ev ) - ht_list.append(ht_conj_ev) - deps.append(conj_e) + _manager.add_event_pair(ht_conj_ev, conj_e) x1 = x1_tmp buf2 = _empty_like_orderK(x2, buf2_dt) + deps_ev = _manager.submitted_events ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=x2, dst=buf2, sycl_queue=exec_q + src=x2, dst=buf2, sycl_queue=exec_q, depends=deps_ev ) - ht_list.append(ht_copy_ev) - deps.append(copy_ev) + _manager.add_event_pair(ht_copy_ev, copy_ev) if x1.shape != broadcast_sh: x1 = dpt.broadcast_to(x1, broadcast_sh) if buf2.shape != broadcast_sh: @@ -469,7 +474,7 @@ def vecdot(x1, x2, axis=-1): sycl_queue=exec_q, order="C", ) - ht_dot_ev, _ = tli._dot( + ht_dot_ev, dot_ev = tli._dot( x1=x1, x2=buf2, batch_dims=len(res_sh), @@ -478,26 +483,23 @@ def vecdot(x1, x2, axis=-1): inner_dims=1, dst=out, sycl_queue=exec_q, - depends=deps, + depends=[copy_ev], ) - ht_list.append(ht_dot_ev) - dpctl.SyclEvent.wait_for(ht_list) - + _manager.add_event_pair(ht_dot_ev, dot_ev) return dpt.reshape(out, res_sh) elif buf2_dt is None: buf1 = _empty_like_orderK(x1, buf1_dt) + deps_ev = _manager.submitted_events ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=x1, dst=buf1, sycl_queue=exec_q + 
src=x1, dst=buf1, sycl_queue=exec_q, depends=deps_ev ) - ht_list.append(ht_copy_ev) - deps.append(copy_ev) + _manager.add_event_pair(ht_copy_ev, copy_ev) if buf1.dtype.kind == "c": ht_conj_ev, conj_ev = tei._conj( src=buf1, dst=buf1, sycl_queue=exec_q, depends=[copy_ev] ) - ht_list.append(ht_conj_ev) - deps.append(conj_ev) + _manager.add_event_pair(ht_conj_ev, conj_ev) if buf1.shape != broadcast_sh: buf1 = dpt.broadcast_to(buf1, broadcast_sh) if x2.shape != broadcast_sh: @@ -511,7 +513,8 @@ def vecdot(x1, x2, axis=-1): sycl_queue=exec_q, order="C", ) - ht_dot_ev, _ = tli._dot( + deps_ev = _manager.submitted_events + ht_dot_ev, dot_ev = tli._dot( x1=buf1, x2=x2, batch_dims=len(res_sh), @@ -520,31 +523,27 @@ def vecdot(x1, x2, axis=-1): inner_dims=1, dst=out, sycl_queue=exec_q, - depends=deps, + depends=deps_ev, ) - ht_list.append(ht_dot_ev) - dpctl.SyclEvent.wait_for(ht_list) - + _manager.add_event_pair(ht_dot_ev, dot_ev) return dpt.reshape(out, res_sh) buf1 = _empty_like_orderK(x1, buf1_dt) + deps_ev = _manager.submitted_events ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=x1, dst=buf1, sycl_queue=exec_q + src=x1, dst=buf1, sycl_queue=exec_q, depends=deps_ev ) - ht_list.append(ht_copy1_ev) - deps.append(copy1_ev) + _manager.add_event_pair(ht_copy1_ev, copy1_ev) if buf1.dtype.kind == "c": ht_conj_ev, conj_ev = tei._conj( src=buf1, dst=buf1, sycl_queue=exec_q, depends=[copy1_ev] ) - ht_list.append(ht_conj_ev) - deps.append(conj_ev) + _manager.add_event_pair(ht_conj_ev, conj_ev) buf2 = _empty_like_orderK(x2, buf2_dt) ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=x2, dst=buf2, sycl_queue=exec_q + src=x2, dst=buf2, sycl_queue=exec_q, depends=deps_ev ) - ht_list.append(ht_copy2_ev) - deps.append(copy2_ev) + _manager.add_event_pair(ht_copy2_ev, copy2_ev) if buf1.shape != broadcast_sh: buf1 = dpt.broadcast_to(buf1, broadcast_sh) if buf2.shape != broadcast_sh: @@ -558,7 +557,8 @@ def vecdot(x1, x2, axis=-1): sycl_queue=exec_q, order="C", ) - ht_dot_ev, _ = tli._dot( + deps_ev = _manager.submitted_events + ht_dot_ev, dot_ev = tli._dot( x1=buf1, x2=buf2, batch_dims=len(res_sh), @@ -567,11 +567,9 @@ def vecdot(x1, x2, axis=-1): inner_dims=1, dst=out, sycl_queue=exec_q, - depends=deps, + depends=deps_ev, ) - ht_list.append(ht_dot_ev) - dpctl.SyclEvent.wait_for(ht_list) - + _manager.add_event_pair(ht_dot_ev, dot_ev) return out @@ -793,6 +791,7 @@ def matmul(x1, x2, out=None, dtype=None, order="K"): else "C" ) + _manager = SequentialOrderManager if buf1_dt is None and buf2_dt is None: if out is None: if order == "K": @@ -811,6 +810,7 @@ def matmul(x1, x2, out=None, dtype=None, order="K"): x1 = dpt.broadcast_to(x1, x1_broadcast_shape) if x2.shape != x2_broadcast_shape: x2 = dpt.broadcast_to(x2, x2_broadcast_shape) + deps_evs = _manager.submitted_events ht_dot_ev, dot_ev = tli._dot( x1=x1, x2=x2, @@ -820,18 +820,19 @@ def matmul(x1, x2, out=None, dtype=None, order="K"): inner_dims=1, dst=out, sycl_queue=exec_q, + depends=deps_evs, ) + _manager.add_event_pair(ht_dot_ev, dot_ev) if not (orig_out is None or orig_out is out): # Copy the out data from temporary buffer to original memory - ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=exec_q, depends=[dot_ev], ) - ht_copy_out_ev.wait() + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) out = orig_out - ht_dot_ev.wait() if appended_axes: out = dpt.squeeze(out, tuple(appended_axes)) return out @@ -840,9 
+841,11 @@ def matmul(x1, x2, out=None, dtype=None, order="K"): buf2 = _empty_like_orderK(x2, buf2_dt) else: buf2 = dpt.empty_like(x2, dtype=buf2_dt, order=order) + deps_evs = _manager.submitted_events ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=x2, dst=buf2, sycl_queue=exec_q + src=x2, dst=buf2, sycl_queue=exec_q, depends=deps_evs ) + _manager.add_event_pair(ht_copy_ev, copy_ev) if out is None: if order == "K": out = _empty_like_pair_orderK( @@ -872,18 +875,17 @@ def matmul(x1, x2, out=None, dtype=None, order="K"): sycl_queue=exec_q, depends=[copy_ev], ) + _manager.add_event_pair(ht_dot_ev, dot_ev) if not (orig_out is None or orig_out is out): # Copy the out data from temporary buffer to original memory - ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=exec_q, depends=[dot_ev], ) - ht_copy_out_ev.wait() + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) out = orig_out - ht_copy_ev.wait() - ht_dot_ev.wait() if appended_axes: out = dpt.squeeze(out, tuple(appended_axes)) return out @@ -893,9 +895,11 @@ def matmul(x1, x2, out=None, dtype=None, order="K"): buf1 = _empty_like_orderK(x1, buf1_dt) else: buf1 = dpt.empty_like(x1, dtype=buf1_dt, order=order) + deps_ev = _manager.submitted_events ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=x1, dst=buf1, sycl_queue=exec_q + src=x1, dst=buf1, sycl_queue=exec_q, depends=deps_ev ) + _manager.add_event_pair(ht_copy_ev, copy_ev) if out is None: if order == "K": out = _empty_like_pair_orderK( @@ -925,18 +929,17 @@ def matmul(x1, x2, out=None, dtype=None, order="K"): sycl_queue=exec_q, depends=[copy_ev], ) + _manager.add_event_pair(ht_dot_ev, dot_ev) if not (orig_out is None or orig_out is out): # Copy the out data from temporary buffer to original memory - ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=exec_q, depends=[dot_ev], ) - ht_copy_out_ev.wait() + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) out = orig_out - ht_copy_ev.wait() - ht_dot_ev.wait() if appended_axes: out = dpt.squeeze(out, tuple(appended_axes)) return out @@ -950,16 +953,19 @@ def matmul(x1, x2, out=None, dtype=None, order="K"): buf1 = _empty_like_orderK(x1, buf1_dt) else: buf1 = dpt.empty_like(x1, dtype=buf1_dt, order=order) + deps_ev = _manager.submitted_events ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=x1, dst=buf1, sycl_queue=exec_q + src=x1, dst=buf1, sycl_queue=exec_q, depends=deps_ev ) + _manager.add_event_pair(ht_copy1_ev, copy1_ev) if order == "K": buf2 = _empty_like_orderK(x2, buf2_dt) else: buf2 = dpt.empty_like(x2, dtype=buf2_dt, order=order) ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=x2, dst=buf2, sycl_queue=exec_q + src=x2, dst=buf2, sycl_queue=exec_q, depends=deps_ev ) + _manager.add_event_pair(ht_copy2_ev, copy2_ev) if out is None: if order == "K": out = _empty_like_pair_orderK( @@ -978,7 +984,7 @@ def matmul(x1, x2, out=None, dtype=None, order="K"): buf1 = dpt.broadcast_to(buf1, x1_broadcast_shape) if buf2.shape != x2_broadcast_shape: buf2 = dpt.broadcast_to(buf2, x2_broadcast_shape) - ht_, _ = tli._dot( + ht_, dot_ev = tli._dot( x1=buf1, x2=buf2, batch_dims=len(res_shape[:-2]), @@ -989,7 +995,7 @@ def matmul(x1, x2, out=None, dtype=None, order="K"): sycl_queue=exec_q, depends=[copy1_ev, copy2_ev], ) - dpctl.SyclEvent.wait_for([ht_copy1_ev, ht_copy2_ev, 
 ht_])
+    _manager.add_event_pair(ht_, dot_ev)
     if appended_axes:
         out = dpt.squeeze(out, tuple(appended_axes))
     return out
diff --git a/dpctl/tensor/_manipulation_functions.py b/dpctl/tensor/_manipulation_functions.py
index 50b7c25fed..9030780e45 100644
--- a/dpctl/tensor/_manipulation_functions.py
+++ b/dpctl/tensor/_manipulation_functions.py
@@ -343,15 +343,21 @@ def roll(X, /, shift, *, axis=None):
     """
     if not isinstance(X, dpt.usm_ndarray):
         raise TypeError(f"Expected usm_ndarray type, got {type(X)}.")
+    _manager = dputils.SequentialOrderManager
     if axis is None:
         shift = operator.index(shift)
+        dep_evs = _manager.submitted_events
         res = dpt.empty(
             X.shape, dtype=X.dtype, usm_type=X.usm_type, sycl_queue=X.sycl_queue
         )
-        hev, _ = ti._copy_usm_ndarray_for_roll_1d(
-            src=X, dst=res, shift=shift, sycl_queue=X.sycl_queue
+        hev, roll_ev = ti._copy_usm_ndarray_for_roll_1d(
+            src=X,
+            dst=res,
+            shift=shift,
+            sycl_queue=X.sycl_queue,
+            depends=dep_evs,
         )
-        hev.wait()
+        _manager.add_event_pair(hev, roll_ev)
         return res
     axis = normalize_axis_tuple(axis, X.ndim, allow_duplicate=True)
     broadcasted = np.broadcast(shift, axis)
@@ -367,10 +373,11 @@ def roll(X, /, shift, *, axis=None):
     res = dpt.empty(
         X.shape, dtype=X.dtype, usm_type=X.usm_type, sycl_queue=exec_q
     )
-    ht_e, _ = ti._copy_usm_ndarray_for_roll_nd(
-        src=X, dst=res, shifts=shifts, sycl_queue=exec_q
+    dep_evs = _manager.submitted_events
+    ht_e, roll_ev = ti._copy_usm_ndarray_for_roll_nd(
+        src=X, dst=res, shifts=shifts, sycl_queue=exec_q, depends=dep_evs
     )
-    ht_e.wait()
+    _manager.add_event_pair(ht_e, roll_ev)
     return res
 
 
@@ -439,31 +446,46 @@ def _concat_axis_None(arrays):
         res_shape, dtype=res_dtype, usm_type=res_usm_type, sycl_queue=exec_q
     )
 
-    hev_list = []
     fill_start = 0
+    _manager = dputils.SequentialOrderManager
+    deps = _manager.submitted_events
     for array in arrays:
         fill_end = fill_start + array.size
         if array.flags.c_contiguous:
-            hev, _ = ti._copy_usm_ndarray_into_usm_ndarray(
+            hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
                 src=dpt.reshape(array, -1),
                 dst=res[fill_start:fill_end],
                 sycl_queue=exec_q,
+                depends=deps,
             )
+            _manager.add_event_pair(hev, cpy_ev)
         else:
             src_ = array
             # _copy_usm_ndarray_for_reshape requires src and dst to have
             # the same data type
             if not array.dtype == res_dtype:
-                src_ = dpt.astype(src_, res_dtype)
-            hev, _ = ti._copy_usm_ndarray_for_reshape(
-                src=src_,
-                dst=res[fill_start:fill_end],
-                sycl_queue=exec_q,
-            )
+                src2_ = dpt.empty_like(src_, dtype=res_dtype)
+                ht_copy_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+                    src=src_, dst=src2_, sycl_queue=exec_q, depends=deps
+                )
+                _manager.add_event_pair(ht_copy_ev, cpy_ev)
+                hev, reshape_copy_ev = ti._copy_usm_ndarray_for_reshape(
+                    src=src2_,
+                    dst=res[fill_start:fill_end],
+                    sycl_queue=exec_q,
+                    depends=[cpy_ev],
+                )
+                _manager.add_event_pair(hev, reshape_copy_ev)
+            else:
+                hev, cpy_ev = ti._copy_usm_ndarray_for_reshape(
+                    src=src_,
+                    dst=res[fill_start:fill_end],
+                    sycl_queue=exec_q,
+                    depends=deps,
+                )
+                _manager.add_event_pair(hev, cpy_ev)
         fill_start = fill_end
-        hev_list.append(hev)
 
-    dpctl.SyclEvent.wait_for(hev_list)
     return res
 
 
@@ -516,7 +538,8 @@ def concat(arrays, /, *, axis=0):
         res_shape, dtype=res_dtype, usm_type=res_usm_type, sycl_queue=exec_q
     )
 
-    hev_list = []
+    _manager = dputils.SequentialOrderManager
+    deps = _manager.submitted_events
     fill_start = 0
     for i in range(n):
         fill_end = fill_start + arrays[i].shape[axis]
         c_shapes_copy = tuple(
             np.s_[fill_start:fill_end] if j == axis else np.s_[:]
             for j in range(X0.ndim)
         )
- hev, _ = ti._copy_usm_ndarray_into_usm_ndarray( - src=arrays[i], dst=res[c_shapes_copy], sycl_queue=exec_q + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arrays[i], + dst=res[c_shapes_copy], + sycl_queue=exec_q, + depends=deps, ) + _manager.add_event_pair(hev, cpy_ev) fill_start = fill_end - hev_list.append(hev) - - dpctl.SyclEvent.wait_for(hev_list) return res @@ -581,17 +605,17 @@ def stack(arrays, /, *, axis=0): res_shape, dtype=res_dtype, usm_type=res_usm_type, sycl_queue=exec_q ) - hev_list = [] + _manager = dputils.SequentialOrderManager + dep_evs = _manager.submitted_events for i in range(n): c_shapes_copy = tuple( i if j == axis else np.s_[:] for j in range(res_ndim) ) - hev, _ = ti._copy_usm_ndarray_into_usm_ndarray( - src=arrays[i], dst=res[c_shapes_copy], sycl_queue=exec_q + _dst = res[c_shapes_copy] + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arrays[i], dst=_dst, sycl_queue=exec_q, depends=dep_evs ) - hev_list.append(hev) - - dpctl.SyclEvent.wait_for(hev_list) + _manager.add_event_pair(hev, cpy_ev) return res @@ -838,6 +862,8 @@ def repeat(x, repeats, /, *, axis=None): f"got {type(repeats)}" ) + _manager = dputils.SequentialOrderManager + dep_evs = _manager.submitted_events if scalar: res_axis_size = repeats * axis_size if axis is not None: @@ -848,14 +874,15 @@ def repeat(x, repeats, /, *, axis=None): res_shape, dtype=x.dtype, usm_type=usm_type, sycl_queue=exec_q ) if res_axis_size > 0: - ht_rep_ev, _ = ti._repeat_by_scalar( + ht_rep_ev, rep_ev = ti._repeat_by_scalar( src=x, dst=res, reps=repeats, axis=axis, sycl_queue=exec_q, + depends=dep_evs, ) - ht_rep_ev.wait() + _manager.add_event_pair(ht_rep_ev, rep_ev) else: if repeats.dtype != dpt.int64: rep_buf = dpt.empty( @@ -865,8 +892,9 @@ def repeat(x, repeats, /, *, axis=None): sycl_queue=exec_q, ) ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=repeats, dst=rep_buf, sycl_queue=exec_q + src=repeats, dst=rep_buf, sycl_queue=exec_q, depends=dep_evs ) + _manager.add_event_pair(ht_copy_ev, copy_ev) cumsum = dpt.empty( (axis_size,), dtype=dpt.int64, @@ -890,7 +918,7 @@ def repeat(x, repeats, /, *, axis=None): sycl_queue=exec_q, ) if res_axis_size > 0: - ht_rep_ev, _ = ti._repeat_by_sequence( + ht_rep_ev, rep_ev = ti._repeat_by_sequence( src=x, dst=res, reps=rep_buf, @@ -898,8 +926,7 @@ def repeat(x, repeats, /, *, axis=None): axis=axis, sycl_queue=exec_q, ) - ht_rep_ev.wait() - ht_copy_ev.wait() + _manager.add_event_pair(ht_rep_ev, rep_ev) else: cumsum = dpt.empty( (axis_size,), @@ -907,7 +934,9 @@ def repeat(x, repeats, /, *, axis=None): usm_type=usm_type, sycl_queue=exec_q, ) - res_axis_size = ti._cumsum_1d(repeats, cumsum, sycl_queue=exec_q) + res_axis_size = ti._cumsum_1d( + repeats, cumsum, sycl_queue=exec_q, depends=dep_evs + ) if axis is not None: res_shape = ( x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :] @@ -921,7 +950,7 @@ def repeat(x, repeats, /, *, axis=None): sycl_queue=exec_q, ) if res_axis_size > 0: - ht_rep_ev, _ = ti._repeat_by_sequence( + ht_rep_ev, rep_ev = ti._repeat_by_sequence( src=x, dst=res, reps=repeats, @@ -929,7 +958,7 @@ def repeat(x, repeats, /, *, axis=None): axis=axis, sycl_queue=exec_q, ) - ht_rep_ev.wait() + _manager.add_event_pair(ht_rep_ev, rep_ev) return res @@ -1021,8 +1050,10 @@ def tile(x, repetitions, /): broadcast_sh, ) # copy broadcast input into flat array - hev, _ = ti._copy_usm_ndarray_for_reshape( - src=x, dst=res, sycl_queue=exec_q + _manager = dputils.SequentialOrderManager + dep_evs = _manager.submitted_events + hev, cp_ev 
= ti._copy_usm_ndarray_for_reshape( + src=x, dst=res, sycl_queue=exec_q, depends=dep_evs ) - hev.wait() + _manager.add_event_pair(hev, cp_ev) return dpt.reshape(res, res_shape) diff --git a/dpctl/tensor/_print.py b/dpctl/tensor/_print.py index 450f46c0f1..b371eebb6f 100644 --- a/dpctl/tensor/_print.py +++ b/dpctl/tensor/_print.py @@ -23,6 +23,7 @@ import dpctl import dpctl.tensor as dpt import dpctl.tensor._tensor_impl as ti +import dpctl.utils __doc__ = "Print functions for :class:`dpctl.tensor.usm_ndarray`." @@ -263,10 +264,16 @@ def _nd_corners(arr_in, edge_items): else: blocks.append((np.s_[:],)) + _manager = dpctl.utils.SequentialOrderManager + dep_evs = _manager.submitted_events hev_list = [] + exec_q = arr_in.sycl_queue for slc in itertools.product(*blocks): hev, _ = ti._copy_usm_ndarray_into_usm_ndarray( - src=arr_in[slc], dst=arr_out[slc], sycl_queue=arr_in.sycl_queue + src=arr_in[slc], + dst=arr_out[slc], + sycl_queue=exec_q, + depends=dep_evs, ) hev_list.append(hev) diff --git a/dpctl/tensor/_reduction.py b/dpctl/tensor/_reduction.py index b918152467..b8bc1a20f2 100644 --- a/dpctl/tensor/_reduction.py +++ b/dpctl/tensor/_reduction.py @@ -20,7 +20,7 @@ import dpctl.tensor as dpt import dpctl.tensor._tensor_impl as ti import dpctl.tensor._tensor_reductions_impl as tri -from dpctl.utils import ExecutionPlacementError +from dpctl.utils import ExecutionPlacementError, SequentialOrderManager from ._type_utils import ( _default_accumulation_dtype, @@ -108,31 +108,35 @@ def _reduction_over_axis( res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q ) - host_tasks_list = [] + _manager = SequentialOrderManager + dep_evs = _manager.submitted_events if red_nd == 0: ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( - src=arr, dst=out, sycl_queue=q + src=arr, dst=out, sycl_queue=q, depends=dep_evs ) - host_tasks_list.append(ht_e_cpy) + _manager.add_event_pair(ht_e_cpy, cpy_e) if not (orig_out is None or orig_out is out): - ht_e_cpy2, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=q, depends=[cpy_e] ) - host_tasks_list.append(ht_e_cpy2) + _manager.add_event_pair(ht_e_cpy2, cpy2_e) out = orig_out - dpctl.SyclEvent.wait_for(host_tasks_list) return out if implemented_types: ht_e, red_e = _reduction_fn( - src=arr, trailing_dims_to_reduce=red_nd, dst=out, sycl_queue=q + src=arr, + trailing_dims_to_reduce=red_nd, + dst=out, + sycl_queue=q, + depends=dep_evs, ) - host_tasks_list.append(ht_e) + _manager.add_event_pair(ht_e, red_e) if not (orig_out is None or orig_out is out): - ht_e_cpy, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=q, depends=[red_e] ) - host_tasks_list.append(ht_e_cpy) + _manager.add_event_pair(ht_e_cpy, cpy_e) out = orig_out else: if _dtype_supported(res_dt, res_dt, res_usm_type, q): @@ -140,29 +144,29 @@ def _reduction_over_axis( arr.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q ) ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( - src=arr, dst=tmp, sycl_queue=q + src=arr, dst=tmp, sycl_queue=q, depends=dep_evs ) - host_tasks_list.append(ht_e_cpy) - ht_e_red, _ = _reduction_fn( + _manager.add_event_pair(ht_e_cpy, cpy_e) + ht_e_red, red_ev = _reduction_fn( src=tmp, trailing_dims_to_reduce=red_nd, dst=out, sycl_queue=q, depends=[cpy_e], ) - host_tasks_list.append(ht_e_red) + _manager.add_event_pair(ht_e_red, red_ev) else: buf_dt = _default_reduction_type_fn(inp_dt, q) tmp = 
dpt.empty( arr.shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q ) ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( - src=arr, dst=tmp, sycl_queue=q + src=arr, dst=tmp, sycl_queue=q, depends=dep_evs ) + _manager.add_event_pair(ht_e_cpy, cpy_e) tmp_res = dpt.empty( res_shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q ) - host_tasks_list.append(ht_e_cpy) ht_e_red, r_e = _reduction_fn( src=tmp, trailing_dims_to_reduce=red_nd, @@ -170,18 +174,16 @@ def _reduction_over_axis( sycl_queue=q, depends=[cpy_e], ) - host_tasks_list.append(ht_e_red) - ht_e_cpy2, _ = ti._copy_usm_ndarray_into_usm_ndarray( + _manager.add_event_pair(ht_e_red, r_e) + ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray( src=tmp_res, dst=out, sycl_queue=q, depends=[r_e] ) - host_tasks_list.append(ht_e_cpy2) + _manager.add_event_pair(ht_e_cpy2, cpy2_e) if keepdims: res_shape = res_shape + (1,) * red_nd inv_perm = sorted(range(nd), key=lambda d: perm[d]) out = dpt.permute_dims(dpt.reshape(out, res_shape), inv_perm) - dpctl.SyclEvent.wait_for(host_tasks_list) - return out @@ -498,19 +500,19 @@ def _comparison_over_axis(x, axis, keepdims, out, _reduction_fn): res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=exec_q ) - host_tasks_list = [] + _manager = SequentialOrderManager + dep_evs = _manager.submitted_events if red_nd == 0: ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( - src=x_tmp, dst=out, sycl_queue=exec_q + src=x_tmp, dst=out, sycl_queue=exec_q, depends=dep_evs ) - host_tasks_list.append(ht_e_cpy) + _manager.add_event_pair(ht_e_cpy, cpy_e) if not (orig_out is None or orig_out is out): - ht_e_cpy2, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=exec_q, depends=[cpy_e] ) - host_tasks_list.append(ht_e_cpy2) + _manager.add_event_pair(ht_e_cpy2, cpy2_e) out = orig_out - dpctl.SyclEvent.wait_for(host_tasks_list) return out hev, red_ev = _reduction_fn( @@ -518,20 +520,20 @@ def _comparison_over_axis(x, axis, keepdims, out, _reduction_fn): trailing_dims_to_reduce=red_nd, dst=out, sycl_queue=exec_q, + depends=dep_evs, ) - host_tasks_list.append(hev) + _manager.add_event_pair(hev, red_ev) if not (orig_out is None or orig_out is out): - ht_e_cpy2, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=exec_q, depends=[red_ev] ) - host_tasks_list.append(ht_e_cpy2) + _manager.add_event_pair(ht_e_cpy2, cpy2_e) out = orig_out if keepdims: res_shape = res_shape + (1,) * red_nd inv_perm = sorted(range(nd), key=lambda d: perm[d]) out = dpt.permute_dims(dpt.reshape(out, res_shape), inv_perm) - dpctl.SyclEvent.wait_for(host_tasks_list) return out @@ -667,33 +669,34 @@ def _search_over_axis(x, axis, keepdims, out, _reduction_fn): res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=exec_q ) + _manager = SequentialOrderManager + dep_evs = _manager.submitted_events if red_nd == 0: - ht_e_fill, _ = ti._full_usm_ndarray( - fill_value=0, dst=out, sycl_queue=exec_q + ht_e_fill, fill_ev = ti._full_usm_ndarray( + fill_value=0, dst=out, sycl_queue=exec_q, depends=dep_evs ) - ht_e_fill.wait() + _manager.add_event_pair(ht_e_fill, fill_ev) return out - host_tasks_list = [] hev, red_ev = _reduction_fn( src=x_tmp, trailing_dims_to_reduce=red_nd, dst=out, sycl_queue=exec_q, + depends=dep_evs, ) - host_tasks_list.append(hev) + _manager.add_event_pair(hev, red_ev) if not (orig_out is None or orig_out is out): - ht_e_cpy2, _ = 
ti._copy_usm_ndarray_into_usm_ndarray( + ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=exec_q, depends=[red_ev] ) - host_tasks_list.append(ht_e_cpy2) + _manager.add_event_pair(ht_e_cpy2, cpy2_e) out = orig_out if keepdims: res_shape = res_shape + (1,) * red_nd inv_perm = sorted(range(nd), key=lambda d: perm[d]) out = dpt.permute_dims(dpt.reshape(out, res_shape), inv_perm) - dpctl.SyclEvent.wait_for(host_tasks_list) return out diff --git a/dpctl/tensor/_reshape.py b/dpctl/tensor/_reshape.py index 575c79c115..6a669ae255 100644 --- a/dpctl/tensor/_reshape.py +++ b/dpctl/tensor/_reshape.py @@ -18,6 +18,7 @@ import numpy as np import dpctl.tensor as dpt +import dpctl.utils from dpctl.tensor._tensor_impl import ( _copy_usm_ndarray_for_reshape, _ravel_multi_index, @@ -157,22 +158,25 @@ def reshape(X, /, shape, *, order="C", copy=None): copy_q = X.sycl_queue if copy_required or (copy is True): # must perform a copy + copy_q = X.sycl_queue flat_res = dpt.usm_ndarray( (X.size,), dtype=X.dtype, buffer=X.usm_type, buffer_ctor_kwargs={"queue": copy_q}, ) + _manager = dpctl.utils.SequentialOrderManager + dep_evs = _manager.submitted_events if order == "C": - hev, _ = _copy_usm_ndarray_for_reshape( - src=X, dst=flat_res, sycl_queue=copy_q + hev, r_e = _copy_usm_ndarray_for_reshape( + src=X, dst=flat_res, sycl_queue=copy_q, depends=dep_evs ) else: X_t = dpt.permute_dims(X, range(X.ndim - 1, -1, -1)) - hev, _ = _copy_usm_ndarray_for_reshape( - src=X_t, dst=flat_res, sycl_queue=copy_q + hev, r_e = _copy_usm_ndarray_for_reshape( + src=X_t, dst=flat_res, sycl_queue=copy_q, depends=dep_evs ) - hev.wait() + _manager.add_event_pair(hev, r_e) return dpt.usm_ndarray( tuple(shape), dtype=X.dtype, buffer=flat_res, order=order ) diff --git a/dpctl/tensor/_search_functions.py b/dpctl/tensor/_search_functions.py index 94982d0c82..693f25c118 100644 --- a/dpctl/tensor/_search_functions.py +++ b/dpctl/tensor/_search_functions.py @@ -18,7 +18,7 @@ import dpctl.tensor as dpt import dpctl.tensor._tensor_impl as ti from dpctl.tensor._manipulation_functions import _broadcast_shapes -from dpctl.utils import ExecutionPlacementError +from dpctl.utils import ExecutionPlacementError, SequentialOrderManager from ._copy_utils import _empty_like_orderK, _empty_like_triple_orderK from ._type_utils import _all_data_types, _can_cast @@ -198,19 +198,18 @@ def where(condition, x1, x2, /, *, order="K", out=None): sycl_queue=exec_q, ) - deps = [] - wait_list = [] + _manager = SequentialOrderManager + dep_evs = _manager.submitted_events if x1_dtype != out_dtype: if order == "K": _x1 = _empty_like_orderK(x1, out_dtype) else: _x1 = dpt.empty_like(x1, dtype=out_dtype, order=order) ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=x1, dst=_x1, sycl_queue=exec_q + src=x1, dst=_x1, sycl_queue=exec_q, depends=dep_evs ) x1 = _x1 - deps.append(copy1_ev) - wait_list.append(ht_copy1_ev) + _manager.add_event_pair(ht_copy1_ev, copy1_ev) if x2_dtype != out_dtype: if order == "K": @@ -218,11 +217,10 @@ def where(condition, x1, x2, /, *, order="K", out=None): else: _x2 = dpt.empty_like(x2, dtype=out_dtype, order=order) ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=x2, dst=_x2, sycl_queue=exec_q + src=x2, dst=_x2, sycl_queue=exec_q, depends=dep_evs ) x2 = _x2 - deps.append(copy2_ev) - wait_list.append(ht_copy2_ev) + _manager.add_event_pair(ht_copy2_ev, copy2_ev) if out is None: if order == "K": @@ -242,25 +240,25 @@ def where(condition, x1, x2, /, *, order="K", out=None): 
x1 = dpt.broadcast_to(x1, res_shape) x2 = dpt.broadcast_to(x2, res_shape) + dep_evs = _manager.submitted_events hev, where_ev = ti._where( condition=condition, x1=x1, x2=x2, dst=out, sycl_queue=exec_q, - depends=deps, + depends=dep_evs, ) + _manager.add_event_pair(hev, where_ev) if not (orig_out is None or orig_out is out): # Copy the out data from temporary buffer to original memory - ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=out, dst=orig_out, sycl_queue=exec_q, depends=[where_ev], ) - ht_copy_out_ev.wait() + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) out = orig_out - dpctl.SyclEvent.wait_for(wait_list) - hev.wait() return out diff --git a/dpctl/tensor/_searchsorted.py b/dpctl/tensor/_searchsorted.py index d9408e072e..15a30bf127 100644 --- a/dpctl/tensor/_searchsorted.py +++ b/dpctl/tensor/_searchsorted.py @@ -87,7 +87,8 @@ def searchsorted( x1_dt = x1.dtype x2_dt = x2.dtype - host_evs = [] + _manager = du.SequentialOrderManager + dep_evs = _manager.submitted_events ev = dpctl.SyclEvent() if sorter is not None: if not isdtype(sorter.dtype, "integral"): @@ -110,38 +111,28 @@ def searchsorted( axis, wrap_out_of_bound_indices_mode, sycl_queue=q, - depends=[ - ev, - ], + depends=dep_evs, ) x1 = res - host_evs.append(ht_ev) + _manager.add_event_pair(ht_ev, ev) if x1_dt != x2_dt: dt = result_type(x1, x2) if x1_dt != dt: x1_buf = _empty_like_orderK(x1, dt) + dep_evs = _manager.submitted_events ht_ev, ev = ti_copy( - src=x1, - dst=x1_buf, - sycl_queue=q, - depends=[ - ev, - ], + src=x1, dst=x1_buf, sycl_queue=q, depends=dep_evs ) - host_evs.append(ht_ev) + _manager.add_event_pair(ht_ev, ev) x1 = x1_buf if x2_dt != dt: x2_buf = _empty_like_orderK(x2, dt) + dep_evs = _manager.submitted_events ht_ev, ev = ti_copy( - src=x2, - dst=x2_buf, - sycl_queue=q, - depends=[ - ev, - ], + src=x2, dst=x2_buf, sycl_queue=q, depends=dep_evs ) - host_evs.append(ht_ev) + _manager.add_event_pair(ht_ev, ev) x2 = x2_buf dst_usm_type = du.get_coerced_usm_type([x1.usm_type, x2.usm_type]) @@ -149,28 +140,18 @@ def searchsorted( dst = _empty_like_orderK(x2, index_dt, usm_type=dst_usm_type) + dep_evs = _manager.submitted_events if side == "left": - ht_ev, _ = _searchsorted_left( + ht_ev, s_ev = _searchsorted_left( hay=x1, needles=x2, positions=dst, sycl_queue=q, - depends=[ - ev, - ], + depends=dep_evs, ) else: - ht_ev, _ = _searchsorted_right( - hay=x1, - needles=x2, - positions=dst, - sycl_queue=q, - depends=[ - ev, - ], + ht_ev, s_ev = _searchsorted_right( + hay=x1, needles=x2, positions=dst, sycl_queue=q, depends=dep_evs ) - - host_evs.append(ht_ev) - dpctl.SyclEvent.wait_for(host_evs) - + _manager.add_event_pair(ht_ev, s_ev) return dst diff --git a/dpctl/tensor/_set_functions.py b/dpctl/tensor/_set_functions.py index 81023a0827..5056f38aec 100644 --- a/dpctl/tensor/_set_functions.py +++ b/dpctl/tensor/_set_functions.py @@ -16,8 +16,8 @@ from typing import NamedTuple -import dpctl import dpctl.tensor as dpt +import dpctl.utils as du from ._tensor_elementwise_impl import _not_equal, _subtract from ._tensor_impl import ( @@ -87,18 +87,23 @@ def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray: if fx.size == 0: return fx s = dpt.empty_like(fx, order="C") - host_tasks = [] + _manager = du.SequentialOrderManager + dep_evs = _manager.submitted_events if fx.flags.c_contiguous: ht_ev, sort_ev = _sort_ascending( - src=fx, trailing_dims_to_sort=1, dst=s, sycl_queue=exec_q + src=fx, + trailing_dims_to_sort=1, + dst=s, + 
sycl_queue=exec_q, + depends=dep_evs, ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, sort_ev) else: tmp = dpt.empty_like(fx, order="C") ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( - src=fx, dst=tmp, sycl_queue=exec_q + src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, copy_ev) ht_ev, sort_ev = _sort_ascending( src=tmp, trailing_dims_to_sort=1, @@ -106,7 +111,7 @@ def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray: sycl_queue=exec_q, depends=[copy_ev], ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, sort_ev) unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) ht_ev, uneq_ev = _not_equal( src1=s[:-1], @@ -115,23 +120,23 @@ def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray: sycl_queue=exec_q, depends=[sort_ev], ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, uneq_ev) + # writing into new allocation, no dependencies ht_ev, one_ev = _full_usm_ndarray( fill_value=True, dst=unique_mask[0], sycl_queue=exec_q ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, one_ev) cumsum = dpt.empty(s.shape, dtype=dpt.int64, sycl_queue=exec_q) # synchronizing call n_uniques = mask_positions( unique_mask, cumsum, sycl_queue=exec_q, depends=[one_ev, uneq_ev] ) if n_uniques == fx.size: - dpctl.SyclEvent.wait_for(host_tasks) return s unique_vals = dpt.empty( n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q ) - ht_ev, _ = _extract( + ht_ev, ex_e = _extract( src=s, cumsum=cumsum, axis_start=0, @@ -139,8 +144,7 @@ def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray: dst=unique_vals, sycl_queue=exec_q, ) - host_tasks.append(ht_ev) - dpctl.SyclEvent.wait_for(host_tasks) + _manager.add_event_pair(ht_ev, ex_e) return unique_vals @@ -178,18 +182,24 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult: if fx.size == 0: return UniqueCountsResult(fx, dpt.empty_like(fx, dtype=ind_dt)) s = dpt.empty_like(fx, order="C") - host_tasks = [] + + _manager = du.SequentialOrderManager + dep_evs = _manager.submitted_events if fx.flags.c_contiguous: ht_ev, sort_ev = _sort_ascending( - src=fx, trailing_dims_to_sort=1, dst=s, sycl_queue=exec_q + src=fx, + trailing_dims_to_sort=1, + dst=s, + sycl_queue=exec_q, + depends=dep_evs, ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, sort_ev) else: tmp = dpt.empty_like(fx, order="C") ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( - src=fx, dst=tmp, sycl_queue=exec_q + src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, copy_ev) ht_ev, sort_ev = _sort_ascending( src=tmp, dst=s, @@ -197,7 +207,7 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult: sycl_queue=exec_q, depends=[copy_ev], ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, sort_ev) unique_mask = dpt.empty(s.shape, dtype="?", sycl_queue=exec_q) ht_ev, uneq_ev = _not_equal( src1=s[:-1], @@ -206,18 +216,18 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult: sycl_queue=exec_q, depends=[sort_ev], ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, uneq_ev) + # no dependency, since we write into new allocation ht_ev, one_ev = _full_usm_ndarray( fill_value=True, dst=unique_mask[0], sycl_queue=exec_q ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, one_ev) cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q) # synchronizing call n_uniques = mask_positions( unique_mask, cumsum, sycl_queue=exec_q, 
depends=[one_ev, uneq_ev] ) if n_uniques == fx.size: - dpctl.SyclEvent.wait_for(host_tasks) return UniqueCountsResult( s, dpt.ones( @@ -228,7 +238,7 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult: n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q ) # populate unique values - ht_ev, _ = _extract( + ht_ev, ex_e = _extract( src=s, cumsum=cumsum, axis_start=0, @@ -236,13 +246,14 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult: dst=unique_vals, sycl_queue=exec_q, ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, ex_e) unique_counts = dpt.empty( n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q ) idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) + # writing into new allocation, no dependency ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, id_ev) ht_ev, extr_ev = _extract( src=idx, cumsum=cumsum, @@ -252,21 +263,21 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult: sycl_queue=exec_q, depends=[id_ev], ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, extr_ev) + # no dependency, writing into disjoint segmenent of new allocation ht_ev, set_ev = _full_usm_ndarray( x.size, dst=unique_counts[-1], sycl_queue=exec_q ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, set_ev) _counts = dpt.empty_like(unique_counts[1:]) - ht_ev, _ = _subtract( + ht_ev, sub_ev = _subtract( src1=unique_counts[1:], src2=unique_counts[:-1], dst=_counts, sycl_queue=exec_q, depends=[set_ev, extr_ev], ) - host_tasks.append(ht_ev) - dpctl.SyclEvent.wait_for(host_tasks) + _manager.add_event_pair(ht_ev, sub_ev) return UniqueCountsResult(unique_vals, _counts) @@ -305,18 +316,24 @@ def unique_inverse(x): unsorting_ids = dpt.empty_like(sorting_ids, dtype=ind_dt, order="C") if fx.size == 0: return UniqueInverseResult(fx, dpt.reshape(unsorting_ids, x.shape)) - host_tasks = [] + + _manager = du.SequentialOrderManager + dep_evs = _manager.submitted_events if fx.flags.c_contiguous: ht_ev, sort_ev = _argsort_ascending( - src=fx, trailing_dims_to_sort=1, dst=sorting_ids, sycl_queue=exec_q + src=fx, + trailing_dims_to_sort=1, + dst=sorting_ids, + sycl_queue=exec_q, + depends=dep_evs, ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, sort_ev) else: tmp = dpt.empty_like(fx, order="C") ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( - src=fx, dst=tmp, sycl_queue=exec_q + src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, copy_ev) ht_ev, sort_ev = _argsort_ascending( src=tmp, trailing_dims_to_sort=1, @@ -324,15 +341,15 @@ def unique_inverse(x): sycl_queue=exec_q, depends=[copy_ev], ) - host_tasks.append(ht_ev) - ht_ev, _ = _argsort_ascending( + _manager.add_event_pair(ht_ev, sort_ev) + ht_ev, argsort_ev = _argsort_ascending( src=sorting_ids, trailing_dims_to_sort=1, dst=unsorting_ids, sycl_queue=exec_q, depends=[sort_ev], ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, argsort_ev) s = dpt.empty_like(fx) # s = fx[sorting_ids] ht_ev, take_ev = _take( @@ -344,7 +361,7 @@ def unique_inverse(x): sycl_queue=exec_q, depends=[sort_ev], ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, take_ev) unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) ht_ev, uneq_ev = _not_equal( src1=s[:-1], @@ -353,18 +370,18 @@ def unique_inverse(x): sycl_queue=exec_q, depends=[take_ev], ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, uneq_ev) + # 
no dependency ht_ev, one_ev = _full_usm_ndarray( fill_value=True, dst=unique_mask[0], sycl_queue=exec_q ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, one_ev) cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q) # synchronizing call n_uniques = mask_positions( unique_mask, cumsum, sycl_queue=exec_q, depends=[uneq_ev, one_ev] ) if n_uniques == fx.size: - dpctl.SyclEvent.wait_for(host_tasks) return UniqueInverseResult(s, dpt.reshape(unsorting_ids, x.shape)) unique_vals = dpt.empty( n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q @@ -377,13 +394,13 @@ def unique_inverse(x): dst=unique_vals, sycl_queue=exec_q, ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, uv_ev) cum_unique_counts = dpt.empty( n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q ) idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, id_ev) ht_ev, extr_ev = _extract( src=idx, cumsum=cumsum, @@ -393,24 +410,24 @@ def unique_inverse(x): sycl_queue=exec_q, depends=[id_ev], ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, extr_ev) ht_ev, set_ev = _full_usm_ndarray( x.size, dst=cum_unique_counts[-1], sycl_queue=exec_q ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, set_ev) _counts = dpt.empty_like(cum_unique_counts[1:]) - ht_ev, _ = _subtract( + ht_ev, sub_ev = _subtract( src1=cum_unique_counts[1:], src2=cum_unique_counts[:-1], dst=_counts, sycl_queue=exec_q, depends=[set_ev, extr_ev], ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, sub_ev) inv_dt = dpt.int64 if x.size > dpt.iinfo(dpt.int32).max else dpt.int32 inv = dpt.empty_like(x, dtype=inv_dt, order="C") - ht_ev, _ = _searchsorted_left( + ht_ev, ssl_ev = _searchsorted_left( hay=unique_vals, needles=x, positions=inv, @@ -419,9 +436,8 @@ def unique_inverse(x): uv_ev, ], ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, ssl_ev) - dpctl.SyclEvent.wait_for(host_tasks) return UniqueInverseResult(unique_vals, inv) @@ -477,18 +493,23 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: dpt.reshape(unsorting_ids, x.shape), dpt.empty_like(fx, dtype=ind_dt), ) - host_tasks = [] + _manager = du.SequentialOrderManager + dep_evs = _manager.submitted_events if fx.flags.c_contiguous: ht_ev, sort_ev = _argsort_ascending( - src=fx, trailing_dims_to_sort=1, dst=sorting_ids, sycl_queue=exec_q + src=fx, + trailing_dims_to_sort=1, + dst=sorting_ids, + sycl_queue=exec_q, + depends=dep_evs, ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, sort_ev) else: tmp = dpt.empty_like(fx, order="C") ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( - src=fx, dst=tmp, sycl_queue=exec_q + src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, copy_ev) ht_ev, sort_ev = _argsort_ascending( src=tmp, trailing_dims_to_sort=1, @@ -496,15 +517,15 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: sycl_queue=exec_q, depends=[copy_ev], ) - host_tasks.append(ht_ev) - ht_ev, _ = _argsort_ascending( + _manager.add_event_pair(ht_ev, sort_ev) + ht_ev, args_ev = _argsort_ascending( src=sorting_ids, trailing_dims_to_sort=1, dst=unsorting_ids, sycl_queue=exec_q, depends=[sort_ev], ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, args_ev) s = dpt.empty_like(fx) # s = fx[sorting_ids] ht_ev, take_ev = _take( @@ -516,7 +537,7 @@ def unique_all(x: dpt.usm_ndarray) -> 
UniqueAllResult: sycl_queue=exec_q, depends=[sort_ev], ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, take_ev) unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) ht_ev, uneq_ev = _not_equal( src1=s[:-1], @@ -525,18 +546,17 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: sycl_queue=exec_q, depends=[take_ev], ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, uneq_ev) ht_ev, one_ev = _full_usm_ndarray( fill_value=True, dst=unique_mask[0], sycl_queue=exec_q ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, one_ev) cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q) # synchronizing call n_uniques = mask_positions( unique_mask, cumsum, sycl_queue=exec_q, depends=[uneq_ev, one_ev] ) if n_uniques == fx.size: - dpctl.SyclEvent.wait_for(host_tasks) _counts = dpt.ones( n_uniques, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q ) @@ -557,13 +577,13 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: dst=unique_vals, sycl_queue=exec_q, ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, uv_ev) cum_unique_counts = dpt.empty( n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q ) idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, id_ev) ht_ev, extr_ev = _extract( src=idx, cumsum=cumsum, @@ -573,11 +593,11 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: sycl_queue=exec_q, depends=[id_ev], ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, extr_ev) ht_ev, set_ev = _full_usm_ndarray( x.size, dst=cum_unique_counts[-1], sycl_queue=exec_q ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, set_ev) _counts = dpt.empty_like(cum_unique_counts[1:]) ht_ev, sub_ev = _subtract( src1=cum_unique_counts[1:], @@ -586,11 +606,11 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: sycl_queue=exec_q, depends=[set_ev, extr_ev], ) - host_tasks.append(ht_ev) + _manager.add_event_pair(ht_ev, sub_ev) inv_dt = dpt.int64 if x.size > dpt.iinfo(dpt.int32).max else dpt.int32 inv = dpt.empty_like(x, dtype=inv_dt, order="C") - ht_ev, _ = _searchsorted_left( + ht_ev, ssl_ev = _searchsorted_left( hay=unique_vals, needles=x, positions=inv, @@ -599,9 +619,7 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: uv_ev, ], ) - host_tasks.append(ht_ev) - - dpctl.SyclEvent.wait_for(host_tasks) + _manager.add_event_pair(ht_ev, ssl_ev) return UniqueAllResult( unique_vals, sorting_ids[cum_unique_counts[:-1]], diff --git a/dpctl/tensor/_sorting.py b/dpctl/tensor/_sorting.py index 3d34ebf0a5..48e79c90c6 100644 --- a/dpctl/tensor/_sorting.py +++ b/dpctl/tensor/_sorting.py @@ -16,9 +16,9 @@ from numpy.core.numeric import normalize_axis_index -import dpctl import dpctl.tensor as dpt import dpctl.tensor._tensor_impl as ti +import dpctl.utils as du from ._tensor_sorting_impl import ( _argsort_ascending, @@ -76,33 +76,37 @@ def sort(x, /, *, axis=-1, descending=False, stable=True): ] arr = dpt.permute_dims(x, perm) exec_q = x.sycl_queue - host_tasks_list = [] + _manager = du.SequentialOrderManager + dep_evs = _manager.submitted_events impl_fn = _sort_descending if descending else _sort_ascending if arr.flags.c_contiguous: res = dpt.empty_like(arr, order="C") - ht_ev, _ = impl_fn( - src=arr, trailing_dims_to_sort=1, dst=res, sycl_queue=exec_q + ht_ev, impl_ev = impl_fn( + src=arr, + trailing_dims_to_sort=1, + dst=res, + sycl_queue=exec_q, + depends=dep_evs, ) - 
host_tasks_list.append(ht_ev) + _manager.add_event_pair(ht_ev, impl_ev) else: tmp = dpt.empty_like(arr, order="C") ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=arr, dst=tmp, sycl_queue=exec_q + src=arr, dst=tmp, sycl_queue=exec_q, depends=dep_evs ) - host_tasks_list.append(ht_ev) + _manager.add_event_pair(ht_ev, copy_ev) res = dpt.empty_like(arr, order="C") - ht_ev, _ = impl_fn( + ht_ev, impl_ev = impl_fn( src=tmp, trailing_dims_to_sort=1, dst=res, sycl_queue=exec_q, depends=[copy_ev], ) - host_tasks_list.append(ht_ev) + _manager.add_event_pair(ht_ev, impl_ev) if a1 != nd: inv_perm = sorted(range(nd), key=lambda d: perm[d]) res = dpt.permute_dims(res, inv_perm) - dpctl.SyclEvent.wait_for(host_tasks_list) return res @@ -155,32 +159,36 @@ def argsort(x, axis=-1, descending=False, stable=True): ] arr = dpt.permute_dims(x, perm) exec_q = x.sycl_queue - host_tasks_list = [] + _manager = du.SequentialOrderManager + dep_evs = _manager.submitted_events impl_fn = _argsort_descending if descending else _argsort_ascending index_dt = ti.default_device_index_type(exec_q) if arr.flags.c_contiguous: res = dpt.empty_like(arr, dtype=index_dt, order="C") - ht_ev, _ = impl_fn( - src=arr, trailing_dims_to_sort=1, dst=res, sycl_queue=exec_q + ht_ev, impl_ev = impl_fn( + src=arr, + trailing_dims_to_sort=1, + dst=res, + sycl_queue=exec_q, + depends=dep_evs, ) - host_tasks_list.append(ht_ev) + _manager.add_event_pair(ht_ev, impl_ev) else: tmp = dpt.empty_like(arr, order="C") ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=arr, dst=tmp, sycl_queue=exec_q + src=arr, dst=tmp, sycl_queue=exec_q, depends=dep_evs ) - host_tasks_list.append(ht_ev) + _manager.add_event_pair(ht_ev, copy_ev) res = dpt.empty_like(arr, dtype=index_dt, order="C") - ht_ev, _ = impl_fn( + ht_ev, impl_ev = impl_fn( src=tmp, trailing_dims_to_sort=1, dst=res, sycl_queue=exec_q, depends=[copy_ev], ) - host_tasks_list.append(ht_ev) + _manager.add_event_pair(ht_ev, impl_ev) if a1 != nd: inv_perm = sorted(range(nd), key=lambda d: perm[d]) res = dpt.permute_dims(res, inv_perm) - dpctl.SyclEvent.wait_for(host_tasks_list) return res diff --git a/dpctl/tensor/_statistical_functions.py b/dpctl/tensor/_statistical_functions.py index 367f7960e0..c37b27da85 100644 --- a/dpctl/tensor/_statistical_functions.py +++ b/dpctl/tensor/_statistical_functions.py @@ -16,11 +16,11 @@ from numpy.core.numeric import normalize_axis_tuple -import dpctl import dpctl.tensor as dpt import dpctl.tensor._tensor_elementwise_impl as tei import dpctl.tensor._tensor_impl as ti import dpctl.tensor._tensor_reductions_impl as tri +import dpctl.utils as du def _var_impl(x, axis, correction, keepdims): @@ -48,15 +48,14 @@ def _var_impl(x, axis, correction, keepdims): ) res_usm_type = x.usm_type - deps = [] - host_tasks_list = [] + _manager = du.SequentialOrderManager + dep_evs = _manager.submitted_events if inp_dt != res_dt: buf = dpt.empty_like(x, dtype=res_dt) ht_e_buf, c_e1 = ti._copy_usm_ndarray_into_usm_ndarray( - src=x, dst=buf, sycl_queue=q + src=x, dst=buf, sycl_queue=q, depends=dep_evs ) - deps.append(c_e1) - host_tasks_list.append(ht_e_buf) + _manager.add_event_pair(ht_e_buf, c_e1) else: buf = x # calculate mean @@ -65,11 +64,11 @@ def _var_impl(x, axis, correction, keepdims): # use keepdims=True path for later broadcasting if red_nd == 0: mean_ary = dpt.empty_like(buf) + dep_evs = _manager.submitted_events ht_e1, c_e2 = ti._copy_usm_ndarray_into_usm_ndarray( - src=buf, dst=mean_ary, sycl_queue=q + src=buf, dst=mean_ary, sycl_queue=q, 
depends=dep_evs ) - deps.append(c_e2) - host_tasks_list.append(ht_e1) + _manager.add_event_pair(ht_e1, c_e2) else: mean_ary = dpt.empty( res_shape, @@ -77,15 +76,15 @@ def _var_impl(x, axis, correction, keepdims): usm_type=res_usm_type, sycl_queue=q, ) + dep_evs = _manager.submitted_events ht_e1, r_e1 = tri._sum_over_axis( src=buf2, trailing_dims_to_reduce=red_nd, dst=mean_ary, sycl_queue=q, - depends=deps, + depends=dep_evs, ) - host_tasks_list.append(ht_e1) - deps.append(r_e1) + _manager.add_event_pair(ht_e1, r_e1) mean_ary_shape = res_shape + (1,) * red_nd inv_perm = sorted(range(nd), key=lambda d: perm[d]) @@ -99,10 +98,11 @@ def _var_impl(x, axis, correction, keepdims): ) if nelems_ary.shape != mean_ary_shape: nelems_ary = dpt.broadcast_to(nelems_ary, mean_ary_shape) + dep_evs = _manager.submitted_events ht_e2, d_e1 = tei._divide_inplace( - lhs=mean_ary, rhs=nelems_ary, sycl_queue=q, depends=deps + lhs=mean_ary, rhs=nelems_ary, sycl_queue=q, depends=dep_evs ) - host_tasks_list.append(ht_e2) + _manager.add_event_pair(ht_e2, d_e1) # subtract mean from original array to get deviations dev_ary = dpt.empty_like(buf) if mean_ary_shape != buf.shape: @@ -110,18 +110,17 @@ def _var_impl(x, axis, correction, keepdims): ht_e4, su_e = tei._subtract( src1=buf, src2=mean_ary, dst=dev_ary, sycl_queue=q, depends=[d_e1] ) - host_tasks_list.append(ht_e4) + _manager.add_event_pair(ht_e4, su_e) # square deviations ht_e5, sq_e = tei._square( src=dev_ary, dst=dev_ary, sycl_queue=q, depends=[su_e] ) - host_tasks_list.append(ht_e5) - deps2 = [] + _manager.add_event_pair(ht_e5, sq_e) + # take sum of squared deviations dev_ary2 = dpt.permute_dims(dev_ary, perm) if red_nd == 0: res = dev_ary - deps2.append(sq_e) else: res = dpt.empty( res_shape, @@ -136,8 +135,7 @@ def _var_impl(x, axis, correction, keepdims): sycl_queue=q, depends=[sq_e], ) - host_tasks_list.append(ht_e6) - deps2.append(r_e2) + _manager.add_event_pair(ht_e6, r_e2) if keepdims: res_shape = res_shape + (1,) * red_nd @@ -154,11 +152,12 @@ def _var_impl(x, axis, correction, keepdims): # divide in-place again if div_ary.shape != res_shape: div_ary = dpt.broadcast_to(div_ary, res.shape) + dep_evs = _manager.submitted_events ht_e7, d_e2 = tei._divide_inplace( - lhs=res, rhs=div_ary, sycl_queue=q, depends=deps2 + lhs=res, rhs=div_ary, sycl_queue=q, depends=dep_evs ) - host_tasks_list.append(ht_e7) - return res, [d_e2], host_tasks_list + _manager.add_event_pair(ht_e7, d_e2) + return res, [d_e2] def mean(x, axis=None, keepdims=False): @@ -221,25 +220,28 @@ def mean(x, axis=None, keepdims=False): if sum_nd == 0: return dpt.astype(x, res_dt, copy=True) - s_e = [] - host_tasks_list = [] + _manager = du.SequentialOrderManager + dep_evs = _manager.submitted_events if tri._sum_over_axis_dtype_supported(inp_dt, res_dt, res_usm_type, q): res = dpt.empty( res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q ) ht_e1, r_e = tri._sum_over_axis( - src=arr2, trailing_dims_to_reduce=sum_nd, dst=res, sycl_queue=q + src=arr2, + trailing_dims_to_reduce=sum_nd, + dst=res, + sycl_queue=q, + depends=dep_evs, ) - host_tasks_list.append(ht_e1) - s_e.append(r_e) + _manager.add_event_pair(ht_e1, r_e) else: tmp = dpt.empty( arr2.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q ) ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( - src=arr2, dst=tmp, sycl_queue=q + src=arr2, dst=tmp, sycl_queue=q, depends=dep_evs ) - host_tasks_list.append(ht_e_cpy) + _manager.add_event_pair(ht_e_cpy, cpy_e) res = dpt.empty( res_shape, dtype=res_dt, usm_type=res_usm_type, 
sycl_queue=q ) @@ -250,8 +252,7 @@ def mean(x, axis=None, keepdims=False): sycl_queue=q, depends=[cpy_e], ) - host_tasks_list.append(ht_e_red) - s_e.append(r_e) + _manager.add_event_pair(ht_e_red, r_e) if keepdims: res_shape = res_shape + (1,) * sum_nd @@ -266,11 +267,11 @@ def mean(x, axis=None, keepdims=False): ) if nelems_arr.shape != res_shape: nelems_arr = dpt.broadcast_to(nelems_arr, res_shape) - ht_e2, _ = tei._divide_inplace( - lhs=res, rhs=nelems_arr, sycl_queue=q, depends=s_e + dep_evs = _manager.submitted_events + ht_e2, div_e = tei._divide_inplace( + lhs=res, rhs=nelems_arr, sycl_queue=q, depends=dep_evs ) - host_tasks_list.append(ht_e2) - dpctl.SyclEvent.wait_for(host_tasks_list) + _manager.add_event_pair(ht_e2, div_e) return res @@ -321,8 +322,7 @@ def var(x, axis=None, correction=0.0, keepdims=False): if x.dtype.kind == "c": raise ValueError("`var` does not support complex types") - res, _, host_tasks_list = _var_impl(x, axis, correction, keepdims) - dpctl.SyclEvent.wait_for(host_tasks_list) + res, _ = _var_impl(x, axis, correction, keepdims) return res @@ -374,10 +374,10 @@ def std(x, axis=None, correction=0.0, keepdims=False): if x.dtype.kind == "c": raise ValueError("`std` does not support complex types") - res, deps, host_tasks_list = _var_impl(x, axis, correction, keepdims) - ht_ev, _ = tei._sqrt( + _manager = du.SequentialOrderManager + res, deps = _var_impl(x, axis, correction, keepdims) + ht_ev, sqrt_ev = tei._sqrt( src=res, dst=res, sycl_queue=res.sycl_queue, depends=deps ) - host_tasks_list.append(ht_ev) - dpctl.SyclEvent.wait_for(host_tasks_list) + _manager.add_event_pair(ht_ev, sqrt_ev) return res diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx index 800b7c823a..f9d4f851c4 100644 --- a/dpctl/tensor/_usmarray.pyx +++ b/dpctl/tensor/_usmarray.pyx @@ -24,6 +24,7 @@ import numpy as np import dpctl import dpctl.memory as dpmem +import dpctl.utils as du from .._backend cimport DPCTLSyclUSMRef from .._sycl_device_factory cimport _cached_default_device @@ -81,6 +82,7 @@ cdef class InternalUSMArrayError(Exception): cdef object _as_zero_dim_ndarray(object usm_ary): "Convert size-1 array to NumPy 0d array" mem_view = dpmem.as_usm_memory(usm_ary) + du.SequentialOrderManager.wait() host_buf = mem_view.copy_to_host() view = host_buf.view(usm_ary.dtype) view.shape = tuple() diff --git a/dpctl/tensor/_utility_functions.py b/dpctl/tensor/_utility_functions.py index 2a9c5923bf..78c5e1a913 100644 --- a/dpctl/tensor/_utility_functions.py +++ b/dpctl/tensor/_utility_functions.py @@ -1,9 +1,9 @@ from numpy.core.numeric import normalize_axis_tuple -import dpctl import dpctl.tensor as dpt import dpctl.tensor._tensor_impl as ti import dpctl.tensor._tensor_reductions_impl as tri +import dpctl.utils as du def _boolean_reduction(x, axis, keepdims, func): @@ -35,7 +35,8 @@ def _boolean_reduction(x, axis, keepdims, func): exec_q = x.sycl_queue res_usm_type = x.usm_type - wait_list = [] + _manager = du.SequentialOrderManager + dep_evs = _manager.submitted_events # always allocate the temporary as # int32 and usm-device to ensure that atomic updates # are supported @@ -50,8 +51,9 @@ def _boolean_reduction(x, axis, keepdims, func): trailing_dims_to_reduce=red_nd, dst=res_tmp, sycl_queue=exec_q, + depends=dep_evs, ) - wait_list.append(hev0) + _manager.add_event_pair(hev0, ev0) # copy to boolean result array res = dpt.empty( @@ -60,16 +62,15 @@ def _boolean_reduction(x, axis, keepdims, func): usm_type=res_usm_type, sycl_queue=exec_q, ) - hev1, _ = 
ti._copy_usm_ndarray_into_usm_ndarray( + hev1, ev1 = ti._copy_usm_ndarray_into_usm_ndarray( src=res_tmp, dst=res, sycl_queue=exec_q, depends=[ev0] ) - wait_list.append(hev1) + _manager.add_event_pair(hev1, ev1) if keepdims: res_shape = res_shape + (1,) * red_nd inv_perm = sorted(range(nd), key=lambda d: perm[d]) res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm) - dpctl.SyclEvent.wait_for(wait_list) return res From 9116e7386f7e728bb06cc539c34862e411e06abc Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 8 May 2024 15:06:13 -0500 Subject: [PATCH 10/25] Wrap call to sycl::free in try/catch Report caugh exception to std::cerr by ignore it otherwise. --- dpctl/memory/_opaque_smart_ptr.hpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/dpctl/memory/_opaque_smart_ptr.hpp b/dpctl/memory/_opaque_smart_ptr.hpp index c79be4f8d4..a9eb1eafb7 100644 --- a/dpctl/memory/_opaque_smart_ptr.hpp +++ b/dpctl/memory/_opaque_smart_ptr.hpp @@ -35,6 +35,9 @@ #include #include +#include +#include + namespace { @@ -48,7 +51,13 @@ class USMDeleter USMDeleter(const ::sycl::context &context) : _context(context) {} template void operator()(T *ptr) const { - ::sycl::free(ptr, _context); + try { + ::sycl::free(ptr, _context); + } catch (const std::exception &e) { + std::cout << "Call to sycl::free caught an exception: " << e.what() + << std::endl; + // std::terminate(); + } } private: From be401a65ce0b974e9e441a9fabb97299487acc63 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 8 May 2024 17:55:59 -0700 Subject: [PATCH 11/25] Fixed missing dependency events --- dpctl/tensor/_ctors.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/dpctl/tensor/_ctors.py b/dpctl/tensor/_ctors.py index 740253b691..45a447c623 100644 --- a/dpctl/tensor/_ctors.py +++ b/dpctl/tensor/_ctors.py @@ -1631,8 +1631,9 @@ def tril(x, /, *, k=0): sycl_queue=q, ) _manager = dpctl.utils.SequentialOrderManager + dep_evs = _manager.submitted_events hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=x, dst=res, sycl_queue=q + src=x, dst=res, sycl_queue=q, depends=dep_evs ) _manager.add_event_pair(hev, cpy_ev) elif k < -shape[nd - 2]: @@ -1652,7 +1653,10 @@ def tril(x, /, *, k=0): sycl_queue=q, ) _manager = dpctl.utils.SequentialOrderManager - hev, tril_ev = ti._tril(src=x, dst=res, k=k, sycl_queue=q) + dep_evs = _manager.submitted_events + hev, tril_ev = ti._tril( + src=x, dst=res, k=k, sycl_queue=q, depends=dep_evs + ) _manager.add_event_pair(hev, tril_ev) return res @@ -1713,8 +1717,9 @@ def triu(x, /, *, k=0): sycl_queue=q, ) _manager = dpctl.utils.SequentialOrderManager + dep_evs = _manager.submitted_events hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=x, dst=res, sycl_queue=q + src=x, dst=res, sycl_queue=q, depends=dep_evs ) _manager.add_event_pair(hev, cpy_ev) else: @@ -1726,7 +1731,10 @@ def triu(x, /, *, k=0): sycl_queue=q, ) _manager = dpctl.utils.SequentialOrderManager - hev, triu_ev = ti._triu(src=x, dst=res, k=k, sycl_queue=q) + dep_evs = _manager.submitted_events + hev, triu_ev = ti._triu( + src=x, dst=res, k=k, sycl_queue=q, depends=dep_evs + ) _manager.add_event_pair(hev, triu_ev) return res From 106c8de6b96bad6c43bc08a5e877b539ee733ee9 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 9 May 2024 12:23:38 -0500 Subject: [PATCH 12/25] Add queue synchronization points in special methods for conversion to scalars Also synchronize in __sycl_usm_array_interface__ attribute, to ensure that 
`dpctl.memory.as_usm_memory(ary).copy_to_host()` produces expected results. --- dpctl/tensor/_usmarray.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx index f9d4f851c4..443d8184a2 100644 --- a/dpctl/tensor/_usmarray.pyx +++ b/dpctl/tensor/_usmarray.pyx @@ -24,7 +24,6 @@ import numpy as np import dpctl import dpctl.memory as dpmem -import dpctl.utils as du from .._backend cimport DPCTLSyclUSMRef from .._sycl_device_factory cimport _cached_default_device @@ -82,7 +81,7 @@ cdef class InternalUSMArrayError(Exception): cdef object _as_zero_dim_ndarray(object usm_ary): "Convert size-1 array to NumPy 0d array" mem_view = dpmem.as_usm_memory(usm_ary) - du.SequentialOrderManager.wait() + usm_ary.sycl_queue.wait() host_buf = mem_view.copy_to_host() view = host_buf.view(usm_ary.dtype) view.shape = tuple() @@ -519,6 +518,8 @@ cdef class usm_ndarray: "byte_offset is not a multiple of item_size.") elem_offset = byte_offset // item_size ary_iface['offset'] = elem_offset + # must wait for content of the memory to finalize + self.sycl_queue.wait() return ary_iface @property From a5a481ecdb9a55340a88c5ab57ee7c0f3b8e6f46 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 14 May 2024 09:39:13 -0500 Subject: [PATCH 13/25] Use manager per queue in tensor implementation Move implementation from dpctl/utils/__init__.py into dedicated files. --- dpctl/tensor/_accumulation.py | 2 +- dpctl/tensor/_clip.py | 14 +- dpctl/tensor/_copy_utils.py | 24 ++-- dpctl/tensor/_ctors.py | 22 +-- dpctl/tensor/_elementwise_common.py | 4 +- dpctl/tensor/_indexing_functions.py | 6 +- dpctl/tensor/_linear_algebra_functions.py | 6 +- dpctl/tensor/_manipulation_functions.py | 19 +-- dpctl/tensor/_print.py | 6 +- dpctl/tensor/_reduction.py | 6 +- dpctl/tensor/_reshape.py | 2 +- dpctl/tensor/_search_functions.py | 2 +- dpctl/tensor/_searchsorted.py | 2 +- dpctl/tensor/_set_functions.py | 8 +- dpctl/tensor/_sorting.py | 4 +- dpctl/tensor/_statistical_functions.py | 9 +- dpctl/tensor/_utility_functions.py | 2 +- dpctl/tests/test_usm_ndarray_print.py | 9 +- dpctl/utils/__init__.py | 159 +--------------------- dpctl/utils/_intel_device_info.py | 109 +++++++++++++++ dpctl/utils/_order_manager.py | 87 ++++++++++++ 21 files changed, 275 insertions(+), 227 deletions(-) create mode 100644 dpctl/utils/_intel_device_info.py create mode 100644 dpctl/utils/_order_manager.py diff --git a/dpctl/tensor/_accumulation.py b/dpctl/tensor/_accumulation.py index be81d5d333..4605b10b63 100644 --- a/dpctl/tensor/_accumulation.py +++ b/dpctl/tensor/_accumulation.py @@ -126,7 +126,7 @@ def _accumulate_common( out = dpt.permute_dims(out, perm) final_ev = dpctl.SyclEvent() - _manager = SequentialOrderManager + _manager = SequentialOrderManager[q] depends = _manager.submitted_events if implemented_types: if not include_initial: diff --git a/dpctl/tensor/_clip.py b/dpctl/tensor/_clip.py index 43bbb29aec..c6e86ce6d0 100644 --- a/dpctl/tensor/_clip.py +++ b/dpctl/tensor/_clip.py @@ -299,7 +299,7 @@ def _clip_none(x, val, out, order, _binary_fn): x = dpt.broadcast_to(x, res_shape) if val_ary.shape != res_shape: val_ary = dpt.broadcast_to(val_ary, res_shape) - _manager = SequentialOrderManager + _manager = SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events ht_binary_ev, binary_ev = _binary_fn( src1=x, src2=val_ary, dst=out, sycl_queue=exec_q, depends=dep_evs @@ -322,7 +322,7 @@ def _clip_none(x, val, out, order, _binary_fn): buf = _empty_like_orderK(val_ary, res_dt) 
else: buf = dpt.empty_like(val_ary, dtype=res_dt, order=order) - _manager = SequentialOrderManager + _manager = SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=val_ary, dst=buf, sycl_queue=exec_q, depends=dep_evs @@ -449,7 +449,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"): else: out = dpt.empty_like(x, order=order) - _manager = SequentialOrderManager + _manager = SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=x, dst=out, sycl_queue=exec_q, depends=dep_evs @@ -672,7 +672,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"): a_min = dpt.broadcast_to(a_min, res_shape) if a_max.shape != res_shape: a_max = dpt.broadcast_to(a_max, res_shape) - _manager = SequentialOrderManager + _manager = SequentialOrderManager[exec_q] dep_ev = _manager.submitted_events ht_binary_ev, binary_ev = ti._clip( src=x, @@ -700,7 +700,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"): buf2 = _empty_like_orderK(a_max, buf2_dt) else: buf2 = dpt.empty_like(a_max, dtype=buf2_dt, order=order) - _manager = SequentialOrderManager + _manager = SequentialOrderManager[exec_q] dep_ev = _manager.submitted_events ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=a_max, dst=buf2, sycl_queue=exec_q, depends=dep_ev @@ -756,7 +756,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"): buf1 = _empty_like_orderK(a_min, buf1_dt) else: buf1 = dpt.empty_like(a_min, dtype=buf1_dt, order=order) - _manager = SequentialOrderManager + _manager = SequentialOrderManager[exec_q] dep_ev = _manager.submitted_events ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=a_min, dst=buf1, sycl_queue=exec_q, depends=dep_ev @@ -825,7 +825,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"): else: buf1 = dpt.empty_like(a_min, dtype=buf1_dt, order=order) - _manager = SequentialOrderManager + _manager = SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=a_min, dst=buf1, sycl_queue=exec_q, depends=dep_evs diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py index 5d74bd0939..d434819e04 100644 --- a/dpctl/tensor/_copy_utils.py +++ b/dpctl/tensor/_copy_utils.py @@ -40,13 +40,14 @@ def _copy_to_numpy(ary): if not isinstance(ary, dpt.usm_ndarray): raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(ary)}") nb = ary.usm_data.nbytes - hh = dpm.MemoryUSMHost(nb, queue=ary.sycl_queue) + q = ary.sycl_queue + hh = dpm.MemoryUSMHost(nb, queue=q) h = np.ndarray(nb, dtype="u1", buffer=hh).view(ary.dtype) itsz = ary.itemsize strides_bytes = tuple(si * itsz for si in ary.strides) offset = ary.__sycl_usm_array_interface__.get("offset", 0) * itsz # ensure that content of ary.usm_data is final - dpctl.utils.SequentialOrderManager.wait() + q.wait() hh.copy_from_device(ary.usm_data) return np.ndarray( ary.shape, @@ -105,7 +106,7 @@ def _copy_from_numpy_into(dst, np_ary): src_ary = src_ary.astype(np.float32) elif src_ary_dt_c == "D": src_ary = src_ary.astype(np.complex64) - _manager = dpctl.utils.SequentialOrderManager + _manager = dpctl.utils.SequentialOrderManager[copy_q] dep_ev = _manager.submitted_events # synchronizing call ti._copy_numpy_ndarray_into_usm_ndarray( @@ -208,7 +209,7 @@ def _copy_overlapping(dst, src): order="C", buffer_ctor_kwargs={"queue": q}, ) - _manager = dpctl.utils.SequentialOrderManager + _manager = 
dpctl.utils.SequentialOrderManager[q] dep_evs = _manager.submitted_events hcp1, cp1 = ti._copy_usm_ndarray_into_usm_ndarray( src=src, dst=tmp, sycl_queue=q, depends=dep_evs @@ -232,10 +233,11 @@ def _copy_same_shape(dst, src): _copy_overlapping(src=src, dst=dst) return - _manager = dpctl.utils.SequentialOrderManager + copy_q = dst.sycl_queue + _manager = dpctl.utils.SequentialOrderManager[copy_q] dep_evs = _manager.submitted_events hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=src, dst=dst, sycl_queue=dst.sycl_queue, depends=dep_evs + src=src, dst=dst, sycl_queue=copy_q, depends=dep_evs ) _manager.add_event_pair(hev, cpy_ev) @@ -724,7 +726,7 @@ def _extract_impl(ary, ary_mask, axis=0): cumsum_dt = dpt.int32 if mask_nelems < int32_t_max else dpt.int64 cumsum = dpt.empty(mask_nelems, dtype=cumsum_dt, device=ary_mask.device) exec_q = cumsum.sycl_queue - _manager = dpctl.utils.SequentialOrderManager + _manager = dpctl.utils.SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events mask_count = ti.mask_positions( ary_mask, cumsum, sycl_queue=exec_q, depends=dep_evs @@ -760,7 +762,7 @@ def _nonzero_impl(ary): cumsum = dpt.empty( mask_nelems, dtype=cumsum_dt, sycl_queue=exec_q, order="C" ) - _manager = dpctl.utils.SequentialOrderManager + _manager = dpctl.utils.SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events mask_count = ti.mask_positions( ary, cumsum, sycl_queue=exec_q, depends=dep_evs @@ -837,7 +839,7 @@ def _take_multi_index(ary, inds, p): res = dpt.empty( res_shape, dtype=ary.dtype, usm_type=res_usm_type, sycl_queue=exec_q ) - _manager = dpctl.utils.SequentialOrderManager + _manager = dpctl.utils.SequentialOrderManager[exec_q] dep_ev = _manager.submitted_events hev, take_ev = ti._take( src=ary, @@ -890,7 +892,7 @@ def _place_impl(ary, ary_mask, vals, axis=0): cumsum_dt = dpt.int32 if mask_nelems < int32_t_max else dpt.int64 cumsum = dpt.empty(mask_nelems, dtype=cumsum_dt, device=ary_mask.device) exec_q = cumsum.sycl_queue - _manager = dpctl.utils.SequentialOrderManager + _manager = dpctl.utils.SequentialOrderManager[exec_q] dep_ev = _manager.submitted_events mask_count = ti.mask_positions( ary_mask, cumsum, sycl_queue=exec_q, depends=dep_ev @@ -990,7 +992,7 @@ def _put_multi_index(ary, inds, p, vals): else: rhs = dpt.astype(vals, ary.dtype) rhs = dpt.broadcast_to(rhs, expected_vals_shape) - _manager = dpctl.utils.SequentialOrderManager + _manager = dpctl.utils.SequentialOrderManager[exec_q] dep_ev = _manager.submitted_events hev, put_ev = ti._put( dst=ary, diff --git a/dpctl/tensor/_ctors.py b/dpctl/tensor/_ctors.py index 45a447c623..fb400178f9 100644 --- a/dpctl/tensor/_ctors.py +++ b/dpctl/tensor/_ctors.py @@ -159,7 +159,7 @@ def _asarray_from_usm_ndarray( ) eq = dpctl.utils.get_execution_queue([usm_ndary.sycl_queue, copy_q]) if eq is not None: - _manager = dpctl.utils.SequentialOrderManager + _manager = dpctl.utils.SequentialOrderManager[eq] dep_evs = _manager.submitted_events hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=usm_ndary, dst=res, sycl_queue=eq, depends=dep_evs @@ -415,7 +415,7 @@ def _asarray_from_seq( sycl_queue=alloc_q, order=order, ) - _manager = dpctl.utils.SequentialOrderManager + _manager = dpctl.utils.SequentialOrderManager[exec_q] _device_copy_walker(seq_obj, res, _manager) return res else: @@ -854,7 +854,7 @@ def arange( else: _step = sc_ty(1) _start = _first - _manager = dpctl.utils.SequentialOrderManager + _manager = dpctl.utils.SequentialOrderManager[sycl_queue] # populating newly allocated array, no task 
dependencies hev, lin_ev = ti._linspace_step(_start, _step, res, sycl_queue) _manager.add_event_pair(hev, lin_ev) @@ -1001,7 +1001,7 @@ def ones( order=order, buffer_ctor_kwargs={"queue": sycl_queue}, ) - _manager = dpctl.utils.SequentialOrderManager + _manager = dpctl.utils.SequentialOrderManager[sycl_queue] # populating new allocation, no dependent events hev, full_ev = ti._full_usm_ndarray(1, res, sycl_queue) _manager.add_event_pair(hev, full_ev) @@ -1100,7 +1100,7 @@ def full( elif fill_value_type is int and np.issubdtype(dtype, np.integer): fill_value = _to_scalar(fill_value, dtype) - _manager = dpctl.utils.SequentialOrderManager + _manager = dpctl.utils.SequentialOrderManager[sycl_queue] # populating new allocation, no dependent events hev, full_ev = ti._full_usm_ndarray(fill_value, res, sycl_queue) _manager.add_event_pair(hev, full_ev) @@ -1480,7 +1480,7 @@ def linspace( start = float(start) stop = float(stop) res = dpt.empty(num, dtype=dt, usm_type=usm_type, sycl_queue=sycl_queue) - _manager = dpctl.utils.SequentialOrderManager + _manager = dpctl.utils.SequentialOrderManager[sycl_queue] hev, la_ev = ti._linspace_affine( start, stop, dst=res, include_endpoint=endpoint, sycl_queue=sycl_queue ) @@ -1578,7 +1578,7 @@ def eye( buffer_ctor_kwargs={"queue": sycl_queue}, ) if n_rows != 0 and n_cols != 0: - _manager = dpctl.utils.SequentialOrderManager + _manager = dpctl.utils.SequentialOrderManager[sycl_queue] hev, eye_ev = ti._eye(k, dst=res, sycl_queue=sycl_queue) _manager.add_event_pair(hev, eye_ev) return res @@ -1630,7 +1630,7 @@ def tril(x, /, *, k=0): usm_type=x.usm_type, sycl_queue=q, ) - _manager = dpctl.utils.SequentialOrderManager + _manager = dpctl.utils.SequentialOrderManager[q] dep_evs = _manager.submitted_events hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=x, dst=res, sycl_queue=q, depends=dep_evs @@ -1652,7 +1652,7 @@ def tril(x, /, *, k=0): usm_type=x.usm_type, sycl_queue=q, ) - _manager = dpctl.utils.SequentialOrderManager + _manager = dpctl.utils.SequentialOrderManager[q] dep_evs = _manager.submitted_events hev, tril_ev = ti._tril( src=x, dst=res, k=k, sycl_queue=q, depends=dep_evs @@ -1716,7 +1716,7 @@ def triu(x, /, *, k=0): usm_type=x.usm_type, sycl_queue=q, ) - _manager = dpctl.utils.SequentialOrderManager + _manager = dpctl.utils.SequentialOrderManager[q] dep_evs = _manager.submitted_events hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=x, dst=res, sycl_queue=q, depends=dep_evs @@ -1730,7 +1730,7 @@ def triu(x, /, *, k=0): usm_type=x.usm_type, sycl_queue=q, ) - _manager = dpctl.utils.SequentialOrderManager + _manager = dpctl.utils.SequentialOrderManager[q] dep_evs = _manager.submitted_events hev, triu_ev = ti._triu( src=x, dst=res, k=k, sycl_queue=q, depends=dep_evs diff --git a/dpctl/tensor/_elementwise_common.py b/dpctl/tensor/_elementwise_common.py index 25e485ee17..6b38444902 100644 --- a/dpctl/tensor/_elementwise_common.py +++ b/dpctl/tensor/_elementwise_common.py @@ -236,7 +236,7 @@ def __call__(self, x, /, *, out=None, order="K"): ) exec_q = x.sycl_queue - _manager = SequentialOrderManager + _manager = SequentialOrderManager[exec_q] if buf_dt is None: if out is None: if order == "K": @@ -632,7 +632,7 @@ def __call__(self, o1, o2, /, *, out=None, order="K"): ) orig_out = out - _manager = SequentialOrderManager + _manager = SequentialOrderManager[exec_q] if out is not None: if not isinstance(out, dpt.usm_ndarray): raise TypeError( diff --git a/dpctl/tensor/_indexing_functions.py b/dpctl/tensor/_indexing_functions.py index 
54b6deea3f..b70d50c1df 100644 --- a/dpctl/tensor/_indexing_functions.py +++ b/dpctl/tensor/_indexing_functions.py @@ -121,7 +121,7 @@ def take(x, indices, /, *, axis=None, mode="wrap"): res_shape, dtype=x.dtype, usm_type=res_usm_type, sycl_queue=exec_q ) - _manager = dpctl.utils.SequentialOrderManager + _manager = dpctl.utils.SequentialOrderManager[exec_q] deps_ev = _manager.submitted_events hev, take_ev = ti._take( x, (indices,), res, axis, mode, sycl_queue=exec_q, depends=deps_ev @@ -278,7 +278,7 @@ def put_vec_duplicates(vec, ind, vals): rhs = dpt.astype(vals, x.dtype) rhs = dpt.broadcast_to(rhs, val_shape) - _manager = dpctl.utils.SequentialOrderManager + _manager = dpctl.utils.SequentialOrderManager[exec_q] deps_ev = _manager.submitted_events hev, put_ev = ti._put( x, (indices,), rhs, axis, mode, sycl_queue=exec_q, depends=deps_ev @@ -375,7 +375,7 @@ def place(arr, mask, vals): if arr.shape != mask.shape or vals.ndim != 1: raise ValueError("Array sizes are not as required") cumsum = dpt.empty(mask.size, dtype="i8", sycl_queue=exec_q) - _manager = dpctl.utils.SequentialOrderManager + _manager = dpctl.utils.SequentialOrderManager[exec_q] deps_ev = _manager.submitted_events nz_count = ti.mask_positions( mask, cumsum, sycl_queue=exec_q, depends=deps_ev diff --git a/dpctl/tensor/_linear_algebra_functions.py b/dpctl/tensor/_linear_algebra_functions.py index ac0f04fb99..4d72aa28c2 100644 --- a/dpctl/tensor/_linear_algebra_functions.py +++ b/dpctl/tensor/_linear_algebra_functions.py @@ -189,7 +189,7 @@ def tensordot(x1, x2, axes=2): "supported types according to the casting rule ''safe''." ) - _manager = SequentialOrderManager + _manager = SequentialOrderManager[exec_q] if buf1_dt is None and buf2_dt is None: out = dpt.empty( res_shape, @@ -408,7 +408,7 @@ def vecdot(x1, x2, axis=-1): "supported types according to the casting rule ''safe''." 
) - _manager = SequentialOrderManager + _manager = SequentialOrderManager[exec_q] if buf1_dt is None and buf2_dt is None: if x1.dtype.kind == "c": x1_tmp = _empty_like_orderK(x1, x1.dtype) @@ -791,7 +791,7 @@ def matmul(x1, x2, out=None, dtype=None, order="K"): else "C" ) - _manager = SequentialOrderManager + _manager = SequentialOrderManager[exec_q] if buf1_dt is None and buf2_dt is None: if out is None: if order == "K": diff --git a/dpctl/tensor/_manipulation_functions.py b/dpctl/tensor/_manipulation_functions.py index 9030780e45..8a9620605e 100644 --- a/dpctl/tensor/_manipulation_functions.py +++ b/dpctl/tensor/_manipulation_functions.py @@ -343,18 +343,19 @@ def roll(X, /, shift, *, axis=None): """ if not isinstance(X, dpt.usm_ndarray): raise TypeError(f"Expected usm_ndarray type, got {type(X)}.") - _manager = dputils.SequentialOrderManager + exec_q = X.sycl_queue + _manager = dputils.SequentialOrderManager[exec_q] if axis is None: shift = operator.index(shift) dep_evs = _manager.submitted_events res = dpt.empty( - X.shape, dtype=X.dtype, usm_type=X.usm_type, sycl_queue=X.sycl_queue + X.shape, dtype=X.dtype, usm_type=X.usm_type, sycl_queue=exec_q ) hev, roll_ev = ti._copy_usm_ndarray_for_roll_1d( src=X, dst=res, shift=shift, - sycl_queue=X.sycl_queue, + sycl_queue=exec_q, depends=dep_evs, ) _manager.add_event_pair(hev, roll_ev) @@ -369,7 +370,6 @@ def roll(X, /, shift, *, axis=None): for sh, ax in broadcasted: shifts[ax] += sh - exec_q = X.sycl_queue res = dpt.empty( X.shape, dtype=X.dtype, usm_type=X.usm_type, sycl_queue=exec_q ) @@ -447,7 +447,7 @@ def _concat_axis_None(arrays): ) fill_start = 0 - _manager = dputils.SequentialOrderManager + _manager = dputils.SequentialOrderManager[exec_q] deps = _manager.submitted_events for array in arrays: fill_end = fill_start + array.size @@ -538,7 +538,7 @@ def concat(arrays, /, *, axis=0): res_shape, dtype=res_dtype, usm_type=res_usm_type, sycl_queue=exec_q ) - _manager = dputils.SequentialOrderManager + _manager = dputils.SequentialOrderManager[exec_q] deps = _manager.submitted_events fill_start = 0 for i in range(n): @@ -605,7 +605,7 @@ def stack(arrays, /, *, axis=0): res_shape, dtype=res_dtype, usm_type=res_usm_type, sycl_queue=exec_q ) - _manager = dputils.SequentialOrderManager + _manager = dputils.SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events for i in range(n): c_shapes_copy = tuple( @@ -862,7 +862,8 @@ def repeat(x, repeats, /, *, axis=None): f"got {type(repeats)}" ) - _manager = dputils.SequentialOrderManager + + _manager = dputils.SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events if scalar: res_axis_size = repeats * axis_size @@ -1050,7 +1051,7 @@ def tile(x, repetitions, /): broadcast_sh, ) # copy broadcast input into flat array - _manager = dputils.SequentialOrderManager + _manager = dputils.SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events hev, cp_ev = ti._copy_usm_ndarray_for_reshape( src=x, dst=res, sycl_queue=exec_q, depends=dep_evs diff --git a/dpctl/tensor/_print.py b/dpctl/tensor/_print.py index b371eebb6f..77bd9d178e 100644 --- a/dpctl/tensor/_print.py +++ b/dpctl/tensor/_print.py @@ -245,11 +245,12 @@ def _nd_corners(arr_in, edge_items): for i in range(arr_in.ndim) ) + exec_q = arr_in.sycl_queue arr_out = dpt.empty( res_shape, dtype=arr_in.dtype, usm_type=arr_in.usm_type, - sycl_queue=arr_in.sycl_queue, + sycl_queue=exec_q, ) blocks = [] @@ -264,10 +265,9 @@ def _nd_corners(arr_in, edge_items): else: blocks.append((np.s_[:],)) - _manager = 
dpctl.utils.SequentialOrderManager + _manager = dpctl.utils.SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events hev_list = [] - exec_q = arr_in.sycl_queue for slc in itertools.product(*blocks): hev, _ = ti._copy_usm_ndarray_into_usm_ndarray( src=arr_in[slc], diff --git a/dpctl/tensor/_reduction.py b/dpctl/tensor/_reduction.py index b8bc1a20f2..afd5f4cf9b 100644 --- a/dpctl/tensor/_reduction.py +++ b/dpctl/tensor/_reduction.py @@ -108,7 +108,7 @@ def _reduction_over_axis( res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q ) - _manager = SequentialOrderManager + _manager = SequentialOrderManager[q] dep_evs = _manager.submitted_events if red_nd == 0: ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( @@ -500,7 +500,7 @@ def _comparison_over_axis(x, axis, keepdims, out, _reduction_fn): res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=exec_q ) - _manager = SequentialOrderManager + _manager = SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events if red_nd == 0: ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( @@ -669,7 +669,7 @@ def _search_over_axis(x, axis, keepdims, out, _reduction_fn): res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=exec_q ) - _manager = SequentialOrderManager + _manager = SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events if red_nd == 0: ht_e_fill, fill_ev = ti._full_usm_ndarray( diff --git a/dpctl/tensor/_reshape.py b/dpctl/tensor/_reshape.py index 6a669ae255..eb9b7ffcfe 100644 --- a/dpctl/tensor/_reshape.py +++ b/dpctl/tensor/_reshape.py @@ -165,7 +165,7 @@ def reshape(X, /, shape, *, order="C", copy=None): buffer=X.usm_type, buffer_ctor_kwargs={"queue": copy_q}, ) - _manager = dpctl.utils.SequentialOrderManager + _manager = dpctl.utils.SequentialOrderManager[copy_q] dep_evs = _manager.submitted_events if order == "C": hev, r_e = _copy_usm_ndarray_for_reshape( diff --git a/dpctl/tensor/_search_functions.py b/dpctl/tensor/_search_functions.py index 693f25c118..c0fdfb7861 100644 --- a/dpctl/tensor/_search_functions.py +++ b/dpctl/tensor/_search_functions.py @@ -198,7 +198,7 @@ def where(condition, x1, x2, /, *, order="K", out=None): sycl_queue=exec_q, ) - _manager = SequentialOrderManager + _manager = SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events if x1_dtype != out_dtype: if order == "K": diff --git a/dpctl/tensor/_searchsorted.py b/dpctl/tensor/_searchsorted.py index 15a30bf127..131759b5ce 100644 --- a/dpctl/tensor/_searchsorted.py +++ b/dpctl/tensor/_searchsorted.py @@ -87,7 +87,7 @@ def searchsorted( x1_dt = x1.dtype x2_dt = x2.dtype - _manager = du.SequentialOrderManager + _manager = du.SequentialOrderManager[q] dep_evs = _manager.submitted_events ev = dpctl.SyclEvent() if sorter is not None: diff --git a/dpctl/tensor/_set_functions.py b/dpctl/tensor/_set_functions.py index 5056f38aec..2e2df751a9 100644 --- a/dpctl/tensor/_set_functions.py +++ b/dpctl/tensor/_set_functions.py @@ -87,7 +87,7 @@ def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray: if fx.size == 0: return fx s = dpt.empty_like(fx, order="C") - _manager = du.SequentialOrderManager + _manager = du.SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events if fx.flags.c_contiguous: ht_ev, sort_ev = _sort_ascending( @@ -183,7 +183,7 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult: return UniqueCountsResult(fx, dpt.empty_like(fx, dtype=ind_dt)) s = dpt.empty_like(fx, order="C") - _manager = du.SequentialOrderManager + _manager = du.SequentialOrderManager[exec_q] dep_evs = 
_manager.submitted_events if fx.flags.c_contiguous: ht_ev, sort_ev = _sort_ascending( @@ -317,7 +317,7 @@ def unique_inverse(x): if fx.size == 0: return UniqueInverseResult(fx, dpt.reshape(unsorting_ids, x.shape)) - _manager = du.SequentialOrderManager + _manager = du.SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events if fx.flags.c_contiguous: ht_ev, sort_ev = _argsort_ascending( @@ -493,7 +493,7 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: dpt.reshape(unsorting_ids, x.shape), dpt.empty_like(fx, dtype=ind_dt), ) - _manager = du.SequentialOrderManager + _manager = du.SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events if fx.flags.c_contiguous: ht_ev, sort_ev = _argsort_ascending( diff --git a/dpctl/tensor/_sorting.py b/dpctl/tensor/_sorting.py index 48e79c90c6..28ec42a085 100644 --- a/dpctl/tensor/_sorting.py +++ b/dpctl/tensor/_sorting.py @@ -76,7 +76,7 @@ def sort(x, /, *, axis=-1, descending=False, stable=True): ] arr = dpt.permute_dims(x, perm) exec_q = x.sycl_queue - _manager = du.SequentialOrderManager + _manager = du.SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events impl_fn = _sort_descending if descending else _sort_ascending if arr.flags.c_contiguous: @@ -159,7 +159,7 @@ def argsort(x, axis=-1, descending=False, stable=True): ] arr = dpt.permute_dims(x, perm) exec_q = x.sycl_queue - _manager = du.SequentialOrderManager + _manager = du.SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events impl_fn = _argsort_descending if descending else _argsort_ascending index_dt = ti.default_device_index_type(exec_q) diff --git a/dpctl/tensor/_statistical_functions.py b/dpctl/tensor/_statistical_functions.py index c37b27da85..6fc2066b06 100644 --- a/dpctl/tensor/_statistical_functions.py +++ b/dpctl/tensor/_statistical_functions.py @@ -48,7 +48,7 @@ def _var_impl(x, axis, correction, keepdims): ) res_usm_type = x.usm_type - _manager = du.SequentialOrderManager + _manager = du.SequentialOrderManager[q] dep_evs = _manager.submitted_events if inp_dt != res_dt: buf = dpt.empty_like(x, dtype=res_dt) @@ -220,7 +220,7 @@ def mean(x, axis=None, keepdims=False): if sum_nd == 0: return dpt.astype(x, res_dt, copy=True) - _manager = du.SequentialOrderManager + _manager = du.SequentialOrderManager[q] dep_evs = _manager.submitted_events if tri._sum_over_axis_dtype_supported(inp_dt, res_dt, res_usm_type, q): res = dpt.empty( @@ -374,10 +374,11 @@ def std(x, axis=None, correction=0.0, keepdims=False): if x.dtype.kind == "c": raise ValueError("`std` does not support complex types") - _manager = du.SequentialOrderManager + exec_q = x.sycl_queue + _manager = du.SequentialOrderManager[exec_q] res, deps = _var_impl(x, axis, correction, keepdims) ht_ev, sqrt_ev = tei._sqrt( - src=res, dst=res, sycl_queue=res.sycl_queue, depends=deps + src=res, dst=res, sycl_queue=exec_q, depends=deps ) _manager.add_event_pair(ht_ev, sqrt_ev) return res diff --git a/dpctl/tensor/_utility_functions.py b/dpctl/tensor/_utility_functions.py index 78c5e1a913..709c1dc046 100644 --- a/dpctl/tensor/_utility_functions.py +++ b/dpctl/tensor/_utility_functions.py @@ -35,7 +35,7 @@ def _boolean_reduction(x, axis, keepdims, func): exec_q = x.sycl_queue res_usm_type = x.usm_type - _manager = du.SequentialOrderManager + _manager = du.SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events # always allocate the temporary as # int32 and usm-device to ensure that atomic updates diff --git a/dpctl/tests/test_usm_ndarray_print.py b/dpctl/tests/test_usm_ndarray_print.py 
index 06b7ad6672..9e15fa3310 100644 --- a/dpctl/tests/test_usm_ndarray_print.py +++ b/dpctl/tests/test_usm_ndarray_print.py @@ -260,8 +260,8 @@ def test_usm_ndarray_str_separator(self): def test_print_repr(self): q = get_queue_or_skip() - x = dpt.asarray(0, dtype="int64", sycl_queue=q) - assert repr(x) == "usm_ndarray(0)" + x = dpt.asarray(3, dtype="int64", sycl_queue=q) + assert repr(x) == "usm_ndarray(3)" x = dpt.asarray([np.nan, np.inf], sycl_queue=q) if x.sycl_device.has_aspect_fp64: @@ -281,7 +281,10 @@ def test_print_repr(self): ) x = dpt.arange(4, dtype="i4", sycl_queue=q) - assert repr(x) == "usm_ndarray([0, 1, 2, 3], dtype=int32)" + x.sycl_queue.wait() + r = repr(x) + print(r) + assert r == "usm_ndarray([0, 1, 2, 3], dtype=int32)" dpt.set_print_options(linewidth=1) np.testing.assert_equal( diff --git a/dpctl/utils/__init__.py b/dpctl/utils/__init__.py index 598ce1cce1..0802d07939 100644 --- a/dpctl/utils/__init__.py +++ b/dpctl/utils/__init__.py @@ -18,171 +18,16 @@ A collection of utility functions. """ -from contextvars import ContextVar - -from .._sycl_device import SyclDevice -from .._sycl_event import SyclEvent from ._compute_follows_data import ( ExecutionPlacementError, get_coerced_usm_type, get_execution_queue, validate_usm_type, ) -from ._device_queries import ( - intel_device_info_device_id, - intel_device_info_free_memory, - intel_device_info_gpu_eu_count, - intel_device_info_gpu_eu_count_per_subslice, - intel_device_info_gpu_eu_simd_width, - intel_device_info_gpu_hw_threads_per_eu, - intel_device_info_gpu_slices, - intel_device_info_gpu_subslices_per_slice, - intel_device_info_max_mem_bandwidth, - intel_device_info_memory_bus_width, - intel_device_info_memory_clock_rate, -) from ._onetrace_context import onetrace_enabled -from ._seq_order_keeper import _OrderManager - - -def intel_device_info(dev, /): - """intel_device_info(sycl_device) - - For Intel(R) GPU devices returns a dictionary - with device architectural details, and an empty - dictionary otherwise. The dictionary contains - the following keys: - - device_id: - 32-bits device PCI identifier - gpu_eu_count: - Total number of execution units - gpu_hw_threads_per_eu: - Number of thread contexts in EU - gpu_eu_simd_width: - Physical SIMD width of EU - gpu_slices: - Total number of slices - gpu_subslices_per_slice: - Number of sub-slices per slice - gpu_eu_count_per_subslice: - Number of EUs in subslice - max_mem_bandwidth: - Maximum memory bandwidth in bytes/second - free_memory: - Global memory available on the device in units of bytes - - Unsupported descriptors are omitted from the dictionary. - - Descriptors other than the PCI identifier are supported only - for :class:`.SyclDevices` with Level-Zero backend. - - .. note:: - Environment variable ``ZES_ENABLE_SYSMAN`` may need to be set - to ``1`` for the ``"free_memory"`` key to be reported. 
- """ - if not isinstance(dev, SyclDevice): - raise TypeError(f"Expected dpctl.SyclDevice, got {type(dev)}") - dev_id = intel_device_info_device_id(dev) - if dev_id: - res = { - "device_id": dev_id, - } - if dev.has_aspect_gpu: - eu_count = intel_device_info_gpu_eu_count(dev) - if eu_count: - res["gpu_eu_count"] = eu_count - hw_threads = intel_device_info_gpu_hw_threads_per_eu(dev) - if hw_threads: - res["gpu_hw_threads_per_eu"] = hw_threads - simd_w = intel_device_info_gpu_eu_simd_width(dev) - if simd_w: - res["gpu_eu_simd_width"] = simd_w - n_slices = intel_device_info_gpu_slices(dev) - if n_slices: - res["gpu_slices"] = n_slices - n_subslices = intel_device_info_gpu_subslices_per_slice(dev) - if n_subslices: - res["gpu_subslices_per_slice"] = n_subslices - n_eu_per_subslice = intel_device_info_gpu_eu_count_per_subslice(dev) - if n_eu_per_subslice: - res["gpu_eu_count_per_subslice"] = n_eu_per_subslice - bw = intel_device_info_max_mem_bandwidth(dev) - if bw: - res["max_mem_bandwidth"] = bw - fm = intel_device_info_free_memory(dev) - if fm: - res["free_memory"] = fm - mcr = intel_device_info_memory_clock_rate(dev) - if mcr: - res["memory_clock_rate"] = mcr - mbw = intel_device_info_memory_bus_width(dev) - if mbw: - res["memory_bus_width"] = mbw - return res - return dict() - - -class _SequentialOrderManager: - """ - Class to orchestrate default sequential order - of the tasks offloaded from Python. - """ - - def __init__(self): - self._state = ContextVar("_seq_order_keeper", default=_OrderManager(16)) - - def __dealloc__(self): - _local = self._state.get() - SyclEvent.wait_for(_local.get_submitted_events()) - SyclEvent.wait_for(_local.get_host_task_events()) - - def __repr__(self): - return "" - - def __str__(self): - return "" - - def add_event_pair(self, host_task_ev, comp_ev): - _local = self._state.get() - if isinstance(host_task_ev, SyclEvent) and isinstance( - comp_ev, SyclEvent - ): - _local.add_to_both_events(host_task_ev, comp_ev) - else: - if not isinstance(host_task_ev, (list, tuple)): - host_task_ev = (host_task_ev,) - if not isinstance(comp_ev, (list, tuple)): - comp_ev = (comp_ev,) - _local.add_vector_to_both_events(host_task_ev, comp_ev) - - @property - def num_host_task_events(self): - _local = self._state.get() - return _local.get_num_host_task_events() - - @property - def num_submitted_events(self): - _local = self._state.get() - return _local.get_num_submitted_events() - - @property - def host_task_events(self): - _local = self._state.get() - return _local.get_host_task_events() - - @property - def submitted_events(self): - _local = self._state.get() - return _local.get_submitted_events() - - def wait(self): - _local = self._state.get() - return _local.wait() - +from ._intel_device_info import intel_device_info +from ._order_manager import SequentialOrderManager -SequentialOrderManager = _SequentialOrderManager() -SequentialOrderManager.__name__ = "SequentialOrderManager" __all__ = [ "get_execution_queue", diff --git a/dpctl/utils/_intel_device_info.py b/dpctl/utils/_intel_device_info.py new file mode 100644 index 0000000000..e085b59d8f --- /dev/null +++ b/dpctl/utils/_intel_device_info.py @@ -0,0 +1,109 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .._sycl_device import SyclDevice +from ._device_queries import ( + intel_device_info_device_id, + intel_device_info_free_memory, + intel_device_info_gpu_eu_count, + intel_device_info_gpu_eu_count_per_subslice, + intel_device_info_gpu_eu_simd_width, + intel_device_info_gpu_hw_threads_per_eu, + intel_device_info_gpu_slices, + intel_device_info_gpu_subslices_per_slice, + intel_device_info_max_mem_bandwidth, + intel_device_info_memory_bus_width, + intel_device_info_memory_clock_rate, +) + +def intel_device_info(dev, /): + """intel_device_info(sycl_device) + + For Intel(R) GPU devices returns a dictionary + with device architectural details, and an empty + dictionary otherwise. The dictionary contains + the following keys: + + device_id: + 32-bits device PCI identifier + gpu_eu_count: + Total number of execution units + gpu_hw_threads_per_eu: + Number of thread contexts in EU + gpu_eu_simd_width: + Physical SIMD width of EU + gpu_slices: + Total number of slices + gpu_subslices_per_slice: + Number of sub-slices per slice + gpu_eu_count_per_subslice: + Number of EUs in subslice + max_mem_bandwidth: + Maximum memory bandwidth in bytes/second + free_memory: + Global memory available on the device in units of bytes + + Unsupported descriptors are omitted from the dictionary. + + Descriptors other than the PCI identifier are supported only + for :class:`.SyclDevices` with Level-Zero backend. + + .. note:: + Environment variable ``ZES_ENABLE_SYSMAN`` may need to be set + to ``1`` for the ``"free_memory"`` key to be reported. 
+ """ + if not isinstance(dev, SyclDevice): + raise TypeError(f"Expected dpctl.SyclDevice, got {type(dev)}") + dev_id = intel_device_info_device_id(dev) + if dev_id: + res = { + "device_id": dev_id, + } + if dev.has_aspect_gpu: + eu_count = intel_device_info_gpu_eu_count(dev) + if eu_count: + res["gpu_eu_count"] = eu_count + hw_threads = intel_device_info_gpu_hw_threads_per_eu(dev) + if hw_threads: + res["gpu_hw_threads_per_eu"] = hw_threads + simd_w = intel_device_info_gpu_eu_simd_width(dev) + if simd_w: + res["gpu_eu_simd_width"] = simd_w + n_slices = intel_device_info_gpu_slices(dev) + if n_slices: + res["gpu_slices"] = n_slices + n_subslices = intel_device_info_gpu_subslices_per_slice(dev) + if n_subslices: + res["gpu_subslices_per_slice"] = n_subslices + n_eu_per_subslice = intel_device_info_gpu_eu_count_per_subslice(dev) + if n_eu_per_subslice: + res["gpu_eu_count_per_subslice"] = n_eu_per_subslice + bw = intel_device_info_max_mem_bandwidth(dev) + if bw: + res["max_mem_bandwidth"] = bw + fm = intel_device_info_free_memory(dev) + if fm: + res["free_memory"] = fm + mcr = intel_device_info_memory_clock_rate(dev) + if mcr: + res["memory_clock_rate"] = mcr + mbw = intel_device_info_memory_bus_width(dev) + if mbw: + res["memory_bus_width"] = mbw + return res + return dict() + +__all__ = ["intel_device_info",] diff --git a/dpctl/utils/_order_manager.py b/dpctl/utils/_order_manager.py new file mode 100644 index 0000000000..4e85cd26c5 --- /dev/null +++ b/dpctl/utils/_order_manager.py @@ -0,0 +1,87 @@ +from collections import defaultdict +from contextvars import ContextVar + +from .._sycl_event import SyclEvent +from .._sycl_queue import SyclQueue +from ._seq_order_keeper import _OrderManager + + +class _SequentialOrderManager: + """ + Class to orchestrate default sequential order + of the tasks offloaded from Python. 
+ """ + + def __init__(self): + self._state = _OrderManager(16) + + def __dealloc__(self): + _local = self._state + SyclEvent.wait_for(_local.get_submitted_events()) + SyclEvent.wait_for(_local.get_host_task_events()) + + def add_event_pair(self, host_task_ev, comp_ev): + _local = self._state + if isinstance(host_task_ev, SyclEvent) and isinstance( + comp_ev, SyclEvent + ): + _local.add_to_both_events(host_task_ev, comp_ev) + else: + if not isinstance(host_task_ev, (list, tuple)): + host_task_ev = (host_task_ev,) + if not isinstance(comp_ev, (list, tuple)): + comp_ev = (comp_ev,) + _local.add_vector_to_both_events(host_task_ev, comp_ev) + + @property + def num_host_task_events(self): + _local = self._state + return _local.get_num_host_task_events() + + @property + def num_submitted_events(self): + _local = self._state + return _local.get_num_submitted_events() + + @property + def host_task_events(self): + _local = self._state + return _local.get_host_task_events() + + @property + def submitted_events(self): + _local = self._state + return _local.get_submitted_events() + + def wait(self): + _local = self._state + return _local.wait() + + def __copy__(self): + res = _SequentialOrderManager.__new__(_SequentialOrderManager) + res._state = _OrderManager(self._state) + return res + + +class OrderManagerMap: + """Utility class to ensure sequential ordering of offloaded + tasks issued by dpctl.tensor functions""" + + def __init__(self): + self._map = ContextVar( + "global_order_manager_map", + default=defaultdict(_SequentialOrderManager), + ) + + def __getitem__(self, q: SyclQueue) -> _SequentialOrderManager: + _local = self._map.get() + print(_local) + if q in _local: + return _local[q] + else: + v = _local[q] + _local[q] = v + return v + + +SequentialOrderManager = OrderManagerMap() From 3f0f93540681bf9ebdba5e73e9c533e68256884b Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 15 May 2024 09:04:53 -0500 Subject: [PATCH 14/25] Adds clear method to SequentialOrderManager --- dpctl/utils/_order_manager.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/dpctl/utils/_order_manager.py b/dpctl/utils/_order_manager.py index 4e85cd26c5..8938b17365 100644 --- a/dpctl/utils/_order_manager.py +++ b/dpctl/utils/_order_manager.py @@ -63,7 +63,7 @@ def __copy__(self): return res -class OrderManagerMap: +class SyclQueueToOrderManagerMap: """Utility class to ensure sequential ordering of offloaded tasks issued by dpctl.tensor functions""" @@ -74,8 +74,10 @@ def __init__(self): ) def __getitem__(self, q: SyclQueue) -> _SequentialOrderManager: + """Get order manager for given SyclQueue""" _local = self._map.get() - print(_local) + if not isinstance(q, SyclQueue): + raise TypeError(f"Expected `dpctl.SyclQueue`, got {type(q)}") if q in _local: return _local[q] else: @@ -83,5 +85,10 @@ def __getitem__(self, q: SyclQueue) -> _SequentialOrderManager: _local[q] = v return v + def clear(self): + """Clear content of internal dictionary""" + _local = self._map.get() + _local.clear() + -SequentialOrderManager = OrderManagerMap() +SequentialOrderManager = SyclQueueToOrderManagerMap() From bdb2f750ed7179c6582fbd54278e9a6665f09aec Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 17 May 2024 17:04:14 -0500 Subject: [PATCH 15/25] Replaced use of synchronizing __sycl_usm_array_interface__ atribute Instead of relying on SUAI attribute which has to synchronize to get the offset, use `X._element_offset` attribute directly. 
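
For illustration, a small sketch (assuming a SYCL device is available
to the default selector) showing the two ways of computing the byte
offset; only the first one forces synchronization:

    import dpctl.tensor as dpt

    ary = dpt.arange(10, dtype="i4")[2:]  # view with a non-zero offset

    # synchronizes: building the interface dictionary waits on pending tasks
    nbytes_offset = ary.__sycl_usm_array_interface__.get("offset", 0) * ary.itemsize
    # does not synchronize: reads the offset stored in the array metadata
    nbytes_offset = ary._element_offset * ary.itemsize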
--- dpctl/tensor/_copy_utils.py | 4 ++-- dpctl/tensor/_manipulation_functions.py | 13 ++++++------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py index d434819e04..d8e15846eb 100644 --- a/dpctl/tensor/_copy_utils.py +++ b/dpctl/tensor/_copy_utils.py @@ -45,7 +45,7 @@ def _copy_to_numpy(ary): h = np.ndarray(nb, dtype="u1", buffer=hh).view(ary.dtype) itsz = ary.itemsize strides_bytes = tuple(si * itsz for si in ary.strides) - offset = ary.__sycl_usm_array_interface__.get("offset", 0) * itsz + offset = ary._element_offset * itsz # ensure that content of ary.usm_data is final q.wait() hh.copy_from_device(ary.usm_data) @@ -645,7 +645,7 @@ def astype( target_dtype, d.has_aspect_fp16, d.has_aspect_fp64 ): raise ValueError( - f"Requested dtype `{target_dtype}` is not supported by the " + f"Requested dtype '{target_dtype}' is not supported by the " "target device" ) usm_ary = usm_ary.to_device(device) diff --git a/dpctl/tensor/_manipulation_functions.py b/dpctl/tensor/_manipulation_functions.py index 8a9620605e..a7c50d4269 100644 --- a/dpctl/tensor/_manipulation_functions.py +++ b/dpctl/tensor/_manipulation_functions.py @@ -116,7 +116,7 @@ def permute_dims(X, /, axes): dtype=X.dtype, buffer=X, strides=newstrides, - offset=X.__sycl_usm_array_interface__.get("offset", 0), + offset=X._element_offset, ) @@ -244,7 +244,7 @@ def broadcast_to(X, /, shape): dtype=X.dtype, buffer=X, strides=new_sts, - offset=X.__sycl_usm_array_interface__.get("offset", 0), + offset=X._element_offset, ) @@ -817,8 +817,8 @@ def repeat(x, repeats, /, *, axis=None): dpctl.utils.validate_usm_type(usm_type, allow_none=False) if not dpt.can_cast(repeats.dtype, dpt.int64, casting="same_kind"): raise TypeError( - f"`repeats` data type `{repeats.dtype}` cannot be cast to " - "`int64` according to the casting rule ''safe.''" + f"'repeats' data type {repeats.dtype} cannot be cast to " + "'int64' according to the casting rule ''safe.''" ) if repeats.size == 1: scalar = True @@ -829,11 +829,11 @@ def repeat(x, repeats, /, *, axis=None): else: if repeats.size != axis_size: raise ValueError( - "`repeats` array must be broadcastable to the size of " + "'repeats' array must be broadcastable to the size of " "the repeated axis" ) if not dpt.all(repeats >= 0): - raise ValueError("`repeats` elements must be positive") + raise ValueError("'repeats' elements must be positive") elif isinstance(repeats, (tuple, list, range)): usm_type = x.usm_type @@ -862,7 +862,6 @@ def repeat(x, repeats, /, *, axis=None): f"got {type(repeats)}" ) - _manager = dputils.SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events if scalar: From 19302879593e51a2c7ed2cb6071ea7475973d5df Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 3 Jun 2024 19:46:06 -0500 Subject: [PATCH 16/25] Fix pre-commit --- dpctl/utils/__init__.py | 3 +-- dpctl/utils/_intel_device_info.py | 6 +++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/dpctl/utils/__init__.py b/dpctl/utils/__init__.py index 0802d07939..04a4efa251 100644 --- a/dpctl/utils/__init__.py +++ b/dpctl/utils/__init__.py @@ -24,11 +24,10 @@ get_execution_queue, validate_usm_type, ) -from ._onetrace_context import onetrace_enabled from ._intel_device_info import intel_device_info +from ._onetrace_context import onetrace_enabled from ._order_manager import SequentialOrderManager - __all__ = [ "get_execution_queue", "get_coerced_usm_type", diff --git a/dpctl/utils/_intel_device_info.py b/dpctl/utils/_intel_device_info.py 
index e085b59d8f..5c83a05261 100644 --- a/dpctl/utils/_intel_device_info.py +++ b/dpctl/utils/_intel_device_info.py @@ -29,6 +29,7 @@ intel_device_info_memory_clock_rate, ) + def intel_device_info(dev, /): """intel_device_info(sycl_device) @@ -106,4 +107,7 @@ def intel_device_info(dev, /): return res return dict() -__all__ = ["intel_device_info",] + +__all__ = [ + "intel_device_info", +] From 7d3e228e367f048b04db53665d7fb1a8134296d7 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 3 Jun 2024 20:26:51 -0500 Subject: [PATCH 17/25] Extend test symmetrically to improve coverage --- dpctl/tests/test_usm_ndarray_searchsorted.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dpctl/tests/test_usm_ndarray_searchsorted.py b/dpctl/tests/test_usm_ndarray_searchsorted.py index 0e65fcc235..47dd97bc87 100644 --- a/dpctl/tests/test_usm_ndarray_searchsorted.py +++ b/dpctl/tests/test_usm_ndarray_searchsorted.py @@ -267,11 +267,14 @@ def test_searchsorted_coerce(): x1_i4 = dpt.arange(5, dtype="i4") x1_i8 = dpt.arange(5, dtype="i8") + x2_i4 = dpt.arange(5, dtype="i4") x2_i8 = dpt.arange(5, dtype="i8") p1 = dpt.searchsorted(x1_i4, x2_i8) p2 = dpt.searchsorted(x1_i8, x2_i8) + p3 = dpt.searchsorted(x1_i8, x2_i4) assert dpt.all(p1 == p2) + assert dpt.all(p2 == p3) def test_searchsorted_validation(): From dd28026b0c45a333c0dc886e7efb68cf3800333e Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 3 Jun 2024 20:41:25 -0500 Subject: [PATCH 18/25] Add tests for order manager --- dpctl/tests/test_utils.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/dpctl/tests/test_utils.py b/dpctl/tests/test_utils.py index 4966f42b55..77735fdd47 100644 --- a/dpctl/tests/test_utils.py +++ b/dpctl/tests/test_utils.py @@ -149,3 +149,31 @@ def test_intel_device_info(): test = descriptor_name in allowed_names err_msg = f"Key '{descriptor_name}' is not recognized" assert test, err_msg + + +def test_order_manager(): + try: + q = dpctl.SyclQueue() + except dpctl.SyclQueueCreationError: + pytest.skip("Queue could not created for default-selected device") + _som = dpctl.utils.SequentialOrderManager + _mngr = _som[q] + assert isinstance(_mngr.num_host_task_events, int) + assert isinstance(_mngr.num_submitted_events, int) + assert isinstance(_mngr.submitted_events, list) + assert isinstance(_mngr.host_task_events, list) + _mngr.add_event_pair(dpctl.SyclEvent(), dpctl.SyclEvent()) + _mngr.add_event_pair([dpctl.SyclEvent()], dpctl.SyclEvent()) + _mngr.add_event_pair(dpctl.SyclEvent(), [dpctl.SyclEvent()]) + _mngr.wait() + cpy = _mngr.__copy__() + _som.clear() + del cpy + + try: + _passed = False + _som[None] + except TypeError: + _passed = True + finally: + assert _passed From 71c5b8477f7c7c5ddea8290533fe042ccd4bada9 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 3 Jun 2024 20:49:17 -0500 Subject: [PATCH 19/25] Improve coverage of concat_axis_None Incidentally, the added test also found a bug --- dpctl/tests/test_usm_ndarray_manipulation.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/dpctl/tests/test_usm_ndarray_manipulation.py b/dpctl/tests/test_usm_ndarray_manipulation.py index 1e9c30d9cf..b64f68cbb8 100644 --- a/dpctl/tests/test_usm_ndarray_manipulation.py +++ b/dpctl/tests/test_usm_ndarray_manipulation.py @@ -698,6 +698,14 @@ def test_concat_different_dtype(): assert XY.shape == (5, 2) assert XY.sycl_queue == q + X1 = dpt.arange(10, dtype="i2", sycl_queue=q) + Y1 = dpt.arange(5, dtype="i4", sycl_queue=q) + + XY1 = dpt.concat([X1[::2], Y1[::-1]], axis=None) + 
assert XY1.shape == (10,) + assert XY1.sycl_queue == q + assert XY1.dtype == Y1.dtype + def test_concat_incorrect_ndim(): q = get_queue_or_skip() From a4b510cf4ad1759a358cdb81af4a92e35b6ec5e2 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 3 Jun 2024 20:50:18 -0500 Subject: [PATCH 20/25] Fix for the bug found by test added in previous commit --- dpctl/tensor/_manipulation_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpctl/tensor/_manipulation_functions.py b/dpctl/tensor/_manipulation_functions.py index a7c50d4269..af44104288 100644 --- a/dpctl/tensor/_manipulation_functions.py +++ b/dpctl/tensor/_manipulation_functions.py @@ -470,7 +470,7 @@ def _concat_axis_None(arrays): ) _manager.add_event_pair(ht_copy_ev, cpy_ev) hev, reshape_copy_ev = ti._copy_usm_ndarray_for_reshape( - src=src_, + src=src2_, dst=res[fill_start:fill_end], sycl_queue=exec_q, depends=[cpy_ev], From 61f8c5462974171c2e82f56cb56a75a72af0f587 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 5 Jun 2024 14:09:13 -0500 Subject: [PATCH 21/25] Add docs for two new C-API functions added in this branch for 0.18 --- docs/doc_sources/api_reference/dpctl_capi.rst | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/docs/doc_sources/api_reference/dpctl_capi.rst b/docs/doc_sources/api_reference/dpctl_capi.rst index 0d69cf808d..b12a636b24 100644 --- a/docs/doc_sources/api_reference/dpctl_capi.rst +++ b/docs/doc_sources/api_reference/dpctl_capi.rst @@ -152,7 +152,16 @@ API for :c:struct:`Py_MemoryObject` :param nbytes: The size of allocation in bytes :param QRef: instance of :c:struct:`PySyclQueueRef` corresponding to ``sycl::queue`` to be associated with this allocation - :param owner: Python object instance whose deleter triggers freeing of this USM allocation + :param owner: Python object instance whose deleter triggers freeing of this USM allocation. Specify `owner=None` + to pass ownership to created Python memory object, which will use ``sycl::free(ptr, sycl_queue)`` for + deallocation. + +.. c:function:: void * Memory_GetOpaquePointer(struct Py_MemoryObject *o) + + :param o: Input ojbect + :returns: Returns opaque pointer to `std::shared_ptr` which manages the USM allocation, + or a `nullptr` if this USM allocation represented by `o` is not managed by the smart + pointer. API for :c:struct:`PyUSMArrayObject` ------------------------------------ @@ -221,6 +230,11 @@ API for :c:struct:`PyUSMArrayObject` :returns: Offset of zero multi-index array element from the beginning of the USM allocation. +.. c:function:: PyObject * UsmNDArray_GetUSMData(struct PyUSMArrayObject *arr) + + :param arr: Input object + :returns: Python memory object underlying input array `arr`. + .. 
c:function:: void UsmNDArray_SetWritableFlag(struct PyUSMArrayObject *arr, int flag) :param arr: Input object From f6a1f0651282fb7c1476d37d27487200cbfdd679 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 5 Jun 2024 14:22:00 -0500 Subject: [PATCH 22/25] Add synchronization points in example --- examples/cython/usm_memory/blackscholes/blackscholes.pyx | 6 ++++++ examples/cython/usm_memory/src/sycl_blackscholes.hpp | 7 +++---- .../external_usm_allocation/_usm_alloc_example.cpp | 3 +++ examples/pybind11/onemkl_gemv/solve.py | 6 ++++-- examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp | 2 +- .../use_dpctl_sycl_kernel/tests/test_user_kernel.py | 1 + .../pybind11/use_dpctl_sycl_kernel/use_kernel/_example.cpp | 4 ++-- 7 files changed, 20 insertions(+), 9 deletions(-) diff --git a/examples/cython/usm_memory/blackscholes/blackscholes.pyx b/examples/cython/usm_memory/blackscholes/blackscholes.pyx index b7d37993f3..6d2d2bbbe5 100644 --- a/examples/cython/usm_memory/blackscholes/blackscholes.pyx +++ b/examples/cython/usm_memory/blackscholes/blackscholes.pyx @@ -106,11 +106,15 @@ def black_scholes_price(c_dpt.usm_ndarray option_params_arr): call_put_prices = dpt.empty((n_opts, 2), dtype='d', sycl_queue=q) dp1 = option_params_arr.get_data() dp2 = call_put_prices.get_data() + # ensure content of dp1 and dp2 is no longer worked on + exec_q_ptr[0].wait() cpp_blackscholes[double](exec_q_ptr[0], n_opts, dp1, dp2) elif (typenum_ == c_dpt.UAR_FLOAT): call_put_prices = dpt.empty((n_opts, 2), dtype='f', sycl_queue=q) fp1 = option_params_arr.get_data() fp2 = call_put_prices.get_data() + # ensure content of fp1 and fp2 is no longer worked on + exec_q_ptr[0].wait() cpp_blackscholes[float](exec_q_ptr[0], n_opts, fp1, fp2) else: raise ValueError("Unsupported data-type") @@ -196,11 +200,13 @@ def populate_params( if (typenum_ == c_dpt.UAR_DOUBLE): dp = option_params_arr.get_data() + exec_q_ptr[0].wait() cpp_populate_params[double]( exec_q_ptr[0], n_opts, dp, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, seed ) elif (typenum_ == c_dpt.UAR_FLOAT): fp = option_params_arr.get_data() + exec_q_ptr[0].wait() cpp_populate_params[float]( exec_q_ptr[0], n_opts, fp, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, seed ) diff --git a/examples/cython/usm_memory/src/sycl_blackscholes.hpp b/examples/cython/usm_memory/src/sycl_blackscholes.hpp index db7b1e3e31..376b0f54c5 100644 --- a/examples/cython/usm_memory/src/sycl_blackscholes.hpp +++ b/examples/cython/usm_memory/src/sycl_blackscholes.hpp @@ -47,7 +47,7 @@ constexpr int CALL = 0; constexpr int PUT = 1; template -void cpp_blackscholes(sycl::queue q, size_t n_opts, T *params, T *callput) +void cpp_blackscholes(sycl::queue &q, size_t n_opts, T *params, T *callput) { using data_t = T; @@ -57,8 +57,7 @@ void cpp_blackscholes(sycl::queue q, size_t n_opts, T *params, T *callput) data_t half = one / two; cgh.parallel_for>( - sycl::range<1>(n_opts), - [=](sycl::id<1> idx) + sycl::range<1>(n_opts), [=](sycl::id<1> idx) { const size_t i = n_params * idx[0]; const data_t opt_price = params[i + PRICE]; @@ -106,7 +105,7 @@ void cpp_blackscholes(sycl::queue q, size_t n_opts, T *params, T *callput) const size_t callput_i = n_prices * idx[0]; callput[callput_i + CALL] = call_price; callput[callput_i + PUT] = put_price; - }); + }); }); e.wait_and_throw(); diff --git a/examples/pybind11/external_usm_allocation/external_usm_allocation/_usm_alloc_example.cpp b/examples/pybind11/external_usm_allocation/external_usm_allocation/_usm_alloc_example.cpp index 11f2d2c531..2625cd4ba4 100644 --- 
a/examples/pybind11/external_usm_allocation/external_usm_allocation/_usm_alloc_example.cpp +++ b/examples/pybind11/external_usm_allocation/external_usm_allocation/_usm_alloc_example.cpp @@ -114,6 +114,9 @@ py::dict construct_sua_iface(DMatrix &m) iface["typestr"] = "|f8"; iface["syclobj"] = syclobj; + // ensure that content of array is flushed out + m.get_queue().wait(); + return iface; } diff --git a/examples/pybind11/onemkl_gemv/solve.py b/examples/pybind11/onemkl_gemv/solve.py index 8823097613..6c350ab801 100644 --- a/examples/pybind11/onemkl_gemv/solve.py +++ b/examples/pybind11/onemkl_gemv/solve.py @@ -127,12 +127,14 @@ def cg_solve(A, b): converged is False if solver has not converged, or the iteration number """ exec_queue = A.sycl_queue + exec_queue.wait() + x = dpt.zeros_like(b) Ap = dpt.empty_like(x) all_host_tasks = [] - r = dpt.copy(b) # synchronous copy - p = dpt.copy(b) # synchronous copy + r = dpt.copy(b) + p = dpt.copy(b) rsold = sycl_gemm.norm_squared_blocking(exec_queue, r) if rsold < 1e-20: diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp index 10eb794aba..1ebf5e7bff 100644 --- a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp +++ b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp @@ -41,7 +41,7 @@ namespace py = pybind11; using dpctl::utils::keep_args_alive; std::pair -py_gemv(sycl::queue q, +py_gemv(sycl::queue &q, dpctl::tensor::usm_ndarray matrix, dpctl::tensor::usm_ndarray vector, dpctl::tensor::usm_ndarray result, diff --git a/examples/pybind11/use_dpctl_sycl_kernel/tests/test_user_kernel.py b/examples/pybind11/use_dpctl_sycl_kernel/tests/test_user_kernel.py index 085885810f..ad6bfb90f7 100644 --- a/examples/pybind11/use_dpctl_sycl_kernel/tests/test_user_kernel.py +++ b/examples/pybind11/use_dpctl_sycl_kernel/tests/test_user_kernel.py @@ -64,6 +64,7 @@ def test_kernel_submit_through_extension(): x = dpt.arange(0, stop=13, step=1, dtype="i4", sycl_queue=q) y = dpt.zeros_like(x) + q.wait() uk.submit_custom_kernel(q, krn, x, y, []) assert np.array_equal(dpt.asnumpy(y), np.arange(0, 26, step=2, dtype="i4")) diff --git a/examples/pybind11/use_dpctl_sycl_kernel/use_kernel/_example.cpp b/examples/pybind11/use_dpctl_sycl_kernel/use_kernel/_example.cpp index 98a5414e9f..92d0066db2 100644 --- a/examples/pybind11/use_dpctl_sycl_kernel/use_kernel/_example.cpp +++ b/examples/pybind11/use_dpctl_sycl_kernel/use_kernel/_example.cpp @@ -35,8 +35,8 @@ namespace py = pybind11; -void submit_custom_kernel(sycl::queue q, - sycl::kernel krn, +void submit_custom_kernel(sycl::queue &q, + sycl::kernel &krn, dpctl::tensor::usm_ndarray x, dpctl::tensor::usm_ndarray y, const std::vector &depends = {}) From 759c1d014ab17d06b875fdb60cf64be93668ec4b Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 5 Jun 2024 15:13:20 -0500 Subject: [PATCH 23/25] Added wait call to cppclass queue and cppclass even in sycl.pxd --- dpctl/sycl.pxd | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dpctl/sycl.pxd b/dpctl/sycl.pxd index f1ffe9cf48..45c6d652b2 100644 --- a/dpctl/sycl.pxd +++ b/dpctl/sycl.pxd @@ -22,6 +22,7 @@ from . 
cimport _backend as dpctl_backend
 
 cdef extern from "sycl/sycl.hpp" namespace "sycl":
     cdef cppclass queue "sycl::queue":
+        void wait() nogil
         pass
 
     cdef cppclass device "sycl::device":
@@ -31,6 +32,7 @@ cdef extern from "sycl/sycl.hpp" namespace "sycl":
         pass
 
     cdef cppclass event "sycl::event":
+        void wait() nogil
         pass
 
     cdef cppclass kernel "sycl::kernel":

From 7bc1ef8fd5d5d22c70c1e2c0795e73ef2cda4794 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Wed, 5 Jun 2024 15:21:01 -0500
Subject: [PATCH 24/25] Fixed issues found by @ndgrigorian

---
 docs/doc_sources/api_reference/dpctl_capi.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/doc_sources/api_reference/dpctl_capi.rst b/docs/doc_sources/api_reference/dpctl_capi.rst
index b12a636b24..382859efd8 100644
--- a/docs/doc_sources/api_reference/dpctl_capi.rst
+++ b/docs/doc_sources/api_reference/dpctl_capi.rst
@@ -158,9 +158,9 @@ API for :c:struct:`Py_MemoryObject`
 
 .. c:function:: void * Memory_GetOpaquePointer(struct Py_MemoryObject *o)
 
-    :param o: Input ojbect
+    :param o: Input object
     :returns: Returns opaque pointer to `std::shared_ptr` which manages the USM allocation,
-        or a `nullptr` if this USM allocation represented by `o` is not managed by the smart
+        or a `nullptr` if the USM allocation represented by `o` is not managed by the smart
         pointer.
 
 API for :c:struct:`PyUSMArrayObject`

From f4e3b6f1b635ec1f0fb83b3146cc57055b4f3aff Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Wed, 5 Jun 2024 15:51:15 -0500
Subject: [PATCH 25/25] Add comments for usm_ndarray::get_usm_data method

---
 dpctl/apis/include/dpctl4pybind11.hpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/dpctl/apis/include/dpctl4pybind11.hpp b/dpctl/apis/include/dpctl4pybind11.hpp
index 0c2b00af0f..8fb435b355 100644
--- a/dpctl/apis/include/dpctl4pybind11.hpp
+++ b/dpctl/apis/include/dpctl4pybind11.hpp
@@ -1097,13 +1097,16 @@ class usm_ndarray : public py::object
         return static_cast<bool>(flags & api.USM_ARRAY_WRITABLE_);
     }
 
+    /*! @brief Get usm_data property of array */
     py::object get_usm_data() const
     {
         PyUSMArrayObject *raw_ar = usm_array_ptr();
 
         auto const &api = ::dpctl::detail::dpctl_capi::get();
 
+        // UsmNDArray_GetUSMData_ gives a new reference
        PyObject *usm_data = api.UsmNDArray_GetUSMData_(raw_ar);
 
+        // pass reference ownership to py::object
         return py::reinterpret_steal<py::object>(usm_data);
     }
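
The ownership relationship the new comments describe can also be seen
from Python (a small sketch, assuming a default-selected SYCL device is
available): `usm_ndarray.usm_data` returns the Python memory object
underlying the array, and holding it keeps the USM allocation alive
independently of the array itself:

    import dpctl.tensor as dpt

    x = dpt.empty(8, dtype="f4", usm_type="device")
    mobj = x.usm_data  # Python-level counterpart of get_usm_data()
    del x              # the memory object still owns the allocation
    assert mobj.nbytes >= 8 * 4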