diff --git a/dpctl/.gitignore b/dpctl/.gitignore index 3e23a8af25..3376d3081c 100644 --- a/dpctl/.gitignore +++ b/dpctl/.gitignore @@ -1,5 +1,5 @@ *.so -*.cpp +_*.cpp *.cxx *.c *.h diff --git a/dpctl/apis/include/dpctl4pybind11.hpp b/dpctl/apis/include/dpctl4pybind11.hpp index f55ef3ec93..5b40f222ce 100644 --- a/dpctl/apis/include/dpctl4pybind11.hpp +++ b/dpctl/apis/include/dpctl4pybind11.hpp @@ -34,6 +34,295 @@ namespace py = pybind11; +namespace dpctl +{ +namespace detail +{ + +// Lookup a type according to its size, and return a value corresponding to the +// NumPy typenum. +template constexpr int platform_typeid_lookup() +{ + return -1; +} + +template +constexpr int platform_typeid_lookup(int I, Ints... Is) +{ + return sizeof(Concrete) == sizeof(T) + ? I + : platform_typeid_lookup(Is...); +} + +struct dpctl_capi +{ + + // dpctl type objects + PyTypeObject *Py_SyclDeviceType_; + PyTypeObject *PySyclDeviceType_; + PyTypeObject *Py_SyclContextType_; + PyTypeObject *PySyclContextType_; + PyTypeObject *Py_SyclEventType_; + PyTypeObject *PySyclEventType_; + PyTypeObject *Py_SyclQueueType_; + PyTypeObject *PySyclQueueType_; + PyTypeObject *Py_MemoryType_; + PyTypeObject *PyMemoryUSMDeviceType_; + PyTypeObject *PyMemoryUSMSharedType_; + PyTypeObject *PyMemoryUSMHostType_; + PyTypeObject *PyUSMArrayType_; + + DPCTLSyclDeviceRef (*SyclDevice_GetDeviceRef_)(PySyclDeviceObject *); + PySyclDeviceObject *(*SyclDevice_Make_)(DPCTLSyclDeviceRef); + + DPCTLSyclContextRef (*SyclContext_GetContextRef_)(PySyclContextObject *); + PySyclContextObject *(*SyclContext_Make_)(DPCTLSyclContextRef); + + DPCTLSyclEventRef (*SyclEvent_GetEventRef_)(PySyclEventObject *); + PySyclEventObject *(*SyclEvent_Make_)(DPCTLSyclEventRef); + + DPCTLSyclQueueRef (*SyclQueue_GetQueueRef_)(PySyclQueueObject *); + PySyclQueueObject *(*SyclQueue_Make_)(DPCTLSyclQueueRef); + + // memory + DPCTLSyclUSMRef (*Memory_GetUsmPointer_)(Py_MemoryObject *); + DPCTLSyclContextRef (*Memory_GetContextRef_)(Py_MemoryObject *); + DPCTLSyclQueueRef (*Memory_GetQueueRef_)(Py_MemoryObject *); + size_t (*Memory_GetNumBytes_)(Py_MemoryObject *); + PyObject *(*Memory_Make_)(DPCTLSyclUSMRef, + size_t, + DPCTLSyclQueueRef, + PyObject *); + + // tensor + char *(*UsmNDArray_GetData_)(PyUSMArrayObject *); + int (*UsmNDArray_GetNDim_)(PyUSMArrayObject *); + py::ssize_t *(*UsmNDArray_GetShape_)(PyUSMArrayObject *); + py::ssize_t *(*UsmNDArray_GetStrides_)(PyUSMArrayObject *); + int (*UsmNDArray_GetTypenum_)(PyUSMArrayObject *); + int (*UsmNDArray_GetElementSize_)(PyUSMArrayObject *); + int (*UsmNDArray_GetFlags_)(PyUSMArrayObject *); + DPCTLSyclQueueRef (*UsmNDArray_GetQueueRef_)(PyUSMArrayObject *); + py::ssize_t (*UsmNDArray_GetOffset_)(PyUSMArrayObject *); + + int USM_ARRAY_C_CONTIGUOUS_; + int USM_ARRAY_F_CONTIGUOUS_; + int USM_ARRAY_WRITABLE_; + int UAR_BOOL_, UAR_BYTE_, UAR_UBYTE_, UAR_SHORT_, UAR_USHORT_, UAR_INT_, + UAR_UINT_, UAR_LONG_, UAR_ULONG_, UAR_LONGLONG_, UAR_ULONGLONG_, + UAR_FLOAT_, UAR_DOUBLE_, UAR_CFLOAT_, UAR_CDOUBLE_, UAR_TYPE_SENTINEL_, + UAR_HALF_; + int UAR_INT8_, UAR_UINT8_, UAR_INT16_, UAR_UINT16_, UAR_INT32_, UAR_UINT32_, + UAR_INT64_, UAR_UINT64_; + + bool PySyclDevice_Check_(PyObject *obj) const + { + return PyObject_TypeCheck(obj, PySyclDeviceType_) != 0; + } + bool PySyclContext_Check_(PyObject *obj) const + { + return PyObject_TypeCheck(obj, PySyclContextType_) != 0; + } + bool PySyclEvent_Check_(PyObject *obj) const + { + return PyObject_TypeCheck(obj, PySyclEventType_) != 0; + } + bool PySyclQueue_Check_(PyObject *obj) 
const + { + return PyObject_TypeCheck(obj, PySyclQueueType_) != 0; + } + + ~dpctl_capi(){}; + + static auto &get() + { + static dpctl_capi api = lookup(); + return api; + } + + py::object default_sycl_queue_pyobj() + { + return *default_sycl_queue; + } + py::object default_usm_memory_pyobj() + { + return *default_usm_memory; + } + py::object default_usm_ndarray_pyobj() + { + return *default_usm_ndarray; + } + py::object as_usm_memory_pyobj() + { + return *as_usm_memory; + } + +private: + struct Deleter + { + void operator()(py::object *p) const + { + bool guard = (Py_IsInitialized() && !_Py_IsFinalizing()); + + if (guard) { + delete p; + } + } + }; + + std::shared_ptr default_sycl_queue; + std::shared_ptr default_usm_memory; + std::shared_ptr default_usm_ndarray; + std::shared_ptr as_usm_memory; + + dpctl_capi() + : default_sycl_queue{}, default_usm_memory{}, default_usm_ndarray{}, + as_usm_memory{} + { + // Import Cython-generated C-API for dpctl + // This imports python modules and initializes + // static variables such as function pointers for C-API, + // e.g. SyclDevice_GetDeviceRef, etc. + // pointers to Python types, i.e. PySyclDeviceType, etc. + // and exported constants, i.e. USM_ARRAY_C_CONTIGUOUS, etc. + import_dpctl(); + + // Python type objects for classes implemented by dpctl + this->Py_SyclDeviceType_ = &Py_SyclDeviceType; + this->PySyclDeviceType_ = &PySyclDeviceType; + this->Py_SyclContextType_ = &Py_SyclContextType; + this->PySyclContextType_ = &PySyclContextType; + this->Py_SyclEventType_ = &Py_SyclEventType; + this->PySyclEventType_ = &PySyclEventType; + this->Py_SyclQueueType_ = &Py_SyclQueueType; + this->PySyclQueueType_ = &PySyclQueueType; + this->Py_MemoryType_ = &Py_MemoryType; + this->PyMemoryUSMDeviceType_ = &PyMemoryUSMDeviceType; + this->PyMemoryUSMSharedType_ = &PyMemoryUSMSharedType; + this->PyMemoryUSMHostType_ = &PyMemoryUSMHostType; + this->PyUSMArrayType_ = &PyUSMArrayType; + + // SyclDevice API + this->SyclDevice_GetDeviceRef_ = SyclDevice_GetDeviceRef; + this->SyclDevice_Make_ = SyclDevice_Make; + + // SyclContext API + this->SyclContext_GetContextRef_ = SyclContext_GetContextRef; + this->SyclContext_Make_ = SyclContext_Make; + + // SyclEvent API + this->SyclEvent_GetEventRef_ = SyclEvent_GetEventRef; + this->SyclEvent_Make_ = SyclEvent_Make; + + // SyclQueue API + this->SyclQueue_GetQueueRef_ = SyclQueue_GetQueueRef; + this->SyclQueue_Make_ = SyclQueue_Make; + + // dpctl.memory API + this->Memory_GetUsmPointer_ = Memory_GetUsmPointer; + this->Memory_GetContextRef_ = Memory_GetContextRef; + this->Memory_GetQueueRef_ = Memory_GetQueueRef; + this->Memory_GetNumBytes_ = Memory_GetNumBytes; + this->Memory_Make_ = Memory_Make; + + // dpctl.tensor.usm_ndarray API + this->UsmNDArray_GetData_ = UsmNDArray_GetData; + this->UsmNDArray_GetNDim_ = UsmNDArray_GetNDim; + this->UsmNDArray_GetShape_ = UsmNDArray_GetShape; + this->UsmNDArray_GetStrides_ = UsmNDArray_GetStrides; + this->UsmNDArray_GetTypenum_ = UsmNDArray_GetTypenum; + this->UsmNDArray_GetElementSize_ = UsmNDArray_GetElementSize; + this->UsmNDArray_GetFlags_ = UsmNDArray_GetFlags; + this->UsmNDArray_GetQueueRef_ = UsmNDArray_GetQueueRef; + this->UsmNDArray_GetOffset_ = UsmNDArray_GetOffset; + + // constants + this->USM_ARRAY_C_CONTIGUOUS_ = USM_ARRAY_C_CONTIGUOUS; + this->USM_ARRAY_F_CONTIGUOUS_ = USM_ARRAY_F_CONTIGUOUS; + this->USM_ARRAY_WRITABLE_ = USM_ARRAY_WRITABLE; + this->UAR_BOOL_ = UAR_BOOL; + this->UAR_SHORT_ = UAR_SHORT; + this->UAR_USHORT_ = UAR_USHORT; + this->UAR_INT_ = UAR_INT; + 
this->UAR_UINT_ = UAR_UINT; + this->UAR_LONG_ = UAR_LONG; + this->UAR_ULONG_ = UAR_ULONG; + this->UAR_LONGLONG_ = UAR_LONGLONG; + this->UAR_ULONGLONG_ = UAR_ULONGLONG; + this->UAR_FLOAT_ = UAR_FLOAT; + this->UAR_DOUBLE_ = UAR_DOUBLE; + this->UAR_CFLOAT_ = UAR_CFLOAT; + this->UAR_CDOUBLE_ = UAR_CDOUBLE; + this->UAR_TYPE_SENTINEL_ = UAR_TYPE_SENTINEL; + this->UAR_HALF_ = UAR_HALF; + + // deduced disjoint types + this->UAR_INT8_ = UAR_BYTE; + this->UAR_UINT8_ = UAR_UBYTE; + this->UAR_INT16_ = UAR_SHORT; + this->UAR_UINT16_ = UAR_USHORT; + this->UAR_INT32_ = + platform_typeid_lookup( + UAR_LONG, UAR_INT, UAR_SHORT); + this->UAR_UINT32_ = + platform_typeid_lookup(UAR_ULONG, UAR_UINT, + UAR_USHORT); + this->UAR_INT64_ = + platform_typeid_lookup( + UAR_LONG, UAR_LONGLONG, UAR_INT); + this->UAR_UINT64_ = + platform_typeid_lookup( + UAR_ULONG, UAR_ULONGLONG, UAR_UINT); + + // create shared pointers to python objects used in type-casters + // for dpctl::memory::usm_memory and dpctl::tensor::usm_ndarray + sycl::queue q_{}; + PySyclQueueObject *py_q_tmp = + SyclQueue_Make(reinterpret_cast(&q_)); + py::object py_sycl_queue = py::reinterpret_steal( + reinterpret_cast(py_q_tmp)); + + default_sycl_queue = std::shared_ptr( + new py::object(py_sycl_queue), Deleter{}); + + py::module_ mod_memory = py::module_::import("dpctl.memory"); + py::object py_as_usm_memory = mod_memory.attr("as_usm_memory"); + as_usm_memory = std::shared_ptr( + new py::object{py_as_usm_memory}, Deleter{}); + + auto mem_kl = mod_memory.attr("MemoryUSMHost"); + py::object py_default_usm_memory = + mem_kl(1, py::arg("queue") = py_sycl_queue); + default_usm_memory = std::shared_ptr( + new py::object{py_default_usm_memory}, Deleter{}); + + py::module_ mod_usmarray = + py::module_::import("dpctl.tensor._usmarray"); + auto tensor_kl = mod_usmarray.attr("usm_ndarray"); + + py::object py_default_usm_ndarray = + tensor_kl(py::tuple(), py::arg("dtype") = py::str("u1"), + py::arg("buffer") = py_default_usm_memory); + + default_usm_ndarray = std::shared_ptr( + new py::object{py_default_usm_ndarray}, Deleter{}); + } + + dpctl_capi(dpctl_capi const &) = default; + dpctl_capi &operator=(dpctl_capi const &) = default; + + static dpctl_capi lookup() + { + static dpctl_capi api; + return api; + } + +}; // struct dpctl_capi +} // namespace detail +} // namespace dpctl + namespace pybind11 { namespace detail @@ -88,8 +377,9 @@ template <> struct type_caster bool load(handle src, bool) { PyObject *source = src.ptr(); - if (PyObject_TypeCheck(source, &PySyclQueueType)) { - DPCTLSyclQueueRef QRef = SyclQueue_GetQueueRef( + auto &api = ::dpctl::detail::dpctl_capi::get(); + if (api.PySyclQueue_Check_(source)) { + DPCTLSyclQueueRef QRef = api.SyclQueue_GetQueueRef_( reinterpret_cast(source)); value = std::make_unique( *(reinterpret_cast(QRef))); @@ -103,7 +393,9 @@ template <> struct type_caster static handle cast(sycl::queue src, return_value_policy, handle) { - auto tmp = SyclQueue_Make(reinterpret_cast(&src)); + auto &api = ::dpctl::detail::dpctl_capi::get(); + auto tmp = + api.SyclQueue_Make_(reinterpret_cast(&src)); return handle(reinterpret_cast(tmp)); } @@ -120,8 +412,9 @@ template <> struct type_caster bool load(handle src, bool) { PyObject *source = src.ptr(); - if (PyObject_TypeCheck(source, &PySyclDeviceType)) { - DPCTLSyclDeviceRef DRef = SyclDevice_GetDeviceRef( + auto &api = ::dpctl::detail::dpctl_capi::get(); + if (api.PySyclDevice_Check_(source)) { + DPCTLSyclDeviceRef DRef = api.SyclDevice_GetDeviceRef_( reinterpret_cast(source)); value = 
std::make_unique( *(reinterpret_cast(DRef))); @@ -135,7 +428,9 @@ template <> struct type_caster static handle cast(sycl::device src, return_value_policy, handle) { - auto tmp = SyclDevice_Make(reinterpret_cast(&src)); + auto &api = ::dpctl::detail::dpctl_capi::get(); + auto tmp = + api.SyclDevice_Make_(reinterpret_cast(&src)); return handle(reinterpret_cast(tmp)); } @@ -152,8 +447,9 @@ template <> struct type_caster bool load(handle src, bool) { PyObject *source = src.ptr(); - if (PyObject_TypeCheck(source, &PySyclContextType)) { - DPCTLSyclContextRef CRef = SyclContext_GetContextRef( + auto &api = ::dpctl::detail::dpctl_capi::get(); + if (api.PySyclContext_Check_(source)) { + DPCTLSyclContextRef CRef = api.SyclContext_GetContextRef_( reinterpret_cast(source)); value = std::make_unique( *(reinterpret_cast(CRef))); @@ -167,8 +463,9 @@ template <> struct type_caster static handle cast(sycl::context src, return_value_policy, handle) { + auto &api = ::dpctl::detail::dpctl_capi::get(); auto tmp = - SyclContext_Make(reinterpret_cast(&src)); + api.SyclContext_Make_(reinterpret_cast(&src)); return handle(reinterpret_cast(tmp)); } @@ -185,8 +482,9 @@ template <> struct type_caster bool load(handle src, bool) { PyObject *source = src.ptr(); - if (PyObject_TypeCheck(source, &PySyclEventType)) { - DPCTLSyclEventRef ERef = SyclEvent_GetEventRef( + auto &api = ::dpctl::detail::dpctl_capi::get(); + if (api.PySyclEvent_Check_(source)) { + DPCTLSyclEventRef ERef = api.SyclEvent_GetEventRef_( reinterpret_cast(source)); value = std::make_unique( *(reinterpret_cast(ERef))); @@ -200,7 +498,9 @@ template <> struct type_caster static handle cast(sycl::event src, return_value_policy, handle) { - auto tmp = SyclEvent_Make(reinterpret_cast(&src)); + auto &api = ::dpctl::detail::dpctl_capi::get(); + auto tmp = + api.SyclEvent_Make_(reinterpret_cast(&src)); return handle(reinterpret_cast(tmp)); } @@ -212,93 +512,6 @@ template <> struct type_caster namespace dpctl { -namespace detail -{ - -struct dpctl_api -{ -public: - static dpctl_api &get() - { - static dpctl_api api; - return api; - } - - py::object sycl_queue_() - { - return *sycl_queue; - } - py::object default_usm_memory_() - { - return *default_usm_memory; - } - py::object default_usm_ndarray_() - { - return *default_usm_ndarray; - } - py::object as_usm_memory_() - { - return *as_usm_memory; - } - -private: - struct Deleter - { - void operator()(py::object *p) const - { - bool guard = (Py_IsInitialized() && !_Py_IsFinalizing()); - - if (guard) { - delete p; - } - } - }; - - std::shared_ptr sycl_queue; - std::shared_ptr default_usm_memory; - std::shared_ptr default_usm_ndarray; - std::shared_ptr as_usm_memory; - - dpctl_api() : sycl_queue{}, default_usm_memory{}, default_usm_ndarray{} - { - import_dpctl(); - - sycl::queue q_; - py::object py_sycl_queue = py::cast(q_); - sycl_queue = std::shared_ptr(new py::object{py_sycl_queue}, - Deleter{}); - - py::module_ mod_memory = py::module_::import("dpctl.memory"); - py::object py_as_usm_memory = mod_memory.attr("as_usm_memory"); - as_usm_memory = std::shared_ptr( - new py::object{py_as_usm_memory}, Deleter{}); - - auto mem_kl = mod_memory.attr("MemoryUSMHost"); - py::object py_default_usm_memory = - mem_kl(1, py::arg("queue") = py_sycl_queue); - default_usm_memory = std::shared_ptr( - new py::object{py_default_usm_memory}, Deleter{}); - - py::module_ mod_usmarray = - py::module_::import("dpctl.tensor._usmarray"); - auto tensor_kl = mod_usmarray.attr("usm_ndarray"); - - py::object py_default_usm_ndarray = - 
tensor_kl(py::tuple(), py::arg("dtype") = py::str("u1"), - py::arg("buffer") = py_default_usm_memory); - - default_usm_ndarray = std::shared_ptr( - new py::object{py_default_usm_ndarray}, Deleter{}); - } - -public: - dpctl_api(dpctl_api const &) = delete; - void operator=(dpctl_api const &) = delete; - ~dpctl_api(){}; -}; - -} // namespace detail - namespace memory { @@ -313,13 +526,16 @@ class usm_memory : public py::object usm_memory, py::object, [](PyObject *o) -> bool { - return PyObject_TypeCheck(o, &Py_MemoryType) != 0; + return PyObject_TypeCheck( + o, ::dpctl::detail::dpctl_capi::get().Py_MemoryType_) != + 0; }, [](PyObject *o) -> PyObject * { return as_usm_memory(o); }) usm_memory() - : py::object(::dpctl::detail::dpctl_api::get().default_usm_memory_(), - borrowed_t{}) + : py::object( + ::dpctl::detail::dpctl_capi::get().default_usm_memory_pyobj(), + borrowed_t{}) { if (!m_ptr) throw py::error_already_set(); @@ -328,7 +544,8 @@ class usm_memory : public py::object sycl::queue get_queue() const { Py_MemoryObject *mem_obj = reinterpret_cast(m_ptr); - DPCTLSyclQueueRef QRef = Memory_GetQueueRef(mem_obj); + auto &api = ::dpctl::detail::dpctl_capi::get(); + DPCTLSyclQueueRef QRef = api.Memory_GetQueueRef_(mem_obj); sycl::queue *obj_q = reinterpret_cast(QRef); return *obj_q; } @@ -336,14 +553,16 @@ class usm_memory : public py::object char *get_pointer() const { Py_MemoryObject *mem_obj = reinterpret_cast(m_ptr); - DPCTLSyclUSMRef MRef = Memory_GetUsmPointer(mem_obj); + auto &api = ::dpctl::detail::dpctl_capi::get(); + DPCTLSyclUSMRef MRef = api.Memory_GetUsmPointer_(mem_obj); return reinterpret_cast(MRef); } size_t get_nbytes() const { + auto &api = ::dpctl::detail::dpctl_capi::get(); Py_MemoryObject *mem_obj = reinterpret_cast(m_ptr); - return Memory_GetNumBytes(mem_obj); + return api.Memory_GetNumBytes_(mem_obj); } protected: @@ -355,7 +574,8 @@ class usm_memory : public py::object return nullptr; } - auto convertor = ::dpctl::detail::dpctl_api::get().as_usm_memory_(); + auto convertor = + ::dpctl::detail::dpctl_capi::get().as_usm_memory_pyobj(); py::object res; try { @@ -371,16 +591,69 @@ class usm_memory : public py::object namespace tensor { + +inline std::vector +c_contiguous_strides(int nd, + const py::ssize_t *shape, + py::ssize_t element_size = 1) +{ + if (nd > 0) { + std::vector c_strides(nd, element_size); + for (int ic = nd - 1; ic > 0;) { + py::ssize_t next_v = c_strides[ic] * shape[ic]; + c_strides[--ic] = next_v; + } + return c_strides; + } + else { + return std::vector(); + } +} + +inline std::vector +f_contiguous_strides(int nd, + const py::ssize_t *shape, + py::ssize_t element_size = 1) +{ + if (nd > 0) { + std::vector f_strides(nd, element_size); + for (int i = 0; i < nd - 1;) { + py::ssize_t next_v = f_strides[i] * shape[i]; + f_strides[++i] = next_v; + } + return f_strides; + } + else { + return std::vector(); + } +} + +inline std::vector +c_contiguous_strides(const std::vector &shape, + py::ssize_t element_size = 1) +{ + return c_contiguous_strides(shape.size(), shape.data(), element_size); +} + +inline std::vector +f_contiguous_strides(const std::vector &shape, + py::ssize_t element_size = 1) +{ + return f_contiguous_strides(shape.size(), shape.data(), element_size); +} + class usm_ndarray : public py::object { public: PYBIND11_OBJECT(usm_ndarray, py::object, [](PyObject *o) -> bool { - return PyObject_TypeCheck(o, &PyUSMArrayType) != 0; + return PyObject_TypeCheck( + o, ::dpctl::detail::dpctl_capi::get().PyUSMArrayType_) != 0; }) usm_ndarray() - : 
py::object(::dpctl::detail::dpctl_api::get().default_usm_ndarray_(), - borrowed_t{}) + : py::object( + ::dpctl::detail::dpctl_capi::get().default_usm_ndarray_pyobj(), + borrowed_t{}) { if (!m_ptr) throw py::error_already_set(); @@ -388,10 +661,10 @@ class usm_ndarray : public py::object char *get_data() const { - PyObject *raw_o = this->ptr(); - PyUSMArrayObject *raw_ar = reinterpret_cast(raw_o); + PyUSMArrayObject *raw_ar = this->usm_array_ptr(); - return UsmNDArray_GetData(raw_ar); + auto &api = ::dpctl::detail::dpctl_capi::get(); + return api.UsmNDArray_GetData_(raw_ar); } template T *get_data() const @@ -401,18 +674,18 @@ class usm_ndarray : public py::object int get_ndim() const { - PyObject *raw_o = this->ptr(); - PyUSMArrayObject *raw_ar = reinterpret_cast(raw_o); + PyUSMArrayObject *raw_ar = this->usm_array_ptr(); - return UsmNDArray_GetNDim(raw_ar); + auto &api = ::dpctl::detail::dpctl_capi::get(); + return api.UsmNDArray_GetNDim_(raw_ar); } const py::ssize_t *get_shape_raw() const { - PyObject *raw_o = this->ptr(); - PyUSMArrayObject *raw_ar = reinterpret_cast(raw_o); + PyUSMArrayObject *raw_ar = this->usm_array_ptr(); - return UsmNDArray_GetShape(raw_ar); + auto &api = ::dpctl::detail::dpctl_capi::get(); + return api.UsmNDArray_GetShape_(raw_ar); } py::ssize_t get_shape(int i) const @@ -423,19 +696,19 @@ class usm_ndarray : public py::object const py::ssize_t *get_strides_raw() const { - PyObject *raw_o = this->ptr(); - PyUSMArrayObject *raw_ar = reinterpret_cast(raw_o); + PyUSMArrayObject *raw_ar = this->usm_array_ptr(); - return UsmNDArray_GetStrides(raw_ar); + auto &api = ::dpctl::detail::dpctl_capi::get(); + return api.UsmNDArray_GetStrides_(raw_ar); } py::ssize_t get_size() const { - PyObject *raw_o = this->ptr(); - PyUSMArrayObject *raw_ar = reinterpret_cast(raw_o); + PyUSMArrayObject *raw_ar = this->usm_array_ptr(); - int ndim = UsmNDArray_GetNDim(raw_ar); - const py::ssize_t *shape = UsmNDArray_GetShape(raw_ar); + auto &api = ::dpctl::detail::dpctl_capi::get(); + int ndim = api.UsmNDArray_GetNDim_(raw_ar); + const py::ssize_t *shape = api.UsmNDArray_GetShape_(raw_ar); py::ssize_t nelems = 1; for (int i = 0; i < ndim; ++i) { @@ -448,12 +721,12 @@ class usm_ndarray : public py::object std::pair get_minmax_offsets() const { - PyObject *raw_o = this->ptr(); - PyUSMArrayObject *raw_ar = reinterpret_cast(raw_o); + PyUSMArrayObject *raw_ar = this->usm_array_ptr(); - int nd = UsmNDArray_GetNDim(raw_ar); - const py::ssize_t *shape = UsmNDArray_GetShape(raw_ar); - const py::ssize_t *strides = UsmNDArray_GetStrides(raw_ar); + auto &api = ::dpctl::detail::dpctl_capi::get(); + int nd = api.UsmNDArray_GetNDim_(raw_ar); + const py::ssize_t *shape = api.UsmNDArray_GetShape_(raw_ar); + const py::ssize_t *strides = api.UsmNDArray_GetStrides_(raw_ar); py::ssize_t offset_min = 0; py::ssize_t offset_max = 0; @@ -465,7 +738,7 @@ class usm_ndarray : public py::object } } else { - offset_min = UsmNDArray_GetOffset(raw_ar); + offset_min = api.UsmNDArray_GetOffset_(raw_ar); offset_max = offset_min; for (int i = 0; i < nd; ++i) { py::ssize_t delta = strides[i] * (shape[i] - 1); @@ -482,35 +755,62 @@ class usm_ndarray : public py::object sycl::queue get_queue() const { - PyObject *raw_o = this->ptr(); - PyUSMArrayObject *raw_ar = reinterpret_cast(raw_o); + PyUSMArrayObject *raw_ar = this->usm_array_ptr(); - DPCTLSyclQueueRef QRef = UsmNDArray_GetQueueRef(raw_ar); + auto &api = ::dpctl::detail::dpctl_capi::get(); + DPCTLSyclQueueRef QRef = api.UsmNDArray_GetQueueRef_(raw_ar); return 
*(reinterpret_cast(QRef)); } int get_typenum() const { - PyObject *raw_o = this->ptr(); - PyUSMArrayObject *raw_ar = reinterpret_cast(raw_o); + PyUSMArrayObject *raw_ar = this->usm_array_ptr(); - return UsmNDArray_GetTypenum(raw_ar); + auto &api = ::dpctl::detail::dpctl_capi::get(); + return api.UsmNDArray_GetTypenum_(raw_ar); } int get_flags() const { - PyObject *raw_o = this->ptr(); - PyUSMArrayObject *raw_ar = reinterpret_cast(raw_o); + PyUSMArrayObject *raw_ar = this->usm_array_ptr(); - return UsmNDArray_GetFlags(raw_ar); + auto &api = ::dpctl::detail::dpctl_capi::get(); + return api.UsmNDArray_GetFlags_(raw_ar); } int get_elemsize() const { - PyObject *raw_o = this->ptr(); - PyUSMArrayObject *raw_ar = reinterpret_cast(raw_o); + PyUSMArrayObject *raw_ar = this->usm_array_ptr(); - return UsmNDArray_GetElementSize(raw_ar); + auto &api = ::dpctl::detail::dpctl_capi::get(); + return api.UsmNDArray_GetElementSize_(raw_ar); + } + + bool is_c_contiguous() const + { + int flags = this->get_flags(); + auto &api = ::dpctl::detail::dpctl_capi::get(); + return static_cast(flags & api.USM_ARRAY_C_CONTIGUOUS_); + } + + bool is_f_contiguous() const + { + int flags = this->get_flags(); + auto &api = ::dpctl::detail::dpctl_capi::get(); + return static_cast(flags & api.USM_ARRAY_F_CONTIGUOUS_); + } + + bool is_writable() const + { + int flags = this->get_flags(); + auto &api = ::dpctl::detail::dpctl_capi::get(); + return static_cast(flags & api.USM_ARRAY_WRITABLE_); + } + +private: + PyUSMArrayObject *usm_array_ptr() const + { + return reinterpret_cast(m_ptr); } }; diff --git a/dpctl/apis/include/dpctl_capi.h b/dpctl/apis/include/dpctl_capi.h index af4a5fd951..d6c104581a 100644 --- a/dpctl/apis/include/dpctl_capi.h +++ b/dpctl/apis/include/dpctl_capi.h @@ -47,8 +47,11 @@ * C functions can use dpctl's C-API functions without linking to * shared objects defining this symbols, if they call `import_dpctl()` * prior to using those symbols. 
+ * + * It is declared inline to allow multiple definitions in + * different translation units */ -void import_dpctl(void) +static inline void import_dpctl(void) { import_dpctl___sycl_device(); import_dpctl___sycl_context(); diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index 23a99304d8..a587d84609 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -18,12 +18,22 @@ add_custom_target(_dpctl4pybind11_deps set(python_module_name _tensor_impl) pybind11_add_module(${python_module_name} MODULE ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_py.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp ) target_link_options(${python_module_name} PRIVATE -fsycl-device-code-split=per_kernel) target_include_directories(${python_module_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../include ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/ ) add_dependencies(${python_module_name} _dpctl4pybind11_deps) install(TARGETS ${python_module_name} DESTINATION "dpctl/tensor") diff --git a/dpctl/tensor/libtensor/include/kernels/constructors.hpp b/dpctl/tensor/libtensor/include/kernels/constructors.hpp new file mode 100644 index 0000000000..4023d291af --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/constructors.hpp @@ -0,0 +1,657 @@ +//=== constructors.hpp - -----------------------------------*-C++-*--/===// +//=== Implementation of tensor constructors kernels ------===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor constructors. +//===----------------------------------------------------------------------===// + +#pragma once +#include "utils/strided_iters.hpp" +#include "utils/type_utils.hpp" +#include +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace constructors +{ + +/*! + @defgroup CtorKernels + */ + +template class linear_sequence_step_kernel; +template class linear_sequence_affine_kernel; +template class eye_kernel; + +namespace py = pybind11; + +/* =========== Unboxing Python scalar =============== */ + +/*! + * @brief Cast pybind11 class managing Python object to specified type `T`. 
+ * @defgroup CtorKernels
+ */
+template <typename T> T unbox_py_scalar(py::object o)
+{
+    return py::cast<T>(o);
+}
+
+template <> inline sycl::half unbox_py_scalar<sycl::half>(py::object o)
+{
+    float tmp = py::cast<float>(o);
+    return static_cast<sycl::half>(tmp);
+}
+
+// Constructor to populate tensor with linear sequence defined by
+// start and step data
+
+typedef sycl::event (*lin_space_step_fn_ptr_t)(
+    sycl::queue,
+    size_t, // num_elements
+    py::object start,
+    py::object step,
+    char *, // dst_data_ptr
+    const std::vector<sycl::event> &);
+
+template <typename Ty> class LinearSequenceStepFunctor
+{
+private:
+    Ty *p = nullptr;
+    Ty start_v;
+    Ty step_v;
+
+public:
+    LinearSequenceStepFunctor(char *dst_p, Ty v0, Ty dv)
+        : p(reinterpret_cast<Ty *>(dst_p)), start_v(v0), step_v(dv)
+    {
+    }
+
+    void operator()(sycl::id<1> wiid) const
+    {
+        auto i = wiid.get(0);
+        using dpctl::tensor::type_utils::is_complex;
+        if constexpr (is_complex<Ty>::value) {
+            p[i] = Ty{start_v.real() + i * step_v.real(),
+                      start_v.imag() + i * step_v.imag()};
+        }
+        else {
+            p[i] = start_v + i * step_v;
+        }
+    }
+};
+
+/*!
+ * @brief Function to submit kernel to populate given contiguous memory
+ * allocation with linear sequence specified by typed starting value and
+ * increment.
+ *
+ * @param q Sycl queue to which the kernel is submitted
+ * @param nelems Length of the sequence
+ * @param start_v Typed starting value of the sequence
+ * @param step_v Typed increment of the sequence
+ * @param array_data Kernel accessible USM pointer to the start of array to be
+ * populated.
+ * @param depends List of events to wait for before starting computations, if
+ * any.
+ *
+ * @return Event to wait on to ensure that computation completes.
+ * @defgroup CtorKernels
+ */
+template <typename Ty>
+sycl::event lin_space_step_impl(sycl::queue exec_q,
+                                size_t nelems,
+                                Ty start_v,
+                                Ty step_v,
+                                char *array_data,
+                                const std::vector<sycl::event> &depends)
+{
+    sycl::event lin_space_step_event = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+        cgh.parallel_for<linear_sequence_step_kernel<Ty>>(
+            sycl::range<1>{nelems},
+            LinearSequenceStepFunctor<Ty>(array_data, start_v, step_v));
+    });
+
+    return lin_space_step_event;
+}
+
+/*!
+ * @brief Function to submit kernel to populate given contiguous memory
+ * allocation with linear sequence specified by starting value and increment
+ * given as Python objects.
+ *
+ * @param q Sycl queue to which the kernel is submitted
+ * @param nelems Length of the sequence
+ * @param start Starting value of the sequence as Python object. Must be
+ * convertible to array element data type `Ty`.
+ * @param step Increment of the sequence as Python object. Must be convertible
+ * to array element data type `Ty`.
+ * @param array_data Kernel accessible USM pointer to the start of array to be
+ * populated.
+ * @param depends List of events to wait for before starting computations, if
+ * any.
+ *
+ * @return Event to wait on to ensure that computation completes.
+ * @defgroup CtorKernels
+ */
+template <typename Ty>
+sycl::event lin_space_step_impl(sycl::queue exec_q,
+                                size_t nelems,
+                                py::object start,
+                                py::object step,
+                                char *array_data,
+                                const std::vector<sycl::event> &depends)
+{
+    Ty start_v;
+    Ty step_v;
+    try {
+        start_v = unbox_py_scalar<Ty>(start);
+        step_v = unbox_py_scalar<Ty>(step);
+    } catch (const py::error_already_set &e) {
+        throw;
+    }
+
+    auto lin_space_step_event = lin_space_step_impl<Ty>(
+        exec_q, nelems, start_v, step_v, array_data, depends);
+
+    return lin_space_step_event;
+}
+
+/*!
+ * @brief Factory to get function pointer of type `fnT` for array with elements
+ * of type `Ty`.
+ * @defgroup CtorKernels
+ */
+template <typename fnT, typename Ty> struct LinSpaceStepFactory
+{
+    fnT get()
+    {
+        fnT f = lin_space_step_impl<Ty>;
+        return f;
+    }
+};
+
+// Constructor to populate tensor with linear sequence defined by
+// start and end data
+
+typedef sycl::event (*lin_space_affine_fn_ptr_t)(
+    sycl::queue,
+    size_t, // num_elements
+    py::object start,
+    py::object end,
+    bool include_endpoint,
+    char *, // dst_data_ptr
+    const std::vector<sycl::event> &);
+
+template <typename Ty, typename wTy> class LinearSequenceAffineFunctor
+{
+private:
+    Ty *p = nullptr;
+    Ty start_v;
+    Ty end_v;
+    size_t n;
+
+public:
+    LinearSequenceAffineFunctor(char *dst_p, Ty v0, Ty v1, size_t den)
+        : p(reinterpret_cast<Ty *>(dst_p)), start_v(v0), end_v(v1),
+          n((den == 0) ? 1 : den)
+    {
+    }
+
+    void operator()(sycl::id<1> wiid) const
+    {
+        auto i = wiid.get(0);
+        wTy wc = wTy(i) / n;
+        wTy w = wTy(n - i) / n;
+        using dpctl::tensor::type_utils::is_complex;
+        if constexpr (is_complex<Ty>::value) {
+            auto _w = static_cast<typename Ty::value_type>(w);
+            auto _wc = static_cast<typename Ty::value_type>(wc);
+            auto re_comb = start_v.real() * _w + end_v.real() * _wc;
+            auto im_comb = start_v.imag() * _w + end_v.imag() * _wc;
+            Ty affine_comb = Ty{re_comb, im_comb};
+            p[i] = affine_comb;
+        }
+        else {
+            using dpctl::tensor::type_utils::convert_impl;
+            auto affine_comb = start_v * w + end_v * wc;
+            p[i] = convert_impl<Ty, decltype(affine_comb)>(affine_comb);
+        }
+    }
+};
+
+/*!
+ * @brief Function to submit kernel to populate given contiguous memory
+ * allocation with linear sequence specified by typed starting and end values.
+ *
+ * @param exec_q Sycl queue to which kernel is submitted for execution.
+ * @param nelems Length of the sequence.
+ * @param start_v Starting value of the sequence.
+ * @param end_v End-value of the sequence.
+ * @param include_endpoint Whether the end-value is included in the sequence.
+ * @param array_data Kernel accessible USM pointer to the start of array to be
+ * populated.
+ * @param depends List of events to wait for before starting computations, if
+ * any.
+ *
+ * @return Event to wait on to ensure that computation completes.
+ * @defgroup CtorKernels
+ */
+template <typename Ty>
+sycl::event lin_space_affine_impl(sycl::queue exec_q,
+                                  size_t nelems,
+                                  Ty start_v,
+                                  Ty end_v,
+                                  bool include_endpoint,
+                                  char *array_data,
+                                  const std::vector<sycl::event> &depends)
+{
+    bool device_supports_doubles = exec_q.get_device().has(sycl::aspect::fp64);
+    sycl::event lin_space_affine_event = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+        if (device_supports_doubles) {
+            cgh.parallel_for<linear_sequence_affine_kernel<Ty, double>>(
+                sycl::range<1>{nelems},
+                LinearSequenceAffineFunctor<Ty, double>(
+                    array_data, start_v, end_v,
+                    (include_endpoint) ? nelems - 1 : nelems));
+        }
+        else {
+            cgh.parallel_for<linear_sequence_affine_kernel<Ty, float>>(
+                sycl::range<1>{nelems},
+                LinearSequenceAffineFunctor<Ty, float>(
+                    array_data, start_v, end_v,
+                    (include_endpoint) ? nelems - 1 : nelems));
+        }
+    });
+
+    return lin_space_affine_event;
+}
+
+/*!
+ * @brief Function to submit kernel to populate given contiguous memory
+ * allocation with linear sequence specified by starting and end values given
+ * as Python objects.
+ *
+ * @param exec_q Sycl queue to which kernel is submitted for execution.
+ * @param nelems Length of the sequence
+ * @param start Starting value of the sequence as Python object. Must be
+ * convertible to array data element type `Ty`.
+ * @param end End-value of the sequence as Python object. Must be convertible
+ * to array data element type `Ty`.
+ * @param include_endpoint Whether the end-value is included in the sequence
+ * @param array_data Kernel accessible USM pointer to the start of array to be
+ * populated.
+ * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event lin_space_affine_impl(sycl::queue exec_q, + size_t nelems, + py::object start, + py::object end, + bool include_endpoint, + char *array_data, + const std::vector &depends) +{ + Ty start_v, end_v; + try { + start_v = unbox_py_scalar(start); + end_v = unbox_py_scalar(end); + } catch (const py::error_already_set &e) { + throw; + } + + auto lin_space_affine_event = lin_space_affine_impl( + exec_q, nelems, start_v, end_v, include_endpoint, array_data, depends); + + return lin_space_affine_event; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for array data type + * `Ty`. + */ +template struct LinSpaceAffineFactory +{ + fnT get() + { + fnT f = lin_space_affine_impl; + return f; + } +}; + +/* ================ Full ================== */ + +typedef sycl::event (*full_contig_fn_ptr_t)(sycl::queue, + size_t, + py::object, + char *, + const std::vector &); + +/*! + * @brief Function to submit kernel to fill given contiguous memory allocation + * with specified value. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence + * @param fill_v Value to fill the array with + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event full_contig_impl(sycl::queue q, + size_t nelems, + dstTy fill_v, + char *dst_p, + const std::vector &depends) +{ + sycl::event fill_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + dstTy *p = reinterpret_cast(dst_p); + cgh.fill(p, fill_v, nelems); + }); + + return fill_ev; +} + +/*! + * @brief Function to submit kernel to fill given contiguous memory allocation + * with specified value. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence + * @param py_value Python object representing the value to fill the array with. + * Must be convertible to `dstTy`. + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @defgroup CtorKernels + */ +template +sycl::event full_contig_impl(sycl::queue exec_q, + size_t nelems, + py::object py_value, + char *dst_p, + const std::vector &depends) +{ + dstTy fill_v; + try { + fill_v = unbox_py_scalar(py_value); + } catch (const py::error_already_set &e) { + throw; + } + + sycl::event fill_ev = + full_contig_impl(exec_q, nelems, fill_v, dst_p, depends); + + return fill_ev; +} + +template struct FullContigFactory +{ + fnT get() + { + fnT f = full_contig_impl; + return f; + } +}; + +/* ================ Eye ================== */ + +typedef sycl::event (*eye_fn_ptr_t)(sycl::queue, + size_t nelems, // num_elements + py::ssize_t start, + py::ssize_t end, + py::ssize_t step, + char *, // dst_data_ptr + const std::vector &); + +template class EyeFunctor +{ +private: + Ty *p = nullptr; + py::ssize_t start_v; + py::ssize_t end_v; + py::ssize_t step_v; + +public: + EyeFunctor(char *dst_p, + const py::ssize_t v0, + const py::ssize_t v1, + const py::ssize_t dv) + : p(reinterpret_cast(dst_p)), start_v(v0), end_v(v1), step_v(dv) + { + } + + void operator()(sycl::id<1> wiid) const + { + Ty set_v = 0; + py::ssize_t i = static_cast(wiid.get(0)); + if (i >= start_v and i <= end_v) { + if ((i - start_v) % step_v == 0) { + set_v = 1; + } + } + p[i] = set_v; + } +}; + +/*! + * @brief Function to populate 2D array with eye matrix. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Number of elements to assign. + * @param start Position of the first non-zero value. + * @param end Position of the last non-zero value. + * @param step Number of array elements between non-zeros. + * @param array_data Kernel accessible USM pointer for the destination array. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event eye_impl(sycl::queue exec_q, + size_t nelems, + const py::ssize_t start, + const py::ssize_t end, + const py::ssize_t step, + char *array_data, + const std::vector &depends) +{ + sycl::event eye_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.parallel_for>( + sycl::range<1>{nelems}, + EyeFunctor(array_data, start, end, step)); + }); + + return eye_event; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for data type `Ty`. + * @ingroup CtorKernels + */ +template struct EyeFactory +{ + fnT get() + { + fnT f = eye_impl; + return f; + } +}; + +/* =========================== Tril and triu ============================== */ + +// define function type +typedef sycl::event (*tri_fn_ptr_t)(sycl::queue, + py::ssize_t, // inner_range //py::ssize_t + py::ssize_t, // outer_range + char *, // src_data_ptr + char *, // dst_data_ptr + py::ssize_t, // nd + py::ssize_t *, // shape_and_strides + py::ssize_t, // k + const std::vector &, + const std::vector &); + +/*! + * @brief Function to copy triangular matrices from source stack to destination + * stack. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param inner_range Number of elements in each matrix. + * @param outer_range Number of matrices to copy. + * @param src_p Kernel accessible USM pointer for the source array. + * @param dst_p Kernel accessible USM pointer for the destination array. + * @param nd The array dimensionality of source and destination arrays. + * @param shape_and_strides Kernel accessible USM pointer to packed shape and + * strides of arrays. 
+ * @param k Position of the diagonal above/below which to copy filling the rest + * with zero elements. + * @param depends List of events to wait for before starting computations, if + * any. + * @param additional_depends List of additional events to wait for before + * starting computations, if any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template class tri_kernel; +template +sycl::event tri_impl(sycl::queue exec_q, + py::ssize_t inner_range, + py::ssize_t outer_range, + char *src_p, + char *dst_p, + py::ssize_t nd, + py::ssize_t *shape_and_strides, + py::ssize_t k, + const std::vector &depends, + const std::vector &additional_depends) +{ + constexpr int d2 = 2; + py::ssize_t src_s = nd; + py::ssize_t dst_s = 2 * nd; + py::ssize_t nd_1 = nd - 1; + py::ssize_t nd_2 = nd - 2; + Ty *src = reinterpret_cast(src_p); + Ty *dst = reinterpret_cast(dst_p); + + sycl::event tri_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + cgh.parallel_for>( + sycl::range<1>(inner_range * outer_range), [=](sycl::id<1> idx) { + py::ssize_t outer_gid = idx[0] / inner_range; + py::ssize_t inner_gid = idx[0] - inner_range * outer_gid; + + py::ssize_t src_inner_offset, dst_inner_offset; + bool to_copy; + + { + // py::ssize_t inner_gid = idx.get_id(0); + CIndexer_array indexer_i( + {shape_and_strides[nd_2], shape_and_strides[nd_1]}); + indexer_i.set(inner_gid); + const std::array &inner = indexer_i.get(); + src_inner_offset = + inner[0] * shape_and_strides[src_s + nd_2] + + inner[1] * shape_and_strides[src_s + nd_1]; + dst_inner_offset = + inner[0] * shape_and_strides[dst_s + nd_2] + + inner[1] * shape_and_strides[dst_s + nd_1]; + + if constexpr (upper) + to_copy = (inner[0] + k >= inner[1]); + else + to_copy = (inner[0] + k <= inner[1]); + } + + py::ssize_t src_offset = 0; + py::ssize_t dst_offset = 0; + { + // py::ssize_t outer_gid = idx.get_id(1); + CIndexer_vector outer(nd - d2); + outer.get_displacement( + outer_gid, shape_and_strides, shape_and_strides + src_s, + shape_and_strides + dst_s, src_offset, dst_offset); + } + + src_offset += src_inner_offset; + dst_offset += dst_inner_offset; + + dst[dst_offset] = (to_copy) ? src[src_offset] : Ty(0); + }); + }); + return tri_ev; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for data type `Ty`. + * @ingroup CtorKernels + */ +template struct TrilGenericFactory +{ + fnT get() + { + fnT f = tri_impl; + return f; + } +}; + +/*! + * @brief Factory to get function pointer of type `fnT` for data type `Ty`. + * @ingroup CtorKernels + */ +template struct TriuGenericFactory +{ + fnT get() + { + fnT f = tri_impl; + return f; + } +}; + +} // namespace constructors +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp new file mode 100644 index 0000000000..0d5a1d21ca --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp @@ -0,0 +1,669 @@ +//=== copy_and_cast.hpp - Implementation of copy-and-cast kernels *-C++-*/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor copying and value casting. +//===----------------------------------------------------------------------===// + +#pragma once +#include "utils/strided_iters.hpp" +#include "utils/type_utils.hpp" +#include +#include +#include +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace copy_and_cast +{ + +namespace py = pybind11; + +template class copy_cast_generic_kernel; +template class copy_cast_from_host_kernel; +template class copy_cast_spec_kernel; +template class copy_for_reshape_generic_kernel; + +template class Caster +{ +public: + Caster() = default; + void operator()(char *src, + std::ptrdiff_t src_offset, + char *dst, + std::ptrdiff_t dst_offset) const + { + using dpctl::tensor::type_utils::convert_impl; + + srcT *src_ = reinterpret_cast(src) + src_offset; + dstT *dst_ = reinterpret_cast(dst) + dst_offset; + *dst_ = convert_impl(*src_); + } +}; + +template class GenericCopyFunctor +{ +private: + char *src_ = nullptr; + char *dst_ = nullptr; + py::ssize_t *shape_strides_ = nullptr; + int nd_ = 0; + py::ssize_t src_offset0 = 0; + py::ssize_t dst_offset0 = 0; + +public: + GenericCopyFunctor(char *src_cp, + char *dst_cp, + py::ssize_t *shape_strides, + int nd, + py::ssize_t src_offset, + py::ssize_t dst_offset) + : src_(src_cp), dst_(dst_cp), shape_strides_(shape_strides), nd_(nd), + src_offset0(src_offset), dst_offset0(dst_offset) + { + } + + void operator()(sycl::id<1> wiid) const + { + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + CIndexer_vector indxr(nd_); + indxr.get_displacement( + static_cast(wiid.get(0)), + const_cast(shape_strides_), // common shape + const_cast(shape_strides_ + + nd_), // src strides + const_cast(shape_strides_ + + 2 * nd_), // dst strides + src_offset, // modified by reference + dst_offset // modified by reference + ); + CastFnT fn{}; + fn(src_, src_offset0 + src_offset, dst_, dst_offset0 + dst_offset); + } +}; + +template class NDSpecializedCopyFunctor +{ +private: + char *src_ = nullptr; + char *dst_ = nullptr; + CIndexer_array indxr; + const std::array src_strides_; + const std::array dst_strides_; + static const int nd_ = nd; + py::ssize_t src_offset0 = 0; + py::ssize_t dst_offset0 = 0; + +public: + NDSpecializedCopyFunctor(char *src_cp, // USM pointer + char *dst_cp, // USM pointer + const std::array shape, + const std::array src_strides, + const std::array dst_strides, + py::ssize_t src_offset, + py::ssize_t dst_offset) + : src_(src_cp), dst_(dst_cp), indxr(shape), src_strides_(src_strides), + dst_strides_(dst_strides), src_offset0(src_offset), + dst_offset0(dst_offset) + { + } + + void operator()(sycl::id<1> wiid) const + { + py::ssize_t src_offset = 0; + py::ssize_t dst_offset = 0; + CIndexer_array local_indxr(std::move(indxr)); + + local_indxr.set(wiid.get(0)); + auto mi = local_indxr.get(); + for (int i = 0; i < nd; ++i) + src_offset += mi[i] * src_strides_[i]; + for (int i = 0; i < nd; ++i) + dst_offset += mi[i] * dst_strides_[i]; + + 
CastFnT fn{};
+        fn(src_, src_offset0 + src_offset, dst_, dst_offset0 + dst_offset);
+    }
+};
+
+/*!
+  @defgroup CopyAndCastKernels
+ */
+
+/*!
+ * @brief Function pointer type for generic array cast and copying function.
+ */
+typedef sycl::event (*copy_and_cast_generic_fn_ptr_t)(
+    sycl::queue,
+    size_t,
+    int,
+    py::ssize_t *,
+    char *,
+    py::ssize_t,
+    char *,
+    py::ssize_t,
+    const std::vector<sycl::event> &,
+    const std::vector<sycl::event> &);
+
+/*!
+ * @brief Generic function to copy `nelems` elements from `src` usm_ndarray to
+ `dst` usm_ndarray while casting from `srcTy` to `dstTy`.
+
+  Both arrays have array dimensionality specified via argument `nd`. The
+  `shape_and_strides` is kernel accessible USM array of length `3*nd`, where the
+  first `nd` elements encode common shape, second `nd` elements contain strides
+  of `src` array, and the trailing `nd` elements contain strides of `dst` array.
+  `src_p` and `dst_p` represent pointers into respective arrays, but the start of
+  iteration begins at offset of `src_offset` elements for `src` array and at
+  offset `dst_offset` elements for `dst` array. Kernel is submitted to sycl queue
+  `q` with events `depends` and `additional_depends` as dependencies.
+
+  @param q Sycl queue to which the kernel is submitted.
+  @param nelems Number of elements to cast and copy.
+  @param nd Array dimensionality, i.e. number of indices needed to
+  identify an element of each array.
+  @param shape_and_strides Kernel accessible USM pointer to packed shape and
+  strides.
+  @param src_p Kernel accessible USM pointer for the source array
+  @param src_offset Offset to the beginning of iteration in number of
+  elements of source array from `src_p`.
+  @param dst_p Kernel accessible USM pointer for the destination array
+  @param dst_offset Offset to the beginning of iteration in number of
+  elements of destination array from `dst_p`.
+  @param depends List of events to wait for before starting computations, if
+  any.
+  @param additional_depends Additional list of events to wait for before
+  starting computations, if any.
+
+  @return Event to wait on to ensure that computation completes.
+  @ingroup CopyAndCastKernels
+ */
+template <typename dstTy, typename srcTy>
+sycl::event
+copy_and_cast_generic_impl(sycl::queue q,
+                           size_t nelems,
+                           int nd,
+                           py::ssize_t *shape_and_strides,
+                           char *src_p,
+                           py::ssize_t src_offset,
+                           char *dst_p,
+                           py::ssize_t dst_offset,
+                           const std::vector<sycl::event> &depends,
+                           const std::vector<sycl::event> &additional_depends)
+{
+    sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+        cgh.depends_on(additional_depends);
+        cgh.parallel_for<copy_cast_generic_kernel<srcTy, dstTy>>(
+            sycl::range<1>(nelems),
+            GenericCopyFunctor<Caster<srcTy, dstTy>>(
+                src_p, dst_p, shape_and_strides, nd, src_offset, dst_offset));
+    });
+
+    return copy_and_cast_ev;
+}
+
+/*!
+ * @brief Factory to get generic function pointer of type `fnT` for given source
+ * data type `S` and destination data type `D`.
+ * @ingroup CopyAndCastKernels
+ */
+template <typename fnT, typename D, typename S> struct CopyAndCastGenericFactory
+{
+    fnT get()
+    {
+        fnT f = copy_and_cast_generic_impl<D, S>;
+        return f;
+    }
+};
+
+// Specialization of copy_and_cast for 1D arrays
+
+/*!
+ * @brief Function pointer type for casting and copying 1D arrays.
+ * @ingroup CopyAndCastKernels
+ */
+typedef sycl::event (*copy_and_cast_1d_fn_ptr_t)(
+    sycl::queue,
+    size_t,
+    const std::array<py::ssize_t, 1>,
+    const std::array<py::ssize_t, 1>,
+    const std::array<py::ssize_t, 1>,
+    char *,
+    py::ssize_t,
+    char *,
+    py::ssize_t,
+    const std::vector<sycl::event> &);
+
+/*!
+ * @brief Function pointer type for casting and copying 2D arrays.
+ * @ingroup CopyAndCastKernels + */ +typedef sycl::event (*copy_and_cast_2d_fn_ptr_t)( + sycl::queue, + size_t, + const std::array, + const std::array, + const std::array, + char *, + py::ssize_t, + char *, + py::ssize_t, + const std::vector &); + +/*! + * @brief Specialized for given array dimension function to copy `nelems` + elements from `src` usm_ndarray to `dst` usm_ndarray while casting from `srcTy` + to `dstTy`. + + Both arrays have array dimensionality known at compile time and specified in + template parameters `nd`. Arrays' shape and strides are provided as + `std::array`. `src_p` and `dst_p` represent pointers into respective arrays, + but the start of iteration begins at offset of `src_offset` elements for `src` + array and at offset `dst_offset` elements for `dst` array. Kernel is submitted + to sycl queue `q` with events `depends` as dependencies. + + @param q The queue where the routine should be executed. + @param nelems Number of elements to cast and copy. + @param shape Common shape of the arrays. + @param src_strides Strides of the source array. + @param dst_strides Strides of the destination array. + @param src_p Kernel accessible USM pointer for the source array + @param src_offset Offset to the beginning of iteration in number of elements + of the source array from `src_p`. + @param dst_p Kernel accessible USM pointer for the destination array + @param dst_offset Offset to the beginning of iteration in number of elements + of the destination array from `src_p`. + @param depends List of events to wait for before starting computations, if + any. + + @return Event to wait on to ensure that computation completes. + * @ingroup CopyAndCastKernels + */ +template +sycl::event +copy_and_cast_nd_specialized_impl(sycl::queue q, + size_t nelems, + const std::array shape, + const std::array src_strides, + const std::array dst_strides, + char *src_p, + py::ssize_t src_offset, + char *dst_p, + py::ssize_t dst_offset, + const std::vector &depends) +{ + sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.parallel_for>( + sycl::range<1>(nelems), + NDSpecializedCopyFunctor>( + src_p, dst_p, shape, src_strides, dst_strides, src_offset, + dst_offset)); + }); + + return copy_and_cast_ev; +} + +/*! + * @brief Factory to get 1D-specialized function pointer of type `fnT` for given + * source data type `S` and destination data type `D`. + * @ingroup CopyAndCastKernels + */ +template struct CopyAndCast1DFactory +{ + fnT get() + { + fnT f = copy_and_cast_nd_specialized_impl; + return f; + } +}; + +/*! + * @brief Factory to get 2D-specialized function pointer of type `fnT` for given + * source data type `S` and destination data type `D`. 
+ * @ingroup CopyAndCastKernels + */ +template struct CopyAndCast2DFactory +{ + fnT get() + { + fnT f = copy_and_cast_nd_specialized_impl; + return f; + } +}; + +// ====================== Copying from host to USM + +template +class CasterForAccessor +{ +public: + CasterForAccessor() = default; + void operator()(AccessorT src, + std::ptrdiff_t src_offset, + char *dst, + std::ptrdiff_t dst_offset) const + { + using dpctl::tensor::type_utils::convert_impl; + + dstT *dst_ = reinterpret_cast(dst) + dst_offset; + *dst_ = convert_impl(src[src_offset]); + } +}; + +template class GenericCopyFromHostFunctor +{ +private: + AccessorT src_acc_; + char *dst_ = nullptr; + py::ssize_t *shape_strides_ = nullptr; + int nd_ = 0; + py::ssize_t src_offset0 = 0; + py::ssize_t dst_offset0 = 0; + +public: + GenericCopyFromHostFunctor(AccessorT src_acc, + char *dst_cp, + py::ssize_t *shape_strides, + int nd, + py::ssize_t src_offset, + py::ssize_t dst_offset) + : src_acc_(src_acc), dst_(dst_cp), shape_strides_(shape_strides), + nd_(nd), src_offset0(src_offset), dst_offset0(dst_offset) + { + } + + void operator()(sycl::id<1> wiid) const + { + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + CIndexer_vector indxr(nd_); + indxr.get_displacement( + static_cast(wiid.get(0)), + const_cast(shape_strides_), // common shape + const_cast(shape_strides_ + + nd_), // src strides + const_cast(shape_strides_ + + 2 * nd_), // dst strides + src_offset, // modified by reference + dst_offset // modified by reference + ); + CastFnT fn{}; + fn(src_acc_, src_offset0 + src_offset, dst_, dst_offset0 + dst_offset); + } +}; + +typedef void (*copy_and_cast_from_host_blocking_fn_ptr_t)( + sycl::queue, + size_t, + int, + py::ssize_t *, + const char *, + py::ssize_t, + py::ssize_t, + py::ssize_t, + char *, + py::ssize_t, + const std::vector &, + const std::vector &); + +/*! + * @brief Function to copy from NumPy's ndarray with elements of type `srcTy` + * into usm_ndarray with elements of type `srcTy`. + * + * Function to cast and copy elements from numpy.ndarray specified by typeless + * `host_src_p` and the `src_offset` given in the number of array elements. + * Arrays' metadata are given in packed USM vector of length `3*nd` whose first + * `nd` elements contain arrays' shape, next `nd` elements specify source + * strides in elements (not bytes), and trailing `nd` elements specify + * destination array strides. Kernel dependencies are given by two vectors of + * events: `depends` and `additional_depends`. The function execution is + * complete at the return. + * + * @param q The queue where the routine should be executed. + * @param nelems Number of elements to cast and copy. + * @param nd The dimensionality of arrays + * @param shape_and_strides Kernel accessible USM pointer to packed shape and + * strides. + * @param host_src_p Host (not USM allocated) pointer associated with the + * source array. + * @param src_offset Offset to the beginning of iteration in number of elements + * of the source array from `host_src_p`. + * @param src_min_nelem_offset Smallest value of offset relative to + * `host_src_p` in number of elements attained while iterating over elements of + * the source array. + * @param src_max_nelem_offset Largest value of offset relative to `host_src_p` + * in number of elements attained while iterating over elements of the source + * array. + * @param dst_p USM pointer associated with the destination array. 
+ * @param dst_offset Offset to the beginning of iteration in number of elements + * of the destination array from `dst_p`. + * @param depends List of events to wait for before starting computations, if + * any. + * @param additional_depends List of additional events to wait for before + * starting computations, if any. + * + * @ingroup CopyAndCastKernels + */ +template +void copy_and_cast_from_host_impl( + sycl::queue q, + size_t nelems, + int nd, + py::ssize_t *shape_and_strides, + const char *host_src_p, + py::ssize_t src_offset, + py::ssize_t src_min_nelem_offset, + py::ssize_t src_max_nelem_offset, + char *dst_p, + py::ssize_t dst_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + py::ssize_t nelems_range = src_max_nelem_offset - src_min_nelem_offset + 1; + sycl::buffer npy_buf( + reinterpret_cast(host_src_p) + src_min_nelem_offset, + sycl::range<1>(nelems_range), {sycl::property::buffer::use_host_ptr{}}); + + sycl::event copy_and_cast_from_host_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + sycl::accessor npy_acc(npy_buf, cgh, sycl::read_only); + + cgh.parallel_for>( + sycl::range<1>(nelems), + GenericCopyFromHostFunctor< + CasterForAccessor, + decltype(npy_acc)>(npy_acc, dst_p, shape_and_strides, nd, + src_offset - src_min_nelem_offset, + dst_offset)); + }); + + // perform explicit synchronization. Implicit synchronization would be + // performed by sycl::buffer destructor. + copy_and_cast_from_host_ev.wait_and_throw(); + + return; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for given NumPy array + * source data type `S` and destination data type `D`. + * @defgroup CopyAndCastKernels + */ +template +struct CopyAndCastFromHostFactory +{ + fnT get() + { + fnT f = copy_and_cast_from_host_impl; + return f; + } +}; + +// =============== Copying for reshape ================== // + +template class GenericCopyForReshapeFunctor +{ +private: + py::ssize_t offset = 0; + py::ssize_t size = 1; + int src_nd = -1; + int dst_nd = -1; + // USM array of size 2*(src_nd + dst_nd) + // [ src_shape; src_strides; dst_shape; dst_strides ] + const py::ssize_t *src_dst_shapes_and_strides = nullptr; + Ty *src_p = nullptr; + Ty *dst_p = nullptr; + +public: + GenericCopyForReshapeFunctor(py::ssize_t shift, + py::ssize_t nelems, + int src_ndim, + int dst_ndim, + const py::ssize_t *packed_shapes_and_strides, + char *src_ptr, + char *dst_ptr) + : offset(shift), size(nelems), src_nd(src_ndim), dst_nd(dst_ndim), + src_dst_shapes_and_strides(packed_shapes_and_strides), + src_p(reinterpret_cast(src_ptr)), + dst_p(reinterpret_cast(dst_ptr)) + { + } + + void operator()(sycl::id<1> wiid) const + { + py::ssize_t this_src_offset(0); + CIndexer_vector src_indxr(src_nd); + + src_indxr.get_displacement( + static_cast(wiid.get(0)), + const_cast( + src_dst_shapes_and_strides), // src shape + const_cast(src_dst_shapes_and_strides + + src_nd), // src strides + this_src_offset // modified by reference + ); + const Ty *in = src_p + this_src_offset; + + py::ssize_t this_dst_offset(0); + CIndexer_vector dst_indxr(dst_nd); + py::ssize_t shifted_wiid = + (static_cast(wiid.get(0)) + offset) % size; + shifted_wiid = (shifted_wiid >= 0) ? 
shifted_wiid : shifted_wiid + size; + dst_indxr.get_displacement( + shifted_wiid, + const_cast(src_dst_shapes_and_strides + + 2 * src_nd), // dst shape + const_cast(src_dst_shapes_and_strides + + 2 * src_nd + dst_nd), // dst strides + this_dst_offset // modified by reference + ); + + Ty *out = dst_p + this_dst_offset; + *out = *in; + } +}; + +// define function type +typedef sycl::event (*copy_for_reshape_fn_ptr_t)( + sycl::queue, + py::ssize_t, // shift + size_t, // num_elements + int, + int, // src_nd, dst_nd + py::ssize_t *, // packed shapes and strides + char *, // src_data_ptr + char *, // dst_data_ptr + const std::vector &); + +/*! + * @brief Function to copy content of array while reshaping. + * + * Submits a kernel to perform a copy `dst[unravel_index((i + shift) % nelems , + * dst.shape)] = src[unravel_undex(i, src.shape)]`. + * + * @param q The execution queue where kernel is submitted. + * @param shift The shift in flat indexing. + * @param nelems The number of elements to copy + * @param src_nd Array dimension of the source array + * @param dst_nd Array dimension of the destination array + * @param packed_shapes_and_strides Kernel accessible USM array of size + * `2*src_nd + 2*dst_nd` with contant `[src_shape, src_strides, dst_shape, + * dst_strides]`. + * @param src_p Typeless USM pointer to the buffer of the source array + * @param dst_p Typeless USM pointer to the buffer of the destination array + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @ingroup CopyAndCastKernels + */ +template +sycl::event +copy_for_reshape_generic_impl(sycl::queue q, + py::ssize_t shift, + size_t nelems, + int src_nd, + int dst_nd, + py::ssize_t *packed_shapes_and_strides, + char *src_p, + char *dst_p, + const std::vector &depends) +{ + sycl::event copy_for_reshape_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.parallel_for>( + sycl::range<1>(nelems), + GenericCopyForReshapeFunctor(shift, nelems, src_nd, dst_nd, + packed_shapes_and_strides, src_p, + dst_p)); + }); + + return copy_for_reshape_ev; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for given array data + * type `Ty`. 
+ * @ingroup CopyAndCastKernels + */ +template struct CopyForReshapeGenericFactory +{ + fnT get() + { + fnT f = copy_for_reshape_generic_impl; + return f; + } +}; + +} // namespace copy_and_cast +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/utils/strided_iters.hpp b/dpctl/tensor/libtensor/include/utils/strided_iters.hpp index 9725e5fb6b..c24ed54941 100644 --- a/dpctl/tensor/libtensor/include/utils/strided_iters.hpp +++ b/dpctl/tensor/libtensor/include/utils/strided_iters.hpp @@ -499,16 +499,16 @@ int simplify_iteration_two_strides(const int nd, return nd_; } -using vecT = std::vector; -std::tuple contract_iter(vecT shape, vecT strides) +template > +std::tuple contract_iter(vecT shape, vecT strides) { const size_t dim = shape.size(); if (dim != strides.size()) { - throw py::value_error("Shape and strides must be of equal size."); + throw Error("Shape and strides must be of equal size."); } vecT out_shape = shape; vecT out_strides = strides; - py::ssize_t disp(0); + T disp(0); int nd = simplify_iteration_stride(dim, out_shape.data(), out_strides.data(), disp); @@ -517,18 +517,19 @@ std::tuple contract_iter(vecT shape, vecT strides) return std::make_tuple(out_shape, out_strides, disp); } -std::tuple +template > +std::tuple contract_iter2(vecT shape, vecT strides1, vecT strides2) { const size_t dim = shape.size(); if (dim != strides1.size() || dim != strides2.size()) { - throw py::value_error("Shape and strides must be of equal size."); + throw Error("Shape and strides must be of equal size."); } vecT out_shape = shape; vecT out_strides1 = strides1; vecT out_strides2 = strides2; - py::ssize_t disp1(0); - py::ssize_t disp2(0); + T disp1(0); + T disp2(0); int nd = simplify_iteration_two_strides(dim, out_shape.data(), out_strides1.data(), diff --git a/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp b/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp index a617daf975..feea1308cb 100644 --- a/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp +++ b/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp @@ -159,83 +159,64 @@ class DispatchVectorBuilder } }; -// Lookup a type according to its size, and return a value corresponding to the -// NumPy typenum. -template constexpr int platform_typeid_lookup() -{ - return -1; -} - -template -constexpr int platform_typeid_lookup(int I, Ints... Is) -{ - return sizeof(Concrete) == sizeof(T) - ? 
I - : platform_typeid_lookup(Is...); -} - struct usm_ndarray_types { - static usm_ndarray_types &get() - { - static usm_ndarray_types singleton = populate_fields(); - return singleton; - } int typenum_to_lookup_id(int typenum) { using typenum_t = dpctl::tensor::detail::typenum_t; + auto &api = ::dpctl::detail::dpctl_capi::get(); - if (typenum == UAR_DOUBLE_) { + if (typenum == api.UAR_DOUBLE_) { return static_cast(typenum_t::DOUBLE); } - else if (typenum == UAR_INT64_) { + else if (typenum == api.UAR_INT64_) { return static_cast(typenum_t::INT64); } - else if (typenum == UAR_INT32_) { + else if (typenum == api.UAR_INT32_) { return static_cast(typenum_t::INT32); } - else if (typenum == UAR_BOOL_) { + else if (typenum == api.UAR_BOOL_) { return static_cast(typenum_t::BOOL); } - else if (typenum == UAR_CDOUBLE_) { + else if (typenum == api.UAR_CDOUBLE_) { return static_cast(typenum_t::CDOUBLE); } - else if (typenum == UAR_FLOAT_) { + else if (typenum == api.UAR_FLOAT_) { return static_cast(typenum_t::FLOAT); } - else if (typenum == UAR_INT16_) { + else if (typenum == api.UAR_INT16_) { return static_cast(typenum_t::INT16); } - else if (typenum == UAR_INT8_) { + else if (typenum == api.UAR_INT8_) { return static_cast(typenum_t::INT8); } - else if (typenum == UAR_UINT64_) { + else if (typenum == api.UAR_UINT64_) { return static_cast(typenum_t::UINT64); } - else if (typenum == UAR_UINT32_) { + else if (typenum == api.UAR_UINT32_) { return static_cast(typenum_t::UINT32); } - else if (typenum == UAR_UINT16_) { + else if (typenum == api.UAR_UINT16_) { return static_cast(typenum_t::UINT16); } - else if (typenum == UAR_UINT8_) { + else if (typenum == api.UAR_UINT8_) { return static_cast(typenum_t::UINT8); } - else if (typenum == UAR_CFLOAT_) { + else if (typenum == api.UAR_CFLOAT_) { return static_cast(typenum_t::CFLOAT); } - else if (typenum == UAR_HALF_) { + else if (typenum == api.UAR_HALF_) { return static_cast(typenum_t::HALF); } - else if (typenum == UAR_INT || typenum == UAR_UINT) { + else if (typenum == api.UAR_INT_ || typenum == api.UAR_UINT_) { switch (sizeof(int)) { case sizeof(std::int32_t): - return ((typenum == UAR_INT) + return ((typenum == api.UAR_INT_) ? static_cast(typenum_t::INT32) : static_cast(typenum_t::UINT32)); case sizeof(std::int64_t): - return ((typenum == UAR_INT) + return ((typenum == api.UAR_INT_) ? 
static_cast(typenum_t::INT64) : static_cast(typenum_t::UINT64)); default: @@ -251,58 +232,6 @@ struct usm_ndarray_types } private: - int UAR_BOOL_ = -1; - // Platform-dependent normalization - int UAR_INT8_ = -1; - int UAR_UINT8_ = -1; - int UAR_INT16_ = -1; - int UAR_UINT16_ = -1; - int UAR_INT32_ = -1; - int UAR_UINT32_ = -1; - int UAR_INT64_ = -1; - int UAR_UINT64_ = -1; - int UAR_HALF_ = -1; - int UAR_FLOAT_ = -1; - int UAR_DOUBLE_ = -1; - int UAR_CFLOAT_ = -1; - int UAR_CDOUBLE_ = -1; - int UAR_TYPE_SENTINEL_ = -1; - - void init_constants() - { - UAR_BOOL_ = UAR_BOOL; - UAR_INT8_ = UAR_BYTE; - UAR_UINT8_ = UAR_UBYTE; - UAR_INT16_ = UAR_SHORT; - UAR_UINT16_ = UAR_USHORT; - UAR_INT32_ = platform_typeid_lookup( - UAR_LONG, UAR_INT, UAR_SHORT); - UAR_UINT32_ = platform_typeid_lookup( - UAR_ULONG, UAR_UINT, UAR_USHORT); - UAR_INT64_ = platform_typeid_lookup( - UAR_LONG, UAR_LONGLONG, UAR_INT); - UAR_UINT64_ = platform_typeid_lookup( - UAR_ULONG, UAR_ULONGLONG, UAR_UINT); - UAR_HALF_ = UAR_HALF; - UAR_FLOAT_ = UAR_FLOAT; - UAR_DOUBLE_ = UAR_DOUBLE; - UAR_CFLOAT_ = UAR_CFLOAT; - UAR_CDOUBLE_ = UAR_CDOUBLE; - UAR_TYPE_SENTINEL_ = UAR_TYPE_SENTINEL; - } - - static usm_ndarray_types populate_fields() - { - import_dpctl(); - - usm_ndarray_types types; - types.init_constants(); - - return types; - } - void throw_unrecognized_typenum_error(int typenum) { throw std::runtime_error("Unrecogized typenum " + diff --git a/dpctl/tensor/libtensor/include/utils/type_utils.hpp b/dpctl/tensor/libtensor/include/utils/type_utils.hpp new file mode 100644 index 0000000000..181ff89adc --- /dev/null +++ b/dpctl/tensor/libtensor/include/utils/type_utils.hpp @@ -0,0 +1,73 @@ +//===------ type_utils.hpp - Implementation of types utils ----*-C++-*/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions for value casting. 
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace type_utils
+{
+
+template <class T> struct is_complex : std::false_type
+{
+};
+template <class T> struct is_complex<std::complex<T>> : std::true_type
+{
+};
+
+template <typename dstTy, typename srcTy> dstTy convert_impl(const srcTy &v)
+{
+    if constexpr (std::is_same<dstTy, srcTy>::value) {
+        return v;
+    }
+    else if constexpr (std::is_same_v<dstTy, bool> && is_complex<srcTy>::value)
+    {
+        // bool(complex_v) == (complex_v.real() != 0) || (complex_v.imag() != 0)
+        return (convert_impl<bool, typename srcTy::value_type>(v.real()) ||
+                convert_impl<bool, typename srcTy::value_type>(v.imag()));
+    }
+    else if constexpr (is_complex<srcTy>::value && !is_complex<dstTy>::value) {
+        // real_t(complex_v) == real_t(complex_v.real())
+        return convert_impl<dstTy, typename srcTy::value_type>(v.real());
+    }
+    else if constexpr (!std::is_integral<srcTy>::value &&
+                       !std::is_same<dstTy, bool>::value &&
+                       std::is_integral<dstTy>::value &&
+                       std::is_unsigned<dstTy>::value)
+    {
+        // first cast to signed variant, then cast to unsigned one
+        using signedT = typename std::make_signed<dstTy>::type;
+        return static_cast<dstTy>(convert_impl<signedT, srcTy>(v));
+    }
+    else {
+        return static_cast<dstTy>(v);
+    }
+}
+
+} // namespace type_utils
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp b/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp
new file mode 100644
index 0000000000..c81430d54b
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp
@@ -0,0 +1,359 @@
+//===-- tensor_py.cpp - Implementation of _tensor_impl module --*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
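As a quick illustration of the casting rules `convert_impl` implements (complex to bool is true when either component is non-zero, complex to real keeps only the real part, floating point to unsigned goes through the signed type first), here is a host-only sketch; the relative include path is an assumption and the snippet is not part of the patch:

#include <complex>
#include <cstdint>
#include <iostream>
#include "utils/type_utils.hpp"

int main()
{
    using dpctl::tensor::type_utils::convert_impl;

    std::complex<float> z{0.0f, 2.5f};
    // complex -> bool: true because the imaginary part is non-zero
    bool b = convert_impl<bool, std::complex<float>>(z);
    // complex -> double: keeps the real part only
    double r = convert_impl<double, std::complex<float>>(z);
    // double -> uint32_t: -1.0 is cast to int32_t first, then to unsigned
    std::uint32_t u = convert_impl<std::uint32_t, double>(-1.0);

    std::cout << b << " " << r << " " << u << "\n"; // prints: 1 0 4294967295
    return 0;
}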
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include "kernels/copy_and_cast.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "simplify_iteration_space.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace _ns = dpctl::tensor::detail; + +using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_1d_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_2d_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_generic_fn_ptr_t; + +static copy_and_cast_generic_fn_ptr_t + copy_and_cast_generic_dispatch_table[_ns::num_types][_ns::num_types]; +static copy_and_cast_1d_fn_ptr_t + copy_and_cast_1d_dispatch_table[_ns::num_types][_ns::num_types]; +static copy_and_cast_2d_fn_ptr_t + copy_and_cast_2d_dispatch_table[_ns::num_types][_ns::num_types]; + +namespace py = pybind11; + +using dpctl::tensor::c_contiguous_strides; +using dpctl::tensor::f_contiguous_strides; + +using dpctl::utils::keep_args_alive; + +sycl::event _populate_packed_shape_strides_for_copycast_kernel( + sycl::queue exec_q, + py::ssize_t *device_shape_strides, // to be populated + const std::vector &common_shape, + const std::vector &src_strides, + const std::vector &dst_strides) +{ + // memory transfer optimization, use USM-host for temporary speeds up + // tranfer to device, especially on dGPUs + using usm_host_allocatorT = + sycl::usm_allocator; + using shT = std::vector; + size_t nd = common_shape.size(); + + usm_host_allocatorT allocator(exec_q); + + // create host temporary for packed shape and strides managed by shared + // pointer. 
Packed vector is a concatenation of common_shape, src_strides and
+    // dst_strides
+    std::shared_ptr<shT> shp_host_shape_strides =
+        std::make_shared<shT>(3 * nd, allocator);
+    std::copy(common_shape.begin(), common_shape.end(),
+              shp_host_shape_strides->begin());
+
+    std::copy(src_strides.begin(), src_strides.end(),
+              shp_host_shape_strides->begin() + nd);
+
+    std::copy(dst_strides.begin(), dst_strides.end(),
+              shp_host_shape_strides->begin() + 2 * nd);
+
+    sycl::event copy_shape_ev = exec_q.copy<py::ssize_t>(
+        shp_host_shape_strides->data(), device_shape_strides,
+        shp_host_shape_strides->size());
+
+    exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(copy_shape_ev);
+        cgh.host_task([shp_host_shape_strides]() {
+            // increment shared pointer ref-count to keep it alive
+            // till copy operation completes;
+        });
+    });
+
+    return copy_shape_ev;
+}
+
+std::pair<sycl::event, sycl::event>
+copy_usm_ndarray_into_usm_ndarray(dpctl::tensor::usm_ndarray src,
+                                  dpctl::tensor::usm_ndarray dst,
+                                  sycl::queue exec_q,
+                                  const std::vector<sycl::event> &depends = {})
+{
+    // array dimensions must be the same
+    int src_nd = src.get_ndim();
+    int dst_nd = dst.get_ndim();
+
+    if (src_nd != dst_nd) {
+        throw py::value_error("Array dimensions are not the same.");
+    }
+
+    // shapes must be the same
+    const py::ssize_t *src_shape = src.get_shape_raw();
+    const py::ssize_t *dst_shape = dst.get_shape_raw();
+
+    bool shapes_equal(true);
+    size_t src_nelems(1);
+
+    for (int i = 0; i < src_nd; ++i) {
+        src_nelems *= static_cast<size_t>(src_shape[i]);
+        shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]);
+    }
+    if (!shapes_equal) {
+        throw py::value_error("Array shapes are not the same.");
+    }
+
+    if (src_nelems == 0) {
+        // nothing to do
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    auto dst_offsets = dst.get_minmax_offsets();
+    // destination must be ample enough to accommodate all elements
+    {
+        size_t range =
+            static_cast<size_t>(dst_offsets.second - dst_offsets.first);
+        if (range + 1 < src_nelems) {
+            throw py::value_error(
+                "Destination array can not accommodate all the "
+                "elements of source array.");
+        }
+    }
+
+    // check compatibility of execution queue and allocation queue
+    sycl::queue src_q = src.get_queue();
+    sycl::queue dst_q = dst.get_queue();
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src_q, dst_q})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    auto array_types = dpctl::tensor::detail::usm_ndarray_types();
+    int src_type_id = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum);
+
+    char *src_data = src.get_data();
+    char *dst_data = dst.get_data();
+
+    // check that arrays do not overlap, and concurrent copying is safe.
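The check below compares the byte ranges spanned by the two arrays, derived from their minimal and maximal element offsets and their element sizes. As a standalone sketch of that interval test (hypothetical helpers, not part of the patch): two half-open byte ranges intersect exactly when each begins before the other ends.

#include <cstddef>
#include <functional>

// Byte range spanned by an array:
// [data + min_offset * elem_size, data + (max_offset + 1) * elem_size)
struct byte_range
{
    const char *begin;
    const char *end;
};

inline byte_range make_range(const char *data,
                             std::ptrdiff_t min_elem_offset,
                             std::ptrdiff_t max_elem_offset,
                             std::ptrdiff_t elem_size)
{
    return {data + min_elem_offset * elem_size,
            data + (max_elem_offset + 1) * elem_size};
}

inline bool ranges_overlap(const byte_range &a, const byte_range &b)
{
    // Half-open intervals intersect iff each starts before the other ends.
    // std::less gives a total order even for pointers into unrelated allocations.
    std::less<const char *> lt;
    return lt(a.begin, b.end) && lt(b.begin, a.end);
}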
+ auto src_offsets = src.get_minmax_offsets(); + int src_elem_size = src.get_elemsize(); + int dst_elem_size = dst.get_elemsize(); + + bool memory_overlap = + ((dst_data - src_data > src_offsets.second * src_elem_size - + dst_offsets.first * dst_elem_size) && + (src_data - dst_data > dst_offsets.second * dst_elem_size - + src_offsets.first * src_elem_size)); + if (memory_overlap) { + // TODO: could use a temporary, but this is done by the caller + throw py::value_error("Arrays index overlapping segments of memory"); + } + + bool is_src_c_contig = src.is_c_contiguous(); + bool is_src_f_contig = src.is_f_contiguous(); + + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_dst_f_contig = dst.is_f_contiguous(); + + // check for applicability of special cases: + // (same type && (both C-contiguous || both F-contiguous) + bool both_c_contig = (is_src_c_contig && is_dst_c_contig); + bool both_f_contig = (is_src_f_contig && is_dst_f_contig); + if (both_c_contig || both_f_contig) { + if (src_type_id == dst_type_id) { + + sycl::event copy_ev = + exec_q.memcpy(static_cast(dst_data), + static_cast(src_data), + src_nelems * src_elem_size, depends); + + // make sure src and dst are not GC-ed before copy_ev is complete + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {copy_ev}), copy_ev); + } + // With contract_iter2 in place, there is no need to write + // dedicated kernels for casting between contiguous arrays + } + + const py::ssize_t *src_strides = src.get_strides_raw(); + const py::ssize_t *dst_strides = dst.get_strides_raw(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd; + const py::ssize_t *shape = src_shape; + + constexpr py::ssize_t src_itemsize = 1; // in elements + constexpr py::ssize_t dst_itemsize = 1; // in elements + + // all args except itemsizes and is_?_contig bools can be modified by + // reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, shape, src_strides, src_itemsize, is_src_c_contig, is_src_f_contig, + dst_strides, dst_itemsize, is_dst_c_contig, is_dst_f_contig, + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (nd < 3) { + if (nd == 1) { + std::array shape_arr = {shape[0]}; + // strides may be null + std::array src_strides_arr = { + (src_strides ? src_strides[0] : 1)}; + std::array dst_strides_arr = { + (dst_strides ? 
dst_strides[0] : 1)}; + + auto fn = copy_and_cast_1d_dispatch_table[dst_type_id][src_type_id]; + sycl::event copy_and_cast_1d_event = fn( + exec_q, src_nelems, shape_arr, src_strides_arr, dst_strides_arr, + src_data, src_offset, dst_data, dst_offset, depends); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {copy_and_cast_1d_event}), + copy_and_cast_1d_event); + } + else if (nd == 2) { + std::array shape_arr = {shape[0], shape[1]}; + std::array src_strides_arr = {src_strides[0], + src_strides[1]}; + std::array dst_strides_arr = {dst_strides[0], + dst_strides[1]}; + + auto fn = copy_and_cast_2d_dispatch_table[dst_type_id][src_type_id]; + + sycl::event copy_and_cast_2d_event = fn( + exec_q, src_nelems, shape_arr, src_strides_arr, dst_strides_arr, + src_data, src_offset, dst_data, dst_offset, depends); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {copy_and_cast_2d_event}), + copy_and_cast_2d_event); + } + else if (nd == 0) { // case of a scalar + assert(src_nelems == 1); + std::array shape_arr = {1}; + std::array src_strides_arr = {1}; + std::array dst_strides_arr = {1}; + + auto fn = copy_and_cast_1d_dispatch_table[dst_type_id][src_type_id]; + + sycl::event copy_and_cast_0d_event = fn( + exec_q, src_nelems, shape_arr, src_strides_arr, dst_strides_arr, + src_data, src_offset, dst_data, dst_offset, depends); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {copy_and_cast_0d_event}), + copy_and_cast_0d_event); + } + } + + // Generic implementation + auto copy_and_cast_fn = + copy_and_cast_generic_dispatch_table[dst_type_id][src_type_id]; + + // If shape/strides are accessed with accessors, buffer destructor + // will force syncronization. + py::ssize_t *shape_strides = + sycl::malloc_device(3 * nd, exec_q); + + if (shape_strides == nullptr) { + throw std::runtime_error("Unabled to allocate device memory"); + } + + sycl::event copy_shape_ev = + _populate_packed_shape_strides_for_copycast_kernel( + exec_q, shape_strides, simplified_shape, simplified_src_strides, + simplified_dst_strides); + + sycl::event copy_and_cast_generic_ev = copy_and_cast_fn( + exec_q, src_nelems, nd, shape_strides, src_data, src_offset, dst_data, + dst_offset, depends, {copy_shape_ev}); + + // async free of shape_strides temporary + auto ctx = exec_q.get_context(); + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(copy_and_cast_generic_ev); + cgh.host_task( + [ctx, shape_strides]() { sycl::free(shape_strides, ctx); }); + }); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {copy_and_cast_generic_ev}), + copy_and_cast_generic_ev); +} + +void init_copy_and_cast_usm_to_usm_dispatch_tables(void) +{ + using namespace dpctl::tensor::detail; + + using dpctl::tensor::kernels::copy_and_cast::CopyAndCastGenericFactory; + DispatchTableBuilder + dtb_generic; + dtb_generic.populate_dispatch_table(copy_and_cast_generic_dispatch_table); + + using dpctl::tensor::kernels::copy_and_cast::CopyAndCast1DFactory; + DispatchTableBuilder + dtb_1d; + dtb_1d.populate_dispatch_table(copy_and_cast_1d_dispatch_table); + + using dpctl::tensor::kernels::copy_and_cast::CopyAndCast2DFactory; + DispatchTableBuilder + dtb_2d; + dtb_2d.populate_dispatch_table(copy_and_cast_2d_dispatch_table); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp b/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp new file mode 100644 index 0000000000..192d70c0f2 --- /dev/null +++ 
b/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp @@ -0,0 +1,50 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair +copy_usm_ndarray_into_usm_ndarray(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + const std::vector &depends = {}); + +extern void init_copy_and_cast_usm_to_usm_dispatch_tables(); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/copy_for_reshape.cpp b/dpctl/tensor/libtensor/source/copy_for_reshape.cpp new file mode 100644 index 0000000000..7f7e866bb1 --- /dev/null +++ b/dpctl/tensor/libtensor/source/copy_for_reshape.cpp @@ -0,0 +1,258 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include + +#include "copy_for_reshape.hpp" +#include "dpctl4pybind11.hpp" +#include "kernels/copy_and_cast.hpp" +#include "utils/type_dispatch.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace _ns = dpctl::tensor::detail; + +using dpctl::tensor::kernels::copy_and_cast::copy_for_reshape_fn_ptr_t; +using dpctl::utils::keep_args_alive; + +// define static vector +static copy_for_reshape_fn_ptr_t + copy_for_reshape_generic_dispatch_vector[_ns::num_types]; + +/* + * Copies src into dst (same data type) of different shapes by using flat + * iterations. 
+ * + * Equivalent to the following loop: + * + * for i for range(src.size): + * dst[np.multi_index(i, dst.shape)] = src[np.multi_index(i, src.shape)] + */ +std::pair +copy_usm_ndarray_for_reshape(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray dst, + py::ssize_t shift, + sycl::queue exec_q, + const std::vector &depends) +{ + py::ssize_t src_nelems = src.get_size(); + py::ssize_t dst_nelems = dst.get_size(); + + // Must have the same number of elements + if (src_nelems != dst_nelems) { + throw py::value_error( + "copy_usm_ndarray_for_reshape requires src and dst to " + "have the same number of elements."); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + // typenames must be the same + if (src_typenum != dst_typenum) { + throw py::value_error( + "copy_usm_ndarray_for_reshape requires src and dst to " + "have the same type."); + } + + if (src_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + // destination must be ample enough to accomodate all elements + { + auto dst_offsets = dst.get_minmax_offsets(); + py::ssize_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < src_nelems) { + throw py::value_error( + "Destination array can not accomodate all the " + "elements of source array."); + } + } + + // check same contexts + sycl::queue src_q = src.get_queue(); + sycl::queue dst_q = dst.get_queue(); + + if (!dpctl::utils::queues_are_compatible(exec_q, {src_q, dst_q})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if (src_nelems == 1) { + // handle special case of 1-element array + int src_elemsize = src.get_elemsize(); + char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + sycl::event copy_ev = + exec_q.copy(src_data, dst_data, src_elemsize); + return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}), + copy_ev); + } + + // dimensions may be different + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + + auto array_types = dpctl::tensor::detail::usm_ndarray_types(); + int type_id = array_types.typenum_to_lookup_id(src_typenum); + + auto fn = copy_for_reshape_generic_dispatch_vector[type_id]; + + // packed_shape_strides = [src_shape, src_strides, dst_shape, dst_strides] + py::ssize_t *packed_shapes_strides = + sycl::malloc_device(2 * (src_nd + dst_nd), exec_q); + + if (packed_shapes_strides == nullptr) { + throw std::runtime_error("Unabled to allocate device memory"); + } + + using usm_host_allocatorT = + sycl::usm_allocator; + using shT = std::vector; + usm_host_allocatorT allocator(exec_q); + std::shared_ptr packed_host_shapes_strides_shp = + std::make_shared(2 * (src_nd + dst_nd), allocator); + + std::copy(src_shape, src_shape + src_nd, + packed_host_shapes_strides_shp->begin()); + std::copy(dst_shape, dst_shape + dst_nd, + packed_host_shapes_strides_shp->begin() + 2 * src_nd); + + const py::ssize_t *src_strides = src.get_strides_raw(); + if (src_strides == nullptr) { + if (src.is_c_contiguous()) { + const auto &src_contig_strides = + c_contiguous_strides(src_nd, src_shape); + std::copy(src_contig_strides.begin(), src_contig_strides.end(), + packed_host_shapes_strides_shp->begin() + src_nd); + } + else if (src.is_f_contiguous()) { + const auto &src_contig_strides = + f_contiguous_strides(src_nd, src_shape); + std::copy(src_contig_strides.begin(), src_contig_strides.end(), + 
packed_host_shapes_strides_shp->begin() + src_nd); + } + else { + sycl::free(packed_shapes_strides, exec_q); + throw std::runtime_error( + "Invalid src array encountered: in copy_for_reshape function"); + } + } + else { + std::copy(src_strides, src_strides + src_nd, + packed_host_shapes_strides_shp->begin() + src_nd); + } + + const py::ssize_t *dst_strides = dst.get_strides_raw(); + if (dst_strides == nullptr) { + if (dst.is_c_contiguous()) { + const auto &dst_contig_strides = + c_contiguous_strides(dst_nd, dst_shape); + std::copy(dst_contig_strides.begin(), dst_contig_strides.end(), + packed_host_shapes_strides_shp->begin() + 2 * src_nd + + dst_nd); + } + else if (dst.is_f_contiguous()) { + const auto &dst_contig_strides = + f_contiguous_strides(dst_nd, dst_shape); + std::copy(dst_contig_strides.begin(), dst_contig_strides.end(), + packed_host_shapes_strides_shp->begin() + 2 * src_nd + + dst_nd); + } + else { + sycl::free(packed_shapes_strides, exec_q); + throw std::runtime_error( + "Invalid dst array encountered: in copy_for_reshape function"); + } + } + else { + std::copy(dst_strides, dst_strides + dst_nd, + packed_host_shapes_strides_shp->begin() + 2 * src_nd + + dst_nd); + } + + // copy packed shapes and strides from host to devices + sycl::event packed_shape_strides_copy_ev = exec_q.copy( + packed_host_shapes_strides_shp->data(), packed_shapes_strides, + packed_host_shapes_strides_shp->size()); + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(packed_shape_strides_copy_ev); + cgh.host_task([packed_host_shapes_strides_shp] { + // Capturing shared pointer ensures that the underlying vector is + // not destroyed until after its data are copied into packed USM + // vector + }); + }); + + char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + std::vector all_deps(depends.size() + 1); + all_deps.push_back(packed_shape_strides_copy_ev); + all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); + + sycl::event copy_for_reshape_event = + fn(exec_q, shift, src_nelems, src_nd, dst_nd, packed_shapes_strides, + src_data, dst_data, all_deps); + + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(copy_for_reshape_event); + auto ctx = exec_q.get_context(); + cgh.host_task([packed_shapes_strides, ctx]() { + sycl::free(packed_shapes_strides, ctx); + }); + }); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {copy_for_reshape_event}), + copy_for_reshape_event); +} + +void init_copy_for_reshape_dispatch_vectors(void) +{ + using namespace dpctl::tensor::detail; + using dpctl::tensor::kernels::copy_and_cast::CopyForReshapeGenericFactory; + + DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(copy_for_reshape_generic_dispatch_vector); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/copy_for_reshape.hpp b/dpctl/tensor/libtensor/source/copy_for_reshape.hpp new file mode 100644 index 0000000000..51c3719b97 --- /dev/null +++ b/dpctl/tensor/libtensor/source/copy_for_reshape.hpp @@ -0,0 +1,51 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
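For reference, the flat-index mapping performed by `copy_usm_ndarray_for_reshape` above reduces, for contiguous C-ordered buffers, to a rolled element-wise copy; the kernel generalizes this by unraveling both flat indices through the packed shapes and strides. A host-only sketch (hypothetical helper name, not part of the patch):

#include <cstddef>

// Reference semantics for contiguous (C-order) src and dst buffers holding
// nelems elements each: element i of src lands at position (i + shift) % nelems
// of dst.
template <typename T>
void copy_for_reshape_reference(const T *src,
                                T *dst,
                                std::ptrdiff_t shift,
                                std::ptrdiff_t nelems)
{
    for (std::ptrdiff_t i = 0; i < nelems; ++i) {
        std::ptrdiff_t j = (i + shift) % nelems;
        if (j < 0) {
            j += nelems; // mirror the kernel's wrap-around for negative shifts
        }
        dst[j] = src[i];
    }
}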
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair +copy_usm_ndarray_for_reshape(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray dst, + py::ssize_t shift, + sycl::queue exec_q, + const std::vector &depends = {}); + +extern void init_copy_for_reshape_dispatch_vectors(); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp new file mode 100644 index 0000000000..c6b42e48ff --- /dev/null +++ b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp @@ -0,0 +1,276 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include + +#include "dpctl4pybind11.hpp" +#include +#include + +#include "kernels/copy_and_cast.hpp" +#include "utils/strided_iters.hpp" +#include "utils/type_dispatch.hpp" + +#include "copy_numpy_ndarray_into_usm_ndarray.hpp" +#include "simplify_iteration_space.hpp" + +namespace py = pybind11; +namespace _ns = dpctl::tensor::detail; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +using dpctl::tensor::kernels::copy_and_cast:: + copy_and_cast_from_host_blocking_fn_ptr_t; + +static copy_and_cast_from_host_blocking_fn_ptr_t + copy_and_cast_from_host_blocking_dispatch_table[_ns::num_types] + [_ns::num_types]; + +void copy_numpy_ndarray_into_usm_ndarray( + py::array npy_src, + dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + const std::vector &depends) +{ + int src_ndim = npy_src.ndim(); + int dst_ndim = dst.get_ndim(); + + if (src_ndim != dst_ndim) { + throw py::value_error("Source ndarray and destination usm_ndarray have " + "different array ranks, " + "i.e. 
different number of indices needed to " + "address array elements."); + } + + const py::ssize_t *src_shape = npy_src.shape(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + bool shapes_equal(true); + size_t src_nelems(1); + for (int i = 0; i < src_ndim; ++i) { + shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]); + src_nelems *= static_cast(src_shape[i]); + } + + if (!shapes_equal) { + throw py::value_error("Source ndarray and destination usm_ndarray have " + "difference shapes."); + } + + if (src_nelems == 0) { + // nothing to do + return; + } + + auto dst_offsets = dst.get_minmax_offsets(); + // destination must be ample enough to accomodate all elements of source + // array + { + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < src_nelems) { + throw py::value_error( + "Destination array can not accomodate all the " + "elements of source array."); + } + } + + sycl::queue dst_q = dst.get_queue(); + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst_q})) { + throw py::value_error("Execution queue is not compatible with the " + "allocation queue"); + } + + // here we assume that NumPy's type numbers agree with ours for types + // supported in both + int src_typenum = + py::detail::array_descriptor_proxy(npy_src.dtype().ptr())->type_num; + int dst_typenum = dst.get_typenum(); + + auto array_types = dpctl::tensor::detail::usm_ndarray_types(); + int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + py::buffer_info src_pybuf = npy_src.request(); + const char *const src_data = static_cast(src_pybuf.ptr); + char *dst_data = dst.get_data(); + + int src_flags = npy_src.flags(); + + // check for applicability of special cases: + // (same type && (both C-contiguous || both F-contiguous) + bool both_c_contig = + ((src_flags & py::array::c_style) && dst.is_c_contiguous()); + bool both_f_contig = + ((src_flags & py::array::f_style) && dst.is_f_contiguous()); + if (both_c_contig || both_f_contig) { + if (src_type_id == dst_type_id) { + int src_elem_size = npy_src.itemsize(); + + sycl::event copy_ev = + exec_q.memcpy(static_cast(dst_data), + static_cast(src_data), + src_nelems * src_elem_size, depends); + + // wait for copy_ev to complete + copy_ev.wait_and_throw(); + + return; + } + // With contract_iter2 in place, there is no need to write + // dedicated kernels for casting between contiguous arrays + } + + const py::ssize_t *src_strides = + npy_src.strides(); // N.B.: strides in bytes + const py::ssize_t *dst_strides = + dst.get_strides_raw(); // N.B.: strides in elements + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + py::ssize_t src_itemsize = npy_src.itemsize(); // item size in bytes + constexpr py::ssize_t dst_itemsize = 1; // item size in elements + + int nd = src_ndim; + const py::ssize_t *shape = src_shape; + + bool is_src_c_contig = ((src_flags & py::array::c_style) != 0); + bool is_src_f_contig = ((src_flags & py::array::f_style) != 0); + + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_dst_f_contig = dst.is_f_contiguous(); + + // all args except itemsizes and is_?_contig bools can be modified by + // reference + simplify_iteration_space(nd, shape, src_strides, src_itemsize, + is_src_c_contig, is_src_f_contig, dst_strides, + dst_itemsize, is_dst_c_contig, is_dst_f_contig, + simplified_shape, simplified_src_strides, + 
simplified_dst_strides, src_offset, dst_offset); + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_src_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + + // handle nd == 0 + if (nd == 0) { + nd = 1; + simplified_shape.reserve(nd); + simplified_shape.push_back(1); + + simplified_src_strides.reserve(nd); + simplified_src_strides.push_back(src_itemsize); + + simplified_dst_strides.reserve(nd); + simplified_dst_strides.push_back(dst_itemsize); + } + + // Minumum and maximum element offsets for source np.ndarray + py::ssize_t npy_src_min_nelem_offset(0); + py::ssize_t npy_src_max_nelem_offset(0); + for (int i = 0; i < nd; ++i) { + // convert source strides from bytes to elements + simplified_src_strides[i] = simplified_src_strides[i] / src_itemsize; + if (simplified_src_strides[i] < 0) { + npy_src_min_nelem_offset += + simplified_src_strides[i] * (simplified_shape[i] - 1); + } + else { + npy_src_max_nelem_offset += + simplified_src_strides[i] * (simplified_shape[i] - 1); + } + } + + // Create shared pointers with shape and src/dst strides, copy into device + // memory + using shT = std::vector; + + // Get implementation function pointer + auto copy_and_cast_from_host_blocking_fn = + copy_and_cast_from_host_blocking_dispatch_table[dst_type_id] + [src_type_id]; + + // If shape/strides are accessed with accessors, buffer destructor + // will force syncronization. + py::ssize_t *shape_strides = + sycl::malloc_device(3 * nd, exec_q); + + if (shape_strides == nullptr) { + throw std::runtime_error("Unabled to allocate device memory"); + } + + using usm_host_allocatorT = + sycl::usm_allocator; + using usmshT = std::vector; + usm_host_allocatorT alloc(exec_q); + + auto host_shape_strides_shp = std::make_shared(3 * nd, alloc); + std::copy(simplified_shape.begin(), simplified_shape.end(), + host_shape_strides_shp->begin()); + std::copy(simplified_src_strides.begin(), simplified_src_strides.end(), + host_shape_strides_shp->begin() + nd); + std::copy(simplified_dst_strides.begin(), simplified_dst_strides.end(), + host_shape_strides_shp->begin() + 2 * nd); + + sycl::event copy_packed_ev = + exec_q.copy(host_shape_strides_shp->data(), shape_strides, + host_shape_strides_shp->size()); + + copy_and_cast_from_host_blocking_fn( + exec_q, src_nelems, nd, shape_strides, src_data, src_offset, + npy_src_min_nelem_offset, npy_src_max_nelem_offset, dst_data, + dst_offset, depends, {copy_packed_ev}); + + sycl::free(shape_strides, exec_q); + + return; +} + +void init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(void) +{ + using namespace dpctl::tensor::detail; + using dpctl::tensor::kernels::copy_and_cast::CopyAndCastFromHostFactory; + + DispatchTableBuilder + dtb_copy_from_numpy; + + dtb_copy_from_numpy.populate_dispatch_table( + copy_and_cast_from_host_blocking_dispatch_table); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp new file mode 100644 index 0000000000..16adb921ee --- /dev/null +++ b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp @@ -0,0 +1,50 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in 
compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include "dpctl4pybind11.hpp" +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void copy_numpy_ndarray_into_usm_ndarray( + py::array npy_src, + dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + const std::vector &depends = {}); + +extern void init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(void); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/device_support_queries.cpp b/dpctl/tensor/libtensor/source/device_support_queries.cpp new file mode 100644 index 0000000000..0b15754f56 --- /dev/null +++ b/dpctl/tensor/libtensor/source/device_support_queries.cpp @@ -0,0 +1,118 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
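The `npy_src_min_nelem_offset` / `npy_src_max_nelem_offset` computation in `copy_numpy_ndarray_into_usm_ndarray` above, used to size the host buffer for the blocking copy, sums each dimension's extreme contribution: negative strides lower the minimum, non-negative strides raise the maximum. A standalone sketch (hypothetical helper, not part of the patch):

#include <cstddef>
#include <utility>
#include <vector>

// Smallest and largest element offsets reachable from the base pointer of a
// strided view, with strides given in elements.
inline std::pair<std::ptrdiff_t, std::ptrdiff_t>
minmax_element_offsets(const std::vector<std::ptrdiff_t> &shape,
                       const std::vector<std::ptrdiff_t> &strides)
{
    std::ptrdiff_t min_off = 0;
    std::ptrdiff_t max_off = 0;
    for (std::size_t d = 0; d < shape.size(); ++d) {
        const std::ptrdiff_t span = strides[d] * (shape[d] - 1);
        (span < 0 ? min_off : max_off) += span;
    }
    // e.g. shape {2, 3}, strides {3, -1}  ->  {-2, 3}
    return {min_off, max_off};
}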
+// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include + +#include "dpctl4pybind11.hpp" +#include +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace +{ + +std::string _default_device_fp_type(sycl::device d) +{ + if (d.has(sycl::aspect::fp64)) { + return "f8"; + } + else { + return "f4"; + } +} + +std::string _default_device_int_type(sycl::device) +{ + return "i8"; +} + +std::string _default_device_complex_type(sycl::device d) +{ + if (d.has(sycl::aspect::fp64)) { + return "c16"; + } + else { + return "c8"; + } +} + +std::string _default_device_bool_type(sycl::device) +{ + return "b1"; +} + +sycl::device _extract_device(py::object arg) +{ + auto &api = dpctl::detail::dpctl_capi::get(); + + PyObject *source = arg.ptr(); + if (api.PySyclQueue_Check_(source)) { + sycl::queue q = py::cast(arg); + return q.get_device(); + } + else if (api.PySyclDevice_Check_(source)) { + return py::cast(arg); + } + else { + throw py::type_error( + "Expected type `dpctl.SyclQueue` or `dpctl.SyclDevice`."); + } +} + +} // namespace + +std::string default_device_fp_type(py::object arg) +{ + sycl::device d = _extract_device(arg); + return _default_device_fp_type(d); +} + +std::string default_device_int_type(py::object arg) +{ + sycl::device d = _extract_device(arg); + return _default_device_int_type(d); +} + +std::string default_device_bool_type(py::object arg) +{ + sycl::device d = _extract_device(arg); + return _default_device_bool_type(d); +} + +std::string default_device_complex_type(py::object arg) +{ + sycl::device d = _extract_device(arg); + return _default_device_complex_type(d); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/device_support_queries.hpp b/dpctl/tensor/libtensor/source/device_support_queries.hpp new file mode 100644 index 0000000000..905ba4b535 --- /dev/null +++ b/dpctl/tensor/libtensor/source/device_support_queries.hpp @@ -0,0 +1,47 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
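A minimal usage sketch of the device-capability logic above (assumes a working SYCL runtime; it mirrors `_default_device_fp_type` and `_default_device_complex_type` and is not part of the patch):

#include <CL/sycl.hpp>
#include <iostream>
#include <string>

int main()
{
    sycl::queue q; // default-selected device
    const sycl::device &d = q.get_device();
    // Without the fp64 aspect the defaults degrade to single precision.
    std::string fp = d.has(sycl::aspect::fp64) ? "f8" : "f4";
    std::string cx = d.has(sycl::aspect::fp64) ? "c16" : "c8";
    std::cout << "fp: " << fp << ", complex: " << cx
              << ", int: i8, bool: b1\n";
    return 0;
}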
+// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include + +#include "dpctl4pybind11.hpp" +#include +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::string default_device_fp_type(py::object); +extern std::string default_device_int_type(py::object); +extern std::string default_device_bool_type(py::object); +extern std::string default_device_complex_type(py::object); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/eye_ctor.cpp b/dpctl/tensor/libtensor/source/eye_ctor.cpp new file mode 100644 index 0000000000..d36447749a --- /dev/null +++ b/dpctl/tensor/libtensor/source/eye_ctor.cpp @@ -0,0 +1,133 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include + +#include "eye_ctor.hpp" +#include "kernels/constructors.hpp" +#include "utils/type_dispatch.hpp" + +namespace py = pybind11; +namespace _ns = dpctl::tensor::detail; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +using dpctl::utils::keep_args_alive; + +using dpctl::tensor::kernels::constructors::eye_fn_ptr_t; +static eye_fn_ptr_t eye_dispatch_vector[_ns::num_types]; + +std::pair +usm_ndarray_eye(py::ssize_t k, + dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + const std::vector &depends) +{ + // dst must be 2D + + if (dst.get_ndim() != 2) { + throw py::value_error( + "usm_ndarray_eye: Expecting 2D array to populate"); + } + + sycl::queue dst_q = dst.get_queue(); + if (!dpctl::utils::queues_are_compatible(exec_q, {dst_q})) { + throw py::value_error("Execution queue is not compatible with the " + "allocation queue"); + } + + auto array_types = dpctl::tensor::detail::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + const py::ssize_t nelem = dst.get_size(); + const py::ssize_t rows = dst.get_shape(0); + const py::ssize_t cols = dst.get_shape(1); + if (rows == 0 || cols == 0) { + // nothing to do + return std::make_pair(sycl::event{}, sycl::event{}); + } + + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_dst_f_contig = dst.is_f_contiguous(); + if (!is_dst_c_contig && !is_dst_f_contig) { + throw py::value_error("USM array is not contiguous"); + } + + py::ssize_t start; + if (is_dst_c_contig) { + start = 
(k < 0) ? -k * cols : k; + } + else { + start = (k < 0) ? -k : k * rows; + } + + const py::ssize_t *strides = dst.get_strides_raw(); + py::ssize_t step; + if (strides == nullptr) { + step = (is_dst_c_contig) ? cols + 1 : rows + 1; + } + else { + step = strides[0] + strides[1]; + } + + const py::ssize_t length = std::min({rows, cols, rows + k, cols - k}); + const py::ssize_t end = start + step * (length - 1); + + char *dst_data = dst.get_data(); + sycl::event eye_event; + + auto fn = eye_dispatch_vector[dst_typeid]; + + eye_event = fn(exec_q, static_cast(nelem), start, end, step, + dst_data, depends); + + return std::make_pair(keep_args_alive(exec_q, {dst}, {eye_event}), + eye_event); +} + +void init_eye_ctor_dispatch_vectors(void) +{ + using namespace dpctl::tensor::detail; + using dpctl::tensor::kernels::constructors::EyeFactory; + + DispatchVectorBuilder dvb; + dvb.populate_dispatch_vector(eye_dispatch_vector); + + return; +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/eye_ctor.hpp b/dpctl/tensor/libtensor/source/eye_ctor.hpp new file mode 100644 index 0000000000..1067ed8d8b --- /dev/null +++ b/dpctl/tensor/libtensor/source/eye_ctor.hpp @@ -0,0 +1,50 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair +usm_ndarray_eye(py::ssize_t k, + dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + const std::vector &depends = {}); + +extern void init_eye_ctor_dispatch_vectors(void); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/full_ctor.cpp b/dpctl/tensor/libtensor/source/full_ctor.cpp new file mode 100644 index 0000000000..e5b1da362b --- /dev/null +++ b/dpctl/tensor/libtensor/source/full_ctor.cpp @@ -0,0 +1,116 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
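The diagonal bookkeeping in `usm_ndarray_eye` above comes down to flat-index arithmetic; a small host-only worked check for the C-contiguous case (not part of the patch):

#include <algorithm>
#include <cassert>
#include <cstddef>

int main()
{
    // C-contiguous rows x cols matrix with diagonal offset k (k > 0 above the
    // main diagonal): the first diagonal element sits at flat index k (or
    // -k * cols for k < 0), consecutive elements are cols + 1 apart, and the
    // diagonal holds min(rows, cols, rows + k, cols - k) elements.
    const std::ptrdiff_t rows = 3, cols = 4, k = 1;
    const std::ptrdiff_t start = (k < 0) ? -k * cols : k;
    const std::ptrdiff_t step = cols + 1;
    const std::ptrdiff_t length = std::min({rows, cols, rows + k, cols - k});
    // For a 3x4 matrix and k = 1 the diagonal occupies flat indices 1, 6, 11.
    assert(start == 1 && step == 5 && length == 3);
    return 0;
}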
+// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include +#include + +#include "kernels/constructors.hpp" +#include "utils/strided_iters.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "full_ctor.hpp" + +namespace py = pybind11; +namespace _ns = dpctl::tensor::detail; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +using dpctl::tensor::kernels::constructors::lin_space_step_fn_ptr_t; +using dpctl::utils::keep_args_alive; + +using dpctl::tensor::kernels::constructors::full_contig_fn_ptr_t; + +static full_contig_fn_ptr_t full_contig_dispatch_vector[_ns::num_types]; + +std::pair +usm_ndarray_full(py::object py_value, + dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + const std::vector &depends) +{ + // start, end should be coercible into data type of dst + + py::ssize_t dst_nelems = dst.get_size(); + + if (dst_nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + sycl::queue dst_q = dst.get_queue(); + if (!dpctl::utils::queues_are_compatible(exec_q, {dst_q})) { + throw py::value_error( + "Execution queue is not compatible with the allocation queue"); + } + + auto array_types = dpctl::tensor::detail::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + char *dst_data = dst.get_data(); + sycl::event full_event; + + if (dst_nelems == 1 || dst.is_c_contiguous() || dst.is_f_contiguous()) { + auto fn = full_contig_dispatch_vector[dst_typeid]; + + sycl::event full_contig_event = + fn(exec_q, static_cast(dst_nelems), py_value, dst_data, + depends); + + return std::make_pair( + keep_args_alive(exec_q, {dst}, {full_contig_event}), + full_contig_event); + } + else { + throw std::runtime_error( + "Only population of contiguous usm_ndarray objects is supported."); + } +} + +void init_full_ctor_dispatch_vectors(void) +{ + using namespace dpctl::tensor::detail; + using dpctl::tensor::kernels::constructors::FullContigFactory; + + DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(full_contig_dispatch_vector); + + return; +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/full_ctor.hpp b/dpctl/tensor/libtensor/source/full_ctor.hpp new file mode 100644 index 0000000000..4a620a03db --- /dev/null +++ b/dpctl/tensor/libtensor/source/full_ctor.hpp @@ -0,0 +1,50 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
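The dispatch tables and vectors used throughout these sources map the lookup id returned by `typenum_to_lookup_id` to a per-type function pointer; a toy analogue of what `DispatchVectorBuilder` populates (hypothetical names, not part of the patch):

#include <array>
#include <cstddef>

// One function pointer per supported element type, indexed by lookup id.
// A toy fill function stands in for the factory-generated implementations.
using fill_fn_ptr_t = void (*)(void *dst, std::size_t nelems);

template <typename T> void fill_zero(void *dst, std::size_t nelems)
{
    T *p = static_cast<T *>(dst);
    for (std::size_t i = 0; i < nelems; ++i) {
        p[i] = T(0);
    }
}

// Hand-rolled three-type dispatch vector; DispatchVectorBuilder generates the
// real one by instantiating a factory template over all supported types.
static const std::array<fill_fn_ptr_t, 3> toy_dispatch_vector = {
    &fill_zero<bool>, &fill_zero<int>, &fill_zero<double>};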
+// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair +usm_ndarray_full(py::object py_value, + dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + const std::vector &depends = {}); + +extern void init_full_ctor_dispatch_vectors(void); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/linear_sequences.cpp b/dpctl/tensor/libtensor/source/linear_sequences.cpp new file mode 100644 index 0000000000..8b72923679 --- /dev/null +++ b/dpctl/tensor/libtensor/source/linear_sequences.cpp @@ -0,0 +1,178 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include +#include + +#include "kernels/constructors.hpp" +#include "utils/strided_iters.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "linear_sequences.hpp" + +namespace py = pybind11; +namespace _ns = dpctl::tensor::detail; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +using dpctl::utils::keep_args_alive; + +using dpctl::tensor::kernels::constructors::lin_space_step_fn_ptr_t; + +static lin_space_step_fn_ptr_t lin_space_step_dispatch_vector[_ns::num_types]; + +using dpctl::tensor::kernels::constructors::lin_space_affine_fn_ptr_t; + +static lin_space_affine_fn_ptr_t + lin_space_affine_dispatch_vector[_ns::num_types]; + +std::pair +usm_ndarray_linear_sequence_step(py::object start, + py::object dt, + dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + const std::vector &depends) +{ + // dst must be 1D and C-contiguous + // start, end should be coercible into data type of dst + + if (dst.get_ndim() != 1) { + throw py::value_error( + "usm_ndarray_linspace: Expecting 1D array to populate"); + } + + if (!dst.is_c_contiguous()) { + throw py::value_error( + "usm_ndarray_linspace: Non-contiguous arrays are not supported"); + } + + sycl::queue dst_q = dst.get_queue(); + if (!dpctl::utils::queues_are_compatible(exec_q, {dst_q})) { + throw py::value_error( + "Execution queue is not compatible with the allocation queue"); + } + + auto array_types = dpctl::tensor::detail::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + py::ssize_t len = dst.get_shape(0); + if (len == 0) { + // nothing to do + return std::make_pair(sycl::event{}, sycl::event{}); + } + + char *dst_data = dst.get_data(); + sycl::event linspace_step_event; + + auto fn = lin_space_step_dispatch_vector[dst_typeid]; + + linspace_step_event = + fn(exec_q, static_cast(len), start, dt, dst_data, depends); + + return std::make_pair(keep_args_alive(exec_q, {dst}, {linspace_step_event}), + linspace_step_event); +} + +std::pair +usm_ndarray_linear_sequence_affine(py::object start, + py::object end, + dpctl::tensor::usm_ndarray dst, + bool include_endpoint, + sycl::queue exec_q, + const std::vector &depends) +{ + // dst must be 1D and C-contiguous + // start, end should be coercible into data type of dst + + if (dst.get_ndim() != 1) { + throw py::value_error( + "usm_ndarray_linspace: Expecting 1D array to populate"); + } + + if (!dst.is_c_contiguous()) { + throw py::value_error( + "usm_ndarray_linspace: Non-contiguous arrays are not supported"); + } + + sycl::queue dst_q = dst.get_queue(); + if (!dpctl::utils::queues_are_compatible(exec_q, {dst_q})) { + throw py::value_error( + "Execution queue context is not the same as allocation context"); + } + + auto array_types = dpctl::tensor::detail::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + py::ssize_t len = dst.get_shape(0); + if (len == 0) { + // nothing to do + return std::make_pair(sycl::event{}, sycl::event{}); + } + + char *dst_data = dst.get_data(); + sycl::event linspace_affine_event; + + auto fn = lin_space_affine_dispatch_vector[dst_typeid]; 
+ + linspace_affine_event = fn(exec_q, static_cast(len), start, end, + include_endpoint, dst_data, depends); + + return std::make_pair( + keep_args_alive(exec_q, {dst}, {linspace_affine_event}), + linspace_affine_event); +} + +void init_linear_sequences_dispatch_vectors(void) +{ + using namespace dpctl::tensor::detail; + using dpctl::tensor::kernels::constructors::LinSpaceAffineFactory; + using dpctl::tensor::kernels::constructors::LinSpaceStepFactory; + + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(lin_space_step_dispatch_vector); + + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(lin_space_affine_dispatch_vector); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/linear_sequences.hpp b/dpctl/tensor/libtensor/source/linear_sequences.hpp new file mode 100644 index 0000000000..b463fdf533 --- /dev/null +++ b/dpctl/tensor/libtensor/source/linear_sequences.hpp @@ -0,0 +1,59 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair +usm_ndarray_linear_sequence_step(py::object start, + py::object dt, + dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + const std::vector &depends = {}); + +extern std::pair usm_ndarray_linear_sequence_affine( + py::object start, + py::object end, + dpctl::tensor::usm_ndarray dst, + bool include_endpoint, + sycl::queue exec_q, + const std::vector &depends = {}); + +extern void init_linear_sequences_dispatch_vectors(void); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp b/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp new file mode 100644 index 0000000000..8937300047 --- /dev/null +++ b/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp @@ -0,0 +1,178 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
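The affine kernels dispatched above compute element i as start*(n - i)/n + end*i/n, where n is nelems - 1 when the endpoint is included (see LinearSequenceAffineFunctor further down in this patch). A host-only restatement with illustrative values:

// Host-only sketch of the affine combination used by the linspace kernels.
#include <cstddef>
#include <iostream>

int main()
{
    const std::size_t nelems = 5;
    const bool include_endpoint = true;
    const double start = 0.0, end = 2.0;

    const std::size_t n = include_endpoint ? nelems - 1 : nelems;
    for (std::size_t i = 0; i < nelems; ++i) {
        const double w = static_cast<double>(n - i) / n;
        const double wc = static_cast<double>(i) / n;
        std::cout << start * w + end * wc << ' '; // 0 0.5 1 1.5 2
    }
    std::cout << '\n';
    return 0;
}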
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "simplify_iteration_space.hpp" +#include "dpctl4pybind11.hpp" +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace py = pybind11; + +using dpctl::tensor::c_contiguous_strides; +using dpctl::tensor::f_contiguous_strides; + +void simplify_iteration_space(int &nd, + const py::ssize_t *&shape, + const py::ssize_t *&src_strides, + py::ssize_t src_itemsize, + bool is_src_c_contig, + bool is_src_f_contig, + const py::ssize_t *&dst_strides, + py::ssize_t dst_itemsize, + bool is_dst_c_contig, + bool is_dst_f_contig, + std::vector &simplified_shape, + std::vector &simplified_src_strides, + std::vector &simplified_dst_strides, + py::ssize_t &src_offset, + py::ssize_t &dst_offset) +{ + if (nd > 1) { + // Simplify iteration space to reduce dimensionality + // and improve access pattern + simplified_shape.reserve(nd); + for (int i = 0; i < nd; ++i) { + simplified_shape.push_back(shape[i]); + } + + simplified_src_strides.reserve(nd); + simplified_dst_strides.reserve(nd); + if (src_strides == nullptr) { + if (is_src_c_contig) { + simplified_src_strides = + c_contiguous_strides(nd, shape, src_itemsize); + } + else if (is_src_f_contig) { + simplified_src_strides = + f_contiguous_strides(nd, shape, src_itemsize); + } + else { + throw std::runtime_error( + "Source array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + for (int i = 0; i < nd; ++i) { + simplified_src_strides.push_back(src_strides[i]); + } + } + if (dst_strides == nullptr) { + if (is_dst_c_contig) { + simplified_dst_strides = + c_contiguous_strides(nd, shape, dst_itemsize); + } + else if (is_dst_f_contig) { + simplified_dst_strides = + f_contiguous_strides(nd, shape, dst_itemsize); + } + else { + throw std::runtime_error( + "Destination array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + for (int i = 0; i < nd; ++i) { + simplified_dst_strides.push_back(dst_strides[i]); + } + } + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_src_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + int contracted_nd = simplify_iteration_two_strides( + nd, simplified_shape.data(), simplified_src_strides.data(), + simplified_dst_strides.data(), + src_offset, // modified by reference + dst_offset // modified by reference + ); + simplified_shape.resize(contracted_nd); + simplified_src_strides.resize(contracted_nd); + simplified_dst_strides.resize(contracted_nd); + + nd = contracted_nd; + shape = const_cast(simplified_shape.data()); + src_strides = + const_cast(simplified_src_strides.data()); + dst_strides = + const_cast(simplified_dst_strides.data()); + } + else if (nd == 1) { + // Populate vectors + simplified_shape.reserve(nd); + simplified_shape.push_back(shape[0]); + + 
simplified_src_strides.reserve(nd); + simplified_dst_strides.reserve(nd); + + if (src_strides == nullptr) { + if (is_src_c_contig) { + simplified_src_strides.push_back(src_itemsize); + } + else if (is_src_f_contig) { + simplified_src_strides.push_back(src_itemsize); + } + else { + throw std::runtime_error( + "Source array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + simplified_src_strides.push_back(src_strides[0]); + } + if (dst_strides == nullptr) { + if (is_dst_c_contig) { + simplified_dst_strides.push_back(dst_itemsize); + } + else if (is_dst_f_contig) { + simplified_dst_strides.push_back(dst_itemsize); + } + else { + throw std::runtime_error( + "Destination array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + simplified_dst_strides.push_back(dst_strides[0]); + } + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_src_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp b/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp new file mode 100644 index 0000000000..515e795d20 --- /dev/null +++ b/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp @@ -0,0 +1,57 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
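simplify_iteration_space above delegates the actual contraction to simplify_iteration_two_strides from utils/strided_iters.hpp. As a rough illustration of the kind of fusion it performs (a simplified stand-in, not the real implementation), two adjacent dimensions can be merged when, for both operands, the outer stride equals inner_shape * inner_stride:

// Simplified host-only stand-in for the dimension contraction: a 3x4
// C-contiguous copy collapses into a single 12-element 1D copy.
#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    std::vector<std::ptrdiff_t> shape{3, 4};
    std::vector<std::ptrdiff_t> src_strides{4, 1};
    std::vector<std::ptrdiff_t> dst_strides{4, 1};

    bool can_fuse = (src_strides[0] == shape[1] * src_strides[1]) &&
                    (dst_strides[0] == shape[1] * dst_strides[1]);
    if (can_fuse) {
        shape = {shape[0] * shape[1]}; // 12
        src_strides = {src_strides[1]};
        dst_strides = {dst_strides[1]};
    }

    std::cout << "nd=" << shape.size() << ", shape[0]=" << shape[0]
              << ", stride=" << src_strides[0] << '\n'; // nd=1, shape[0]=12
    return 0;
}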
+// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include "utils/strided_iters.hpp" +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace py = pybind11; + +void simplify_iteration_space(int &, + const py::ssize_t *&, + const py::ssize_t *&, + py::ssize_t, + bool, + bool, + const py::ssize_t *&, + py::ssize_t, + bool, + bool, + std::vector &, + std::vector &, + std::vector &, + py::ssize_t &, + py::ssize_t &); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp index 7a6c1fcca6..460855e76e 100644 --- a/dpctl/tensor/libtensor/source/tensor_py.cpp +++ b/dpctl/tensor/libtensor/source/tensor_py.cpp @@ -26,2303 +26,91 @@ #include #include #include -#include -#include #include #include #include #include #include "dpctl4pybind11.hpp" + +#include "copy_and_cast_usm_to_usm.hpp" +#include "copy_for_reshape.hpp" +#include "copy_numpy_ndarray_into_usm_ndarray.hpp" +#include "device_support_queries.hpp" +#include "eye_ctor.hpp" +#include "full_ctor.hpp" +#include "linear_sequences.hpp" +#include "triul_ctor.hpp" #include "utils/strided_iters.hpp" -#include "utils/type_dispatch.hpp" namespace py = pybind11; -template class copy_cast_generic_kernel; -template class copy_cast_from_host_kernel; -template class copy_cast_spec_kernel; -template class copy_for_reshape_generic_kernel; -template class linear_sequence_step_kernel; -template class linear_sequence_affine_kernel; -template class eye_kernel; - -static dpctl::tensor::detail::usm_ndarray_types array_types; - namespace { -template struct is_complex : std::false_type -{ -}; -template struct is_complex> : std::true_type -{ -}; -template dstTy convert_impl(const srcTy &v) -{ - if constexpr (std::is_same::value) { - return v; - } - else if constexpr (std::is_same_v && is_complex::value) - { - // bool(complex_v) == (complex_v.real() != 0) && (complex_v.imag() !=0) - return (convert_impl(v.real()) || - convert_impl(v.imag())); - } - else if constexpr (is_complex::value && !is_complex::value) { - // real_t(complex_v) == real_t(complex_v.real()) - return convert_impl(v.real()); - } - else if constexpr (!std::is_integral::value && - !std::is_same::value && - std::is_integral::value && - std::is_unsigned::value) - { - // first cast to signed variant, the cast to unsigned one - using signedT = typename std::make_signed::type; - return static_cast(convert_impl(v)); - } - else { - return static_cast(v); - } -} - -template class Caster -{ -public: - Caster() = default; - void operator()(char *src, - std::ptrdiff_t src_offset, - char *dst, - std::ptrdiff_t dst_offset) const - { - srcT *src_ = reinterpret_cast(src) + src_offset; - dstT *dst_ = reinterpret_cast(dst) + dst_offset; - *dst_ = convert_impl(*src_); - } -}; - -template class GenericCopyFunctor -{ -private: - char *src_ = nullptr; - char *dst_ = nullptr; - py::ssize_t *shape_strides_ = nullptr; - int nd_ = 0; - py::ssize_t src_offset0 = 0; - py::ssize_t dst_offset0 = 0; - -public: - GenericCopyFunctor(char *src_cp, - char *dst_cp, - py::ssize_t *shape_strides, - int nd, - py::ssize_t src_offset, - py::ssize_t dst_offset) - : src_(src_cp), dst_(dst_cp), shape_strides_(shape_strides), nd_(nd), - src_offset0(src_offset), 
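The convert_impl template removed from tensor_py.cpp here encodes a few non-obvious casting rules: complex to bool tests both components, complex to real keeps the real part, and floating point to unsigned integer is routed through the signed type first. A host-only restatement in plain C++ (values are illustrative):

// Host-only restatement of the casting rules in the removed convert_impl.
#include <complex>
#include <cstdint>
#include <iostream>

int main()
{
    std::complex<float> z{0.0f, 2.5f};

    bool as_bool = (z.real() != 0.0f) || (z.imag() != 0.0f); // complex -> bool
    float as_real = z.real();                                // complex -> real

    double negative = -1.0;
    // first cast to the signed variant, then to the unsigned one
    std::uint32_t as_u32 =
        static_cast<std::uint32_t>(static_cast<std::int32_t>(negative));

    std::cout << as_bool << ' ' << as_real << ' ' << as_u32 << '\n';
    // prints: 1 0 4294967295
    return 0;
}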
dst_offset0(dst_offset) - { - } - - void operator()(sycl::id<1> wiid) const - { - py::ssize_t src_offset(0); - py::ssize_t dst_offset(0); - CIndexer_vector indxr(nd_); - indxr.get_displacement( - static_cast(wiid.get(0)), - const_cast(shape_strides_), // common shape - const_cast(shape_strides_ + - nd_), // src strides - const_cast(shape_strides_ + - 2 * nd_), // dst strides - src_offset, // modified by reference - dst_offset // modified by reference - ); - CastFnT fn{}; - fn(src_, src_offset0 + src_offset, dst_, dst_offset0 + dst_offset); - } -}; - -template class NDSpecializedCopyFunctor -{ -private: - char *src_ = nullptr; - char *dst_ = nullptr; - CIndexer_array indxr; - const std::array src_strides_; - const std::array dst_strides_; - static const int nd_ = nd; - py::ssize_t src_offset0 = 0; - py::ssize_t dst_offset0 = 0; - -public: - NDSpecializedCopyFunctor(char *src_cp, // USM pointer - char *dst_cp, // USM pointer - const std::array shape, - const std::array src_strides, - const std::array dst_strides, - py::ssize_t src_offset, - py::ssize_t dst_offset) - : src_(src_cp), dst_(dst_cp), indxr(shape), src_strides_(src_strides), - dst_strides_(dst_strides), src_offset0(src_offset), - dst_offset0(dst_offset) - { - } - - void operator()(sycl::id<1> wiid) const - { - py::ssize_t src_offset = 0; - py::ssize_t dst_offset = 0; - CIndexer_array local_indxr(std::move(indxr)); - - local_indxr.set(wiid.get(0)); - auto mi = local_indxr.get(); - for (int i = 0; i < nd; ++i) - src_offset += mi[i] * src_strides_[i]; - for (int i = 0; i < nd; ++i) - dst_offset += mi[i] * dst_strides_[i]; - - CastFnT fn{}; - fn(src_, src_offset0 + src_offset, dst_, dst_offset0 + dst_offset); - } -}; - -typedef sycl::event (*copy_and_cast_generic_fn_ptr_t)( - sycl::queue, - size_t, - int, - py::ssize_t *, - char *, - py::ssize_t, - char *, - py::ssize_t, - const std::vector &, - const std::vector &); - -template -sycl::event -copy_and_cast_generic_impl(sycl::queue q, - size_t nelems, - int nd, - py::ssize_t *shape_and_strides, - char *src_p, - py::ssize_t src_offset, - char *dst_p, - py::ssize_t dst_offset, - const std::vector &depends, - const std::vector &additional_depends) -{ - sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.depends_on(additional_depends); - cgh.parallel_for>( - sycl::range<1>(nelems), - GenericCopyFunctor>( - src_p, dst_p, shape_and_strides, nd, src_offset, dst_offset)); - }); - - return copy_and_cast_ev; -} - -typedef sycl::event (*copy_and_cast_1d_fn_ptr_t)( - sycl::queue, - size_t, - const std::array, - const std::array, - const std::array, - char *, - py::ssize_t, - char *, - py::ssize_t, - const std::vector &); - -typedef sycl::event (*copy_and_cast_2d_fn_ptr_t)( - sycl::queue, - size_t, - const std::array, - const std::array, - const std::array, - char *, - py::ssize_t, - char *, - py::ssize_t, - const std::vector &); - -template -sycl::event -copy_and_cast_nd_specialized_impl(sycl::queue q, - size_t nelems, - const std::array shape, - const std::array src_strides, - const std::array dst_strides, - char *src_p, - py::ssize_t src_offset, - char *dst_p, - py::ssize_t dst_offset, - const std::vector &depends) -{ - sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.parallel_for>( - sycl::range<1>(nelems), - NDSpecializedCopyFunctor>( - src_p, dst_p, shape, src_strides, dst_strides, src_offset, - dst_offset)); - }); - - return copy_and_cast_ev; -} - -namespace _ns = dpctl::tensor::detail; - -static 
copy_and_cast_generic_fn_ptr_t - copy_and_cast_generic_dispatch_table[_ns::num_types][_ns::num_types]; -static copy_and_cast_1d_fn_ptr_t - copy_and_cast_1d_dispatch_table[_ns::num_types][_ns::num_types]; -static copy_and_cast_2d_fn_ptr_t - copy_and_cast_2d_dispatch_table[_ns::num_types][_ns::num_types]; - -template struct CopyAndCastGenericFactory -{ - fnT get() - { - fnT f = copy_and_cast_generic_impl; - return f; - } -}; - -template struct CopyAndCast1DFactory -{ - fnT get() - { - fnT f = copy_and_cast_nd_specialized_impl; - return f; - } -}; - -template struct CopyAndCast2DFactory -{ - fnT get() - { - fnT f = copy_and_cast_nd_specialized_impl; - return f; - } -}; - -std::vector c_contiguous_strides(int nd, - const py::ssize_t *shape, - py::ssize_t element_size = 1) -{ - if (nd > 0) { - std::vector c_strides(nd, element_size); - for (int ic = nd - 1; ic > 0;) { - py::ssize_t next_v = c_strides[ic] * shape[ic]; - c_strides[--ic] = next_v; - } - return c_strides; - } - else { - return std::vector(); - } -} - -std::vector f_contiguous_strides(int nd, - const py::ssize_t *shape, - py::ssize_t element_size = 1) -{ - if (nd > 0) { - std::vector f_strides(nd, element_size); - for (int i = 0; i < nd - 1;) { - py::ssize_t next_v = f_strides[i] * shape[i]; - f_strides[++i] = next_v; - } - return f_strides; - } - else { - return std::vector(); - } -} - -using dpctl::utils::keep_args_alive; - -void simplify_iteration_space(int &nd, - const py::ssize_t *&shape, - const py::ssize_t *&src_strides, - py::ssize_t src_itemsize, - bool is_src_c_contig, - bool is_src_f_contig, - const py::ssize_t *&dst_strides, - py::ssize_t dst_itemsize, - bool is_dst_c_contig, - bool is_dst_f_contig, - std::vector &simplified_shape, - std::vector &simplified_src_strides, - std::vector &simplified_dst_strides, - py::ssize_t &src_offset, - py::ssize_t &dst_offset) -{ - if (nd > 1) { - // Simplify iteration space to reduce dimensionality - // and improve access pattern - simplified_shape.reserve(nd); - for (int i = 0; i < nd; ++i) { - simplified_shape.push_back(shape[i]); - } - - simplified_src_strides.reserve(nd); - simplified_dst_strides.reserve(nd); - if (src_strides == nullptr) { - if (is_src_c_contig) { - simplified_src_strides = - c_contiguous_strides(nd, shape, src_itemsize); - } - else if (is_src_f_contig) { - simplified_src_strides = - f_contiguous_strides(nd, shape, src_itemsize); - } - else { - throw std::runtime_error( - "Source array has null strides " - "but has neither C- nor F- contiguous flag set"); - } - } - else { - for (int i = 0; i < nd; ++i) { - simplified_src_strides.push_back(src_strides[i]); - } - } - if (dst_strides == nullptr) { - if (is_dst_c_contig) { - simplified_dst_strides = - c_contiguous_strides(nd, shape, dst_itemsize); - } - else if (is_dst_f_contig) { - simplified_dst_strides = - f_contiguous_strides(nd, shape, dst_itemsize); - } - else { - throw std::runtime_error( - "Destination array has null strides " - "but has neither C- nor F- contiguous flag set"); - } - } - else { - for (int i = 0; i < nd; ++i) { - simplified_dst_strides.push_back(dst_strides[i]); - } - } - - assert(simplified_shape.size() == static_cast(nd)); - assert(simplified_src_strides.size() == static_cast(nd)); - assert(simplified_dst_strides.size() == static_cast(nd)); - int contracted_nd = simplify_iteration_two_strides( - nd, simplified_shape.data(), simplified_src_strides.data(), - simplified_dst_strides.data(), - src_offset, // modified by reference - dst_offset // modified by reference - ); - 
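c_contiguous_strides and f_contiguous_strides, removed from tensor_py.cpp here (the new sources pull them in via using dpctl::tensor::c_contiguous_strides), synthesize strides from a shape: C-order strides are accumulated right to left starting from the element size, F-order strides left to right. A host-only sketch with an illustrative shape:

// Host-only sketch of C- and F-order stride synthesis for shape {2, 3, 4}.
#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    const std::vector<std::ptrdiff_t> shape{2, 3, 4};
    const std::ptrdiff_t element_size = 1; // strides expressed in elements

    std::vector<std::ptrdiff_t> c_strides(shape.size(), element_size);
    for (std::size_t ic = shape.size() - 1; ic > 0; --ic) {
        c_strides[ic - 1] = c_strides[ic] * shape[ic];
    }

    std::vector<std::ptrdiff_t> f_strides(shape.size(), element_size);
    for (std::size_t i = 0; i + 1 < shape.size(); ++i) {
        f_strides[i + 1] = f_strides[i] * shape[i];
    }

    for (auto s : c_strides)
        std::cout << s << ' '; // 12 4 1
    std::cout << '\n';
    for (auto s : f_strides)
        std::cout << s << ' '; // 1 2 6
    std::cout << '\n';
    return 0;
}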
simplified_shape.resize(contracted_nd); - simplified_src_strides.resize(contracted_nd); - simplified_dst_strides.resize(contracted_nd); - - nd = contracted_nd; - shape = const_cast(simplified_shape.data()); - src_strides = - const_cast(simplified_src_strides.data()); - dst_strides = - const_cast(simplified_dst_strides.data()); - } - else if (nd == 1) { - // Populate vectors - simplified_shape.reserve(nd); - simplified_shape.push_back(shape[0]); - - simplified_src_strides.reserve(nd); - simplified_dst_strides.reserve(nd); - - if (src_strides == nullptr) { - if (is_src_c_contig) { - simplified_src_strides.push_back(src_itemsize); - } - else if (is_src_f_contig) { - simplified_src_strides.push_back(src_itemsize); - } - else { - throw std::runtime_error( - "Source array has null strides " - "but has neither C- nor F- contiguous flag set"); - } - } - else { - simplified_src_strides.push_back(src_strides[0]); - } - if (dst_strides == nullptr) { - if (is_dst_c_contig) { - simplified_dst_strides.push_back(dst_itemsize); - } - else if (is_dst_f_contig) { - simplified_dst_strides.push_back(dst_itemsize); - } - else { - throw std::runtime_error( - "Destination array has null strides " - "but has neither C- nor F- contiguous flag set"); - } - } - else { - simplified_dst_strides.push_back(dst_strides[0]); - } - - assert(simplified_shape.size() == static_cast(nd)); - assert(simplified_src_strides.size() == static_cast(nd)); - assert(simplified_dst_strides.size() == static_cast(nd)); - } -} - -sycl::event _populate_packed_shape_strides_for_copycast_kernel( - sycl::queue exec_q, - py::ssize_t *device_shape_strides, // to be populated - const std::vector &common_shape, - const std::vector &src_strides, - const std::vector &dst_strides) -{ - // memory transfer optimization, use USM-host for temporary speeds up - // tranfer to device, especially on dGPUs - using usm_host_allocatorT = - sycl::usm_allocator; - using shT = std::vector; - size_t nd = common_shape.size(); - - usm_host_allocatorT allocator(exec_q); - - // create host temporary for packed shape and strides managed by shared - // pointer. 
Packed vector is concatenation of common_shape, src_stride and - // std_strides - std::shared_ptr shp_host_shape_strides = - std::make_shared(3 * nd, allocator); - std::copy(common_shape.begin(), common_shape.end(), - shp_host_shape_strides->begin()); - - std::copy(src_strides.begin(), src_strides.end(), - shp_host_shape_strides->begin() + nd); - - std::copy(dst_strides.begin(), dst_strides.end(), - shp_host_shape_strides->begin() + 2 * nd); - - sycl::event copy_shape_ev = exec_q.copy( - shp_host_shape_strides->data(), device_shape_strides, - shp_host_shape_strides->size()); - - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(copy_shape_ev); - cgh.host_task([shp_host_shape_strides]() { - // increment shared pointer ref-count to keep it alive - // till copy operation completes; - }); - }); - - return copy_shape_ev; -} - -std::pair -copy_usm_ndarray_into_usm_ndarray(dpctl::tensor::usm_ndarray src, - dpctl::tensor::usm_ndarray dst, - sycl::queue exec_q, - const std::vector &depends = {}) -{ - - // array dimensions must be the same - int src_nd = src.get_ndim(); - int dst_nd = dst.get_ndim(); - if (src_nd != dst_nd) { - throw py::value_error("Array dimensions are not the same."); - } - - // shapes must be the same - const py::ssize_t *src_shape = src.get_shape_raw(); - const py::ssize_t *dst_shape = dst.get_shape_raw(); - - bool shapes_equal(true); - size_t src_nelems(1); - - for (int i = 0; i < src_nd; ++i) { - src_nelems *= static_cast(src_shape[i]); - shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]); - } - if (!shapes_equal) { - throw py::value_error("Array shapes are not the same."); - } - - if (src_nelems == 0) { - // nothing to do - return std::make_pair(sycl::event(), sycl::event()); - } - - auto dst_offsets = dst.get_minmax_offsets(); - // destination must be ample enough to accomodate all elements - { - size_t range = - static_cast(dst_offsets.second - dst_offsets.first); - if (range + 1 < src_nelems) { - throw py::value_error( - "Destination array can not accomodate all the " - "elements of source array."); - } - } - - // check compatibility of execution queue and allocation queue - sycl::queue src_q = src.get_queue(); - sycl::queue dst_q = dst.get_queue(); - - if (!dpctl::utils::queues_are_compatible(exec_q, {src_q, dst_q})) { - throw py::value_error( - "Execution queue is not compatible with allocation queues"); - } - - int src_typenum = src.get_typenum(); - int dst_typenum = dst.get_typenum(); - - int src_type_id = array_types.typenum_to_lookup_id(src_typenum); - int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); - - char *src_data = src.get_data(); - char *dst_data = dst.get_data(); - - // check that arrays do not overlap, and concurrent copying is safe. 
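The packing helper above concatenates the common shape and both stride vectors into a single buffer of length 3*nd, builds it in USM-host memory, and copies it to the device in one transfer, keeping the host temporary alive in a host_task until the copy completes. The layout itself is simple enough to show on the host (illustrative values):

// Host-only sketch of the [common_shape | src_strides | dst_strides] layout.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    std::vector<std::ptrdiff_t> common_shape{3, 4};
    std::vector<std::ptrdiff_t> src_strides{4, 1};
    std::vector<std::ptrdiff_t> dst_strides{1, 3}; // e.g. F-contiguous dst

    const std::size_t nd = common_shape.size();
    std::vector<std::ptrdiff_t> packed(3 * nd);

    std::copy(common_shape.begin(), common_shape.end(), packed.begin());
    std::copy(src_strides.begin(), src_strides.end(), packed.begin() + nd);
    std::copy(dst_strides.begin(), dst_strides.end(), packed.begin() + 2 * nd);

    for (std::ptrdiff_t v : packed) {
        std::cout << v << ' '; // 3 4 4 1 1 3
    }
    std::cout << '\n';
    return 0;
}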
- auto src_offsets = src.get_minmax_offsets(); - int src_elem_size = src.get_elemsize(); - int dst_elem_size = dst.get_elemsize(); - - bool memory_overlap = - ((dst_data - src_data > src_offsets.second * src_elem_size - - dst_offsets.first * dst_elem_size) && - (src_data - dst_data > dst_offsets.second * dst_elem_size - - src_offsets.first * src_elem_size)); - if (memory_overlap) { - // TODO: could use a temporary, but this is done by the caller - throw py::value_error("Arrays index overlapping segments of memory"); - } - - int src_flags = src.get_flags(); - int dst_flags = dst.get_flags(); - - // check for applicability of special cases: - // (same type && (both C-contiguous || both F-contiguous) - bool both_c_contig = ((src_flags & USM_ARRAY_C_CONTIGUOUS) && - (dst_flags & USM_ARRAY_C_CONTIGUOUS)); - bool both_f_contig = ((src_flags & USM_ARRAY_F_CONTIGUOUS) && - (dst_flags & USM_ARRAY_F_CONTIGUOUS)); - if (both_c_contig || both_f_contig) { - if (src_type_id == dst_type_id) { - - sycl::event copy_ev = - exec_q.memcpy(static_cast(dst_data), - static_cast(src_data), - src_nelems * src_elem_size, depends); - - // make sure src and dst are not GC-ed before copy_ev is complete - return std::make_pair( - keep_args_alive(exec_q, {src, dst}, {copy_ev}), copy_ev); - } - // With contract_iter2 in place, there is no need to write - // dedicated kernels for casting between contiguous arrays - } - - const py::ssize_t *src_strides = src.get_strides_raw(); - const py::ssize_t *dst_strides = dst.get_strides_raw(); - - using shT = std::vector; - shT simplified_shape; - shT simplified_src_strides; - shT simplified_dst_strides; - py::ssize_t src_offset(0); - py::ssize_t dst_offset(0); - - int nd = src_nd; - const py::ssize_t *shape = src_shape; - - bool is_src_c_contig = ((src_flags & USM_ARRAY_C_CONTIGUOUS) != 0); - bool is_src_f_contig = ((src_flags & USM_ARRAY_F_CONTIGUOUS) != 0); - - bool is_dst_c_contig = ((dst_flags & USM_ARRAY_C_CONTIGUOUS) != 0); - bool is_dst_f_contig = ((dst_flags & USM_ARRAY_F_CONTIGUOUS) != 0); - - constexpr py::ssize_t src_itemsize = 1; // in elements - constexpr py::ssize_t dst_itemsize = 1; // in elements - - // all args except itemsizes and is_?_contig bools can be modified by - // reference - simplify_iteration_space(nd, shape, src_strides, src_itemsize, - is_src_c_contig, is_src_f_contig, dst_strides, - dst_itemsize, is_dst_c_contig, is_dst_f_contig, - simplified_shape, simplified_src_strides, - simplified_dst_strides, src_offset, dst_offset); - - if (nd < 3) { - if (nd == 1) { - std::array shape_arr = {shape[0]}; - // strides may be null - std::array src_strides_arr = { - (src_strides ? src_strides[0] : 1)}; - std::array dst_strides_arr = { - (dst_strides ? 
dst_strides[0] : 1)}; - - auto fn = copy_and_cast_1d_dispatch_table[dst_type_id][src_type_id]; - sycl::event copy_and_cast_1d_event = fn( - exec_q, src_nelems, shape_arr, src_strides_arr, dst_strides_arr, - src_data, src_offset, dst_data, dst_offset, depends); - - return std::make_pair( - keep_args_alive(exec_q, {src, dst}, {copy_and_cast_1d_event}), - copy_and_cast_1d_event); - } - else if (nd == 2) { - std::array shape_arr = {shape[0], shape[1]}; - std::array src_strides_arr = {src_strides[0], - src_strides[1]}; - std::array dst_strides_arr = {dst_strides[0], - dst_strides[1]}; - - auto fn = copy_and_cast_2d_dispatch_table[dst_type_id][src_type_id]; - sycl::event copy_and_cast_2d_event = fn( - exec_q, src_nelems, shape_arr, src_strides_arr, dst_strides_arr, - src_data, src_offset, dst_data, dst_offset, depends); - - return std::make_pair( - keep_args_alive(exec_q, {src, dst}, {copy_and_cast_2d_event}), - copy_and_cast_2d_event); - } - else if (nd == 0) { // case of a scalar - assert(src_nelems == 1); - std::array shape_arr = {1}; - std::array src_strides_arr = {1}; - std::array dst_strides_arr = {1}; - - auto fn = copy_and_cast_1d_dispatch_table[dst_type_id][src_type_id]; - sycl::event copy_and_cast_0d_event = fn( - exec_q, src_nelems, shape_arr, src_strides_arr, dst_strides_arr, - src_data, src_offset, dst_data, dst_offset, depends); - - return std::make_pair( - keep_args_alive(exec_q, {src, dst}, {copy_and_cast_0d_event}), - copy_and_cast_0d_event); - } - } - - // Generic implementation - auto copy_and_cast_fn = - copy_and_cast_generic_dispatch_table[dst_type_id][src_type_id]; +using dpctl::tensor::c_contiguous_strides; +using dpctl::tensor::f_contiguous_strides; - // If shape/strides are accessed with accessors, buffer destructor - // will force syncronization. 
- py::ssize_t *shape_strides = - sycl::malloc_device(3 * nd, exec_q); +using dpctl::tensor::py_internal::copy_usm_ndarray_into_usm_ndarray; - if (shape_strides == nullptr) { - throw std::runtime_error("Unabled to allocate device memory"); - } +/* =========================== Copy for reshape ============================= */ - sycl::event copy_shape_ev = - _populate_packed_shape_strides_for_copycast_kernel( - exec_q, shape_strides, simplified_shape, simplified_src_strides, - simplified_dst_strides); - - sycl::event copy_and_cast_generic_ev = copy_and_cast_fn( - exec_q, src_nelems, nd, shape_strides, src_data, src_offset, dst_data, - dst_offset, depends, {copy_shape_ev}); - - // async free of shape_strides temporary - auto ctx = exec_q.get_context(); - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(copy_and_cast_generic_ev); - cgh.host_task( - [ctx, shape_strides]() { sycl::free(shape_strides, ctx); }); - }); - - return std::make_pair( - keep_args_alive(exec_q, {src, dst}, {copy_and_cast_generic_ev}), - copy_and_cast_generic_ev); -} - -/* =========================== Copy for reshape ============================== - */ - -template class GenericCopyForReshapeFunctor -{ -private: - py::ssize_t offset = 0; - py::ssize_t size = 1; - int src_nd = -1; - int dst_nd = -1; - // USM array of size 2*(src_nd + dst_nd) - // [ src_shape; src_strides; dst_shape; dst_strides ] - const py::ssize_t *src_dst_shapes_and_strides = nullptr; - Ty *src_p = nullptr; - Ty *dst_p = nullptr; - -public: - GenericCopyForReshapeFunctor(py::ssize_t shift, - py::ssize_t nelems, - int src_ndim, - int dst_ndim, - const py::ssize_t *packed_shapes_and_strides, - char *src_ptr, - char *dst_ptr) - : offset(shift), size(nelems), src_nd(src_ndim), dst_nd(dst_ndim), - src_dst_shapes_and_strides(packed_shapes_and_strides), - src_p(reinterpret_cast(src_ptr)), - dst_p(reinterpret_cast(dst_ptr)) - { - } - - void operator()(sycl::id<1> wiid) const - { - py::ssize_t this_src_offset(0); - CIndexer_vector src_indxr(src_nd); - - src_indxr.get_displacement( - static_cast(wiid.get(0)), - const_cast( - src_dst_shapes_and_strides), // src shape - const_cast(src_dst_shapes_and_strides + - src_nd), // src strides - this_src_offset // modified by reference - ); - const Ty *in = src_p + this_src_offset; - - py::ssize_t this_dst_offset(0); - CIndexer_vector dst_indxr(dst_nd); - py::ssize_t shifted_wiid = - (static_cast(wiid.get(0)) + offset) % size; - shifted_wiid = (shifted_wiid >= 0) ? 
shifted_wiid : shifted_wiid + size; - dst_indxr.get_displacement( - shifted_wiid, - const_cast(src_dst_shapes_and_strides + - 2 * src_nd), // dst shape - const_cast(src_dst_shapes_and_strides + - 2 * src_nd + dst_nd), // dst strides - this_dst_offset // modified by reference - ); - - Ty *out = dst_p + this_dst_offset; - *out = *in; - } -}; - -// define function type -typedef sycl::event (*copy_for_reshape_fn_ptr_t)( - sycl::queue, - py::ssize_t, // shift - size_t, // num_elements - int, - int, // src_nd, dst_nd - py::ssize_t *, // packed shapes and strides - char *, // src_data_ptr - char *, // dst_data_ptr - const std::vector &); - -template -sycl::event -copy_for_reshape_generic_impl(sycl::queue q, - py::ssize_t shift, - size_t nelems, - int src_nd, - int dst_nd, - py::ssize_t *packed_shapes_and_strides, - char *src_p, - char *dst_p, - const std::vector &depends) -{ - sycl::event copy_for_reshape_ev = q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.parallel_for>( - sycl::range<1>(nelems), - GenericCopyForReshapeFunctor(shift, nelems, src_nd, dst_nd, - packed_shapes_and_strides, src_p, - dst_p)); - }); - - return copy_for_reshape_ev; -} - -// define static vector -static copy_for_reshape_fn_ptr_t - copy_for_reshape_generic_dispatch_vector[_ns::num_types]; - -template struct CopyForReshapeGenericFactory -{ - fnT get() - { - fnT f = copy_for_reshape_generic_impl; - return f; - } -}; - -/* - * Copies src into dst (same data type) of different shapes by using flat - * iterations. - * - * Equivalent to the following loop: - * - * for i for range(src.size): - * dst[np.multi_index(i, dst.shape)] = src[np.multi_index(i, src.shape)] - */ -std::pair -copy_usm_ndarray_for_reshape(dpctl::tensor::usm_ndarray src, - dpctl::tensor::usm_ndarray dst, - py::ssize_t shift, - sycl::queue exec_q, - const std::vector &depends = {}) -{ - py::ssize_t src_nelems = src.get_size(); - py::ssize_t dst_nelems = dst.get_size(); - - // Must have the same number of elements - if (src_nelems != dst_nelems) { - throw py::value_error( - "copy_usm_ndarray_for_reshape requires src and dst to " - "have the same number of elements."); - } - - int src_typenum = src.get_typenum(); - int dst_typenum = dst.get_typenum(); - - // typenames must be the same - if (src_typenum != dst_typenum) { - throw py::value_error( - "copy_usm_ndarray_for_reshape requires src and dst to " - "have the same type."); - } - - if (src_nelems == 0) { - return std::make_pair(sycl::event(), sycl::event()); - } - - // destination must be ample enough to accomodate all elements - { - auto dst_offsets = dst.get_minmax_offsets(); - py::ssize_t range = - static_cast(dst_offsets.second - dst_offsets.first); - if (range + 1 < src_nelems) { - throw py::value_error( - "Destination array can not accomodate all the " - "elements of source array."); - } - } - - // check same contexts - sycl::queue src_q = src.get_queue(); - sycl::queue dst_q = dst.get_queue(); - - if (!dpctl::utils::queues_are_compatible(exec_q, {src_q, dst_q})) { - throw py::value_error( - "Execution queue is not compatible with allocation queues"); - } - - if (src_nelems == 1) { - // handle special case of 1-element array - int src_elemsize = src.get_elemsize(); - char *src_data = src.get_data(); - char *dst_data = dst.get_data(); - sycl::event copy_ev = - exec_q.copy(src_data, dst_data, src_elemsize); - return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}), - copy_ev); - } - - // dimensions may be different - int src_nd = src.get_ndim(); - int dst_nd = 
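The reshape copy described by the comment above walks a single flat index over both arrays and converts it into per-array displacements. The plain C++ sketch below mimics that with a hypothetical unravel() helper standing in for CIndexer_vector::get_displacement; shapes, strides and the shift are illustrative:

// Host-only sketch: dst[unravel(shifted_i, dst_shape)] = src[unravel(i, src_shape)].
#include <cstddef>
#include <iostream>
#include <vector>

static std::ptrdiff_t unravel(std::ptrdiff_t flat,
                              const std::vector<std::ptrdiff_t> &shape,
                              const std::vector<std::ptrdiff_t> &strides)
{
    std::ptrdiff_t offset = 0;
    for (std::size_t d = shape.size(); d-- > 0;) {
        offset += (flat % shape[d]) * strides[d]; // last dimension varies fastest
        flat /= shape[d];
    }
    return offset;
}

int main()
{
    const std::vector<int> src{1, 2, 3, 4, 5, 6}; // 2x3, C-order
    std::vector<int> dst(6, 0);                   // 3x2, C-order
    const std::vector<std::ptrdiff_t> src_shape{2, 3}, src_strides{3, 1};
    const std::vector<std::ptrdiff_t> dst_shape{3, 2}, dst_strides{2, 1};
    const std::ptrdiff_t size = 6, shift = 2;

    for (std::ptrdiff_t i = 0; i < size; ++i) {
        std::ptrdiff_t j = (i + shift) % size;
        if (j < 0) {
            j += size; // same adjustment as the negative-shift branch above
        }
        dst[unravel(j, dst_shape, dst_strides)] =
            src[unravel(i, src_shape, src_strides)];
    }

    for (int v : dst) {
        std::cout << v << ' '; // 5 6 1 2 3 4
    }
    std::cout << '\n';
    return 0;
}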
dst.get_ndim(); - - const py::ssize_t *src_shape = src.get_shape_raw(); - const py::ssize_t *dst_shape = dst.get_shape_raw(); - - int type_id = array_types.typenum_to_lookup_id(src_typenum); - - auto fn = copy_for_reshape_generic_dispatch_vector[type_id]; - - // packed_shape_strides = [src_shape, src_strides, dst_shape, dst_strides] - py::ssize_t *packed_shapes_strides = - sycl::malloc_device(2 * (src_nd + dst_nd), exec_q); - - if (packed_shapes_strides == nullptr) { - throw std::runtime_error("Unabled to allocate device memory"); - } - - using usm_host_allocatorT = - sycl::usm_allocator; - using shT = std::vector; - usm_host_allocatorT allocator(exec_q); - std::shared_ptr packed_host_shapes_strides_shp = - std::make_shared(2 * (src_nd + dst_nd), allocator); - - std::copy(src_shape, src_shape + src_nd, - packed_host_shapes_strides_shp->begin()); - std::copy(dst_shape, dst_shape + dst_nd, - packed_host_shapes_strides_shp->begin() + 2 * src_nd); - - const py::ssize_t *src_strides = src.get_strides_raw(); - if (src_strides == nullptr) { - int src_flags = src.get_flags(); - if (src_flags & USM_ARRAY_C_CONTIGUOUS) { - const auto &src_contig_strides = - c_contiguous_strides(src_nd, src_shape); - std::copy(src_contig_strides.begin(), src_contig_strides.end(), - packed_host_shapes_strides_shp->begin() + src_nd); - } - else if (src_flags & USM_ARRAY_F_CONTIGUOUS) { - const auto &src_contig_strides = - c_contiguous_strides(src_nd, src_shape); - std::copy(src_contig_strides.begin(), src_contig_strides.end(), - packed_host_shapes_strides_shp->begin() + src_nd); - } - else { - sycl::free(packed_shapes_strides, exec_q); - throw std::runtime_error( - "Invalid src array encountered: in copy_for_reshape function"); - } - } - else { - std::copy(src_strides, src_strides + src_nd, - packed_host_shapes_strides_shp->begin() + src_nd); - } - - const py::ssize_t *dst_strides = dst.get_strides_raw(); - if (dst_strides == nullptr) { - int dst_flags = dst.get_flags(); - if (dst_flags & USM_ARRAY_C_CONTIGUOUS) { - const auto &dst_contig_strides = - c_contiguous_strides(dst_nd, dst_shape); - std::copy(dst_contig_strides.begin(), dst_contig_strides.end(), - packed_host_shapes_strides_shp->begin() + 2 * src_nd + - dst_nd); - } - else if (dst_flags & USM_ARRAY_F_CONTIGUOUS) { - const auto &dst_contig_strides = - f_contiguous_strides(dst_nd, dst_shape); - std::copy(dst_contig_strides.begin(), dst_contig_strides.end(), - packed_host_shapes_strides_shp->begin() + 2 * src_nd + - dst_nd); - } - else { - sycl::free(packed_shapes_strides, exec_q); - throw std::runtime_error( - "Invalid dst array encountered: in copy_for_reshape function"); - } - } - else { - std::copy(dst_strides, dst_strides + dst_nd, - packed_host_shapes_strides_shp->begin() + 2 * src_nd + - dst_nd); - } - - // copy packed shapes and strides from host to devices - sycl::event packed_shape_strides_copy_ev = exec_q.copy( - packed_host_shapes_strides_shp->data(), packed_shapes_strides, - packed_host_shapes_strides_shp->size()); - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(packed_shape_strides_copy_ev); - cgh.host_task([packed_host_shapes_strides_shp] { - // Capturing shared pointer ensures that the underlying vector is - // not destroyed until after its data are copied into packed USM - // vector - }); - }); - - char *src_data = src.get_data(); - char *dst_data = dst.get_data(); - - std::vector all_deps(depends.size() + 1); - all_deps.push_back(packed_shape_strides_copy_ev); - all_deps.insert(std::end(all_deps), std::begin(depends), 
std::end(depends)); - - sycl::event copy_for_reshape_event = - fn(exec_q, shift, src_nelems, src_nd, dst_nd, packed_shapes_strides, - src_data, dst_data, all_deps); - - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(copy_for_reshape_event); - auto ctx = exec_q.get_context(); - cgh.host_task([packed_shapes_strides, ctx]() { - sycl::free(packed_shapes_strides, ctx); - }); - }); - - return std::make_pair( - keep_args_alive(exec_q, {src, dst}, {copy_for_reshape_event}), - copy_for_reshape_event); -} +using dpctl::tensor::py_internal::copy_usm_ndarray_for_reshape; /* ============= Copy from numpy.ndarray to usm_ndarray ==================== */ -template -class CasterForAccessor -{ -public: - CasterForAccessor() = default; - void operator()(AccessorT src, - std::ptrdiff_t src_offset, - char *dst, - std::ptrdiff_t dst_offset) const - { - dstT *dst_ = reinterpret_cast(dst) + dst_offset; - *dst_ = convert_impl(src[src_offset]); - } -}; - -template class GenericCopyFromHostFunctor -{ -private: - AccessorT src_acc_; - char *dst_ = nullptr; - py::ssize_t *shape_strides_ = nullptr; - int nd_ = 0; - py::ssize_t src_offset0 = 0; - py::ssize_t dst_offset0 = 0; - -public: - GenericCopyFromHostFunctor(AccessorT src_acc, - char *dst_cp, - py::ssize_t *shape_strides, - int nd, - py::ssize_t src_offset, - py::ssize_t dst_offset) - : src_acc_(src_acc), dst_(dst_cp), shape_strides_(shape_strides), - nd_(nd), src_offset0(src_offset), dst_offset0(dst_offset) - { - } - - void operator()(sycl::id<1> wiid) const - { - py::ssize_t src_offset(0); - py::ssize_t dst_offset(0); - CIndexer_vector indxr(nd_); - indxr.get_displacement( - static_cast(wiid.get(0)), - const_cast(shape_strides_), // common shape - const_cast(shape_strides_ + - nd_), // src strides - const_cast(shape_strides_ + - 2 * nd_), // dst strides - src_offset, // modified by reference - dst_offset // modified by reference - ); - CastFnT fn{}; - fn(src_acc_, src_offset0 + src_offset, dst_, dst_offset0 + dst_offset); - } -}; - -typedef void (*copy_and_cast_from_host_blocking_fn_ptr_t)( - sycl::queue, - size_t, - int, - py::ssize_t *, - const char *, - py::ssize_t, - py::ssize_t, - py::ssize_t, - char *, - py::ssize_t, - const std::vector &, - const std::vector &); - -template -void copy_and_cast_from_host_impl( - sycl::queue q, - size_t nelems, - int nd, - py::ssize_t *shape_and_strides, - const char *host_src_p, - py::ssize_t src_offset, - py::ssize_t src_min_nelem_offset, - py::ssize_t src_max_nelem_offset, - char *dst_p, - py::ssize_t dst_offset, - const std::vector &depends, - const std::vector &additional_depends) -{ - py::ssize_t nelems_range = src_max_nelem_offset - src_min_nelem_offset + 1; - sycl::buffer npy_buf( - reinterpret_cast(host_src_p) + src_min_nelem_offset, - sycl::range<1>(nelems_range), {sycl::property::buffer::use_host_ptr{}}); - - sycl::event copy_and_cast_from_host_ev = q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.depends_on(additional_depends); - - sycl::accessor npy_acc(npy_buf, cgh, sycl::read_only); - - cgh.parallel_for>( - sycl::range<1>(nelems), - GenericCopyFromHostFunctor< - CasterForAccessor, - decltype(npy_acc)>(npy_acc, dst_p, shape_and_strides, nd, - src_offset - src_min_nelem_offset, - dst_offset)); - }); - - copy_and_cast_from_host_ev.wait_and_throw(); - - return; -} - -static copy_and_cast_from_host_blocking_fn_ptr_t - copy_and_cast_from_host_blocking_dispatch_table[_ns::num_types] - [_ns::num_types]; - -template -struct CopyAndCastFromHostFactory -{ - fnT get() - { - fnT f = 
copy_and_cast_from_host_impl; - return f; - } -}; - -void copy_numpy_ndarray_into_usm_ndarray( - py::array npy_src, - dpctl::tensor::usm_ndarray dst, - sycl::queue exec_q, - const std::vector &depends = {}) -{ - int src_ndim = npy_src.ndim(); - int dst_ndim = dst.get_ndim(); - - if (src_ndim != dst_ndim) { - throw py::value_error("Source ndarray and destination usm_ndarray have " - "different array ranks, " - "i.e. different number of indices needed to " - "address array elements."); - } - - const py::ssize_t *src_shape = npy_src.shape(); - const py::ssize_t *dst_shape = dst.get_shape_raw(); - bool shapes_equal(true); - size_t src_nelems(1); - for (int i = 0; i < src_ndim; ++i) { - shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]); - src_nelems *= static_cast(src_shape[i]); - } - - if (!shapes_equal) { - throw py::value_error("Source ndarray and destination usm_ndarray have " - "difference shapes."); - } - - if (src_nelems == 0) { - // nothing to do - return; - } - - auto dst_offsets = dst.get_minmax_offsets(); - // destination must be ample enough to accomodate all elements of source - // array - { - size_t range = - static_cast(dst_offsets.second - dst_offsets.first); - if (range + 1 < src_nelems) { - throw py::value_error( - "Destination array can not accomodate all the " - "elements of source array."); - } - } - - sycl::queue dst_q = dst.get_queue(); - - if (!dpctl::utils::queues_are_compatible(exec_q, {dst_q})) { - throw py::value_error("Execution queue is not compatible with the " - "allocation queue"); - } - - // here we assume that NumPy's type numbers agree with ours for types - // supported in both - int src_typenum = - py::detail::array_descriptor_proxy(npy_src.dtype().ptr())->type_num; - int dst_typenum = dst.get_typenum(); - - int src_type_id = array_types.typenum_to_lookup_id(src_typenum); - int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); - - py::buffer_info src_pybuf = npy_src.request(); - const char *const src_data = static_cast(src_pybuf.ptr); - char *dst_data = dst.get_data(); - - int src_flags = npy_src.flags(); - int dst_flags = dst.get_flags(); - - // check for applicability of special cases: - // (same type && (both C-contiguous || both F-contiguous) - bool both_c_contig = ((src_flags & py::array::c_style) && - (dst_flags & USM_ARRAY_C_CONTIGUOUS)); - bool both_f_contig = ((src_flags & py::array::f_style) && - (dst_flags & USM_ARRAY_F_CONTIGUOUS)); - if (both_c_contig || both_f_contig) { - if (src_type_id == dst_type_id) { - int src_elem_size = npy_src.itemsize(); - - sycl::event copy_ev = - exec_q.memcpy(static_cast(dst_data), - static_cast(src_data), - src_nelems * src_elem_size, depends); - - // wait for copy_ev to complete - copy_ev.wait_and_throw(); - - return; - } - // With contract_iter2 in place, there is no need to write - // dedicated kernels for casting between contiguous arrays - } - - const py::ssize_t *src_strides = - npy_src.strides(); // N.B.: strides in bytes - const py::ssize_t *dst_strides = - dst.get_strides_raw(); // N.B.: strides in elements - - using shT = std::vector; - shT simplified_shape; - shT simplified_src_strides; - shT simplified_dst_strides; - py::ssize_t src_offset(0); - py::ssize_t dst_offset(0); - - py::ssize_t src_itemsize = npy_src.itemsize(); // item size in bytes - constexpr py::ssize_t dst_itemsize = 1; // item size in elements - - int nd = src_ndim; - const py::ssize_t *shape = src_shape; - - bool is_src_c_contig = ((src_flags & py::array::c_style) != 0); - bool is_src_f_contig = ((src_flags & 
py::array::f_style) != 0); - - bool is_dst_c_contig = ((dst_flags & USM_ARRAY_C_CONTIGUOUS) != 0); - bool is_dst_f_contig = ((dst_flags & USM_ARRAY_F_CONTIGUOUS) != 0); - - // all args except itemsizes and is_?_contig bools can be modified by - // reference - simplify_iteration_space(nd, shape, src_strides, src_itemsize, - is_src_c_contig, is_src_f_contig, dst_strides, - dst_itemsize, is_dst_c_contig, is_dst_f_contig, - simplified_shape, simplified_src_strides, - simplified_dst_strides, src_offset, dst_offset); - - assert(simplified_shape.size() == static_cast(nd)); - assert(simplified_src_strides.size() == static_cast(nd)); - assert(simplified_dst_strides.size() == static_cast(nd)); - - // handle nd == 0 - if (nd == 0) { - nd = 1; - simplified_shape.reserve(nd); - simplified_shape.push_back(1); - - simplified_src_strides.reserve(nd); - simplified_src_strides.push_back(src_itemsize); - - simplified_dst_strides.reserve(nd); - simplified_dst_strides.push_back(dst_itemsize); - } - - // Minumum and maximum element offsets for source np.ndarray - py::ssize_t npy_src_min_nelem_offset(0); - py::ssize_t npy_src_max_nelem_offset(0); - for (int i = 0; i < nd; ++i) { - // convert source strides from bytes to elements - simplified_src_strides[i] = simplified_src_strides[i] / src_itemsize; - if (simplified_src_strides[i] < 0) { - npy_src_min_nelem_offset += - simplified_src_strides[i] * (simplified_shape[i] - 1); - } - else { - npy_src_max_nelem_offset += - simplified_src_strides[i] * (simplified_shape[i] - 1); - } - } - - // Create shared pointers with shape and src/dst strides, copy into device - // memory - using shT = std::vector; - - // Get implementation function pointer - auto copy_and_cast_from_host_blocking_fn = - copy_and_cast_from_host_blocking_dispatch_table[dst_type_id] - [src_type_id]; - - // If shape/strides are accessed with accessors, buffer destructor - // will force syncronization. 
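Before wrapping the NumPy host pointer, the removed code above converts the source's byte strides to element strides and accumulates the minimum and maximum element offsets, which bound the host range that must be exposed to the kernel. A host-only sketch of that bookkeeping (illustrative shape and strides, including a negative stride):

// Host-only sketch of the min/max element-offset computation for a host array.
#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    const std::ptrdiff_t itemsize = 8;                 // e.g. float64
    std::vector<std::ptrdiff_t> shape{3, 4};
    std::vector<std::ptrdiff_t> strides_bytes{-32, 8}; // negative outer stride

    std::ptrdiff_t min_offset = 0, max_offset = 0;
    for (std::size_t i = 0; i < shape.size(); ++i) {
        const std::ptrdiff_t stride_elems = strides_bytes[i] / itemsize;
        if (stride_elems < 0) {
            min_offset += stride_elems * (shape[i] - 1);
        }
        else {
            max_offset += stride_elems * (shape[i] - 1);
        }
    }

    std::cout << min_offset << ' ' << max_offset << '\n'; // prints: -8 3
    return 0;
}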
- py::ssize_t *shape_strides = - sycl::malloc_device(3 * nd, exec_q); - - if (shape_strides == nullptr) { - throw std::runtime_error("Unabled to allocate device memory"); - } - - using usm_host_allocatorT = - sycl::usm_allocator; - using usmshT = std::vector; - usm_host_allocatorT alloc(exec_q); - - auto host_shape_strides_shp = std::make_shared(3 * nd, alloc); - std::copy(simplified_shape.begin(), simplified_shape.end(), - host_shape_strides_shp->begin()); - std::copy(simplified_src_strides.begin(), simplified_src_strides.end(), - host_shape_strides_shp->begin() + nd); - std::copy(simplified_dst_strides.begin(), simplified_dst_strides.end(), - host_shape_strides_shp->begin() + 2 * nd); - - sycl::event copy_packed_ev = - exec_q.copy(host_shape_strides_shp->data(), shape_strides, - host_shape_strides_shp->size()); - - copy_and_cast_from_host_blocking_fn( - exec_q, src_nelems, nd, shape_strides, src_data, src_offset, - npy_src_min_nelem_offset, npy_src_max_nelem_offset, dst_data, - dst_offset, depends, {copy_packed_ev}); - - sycl::free(shape_strides, exec_q); - - return; -} - -/* =========== Unboxing Python scalar =============== */ - -template T unbox_py_scalar(py::object o) -{ - return py::cast(o); -} - -template <> sycl::half unbox_py_scalar(py::object o) -{ - float tmp = py::cast(o); - return static_cast(tmp); -} +using dpctl::tensor::py_internal::copy_numpy_ndarray_into_usm_ndarray; /* ============= linear-sequence ==================== */ -typedef sycl::event (*lin_space_step_fn_ptr_t)( - sycl::queue, - size_t, // num_elements - py::object start, - py::object step, - char *, // dst_data_ptr - const std::vector &); - -static lin_space_step_fn_ptr_t lin_space_step_dispatch_vector[_ns::num_types]; - -template class LinearSequenceStepFunctor -{ -private: - Ty *p = nullptr; - Ty start_v; - Ty step_v; - -public: - LinearSequenceStepFunctor(char *dst_p, Ty v0, Ty dv) - : p(reinterpret_cast(dst_p)), start_v(v0), step_v(dv) - { - } - - void operator()(sycl::id<1> wiid) const - { - auto i = wiid.get(0); - if constexpr (is_complex::value) { - p[i] = Ty{start_v.real() + i * step_v.real(), - start_v.imag() + i * step_v.imag()}; - } - else { - p[i] = start_v + i * step_v; - } - } -}; - -template -sycl::event lin_space_step_impl(sycl::queue exec_q, - size_t nelems, - py::object start, - py::object step, - char *array_data, - const std::vector &depends) -{ - Ty start_v; - Ty step_v; - try { - start_v = unbox_py_scalar(start); - step_v = unbox_py_scalar(step); - } catch (const py::error_already_set &e) { - throw; - } - - sycl::event lin_space_step_event = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.parallel_for>( - sycl::range<1>{nelems}, - LinearSequenceStepFunctor(array_data, start_v, step_v)); - }); - - return lin_space_step_event; -} - -template struct LinSpaceStepFactory -{ - fnT get() - { - fnT f = lin_space_step_impl; - return f; - } -}; - -typedef sycl::event (*lin_space_affine_fn_ptr_t)( - sycl::queue, - size_t, // num_elements - py::object start, - py::object end, - bool include_endpoint, - char *, // dst_data_ptr - const std::vector &); - -static lin_space_affine_fn_ptr_t - lin_space_affine_dispatch_vector[_ns::num_types]; - -template class LinearSequenceAffineFunctor -{ -private: - Ty *p = nullptr; - Ty start_v; - Ty end_v; - size_t n; - -public: - LinearSequenceAffineFunctor(char *dst_p, Ty v0, Ty v1, size_t den) - : p(reinterpret_cast(dst_p)), start_v(v0), end_v(v1), - n((den == 0) ? 
1 : den) - { - } - - void operator()(sycl::id<1> wiid) const - { - auto i = wiid.get(0); - wTy wc = wTy(i) / n; - wTy w = wTy(n - i) / n; - if constexpr (is_complex::value) { - auto _w = static_cast(w); - auto _wc = static_cast(wc); - auto re_comb = start_v.real() * _w + end_v.real() * _wc; - auto im_comb = start_v.imag() * _w + end_v.imag() * _wc; - Ty affine_comb = Ty{re_comb, im_comb}; - p[i] = affine_comb; - } - else { - auto affine_comb = start_v * w + end_v * wc; - p[i] = convert_impl(affine_comb); - } - } -}; - -template -sycl::event lin_space_affine_impl(sycl::queue exec_q, - size_t nelems, - py::object start, - py::object end, - bool include_endpoint, - char *array_data, - const std::vector &depends) -{ - Ty start_v, end_v; - try { - start_v = unbox_py_scalar(start); - end_v = unbox_py_scalar(end); - } catch (const py::error_already_set &e) { - throw; - } - - bool device_supports_doubles = exec_q.get_device().has(sycl::aspect::fp64); - sycl::event lin_space_affine_event = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - if (device_supports_doubles) { - cgh.parallel_for>( - sycl::range<1>{nelems}, - LinearSequenceAffineFunctor( - array_data, start_v, end_v, - (include_endpoint) ? nelems - 1 : nelems)); - } - else { - cgh.parallel_for>( - sycl::range<1>{nelems}, - LinearSequenceAffineFunctor( - array_data, start_v, end_v, - (include_endpoint) ? nelems - 1 : nelems)); - } - }); - - return lin_space_affine_event; -} - -template struct LinSpaceAffineFactory -{ - fnT get() - { - fnT f = lin_space_affine_impl; - return f; - } -}; - -std::pair -usm_ndarray_linear_sequence_step(py::object start, - py::object dt, - dpctl::tensor::usm_ndarray dst, - sycl::queue exec_q, - const std::vector &depends = {}) -{ - // dst must be 1D and C-contiguous - // start, end should be coercible into data type of dst - - if (dst.get_ndim() != 1) { - throw py::value_error( - "usm_ndarray_linspace: Expecting 1D array to populate"); - } - - int flags = dst.get_flags(); - if (!(flags & USM_ARRAY_C_CONTIGUOUS)) { - throw py::value_error( - "usm_ndarray_linspace: Non-contiguous arrays are not supported"); - } - - sycl::queue dst_q = dst.get_queue(); - if (dst_q != exec_q && dst_q.get_context() != exec_q.get_context()) { - throw py::value_error( - "Execution queue context is not the same as allocation context"); - } - - int dst_typenum = dst.get_typenum(); - int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); - - py::ssize_t len = dst.get_shape(0); - if (len == 0) { - // nothing to do - return std::make_pair(sycl::event{}, sycl::event{}); - } - - char *dst_data = dst.get_data(); - sycl::event linspace_step_event; - - auto fn = lin_space_step_dispatch_vector[dst_typeid]; - - linspace_step_event = - fn(exec_q, static_cast(len), start, dt, dst_data, depends); - - return std::make_pair(keep_args_alive(exec_q, {dst}, {linspace_step_event}), - linspace_step_event); -} - -std::pair -usm_ndarray_linear_sequence_affine(py::object start, - py::object end, - dpctl::tensor::usm_ndarray dst, - bool include_endpoint, - sycl::queue exec_q, - const std::vector &depends = {}) -{ - // dst must be 1D and C-contiguous - // start, end should be coercible into data type of dst - - if (dst.get_ndim() != 1) { - throw py::value_error( - "usm_ndarray_linspace: Expecting 1D array to populate"); - } - - int flags = dst.get_flags(); - if (!(flags & USM_ARRAY_C_CONTIGUOUS)) { - throw py::value_error( - "usm_ndarray_linspace: Non-contiguous arrays are not supported"); - } - - sycl::queue dst_q = dst.get_queue(); - 
if (dst_q != exec_q && dst_q.get_context() != exec_q.get_context()) { - throw py::value_error( - "Execution queue context is not the same as allocation context"); - } - - int dst_typenum = dst.get_typenum(); - int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); - - py::ssize_t len = dst.get_shape(0); - if (len == 0) { - // nothing to do - return std::make_pair(sycl::event{}, sycl::event{}); - } - - char *dst_data = dst.get_data(); - sycl::event linspace_affine_event; - - auto fn = lin_space_affine_dispatch_vector[dst_typeid]; - - linspace_affine_event = fn(exec_q, static_cast(len), start, end, - include_endpoint, dst_data, depends); - - return std::make_pair( - keep_args_alive(exec_q, {dst}, {linspace_affine_event}), - linspace_affine_event); -} +using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_affine; +using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step; /* ================ Full ================== */ -typedef sycl::event (*full_contig_fn_ptr_t)(sycl::queue, - size_t, - py::object, - char *, - const std::vector &); - -static full_contig_fn_ptr_t full_contig_dispatch_vector[_ns::num_types]; - -template -sycl::event full_contig_impl(sycl::queue q, - size_t nelems, - py::object py_value, - char *dst_p, - const std::vector &depends) -{ - dstTy fill_v; - try { - fill_v = unbox_py_scalar(py_value); - } catch (const py::error_already_set &e) { - throw; - } - - sycl::event fill_ev = q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - dstTy *p = reinterpret_cast(dst_p); - cgh.fill(p, fill_v, nelems); - }); - - return fill_ev; -} - -template struct FullContigFactory -{ - fnT get() - { - fnT f = full_contig_impl; - return f; - } -}; - -std::pair -usm_ndarray_full(py::object py_value, - dpctl::tensor::usm_ndarray dst, - sycl::queue exec_q, - const std::vector &depends = {}) -{ - // start, end should be coercible into data type of dst - - py::ssize_t dst_nelems = dst.get_size(); - - if (dst_nelems == 0) { - // nothing to do - return std::make_pair(sycl::event(), sycl::event()); - } - - int dst_flags = dst.get_flags(); - - sycl::queue dst_q = dst.get_queue(); - if (dst_q != exec_q && dst_q.get_context() != exec_q.get_context()) { - throw py::value_error( - "Execution queue context is not the same as allocation context"); - } - - int dst_typenum = dst.get_typenum(); - int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); - - char *dst_data = dst.get_data(); - sycl::event full_event; - - if (dst_nelems == 1 || (dst_flags & USM_ARRAY_C_CONTIGUOUS) || - (dst_flags & USM_ARRAY_F_CONTIGUOUS)) - { - auto fn = full_contig_dispatch_vector[dst_typeid]; - - sycl::event full_contig_event = - fn(exec_q, static_cast(dst_nelems), py_value, dst_data, - depends); - - return std::make_pair( - keep_args_alive(exec_q, {dst}, {full_contig_event}), - full_contig_event); - } - else { - throw std::runtime_error( - "Only population of contiguous usm_ndarray objects is supported."); - } -} +using dpctl::tensor::py_internal::usm_ndarray_full; /* ================ Eye ================== */ -typedef sycl::event (*eye_fn_ptr_t)(sycl::queue, - size_t nelems, // num_elements - py::ssize_t start, - py::ssize_t end, - py::ssize_t step, - char *, // dst_data_ptr - const std::vector &); - -static eye_fn_ptr_t eye_dispatch_vector[_ns::num_types]; - -template class EyeFunctor -{ -private: - Ty *p = nullptr; - py::ssize_t start_v; - py::ssize_t end_v; - py::ssize_t step_v; - -public: - EyeFunctor(char *dst_p, - const py::ssize_t v0, - const py::ssize_t v1, - const py::ssize_t dv) - 
: p(reinterpret_cast(dst_p)), start_v(v0), end_v(v1), step_v(dv) - { - } - - void operator()(sycl::id<1> wiid) const - { - Ty set_v = 0; - py::ssize_t i = static_cast(wiid.get(0)); - if (i >= start_v and i <= end_v) { - if ((i - start_v) % step_v == 0) { - set_v = 1; - } - } - p[i] = set_v; - } -}; - -template -sycl::event eye_impl(sycl::queue exec_q, - size_t nelems, - const py::ssize_t start, - const py::ssize_t end, - const py::ssize_t step, - char *array_data, - const std::vector &depends) -{ - sycl::event eye_event = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.parallel_for>( - sycl::range<1>{nelems}, - EyeFunctor(array_data, start, end, step)); - }); - - return eye_event; -} - -template struct EyeFactory -{ - fnT get() - { - fnT f = eye_impl; - return f; - } -}; - -std::pair -eye(py::ssize_t k, - dpctl::tensor::usm_ndarray dst, - sycl::queue exec_q, - const std::vector &depends = {}) -{ - // dst must be 2D - - if (dst.get_ndim() != 2) { - throw py::value_error( - "usm_ndarray_eye: Expecting 2D array to populate"); - } - - sycl::queue dst_q = dst.get_queue(); - if (!dpctl::utils::queues_are_compatible(exec_q, {dst_q})) { - throw py::value_error("Execution queue is not compatible with the " - "allocation queue"); - } - - int dst_typenum = dst.get_typenum(); - int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); - - const py::ssize_t nelem = dst.get_size(); - const py::ssize_t rows = dst.get_shape(0); - const py::ssize_t cols = dst.get_shape(1); - if (rows == 0 || cols == 0) { - // nothing to do - return std::make_pair(sycl::event{}, sycl::event{}); - } - - bool is_dst_c_contig = ((dst.get_flags() & USM_ARRAY_C_CONTIGUOUS) != 0); - bool is_dst_f_contig = ((dst.get_flags() & USM_ARRAY_F_CONTIGUOUS) != 0); - if (!is_dst_c_contig && !is_dst_f_contig) { - throw py::value_error("USM array is not contiguous"); - } - - py::ssize_t start; - if (is_dst_c_contig) { - start = (k < 0) ? -k * cols : k; - } - else { - start = (k < 0) ? -k : k * rows; - } - - const py::ssize_t *strides = dst.get_strides_raw(); - py::ssize_t step; - if (strides == nullptr) { - step = (is_dst_c_contig) ? 
cols + 1 : rows + 1; - } - else { - step = strides[0] + strides[1]; - } - - const py::ssize_t length = std::min({rows, cols, rows + k, cols - k}); - const py::ssize_t end = start + step * (length - 1); - - char *dst_data = dst.get_data(); - sycl::event eye_event; - - auto fn = eye_dispatch_vector[dst_typeid]; - - eye_event = fn(exec_q, static_cast(nelem), start, end, step, - dst_data, depends); - - return std::make_pair(keep_args_alive(exec_q, {dst}, {eye_event}), - eye_event); -} +using dpctl::tensor::py_internal::usm_ndarray_eye; /* =========================== Tril and triu ============================== */ -// define function type -typedef sycl::event (*tri_fn_ptr_t)(sycl::queue, - py::ssize_t, // inner_range //py::ssize_t - py::ssize_t, // outer_range - char *, // src_data_ptr - char *, // dst_data_ptr - py::ssize_t, // nd - py::ssize_t *, // shape_and_strides - py::ssize_t, // k - const std::vector &, - const std::vector &); - -template class tri_kernel; -template -sycl::event tri_impl(sycl::queue exec_q, - py::ssize_t inner_range, - py::ssize_t outer_range, - char *src_p, - char *dst_p, - py::ssize_t nd, - py::ssize_t *shape_and_strides, - py::ssize_t k, - const std::vector &depends, - const std::vector &additional_depends) -{ - constexpr int d2 = 2; - py::ssize_t src_s = nd; - py::ssize_t dst_s = 2 * nd; - py::ssize_t nd_1 = nd - 1; - py::ssize_t nd_2 = nd - 2; - Ty *src = reinterpret_cast(src_p); - Ty *dst = reinterpret_cast(dst_p); - - sycl::event tri_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.depends_on(additional_depends); - cgh.parallel_for>( - sycl::range<1>(inner_range * outer_range), [=](sycl::id<1> idx) { - py::ssize_t outer_gid = idx[0] / inner_range; - py::ssize_t inner_gid = idx[0] - inner_range * outer_gid; - - py::ssize_t src_inner_offset, dst_inner_offset; - bool to_copy; - - { - // py::ssize_t inner_gid = idx.get_id(0); - CIndexer_array indexer_i( - {shape_and_strides[nd_2], shape_and_strides[nd_1]}); - indexer_i.set(inner_gid); - const std::array &inner = indexer_i.get(); - src_inner_offset = - inner[0] * shape_and_strides[src_s + nd_2] + - inner[1] * shape_and_strides[src_s + nd_1]; - dst_inner_offset = - inner[0] * shape_and_strides[dst_s + nd_2] + - inner[1] * shape_and_strides[dst_s + nd_1]; - - if (l) - to_copy = (inner[0] + k >= inner[1]); - else - to_copy = (inner[0] + k <= inner[1]); - } - - py::ssize_t src_offset = 0; - py::ssize_t dst_offset = 0; - { - // py::ssize_t outer_gid = idx.get_id(1); - CIndexer_vector outer(nd - d2); - outer.get_displacement( - outer_gid, shape_and_strides, shape_and_strides + src_s, - shape_and_strides + dst_s, src_offset, dst_offset); - } - - src_offset += src_inner_offset; - dst_offset += dst_inner_offset; - - dst[dst_offset] = (to_copy) ? 
src[src_offset] : Ty(0); - }); - }); - return tri_ev; -} - -static tri_fn_ptr_t tril_generic_dispatch_vector[_ns::num_types]; - -template struct TrilGenericFactory -{ - fnT get() - { - fnT f = tri_impl; - return f; - } -}; - -static tri_fn_ptr_t triu_generic_dispatch_vector[_ns::num_types]; - -template struct TriuGenericFactory -{ - fnT get() - { - fnT f = tri_impl; - return f; - } -}; - -std::pair -tri(sycl::queue &exec_q, - dpctl::tensor::usm_ndarray src, - dpctl::tensor::usm_ndarray dst, - char part, - py::ssize_t k = 0, - const std::vector &depends = {}) -{ - // array dimensions must be the same - int src_nd = src.get_ndim(); - int dst_nd = dst.get_ndim(); - if (src_nd != dst_nd) { - throw py::value_error("Array dimensions are not the same."); - } - - if (src_nd < 2) { - throw py::value_error("Array dimensions less than 2."); - } - - // shapes must be the same - const py::ssize_t *src_shape = src.get_shape_raw(); - const py::ssize_t *dst_shape = dst.get_shape_raw(); - - bool shapes_equal(true); - size_t src_nelems(1); - - for (int i = 0; shapes_equal && i < src_nd; ++i) { - src_nelems *= static_cast(src_shape[i]); - shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]); - } - if (!shapes_equal) { - throw py::value_error("Array shapes are not the same."); - } - - if (src_nelems == 0) { - // nothing to do - return std::make_pair(sycl::event(), sycl::event()); - } - - char *src_data = src.get_data(); - char *dst_data = dst.get_data(); - - // check that arrays do not overlap, and concurrent copying is safe. - auto src_offsets = src.get_minmax_offsets(); - auto dst_offsets = dst.get_minmax_offsets(); - int src_elem_size = src.get_elemsize(); - int dst_elem_size = dst.get_elemsize(); - - bool memory_overlap = - ((dst_data - src_data > src_offsets.second * src_elem_size - - dst_offsets.first * dst_elem_size) && - (src_data - dst_data > dst_offsets.second * dst_elem_size - - src_offsets.first * src_elem_size)); - if (memory_overlap) { - // TODO: could use a temporary, but this is done by the caller - throw py::value_error("Arrays index overlapping segments of memory"); - } - - int src_typenum = src.get_typenum(); - int dst_typenum = dst.get_typenum(); - int src_typeid = array_types.typenum_to_lookup_id(src_typenum); - int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); - - if (dst_typeid != src_typeid) { - throw py::value_error("Array dtype are not the same."); - } - - // check same contexts - sycl::queue src_q = src.get_queue(); - sycl::queue dst_q = dst.get_queue(); - - if (!dpctl::utils::queues_are_compatible(exec_q, {src_q, dst_q})) { - throw py::value_error( - "Execution queue context is not the same as allocation contexts"); - } - - using shT = std::vector; - shT src_strides(src_nd); - - int src_flags = src.get_flags(); - bool is_src_c_contig = ((src_flags & USM_ARRAY_C_CONTIGUOUS) != 0); - bool is_src_f_contig = ((src_flags & USM_ARRAY_F_CONTIGUOUS) != 0); - - const py::ssize_t *src_strides_raw = src.get_strides_raw(); - if (src_strides_raw == nullptr) { - if (is_src_c_contig) { - src_strides = c_contiguous_strides(src_nd, src_shape); - } - else if (is_src_f_contig) { - src_strides = f_contiguous_strides(src_nd, src_shape); - } - else { - throw std::runtime_error("Source array has null strides but has " - "neither C- nor F- contiguous flag set"); - } - } - else { - std::copy(src_strides_raw, src_strides_raw + src_nd, - src_strides.begin()); - } - - shT dst_strides(src_nd); - - int dst_flags = dst.get_flags(); - bool is_dst_c_contig = ((dst_flags & 
USM_ARRAY_C_CONTIGUOUS) != 0); - bool is_dst_f_contig = ((dst_flags & USM_ARRAY_F_CONTIGUOUS) != 0); - - const py::ssize_t *dst_strides_raw = dst.get_strides_raw(); - if (dst_strides_raw == nullptr) { - if (is_dst_c_contig) { - dst_strides = c_contiguous_strides(src_nd, src_shape); - } - else if (is_dst_f_contig) { - dst_strides = f_contiguous_strides(src_nd, src_shape); - } - else { - throw std::runtime_error("Source array has null strides but has " - "neither C- nor F- contiguous flag set"); - } - } - else { - std::copy(dst_strides_raw, dst_strides_raw + dst_nd, - dst_strides.begin()); - } - shT simplified_shape; - shT simplified_src_strides; - shT simplified_dst_strides; - py::ssize_t src_offset(0); - py::ssize_t dst_offset(0); - - constexpr py::ssize_t src_itemsize = 1; // item size in elements - constexpr py::ssize_t dst_itemsize = 1; // item size in elements - - int nd = src_nd - 2; - const py::ssize_t *shape = src_shape; - const py::ssize_t *p_src_strides = src_strides.data(); - const py::ssize_t *p_dst_strides = dst_strides.data(); - - simplify_iteration_space(nd, shape, p_src_strides, src_itemsize, - is_src_c_contig, is_src_f_contig, p_dst_strides, - dst_itemsize, is_dst_c_contig, is_dst_f_contig, - simplified_shape, simplified_src_strides, - simplified_dst_strides, src_offset, dst_offset); - - if (src_offset != 0 || dst_offset != 0) { - throw py::value_error("Reversed slice for dst is not supported"); - } - - nd += 2; - - using usm_host_allocatorT = - sycl::usm_allocator; - using usmshT = std::vector; - - usm_host_allocatorT allocator(exec_q); - auto shp_host_shape_and_strides = - std::make_shared(3 * nd, allocator); - - std::copy(simplified_shape.begin(), simplified_shape.end(), - shp_host_shape_and_strides->begin()); - (*shp_host_shape_and_strides)[nd - 2] = src_shape[src_nd - 2]; - (*shp_host_shape_and_strides)[nd - 1] = src_shape[src_nd - 1]; - - std::copy(simplified_src_strides.begin(), simplified_src_strides.end(), - shp_host_shape_and_strides->begin() + nd); - (*shp_host_shape_and_strides)[2 * nd - 2] = src_strides[src_nd - 2]; - (*shp_host_shape_and_strides)[2 * nd - 1] = src_strides[src_nd - 1]; - - std::copy(simplified_dst_strides.begin(), simplified_dst_strides.end(), - shp_host_shape_and_strides->begin() + 2 * nd); - (*shp_host_shape_and_strides)[3 * nd - 2] = dst_strides[src_nd - 2]; - (*shp_host_shape_and_strides)[3 * nd - 1] = dst_strides[src_nd - 1]; - - py::ssize_t *dev_shape_and_strides = - sycl::malloc_device(3 * nd, exec_q); - if (dev_shape_and_strides == nullptr) { - throw std::runtime_error("Unabled to allocate device memory"); - } - sycl::event copy_shape_and_strides = exec_q.copy( - shp_host_shape_and_strides->data(), dev_shape_and_strides, 3 * nd); - - py::ssize_t inner_range = src_shape[src_nd - 1] * src_shape[src_nd - 2]; - py::ssize_t outer_range = src_nelems / inner_range; - - sycl::event tri_ev; - if (part == 'l') { - auto fn = tril_generic_dispatch_vector[src_typeid]; - tri_ev = - fn(exec_q, inner_range, outer_range, src_data, dst_data, nd, - dev_shape_and_strides, k, depends, {copy_shape_and_strides}); - } - else { - auto fn = triu_generic_dispatch_vector[src_typeid]; - tri_ev = - fn(exec_q, inner_range, outer_range, src_data, dst_data, nd, - dev_shape_and_strides, k, depends, {copy_shape_and_strides}); - } - - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on({tri_ev}); - auto ctx = exec_q.get_context(); - cgh.host_task( - [shp_host_shape_and_strides, dev_shape_and_strides, ctx]() { - // capture of shp_host_shape_and_strides ensure the 
underlying - // vector exists for the entire execution of copying kernel - sycl::free(dev_shape_and_strides, ctx); - }); - }); - - return std::make_pair(keep_args_alive(exec_q, {src, dst}, {tri_ev}), - tri_ev); -} +using dpctl::tensor::py_internal::usm_ndarray_triul; // populate dispatch tables -void init_copy_and_cast_dispatch_tables(void) +void init_dispatch_tables(void) { - using namespace dpctl::tensor::detail; - - DispatchTableBuilder - dtb_generic; - dtb_generic.populate_dispatch_table(copy_and_cast_generic_dispatch_table); - - DispatchTableBuilder - dtb_1d; - dtb_1d.populate_dispatch_table(copy_and_cast_1d_dispatch_table); - - DispatchTableBuilder - dtb_2d; - dtb_2d.populate_dispatch_table(copy_and_cast_2d_dispatch_table); - - DispatchTableBuilder - dtb_copy_from_numpy; - - dtb_copy_from_numpy.populate_dispatch_table( - copy_and_cast_from_host_blocking_dispatch_table); + using namespace dpctl::tensor::py_internal; + init_copy_and_cast_usm_to_usm_dispatch_tables(); + init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(); return; } // populate dispatch vectors -void init_copy_for_reshape_dispatch_vector(void) +void init_dispatch_vectors(void) { - using namespace dpctl::tensor::detail; - - DispatchVectorBuilder - dvb; - dvb.populate_dispatch_vector(copy_for_reshape_generic_dispatch_vector); - - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(lin_space_step_dispatch_vector); - - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(lin_space_affine_dispatch_vector); + using namespace dpctl::tensor::py_internal; - DispatchVectorBuilder - dvb3; - dvb3.populate_dispatch_vector(full_contig_dispatch_vector); - - DispatchVectorBuilder dvb4; - dvb4.populate_dispatch_vector(eye_dispatch_vector); - - DispatchVectorBuilder dvb5; - dvb5.populate_dispatch_vector(tril_generic_dispatch_vector); - - DispatchVectorBuilder dvb6; - dvb6.populate_dispatch_vector(triu_generic_dispatch_vector); + init_copy_for_reshape_dispatch_vectors(); + init_linear_sequences_dispatch_vectors(); + init_full_ctor_dispatch_vectors(); + init_eye_ctor_dispatch_vectors(); + init_triul_ctor_dispatch_vectors(); return; } -std::string get_default_device_fp_type(sycl::device d) -{ - if (d.has(sycl::aspect::fp64)) { - return "f8"; - } - else { - return "f4"; - } -} - -std::string get_default_device_int_type(sycl::device) -{ - return "i8"; -} - -std::string get_default_device_complex_type(sycl::device d) -{ - if (d.has(sycl::aspect::fp64)) { - return "c16"; - } - else { - return "c8"; - } -} - -std::string get_default_device_bool_type(sycl::device) -{ - return "b1"; -} - } // namespace PYBIND11_MODULE(_tensor_impl, m) { - - init_copy_and_cast_dispatch_tables(); - init_copy_for_reshape_dispatch_vector(); - import_dpctl(); - - // populate types constants for type dispatching functions - array_types = dpctl::tensor::detail::usm_ndarray_types::get(); + init_dispatch_tables(); + init_dispatch_vectors(); m.def( - "_contract_iter", &contract_iter, + "_contract_iter", &contract_iter, "Simplifies iteration of array of given shape & stride. Returns " "a triple: shape, stride and offset for the new iterator of possible " "smaller dimension, which traverses the same elements as the original " @@ -2337,7 +125,7 @@ PYBIND11_MODULE(_tensor_impl, m) py::arg("depends") = py::list()); m.def( - "_contract_iter2", &contract_iter2, + "_contract_iter2", &contract_iter2, "Simplifies iteration over elements of pair of arrays of given shape " "with strides stride1 and stride2. 
Returns " "a 5-tuple: shape, stride and offset for the new iterator of possible " @@ -2381,7 +169,7 @@ PYBIND11_MODULE(_tensor_impl, m) py::arg("fill_value"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list()); - m.def("_eye", &eye, + m.def("_eye", &usm_ndarray_eye, "Fills input 2D contiguous usm_ndarray `dst` with " "zeros outside of the diagonal " "specified by " @@ -2391,57 +179,43 @@ PYBIND11_MODULE(_tensor_impl, m) py::arg("k"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list()); - m.def("default_device_fp_type", [](sycl::queue q) -> std::string { - return get_default_device_fp_type(q.get_device()); - }); - m.def("default_device_fp_type_device", [](sycl::device dev) -> std::string { - return get_default_device_fp_type(dev); - }); - - m.def("default_device_int_type", [](sycl::queue q) -> std::string { - return get_default_device_int_type(q.get_device()); - }); - m.def("default_device_int_type_device", - [](sycl::device dev) -> std::string { - return get_default_device_int_type(dev); - }); - - m.def("default_device_bool_type", [](sycl::queue q) -> std::string { - return get_default_device_bool_type(q.get_device()); - }); - m.def("default_device_bool_type_device", - [](sycl::device dev) -> std::string { - return get_default_device_bool_type(dev); - }); - - m.def("default_device_complex_type", [](sycl::queue q) -> std::string { - return get_default_device_complex_type(q.get_device()); - }); - m.def("default_device_complex_type_device", - [](sycl::device dev) -> std::string { - return get_default_device_complex_type(dev); - }); - m.def( - "_tril", - [](dpctl::tensor::usm_ndarray src, dpctl::tensor::usm_ndarray dst, - py::ssize_t k, sycl::queue exec_q, - const std::vector depends) - -> std::pair { - return tri(exec_q, src, dst, 'l', k, depends); - }, - "Tril helper function.", py::arg("src"), py::arg("dst"), - py::arg("k") = 0, py::arg("sycl_queue"), - py::arg("depends") = py::list()); + m.def("default_device_fp_type", + dpctl::tensor::py_internal::default_device_fp_type, + "Gives default floating point type supported by device.", + py::arg("dev")); + + m.def("default_device_int_type", + dpctl::tensor::py_internal::default_device_int_type, + "Gives default integer type supported by device.", py::arg("dev")); + + m.def("default_device_bool_type", + dpctl::tensor::py_internal::default_device_bool_type, + "Gives default boolean type supported by device.", py::arg("dev")); + + m.def("default_device_complex_type", + dpctl::tensor::py_internal::default_device_complex_type, + "Gives default complex floating point type support by device.", + py::arg("dev")); + + auto tril_fn = [](dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray dst, py::ssize_t k, + sycl::queue exec_q, + const std::vector depends) + -> std::pair { + return usm_ndarray_triul(exec_q, src, dst, 'l', k, depends); + }; + m.def("_tril", tril_fn, "Tril helper function.", py::arg("src"), + py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), + py::arg("depends") = py::list()); - m.def( - "_triu", - [](dpctl::tensor::usm_ndarray src, dpctl::tensor::usm_ndarray dst, - py::ssize_t k, sycl::queue exec_q, - const std::vector depends) - -> std::pair { - return tri(exec_q, src, dst, 'u', k, depends); - }, - "Triu helper function.", py::arg("src"), py::arg("dst"), - py::arg("k") = 0, py::arg("sycl_queue"), - py::arg("depends") = py::list()); + auto triu_fn = [](dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray dst, py::ssize_t k, + sycl::queue exec_q, + const std::vector depends) 
+ -> std::pair<sycl::event, sycl::event> { + return usm_ndarray_triul(exec_q, src, dst, 'u', k, depends); + }; + m.def("_triu", triu_fn, "Triu helper function.", py::arg("src"), + py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), + py::arg("depends") = py::list()); } diff --git a/dpctl/tensor/libtensor/source/triul_ctor.cpp b/dpctl/tensor/libtensor/source/triul_ctor.cpp new file mode 100644 index 0000000000..60e5572d6d --- /dev/null +++ b/dpctl/tensor/libtensor/source/triul_ctor.cpp @@ -0,0 +1,287 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include <CL/sycl.hpp> +#include <utility> +#include <vector> + +#include "dpctl4pybind11.hpp" +#include <pybind11/pybind11.h> + +#include "kernels/constructors.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/type_dispatch.hpp" + +namespace py = pybind11; +namespace _ns = dpctl::tensor::detail; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +using dpctl::tensor::c_contiguous_strides; +using dpctl::tensor::f_contiguous_strides; +using dpctl::utils::keep_args_alive; + +using dpctl::tensor::kernels::constructors::tri_fn_ptr_t; + +static tri_fn_ptr_t tril_generic_dispatch_vector[_ns::num_types]; +static tri_fn_ptr_t triu_generic_dispatch_vector[_ns::num_types]; + +std::pair<sycl::event, sycl::event> +usm_ndarray_triul(sycl::queue exec_q, + dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray dst, + char part, + py::ssize_t k = 0, + const std::vector<sycl::event> &depends = {}) +{ + // array dimensions must be the same + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + if (src_nd != dst_nd) { + throw py::value_error("Array dimensions are not the same."); + } + + if (src_nd < 2) { + throw py::value_error("Array dimensions less than 2."); + } + + // shapes must be the same + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + + bool shapes_equal(true); + size_t src_nelems(1); + + for (int i = 0; shapes_equal && i < src_nd; ++i) { + src_nelems *= static_cast<size_t>(src_shape[i]); + shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]); + } + if (!shapes_equal) { + throw py::value_error("Array shapes are not the same."); + } + + if (src_nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + // check that arrays do not overlap, and concurrent copying is safe.
+ auto src_offsets = src.get_minmax_offsets(); + auto dst_offsets = dst.get_minmax_offsets(); + int src_elem_size = src.get_elemsize(); + int dst_elem_size = dst.get_elemsize(); + + bool memory_overlap = + ((dst_data - src_data > src_offsets.second * src_elem_size - + dst_offsets.first * dst_elem_size) && + (src_data - dst_data > dst_offsets.second * dst_elem_size - + src_offsets.first * src_elem_size)); + if (memory_overlap) { + // TODO: could use a temporary, but this is done by the caller + throw py::value_error("Arrays index overlapping segments of memory"); + } + + auto array_types = dpctl::tensor::detail::usm_ndarray_types(); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + if (dst_typeid != src_typeid) { + throw py::value_error("Array dtypes are not the same."); + } + + // check same contexts + sycl::queue src_q = src.get_queue(); + sycl::queue dst_q = dst.get_queue(); + + if (!dpctl::utils::queues_are_compatible(exec_q, {src_q, dst_q})) { + throw py::value_error( + "Execution queue context is not the same as allocation contexts"); + } + + using shT = std::vector<py::ssize_t>; + shT src_strides(src_nd); + + bool is_src_c_contig = src.is_c_contiguous(); + bool is_src_f_contig = src.is_f_contiguous(); + + const py::ssize_t *src_strides_raw = src.get_strides_raw(); + if (src_strides_raw == nullptr) { + if (is_src_c_contig) { + src_strides = c_contiguous_strides(src_nd, src_shape); + } + else if (is_src_f_contig) { + src_strides = f_contiguous_strides(src_nd, src_shape); + } + else { + throw std::runtime_error("Source array has null strides but has " + "neither C- nor F- contiguous flag set"); + } + } + else { + std::copy(src_strides_raw, src_strides_raw + src_nd, + src_strides.begin()); + } + + shT dst_strides(src_nd); + + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_dst_f_contig = dst.is_f_contiguous(); + + const py::ssize_t *dst_strides_raw = dst.get_strides_raw(); + if (dst_strides_raw == nullptr) { + if (is_dst_c_contig) { + dst_strides = + dpctl::tensor::c_contiguous_strides(src_nd, src_shape); + } + else if (is_dst_f_contig) { + dst_strides = + dpctl::tensor::f_contiguous_strides(src_nd, src_shape); + } + else { + throw std::runtime_error("Destination array has null strides but " + "has neither C- nor F- contiguous flag set"); + } + } + else { + std::copy(dst_strides_raw, dst_strides_raw + dst_nd, + dst_strides.begin()); + } + + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + constexpr py::ssize_t src_itemsize = 1; // item size in elements + constexpr py::ssize_t dst_itemsize = 1; // item size in elements + + int nd = src_nd - 2; + const py::ssize_t *shape = src_shape; + const py::ssize_t *p_src_strides = src_strides.data(); + const py::ssize_t *p_dst_strides = dst_strides.data(); + + simplify_iteration_space(nd, shape, p_src_strides, src_itemsize, + is_src_c_contig, is_src_f_contig, p_dst_strides, + dst_itemsize, is_dst_c_contig, is_dst_f_contig, + simplified_shape, simplified_src_strides, + simplified_dst_strides, src_offset, dst_offset); + + if (src_offset != 0 || dst_offset != 0) { + throw py::value_error("Reversed slice for dst is not supported"); + } + + nd += 2; + + using usm_host_allocatorT = + sycl::usm_allocator<py::ssize_t, sycl::usm::alloc::host>; + using usmshT = std::vector<py::ssize_t, usm_host_allocatorT>; + + usm_host_allocatorT allocator(exec_q); + auto shp_host_shape_and_strides =
+ std::make_shared<usmshT>(3 * nd, allocator); + + std::copy(simplified_shape.begin(), simplified_shape.end(), + shp_host_shape_and_strides->begin()); + (*shp_host_shape_and_strides)[nd - 2] = src_shape[src_nd - 2]; + (*shp_host_shape_and_strides)[nd - 1] = src_shape[src_nd - 1]; + + std::copy(simplified_src_strides.begin(), simplified_src_strides.end(), + shp_host_shape_and_strides->begin() + nd); + (*shp_host_shape_and_strides)[2 * nd - 2] = src_strides[src_nd - 2]; + (*shp_host_shape_and_strides)[2 * nd - 1] = src_strides[src_nd - 1]; + + std::copy(simplified_dst_strides.begin(), simplified_dst_strides.end(), + shp_host_shape_and_strides->begin() + 2 * nd); + (*shp_host_shape_and_strides)[3 * nd - 2] = dst_strides[src_nd - 2]; + (*shp_host_shape_and_strides)[3 * nd - 1] = dst_strides[src_nd - 1]; + + py::ssize_t *dev_shape_and_strides = + sycl::malloc_device<py::ssize_t>(3 * nd, exec_q); + if (dev_shape_and_strides == nullptr) { + throw std::runtime_error("Unable to allocate device memory"); + } + sycl::event copy_shape_and_strides = exec_q.copy<py::ssize_t>( + shp_host_shape_and_strides->data(), dev_shape_and_strides, 3 * nd); + + py::ssize_t inner_range = src_shape[src_nd - 1] * src_shape[src_nd - 2]; + py::ssize_t outer_range = src_nelems / inner_range; + + sycl::event tri_ev; + if (part == 'l') { + auto fn = tril_generic_dispatch_vector[src_typeid]; + tri_ev = + fn(exec_q, inner_range, outer_range, src_data, dst_data, nd, + dev_shape_and_strides, k, depends, {copy_shape_and_strides}); + } + else { + auto fn = triu_generic_dispatch_vector[src_typeid]; + tri_ev = + fn(exec_q, inner_range, outer_range, src_data, dst_data, nd, + dev_shape_and_strides, k, depends, {copy_shape_and_strides}); + } + + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on({tri_ev}); + auto ctx = exec_q.get_context(); + cgh.host_task( + [shp_host_shape_and_strides, dev_shape_and_strides, ctx]() { + // capture of shp_host_shape_and_strides ensures the underlying + // vector exists for the entire execution of the copy kernel + sycl::free(dev_shape_and_strides, ctx); + }); + }); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, {tri_ev}), + tri_ev); +} + +void init_triul_ctor_dispatch_vectors(void) +{ + + using namespace dpctl::tensor::detail; + using dpctl::tensor::kernels::constructors::TrilGenericFactory; + using dpctl::tensor::kernels::constructors::TriuGenericFactory; + + DispatchVectorBuilder<tri_fn_ptr_t, TrilGenericFactory, num_types> dvb1; + dvb1.populate_dispatch_vector(tril_generic_dispatch_vector); + + DispatchVectorBuilder<tri_fn_ptr_t, TriuGenericFactory, num_types> dvb2; + dvb2.populate_dispatch_vector(triu_generic_dispatch_vector); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/triul_ctor.hpp b/dpctl/tensor/libtensor/source/triul_ctor.hpp new file mode 100644 index 0000000000..3789df80c5 --- /dev/null +++ b/dpctl/tensor/libtensor/source/triul_ctor.hpp @@ -0,0 +1,52 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include <CL/sycl.hpp> +#include <utility> +#include <vector> + +#include "dpctl4pybind11.hpp" +#include <pybind11/pybind11.h> + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair<sycl::event, sycl::event> +usm_ndarray_triul(sycl::queue exec_q, + dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray dst, + char part, + py::ssize_t k = 0, + const std::vector<sycl::event> &depends = {}); + +extern void init_triul_ctor_dispatch_vectors(void); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl
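
The element-selection rule that the relocated tril/triu kernels apply to the trailing two-dimensional block is shown in the removed tri_impl above: for tril an element at (row, col) is copied when row + k >= col, for triu when row + k <= col, and every other destination element is set to zero. The kernels now live in kernels/constructors.hpp, which this patch does not reproduce, so the following is only a standalone host-side C++ sketch of that predicate; the 4x4 shape and the diagonal offset k = 1 are arbitrary illustrative choices, not values taken from the patch.

#include <cstdio>

int main()
{
    const int rows = 4, cols = 4, k = 1;
    for (int part = 0; part < 2; ++part) {
        std::printf(part == 0 ? "tril, k=%d\n" : "triu, k=%d\n", k);
        for (int r = 0; r < rows; ++r) {
            for (int c = 0; c < cols; ++c) {
                // mirrors the kernel's to_copy test: keep (1) or zero out (0)
                const bool keep = (part == 0) ? (r + k >= c) : (r + k <= c);
                std::printf("%d ", keep ? 1 : 0);
            }
            std::printf("\n");
        }
    }
    return 0;
}

Printing the two masks makes it easy to check that the boundary handling (the k-th diagonal itself is always kept by both variants) matches what the dispatch into tril_generic_dispatch_vector or triu_generic_dispatch_vector produces for contiguous inputs.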