From 4e999f7a6e1e60a110ea16320b26357f8f693a7f Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 21 Sep 2022 20:28:00 -0500 Subject: [PATCH 1/4] Moved implementation of kernels out to dedicated header files. --- dpctl/apis/include/dpctl4pybind11.hpp | 49 + .../include/kernels/constructors.hpp | 487 +++++++++ .../include/kernels/copy_and_cast.hpp | 505 ++++++++++ .../libtensor/include/utils/type_utils.hpp | 73 ++ dpctl/tensor/libtensor/source/tensor_py.cpp | 931 +----------------- 5 files changed, 1149 insertions(+), 896 deletions(-) create mode 100644 dpctl/tensor/libtensor/include/kernels/constructors.hpp create mode 100644 dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp create mode 100644 dpctl/tensor/libtensor/include/utils/type_utils.hpp diff --git a/dpctl/apis/include/dpctl4pybind11.hpp b/dpctl/apis/include/dpctl4pybind11.hpp index f55ef3ec93..56a4e56f60 100644 --- a/dpctl/apis/include/dpctl4pybind11.hpp +++ b/dpctl/apis/include/dpctl4pybind11.hpp @@ -371,6 +371,55 @@ class usm_memory : public py::object namespace tensor { + +std::vector c_contiguous_strides(int nd, + const py::ssize_t *shape, + py::ssize_t element_size = 1) +{ + if (nd > 0) { + std::vector c_strides(nd, element_size); + for (int ic = nd - 1; ic > 0;) { + py::ssize_t next_v = c_strides[ic] * shape[ic]; + c_strides[--ic] = next_v; + } + return c_strides; + } + else { + return std::vector(); + } +} + +std::vector f_contiguous_strides(int nd, + const py::ssize_t *shape, + py::ssize_t element_size = 1) +{ + if (nd > 0) { + std::vector f_strides(nd, element_size); + for (int i = 0; i < nd - 1;) { + py::ssize_t next_v = f_strides[i] * shape[i]; + f_strides[++i] = next_v; + } + return f_strides; + } + else { + return std::vector(); + } +} + +std::vector +c_contiguous_strides(const std::vector &shape, + py::ssize_t element_size = 1) +{ + return c_contiguous_strides(shape.size(), shape.data(), element_size); +} + +std::vector +f_contiguous_strides(const std::vector &shape, + py::ssize_t element_size = 1) +{ + return f_contiguous_strides(shape.size(), shape.data(), element_size); +} + class usm_ndarray : public py::object { public: diff --git a/dpctl/tensor/libtensor/include/kernels/constructors.hpp b/dpctl/tensor/libtensor/include/kernels/constructors.hpp new file mode 100644 index 0000000000..b08eebbab3 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/constructors.hpp @@ -0,0 +1,487 @@ +//=== constructors.hpp - -----------------------------------*-C++-*--/===// +//=== Implementation of tensor constructors kernels ------===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor constructors. 
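+/// Covered constructors: linear sequences (step- and affine-based), full,
+/// eye, and tril/triu, together with the factory structs used to populate
+/// the per-type dispatch vectors in tensor_py.cpp.
+///
+/// A minimal usage sketch, assuming a SYCL queue `q`, an element count `n`
+/// and a USM allocation viewed through `char *dst_p` (these names are
+/// illustrative only and are not part of this patch):
+/// \code
+///   using dpctl::tensor::kernels::constructors::full_contig_impl;
+///   sycl::event e = full_contig_impl<int>(q, n, 42, dst_p, {});
+///   e.wait();
+/// \endcode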
+//===----------------------------------------------------------------------===// + +#pragma once +#include "utils/strided_iters.hpp" +#include "utils/type_utils.hpp" +#include +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace constructors +{ + +template class linear_sequence_step_kernel; +template class linear_sequence_affine_kernel; +template class eye_kernel; + +namespace py = pybind11; + +/* =========== Unboxing Python scalar =============== */ + +template T unbox_py_scalar(py::object o) +{ + return py::cast(o); +} + +template <> sycl::half unbox_py_scalar(py::object o) +{ + float tmp = py::cast(o); + return static_cast(tmp); +} + +// Constructor to populate tensor with linear sequence defined by +// start and step data + +typedef sycl::event (*lin_space_step_fn_ptr_t)( + sycl::queue, + size_t, // num_elements + py::object start, + py::object step, + char *, // dst_data_ptr + const std::vector &); + +template class LinearSequenceStepFunctor +{ +private: + Ty *p = nullptr; + Ty start_v; + Ty step_v; + +public: + LinearSequenceStepFunctor(char *dst_p, Ty v0, Ty dv) + : p(reinterpret_cast(dst_p)), start_v(v0), step_v(dv) + { + } + + void operator()(sycl::id<1> wiid) const + { + auto i = wiid.get(0); + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + p[i] = Ty{start_v.real() + i * step_v.real(), + start_v.imag() + i * step_v.imag()}; + } + else { + p[i] = start_v + i * step_v; + } + } +}; + +template +sycl::event lin_space_step_impl(sycl::queue exec_q, + size_t nelems, + Ty start_v, + Ty step_v, + char *array_data, + const std::vector &depends) +{ + sycl::event lin_space_step_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.parallel_for>( + sycl::range<1>{nelems}, + LinearSequenceStepFunctor(array_data, start_v, step_v)); + }); + + return lin_space_step_event; +} + +template +sycl::event lin_space_step_impl(sycl::queue exec_q, + size_t nelems, + py::object start, + py::object step, + char *array_data, + const std::vector &depends) +{ + Ty start_v; + Ty step_v; + try { + start_v = unbox_py_scalar(start); + step_v = unbox_py_scalar(step); + } catch (const py::error_already_set &e) { + throw; + } + + auto lin_space_step_event = lin_space_step_impl( + exec_q, nelems, start_v, step_v, array_data, depends); + + return lin_space_step_event; +} + +template struct LinSpaceStepFactory +{ + fnT get() + { + fnT f = lin_space_step_impl; + return f; + } +}; + +// Constructor to populate tensor with linear sequence defined by +// start and and data + +typedef sycl::event (*lin_space_affine_fn_ptr_t)( + sycl::queue, + size_t, // num_elements + py::object start, + py::object end, + bool include_endpoint, + char *, // dst_data_ptr + const std::vector &); + +template class LinearSequenceAffineFunctor +{ +private: + Ty *p = nullptr; + Ty start_v; + Ty end_v; + size_t n; + +public: + LinearSequenceAffineFunctor(char *dst_p, Ty v0, Ty v1, size_t den) + : p(reinterpret_cast(dst_p)), start_v(v0), end_v(v1), + n((den == 0) ? 
1 : den) + { + } + + void operator()(sycl::id<1> wiid) const + { + auto i = wiid.get(0); + wTy wc = wTy(i) / n; + wTy w = wTy(n - i) / n; + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + auto _w = static_cast(w); + auto _wc = static_cast(wc); + auto re_comb = start_v.real() * _w + end_v.real() * _wc; + auto im_comb = start_v.imag() * _w + end_v.imag() * _wc; + Ty affine_comb = Ty{re_comb, im_comb}; + p[i] = affine_comb; + } + else { + using dpctl::tensor::type_utils::convert_impl; + auto affine_comb = start_v * w + end_v * wc; + p[i] = convert_impl(affine_comb); + } + } +}; + +template +sycl::event lin_space_affine_impl(sycl::queue exec_q, + size_t nelems, + Ty start_v, + Ty end_v, + bool include_endpoint, + char *array_data, + const std::vector &depends) +{ + bool device_supports_doubles = exec_q.get_device().has(sycl::aspect::fp64); + sycl::event lin_space_affine_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + if (device_supports_doubles) { + cgh.parallel_for>( + sycl::range<1>{nelems}, + LinearSequenceAffineFunctor( + array_data, start_v, end_v, + (include_endpoint) ? nelems - 1 : nelems)); + } + else { + cgh.parallel_for>( + sycl::range<1>{nelems}, + LinearSequenceAffineFunctor( + array_data, start_v, end_v, + (include_endpoint) ? nelems - 1 : nelems)); + } + }); + + return lin_space_affine_event; +} + +template +sycl::event lin_space_affine_impl(sycl::queue exec_q, + size_t nelems, + py::object start, + py::object end, + bool include_endpoint, + char *array_data, + const std::vector &depends) +{ + Ty start_v, end_v; + try { + start_v = unbox_py_scalar(start); + end_v = unbox_py_scalar(end); + } catch (const py::error_already_set &e) { + throw; + } + + auto lin_space_affine_event = lin_space_affine_impl( + exec_q, nelems, start_v, end_v, include_endpoint, array_data, depends); + + return lin_space_affine_event; +} + +template struct LinSpaceAffineFactory +{ + fnT get() + { + fnT f = lin_space_affine_impl; + return f; + } +}; + +/* ================ Full ================== */ + +typedef sycl::event (*full_contig_fn_ptr_t)(sycl::queue, + size_t, + py::object, + char *, + const std::vector &); + +template +sycl::event full_contig_impl(sycl::queue q, + size_t nelems, + dstTy fill_v, + char *dst_p, + const std::vector &depends) +{ + sycl::event fill_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + dstTy *p = reinterpret_cast(dst_p); + cgh.fill(p, fill_v, nelems); + }); + + return fill_ev; +} + +template +sycl::event full_contig_impl(sycl::queue exec_q, + size_t nelems, + py::object py_value, + char *dst_p, + const std::vector &depends) +{ + dstTy fill_v; + try { + fill_v = unbox_py_scalar(py_value); + } catch (const py::error_already_set &e) { + throw; + } + + sycl::event fill_ev = + full_contig_impl(exec_q, nelems, fill_v, dst_p, depends); + + return fill_ev; +} + +template struct FullContigFactory +{ + fnT get() + { + fnT f = full_contig_impl; + return f; + } +}; + +/* ================ Eye ================== */ + +typedef sycl::event (*eye_fn_ptr_t)(sycl::queue, + size_t nelems, // num_elements + py::ssize_t start, + py::ssize_t end, + py::ssize_t step, + char *, // dst_data_ptr + const std::vector &); + +template class EyeFunctor +{ +private: + Ty *p = nullptr; + py::ssize_t start_v; + py::ssize_t end_v; + py::ssize_t step_v; + +public: + EyeFunctor(char *dst_p, + const py::ssize_t v0, + const py::ssize_t v1, + const py::ssize_t dv) + : p(reinterpret_cast(dst_p)), start_v(v0), end_v(v1), step_v(dv) + { + 
} + + void operator()(sycl::id<1> wiid) const + { + Ty set_v = 0; + py::ssize_t i = static_cast(wiid.get(0)); + if (i >= start_v and i <= end_v) { + if ((i - start_v) % step_v == 0) { + set_v = 1; + } + } + p[i] = set_v; + } +}; + +template +sycl::event eye_impl(sycl::queue exec_q, + size_t nelems, + const py::ssize_t start, + const py::ssize_t end, + const py::ssize_t step, + char *array_data, + const std::vector &depends) +{ + sycl::event eye_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.parallel_for>( + sycl::range<1>{nelems}, + EyeFunctor(array_data, start, end, step)); + }); + + return eye_event; +} + +template struct EyeFactory +{ + fnT get() + { + fnT f = eye_impl; + return f; + } +}; + +/* =========================== Tril and triu ============================== */ + +// define function type +typedef sycl::event (*tri_fn_ptr_t)(sycl::queue, + py::ssize_t, // inner_range //py::ssize_t + py::ssize_t, // outer_range + char *, // src_data_ptr + char *, // dst_data_ptr + py::ssize_t, // nd + py::ssize_t *, // shape_and_strides + py::ssize_t, // k + const std::vector &, + const std::vector &); + +template class tri_kernel; +template +sycl::event tri_impl(sycl::queue exec_q, + py::ssize_t inner_range, + py::ssize_t outer_range, + char *src_p, + char *dst_p, + py::ssize_t nd, + py::ssize_t *shape_and_strides, + py::ssize_t k, + const std::vector &depends, + const std::vector &additional_depends) +{ + constexpr int d2 = 2; + py::ssize_t src_s = nd; + py::ssize_t dst_s = 2 * nd; + py::ssize_t nd_1 = nd - 1; + py::ssize_t nd_2 = nd - 2; + Ty *src = reinterpret_cast(src_p); + Ty *dst = reinterpret_cast(dst_p); + + sycl::event tri_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + cgh.parallel_for>( + sycl::range<1>(inner_range * outer_range), [=](sycl::id<1> idx) { + py::ssize_t outer_gid = idx[0] / inner_range; + py::ssize_t inner_gid = idx[0] - inner_range * outer_gid; + + py::ssize_t src_inner_offset, dst_inner_offset; + bool to_copy; + + { + // py::ssize_t inner_gid = idx.get_id(0); + CIndexer_array indexer_i( + {shape_and_strides[nd_2], shape_and_strides[nd_1]}); + indexer_i.set(inner_gid); + const std::array &inner = indexer_i.get(); + src_inner_offset = + inner[0] * shape_and_strides[src_s + nd_2] + + inner[1] * shape_and_strides[src_s + nd_1]; + dst_inner_offset = + inner[0] * shape_and_strides[dst_s + nd_2] + + inner[1] * shape_and_strides[dst_s + nd_1]; + + if (l) + to_copy = (inner[0] + k >= inner[1]); + else + to_copy = (inner[0] + k <= inner[1]); + } + + py::ssize_t src_offset = 0; + py::ssize_t dst_offset = 0; + { + // py::ssize_t outer_gid = idx.get_id(1); + CIndexer_vector outer(nd - d2); + outer.get_displacement( + outer_gid, shape_and_strides, shape_and_strides + src_s, + shape_and_strides + dst_s, src_offset, dst_offset); + } + + src_offset += src_inner_offset; + dst_offset += dst_inner_offset; + + dst[dst_offset] = (to_copy) ? 
src[src_offset] : Ty(0); + }); + }); + return tri_ev; +} + +template struct TrilGenericFactory +{ + fnT get() + { + fnT f = tri_impl; + return f; + } +}; + +template struct TriuGenericFactory +{ + fnT get() + { + fnT f = tri_impl; + return f; + } +}; + +} // namespace constructors +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp new file mode 100644 index 0000000000..e0057a7402 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp @@ -0,0 +1,505 @@ +//=== copy_and_cast.hpp - Implementation of copy-and-cast kernels *-C++-*/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor copying and value casting. +//===----------------------------------------------------------------------===// + +#pragma once +#include "utils/strided_iters.hpp" +#include "utils/type_utils.hpp" +#include +#include +#include +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace copy_and_cast +{ + +namespace py = pybind11; + +template class copy_cast_generic_kernel; +template class copy_cast_from_host_kernel; +template class copy_cast_spec_kernel; +template class copy_for_reshape_generic_kernel; + +template class Caster +{ +public: + Caster() = default; + void operator()(char *src, + std::ptrdiff_t src_offset, + char *dst, + std::ptrdiff_t dst_offset) const + { + using dpctl::tensor::type_utils::convert_impl; + + srcT *src_ = reinterpret_cast(src) + src_offset; + dstT *dst_ = reinterpret_cast(dst) + dst_offset; + *dst_ = convert_impl(*src_); + } +}; + +template class GenericCopyFunctor +{ +private: + char *src_ = nullptr; + char *dst_ = nullptr; + py::ssize_t *shape_strides_ = nullptr; + int nd_ = 0; + py::ssize_t src_offset0 = 0; + py::ssize_t dst_offset0 = 0; + +public: + GenericCopyFunctor(char *src_cp, + char *dst_cp, + py::ssize_t *shape_strides, + int nd, + py::ssize_t src_offset, + py::ssize_t dst_offset) + : src_(src_cp), dst_(dst_cp), shape_strides_(shape_strides), nd_(nd), + src_offset0(src_offset), dst_offset0(dst_offset) + { + } + + void operator()(sycl::id<1> wiid) const + { + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + CIndexer_vector indxr(nd_); + indxr.get_displacement( + static_cast(wiid.get(0)), + const_cast(shape_strides_), // common shape + const_cast(shape_strides_ + + nd_), // src strides + const_cast(shape_strides_ + + 2 * nd_), // dst strides + src_offset, // modified by reference + dst_offset // modified by reference + ); + CastFnT fn{}; + fn(src_, src_offset0 + src_offset, dst_, dst_offset0 + dst_offset); + } +}; + +template class NDSpecializedCopyFunctor +{ +private: + char *src_ = nullptr; + char *dst_ = nullptr; + CIndexer_array indxr; + 
const std::array src_strides_; + const std::array dst_strides_; + static const int nd_ = nd; + py::ssize_t src_offset0 = 0; + py::ssize_t dst_offset0 = 0; + +public: + NDSpecializedCopyFunctor(char *src_cp, // USM pointer + char *dst_cp, // USM pointer + const std::array shape, + const std::array src_strides, + const std::array dst_strides, + py::ssize_t src_offset, + py::ssize_t dst_offset) + : src_(src_cp), dst_(dst_cp), indxr(shape), src_strides_(src_strides), + dst_strides_(dst_strides), src_offset0(src_offset), + dst_offset0(dst_offset) + { + } + + void operator()(sycl::id<1> wiid) const + { + py::ssize_t src_offset = 0; + py::ssize_t dst_offset = 0; + CIndexer_array local_indxr(std::move(indxr)); + + local_indxr.set(wiid.get(0)); + auto mi = local_indxr.get(); + for (int i = 0; i < nd; ++i) + src_offset += mi[i] * src_strides_[i]; + for (int i = 0; i < nd; ++i) + dst_offset += mi[i] * dst_strides_[i]; + + CastFnT fn{}; + fn(src_, src_offset0 + src_offset, dst_, dst_offset0 + dst_offset); + } +}; + +typedef sycl::event (*copy_and_cast_generic_fn_ptr_t)( + sycl::queue, + size_t, + int, + py::ssize_t *, + char *, + py::ssize_t, + char *, + py::ssize_t, + const std::vector &, + const std::vector &); + +template +sycl::event +copy_and_cast_generic_impl(sycl::queue q, + size_t nelems, + int nd, + py::ssize_t *shape_and_strides, + char *src_p, + py::ssize_t src_offset, + char *dst_p, + py::ssize_t dst_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + cgh.parallel_for>( + sycl::range<1>(nelems), + GenericCopyFunctor>( + src_p, dst_p, shape_and_strides, nd, src_offset, dst_offset)); + }); + + return copy_and_cast_ev; +} + +template struct CopyAndCastGenericFactory +{ + fnT get() + { + fnT f = copy_and_cast_generic_impl; + return f; + } +}; + +// Specialization of copy_and_cast for 1D arrays + +typedef sycl::event (*copy_and_cast_1d_fn_ptr_t)( + sycl::queue, + size_t, + const std::array, + const std::array, + const std::array, + char *, + py::ssize_t, + char *, + py::ssize_t, + const std::vector &); + +typedef sycl::event (*copy_and_cast_2d_fn_ptr_t)( + sycl::queue, + size_t, + const std::array, + const std::array, + const std::array, + char *, + py::ssize_t, + char *, + py::ssize_t, + const std::vector &); + +template +sycl::event +copy_and_cast_nd_specialized_impl(sycl::queue q, + size_t nelems, + const std::array shape, + const std::array src_strides, + const std::array dst_strides, + char *src_p, + py::ssize_t src_offset, + char *dst_p, + py::ssize_t dst_offset, + const std::vector &depends) +{ + sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.parallel_for>( + sycl::range<1>(nelems), + NDSpecializedCopyFunctor>( + src_p, dst_p, shape, src_strides, dst_strides, src_offset, + dst_offset)); + }); + + return copy_and_cast_ev; +} + +template struct CopyAndCast1DFactory +{ + fnT get() + { + fnT f = copy_and_cast_nd_specialized_impl; + return f; + } +}; + +template struct CopyAndCast2DFactory +{ + fnT get() + { + fnT f = copy_and_cast_nd_specialized_impl; + return f; + } +}; + +// ====================== Copying from host to USM + +template +class CasterForAccessor +{ +public: + CasterForAccessor() = default; + void operator()(AccessorT src, + std::ptrdiff_t src_offset, + char *dst, + std::ptrdiff_t dst_offset) const + { + using dpctl::tensor::type_utils::convert_impl; + + dstT *dst_ 
= reinterpret_cast(dst) + dst_offset; + *dst_ = convert_impl(src[src_offset]); + } +}; + +template class GenericCopyFromHostFunctor +{ +private: + AccessorT src_acc_; + char *dst_ = nullptr; + py::ssize_t *shape_strides_ = nullptr; + int nd_ = 0; + py::ssize_t src_offset0 = 0; + py::ssize_t dst_offset0 = 0; + +public: + GenericCopyFromHostFunctor(AccessorT src_acc, + char *dst_cp, + py::ssize_t *shape_strides, + int nd, + py::ssize_t src_offset, + py::ssize_t dst_offset) + : src_acc_(src_acc), dst_(dst_cp), shape_strides_(shape_strides), + nd_(nd), src_offset0(src_offset), dst_offset0(dst_offset) + { + } + + void operator()(sycl::id<1> wiid) const + { + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + CIndexer_vector indxr(nd_); + indxr.get_displacement( + static_cast(wiid.get(0)), + const_cast(shape_strides_), // common shape + const_cast(shape_strides_ + + nd_), // src strides + const_cast(shape_strides_ + + 2 * nd_), // dst strides + src_offset, // modified by reference + dst_offset // modified by reference + ); + CastFnT fn{}; + fn(src_acc_, src_offset0 + src_offset, dst_, dst_offset0 + dst_offset); + } +}; + +typedef void (*copy_and_cast_from_host_blocking_fn_ptr_t)( + sycl::queue, + size_t, + int, + py::ssize_t *, + const char *, + py::ssize_t, + py::ssize_t, + py::ssize_t, + char *, + py::ssize_t, + const std::vector &, + const std::vector &); + +template +void copy_and_cast_from_host_impl( + sycl::queue q, + size_t nelems, + int nd, + py::ssize_t *shape_and_strides, + const char *host_src_p, + py::ssize_t src_offset, + py::ssize_t src_min_nelem_offset, + py::ssize_t src_max_nelem_offset, + char *dst_p, + py::ssize_t dst_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + py::ssize_t nelems_range = src_max_nelem_offset - src_min_nelem_offset + 1; + sycl::buffer npy_buf( + reinterpret_cast(host_src_p) + src_min_nelem_offset, + sycl::range<1>(nelems_range), {sycl::property::buffer::use_host_ptr{}}); + + sycl::event copy_and_cast_from_host_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + sycl::accessor npy_acc(npy_buf, cgh, sycl::read_only); + + cgh.parallel_for>( + sycl::range<1>(nelems), + GenericCopyFromHostFunctor< + CasterForAccessor, + decltype(npy_acc)>(npy_acc, dst_p, shape_and_strides, nd, + src_offset - src_min_nelem_offset, + dst_offset)); + }); + + copy_and_cast_from_host_ev.wait_and_throw(); + + return; +} + +template +struct CopyAndCastFromHostFactory +{ + fnT get() + { + fnT f = copy_and_cast_from_host_impl; + return f; + } +}; + +// =============== Copying for reshape ================== // + +template class GenericCopyForReshapeFunctor +{ +private: + py::ssize_t offset = 0; + py::ssize_t size = 1; + int src_nd = -1; + int dst_nd = -1; + // USM array of size 2*(src_nd + dst_nd) + // [ src_shape; src_strides; dst_shape; dst_strides ] + const py::ssize_t *src_dst_shapes_and_strides = nullptr; + Ty *src_p = nullptr; + Ty *dst_p = nullptr; + +public: + GenericCopyForReshapeFunctor(py::ssize_t shift, + py::ssize_t nelems, + int src_ndim, + int dst_ndim, + const py::ssize_t *packed_shapes_and_strides, + char *src_ptr, + char *dst_ptr) + : offset(shift), size(nelems), src_nd(src_ndim), dst_nd(dst_ndim), + src_dst_shapes_and_strides(packed_shapes_and_strides), + src_p(reinterpret_cast(src_ptr)), + dst_p(reinterpret_cast(dst_ptr)) + { + } + + void operator()(sycl::id<1> wiid) const + { + py::ssize_t this_src_offset(0); + CIndexer_vector src_indxr(src_nd); + + 
src_indxr.get_displacement( + static_cast(wiid.get(0)), + const_cast( + src_dst_shapes_and_strides), // src shape + const_cast(src_dst_shapes_and_strides + + src_nd), // src strides + this_src_offset // modified by reference + ); + const Ty *in = src_p + this_src_offset; + + py::ssize_t this_dst_offset(0); + CIndexer_vector dst_indxr(dst_nd); + py::ssize_t shifted_wiid = + (static_cast(wiid.get(0)) + offset) % size; + shifted_wiid = (shifted_wiid >= 0) ? shifted_wiid : shifted_wiid + size; + dst_indxr.get_displacement( + shifted_wiid, + const_cast(src_dst_shapes_and_strides + + 2 * src_nd), // dst shape + const_cast(src_dst_shapes_and_strides + + 2 * src_nd + dst_nd), // dst strides + this_dst_offset // modified by reference + ); + + Ty *out = dst_p + this_dst_offset; + *out = *in; + } +}; + +// define function type +typedef sycl::event (*copy_for_reshape_fn_ptr_t)( + sycl::queue, + py::ssize_t, // shift + size_t, // num_elements + int, + int, // src_nd, dst_nd + py::ssize_t *, // packed shapes and strides + char *, // src_data_ptr + char *, // dst_data_ptr + const std::vector &); + +template +sycl::event +copy_for_reshape_generic_impl(sycl::queue q, + py::ssize_t shift, + size_t nelems, + int src_nd, + int dst_nd, + py::ssize_t *packed_shapes_and_strides, + char *src_p, + char *dst_p, + const std::vector &depends) +{ + sycl::event copy_for_reshape_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.parallel_for>( + sycl::range<1>(nelems), + GenericCopyForReshapeFunctor(shift, nelems, src_nd, dst_nd, + packed_shapes_and_strides, src_p, + dst_p)); + }); + + return copy_for_reshape_ev; +} + +template struct CopyForReshapeGenericFactory +{ + fnT get() + { + fnT f = copy_for_reshape_generic_impl; + return f; + } +}; + +} // namespace copy_and_cast +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/utils/type_utils.hpp b/dpctl/tensor/libtensor/include/utils/type_utils.hpp new file mode 100644 index 0000000000..181ff89adc --- /dev/null +++ b/dpctl/tensor/libtensor/include/utils/type_utils.hpp @@ -0,0 +1,73 @@ +//===------ type_utils.hpp - Implementation of types utils ----*-C++-*/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions for value casting. 
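+///
+/// A brief illustration of the conversion rules implemented below (the
+/// calls are a sketch, not part of this patch):
+/// \code
+///   using dpctl::tensor::type_utils::convert_impl;
+///   // complex -> bool: true if either component is non-zero
+///   bool b = convert_impl<bool, std::complex<float>>({0.0f, 2.0f});
+///   // complex -> real: keeps the real part, i.e. r == 3.0f
+///   float r = convert_impl<float, std::complex<float>>({3.0f, -1.0f});
+/// \endcode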
+//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace dpctl +{ +namespace tensor +{ +namespace type_utils +{ + +template struct is_complex : std::false_type +{ +}; +template struct is_complex> : std::true_type +{ +}; + +template dstTy convert_impl(const srcTy &v) +{ + if constexpr (std::is_same::value) { + return v; + } + else if constexpr (std::is_same_v && is_complex::value) + { + // bool(complex_v) == (complex_v.real() != 0) && (complex_v.imag() !=0) + return (convert_impl(v.real()) || + convert_impl(v.imag())); + } + else if constexpr (is_complex::value && !is_complex::value) { + // real_t(complex_v) == real_t(complex_v.real()) + return convert_impl(v.real()); + } + else if constexpr (!std::is_integral::value && + !std::is_same::value && + std::is_integral::value && + std::is_unsigned::value) + { + // first cast to signed variant, the cast to unsigned one + using signedT = typename std::make_signed::type; + return static_cast(convert_impl(v)); + } + else { + return static_cast(v); + } +} + +} // namespace type_utils +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp index 7a6c1fcca6..0566681d0f 100644 --- a/dpctl/tensor/libtensor/source/tensor_py.cpp +++ b/dpctl/tensor/libtensor/source/tensor_py.cpp @@ -34,245 +34,24 @@ #include #include "dpctl4pybind11.hpp" +#include "kernels/constructors.hpp" +#include "kernels/copy_and_cast.hpp" #include "utils/strided_iters.hpp" #include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" namespace py = pybind11; -template class copy_cast_generic_kernel; -template class copy_cast_from_host_kernel; -template class copy_cast_spec_kernel; -template class copy_for_reshape_generic_kernel; -template class linear_sequence_step_kernel; -template class linear_sequence_affine_kernel; -template class eye_kernel; - static dpctl::tensor::detail::usm_ndarray_types array_types; namespace { -template struct is_complex : std::false_type -{ -}; -template struct is_complex> : std::true_type -{ -}; -template dstTy convert_impl(const srcTy &v) -{ - if constexpr (std::is_same::value) { - return v; - } - else if constexpr (std::is_same_v && is_complex::value) - { - // bool(complex_v) == (complex_v.real() != 0) && (complex_v.imag() !=0) - return (convert_impl(v.real()) || - convert_impl(v.imag())); - } - else if constexpr (is_complex::value && !is_complex::value) { - // real_t(complex_v) == real_t(complex_v.real()) - return convert_impl(v.real()); - } - else if constexpr (!std::is_integral::value && - !std::is_same::value && - std::is_integral::value && - std::is_unsigned::value) - { - // first cast to signed variant, the cast to unsigned one - using signedT = typename std::make_signed::type; - return static_cast(convert_impl(v)); - } - else { - return static_cast(v); - } -} - -template class Caster -{ -public: - Caster() = default; - void operator()(char *src, - std::ptrdiff_t src_offset, - char *dst, - std::ptrdiff_t dst_offset) const - { - srcT *src_ = reinterpret_cast(src) + src_offset; - dstT *dst_ = reinterpret_cast(dst) + dst_offset; - *dst_ = convert_impl(*src_); - } -}; - -template class GenericCopyFunctor -{ -private: - char *src_ = nullptr; - char *dst_ = nullptr; - py::ssize_t *shape_strides_ = nullptr; - int nd_ = 0; - py::ssize_t src_offset0 = 0; - py::ssize_t dst_offset0 = 0; - -public: - GenericCopyFunctor(char *src_cp, - char *dst_cp, - py::ssize_t *shape_strides, - int nd, - py::ssize_t 
src_offset, - py::ssize_t dst_offset) - : src_(src_cp), dst_(dst_cp), shape_strides_(shape_strides), nd_(nd), - src_offset0(src_offset), dst_offset0(dst_offset) - { - } - - void operator()(sycl::id<1> wiid) const - { - py::ssize_t src_offset(0); - py::ssize_t dst_offset(0); - CIndexer_vector indxr(nd_); - indxr.get_displacement( - static_cast(wiid.get(0)), - const_cast(shape_strides_), // common shape - const_cast(shape_strides_ + - nd_), // src strides - const_cast(shape_strides_ + - 2 * nd_), // dst strides - src_offset, // modified by reference - dst_offset // modified by reference - ); - CastFnT fn{}; - fn(src_, src_offset0 + src_offset, dst_, dst_offset0 + dst_offset); - } -}; - -template class NDSpecializedCopyFunctor -{ -private: - char *src_ = nullptr; - char *dst_ = nullptr; - CIndexer_array indxr; - const std::array src_strides_; - const std::array dst_strides_; - static const int nd_ = nd; - py::ssize_t src_offset0 = 0; - py::ssize_t dst_offset0 = 0; - -public: - NDSpecializedCopyFunctor(char *src_cp, // USM pointer - char *dst_cp, // USM pointer - const std::array shape, - const std::array src_strides, - const std::array dst_strides, - py::ssize_t src_offset, - py::ssize_t dst_offset) - : src_(src_cp), dst_(dst_cp), indxr(shape), src_strides_(src_strides), - dst_strides_(dst_strides), src_offset0(src_offset), - dst_offset0(dst_offset) - { - } - - void operator()(sycl::id<1> wiid) const - { - py::ssize_t src_offset = 0; - py::ssize_t dst_offset = 0; - CIndexer_array local_indxr(std::move(indxr)); - - local_indxr.set(wiid.get(0)); - auto mi = local_indxr.get(); - for (int i = 0; i < nd; ++i) - src_offset += mi[i] * src_strides_[i]; - for (int i = 0; i < nd; ++i) - dst_offset += mi[i] * dst_strides_[i]; - - CastFnT fn{}; - fn(src_, src_offset0 + src_offset, dst_, dst_offset0 + dst_offset); - } -}; - -typedef sycl::event (*copy_and_cast_generic_fn_ptr_t)( - sycl::queue, - size_t, - int, - py::ssize_t *, - char *, - py::ssize_t, - char *, - py::ssize_t, - const std::vector &, - const std::vector &); - -template -sycl::event -copy_and_cast_generic_impl(sycl::queue q, - size_t nelems, - int nd, - py::ssize_t *shape_and_strides, - char *src_p, - py::ssize_t src_offset, - char *dst_p, - py::ssize_t dst_offset, - const std::vector &depends, - const std::vector &additional_depends) -{ - sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.depends_on(additional_depends); - cgh.parallel_for>( - sycl::range<1>(nelems), - GenericCopyFunctor>( - src_p, dst_p, shape_and_strides, nd, src_offset, dst_offset)); - }); - - return copy_and_cast_ev; -} - -typedef sycl::event (*copy_and_cast_1d_fn_ptr_t)( - sycl::queue, - size_t, - const std::array, - const std::array, - const std::array, - char *, - py::ssize_t, - char *, - py::ssize_t, - const std::vector &); - -typedef sycl::event (*copy_and_cast_2d_fn_ptr_t)( - sycl::queue, - size_t, - const std::array, - const std::array, - const std::array, - char *, - py::ssize_t, - char *, - py::ssize_t, - const std::vector &); - -template -sycl::event -copy_and_cast_nd_specialized_impl(sycl::queue q, - size_t nelems, - const std::array shape, - const std::array src_strides, - const std::array dst_strides, - char *src_p, - py::ssize_t src_offset, - char *dst_p, - py::ssize_t dst_offset, - const std::vector &depends) -{ - sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.parallel_for>( - sycl::range<1>(nelems), - NDSpecializedCopyFunctor>( - src_p, dst_p, shape, 
src_strides, dst_strides, src_offset, - dst_offset)); - }); - - return copy_and_cast_ev; -} +using dpctl::tensor::c_contiguous_strides; +using dpctl::tensor::f_contiguous_strides; +using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_1d_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_2d_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_generic_fn_ptr_t; namespace _ns = dpctl::tensor::detail; @@ -283,67 +62,6 @@ static copy_and_cast_1d_fn_ptr_t static copy_and_cast_2d_fn_ptr_t copy_and_cast_2d_dispatch_table[_ns::num_types][_ns::num_types]; -template struct CopyAndCastGenericFactory -{ - fnT get() - { - fnT f = copy_and_cast_generic_impl; - return f; - } -}; - -template struct CopyAndCast1DFactory -{ - fnT get() - { - fnT f = copy_and_cast_nd_specialized_impl; - return f; - } -}; - -template struct CopyAndCast2DFactory -{ - fnT get() - { - fnT f = copy_and_cast_nd_specialized_impl; - return f; - } -}; - -std::vector c_contiguous_strides(int nd, - const py::ssize_t *shape, - py::ssize_t element_size = 1) -{ - if (nd > 0) { - std::vector c_strides(nd, element_size); - for (int ic = nd - 1; ic > 0;) { - py::ssize_t next_v = c_strides[ic] * shape[ic]; - c_strides[--ic] = next_v; - } - return c_strides; - } - else { - return std::vector(); - } -} - -std::vector f_contiguous_strides(int nd, - const py::ssize_t *shape, - py::ssize_t element_size = 1) -{ - if (nd > 0) { - std::vector f_strides(nd, element_size); - for (int i = 0; i < nd - 1;) { - py::ssize_t next_v = f_strides[i] * shape[i]; - f_strides[++i] = next_v; - } - return f_strides; - } - else { - return std::vector(); - } -} - using dpctl::utils::keep_args_alive; void simplify_iteration_space(int &nd, @@ -745,120 +463,14 @@ copy_usm_ndarray_into_usm_ndarray(dpctl::tensor::usm_ndarray src, copy_and_cast_generic_ev); } -/* =========================== Copy for reshape ============================== - */ - -template class GenericCopyForReshapeFunctor -{ -private: - py::ssize_t offset = 0; - py::ssize_t size = 1; - int src_nd = -1; - int dst_nd = -1; - // USM array of size 2*(src_nd + dst_nd) - // [ src_shape; src_strides; dst_shape; dst_strides ] - const py::ssize_t *src_dst_shapes_and_strides = nullptr; - Ty *src_p = nullptr; - Ty *dst_p = nullptr; - -public: - GenericCopyForReshapeFunctor(py::ssize_t shift, - py::ssize_t nelems, - int src_ndim, - int dst_ndim, - const py::ssize_t *packed_shapes_and_strides, - char *src_ptr, - char *dst_ptr) - : offset(shift), size(nelems), src_nd(src_ndim), dst_nd(dst_ndim), - src_dst_shapes_and_strides(packed_shapes_and_strides), - src_p(reinterpret_cast(src_ptr)), - dst_p(reinterpret_cast(dst_ptr)) - { - } - - void operator()(sycl::id<1> wiid) const - { - py::ssize_t this_src_offset(0); - CIndexer_vector src_indxr(src_nd); - - src_indxr.get_displacement( - static_cast(wiid.get(0)), - const_cast( - src_dst_shapes_and_strides), // src shape - const_cast(src_dst_shapes_and_strides + - src_nd), // src strides - this_src_offset // modified by reference - ); - const Ty *in = src_p + this_src_offset; - - py::ssize_t this_dst_offset(0); - CIndexer_vector dst_indxr(dst_nd); - py::ssize_t shifted_wiid = - (static_cast(wiid.get(0)) + offset) % size; - shifted_wiid = (shifted_wiid >= 0) ? 
shifted_wiid : shifted_wiid + size; - dst_indxr.get_displacement( - shifted_wiid, - const_cast(src_dst_shapes_and_strides + - 2 * src_nd), // dst shape - const_cast(src_dst_shapes_and_strides + - 2 * src_nd + dst_nd), // dst strides - this_dst_offset // modified by reference - ); - - Ty *out = dst_p + this_dst_offset; - *out = *in; - } -}; - -// define function type -typedef sycl::event (*copy_for_reshape_fn_ptr_t)( - sycl::queue, - py::ssize_t, // shift - size_t, // num_elements - int, - int, // src_nd, dst_nd - py::ssize_t *, // packed shapes and strides - char *, // src_data_ptr - char *, // dst_data_ptr - const std::vector &); - -template -sycl::event -copy_for_reshape_generic_impl(sycl::queue q, - py::ssize_t shift, - size_t nelems, - int src_nd, - int dst_nd, - py::ssize_t *packed_shapes_and_strides, - char *src_p, - char *dst_p, - const std::vector &depends) -{ - sycl::event copy_for_reshape_ev = q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.parallel_for>( - sycl::range<1>(nelems), - GenericCopyForReshapeFunctor(shift, nelems, src_nd, dst_nd, - packed_shapes_and_strides, src_p, - dst_p)); - }); +/* =========================== Copy for reshape ============================= */ - return copy_for_reshape_ev; -} +using dpctl::tensor::kernels::copy_and_cast::copy_for_reshape_fn_ptr_t; // define static vector static copy_for_reshape_fn_ptr_t copy_for_reshape_generic_dispatch_vector[_ns::num_types]; -template struct CopyForReshapeGenericFactory -{ - fnT get() - { - fnT f = copy_for_reshape_generic_impl; - return f; - } -}; - /* * Copies src into dst (same data type) of different shapes by using flat * iterations. @@ -973,7 +585,7 @@ copy_usm_ndarray_for_reshape(dpctl::tensor::usm_ndarray src, } else if (src_flags & USM_ARRAY_F_CONTIGUOUS) { const auto &src_contig_strides = - c_contiguous_strides(src_nd, src_shape); + f_contiguous_strides(src_nd, src_shape); std::copy(src_contig_strides.begin(), src_contig_strides.end(), packed_host_shapes_strides_shp->begin() + src_nd); } @@ -1056,131 +668,13 @@ copy_usm_ndarray_for_reshape(dpctl::tensor::usm_ndarray src, /* ============= Copy from numpy.ndarray to usm_ndarray ==================== */ -template -class CasterForAccessor -{ -public: - CasterForAccessor() = default; - void operator()(AccessorT src, - std::ptrdiff_t src_offset, - char *dst, - std::ptrdiff_t dst_offset) const - { - dstT *dst_ = reinterpret_cast(dst) + dst_offset; - *dst_ = convert_impl(src[src_offset]); - } -}; - -template class GenericCopyFromHostFunctor -{ -private: - AccessorT src_acc_; - char *dst_ = nullptr; - py::ssize_t *shape_strides_ = nullptr; - int nd_ = 0; - py::ssize_t src_offset0 = 0; - py::ssize_t dst_offset0 = 0; - -public: - GenericCopyFromHostFunctor(AccessorT src_acc, - char *dst_cp, - py::ssize_t *shape_strides, - int nd, - py::ssize_t src_offset, - py::ssize_t dst_offset) - : src_acc_(src_acc), dst_(dst_cp), shape_strides_(shape_strides), - nd_(nd), src_offset0(src_offset), dst_offset0(dst_offset) - { - } - - void operator()(sycl::id<1> wiid) const - { - py::ssize_t src_offset(0); - py::ssize_t dst_offset(0); - CIndexer_vector indxr(nd_); - indxr.get_displacement( - static_cast(wiid.get(0)), - const_cast(shape_strides_), // common shape - const_cast(shape_strides_ + - nd_), // src strides - const_cast(shape_strides_ + - 2 * nd_), // dst strides - src_offset, // modified by reference - dst_offset // modified by reference - ); - CastFnT fn{}; - fn(src_acc_, src_offset0 + src_offset, dst_, dst_offset0 + dst_offset); - } -}; - -typedef void 
(*copy_and_cast_from_host_blocking_fn_ptr_t)( - sycl::queue, - size_t, - int, - py::ssize_t *, - const char *, - py::ssize_t, - py::ssize_t, - py::ssize_t, - char *, - py::ssize_t, - const std::vector &, - const std::vector &); - -template -void copy_and_cast_from_host_impl( - sycl::queue q, - size_t nelems, - int nd, - py::ssize_t *shape_and_strides, - const char *host_src_p, - py::ssize_t src_offset, - py::ssize_t src_min_nelem_offset, - py::ssize_t src_max_nelem_offset, - char *dst_p, - py::ssize_t dst_offset, - const std::vector &depends, - const std::vector &additional_depends) -{ - py::ssize_t nelems_range = src_max_nelem_offset - src_min_nelem_offset + 1; - sycl::buffer npy_buf( - reinterpret_cast(host_src_p) + src_min_nelem_offset, - sycl::range<1>(nelems_range), {sycl::property::buffer::use_host_ptr{}}); - - sycl::event copy_and_cast_from_host_ev = q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.depends_on(additional_depends); - - sycl::accessor npy_acc(npy_buf, cgh, sycl::read_only); - - cgh.parallel_for>( - sycl::range<1>(nelems), - GenericCopyFromHostFunctor< - CasterForAccessor, - decltype(npy_acc)>(npy_acc, dst_p, shape_and_strides, nd, - src_offset - src_min_nelem_offset, - dst_offset)); - }); - - copy_and_cast_from_host_ev.wait_and_throw(); - - return; -} +using dpctl::tensor::kernels::copy_and_cast:: + copy_and_cast_from_host_blocking_fn_ptr_t; static copy_and_cast_from_host_blocking_fn_ptr_t copy_and_cast_from_host_blocking_dispatch_table[_ns::num_types] [_ns::num_types]; -template -struct CopyAndCastFromHostFactory -{ - fnT get() - { - fnT f = copy_and_cast_from_host_impl; - return f; - } -}; - void copy_numpy_ndarray_into_usm_ndarray( py::array npy_src, dpctl::tensor::usm_ndarray dst, @@ -1386,188 +880,17 @@ void copy_numpy_ndarray_into_usm_ndarray( return; } -/* =========== Unboxing Python scalar =============== */ - -template T unbox_py_scalar(py::object o) -{ - return py::cast(o); -} - -template <> sycl::half unbox_py_scalar(py::object o) -{ - float tmp = py::cast(o); - return static_cast(tmp); -} - /* ============= linear-sequence ==================== */ -typedef sycl::event (*lin_space_step_fn_ptr_t)( - sycl::queue, - size_t, // num_elements - py::object start, - py::object step, - char *, // dst_data_ptr - const std::vector &); +using dpctl::tensor::kernels::constructors::lin_space_step_fn_ptr_t; static lin_space_step_fn_ptr_t lin_space_step_dispatch_vector[_ns::num_types]; -template class LinearSequenceStepFunctor -{ -private: - Ty *p = nullptr; - Ty start_v; - Ty step_v; - -public: - LinearSequenceStepFunctor(char *dst_p, Ty v0, Ty dv) - : p(reinterpret_cast(dst_p)), start_v(v0), step_v(dv) - { - } - - void operator()(sycl::id<1> wiid) const - { - auto i = wiid.get(0); - if constexpr (is_complex::value) { - p[i] = Ty{start_v.real() + i * step_v.real(), - start_v.imag() + i * step_v.imag()}; - } - else { - p[i] = start_v + i * step_v; - } - } -}; - -template -sycl::event lin_space_step_impl(sycl::queue exec_q, - size_t nelems, - py::object start, - py::object step, - char *array_data, - const std::vector &depends) -{ - Ty start_v; - Ty step_v; - try { - start_v = unbox_py_scalar(start); - step_v = unbox_py_scalar(step); - } catch (const py::error_already_set &e) { - throw; - } - - sycl::event lin_space_step_event = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.parallel_for>( - sycl::range<1>{nelems}, - LinearSequenceStepFunctor(array_data, start_v, step_v)); - }); - - return lin_space_step_event; -} - -template 
struct LinSpaceStepFactory -{ - fnT get() - { - fnT f = lin_space_step_impl; - return f; - } -}; - -typedef sycl::event (*lin_space_affine_fn_ptr_t)( - sycl::queue, - size_t, // num_elements - py::object start, - py::object end, - bool include_endpoint, - char *, // dst_data_ptr - const std::vector &); +using dpctl::tensor::kernels::constructors::lin_space_affine_fn_ptr_t; static lin_space_affine_fn_ptr_t lin_space_affine_dispatch_vector[_ns::num_types]; -template class LinearSequenceAffineFunctor -{ -private: - Ty *p = nullptr; - Ty start_v; - Ty end_v; - size_t n; - -public: - LinearSequenceAffineFunctor(char *dst_p, Ty v0, Ty v1, size_t den) - : p(reinterpret_cast(dst_p)), start_v(v0), end_v(v1), - n((den == 0) ? 1 : den) - { - } - - void operator()(sycl::id<1> wiid) const - { - auto i = wiid.get(0); - wTy wc = wTy(i) / n; - wTy w = wTy(n - i) / n; - if constexpr (is_complex::value) { - auto _w = static_cast(w); - auto _wc = static_cast(wc); - auto re_comb = start_v.real() * _w + end_v.real() * _wc; - auto im_comb = start_v.imag() * _w + end_v.imag() * _wc; - Ty affine_comb = Ty{re_comb, im_comb}; - p[i] = affine_comb; - } - else { - auto affine_comb = start_v * w + end_v * wc; - p[i] = convert_impl(affine_comb); - } - } -}; - -template -sycl::event lin_space_affine_impl(sycl::queue exec_q, - size_t nelems, - py::object start, - py::object end, - bool include_endpoint, - char *array_data, - const std::vector &depends) -{ - Ty start_v, end_v; - try { - start_v = unbox_py_scalar(start); - end_v = unbox_py_scalar(end); - } catch (const py::error_already_set &e) { - throw; - } - - bool device_supports_doubles = exec_q.get_device().has(sycl::aspect::fp64); - sycl::event lin_space_affine_event = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - if (device_supports_doubles) { - cgh.parallel_for>( - sycl::range<1>{nelems}, - LinearSequenceAffineFunctor( - array_data, start_v, end_v, - (include_endpoint) ? nelems - 1 : nelems)); - } - else { - cgh.parallel_for>( - sycl::range<1>{nelems}, - LinearSequenceAffineFunctor( - array_data, start_v, end_v, - (include_endpoint) ? 
nelems - 1 : nelems)); - } - }); - - return lin_space_affine_event; -} - -template struct LinSpaceAffineFactory -{ - fnT get() - { - fnT f = lin_space_affine_impl; - return f; - } -}; - std::pair usm_ndarray_linear_sequence_step(py::object start, py::object dt, @@ -1668,46 +991,10 @@ usm_ndarray_linear_sequence_affine(py::object start, /* ================ Full ================== */ -typedef sycl::event (*full_contig_fn_ptr_t)(sycl::queue, - size_t, - py::object, - char *, - const std::vector &); +using dpctl::tensor::kernels::constructors::full_contig_fn_ptr_t; static full_contig_fn_ptr_t full_contig_dispatch_vector[_ns::num_types]; -template -sycl::event full_contig_impl(sycl::queue q, - size_t nelems, - py::object py_value, - char *dst_p, - const std::vector &depends) -{ - dstTy fill_v; - try { - fill_v = unbox_py_scalar(py_value); - } catch (const py::error_already_set &e) { - throw; - } - - sycl::event fill_ev = q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - dstTy *p = reinterpret_cast(dst_p); - cgh.fill(p, fill_v, nelems); - }); - - return fill_ev; -} - -template struct FullContigFactory -{ - fnT get() - { - fnT f = full_contig_impl; - return f; - } -}; - std::pair usm_ndarray_full(py::object py_value, dpctl::tensor::usm_ndarray dst, @@ -1758,74 +1045,10 @@ usm_ndarray_full(py::object py_value, /* ================ Eye ================== */ -typedef sycl::event (*eye_fn_ptr_t)(sycl::queue, - size_t nelems, // num_elements - py::ssize_t start, - py::ssize_t end, - py::ssize_t step, - char *, // dst_data_ptr - const std::vector &); +using dpctl::tensor::kernels::constructors::eye_fn_ptr_t; static eye_fn_ptr_t eye_dispatch_vector[_ns::num_types]; -template class EyeFunctor -{ -private: - Ty *p = nullptr; - py::ssize_t start_v; - py::ssize_t end_v; - py::ssize_t step_v; - -public: - EyeFunctor(char *dst_p, - const py::ssize_t v0, - const py::ssize_t v1, - const py::ssize_t dv) - : p(reinterpret_cast(dst_p)), start_v(v0), end_v(v1), step_v(dv) - { - } - - void operator()(sycl::id<1> wiid) const - { - Ty set_v = 0; - py::ssize_t i = static_cast(wiid.get(0)); - if (i >= start_v and i <= end_v) { - if ((i - start_v) % step_v == 0) { - set_v = 1; - } - } - p[i] = set_v; - } -}; - -template -sycl::event eye_impl(sycl::queue exec_q, - size_t nelems, - const py::ssize_t start, - const py::ssize_t end, - const py::ssize_t step, - char *array_data, - const std::vector &depends) -{ - sycl::event eye_event = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.parallel_for>( - sycl::range<1>{nelems}, - EyeFunctor(array_data, start, end, step)); - }); - - return eye_event; -} - -template struct EyeFactory -{ - fnT get() - { - fnT f = eye_impl; - return f; - } -}; - std::pair eye(py::ssize_t k, dpctl::tensor::usm_ndarray dst, @@ -1895,110 +1118,12 @@ eye(py::ssize_t k, } /* =========================== Tril and triu ============================== */ -// define function type -typedef sycl::event (*tri_fn_ptr_t)(sycl::queue, - py::ssize_t, // inner_range //py::ssize_t - py::ssize_t, // outer_range - char *, // src_data_ptr - char *, // dst_data_ptr - py::ssize_t, // nd - py::ssize_t *, // shape_and_strides - py::ssize_t, // k - const std::vector &, - const std::vector &); - -template class tri_kernel; -template -sycl::event tri_impl(sycl::queue exec_q, - py::ssize_t inner_range, - py::ssize_t outer_range, - char *src_p, - char *dst_p, - py::ssize_t nd, - py::ssize_t *shape_and_strides, - py::ssize_t k, - const std::vector &depends, - const std::vector 
&additional_depends) -{ - constexpr int d2 = 2; - py::ssize_t src_s = nd; - py::ssize_t dst_s = 2 * nd; - py::ssize_t nd_1 = nd - 1; - py::ssize_t nd_2 = nd - 2; - Ty *src = reinterpret_cast(src_p); - Ty *dst = reinterpret_cast(dst_p); - - sycl::event tri_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.depends_on(additional_depends); - cgh.parallel_for>( - sycl::range<1>(inner_range * outer_range), [=](sycl::id<1> idx) { - py::ssize_t outer_gid = idx[0] / inner_range; - py::ssize_t inner_gid = idx[0] - inner_range * outer_gid; - - py::ssize_t src_inner_offset, dst_inner_offset; - bool to_copy; - - { - // py::ssize_t inner_gid = idx.get_id(0); - CIndexer_array indexer_i( - {shape_and_strides[nd_2], shape_and_strides[nd_1]}); - indexer_i.set(inner_gid); - const std::array &inner = indexer_i.get(); - src_inner_offset = - inner[0] * shape_and_strides[src_s + nd_2] + - inner[1] * shape_and_strides[src_s + nd_1]; - dst_inner_offset = - inner[0] * shape_and_strides[dst_s + nd_2] + - inner[1] * shape_and_strides[dst_s + nd_1]; - - if (l) - to_copy = (inner[0] + k >= inner[1]); - else - to_copy = (inner[0] + k <= inner[1]); - } - - py::ssize_t src_offset = 0; - py::ssize_t dst_offset = 0; - { - // py::ssize_t outer_gid = idx.get_id(1); - CIndexer_vector outer(nd - d2); - outer.get_displacement( - outer_gid, shape_and_strides, shape_and_strides + src_s, - shape_and_strides + dst_s, src_offset, dst_offset); - } - - src_offset += src_inner_offset; - dst_offset += dst_inner_offset; - - dst[dst_offset] = (to_copy) ? src[src_offset] : Ty(0); - }); - }); - return tri_ev; -} - -static tri_fn_ptr_t tril_generic_dispatch_vector[_ns::num_types]; -template struct TrilGenericFactory -{ - fnT get() - { - fnT f = tri_impl; - return f; - } -}; +using dpctl::tensor::kernels::constructors::tri_fn_ptr_t; +static tri_fn_ptr_t tril_generic_dispatch_vector[_ns::num_types]; static tri_fn_ptr_t triu_generic_dispatch_vector[_ns::num_types]; -template struct TriuGenericFactory -{ - fnT get() - { - fnT f = tri_impl; - return f; - } -}; - std::pair tri(sycl::queue &exec_q, dpctl::tensor::usm_ndarray src, @@ -2109,10 +1234,12 @@ tri(sycl::queue &exec_q, const py::ssize_t *dst_strides_raw = dst.get_strides_raw(); if (dst_strides_raw == nullptr) { if (is_dst_c_contig) { - dst_strides = c_contiguous_strides(src_nd, src_shape); + dst_strides = + dpctl::tensor::c_contiguous_strides(src_nd, src_shape); } else if (is_dst_f_contig) { - dst_strides = f_contiguous_strides(src_nd, src_shape); + dst_strides = + dpctl::tensor::f_contiguous_strides(src_nd, src_shape); } else { throw std::runtime_error("Source array has null strides but has " @@ -2218,6 +1345,11 @@ void init_copy_and_cast_dispatch_tables(void) { using namespace dpctl::tensor::detail; + using dpctl::tensor::kernels::copy_and_cast::CopyAndCast1DFactory; + using dpctl::tensor::kernels::copy_and_cast::CopyAndCast2DFactory; + using dpctl::tensor::kernels::copy_and_cast::CopyAndCastFromHostFactory; + using dpctl::tensor::kernels::copy_and_cast::CopyAndCastGenericFactory; + DispatchTableBuilder dtb_generic; @@ -2247,6 +1379,13 @@ void init_copy_and_cast_dispatch_tables(void) void init_copy_for_reshape_dispatch_vector(void) { using namespace dpctl::tensor::detail; + using dpctl::tensor::kernels::constructors::EyeFactory; + using dpctl::tensor::kernels::constructors::FullContigFactory; + using dpctl::tensor::kernels::constructors::LinSpaceAffineFactory; + using dpctl::tensor::kernels::constructors::LinSpaceStepFactory; + using 
dpctl::tensor::kernels::constructors::TrilGenericFactory; + using dpctl::tensor::kernels::constructors::TriuGenericFactory; + using dpctl::tensor::kernels::copy_and_cast::CopyForReshapeGenericFactory; DispatchVectorBuilder From 96409598e9664397990c8411a5c2f932ad46fcdb Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sat, 8 Oct 2022 14:27:55 -0500 Subject: [PATCH 2/4] Marked non-templated function definitions inline This is to avoid multiple definitions compilation error when using multiple translation units. --- dpctl/apis/include/dpctl4pybind11.hpp | 18 ++++++++++-------- dpctl/apis/include/dpctl_capi.h | 2 +- .../libtensor/include/kernels/constructors.hpp | 2 +- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/dpctl/apis/include/dpctl4pybind11.hpp b/dpctl/apis/include/dpctl4pybind11.hpp index 56a4e56f60..8c4d7a31fd 100644 --- a/dpctl/apis/include/dpctl4pybind11.hpp +++ b/dpctl/apis/include/dpctl4pybind11.hpp @@ -372,9 +372,10 @@ class usm_memory : public py::object namespace tensor { -std::vector c_contiguous_strides(int nd, - const py::ssize_t *shape, - py::ssize_t element_size = 1) +inline std::vector +c_contiguous_strides(int nd, + const py::ssize_t *shape, + py::ssize_t element_size = 1) { if (nd > 0) { std::vector c_strides(nd, element_size); @@ -389,9 +390,10 @@ std::vector c_contiguous_strides(int nd, } } -std::vector f_contiguous_strides(int nd, - const py::ssize_t *shape, - py::ssize_t element_size = 1) +inline std::vector +f_contiguous_strides(int nd, + const py::ssize_t *shape, + py::ssize_t element_size = 1) { if (nd > 0) { std::vector f_strides(nd, element_size); @@ -406,14 +408,14 @@ std::vector f_contiguous_strides(int nd, } } -std::vector +inline std::vector c_contiguous_strides(const std::vector &shape, py::ssize_t element_size = 1) { return c_contiguous_strides(shape.size(), shape.data(), element_size); } -std::vector +inline std::vector f_contiguous_strides(const std::vector &shape, py::ssize_t element_size = 1) { diff --git a/dpctl/apis/include/dpctl_capi.h b/dpctl/apis/include/dpctl_capi.h index af4a5fd951..61b31c1e3a 100644 --- a/dpctl/apis/include/dpctl_capi.h +++ b/dpctl/apis/include/dpctl_capi.h @@ -48,7 +48,7 @@ * shared objects defining this symbols, if they call `import_dpctl()` * prior to using those symbols. 
*/ -void import_dpctl(void) +static inline void import_dpctl(void) { import_dpctl___sycl_device(); import_dpctl___sycl_context(); diff --git a/dpctl/tensor/libtensor/include/kernels/constructors.hpp b/dpctl/tensor/libtensor/include/kernels/constructors.hpp index b08eebbab3..3234112311 100644 --- a/dpctl/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl/tensor/libtensor/include/kernels/constructors.hpp @@ -52,7 +52,7 @@ template T unbox_py_scalar(py::object o) return py::cast(o); } -template <> sycl::half unbox_py_scalar(py::object o) +template <> inline sycl::half unbox_py_scalar(py::object o) { float tmp = py::cast(o); return static_cast(tmp); From 34bfc6d9bf78943b84bbbe51f6e8344a3d1f0239 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 29 Sep 2022 16:26:38 -0500 Subject: [PATCH 3/4] Use inline for functions defined in header files to avoid 'multiple definition of' compilation error messages --- dpctl/apis/include/dpctl_capi.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dpctl/apis/include/dpctl_capi.h b/dpctl/apis/include/dpctl_capi.h index 61b31c1e3a..d6c104581a 100644 --- a/dpctl/apis/include/dpctl_capi.h +++ b/dpctl/apis/include/dpctl_capi.h @@ -47,6 +47,9 @@ * C functions can use dpctl's C-API functions without linking to * shared objects defining this symbols, if they call `import_dpctl()` * prior to using those symbols. + * + * It is declared inline to allow multiple definitions in + * different translation units */ static inline void import_dpctl(void) { From d658ebcda5a4222b09308f9fae33b5252ba0ef40 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 12 Oct 2022 06:53:56 -0500 Subject: [PATCH 4/4] Added doxygen docs for kernels --- .../include/kernels/constructors.hpp | 176 +++++++++++++++++- .../include/kernels/copy_and_cast.hpp | 164 ++++++++++++++++ 2 files changed, 337 insertions(+), 3 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/constructors.hpp b/dpctl/tensor/libtensor/include/kernels/constructors.hpp index 3234112311..4023d291af 100644 --- a/dpctl/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl/tensor/libtensor/include/kernels/constructors.hpp @@ -39,6 +39,10 @@ namespace kernels namespace constructors { +/*! + @defgroup CtorKernels + */ + template class linear_sequence_step_kernel; template class linear_sequence_affine_kernel; template class eye_kernel; @@ -47,6 +51,10 @@ namespace py = pybind11; /* =========== Unboxing Python scalar =============== */ +/*! + * @brief Cast pybind11 class managing Python object to specified type `T`. + * @defgroup CtorKernels + */ template T unbox_py_scalar(py::object o) { return py::cast(o); @@ -96,6 +104,23 @@ template class LinearSequenceStepFunctor } }; +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by typed starting value and + * increment. + * + * @param q Sycl queue to which the kernel is submitted + * @param nelems Length of the sequence + * @param start_v Typed starting value of the sequence + * @param step_v Typed increment of the sequence + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
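+ *
+ * An illustrative call (the queue `q`, length `n` and USM pointer `dst_p`
+ * below are assumptions made for this sketch, not part of this patch):
+ * @code
+ *   sycl::event e = lin_space_step_impl<float>(q, n, 0.0f, 0.5f, dst_p, {});
+ * @endcode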
+ * @defgroup CtorKernels + */ template sycl::event lin_space_step_impl(sycl::queue exec_q, size_t nelems, @@ -114,6 +139,25 @@ sycl::event lin_space_step_impl(sycl::queue exec_q, return lin_space_step_event; } +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by starting value and increment + * given as Python objects. + * + * @param q Sycl queue to which the kernel is submitted + * @param nelems Length of the sequence + * @param start Starting value of the sequence as Python object. Must be + * convertible to array element data type `Ty`. + * @param step Increment of the sequence as Python object. Must be convertible + * to array element data type `Ty`. + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ template sycl::event lin_space_step_impl(sycl::queue exec_q, size_t nelems, @@ -137,6 +181,11 @@ sycl::event lin_space_step_impl(sycl::queue exec_q, return lin_space_step_event; } +/*! + * @brief Factor to get function pointer of type `fnT` for array with elements + * of type `Ty`. + * @defgroup CtorKernels + */ template struct LinSpaceStepFactory { fnT get() @@ -195,6 +244,23 @@ template class LinearSequenceAffineFunctor } }; +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by typed starting and end values. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence. + * @param start_v Stating value of the sequence. + * @param end_v End-value of the sequence. + * @param include_endpoint Whether the end-value is included in the sequence. + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ template sycl::event lin_space_affine_impl(sycl::queue exec_q, size_t nelems, @@ -226,6 +292,26 @@ sycl::event lin_space_affine_impl(sycl::queue exec_q, return lin_space_affine_event; } +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by starting and end values given + * as Python objects. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence + * @param start Stating value of the sequence as Python object. Must be + * convertible to array data element type `Ty`. + * @param end End-value of the sequence as Python object. Must be convertible + * to array data element type `Ty`. + * @param include_endpoint Whether the end-value is included in the sequence + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ template sycl::event lin_space_affine_impl(sycl::queue exec_q, size_t nelems, @@ -249,6 +335,10 @@ sycl::event lin_space_affine_impl(sycl::queue exec_q, return lin_space_affine_event; } +/*! + * @brief Factory to get function pointer of type `fnT` for array data type + * `Ty`. 
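The *Factory structs in this header all share one shape: a get() member returning a function pointer instantiated for a concrete element type, which callers collect into dispatch tables indexed by a runtime type id. The sketch below illustrates that pattern with hypothetical names (FillOnesFactory, fill_ones); it is not the dpctl dispatch machinery itself.

#include <cstddef>
#include <iostream>

using fill_fn_ptr_t = void (*)(void *, std::size_t);

// Templated implementation the factory hands out as a plain function pointer.
template <typename Ty> void fill_ones(void *p, std::size_t n)
{
    Ty *tp = static_cast<Ty *>(p);
    for (std::size_t i = 0; i < n; ++i)
        tp[i] = Ty(1);
}

// Factory: get() returns the function pointer specialized for Ty.
template <typename fnT, typename Ty> struct FillOnesFactory
{
    fnT get()
    {
        fnT f = fill_ones<Ty>;
        return f;
    }
};

int main()
{
    // Dispatch table indexed by a runtime type id (0 -> float, 1 -> double).
    fill_fn_ptr_t table[2] = {
        FillOnesFactory<fill_fn_ptr_t, float>{}.get(),
        FillOnesFactory<fill_fn_ptr_t, double>{}.get()};

    double buf[4] = {0.0, 0.0, 0.0, 0.0};
    table[1](buf, 4); // runtime selection of the double instantiation
    std::cout << buf[0] << "\n"; // prints 1
    return 0;
}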
+ */ template struct LinSpaceAffineFactory { fnT get() @@ -266,6 +356,21 @@ typedef sycl::event (*full_contig_fn_ptr_t)(sycl::queue, char *, const std::vector &); +/*! + * @brief Function to submit kernel to fill given contiguous memory allocation + * with specified value. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence + * @param fill_v Value to fill the array with + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ template sycl::event full_contig_impl(sycl::queue q, size_t nelems, @@ -282,6 +387,22 @@ sycl::event full_contig_impl(sycl::queue q, return fill_ev; } +/*! + * @brief Function to submit kernel to fill given contiguous memory allocation + * with specified value. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence + * @param py_value Python object representing the value to fill the array with. + * Must be convertible to `dstTy`. + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ template sycl::event full_contig_impl(sycl::queue exec_q, size_t nelems, @@ -351,6 +472,21 @@ template class EyeFunctor } }; +/*! + * @brief Function to populate 2D array with eye matrix. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Number of elements to assign. + * @param start Position of the first non-zero value. + * @param end Position of the last non-zero value. + * @param step Number of array elements between non-zeros. + * @param array_data Kernel accessible USM pointer for the destination array. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ template sycl::event eye_impl(sycl::queue exec_q, size_t nelems, @@ -370,6 +506,10 @@ sycl::event eye_impl(sycl::queue exec_q, return eye_event; } +/*! + * @brief Factory to get function pointer of type `fnT` for data type `Ty`. + * @ingroup CtorKernels + */ template struct EyeFactory { fnT get() @@ -393,8 +533,30 @@ typedef sycl::event (*tri_fn_ptr_t)(sycl::queue, const std::vector &, const std::vector &); +/*! + * @brief Function to copy triangular matrices from source stack to destination + * stack. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param inner_range Number of elements in each matrix. + * @param outer_range Number of matrices to copy. + * @param src_p Kernel accessible USM pointer for the source array. + * @param dst_p Kernel accessible USM pointer for the destination array. + * @param nd The array dimensionality of source and destination arrays. + * @param shape_and_strides Kernel accessible USM pointer to packed shape and + * strides of arrays. + * @param k Position of the diagonal above/below which to copy filling the rest + * with zero elements. + * @param depends List of events to wait for before starting computations, if + * any. 
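The copy-or-zero decision in tri_impl boils down to comparing the intra-matrix row and column indices against the diagonal offset k. As a host-side illustration (plain C++, not the kernel itself), keeping element (row, col) whenever row + k >= col yields a lower-triangular mask:

#include <cstdio>

int main()
{
    const int n = 4;
    const int k = 0; // diagonal offset, as in tril/triu

    for (int row = 0; row < n; ++row) {
        for (int col = 0; col < n; ++col) {
            // keep (row, col) when it lies on or below diagonal k
            const bool keep = (row + k >= col);
            std::printf("%d ", keep ? 1 : 0);
        }
        std::printf("\n");
    }
    // Output (the mask a tril-style copy would apply):
    // 1 0 0 0
    // 1 1 0 0
    // 1 1 1 0
    // 1 1 1 1
    return 0;
}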
+ * @param additional_depends List of additional events to wait for before + * starting computations, if any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ template class tri_kernel; -template +template sycl::event tri_impl(sycl::queue exec_q, py::ssize_t inner_range, py::ssize_t outer_range, @@ -417,7 +579,7 @@ sycl::event tri_impl(sycl::queue exec_q, sycl::event tri_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); cgh.depends_on(additional_depends); - cgh.parallel_for>( + cgh.parallel_for>( sycl::range<1>(inner_range * outer_range), [=](sycl::id<1> idx) { py::ssize_t outer_gid = idx[0] / inner_range; py::ssize_t inner_gid = idx[0] - inner_range * outer_gid; @@ -438,7 +600,7 @@ sycl::event tri_impl(sycl::queue exec_q, inner[0] * shape_and_strides[dst_s + nd_2] + inner[1] * shape_and_strides[dst_s + nd_1]; - if (l) + if constexpr (upper) to_copy = (inner[0] + k >= inner[1]); else to_copy = (inner[0] + k <= inner[1]); @@ -463,6 +625,10 @@ sycl::event tri_impl(sycl::queue exec_q, return tri_ev; } +/*! + * @brief Factory to get function pointer of type `fnT` for data type `Ty`. + * @ingroup CtorKernels + */ template struct TrilGenericFactory { fnT get() @@ -472,6 +638,10 @@ template struct TrilGenericFactory } }; +/*! + * @brief Factory to get function pointer of type `fnT` for data type `Ty`. + * @ingroup CtorKernels + */ template struct TriuGenericFactory { fnT get() diff --git a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp index e0057a7402..0d5a1d21ca 100644 --- a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp +++ b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp @@ -150,6 +150,13 @@ template class NDSpecializedCopyFunctor } }; +/*! + @defgroup CopyAndCastKernels + */ + +/*! + * @brief Function pointer type for generic array cast and copying function. + */ typedef sycl::event (*copy_and_cast_generic_fn_ptr_t)( sycl::queue, size_t, @@ -162,6 +169,39 @@ typedef sycl::event (*copy_and_cast_generic_fn_ptr_t)( const std::vector &, const std::vector &); +/*! + * @brief Generic function to copy `nelems` elements from `src` usm_ndarray to + `dst` usm_ndarray while casting from `srcTy` to `dstTy`. + + Both arrays have array dimensionality specied via argument `nd`. The + `shape_and_strides` is kernel accessible USM array of length `3*nd`, where the + first `nd` elements encode common shape, second `nd` elements contain strides + of `src` array, and the trailing `nd` elements contain strides of `dst` array. + `src_p` and `dst_p` represent pointers into respective arrays, but the start of + iteration begins at offset of `src_offset` elements for `src` array and at + offset `dst_offset` elements for `dst` array. Kernel is submitted to sycl queue + `q` with events `depends` and `additional_depends` as dependencies. + + @param q Sycl queue to which the kernel is submitted. + @param nelems Number of elements to cast and copy. + @param nd Array dimensionality, i.e. number of indices needed to + identify an element of each array. + @param shape_and_strides Kernel accessible USM pointer to packed shape and + strides. + @param src_p Kernel accessible USM pointer for the source array + @param src_offset Offset to the beginning of iteration in number of + elements of source array from `src_p`. 
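The generic copy-and-cast kernels locate each element by turning a flat iteration index into a multi-index over the common shape and then into per-array offsets via the strides packed into shape_and_strides. The host-side sketch below shows that index arithmetic; flat_to_offset is a made-up helper for illustration, not the dpctl indexing utilities.

#include <cstddef>
#include <iostream>
#include <vector>

// Convert a flat iteration index into a multi-index over `shape`, then into
// an element offset using `strides` (strides in elements, not bytes).
std::ptrdiff_t flat_to_offset(std::size_t flat,
                              const std::vector<std::ptrdiff_t> &shape,
                              const std::vector<std::ptrdiff_t> &strides)
{
    std::ptrdiff_t offset = 0;
    for (std::size_t d = shape.size(); d-- > 0;) {
        const std::ptrdiff_t idx =
            static_cast<std::ptrdiff_t>(flat) % shape[d];
        offset += idx * strides[d];
        flat /= static_cast<std::size_t>(shape[d]);
    }
    return offset;
}

int main()
{
    // A 2x3 array with C-order strides {3, 1} and the same logical array
    // viewed with F-order strides {1, 2}.
    const std::vector<std::ptrdiff_t> shape{2, 3};
    const std::vector<std::ptrdiff_t> c_strides{3, 1};
    const std::vector<std::ptrdiff_t> f_strides{1, 2};

    for (std::size_t i = 0; i < 6; ++i) {
        std::cout << flat_to_offset(i, shape, c_strides) << " "
                  << flat_to_offset(i, shape, f_strides) << "\n";
    }
    return 0;
}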
+ @param dst_p Kernel accessible USM pointer for the destination array + @param dst_offset Offset to the beginning of iteration in number of + elements of destination array from `dst_p`. + @param depends List of events to wait for before starting computations, if + any. + @param additional_depends Additional list of events to wait for before + starting computations, if any. + + @return Event to wait on to ensure that computation completes. + @ingroup CopyAndCastKernels + */ template sycl::event copy_and_cast_generic_impl(sycl::queue q, @@ -187,6 +227,11 @@ copy_and_cast_generic_impl(sycl::queue q, return copy_and_cast_ev; } +/*! + * @brief Factory to get generic function pointer of type `fnT` for given source + * data type `S` and destination data type `D`. + * @ingroup CopyAndCastKernels + */ template struct CopyAndCastGenericFactory { fnT get() @@ -198,6 +243,10 @@ template struct CopyAndCastGenericFactory // Specialization of copy_and_cast for 1D arrays +/*! + * @brief Factory to get function pointer for casting and copying 1D arrays. + * @ingroup CopyAndCastKernels + */ typedef sycl::event (*copy_and_cast_1d_fn_ptr_t)( sycl::queue, size_t, @@ -210,6 +259,10 @@ typedef sycl::event (*copy_and_cast_1d_fn_ptr_t)( py::ssize_t, const std::vector &); +/*! + * @brief Factory to get function pointer for casting and copying 2D arrays. + * @ingroup CopyAndCastKernels + */ typedef sycl::event (*copy_and_cast_2d_fn_ptr_t)( sycl::queue, size_t, @@ -222,6 +275,35 @@ typedef sycl::event (*copy_and_cast_2d_fn_ptr_t)( py::ssize_t, const std::vector &); +/*! + * @brief Specialized for given array dimension function to copy `nelems` + elements from `src` usm_ndarray to `dst` usm_ndarray while casting from `srcTy` + to `dstTy`. + + Both arrays have array dimensionality known at compile time and specified in + template parameters `nd`. Arrays' shape and strides are provided as + `std::array`. `src_p` and `dst_p` represent pointers into respective arrays, + but the start of iteration begins at offset of `src_offset` elements for `src` + array and at offset `dst_offset` elements for `dst` array. Kernel is submitted + to sycl queue `q` with events `depends` as dependencies. + + @param q The queue where the routine should be executed. + @param nelems Number of elements to cast and copy. + @param shape Common shape of the arrays. + @param src_strides Strides of the source array. + @param dst_strides Strides of the destination array. + @param src_p Kernel accessible USM pointer for the source array + @param src_offset Offset to the beginning of iteration in number of elements + of the source array from `src_p`. + @param dst_p Kernel accessible USM pointer for the destination array + @param dst_offset Offset to the beginning of iteration in number of elements + of the destination array from `src_p`. + @param depends List of events to wait for before starting computations, if + any. + + @return Event to wait on to ensure that computation completes. + * @ingroup CopyAndCastKernels + */ template sycl::event copy_and_cast_nd_specialized_impl(sycl::queue q, @@ -247,6 +329,11 @@ copy_and_cast_nd_specialized_impl(sycl::queue q, return copy_and_cast_ev; } +/*! + * @brief Factory to get 1D-specialized function pointer of type `fnT` for given + * source data type `S` and destination data type `D`. + * @ingroup CopyAndCastKernels + */ template struct CopyAndCast1DFactory { fnT get() @@ -256,6 +343,11 @@ template struct CopyAndCast1DFactory } }; +/*! 
+ * @brief Factory to get 2D-specialized function pointer of type `fnT` for given + * source data type `S` and destination data type `D`. + * @ingroup CopyAndCastKernels + */ template struct CopyAndCast2DFactory { fnT get() @@ -340,6 +432,44 @@ typedef void (*copy_and_cast_from_host_blocking_fn_ptr_t)( const std::vector &, const std::vector &); +/*! + * @brief Function to copy from NumPy's ndarray with elements of type `srcTy` + * into usm_ndarray with elements of type `srcTy`. + * + * Function to cast and copy elements from numpy.ndarray specified by typeless + * `host_src_p` and the `src_offset` given in the number of array elements. + * Arrays' metadata are given in packed USM vector of length `3*nd` whose first + * `nd` elements contain arrays' shape, next `nd` elements specify source + * strides in elements (not bytes), and trailing `nd` elements specify + * destination array strides. Kernel dependencies are given by two vectors of + * events: `depends` and `additional_depends`. The function execution is + * complete at the return. + * + * @param q The queue where the routine should be executed. + * @param nelems Number of elements to cast and copy. + * @param nd The dimensionality of arrays + * @param shape_and_strides Kernel accessible USM pointer to packed shape and + * strides. + * @param host_src_p Host (not USM allocated) pointer associated with the + * source array. + * @param src_offset Offset to the beginning of iteration in number of elements + * of the source array from `host_src_p`. + * @param src_min_nelem_offset Smallest value of offset relative to + * `host_src_p` in number of elements attained while iterating over elements of + * the source array. + * @param src_max_nelem_offset Largest value of offset relative to `host_src_p` + * in number of elements attained while iterating over elements of the source + * array. + * @param dst_p USM pointer associated with the destination array. + * @param dst_offset Offset to the beginning of iteration in number of elements + * of the destination array from `dst_p`. + * @param depends List of events to wait for before starting computations, if + * any. + * @param additional_depends List of additional events to wait for before + * starting computations, if any. + * + * @ingroup CopyAndCastKernels + */ template void copy_and_cast_from_host_impl( sycl::queue q, @@ -375,11 +505,18 @@ void copy_and_cast_from_host_impl( dst_offset)); }); + // perform explicit synchronization. Implicit synchronization would be + // performed by sycl::buffer destructor. copy_and_cast_from_host_ev.wait_and_throw(); return; } +/*! + * @brief Factory to get function pointer of type `fnT` for given NumPy array + * source data type `S` and destination data type `D`. + * @defgroup CopyAndCastKernels + */ template struct CopyAndCastFromHostFactory { @@ -466,6 +603,28 @@ typedef sycl::event (*copy_for_reshape_fn_ptr_t)( char *, // dst_data_ptr const std::vector &); +/*! + * @brief Function to copy content of array while reshaping. + * + * Submits a kernel to perform a copy `dst[unravel_index((i + shift) % nelems , + * dst.shape)] = src[unravel_undex(i, src.shape)]`. + * + * @param q The execution queue where kernel is submitted. + * @param shift The shift in flat indexing. 
+ * @param nelems The number of elements to copy
+ * @param src_nd Array dimension of the source array
+ * @param dst_nd Array dimension of the destination array
+ * @param packed_shapes_and_strides Kernel accessible USM array of size
+ * `2*src_nd + 2*dst_nd` with content `[src_shape, src_strides, dst_shape,
+ * dst_strides]`.
+ * @param src_p Typeless USM pointer to the buffer of the source array
+ * @param dst_p Typeless USM pointer to the buffer of the destination array
+ * @param depends List of events to wait for before starting computations, if
+ * any.
+ *
+ * @return Event to wait on to ensure that computation completes.
+ * @ingroup CopyAndCastKernels
+ */
 template 
 sycl::event
 copy_for_reshape_generic_impl(sycl::queue q,
@@ -490,6 +649,11 @@ copy_for_reshape_generic_impl(sycl::queue q,
     return copy_for_reshape_ev;
 }
 
+/*!
+ * @brief Factory to get function pointer of type `fnT` for given array data
+ * type `Ty`.
+ * @ingroup CopyAndCastKernels
+ */
 template  struct CopyForReshapeGenericFactory
 {
     fnT get()
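To make the copy rule quoted above, dst[unravel_index((i + shift) % nelems, dst.shape)] = src[unravel_index(i, src.shape)], concrete: when both source and destination are C-contiguous, the unravel/ravel round-trip is the identity, so the kernel reduces to a flat copy with a cyclic shift. A small host-side sketch (plain C++, not the SYCL kernel):

#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    // Reshape a 2x3 "array" into 3x2 with a flat shift, following
    // dst[(i + shift) % nelems] = src[i] in row-major (flat) order.
    const std::vector<int> src{0, 1, 2, 3, 4, 5}; // logical shape (2, 3)
    const std::size_t nelems = src.size();
    const std::size_t shift = 2;

    std::vector<int> dst(nelems); // logical shape (3, 2)
    for (std::size_t i = 0; i < nelems; ++i) {
        dst[(i + shift) % nelems] = src[i];
    }

    for (int v : dst)
        std::cout << v << " "; // 4 5 0 1 2 3
    std::cout << "\n";
    return 0;
}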