diff --git a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp
index 546d5bf2b6..02b7ac3c2d 100644
--- a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp
@@ -46,51 +46,52 @@ using namespace dpctl::tensor::offset_utils;
 
 template <typename srcT, typename dstT, typename IndexerT>
 class copy_cast_generic_kernel;
+
+template <typename srcT, typename dstT, int vec_sz, int n_vecs>
+class copy_cast_contig_kernel;
+
 template <typename srcT, typename dstT, typename IndexerT>
 class copy_cast_from_host_kernel;
-// template <typename srcT, typename dstT, int nd>
-// class copy_cast_spec_kernel;
+
 template <typename Ty, typename IndexerT>
 class copy_for_reshape_generic_kernel;
 
-template <typename srcT, typename dstT> class Caster
+template <typename srcTy, typename dstTy> class Caster
 {
 public:
     Caster() = default;
-    void operator()(const char *src,
-                    std::ptrdiff_t src_offset,
-                    char *dst,
-                    std::ptrdiff_t dst_offset) const
+    dstTy operator()(const srcTy &src) const
     {
         using dpctl::tensor::type_utils::convert_impl;
-
-        const srcT *src_ = reinterpret_cast<const srcT *>(src) + src_offset;
-        dstT *dst_ = reinterpret_cast<dstT *>(dst) + dst_offset;
-        *dst_ = convert_impl<dstT, srcT>(*src_);
+        return convert_impl<dstTy, srcTy>(src);
     }
 };
 
-template <typename CastFnT, typename IndexerT> class GenericCopyFunctor
+template <typename srcT, typename dstT, typename CastFnT, typename IndexerT>
+class GenericCopyFunctor
 {
 private:
-    const char *src_ = nullptr;
-    char *dst_ = nullptr;
+    const srcT *src_ = nullptr;
+    dstT *dst_ = nullptr;
     IndexerT indexer_;
 
 public:
-    GenericCopyFunctor(const char *src_cp, char *dst_cp, IndexerT indexer)
-        : src_(src_cp), dst_(dst_cp), indexer_(indexer)
+    GenericCopyFunctor(const srcT *src_p, dstT *dst_p, IndexerT indexer)
+        : src_(src_p), dst_(dst_p), indexer_(indexer)
     {
     }
 
     void operator()(sycl::id<1> wiid) const
     {
-        auto offsets = indexer_(static_cast<py::ssize_t>(wiid.get(0)));
-        py::ssize_t src_offset = offsets.get_first_offset();
-        py::ssize_t dst_offset = offsets.get_second_offset();
+        const auto &offsets = indexer_(static_cast<py::ssize_t>(wiid.get(0)));
+        const py::ssize_t &src_offset = offsets.get_first_offset();
+        const py::ssize_t &dst_offset = offsets.get_second_offset();
 
         CastFnT fn{};
-        fn(src_, src_offset, dst_, dst_offset);
+        dst_[dst_offset] = fn(src_[src_offset]);
     }
 };
 
@@ -168,12 +169,15 @@ copy_and_cast_generic_impl(sycl::queue q,
         TwoOffsets_StridedIndexer indexer{nd, src_offset, dst_offset,
                                           shape_and_strides};
 
+        const srcTy *src_tp = reinterpret_cast<const srcTy *>(src_p);
+        dstTy *dst_tp = reinterpret_cast<dstTy *>(dst_p);
+
         cgh.parallel_for<class copy_cast_generic_kernel<
             srcTy, dstTy, TwoOffsets_StridedIndexer>>(
             sycl::range<1>(nelems),
-            GenericCopyFunctor<Caster<srcTy, dstTy>, TwoOffsets_StridedIndexer>(
-                src_p, dst_p, indexer));
+            GenericCopyFunctor<srcTy, dstTy, Caster<srcTy, dstTy>,
+                               TwoOffsets_StridedIndexer>(src_tp, dst_tp,
+                                                          indexer));
     });
 
     return copy_and_cast_ev;
@@ -193,6 +197,160 @@ template <typename fnT, typename D, typename S> struct CopyAndCastGenericFactory
     }
 };
 
+// Specialization of copy_and_cast for contiguous arrays
+
+template <typename srcT,
+          typename dstT,
+          typename CastFnT,
+          int vec_sz = 4,
+          int n_vecs = 2>
+class ContigCopyFunctor
+{
+private:
+    const size_t nelems;
+    const srcT *src_p = nullptr;
+    dstT *dst_p = nullptr;
+
+public:
+    ContigCopyFunctor(const size_t nelems_, const srcT *src_p_, dstT *dst_p_)
+        : nelems(nelems_), src_p(src_p_), dst_p(dst_p_)
+    {
+    }
+
+    void operator()(sycl::nd_item<1> ndit) const
+    {
+        CastFnT fn{};
+
+        using dpctl::tensor::type_utils::is_complex;
+        if constexpr (is_complex<srcT>::value || is_complex<dstT>::value) {
+            std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0];
+            size_t base = ndit.get_global_linear_id();
+
+            base = (base / sgSize) * sgSize * n_vecs * vec_sz +
+                   (base % sgSize);
+            for (size_t offset = base;
+                 offset < std::min(nelems, base + sgSize * (n_vecs * vec_sz));
+                 offset += sgSize)
+            {
+                dst_p[offset] = fn(src_p[offset]);
+            }
+        }
+        else {
+            auto sg = ndit.get_sub_group();
+            std::uint8_t sgSize = sg.get_local_range()[0];
+            std::uint8_t max_sgSize = sg.get_max_local_range()[0];
+            size_t base =
+                n_vecs * vec_sz *
+                (ndit.get_group(0) * ndit.get_local_range(0) +
+                 sg.get_group_id()[0] * max_sgSize);
+
+            if (base + n_vecs * vec_sz * sgSize < nelems &&
+                sgSize == max_sgSize) {
+                using src_ptrT =
+                    sycl::multi_ptr<const srcT,
+                                    sycl::access::address_space::global_space>;
+                using dst_ptrT =
+                    sycl::multi_ptr<dstT,
+                                    sycl::access::address_space::global_space>;
+                sycl::vec<srcT, vec_sz> src_vec;
+                sycl::vec<dstT, vec_sz> dst_vec;
+
+#pragma unroll
+                for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
+                    src_vec =
+                        sg.load<vec_sz>(src_ptrT(&src_p[base + it * sgSize]));
+#pragma unroll
+                    for (std::uint8_t k = 0; k < vec_sz; k++) {
+                        dst_vec[k] = fn(src_vec[k]);
+                    }
+                    sg.store<vec_sz>(dst_ptrT(&dst_p[base + it * sgSize]),
+                                     dst_vec);
+                }
+            }
+            else {
+                for (size_t k = base + sg.get_local_id()[0]; k < nelems;
+                     k += sgSize) {
+                    dst_p[k] = fn(src_p[k]);
+                }
+            }
+        }
+    }
+};
+
+/*!
+ * @brief Function pointer type for contiguous array cast and copy function.
+ */
+typedef sycl::event (*copy_and_cast_contig_fn_ptr_t)(
+    sycl::queue,
+    size_t,
+    const char *,
+    char *,
+    const std::vector<sycl::event> &);
+
+/*!
+ * @brief Function to copy `nelems` elements from contiguous `src` usm_ndarray
+ to contiguous `dst` usm_ndarray while casting from `srcTy` to `dstTy`.
+
+ Both arrays have the same number of elements `nelems`.
+ `src_cp` and `dst_cp` represent char pointers to the start of respective
+ arrays. Kernel is submitted to sycl queue `q` with events `depends` as
+ dependencies.
+
+ @param q       Sycl queue to which the kernel is submitted.
+ @param nelems  Number of elements to cast and copy.
+ @param src_cp  Kernel accessible USM pointer for the source array
+ @param dst_cp  Kernel accessible USM pointer for the destination array
+ @param depends List of events to wait for before starting computations, if
+ any.
+
+ @return Event to wait on to ensure that computation completes.
+ @ingroup CopyAndCastKernels
+ */
+template <typename srcTy, typename dstTy>
+sycl::event copy_and_cast_contig_impl(sycl::queue q,
+                                      size_t nelems,
+                                      const char *src_cp,
+                                      char *dst_cp,
+                                      const std::vector<sycl::event> &depends)
+{
+    dpctl::tensor::type_utils::validate_type_for_device<srcTy>(q);
+    dpctl::tensor::type_utils::validate_type_for_device<dstTy>(q);
+
+    sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        const srcTy *src_tp = reinterpret_cast<const srcTy *>(src_cp);
+        dstTy *dst_tp = reinterpret_cast<dstTy *>(dst_cp);
+
+        size_t lws = 64;
+        constexpr unsigned int vec_sz = 4;
+        constexpr unsigned int n_vecs = 2;
+        const size_t n_groups =
+            ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
+        const auto gws_range = sycl::range<1>(n_groups * lws);
+        const auto lws_range = sycl::range<1>(lws);
+
+        cgh.parallel_for<class copy_cast_contig_kernel<srcTy, dstTy, vec_sz,
+                                                       n_vecs>>(
+            sycl::nd_range<1>(gws_range, lws_range),
+            ContigCopyFunctor<srcTy, dstTy, Caster<srcTy, dstTy>, vec_sz,
+                              n_vecs>(nelems, src_tp, dst_tp));
+    });
+
+    return copy_and_cast_ev;
+}
+
+/*!
+ * @brief Factory to get specialized function pointer for casting and copying
+ * contiguous arrays.
+ * @ingroup CopyAndCastKernels
+ */
+template <typename fnT, typename D, typename S> struct CopyAndCastContigFactory
+{
+    fnT get()
+    {
+        fnT f = copy_and_cast_contig_impl<D, S>;
+        return f;
+    }
+};
+
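Reviewer note, not part of the patch: the host-only C++17 sketch below illustrates the index arithmetic used by ContigCopyFunctor's vectorized (non-complex) branch. The sub-group size, work-group index, and sub-group index are assumed example values, and none of the names here belong to dpctl; it only mirrors how `base` is formed and which span of elements one sub-group covers.

// Host-only illustration (plain C++17, no SYCL needed) of the element range
// covered by one sub-group in the vectorized branch above.
// group_id, sg_id and sg_size are assumed example values, not dpctl code.
#include <cstddef>
#include <cstdio>

int main()
{
    constexpr std::size_t vec_sz = 4;   // elements per sycl::vec load/store
    constexpr std::size_t n_vecs = 2;   // vec loads issued per work-item
    constexpr std::size_t lws = 64;     // local work-group size used above
    constexpr std::size_t sg_size = 16; // assumed sub-group size

    const std::size_t group_id = 3; // example work-group index
    const std::size_t sg_id = 1;    // example sub-group index within the group

    // Mirrors: base = n_vecs * vec_sz *
    //                 (ndit.get_group(0) * ndit.get_local_range(0) +
    //                  sg.get_group_id()[0] * max_sgSize);
    const std::size_t base =
        n_vecs * vec_sz * (group_id * lws + sg_id * sg_size);

    // Each of the n_vecs iterations moves vec_sz * sg_size contiguous
    // elements, so the sub-group covers this half-open interval:
    std::printf("sub-group covers [%zu, %zu)\n", base,
                base + n_vecs * vec_sz * sg_size);
    return 0;
}

The scalar remainder loop in the kernel handles the tail where this interval would run past nelems, or where the runtime sub-group is narrower than its maximum width.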
 // Specialization of copy_and_cast for 1D arrays
 
 /*!
@@ -276,13 +434,15 @@ copy_and_cast_nd_specialized_impl(sycl::queue q,
         using IndexerT = TwoOffsets_FixedDimStridedIndexer<nd>;
         IndexerT indexer{shape, src_strides, dst_strides, src_offset,
                          dst_offset};
+        const srcTy *src_tp = reinterpret_cast<const srcTy *>(src_p);
+        dstTy *dst_tp = reinterpret_cast<dstTy *>(dst_p);
 
         cgh.depends_on(depends);
         cgh.parallel_for<
             class copy_cast_generic_kernel<srcTy, dstTy, IndexerT>>(
             sycl::range<1>(nelems),
-            GenericCopyFunctor<Caster<srcTy, dstTy>, IndexerT>(src_p, dst_p,
-                                                               indexer));
+            GenericCopyFunctor<srcTy, dstTy, Caster<srcTy, dstTy>, IndexerT>(
+                src_tp, dst_tp, indexer));
     });
 
     return copy_and_cast_ev;
@@ -318,46 +478,33 @@ template <typename fnT, typename D, typename S> struct CopyAndCast2DFactory
 
 // ====================== Copying from host to USM
 
-template <typename srcT, typename dstT, typename AccessorT>
-class CasterForAccessor
-{
-public:
-    CasterForAccessor() = default;
-    void operator()(AccessorT src,
-                    std::ptrdiff_t src_offset,
-                    char *dst,
-                    std::ptrdiff_t dst_offset) const
-    {
-        using dpctl::tensor::type_utils::convert_impl;
-
-        dstT *dst_ = reinterpret_cast<dstT *>(dst) + dst_offset;
-        *dst_ = convert_impl<dstT, srcT>(src[src_offset]);
-    }
-};
-
-template <typename CastFnT, typename AccessorT, typename IndexerT>
+template <typename AccessorT,
+          typename dstTy,
+          typename CastFnT,
+          typename IndexerT>
 class GenericCopyFromHostFunctor
 {
 private:
     AccessorT src_acc_;
-    char *dst_ = nullptr;
+    dstTy *dst_ = nullptr;
     IndexerT indexer_;
 
 public:
     GenericCopyFromHostFunctor(AccessorT src_acc,
-                               char *dst_cp,
+                               dstTy *dst_p,
                                IndexerT indexer)
-        : src_acc_(src_acc), dst_(dst_cp), indexer_(indexer)
+        : src_acc_(src_acc), dst_(dst_p), indexer_(indexer)
     {
     }
 
     void operator()(sycl::id<1> wiid) const
     {
-        auto offsets = indexer_(static_cast<py::ssize_t>(wiid.get(0)));
-        py::ssize_t src_offset = offsets.get_first_offset();
-        py::ssize_t dst_offset = offsets.get_second_offset();
+        const auto &offsets = indexer_(static_cast<py::ssize_t>(wiid.get(0)));
+        const py::ssize_t &src_offset = offsets.get_first_offset();
+        const py::ssize_t &dst_offset = offsets.get_second_offset();
+
         CastFnT fn{};
-        fn(src_acc_, src_offset, dst_, dst_offset);
+        dst_[dst_offset] = fn(src_acc_[src_offset]);
     }
 };
 
@@ -447,13 +594,15 @@ void copy_and_cast_from_host_impl(
             nd, src_offset - src_min_nelem_offset, dst_offset,
             const_cast<py::ssize_t *>(shape_and_strides)};
 
+        dstTy *dst_tp = reinterpret_cast<dstTy *>(dst_p);
+
         cgh.parallel_for<copy_cast_from_host_kernel<srcTy, dstTy,
                                                     TwoOffsets_StridedIndexer>>(
            sycl::range<1>(nelems),
-            GenericCopyFromHostFunctor<
-                CasterForAccessor<srcTy, dstTy, decltype(npy_acc)>,
-                decltype(npy_acc), TwoOffsets_StridedIndexer>(npy_acc, dst_p,
-                                                              indexer));
+            GenericCopyFromHostFunctor<decltype(npy_acc), dstTy,
+                                       Caster<srcTy, dstTy>,
+                                       TwoOffsets_StridedIndexer>(
+                npy_acc, dst_tp, indexer));
     });
 
     // perform explicit synchronization. Implicit synchronization would be
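As a hedged aside for readers unfamiliar with the TwoOffsets_* indexers used throughout these kernels: they map a flat element id to a pair of offsets, one for the source array and one for the destination. The sketch below is a plain C++17 approximation of that contract; the struct and function names are made up for illustration and are not dpctl's implementation.

// Host-only sketch of mapping a flat id to (src_offset, dst_offset) using a
// shared shape and per-array strides, mirroring what get_first_offset() and
// get_second_offset() provide.  Hypothetical names, C order, 2-D case only.
#include <array>
#include <cstddef>
#include <cstdio>

struct TwoOffsetsSketch
{
    std::ptrdiff_t src;
    std::ptrdiff_t dst;
};

TwoOffsetsSketch
flat_id_to_offsets(std::size_t flat_id,
                   const std::array<std::size_t, 2> &shape,
                   const std::array<std::ptrdiff_t, 2> &src_strides,
                   const std::array<std::ptrdiff_t, 2> &dst_strides)
{
    TwoOffsetsSketch out{0, 0};
    // unravel flat_id in C order and accumulate strided offsets
    for (int d = 1; d >= 0; --d) {
        const std::ptrdiff_t idx =
            static_cast<std::ptrdiff_t>(flat_id % shape[d]);
        flat_id /= shape[d];
        out.src += idx * src_strides[d];
        out.dst += idx * dst_strides[d];
    }
    return out;
}

int main()
{
    // 2 x 3 logical shape; C-contiguous source, F-contiguous destination
    const auto off = flat_id_to_offsets(4, {2, 3}, {3, 1}, {1, 2});
    std::printf("src_offset=%td dst_offset=%td\n", off.src, off.dst);
    return 0;
}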
diff --git a/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp b/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp
index f1f7f7eb0f..72272ff356 100644
--- a/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp
+++ b/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp
@@ -52,15 +52,15 @@ namespace py_internal
 namespace _ns = dpctl::tensor::detail;
 
 using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_1d_fn_ptr_t;
-using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_2d_fn_ptr_t;
+using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_contig_fn_ptr_t;
 using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_generic_fn_ptr_t;
 
 static copy_and_cast_generic_fn_ptr_t
     copy_and_cast_generic_dispatch_table[_ns::num_types][_ns::num_types];
 static copy_and_cast_1d_fn_ptr_t
     copy_and_cast_1d_dispatch_table[_ns::num_types][_ns::num_types];
-static copy_and_cast_2d_fn_ptr_t
-    copy_and_cast_2d_dispatch_table[_ns::num_types][_ns::num_types];
+static copy_and_cast_contig_fn_ptr_t
+    copy_and_cast_contig_dispatch_table[_ns::num_types][_ns::num_types];
 
 namespace py = pybind11;
 
@@ -142,25 +142,29 @@ copy_usm_ndarray_into_usm_ndarray(dpctl::tensor::usm_ndarray src,
     bool is_dst_f_contig = dst.is_f_contiguous();
 
     // check for applicability of special cases:
-    //     (same type && (both C-contiguous || both F-contiguous)
+    //     (both C-contiguous || both F-contiguous)
     bool both_c_contig = (is_src_c_contig && is_dst_c_contig);
     bool both_f_contig = (is_src_f_contig && is_dst_f_contig);
     if (both_c_contig || both_f_contig) {
+
+        sycl::event copy_ev;
         if (src_type_id == dst_type_id) {
             int src_elem_size = src.get_elemsize();
 
-            sycl::event copy_ev =
-                exec_q.memcpy(static_cast<void *>(dst_data),
-                              static_cast<const void *>(src_data),
-                              src_nelems * src_elem_size, depends);
-
-            // make sure src and dst are not GC-ed before copy_ev is complete
-            return std::make_pair(
-                keep_args_alive(exec_q, {src, dst}, {copy_ev}), copy_ev);
+            copy_ev = exec_q.memcpy(static_cast<void *>(dst_data),
+                                    static_cast<const void *>(src_data),
+                                    src_nelems * src_elem_size, depends);
         }
-        // With contract_iter2 in place, there is no need to write
-        // dedicated kernels for casting between contiguous arrays
+        else {
+            auto contig_fn =
+                copy_and_cast_contig_dispatch_table[dst_type_id][src_type_id];
+            copy_ev =
+                contig_fn(exec_q, src_nelems, src_data, dst_data, depends);
+        }
+        // make sure src and dst are not GC-ed before copy_ev is complete
+        return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}),
+                              copy_ev);
     }
 
     const py::ssize_t *src_strides = src.get_strides_raw();
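The contiguous fast path above is reached through the same [dst_type_id][src_type_id] dispatch-table lookup this file already uses for the generic and 1D kernels. The stand-alone C++17 sketch below shows that pattern with made-up type ids and a stub in place of copy_and_cast_contig_impl; it is an illustration of the lookup, not dpctl's table.

// Hedged sketch of a 2-D function-pointer dispatch table selected at runtime.
// num_types, the type ids, and contig_copy_stub are illustrative only.
#include <cstddef>
#include <cstdio>

using contig_fn_ptr_t = void (*)(std::size_t nelems);

template <typename dstTy, typename srcTy> void contig_copy_stub(std::size_t n)
{
    std::printf("would cast-copy %zu elements\n", n);
}

int main()
{
    constexpr int num_types = 2; // dpctl's real table covers all array types
    contig_fn_ptr_t table[num_types][num_types] = {
        {contig_copy_stub<float, float>, contig_copy_stub<float, int>},
        {contig_copy_stub<int, float>, contig_copy_stub<int, int>},
    };

    const int dst_type_id = 0; // e.g. float destination
    const int src_type_id = 1; // e.g. int source
    auto contig_fn = table[dst_type_id][src_type_id];
    contig_fn(1000); // mirrors contig_fn(exec_q, src_nelems, ...)
    return 0;
}

In the real code the table is filled by DispatchTableBuilder over CopyAndCastContigFactory, so each [dst][src] slot holds the copy_and_cast_contig_impl instantiation for that type pair, as the init function below shows.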
@@ -187,7 +191,7 @@ copy_usm_ndarray_into_usm_ndarray(dpctl::tensor::usm_ndarray src,
         simplified_shape, simplified_src_strides, simplified_dst_strides,
         src_offset, dst_offset);
 
-    if (nd < 3) {
+    if (nd < 2) {
         if (nd == 1) {
             std::array<py::ssize_t, 1> shape_arr = {shape[0]};
             // strides may be null
@@ -196,32 +200,28 @@ copy_usm_ndarray_into_usm_ndarray(dpctl::tensor::usm_ndarray src,
             std::array<py::ssize_t, 1> dst_strides_arr = {
                 (dst_strides ? dst_strides[0] : 1)};
 
-            auto fn = copy_and_cast_1d_dispatch_table[dst_type_id][src_type_id];
-            sycl::event copy_and_cast_1d_event = fn(
-                exec_q, src_nelems, shape_arr, src_strides_arr, dst_strides_arr,
-                src_data, src_offset, dst_data, dst_offset, depends);
-
+            sycl::event copy_and_cast_1d_event;
+            if ((src_strides_arr[0] == 1) && (dst_strides_arr[0] == 1) &&
+                (src_offset == 0) && (dst_offset == 0))
+            {
+                auto contig_fn =
+                    copy_and_cast_contig_dispatch_table[dst_type_id]
+                                                       [src_type_id];
+                copy_and_cast_1d_event =
+                    contig_fn(exec_q, src_nelems, src_data, dst_data, depends);
+            }
+            else {
+                auto fn =
+                    copy_and_cast_1d_dispatch_table[dst_type_id][src_type_id];
+                copy_and_cast_1d_event =
+                    fn(exec_q, src_nelems, shape_arr, src_strides_arr,
+                       dst_strides_arr, src_data, src_offset, dst_data,
+                       dst_offset, depends);
+            }
             return std::make_pair(
                 keep_args_alive(exec_q, {src, dst}, {copy_and_cast_1d_event}),
                 copy_and_cast_1d_event);
         }
-        else if (nd == 2) {
-            std::array<py::ssize_t, 2> shape_arr = {shape[0], shape[1]};
-            std::array<py::ssize_t, 2> src_strides_arr = {src_strides[0],
-                                                          src_strides[1]};
-            std::array<py::ssize_t, 2> dst_strides_arr = {dst_strides[0],
-                                                          dst_strides[1]};
-
-            auto fn = copy_and_cast_2d_dispatch_table[dst_type_id][src_type_id];
-
-            sycl::event copy_and_cast_2d_event = fn(
-                exec_q, src_nelems, shape_arr, src_strides_arr, dst_strides_arr,
-                src_data, src_offset, dst_data, dst_offset, depends);
-
-            return std::make_pair(
-                keep_args_alive(exec_q, {src, dst}, {copy_and_cast_2d_event}),
-                copy_and_cast_2d_event);
-        }
         else if (nd == 0) { // case of a scalar
             assert(src_nelems == 1);
             std::array<py::ssize_t, 1> shape_arr = {1};
@@ -279,6 +279,12 @@ void init_copy_and_cast_usm_to_usm_dispatch_tables(void)
 {
     using namespace dpctl::tensor::detail;
 
+    using dpctl::tensor::kernels::copy_and_cast::CopyAndCastContigFactory;
+    DispatchTableBuilder<copy_and_cast_contig_fn_ptr_t,
+                         CopyAndCastContigFactory, num_types>
+        dtb_contig;
+    dtb_contig.populate_dispatch_table(copy_and_cast_contig_dispatch_table);
+
     using dpctl::tensor::kernels::copy_and_cast::CopyAndCastGenericFactory;
     DispatchTableBuilder<copy_and_cast_generic_fn_ptr_t,
                          CopyAndCastGenericFactory, num_types>
@@ -290,12 +296,6 @@ void init_copy_and_cast_usm_to_usm_dispatch_tables(void)
                          num_types>
         dtb_1d;
     dtb_1d.populate_dispatch_table(copy_and_cast_1d_dispatch_table);
-
-    using dpctl::tensor::kernels::copy_and_cast::CopyAndCast2DFactory;
-    DispatchTableBuilder<copy_and_cast_2d_fn_ptr_t, CopyAndCast2DFactory,
-                         num_types>
-        dtb_2d;
-    dtb_2d.populate_dispatch_table(copy_and_cast_2d_dispatch_table);
 }
 
 } // namespace py_internal
diff --git a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp
index 69a1b33bef..2a32e3c1a8 100644
--- a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp
+++ b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp
@@ -141,8 +141,6 @@ void copy_numpy_ndarray_into_usm_ndarray(
             return;
         }
-
-        // With contract_iter2 in place, there is no need to write
-        // dedicated kernels for casting between contiguous arrays
     }
 
     const py::ssize_t *src_strides =