From fcc302ae8130e6da4624ecf26dfe73abe5069e4f Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Tue, 6 Sep 2022 16:04:17 -0500
Subject: [PATCH 1/8] Make sure both arrays have the same allocation queue

---
 dpctl/tests/test_usm_ndarray_ctor.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/dpctl/tests/test_usm_ndarray_ctor.py b/dpctl/tests/test_usm_ndarray_ctor.py
index c1574e8570..ef9af84e43 100644
--- a/dpctl/tests/test_usm_ndarray_ctor.py
+++ b/dpctl/tests/test_usm_ndarray_ctor.py
@@ -684,8 +684,12 @@ def test_setitem_scalar(dtype, usm_type):
 
 
 def test_setitem_errors():
-    X = dpt.usm_ndarray((4,), dtype="u1")
-    Y = dpt.usm_ndarray((4, 2), dtype="u1")
+    try:
+        q = dpctl.SyclQueue()
+    except dpctl.SyclQueueCreationError:
+        pytest.skip("Default queue could not be created")
+    X = dpt.empty((4,), dtype="u1", sycl_queue=q)
+    Y = dpt.empty((4, 2), dtype="u1", sycl_queue=q)
     with pytest.raises(ValueError):
         X[:] = Y
     with pytest.raises(ValueError):

From 3bf842dfa0e19c04cedac79e2b36ed723ca4a47b Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Tue, 6 Sep 2022 16:04:44 -0500
Subject: [PATCH 2/8] Introduced dpctl::utils::queues_are_compatible

Usage: queues_are_compatible(exec_q, {alloc_q1, alloc_q2, ...}).
Returns true if compatible, false otherwise.
---
 dpctl/apis/include/dpctl4pybind11.hpp | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/dpctl/apis/include/dpctl4pybind11.hpp b/dpctl/apis/include/dpctl4pybind11.hpp
index e6c31a4e38..48abe99152 100644
--- a/dpctl/apis/include/dpctl4pybind11.hpp
+++ b/dpctl/apis/include/dpctl4pybind11.hpp
@@ -547,6 +547,19 @@ sycl::event keep_args_alive(sycl::queue q,
     return host_task_ev;
 }
 
+template <std::size_t num>
+bool queues_are_compatible(sycl::queue exec_q,
+                           const sycl::queue (&alloc_qs)[num])
+{
+    for (std::size_t i = 0; i < num; ++i) {
+
+        if (exec_q != alloc_qs[i]) {
+            return false;
+        }
+    }
+    return true;
+}
+
 } // end namespace utils
 } // end namespace dpctl
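The utility introduced above is a guard for Python-facing entry points:
call it before submitting any kernels. A minimal sketch of the intended
call pattern follows (the entry-point function and its arguments are
illustrative, not part of the patch; only
dpctl::utils::queues_are_compatible comes from dpctl4pybind11.hpp):

    #include <CL/sycl.hpp>
    #include <stdexcept>

    #include "dpctl4pybind11.hpp"

    // Validate that the execution queue matches both allocation queues.
    // The array bound num is deduced from the braced initializer list.
    void example_entry_point(sycl::queue exec_q,
                             sycl::queue src_alloc_q,
                             sycl::queue dst_alloc_q)
    {
        if (!dpctl::utils::queues_are_compatible(exec_q,
                                                 {src_alloc_q, dst_alloc_q}))
        {
            throw std::runtime_error(
                "Execution queue is not compatible with allocation queues");
        }
        // ... safe to submit kernels to exec_q here ...
    }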
From 0390cfe003b6a87c40974f5c921769a6d3958799 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Tue, 6 Sep 2022 16:06:28 -0500
Subject: [PATCH 3/8] Apply utility queues_are_compatible

---
 dpctl/tensor/libtensor/source/tensor_py.cpp | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp
index f63cc75d57..34ccb2c91f 100644
--- a/dpctl/tensor/libtensor/source/tensor_py.cpp
+++ b/dpctl/tensor/libtensor/source/tensor_py.cpp
@@ -524,14 +524,13 @@ copy_usm_ndarray_into_usm_ndarray(dpctl::tensor::usm_ndarray src,
         }
     }
 
-    // check same contexts
+    // check compatibility of execution queue and allocation queue
     sycl::queue src_q = src.get_queue();
     sycl::queue dst_q = dst.get_queue();
 
-    sycl::context exec_ctx = exec_q.get_context();
-    if (src_q.get_context() != exec_ctx || dst_q.get_context() != exec_ctx) {
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src_q, dst_q})) {
         throw py::value_error(
-            "Execution queue context is not the same as allocation contexts");
+            "Execution queue is not compatible with allocation queues");
     }
 
     int src_typenum = src.get_typenum();
@@ -938,10 +937,9 @@ copy_usm_ndarray_for_reshape(dpctl::tensor::usm_ndarray src,
     sycl::queue src_q = src.get_queue();
     sycl::queue dst_q = dst.get_queue();
 
-    sycl::context exec_ctx = exec_q.get_context();
-    if (src_q.get_context() != exec_ctx || dst_q.get_context() != exec_ctx) {
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src_q, dst_q})) {
         throw py::value_error(
-            "Execution queue context is not the same as allocation contexts");
+            "Execution queue is not compatible with allocation queues");
     }
 
     if (src_nelems == 1) {
@@ -1255,10 +1253,9 @@ void copy_numpy_ndarray_into_usm_ndarray(
 
     sycl::queue dst_q = dst.get_queue();
 
-    sycl::context exec_ctx = exec_q.get_context();
-    if (dst_q.get_context() != exec_ctx) {
-        throw py::value_error("Execution queue context is not the same as the "
-                              "allocation context");
+    if (!dpctl::utils::queues_are_compatible(exec_q, {dst_q})) {
+        throw py::value_error("Execution queue is not compatible with the "
+                              "allocation queue");
     }
 
     // here we assume that NumPy's type numbers agree with ours for types
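Note the semantic tightening in this patch: the old guard compared
sycl::context objects, so two distinct queues sharing one context would
pass, while queues_are_compatible compares the queues themselves. A small
self-contained illustration (whether two default-constructed queues share
a context is implementation-dependent, hence only device and queue
equality are printed):

    #include <CL/sycl.hpp>
    #include <iostream>

    int main()
    {
        sycl::queue q1;
        sycl::queue q2;

        // Both queues target the default device, but they are distinct
        // scheduling objects and compare unequal.
        std::cout << "same device: " << (q1.get_device() == q2.get_device())
                  << ", same queue: " << (q1 == q2) << "\n";
        return 0;
    }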
From f16e9327b9ac316dd2283a3f04f8d32cd2a45bb6 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Tue, 6 Sep 2022 21:09:21 -0500
Subject: [PATCH 4/8] Improved transferring of shapes/strides to device for
 usm_ndarray copy_cast kernels

---
 dpctl/tensor/libtensor/source/tensor_py.cpp | 96 +++++++--------------
 1 file changed, 32 insertions(+), 64 deletions(-)

diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp
index 34ccb2c91f..14311931cf 100644
--- a/dpctl/tensor/libtensor/source/tensor_py.cpp
+++ b/dpctl/tensor/libtensor/source/tensor_py.cpp
@@ -23,6 +23,7 @@
 //===----------------------------------------------------------------------===//
 
 #include
+#include <algorithm>
 #include
 #include
 #include
@@ -663,12 +664,6 @@ copy_usm_ndarray_into_usm_ndarray(dpctl::tensor::usm_ndarray src,
         }
     }
 
-    std::shared_ptr<shT> shp_shape = std::make_shared<shT>(simplified_shape);
-    std::shared_ptr<shT> shp_src_strides =
-        std::make_shared<shT>(simplified_src_strides);
-    std::shared_ptr<shT> shp_dst_strides =
-        std::make_shared<shT>(simplified_dst_strides);
-
     // Generic implementation
     auto copy_and_cast_fn =
         copy_and_cast_generic_dispatch_table[dst_type_id][src_type_id];
@@ -682,77 +677,50 @@ copy_usm_ndarray_into_usm_ndarray(dpctl::tensor::usm_ndarray src,
         throw std::runtime_error("Unabled to allocate device memory");
     }
 
-    sycl::event copy_shape_ev =
-        exec_q.copy<py::ssize_t>(shp_shape->data(), shape_strides, nd);
+    // create host temporary for packed shape and strides managed by shared
+    // pointer
+    std::shared_ptr<shT> shp_host_shape_strides = std::make_shared<shT>(3 * nd);
+    std::copy(simplified_shape.begin(), simplified_shape.end(),
+              shp_host_shape_strides->begin());
 
-    exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(copy_shape_ev);
-        cgh.host_task([shp_shape]() {
-            // increment shared pointer ref-count to keep it alive
-            // till copy operation completes;
-        });
-    });
-
-    sycl::event copy_src_strides_ev;
     if (src_strides == nullptr) {
-        std::shared_ptr<shT> shp_contig_src_strides =
-            std::make_shared<shT>((src_flags & USM_ARRAY_C_CONTIGUOUS)
-                                      ? c_contiguous_strides(nd, shape)
-                                      : f_contiguous_strides(nd, shape));
-        copy_src_strides_ev = exec_q.copy<py::ssize_t>(
-            shp_contig_src_strides->data(), shape_strides + nd, nd);
-        exec_q.submit([&](sycl::handler &cgh) {
-            cgh.depends_on(copy_src_strides_ev);
-            cgh.host_task([shp_contig_src_strides]() {
-                // increment shared pointer ref-count to keep it alive
-                // till copy operation completes;
-            });
-        });
+        const shT &src_contig_strides = (src_flags & USM_ARRAY_C_CONTIGUOUS)
+                                            ? c_contiguous_strides(nd, shape)
+                                            : f_contiguous_strides(nd, shape);
+        std::copy(src_contig_strides.begin(), src_contig_strides.end(),
+                  shp_host_shape_strides->begin() + nd);
     }
     else {
-        copy_src_strides_ev = exec_q.copy<py::ssize_t>(shp_src_strides->data(),
-                                                       shape_strides + nd, nd);
-        exec_q.submit([&](sycl::handler &cgh) {
-            cgh.depends_on(copy_src_strides_ev);
-            cgh.host_task([shp_src_strides]() {
-                // increment shared pointer ref-count to keep it alive
-                // till copy operation completes;
-            });
-        });
+        std::copy(simplified_src_strides.begin(), simplified_src_strides.end(),
+                  shp_host_shape_strides->begin() + nd);
     }
 
-    sycl::event copy_dst_strides_ev;
     if (dst_strides == nullptr) {
-        std::shared_ptr<shT> shp_contig_dst_strides =
-            std::make_shared<shT>((dst_flags & USM_ARRAY_C_CONTIGUOUS)
-                                      ? c_contiguous_strides(nd, shape)
-                                      : f_contiguous_strides(nd, shape));
-        copy_dst_strides_ev = exec_q.copy<py::ssize_t>(
-            shp_contig_dst_strides->data(), shape_strides + 2 * nd, nd);
-        exec_q.submit([&](sycl::handler &cgh) {
-            cgh.depends_on(copy_dst_strides_ev);
-            cgh.host_task([shp_contig_dst_strides]() {
-                // increment shared pointer ref-count to keep it alive
-                // till copy operation completes;
-            });
-        });
+        const shT &dst_contig_strides = (dst_flags & USM_ARRAY_C_CONTIGUOUS)
+                                            ? c_contiguous_strides(nd, shape)
+                                            : f_contiguous_strides(nd, shape);
+        std::copy(dst_contig_strides.begin(), dst_contig_strides.end(),
+                  shp_host_shape_strides->begin() + 2 * nd);
    }
    else {
-        copy_dst_strides_ev = exec_q.copy<py::ssize_t>(
-            shp_dst_strides->data(), shape_strides + 2 * nd, nd);
-        exec_q.submit([&](sycl::handler &cgh) {
-            cgh.depends_on(copy_dst_strides_ev);
-            cgh.host_task([shp_dst_strides]() {
-                // increment shared pointer ref-count to keep it alive
-                // till copy operation completes;
-            });
-        });
+        std::copy(simplified_dst_strides.begin(), simplified_dst_strides.end(),
+                  shp_host_shape_strides->begin() + 2 * nd);
     }
 
+    sycl::event copy_shape_ev = exec_q.copy<py::ssize_t>(
+        shp_host_shape_strides->data(), shape_strides, 3 * nd);
+
+    exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(copy_shape_ev);
+        cgh.host_task([shp_host_shape_strides]() {
+            // increment shared pointer ref-count to keep it alive
+            // till copy operation completes;
+        });
+    });
+
     sycl::event copy_and_cast_generic_ev = copy_and_cast_fn(
         exec_q, src_nelems, nd, shape_strides, src_data, src_offset, dst_data,
-        dst_offset, depends,
-        {copy_shape_ev, copy_src_strides_ev, copy_dst_strides_ev});
+        dst_offset, depends, {copy_shape_ev});
 
     // async free of shape_strides temporary
     auto ctx = exec_q.get_context();
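The pattern this patch introduces — pack all metadata into one host
vector, issue a single queue::copy, then submit a host_task whose lambda
captures the shared_ptr so the host buffer stays alive until the
asynchronous copy completes — distilled into a self-contained sketch (the
function and type names are illustrative; the patch itself operates on
py::ssize_t metadata inside tensor_py.cpp):

    #include <CL/sycl.hpp>

    #include <algorithm>
    #include <cstdint>
    #include <memory>
    #include <vector>

    using vecT = std::vector<std::int64_t>;

    sycl::event pack_and_transfer(sycl::queue q,
                                  const vecT &shape,
                                  const vecT &src_strides,
                                  const vecT &dst_strides,
                                  std::int64_t *device_dst)
    {
        const std::size_t nd = shape.size();
        auto host_buf = std::make_shared<vecT>(3 * nd);
        std::copy(shape.begin(), shape.end(), host_buf->begin());
        std::copy(src_strides.begin(), src_strides.end(),
                  host_buf->begin() + nd);
        std::copy(dst_strides.begin(), dst_strides.end(),
                  host_buf->begin() + 2 * nd);

        // One submission instead of three: all 3*nd elements go at once.
        sycl::event copy_ev =
            q.copy<std::int64_t>(host_buf->data(), device_dst, 3 * nd);

        // The host_task captures host_buf by value; the extra shared_ptr
        // reference keeps the vector alive until the copy has completed.
        q.submit([&](sycl::handler &cgh) {
            cgh.depends_on(copy_ev);
            cgh.host_task([host_buf]() {});
        });

        return copy_ev;
    }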
From 22cdb5ac9540f57d777d829087036edcd3d3ded3 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Wed, 7 Sep 2022 08:44:42 -0500
Subject: [PATCH 5/8] Changed passing shapes/strides to kernel in
 copy_usm_ndarray_for_reshape

Instead of invoking 4 copy kernels, it is more expedient to pack the
shapes and strides on the host and use a single copy kernel, reducing
kernel submission overhead.
---
 dpctl/tensor/libtensor/source/tensor_py.cpp | 97 ++++++++++-----------
 1 file changed, 47 insertions(+), 50 deletions(-)

diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp
index 14311931cf..0fde50504f 100644
--- a/dpctl/tensor/libtensor/source/tensor_py.cpp
+++ b/dpctl/tensor/libtensor/source/tensor_py.cpp
@@ -932,6 +932,7 @@ copy_usm_ndarray_for_reshape(dpctl::tensor::usm_ndarray src,
 
     auto fn = copy_for_reshape_generic_dispatch_vector[type_id];
 
+    // packed_shapes_strides = [src_shape, src_strides, dst_shape, dst_strides]
     py::ssize_t *packed_shapes_strides =
         sycl::malloc_device<py::ssize_t>(2 * (src_nd + dst_nd), exec_q);
 
@@ -939,92 +940,88 @@ copy_usm_ndarray_for_reshape(dpctl::tensor::usm_ndarray src,
         throw std::runtime_error("Unabled to allocate device memory");
     }
 
-    sycl::event src_shape_copy_ev =
-        exec_q.copy<py::ssize_t>(src_shape, packed_shapes_strides, src_nd);
-    sycl::event dst_shape_copy_ev = exec_q.copy<py::ssize_t>(
-        dst_shape, packed_shapes_strides + 2 * src_nd, dst_nd);
+    using shT = std::vector<py::ssize_t>;
+    std::shared_ptr<shT> packed_host_shapes_strides_shp =
+        std::make_shared<shT>(2 * (src_nd + dst_nd));
+
+    std::copy(src_shape, src_shape + src_nd,
+              packed_host_shapes_strides_shp->begin());
+    std::copy(dst_shape, dst_shape + dst_nd,
+              packed_host_shapes_strides_shp->begin() + 2 * src_nd);
 
     const py::ssize_t *src_strides = src.get_strides_raw();
-    sycl::event src_strides_copy_ev;
     if (src_strides == nullptr) {
-        using shT = std::vector<py::ssize_t>;
         int src_flags = src.get_flags();
-        std::shared_ptr<shT> contig_src_strides_shp;
         if (src_flags & USM_ARRAY_C_CONTIGUOUS) {
-            contig_src_strides_shp =
-                std::make_shared<shT>(c_contiguous_strides(src_nd, src_shape));
+            const shT &src_contig_strides =
+                c_contiguous_strides(src_nd, src_shape);
+            std::copy(src_contig_strides.begin(), src_contig_strides.end(),
+                      packed_host_shapes_strides_shp->begin() + src_nd);
         }
         else if (src_flags & USM_ARRAY_F_CONTIGUOUS) {
-            contig_src_strides_shp =
-                std::make_shared<shT>(f_contiguous_strides(src_nd, src_shape));
+            const shT &src_contig_strides =
+                f_contiguous_strides(src_nd, src_shape);
+            std::copy(src_contig_strides.begin(), src_contig_strides.end(),
+                      packed_host_shapes_strides_shp->begin() + src_nd);
         }
         else {
-            sycl::event::wait({src_shape_copy_ev, dst_shape_copy_ev});
             sycl::free(packed_shapes_strides, exec_q);
             throw std::runtime_error(
                 "Invalid src array encountered: in copy_for_reshape function");
         }
-        src_strides_copy_ev =
-            exec_q.copy<py::ssize_t>(contig_src_strides_shp->data(),
-                                     packed_shapes_strides + src_nd, src_nd);
-        exec_q.submit([&](sycl::handler &cgh) {
-            cgh.depends_on(src_strides_copy_ev);
-            cgh.host_task([contig_src_strides_shp]() {
-                // Capturing shared pointer ensure it is freed after its data
-                // are copied into packed USM vector
-            });
-        });
     }
     else {
-        src_strides_copy_ev = exec_q.copy<py::ssize_t>(
-            src_strides, packed_shapes_strides + src_nd, src_nd);
+        std::copy(src_strides, src_strides + src_nd,
+                  packed_host_shapes_strides_shp->begin() + src_nd);
     }
 
     const py::ssize_t *dst_strides = dst.get_strides_raw();
-    sycl::event dst_strides_copy_ev;
     if (dst_strides == nullptr) {
-        using shT = std::vector<py::ssize_t>;
         int dst_flags = dst.get_flags();
-        std::shared_ptr<shT> contig_dst_strides_shp;
         if (dst_flags & USM_ARRAY_C_CONTIGUOUS) {
-            contig_dst_strides_shp =
-                std::make_shared<shT>(c_contiguous_strides(dst_nd, dst_shape));
+            const shT &dst_contig_strides =
+                c_contiguous_strides(dst_nd, dst_shape);
+            std::copy(dst_contig_strides.begin(), dst_contig_strides.end(),
+                      packed_host_shapes_strides_shp->begin() + 2 * src_nd +
+                          dst_nd);
        }
        else if (dst_flags & USM_ARRAY_F_CONTIGUOUS) {
-            contig_dst_strides_shp =
-                std::make_shared<shT>(f_contiguous_strides(dst_nd, dst_shape));
+            const shT &dst_contig_strides =
+                f_contiguous_strides(dst_nd, dst_shape);
+            std::copy(dst_contig_strides.begin(), dst_contig_strides.end(),
+                      packed_host_shapes_strides_shp->begin() + 2 * src_nd +
+                          dst_nd);
        }
        else {
-            sycl::event::wait(
-                {src_shape_copy_ev, dst_shape_copy_ev, src_strides_copy_ev});
             sycl::free(packed_shapes_strides, exec_q);
             throw std::runtime_error(
                 "Invalid dst array encountered: in copy_for_reshape function");
        }
-        dst_strides_copy_ev = exec_q.copy<py::ssize_t>(
-            contig_dst_strides_shp->data(),
-            packed_shapes_strides + 2 * src_nd + dst_nd, dst_nd);
-        exec_q.submit([&](sycl::handler &cgh) {
-            cgh.depends_on(dst_strides_copy_ev);
-            cgh.host_task([contig_dst_strides_shp]() {
-                // Capturing shared pointer ensure it is freed after its data
-                // are copied into packed USM vector
-            });
-        });
     }
     else {
-        dst_strides_copy_ev = exec_q.copy<py::ssize_t>(
-            dst_strides, packed_shapes_strides + 2 * src_nd + dst_nd, dst_nd);
+        std::copy(dst_strides, dst_strides + dst_nd,
+                  packed_host_shapes_strides_shp->begin() + 2 * src_nd +
+                      dst_nd);
     }
 
+    // copy packed shapes and strides from host to device
+    sycl::event packed_shape_strides_copy_ev = exec_q.copy<py::ssize_t>(
+        packed_host_shapes_strides_shp->data(), packed_shapes_strides,
+        packed_host_shapes_strides_shp->size());
+    exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(packed_shape_strides_copy_ev);
+        cgh.host_task([packed_host_shapes_strides_shp] {
+            // Capturing shared pointer ensures that the underlying vector is
+            // not destroyed until after its data are copied into packed USM
+            // vector
+        });
+    });
+
     char *src_data = src.get_data();
     char *dst_data = dst.get_data();
 
-    std::vector<sycl::event> all_deps(depends.size() + 4);
-    all_deps.push_back(src_shape_copy_ev);
-    all_deps.push_back(dst_shape_copy_ev);
-    all_deps.push_back(src_strides_copy_ev);
-    all_deps.push_back(dst_strides_copy_ev);
+    std::vector<sycl::event> all_deps(depends.size() + 1);
+    all_deps.push_back(packed_shape_strides_copy_ev);
     all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends));
 
     sycl::event copy_for_reshape_event =
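For reference, the c_contiguous_strides/f_contiguous_strides helpers used
in the last two patches compute row-major and column-major strides. They
already exist in tensor_py.cpp; the bodies below only illustrate their
semantics and are not the actual implementation:

    #include <cstdint>
    #include <vector>

    // C order: last axis varies fastest, e.g. shape (2, 3, 4) -> (12, 4, 1)
    std::vector<std::int64_t> c_strides(int nd, const std::int64_t *shape)
    {
        std::vector<std::int64_t> st(nd, 1);
        for (int i = nd - 2; i >= 0; --i) {
            st[i] = st[i + 1] * shape[i + 1];
        }
        return st;
    }

    // F order: first axis varies fastest, e.g. shape (2, 3, 4) -> (1, 2, 6)
    std::vector<std::int64_t> f_strides(int nd, const std::int64_t *shape)
    {
        std::vector<std::int64_t> st(nd, 1);
        for (int i = 1; i < nd; ++i) {
            st[i] = st[i - 1] * shape[i - 1];
        }
        return st;
    }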
From 46d8a8d2452432c794a3ba22e0c66db709dc8eca Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Wed, 7 Sep 2022 10:27:37 -0500
Subject: [PATCH 6/8] Optimized transfer of shape/strides to kernel in copy
 from ndarray

Applied the same optimization: instead of 3 queue.copy calls transferring
shape, src_strides and dst_strides meta-data into the USM allocation used
by the copy_and_cast kernel, create a packed vector on the host and use a
single queue.copy call to move it into the USM allocation.
---
 dpctl/tensor/libtensor/source/tensor_py.cpp | 47 +++++----------------
 1 file changed, 11 insertions(+), 36 deletions(-)

diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp
index 0fde50504f..6ab0665aa1 100644
--- a/dpctl/tensor/libtensor/source/tensor_py.cpp
+++ b/dpctl/tensor/libtensor/source/tensor_py.cpp
@@ -1331,11 +1331,6 @@ void copy_numpy_ndarray_into_usm_ndarray(
     // Create shared pointers with shape and src/dst strides, copy into device
     // memory
     using shT = std::vector<py::ssize_t>;
-    std::shared_ptr<shT> shp_shape = std::make_shared<shT>(simplified_shape);
-    std::shared_ptr<shT> shp_src_strides =
-        std::make_shared<shT>(simplified_src_strides);
-    std::shared_ptr<shT> shp_dst_strides =
-        std::make_shared<shT>(simplified_dst_strides);
 
     // Get implementation function pointer
     auto copy_and_cast_from_host_blocking_fn =
@@ -1351,42 +1346,22 @@ void copy_numpy_ndarray_into_usm_ndarray(
         throw std::runtime_error("Unabled to allocate device memory");
     }
 
-    sycl::event copy_shape_ev =
-        exec_q.copy<py::ssize_t>(shp_shape->data(), shape_strides, nd);
-
-    exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(copy_shape_ev);
-        cgh.host_task([shp_shape]() {
-            // increment shared pointer ref-count to keep it alive
-            // till copy operation completes;
-        });
-    });
-
-    sycl::event copy_src_strides_ev = exec_q.copy<py::ssize_t>(
-        shp_src_strides->data(), shape_strides + nd, nd);
-    exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(copy_src_strides_ev);
-        cgh.host_task([shp_src_strides]() {
-            // increment shared pointer ref-count to keep it alive
-            // till copy operation completes;
-        });
-    });
-
-    sycl::event copy_dst_strides_ev = exec_q.copy<py::ssize_t>(
-        shp_dst_strides->data(), shape_strides + 2 * nd, nd);
-    exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(copy_dst_strides_ev);
-        cgh.host_task([shp_dst_strides]() {
-            // increment shared pointer ref-count to keep it alive
-            // till copy operation completes;
-        });
-    });
+    std::shared_ptr<shT> host_shape_strides_shp = std::make_shared<shT>(3 * nd);
+    std::copy(simplified_shape.begin(), simplified_shape.end(),
+              host_shape_strides_shp->begin());
+    std::copy(simplified_src_strides.begin(), simplified_src_strides.end(),
+              host_shape_strides_shp->begin() + nd);
+    std::copy(simplified_dst_strides.begin(), simplified_dst_strides.end(),
+              host_shape_strides_shp->begin() + 2 * nd);
 
+    sycl::event copy_packed_ev =
+        exec_q.copy<py::ssize_t>(host_shape_strides_shp->data(), shape_strides,
+                                 host_shape_strides_shp->size());
 
     copy_and_cast_from_host_blocking_fn(
         exec_q, src_nelems, nd, shape_strides, src_data, src_offset,
         npy_src_min_nelem_offset, npy_src_max_nelem_offset, dst_data,
-        dst_offset, depends,
-        {copy_shape_ev, copy_src_strides_ev, copy_dst_strides_ev});
+        dst_offset, depends, {copy_packed_ev});
 
     sycl::free(shape_strides, exec_q);
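Unlike the asynchronous array-to-array path, the copy from a NumPy ndarray
is blocking, which is why this patch needs no keep-alive host_task: the
host vector on the stack outlives the transfer, and the device temporary
is freed synchronously after the blocking call returns. A sketch of that
simpler lifetime pattern (all names illustrative):

    #include <CL/sycl.hpp>

    #include <cstdint>
    #include <stdexcept>
    #include <vector>

    void blocking_metadata_transfer(sycl::queue q,
                                    const std::vector<std::int64_t> &packed)
    {
        std::int64_t *device_tmp =
            sycl::malloc_device<std::int64_t>(packed.size(), q);
        if (device_tmp == nullptr) {
            throw std::runtime_error("Unable to allocate device memory");
        }

        sycl::event copy_ev =
            q.copy<std::int64_t>(packed.data(), device_tmp, packed.size());

        // A blocking kernel launch would depend on copy_ev and wait here;
        // once it has waited, freeing the temporary synchronously is safe.
        copy_ev.wait();
        sycl::free(device_tmp, q);
    }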
From 7ab929fe6eed245bfafdfe0c18b1dfce4b3b3653 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Wed, 7 Sep 2022 10:31:21 -0500
Subject: [PATCH 7/8] Minor optimization of using vector::size instead of
 recomputing it

---
 dpctl/tensor/libtensor/source/tensor_py.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp
index 6ab0665aa1..44764c7cd8 100644
--- a/dpctl/tensor/libtensor/source/tensor_py.cpp
+++ b/dpctl/tensor/libtensor/source/tensor_py.cpp
@@ -707,8 +707,9 @@ copy_usm_ndarray_into_usm_ndarray(dpctl::tensor::usm_ndarray src,
                   shp_host_shape_strides->begin() + 2 * nd);
     }
 
-    sycl::event copy_shape_ev = exec_q.copy<py::ssize_t>(
-        shp_host_shape_strides->data(), shape_strides, 3 * nd);
+    sycl::event copy_shape_ev =
+        exec_q.copy<py::ssize_t>(shp_host_shape_strides->data(), shape_strides,
+                                 shp_host_shape_strides->size());
 
     exec_q.submit([&](sycl::handler &cgh) {
         cgh.depends_on(copy_shape_ev);
From 74f0b37ea0a38901a5f9685d5a928b83c4310abf Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Thu, 8 Sep 2022 10:31:37 -0500
Subject: [PATCH 8/8] Modularized utility for packing shapes/strides into
 device allocation for copy-and-cast operation between two usm_ndarrays

---
 dpctl/tensor/libtensor/source/tensor_py.cpp | 83 +++++++++++----------
 1 file changed, 43 insertions(+), 40 deletions(-)

diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp
index 44764c7cd8..0bcd4ba3c8 100644
--- a/dpctl/tensor/libtensor/source/tensor_py.cpp
+++ b/dpctl/tensor/libtensor/source/tensor_py.cpp
@@ -479,6 +479,46 @@ void simplify_iteration_space(int &nd,
     }
 }
 
+sycl::event _populate_packed_shape_strides_for_copycast_kernel(
+    sycl::queue exec_q,
+    int src_flags,
+    int dst_flags,
+    py::ssize_t *device_shape_strides, // to be populated
+    const std::vector<py::ssize_t> &common_shape,
+    const std::vector<py::ssize_t> &src_strides,
+    const std::vector<py::ssize_t> &dst_strides)
+{
+    using shT = std::vector<py::ssize_t>;
+    size_t nd = common_shape.size();
+
+    // create host temporary for packed shape and strides managed by shared
+    // pointer. Packed vector is concatenation of common_shape, src_strides
+    // and dst_strides
+    std::shared_ptr<shT> shp_host_shape_strides = std::make_shared<shT>(3 * nd);
+    std::copy(common_shape.begin(), common_shape.end(),
+              shp_host_shape_strides->begin());
+
+    std::copy(src_strides.begin(), src_strides.end(),
+              shp_host_shape_strides->begin() + nd);
+
+    std::copy(dst_strides.begin(), dst_strides.end(),
+              shp_host_shape_strides->begin() + 2 * nd);
+
+    sycl::event copy_shape_ev = exec_q.copy<py::ssize_t>(
+        shp_host_shape_strides->data(), device_shape_strides,
+        shp_host_shape_strides->size());
+
+    exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(copy_shape_ev);
+        cgh.host_task([shp_host_shape_strides]() {
+            // increment shared pointer ref-count to keep it alive
+            // till copy operation completes;
+        });
+    });
+
+    return copy_shape_ev;
+}
+
 std::pair<sycl::event, sycl::event>
 copy_usm_ndarray_into_usm_ndarray(dpctl::tensor::usm_ndarray src,
                                   dpctl::tensor::usm_ndarray dst,
@@ -677,47 +717,10 @@ copy_usm_ndarray_into_usm_ndarray(dpctl::tensor::usm_ndarray src,
         throw std::runtime_error("Unabled to allocate device memory");
     }
 
-    // create host temporary for packed shape and strides managed by shared
-    // pointer
-    std::shared_ptr<shT> shp_host_shape_strides = std::make_shared<shT>(3 * nd);
-    std::copy(simplified_shape.begin(), simplified_shape.end(),
-              shp_host_shape_strides->begin());
-
-    if (src_strides == nullptr) {
-        const shT &src_contig_strides = (src_flags & USM_ARRAY_C_CONTIGUOUS)
-                                            ? c_contiguous_strides(nd, shape)
-                                            : f_contiguous_strides(nd, shape);
-        std::copy(src_contig_strides.begin(), src_contig_strides.end(),
-                  shp_host_shape_strides->begin() + nd);
-    }
-    else {
-        std::copy(simplified_src_strides.begin(), simplified_src_strides.end(),
-                  shp_host_shape_strides->begin() + nd);
-    }
-
-    if (dst_strides == nullptr) {
-        const shT &dst_contig_strides = (dst_flags & USM_ARRAY_C_CONTIGUOUS)
-                                            ? c_contiguous_strides(nd, shape)
-                                            : f_contiguous_strides(nd, shape);
-        std::copy(dst_contig_strides.begin(), dst_contig_strides.end(),
-                  shp_host_shape_strides->begin() + 2 * nd);
-    }
-    else {
-        std::copy(simplified_dst_strides.begin(), simplified_dst_strides.end(),
-                  shp_host_shape_strides->begin() + 2 * nd);
-    }
-
-    sycl::event copy_shape_ev =
-        exec_q.copy<py::ssize_t>(shp_host_shape_strides->data(), shape_strides,
-                                 shp_host_shape_strides->size());
-
-    exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(copy_shape_ev);
-        cgh.host_task([shp_host_shape_strides]() {
-            // increment shared pointer ref-count to keep it alive
-            // till copy operation completes;
-        });
-    });
+    sycl::event copy_shape_ev =
+        _populate_packed_shape_strides_for_copycast_kernel(
+            exec_q, src_flags, dst_flags, shape_strides, simplified_shape,
+            simplified_src_strides, simplified_dst_strides);
 
     sycl::event copy_and_cast_generic_ev = copy_and_cast_fn(
         exec_q, src_nelems, nd, shape_strides, src_data, src_offset, dst_data,
         dst_offset, depends, {copy_shape_ev});