From 5ecf6fb52afccb7ec20f760c1c42c33de3b4681a Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Thu, 5 Jan 2023 07:40:16 -0600
Subject: [PATCH 1/3] shape, src_strides and dst_strides are modified
 unconditionally

---
 .../libtensor/source/simplify_iteration_space.cpp  | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp b/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp
index 8937300047..be4a35fb90 100644
--- a/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp
+++ b/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp
@@ -120,11 +120,6 @@ void simplify_iteration_space(int &nd,
         simplified_dst_strides.resize(contracted_nd);
 
         nd = contracted_nd;
-        shape = const_cast<const py::ssize_t *>(simplified_shape.data());
-        src_strides =
-            const_cast<const py::ssize_t *>(simplified_src_strides.data());
-        dst_strides =
-            const_cast<const py::ssize_t *>(simplified_dst_strides.data());
     }
     else if (nd == 1) {
         // Populate vectors
@@ -171,6 +166,11 @@ void simplify_iteration_space(int &nd,
         assert(simplified_src_strides.size() == static_cast<size_t>(nd));
         assert(simplified_dst_strides.size() == static_cast<size_t>(nd));
     }
+    shape = const_cast<const py::ssize_t *>(simplified_shape.data());
+    src_strides =
+        const_cast<const py::ssize_t *>(simplified_src_strides.data());
+    dst_strides =
+        const_cast<const py::ssize_t *>(simplified_dst_strides.data());
 }
 
 } // namespace py_internal

From 73efa361eb517e11326c7f8a0554b59725fb2842 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Thu, 5 Jan 2023 07:41:33 -0600
Subject: [PATCH 2/3] Modified libtensor/tests/test_copy.py

Fixed violations of CFD (compute-follows-data). Made it work on Iris Xe.

---
 dpctl/tensor/libtensor/tests/test_copy.py | 67 ++++++++++++++++++++---
 1 file changed, 60 insertions(+), 7 deletions(-)

diff --git a/dpctl/tensor/libtensor/tests/test_copy.py b/dpctl/tensor/libtensor/tests/test_copy.py
index e366002dec..2559de1c4e 100644
--- a/dpctl/tensor/libtensor/tests/test_copy.py
+++ b/dpctl/tensor/libtensor/tests/test_copy.py
@@ -40,6 +40,14 @@
 ]
 
 
+def _typestr_has_fp64(arr_typestr):
+    return arr_typestr in ["f8", "c16"]
+
+
+def _typestr_has_fp16(arr_typestr):
+    return arr_typestr in ["f2"]
+
+
 @pytest.fixture(params=_usm_types_list)
 def usm_type(request):
     return request.param
@@ -95,6 +103,14 @@ def test_copy1d_c_contig(src_typestr, dst_typestr):
         q = dpctl.SyclQueue()
     except dpctl.SyclQueueCreationError:
         pytest.skip("Queue could not be created")
+    if not q.sycl_device.has_aspect_fp64 and (
+        _typestr_has_fp64(src_typestr) or _typestr_has_fp64(dst_typestr)
+    ):
+        pytest.skip("Device does not support double precision")
+    if not q.sycl_device.has_aspect_fp16 and (
+        _typestr_has_fp16(src_typestr) or _typestr_has_fp16(dst_typestr)
+    ):
+        pytest.skip("Device does not support half precision")
     src_dt = np.dtype(src_typestr)
     dst_dt = np.dtype(dst_typestr)
     Xnp = _random_vector(4096, src_dt)
@@ -113,6 +129,14 @@ def test_copy1d_strided(src_typestr, dst_typestr):
         q = dpctl.SyclQueue()
     except dpctl.SyclQueueCreationError:
         pytest.skip("Queue could not be created")
+    if not q.sycl_device.has_aspect_fp64 and (
+        _typestr_has_fp64(src_typestr) or _typestr_has_fp64(dst_typestr)
+    ):
+        pytest.skip("Device does not support double precision")
+    if not q.sycl_device.has_aspect_fp16 and (
+        _typestr_has_fp16(src_typestr) or _typestr_has_fp16(dst_typestr)
+    ):
+        pytest.skip("Device does not support half precision")
     src_dt = np.dtype(src_typestr)
     dst_dt = np.dtype(dst_typestr)
     Xnp = _random_vector(4096, src_dt)
@@ -131,7 +155,12 @@ def test_copy1d_strided(src_typestr, dst_typestr):
         assert are_close(Ynp, dpt.asnumpy(Y))
 
     # now 0-strided source
-    X = dpt.usm_ndarray((4096,), dtype=src_typestr, strides=(0,))
+    X = dpt.usm_ndarray(
+        (4096,),
+        dtype=src_typestr,
+        strides=(0,),
+        buffer_ctor_kwargs={"queue": q},
+    )
     X[0] = Xnp[0]
     Y = dpt.empty(X.shape, dtype=dst_typestr, sycl_queue=q)
     hev, ev = ti._copy_usm_ndarray_into_usm_ndarray(src=X, dst=Y, sycl_queue=q)
@@ -145,6 +174,14 @@ def test_copy1d_strided2(src_typestr, dst_typestr):
         q = dpctl.SyclQueue()
     except dpctl.SyclQueueCreationError:
         pytest.skip("Queue could not be created")
+    if not q.sycl_device.has_aspect_fp64 and (
+        _typestr_has_fp64(src_typestr) or _typestr_has_fp64(dst_typestr)
+    ):
+        pytest.skip("Device does not support double precision")
+    if not q.sycl_device.has_aspect_fp16 and (
+        _typestr_has_fp16(src_typestr) or _typestr_has_fp16(dst_typestr)
+    ):
+        pytest.skip("Device does not support half precision")
     src_dt = np.dtype(src_typestr)
     dst_dt = np.dtype(dst_typestr)
     Xnp = _random_vector(4096, src_dt)
@@ -172,6 +209,14 @@ def test_copy2d(src_typestr, dst_typestr, st1, sgn1, st2, sgn2):
         q = dpctl.SyclQueue()
     except dpctl.SyclQueueCreationError:
         pytest.skip("Queue could not be created")
+    if not q.sycl_device.has_aspect_fp64 and (
+        _typestr_has_fp64(src_typestr) or _typestr_has_fp64(dst_typestr)
+    ):
+        pytest.skip("Device does not support double precision")
+    if not q.sycl_device.has_aspect_fp16 and (
+        _typestr_has_fp16(src_typestr) or _typestr_has_fp16(dst_typestr)
+    ):
+        pytest.skip("Device does not support half precision")
     src_dt = np.dtype(src_typestr)
     dst_dt = np.dtype(dst_typestr)
 
@@ -188,16 +233,16 @@ def test_copy2d(src_typestr, dst_typestr, st1, sgn1, st2, sgn2):
         slice(None, None, st1 * sgn1),
         slice(None, None, st2 * sgn2),
     ]
-    Y = dpt.empty((n1, n2), dtype=dst_dt)
+    Y = dpt.empty((n1, n2), dtype=dst_dt, device=X.device)
     hev, ev = ti._copy_usm_ndarray_into_usm_ndarray(src=X, dst=Y, sycl_queue=q)
     Ynp = _force_cast(Xnp, dst_dt)
     hev.wait()
     assert are_close(Ynp, dpt.asnumpy(Y))
-    Yst = dpt.empty((2 * n1, n2), dtype=dst_dt)[::2, ::-1]
+    Yst = dpt.empty((2 * n1, n2), dtype=dst_dt, device=X.device)[::2, ::-1]
     hev, ev = ti._copy_usm_ndarray_into_usm_ndarray(
         src=X, dst=Yst, sycl_queue=q
     )
-    Y = dpt.empty((n1, n2), dtype=dst_dt)
+    Y = dpt.empty((n1, n2), dtype=dst_dt, device=X.device)
     hev2, ev2 = ti._copy_usm_ndarray_into_usm_ndarray(
         src=Yst, dst=Y, sycl_queue=q, depends=[ev]
     )
@@ -220,6 +265,14 @@ def test_copy3d(src_typestr, dst_typestr, st1, sgn1, st2, sgn2, st3, sgn3):
     except dpctl.SyclQueueCreationError:
         pytest.skip("Queue could not be created")
 
+    if not q.sycl_device.has_aspect_fp64 and (
+        _typestr_has_fp64(src_typestr) or _typestr_has_fp64(dst_typestr)
+    ):
+        pytest.skip("Device does not support double precision")
+    if not q.sycl_device.has_aspect_fp16 and (
+        _typestr_has_fp16(src_typestr) or _typestr_has_fp16(dst_typestr)
+    ):
+        pytest.skip("Device does not support half precision")
     src_dt = np.dtype(src_typestr)
     dst_dt = np.dtype(dst_typestr)
     n1, n2, n3 = 5, 4, 6
@@ -237,16 +290,16 @@ def test_copy3d(src_typestr, dst_typestr, st1, sgn1, st2, sgn2, st3, sgn3):
         slice(None, None, st2 * sgn2),
         slice(None, None, st3 * sgn3),
     ]
-    Y = dpt.empty((n1, n2, n3), dtype=dst_dt)
+    Y = dpt.empty((n1, n2, n3), dtype=dst_dt, device=X.device)
     hev, ev = ti._copy_usm_ndarray_into_usm_ndarray(src=X, dst=Y, sycl_queue=q)
     Ynp = _force_cast(Xnp, dst_dt)
     hev.wait()
     assert are_close(Ynp, dpt.asnumpy(Y)), "1"
-    Yst = dpt.empty((2 * n1, n2, n3), dtype=dst_dt)[::2, ::-1]
+    Yst = dpt.empty((2 * n1, n2, n3), dtype=dst_dt, device=X.device)[::2, ::-1]
     hev2, ev2 = ti._copy_usm_ndarray_into_usm_ndarray(
         src=X, dst=Yst, sycl_queue=q
     )
-    Y2 = dpt.empty((n1, n2, n3), dtype=dst_dt)
+    Y2 = dpt.empty((n1, n2, n3), dtype=dst_dt, device=X.device)
     hev3, ev3 = ti._copy_usm_ndarray_into_usm_ndarray(
         src=Yst, dst=Y2, sycl_queue=q, depends=[ev2]
     )

From eb153e6c6ba3cdf7ede9941b9b4d58ab694ad2bb Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Thu, 5 Jan 2023 10:09:54 -0600
Subject: [PATCH 3/3] Added const qualifiers to read-only pointers for
 copy-and-cast kernels

This change made it possible to remove some uses of const_cast and made
the code simpler. Also used #pragma unroll in the specialized CopyAndCast
kernel where the displacement is computed from a multi-index.

---
 .../include/kernels/copy_and_cast.hpp         | 46 +++++++++----------
 1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp
index bd70f18334..ad64e255a9 100644
--- a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp
@@ -51,14 +51,14 @@ template <typename srcT, typename dstT> class Caster
 {
 public:
     Caster() = default;
-    void operator()(char *src,
+    void operator()(const char *src,
                     std::ptrdiff_t src_offset,
                     char *dst,
                     std::ptrdiff_t dst_offset) const
     {
         using dpctl::tensor::type_utils::convert_impl;
 
-        srcT *src_ = reinterpret_cast<srcT *>(src) + src_offset;
+        const srcT *src_ = reinterpret_cast<const srcT *>(src) + src_offset;
         dstT *dst_ = reinterpret_cast<dstT *>(dst) + dst_offset;
         *dst_ = convert_impl<dstT, srcT>(*src_);
     }
@@ -67,17 +67,17 @@ template <typename CastFnT> class GenericCopyFunctor
 {
 private:
-    char *src_ = nullptr;
+    const char *src_ = nullptr;
     char *dst_ = nullptr;
-    py::ssize_t *shape_strides_ = nullptr;
+    const py::ssize_t *shape_strides_ = nullptr;
     int nd_ = 0;
     py::ssize_t src_offset0 = 0;
     py::ssize_t dst_offset0 = 0;
 
 public:
-    GenericCopyFunctor(char *src_cp,
+    GenericCopyFunctor(const char *src_cp,
                        char *dst_cp,
-                       py::ssize_t *shape_strides,
+                       const py::ssize_t *shape_strides,
                        int nd,
                        py::ssize_t src_offset,
                        py::ssize_t dst_offset)
@@ -93,13 +93,11 @@ template <typename CastFnT> class GenericCopyFunctor
         CIndexer_vector<py::ssize_t> indxr(nd_);
         indxr.get_displacement(
             static_cast<py::ssize_t>(wiid.get(0)),
-            const_cast<const py::ssize_t *>(shape_strides_), // common shape
-            const_cast<const py::ssize_t *>(shape_strides_ +
-                                            nd_), // src strides
-            const_cast<const py::ssize_t *>(shape_strides_ +
-                                            2 * nd_), // dst strides
-            src_offset, // modified by reference
-            dst_offset  // modified by reference
+            shape_strides_,           // common shape
+            shape_strides_ + nd_,     // src strides
+            shape_strides_ + 2 * nd_, // dst strides
+            src_offset,               // modified by reference
+            dst_offset                // modified by reference
         );
         CastFnT fn{};
         fn(src_, src_offset0 + src_offset, dst_, dst_offset0 + dst_offset);
@@ -109,7 +107,7 @@ template <int nd, typename CastFnT> class NDSpecializedCopyFunctor
 {
 private:
-    char *src_ = nullptr;
+    const char *src_ = nullptr;
     char *dst_ = nullptr;
     CIndexer_array<nd, py::ssize_t> indxr;
     const std::array<py::ssize_t, nd> src_strides_;
@@ -119,8 +117,8 @@ template <int nd, typename CastFnT> class NDSpecializedCopyFunctor
     py::ssize_t dst_offset0 = 0;
 
 public:
-    NDSpecializedCopyFunctor(char *src_cp, // USM pointer
-                             char *dst_cp, // USM pointer
+    NDSpecializedCopyFunctor(const char *src_cp, // USM pointer
+                             char *dst_cp,       // USM pointer
                              const std::array<py::ssize_t, nd> shape,
                              const std::array<py::ssize_t, nd> src_strides,
                              const std::array<py::ssize_t, nd> dst_strides,
@@ -140,8 +138,10 @@ template <int nd, typename CastFnT> class NDSpecializedCopyFunctor
         local_indxr.set(wiid.get(0));
         auto mi = local_indxr.get();
 
+#pragma unroll
         for (int i = 0; i < nd; ++i)
             src_offset += mi[i] * src_strides_[i];
+#pragma unroll
         for (int i = 0; i < nd; ++i)
             dst_offset += mi[i] * dst_strides_[i];
 
@@ -161,8 +161,8 @@ typedef sycl::event (*copy_and_cast_generic_fn_ptr_t)(
     sycl::queue,
     size_t,
     int,
-    py::ssize_t *,
-    char *,
+    const py::ssize_t *,
+    const char *,
    py::ssize_t,
     char *,
     py::ssize_t,
@@ -207,8 +207,8 @@ sycl::event
 copy_and_cast_generic_impl(sycl::queue q,
                            size_t nelems,
                            int nd,
-                           py::ssize_t *shape_and_strides,
-                           char *src_p,
+                           const py::ssize_t *shape_and_strides,
+                           const char *src_p,
                            py::ssize_t src_offset,
                            char *dst_p,
                            py::ssize_t dst_offset,
@@ -256,7 +256,7 @@ typedef sycl::event (*copy_and_cast_1d_fn_ptr_t)(
     const std::array<py::ssize_t, 1>,
     const std::array<py::ssize_t, 1>,
     const std::array<py::ssize_t, 1>,
-    char *,
+    const char *,
     py::ssize_t,
     char *,
     py::ssize_t,
@@ -272,7 +272,7 @@ typedef sycl::event (*copy_and_cast_2d_fn_ptr_t)(
     const std::array<py::ssize_t, 2>,
     const std::array<py::ssize_t, 2>,
     const std::array<py::ssize_t, 2>,
-    char *,
+    const char *,
     py::ssize_t,
     char *,
     py::ssize_t,
@@ -314,7 +314,7 @@ copy_and_cast_nd_specialized_impl(sycl::queue q,
                                   const std::array<py::ssize_t, nd> shape,
                                   const std::array<py::ssize_t, nd> src_strides,
                                   const std::array<py::ssize_t, nd> dst_strides,
-                                  char *src_p,
+                                  const char *src_p,
                                   py::ssize_t src_offset,
                                   char *dst_p,
                                   py::ssize_t dst_offset,
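
A standalone sketch of the pattern PATCH 3 applies: the read-only source
buffer travels as "const char *" end to end, so no const_cast is needed at
any call site, and the per-work-item displacement loops run over a
compile-time rank, which makes the #pragma unroll request realizable. Plain
C++ stands in for SYCL here, and all names (CopyCastSketch, the convention
that strides are given in elements) are hypothetical illustrations, not
dpctl API.

    // copy_cast_sketch.cpp -- illustrative only, not dpctl code.
    #include <array>
    #include <cstddef>
    #include <cstdio>

    template <typename srcT, typename dstT, int nd> struct CopyCastSketch
    {
        const char *src; // read-only buffer: const all the way down
        char *dst;
        std::array<std::ptrdiff_t, nd> src_strides; // strides in elements
        std::array<std::ptrdiff_t, nd> dst_strides;

        // One "work-item": cast the element addressed by multi-index mi.
        void operator()(const std::array<std::ptrdiff_t, nd> &mi) const
        {
            std::ptrdiff_t src_offset = 0;
            std::ptrdiff_t dst_offset = 0;
            // Trip count nd is a compile-time constant, so the compiler can
            // honor the unroll request, as in NDSpecializedCopyFunctor;
            // compilers that do not recognize the pragma simply ignore it.
    #pragma unroll
            for (int i = 0; i < nd; ++i)
                src_offset += mi[i] * src_strides[i];
    #pragma unroll
            for (int i = 0; i < nd; ++i)
                dst_offset += mi[i] * dst_strides[i];

            // Same shape as Caster: cast the byte pointers to typed
            // pointers, then displace in elements. Source stays const.
            const srcT *src_ = reinterpret_cast<const srcT *>(src) + src_offset;
            dstT *dst_ = reinterpret_cast<dstT *>(dst) + dst_offset;
            *dst_ = static_cast<dstT>(*src_);
        }
    };

    int main()
    {
        // 2x3 C-contiguous float input, cast element-wise to double.
        float in[6] = {0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f};
        double out[6] = {};
        CopyCastSketch<float, double, 2> fn{
            reinterpret_cast<const char *>(in), // never written through
            reinterpret_cast<char *>(out),
            {3, 1},  // src strides (elements)
            {3, 1}}; // dst strides (elements)
        for (std::ptrdiff_t i = 0; i < 2; ++i)
            for (std::ptrdiff_t j = 0; j < 3; ++j)
                fn({i, j});
        std::printf("%g %g %g\n", out[3], out[4], out[5]);
        return 0;
    }

Built with, say, "c++ -std=c++17 copy_cast_sketch.cpp", this should print
"3.5 4.5 5.5". The design point mirrors the commit message: once the functor
members and parameters are const-qualified, constness flows from the kernel
entry point down to the element cast, and the const_cast calls that PATCH 3
deletes have nothing left to paper over.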