From 8f44a5ef579c3dcf126c7825984d3dc132de3db2 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Mon, 17 Jul 2023 09:53:46 -0500
Subject: [PATCH 1/6] Added support for out=arg by temporary copy

---
 dpctl/tensor/_elementwise_common.py | 53 +++++++++++++++++------------
 1 file changed, 31 insertions(+), 22 deletions(-)

diff --git a/dpctl/tensor/_elementwise_common.py b/dpctl/tensor/_elementwise_common.py
index 9c61f5e97c..1137fad2e8 100644
--- a/dpctl/tensor/_elementwise_common.py
+++ b/dpctl/tensor/_elementwise_common.py
@@ -52,6 +52,15 @@ def __call__(self, x, out=None, order="K"):
         if not isinstance(x, dpt.usm_ndarray):
             raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
 
+        if order not in ["C", "F", "K", "A"]:
+            order = "K"
+        buf_dt, res_dt = _find_buf_dtype(
+            x.dtype, self.result_type_resolver_fn_, x.sycl_device
+        )
+        if res_dt is None:
+            raise RuntimeError
+
+        orig_out = out
         if out is not None:
             if not isinstance(out, dpt.usm_ndarray):
                 raise TypeError(
@@ -64,8 +73,17 @@ def __call__(self, x, out=None, order="K"):
                     f"Expected output shape is {x.shape}, got {out.shape}"
                 )
 
-            if ti._array_overlap(x, out):
-                raise TypeError("Input and output arrays have memory overlap")
+            if res_dt != out.dtype:
+                raise TypeError(
+                    f"Output array of type {res_dt} is needed,"
+                    f" got {out.dtype}"
+                )
+
+            if buf_dt is None and ti._array_overlap(x, out):
+                # Allocate a temporary buffer to avoid memory overlapping.
+                # Note if `buf_dt` is not None, a temporary copy of `x` will be
+                # created, so the array overlap check isn't needed.
+                out = dpt.empty_like(out)
 
             if (
                 dpctl.utils.get_execution_queue((x.sycl_queue, out.sycl_queue))
@@ -75,13 +93,6 @@ def __call__(self, x, out=None, order="K"):
                     "Input and output allocation queues are not compatible"
                 )
 
-        if order not in ["C", "F", "K", "A"]:
-            order = "K"
-        buf_dt, res_dt = _find_buf_dtype(
-            x.dtype, self.result_type_resolver_fn_, x.sycl_device
-        )
-        if res_dt is None:
-            raise RuntimeError
         exec_q = x.sycl_queue
         if buf_dt is None:
             if out is None:
@@ -91,17 +102,20 @@ def __call__(self, x, out=None, order="K"):
                     if order == "A":
                         order = "F" if x.flags.f_contiguous else "C"
                     out = dpt.empty_like(x, dtype=res_dt, order=order)
-            else:
-                if res_dt != out.dtype:
-                    raise TypeError(
-                        f"Output array of type {res_dt} is needed,"
-                        f" got {out.dtype}"
-                    )
 
-            ht, _ = self.unary_fn_(x, out, sycl_queue=exec_q)
-            ht.wait()
+            ht_unary_ev, unary_ev = self.unary_fn_(x, out, sycl_queue=exec_q)
+
+            if not (orig_out is None or orig_out is out):
+                # Copy the out data from temporary buffer to original memory
+                ht_copy_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray(
+                    src=out, dst=orig_out, sycl_queue=exec_q, depends=[unary_ev]
+                )
+                ht_copy_ev.wait()
+                out = orig_out
 
+            ht_unary_ev.wait()
             return out
+
         if order == "K":
             buf = _empty_like_orderK(x, buf_dt)
         else:
@@ -117,11 +131,6 @@ def __call__(self, x, out=None, order="K"):
                 out = _empty_like_orderK(buf, res_dt)
             else:
                 out = dpt.empty_like(buf, dtype=res_dt, order=order)
-        else:
-            if buf_dt != out.dtype:
-                raise TypeError(
-                    f"Output array of type {buf_dt} is needed, got {out.dtype}"
-                )
 
         ht, _ = self.unary_fn_(buf, out, sycl_queue=exec_q, depends=[copy_ev])
         ht_copy_ev.wait()

From 656189386c12c2b673a8da220b0cedd788e0c2ba Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Thu, 13 Jul 2023 15:09:44 -0500
Subject: [PATCH 2/6] Added tests for out=arg for unary functions, suppressed
 numpy warnings

Renamed test_square.py::test_sqrt_out_overlap to test_square.py::test_square_out_overlap
---
 dpctl/tests/_numpy_warnings.py         | 28 +++++++++++++++++++
 dpctl/tests/conftest.py                |  9 ++++++-
 dpctl/tests/elementwise/test_abs.py    | 24 ++++++++++++++++-
 dpctl/tests/elementwise/test_exp.py    | 23 ++++++++++++++++
 dpctl/tests/elementwise/test_log.py    | 36 ++++++++++++++++++++-----
 dpctl/tests/elementwise/test_sincos.py | 31 ++++++++++++++++-----
 dpctl/tests/elementwise/test_sqrt.py   | 37 +++++++++++++++++++++-----
 dpctl/tests/elementwise/test_square.py | 26 ++++++++++++++++++
 8 files changed, 192 insertions(+), 22 deletions(-)
 create mode 100644 dpctl/tests/_numpy_warnings.py

diff --git a/dpctl/tests/_numpy_warnings.py b/dpctl/tests/_numpy_warnings.py
new file mode 100644
index 0000000000..1e723c3001
--- /dev/null
+++ b/dpctl/tests/_numpy_warnings.py
@@ -0,0 +1,28 @@
+#                      Data Parallel Control (dpctl)
+#
+# Copyright 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy
+import pytest
+
+
+@pytest.fixture
+def suppress_invalid_numpy_warnings():
+    # invalid: treatment for invalid floating-point operation
+    # (result is not an expressible number, typically indicates
+    # that a NaN was produced)
+    old_settings = numpy.seterr(invalid="ignore")
+    yield
+    numpy.seterr(**old_settings)  # reset to default
diff --git a/dpctl/tests/conftest.py b/dpctl/tests/conftest.py
index 7fc63a5a24..600953bcf7 100644
--- a/dpctl/tests/conftest.py
+++ b/dpctl/tests/conftest.py
@@ -26,8 +26,15 @@
     invalid_filter,
     valid_filter,
 )
+from _numpy_warnings import suppress_invalid_numpy_warnings
 
 sys.path.append(os.path.join(os.path.dirname(__file__), "helper"))
 
 # common fixtures
-__all__ = ["check", "device_selector", "invalid_filter", "valid_filter"]
+__all__ = [
+    "check",
+    "device_selector",
+    "invalid_filter",
+    "suppress_invalid_numpy_warnings",
+    "valid_filter",
+]
diff --git a/dpctl/tests/elementwise/test_abs.py b/dpctl/tests/elementwise/test_abs.py
index ee7fa0cb6c..ab0d34d54d 100644
--- a/dpctl/tests/elementwise/test_abs.py
+++ b/dpctl/tests/elementwise/test_abs.py
@@ -22,7 +22,7 @@
 import dpctl.tensor as dpt
 from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
 
-from .utils import _all_dtypes, _usm_types
+from .utils import _all_dtypes, _no_complex_dtypes, _usm_types
 
 
 @pytest.mark.parametrize("dtype", _all_dtypes)
@@ -113,3 +113,25 @@ def test_abs_complex(dtype):
             np.testing.assert_allclose(
                 dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol
             )
+
+
+@pytest.mark.parametrize("dtype", _no_complex_dtypes)
+def test_abs_out_overlap(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.linspace(0, 35, 60, dtype=dtype, sycl_queue=q)
+    X = dpt.reshape(X, (3, 5, 4))
+
+    Xnp = dpt.asnumpy(X)
+    Ynp = np.abs(Xnp, out=Xnp)
+
+    Y = dpt.abs(X, out=X)
+    assert Y is X
+    assert np.allclose(dpt.asnumpy(X), Xnp)
+
+    Ynp = np.abs(Xnp, out=Xnp[::-1])
+    Y = dpt.abs(X, out=X[::-1])
+    assert Y is not X
+    assert np.allclose(dpt.asnumpy(X), Xnp)
+    assert np.allclose(dpt.asnumpy(Y), Ynp)
diff --git a/dpctl/tests/elementwise/test_exp.py b/dpctl/tests/elementwise/test_exp.py
index 5ea8ded018..85f21694c5 100644
--- a/dpctl/tests/elementwise/test_exp.py
+++ b/dpctl/tests/elementwise/test_exp.py
@@ -145,3 +145,26 @@ def test_exp_strided(dtype):
                 atol=tol,
                 rtol=tol,
             )
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
+def test_exp_out_overlap(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.linspace(0, 1, 15, dtype=dtype, sycl_queue=q)
+    X = dpt.reshape(X, (3, 5))
+
+    Xnp = dpt.asnumpy(X)
+    Ynp = np.exp(Xnp, out=Xnp)
+
+    Y = dpt.exp(X, out=X)
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+    assert Y is X
+    assert_allclose(dpt.asnumpy(X), Xnp, atol=tol, rtol=tol)
+
+    Ynp = np.exp(Xnp, out=Xnp[::-1])
+    Y = dpt.exp(X, out=X[::-1])
+    assert Y is not X
+    assert_allclose(dpt.asnumpy(X), Xnp, atol=tol, rtol=tol)
+    assert_allclose(dpt.asnumpy(Y), Ynp, atol=tol, rtol=tol)
diff --git a/dpctl/tests/elementwise/test_log.py b/dpctl/tests/elementwise/test_log.py
index ed56fb6468..b0cc337826 100644
--- a/dpctl/tests/elementwise/test_log.py
+++ b/dpctl/tests/elementwise/test_log.py
@@ -18,7 +18,7 @@
 
 import numpy as np
 import pytest
-from numpy.testing import assert_equal
+from numpy.testing import assert_allclose, assert_equal
 
 import dpctl.tensor as dpt
 from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
@@ -50,7 +50,7 @@ def test_log_output_contig(dtype):
     Y = dpt.log(X)
     tol = 8 * dpt.finfo(Y.dtype).resolution
 
-    np.testing.assert_allclose(dpt.asnumpy(Y), np.log(Xnp), atol=tol, rtol=tol)
+    assert_allclose(dpt.asnumpy(Y), np.log(Xnp), atol=tol, rtol=tol)
 
 
 @pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
@@ -66,7 +66,7 @@ def test_log_output_strided(dtype):
     Y = dpt.log(X)
     tol = 8 * dpt.finfo(Y.dtype).resolution
 
-    np.testing.assert_allclose(dpt.asnumpy(Y), np.log(Xnp), atol=tol, rtol=tol)
+    assert_allclose(dpt.asnumpy(Y), np.log(Xnp), atol=tol, rtol=tol)
 
 
 @pytest.mark.parametrize("usm_type", _usm_types)
@@ -89,7 +89,7 @@ def test_log_usm_type(usm_type):
     expected_Y[..., 1::2] = np.log(np.float32(10 * dpt.e))
     tol = 8 * dpt.finfo(Y.dtype).resolution
 
-    np.testing.assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
+    assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
 
 
 @pytest.mark.parametrize("dtype", _all_dtypes)
@@ -112,9 +112,7 @@ def test_log_order(dtype):
                 dpt.finfo(Y.dtype).resolution,
                 np.finfo(expected_Y.dtype).resolution,
             )
-            np.testing.assert_allclose(
-                dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol
-            )
+            assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
 
 
 def test_log_special_cases():
@@ -126,3 +124,27 @@ def test_log_special_cases():
     Xnp = dpt.asnumpy(X)
 
     assert_equal(dpt.asnumpy(dpt.log(X)), np.log(Xnp))
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
+def test_log_out_overlap(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.linspace(5, 35, 60, dtype=dtype, sycl_queue=q)
+    X = dpt.reshape(X, (3, 5, 4))
+
+    Xnp = dpt.asnumpy(X)
+    Ynp = np.log(Xnp, out=Xnp)
+
+    Y = dpt.log(X, out=X)
+    assert Y is X
+
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+    assert_allclose(dpt.asnumpy(X), Xnp, atol=tol, rtol=tol)
+
+    Ynp = np.log(Xnp, out=Xnp[::-1])
+    Y = dpt.log(X, out=X[::-1])
+    assert Y is not X
+    assert_allclose(dpt.asnumpy(X), Xnp, atol=tol, rtol=tol)
+    assert_allclose(dpt.asnumpy(Y), Ynp, atol=tol, rtol=tol)
diff --git a/dpctl/tests/elementwise/test_sincos.py b/dpctl/tests/elementwise/test_sincos.py
index d027ef026a..d4ca463394 100644
--- a/dpctl/tests/elementwise/test_sincos.py
+++ b/dpctl/tests/elementwise/test_sincos.py
@@ -161,12 +161,6 @@ def test_sincos_errors(callable):
         y,
     )
 
-    x = dpt.zeros(2)
-    y = x
-    assert_raises_regex(
-        TypeError, "Input and output arrays have memory overlap", callable, x, y
-    )
-
     x = dpt.zeros(2, dtype="float32")
     y = np.empty_like(x)
     assert_raises_regex(
@@ -230,3 +224,28 @@ def test_sincos_strided(dtype):
                 atol=tol,
                 rtol=tol,
             )
+
+
+@pytest.mark.parametrize(
+    "np_call, dpt_call", [(np.sin, dpt.sin), (np.cos, dpt.cos)]
+)
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
+def test_sincos_out_overlap(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.linspace(-np.pi / 2, np.pi / 2, 60, dtype=dtype, sycl_queue=q)
+    X = dpt.reshape(X, (3, 5, 4))
+
+    Xnp = dpt.asnumpy(X)
+    Ynp = np_call(Xnp, out=Xnp)
+
+    Y = dpt_call(X, out=X)
+    assert Y is X
+    assert np.allclose(dpt.asnumpy(X), Xnp)
+
+    Ynp = np_call(Xnp, out=Xnp[::-1])
+    Y = dpt_call(X, out=X[::-1])
+    assert Y is not X
+    assert np.allclose(dpt.asnumpy(X), Xnp)
+    assert np.allclose(dpt.asnumpy(Y), Ynp)
diff --git a/dpctl/tests/elementwise/test_sqrt.py b/dpctl/tests/elementwise/test_sqrt.py
index ce168a5ccb..a15f5262a7 100644
--- a/dpctl/tests/elementwise/test_sqrt.py
+++ b/dpctl/tests/elementwise/test_sqrt.py
@@ -18,7 +18,7 @@
 
 import numpy as np
 import pytest
-from numpy.testing import assert_equal
+from numpy.testing import assert_allclose, assert_equal
 
 import dpctl.tensor as dpt
 from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
@@ -50,7 +50,7 @@ def test_sqrt_output_contig(dtype):
     Y = dpt.sqrt(X)
     tol = 8 * dpt.finfo(Y.dtype).resolution
 
-    np.testing.assert_allclose(dpt.asnumpy(Y), np.sqrt(Xnp), atol=tol, rtol=tol)
+    assert_allclose(dpt.asnumpy(Y), np.sqrt(Xnp), atol=tol, rtol=tol)
 
 
 @pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
@@ -66,7 +66,7 @@ def test_sqrt_output_strided(dtype):
     Y = dpt.sqrt(X)
     tol = 8 * dpt.finfo(Y.dtype).resolution
 
-    np.testing.assert_allclose(dpt.asnumpy(Y), np.sqrt(Xnp), atol=tol, rtol=tol)
+    assert_allclose(dpt.asnumpy(Y), np.sqrt(Xnp), atol=tol, rtol=tol)
 
 
 @pytest.mark.parametrize("usm_type", _usm_types)
@@ -89,7 +89,7 @@ def test_sqrt_usm_type(usm_type):
     expected_Y[..., 1::2] = np.sqrt(np.float32(23.0))
     tol = 8 * dpt.finfo(Y.dtype).resolution
 
-    np.testing.assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
+    assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
 
 
 @pytest.mark.parametrize("dtype", _all_dtypes)
@@ -112,11 +112,10 @@ def test_sqrt_order(dtype):
                 dpt.finfo(Y.dtype).resolution,
                 np.finfo(expected_Y.dtype).resolution,
             )
-            np.testing.assert_allclose(
-                dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol
-            )
+            assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
 
 
+@pytest.mark.usefixtures("suppress_invalid_numpy_warnings")
 def test_sqrt_special_cases():
     q = get_queue_or_skip()
 
@@ -126,3 +125,27 @@ def test_sqrt_special_cases():
     Xnp = dpt.asnumpy(X)
 
     assert_equal(dpt.asnumpy(dpt.sqrt(X)), np.sqrt(Xnp))
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
+def test_sqrt_out_overlap(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.linspace(0, 35, 60, dtype=dtype, sycl_queue=q)
+    X = dpt.reshape(X, (3, 5, 4))
+
+    Xnp = dpt.asnumpy(X)
+    Ynp = np.sqrt(Xnp, out=Xnp)
+
+    Y = dpt.sqrt(X, out=X)
+    assert Y is X
+
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+    assert_allclose(dpt.asnumpy(X), Xnp, atol=tol, rtol=tol)
+
+    Ynp = np.sqrt(Xnp, out=Xnp[::-1])
+    Y = dpt.sqrt(X, out=X[::-1])
+    assert Y is not X
+    assert_allclose(dpt.asnumpy(X), Xnp, atol=tol, rtol=tol)
+    assert_allclose(dpt.asnumpy(Y), Ynp, atol=tol, rtol=tol)
diff --git a/dpctl/tests/elementwise/test_square.py b/dpctl/tests/elementwise/test_square.py
index 95ec163e2f..3af0528944 100644
--- a/dpctl/tests/elementwise/test_square.py
+++ b/dpctl/tests/elementwise/test_square.py
@@ -97,3 +97,29 @@ def test_square_special_cases(dtype):
             rtol=tol,
             equal_nan=True,
         )
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
+def test_square_out_overlap(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.linspace(0, 35, 60, dtype=dtype, sycl_queue=q)
+    X = dpt.reshape(X, (3, 5, 4))
+
+    Xnp = dpt.asnumpy(X)
+    Ynp = np.square(Xnp, out=Xnp)
+
+    Y = dpt.square(X, out=X)
+    assert Y is X
+    assert np.allclose(dpt.asnumpy(X), Xnp)
+
+    X = dpt.linspace(0, 35, 60, dtype=dtype, sycl_queue=q)
+    X = dpt.reshape(X, (3, 5, 4))
+    Xnp = dpt.asnumpy(X)
+
+    Ynp = np.square(Xnp, out=Xnp[::-1])
+    Y = dpt.square(X, out=X[::-1])
+    assert Y is not X
+    assert np.allclose(dpt.asnumpy(X), Xnp)
+    assert np.allclose(dpt.asnumpy(Y), Ynp)

From 9246e14d828751ea1a453264ba76b3fb4206af1e Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Mon, 17 Jul 2023 11:39:50 -0500
Subject: [PATCH 3/6] Introduced dpctl::tensor::overlap::SameLogicalTensors

The call operator of this struct verifies whether two USM ND-arrays
logically address the same memory elements. When a data-parallel
operation reads from and writes to arrays that logically address
the same memory elements, there is no race condition and no additional
copying is needed.
---
 .../include/utils/memory_overlap.hpp          | 47 +++++++++++++++++++
 .../source/elementwise_functions.hpp          |  4 +-
 2 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/dpctl/tensor/libtensor/include/utils/memory_overlap.hpp b/dpctl/tensor/libtensor/include/utils/memory_overlap.hpp
index e4be509a22..331ef6c5eb 100644
--- a/dpctl/tensor/libtensor/include/utils/memory_overlap.hpp
+++ b/dpctl/tensor/libtensor/include/utils/memory_overlap.hpp
@@ -100,6 +100,53 @@ struct MemoryOverlap
     }
 };
 
+struct SameLogicalTensors
+{
+    bool operator()(dpctl::tensor::usm_ndarray ar1,
+                    dpctl::tensor::usm_ndarray ar2) const
+    {
+        // Same ndim
+        int nd1 = ar1.get_ndim();
+        if (nd1 != ar2.get_ndim())
+            return false;
+
+        // Same dtype
+        int tn1 = ar1.get_typenum();
+        if (tn1 != ar2.get_typenum())
+            return false;
+
+        // Same pointer
+        const char *ar1_data = ar1.get_data();
+        const char *ar2_data = ar2.get_data();
+
+        if (ar1_data != ar2_data)
+            return false;
+
+        // Same shape
+        const py::ssize_t *ar1_shape = ar1.get_shape_raw();
+        const py::ssize_t *ar2_shape = ar2.get_shape_raw();
+
+        if (!std::equal(ar1_shape, ar1_shape + nd1, ar2_shape))
+            return false;
+
+        // Same strides
+        auto const &ar1_strides = ar1.get_strides_vector();
+        auto const &ar2_strides = ar2.get_strides_vector();
+
+        auto ar1_beg_it = std::begin(ar1_strides);
+        auto ar1_end_it = std::end(ar1_strides);
+
+        auto ar2_beg_it = std::begin(ar2_strides);
+
+        if (!std::equal(ar1_beg_it, ar1_end_it, ar2_beg_it))
+            return false;
+
+        // all checks passed: arrays are logical views
+        // into the same memory
+        return true;
+    }
+};
+
 } // namespace overlap
 } // namespace tensor
 } // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions.hpp b/dpctl/tensor/libtensor/source/elementwise_functions.hpp
index 27ee9c9fcb..453992220a 100644
--- a/dpctl/tensor/libtensor/source/elementwise_functions.hpp
+++ b/dpctl/tensor/libtensor/source/elementwise_functions.hpp
@@ -128,7 +128,9 @@ py_unary_ufunc(dpctl::tensor::usm_ndarray src,
 
     // check memory overlap
     auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    if (overlap(src, dst)) {
+    auto const &same_logical_tensors =
+        dpctl::tensor::overlap::SameLogicalTensors();
+    if (overlap(src, dst) && !same_logical_tensors(src, dst)) {
         throw py::value_error("Arrays index overlapping segments of memory");
     }
 

From ea0579d13ac423efae6302fe4f9f26aa1acc1de0 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Mon, 17 Jul 2023 12:25:19 -0500
Subject: [PATCH 4/6] Added tensor_impl._same_logical_tensors predicate

The predicate determines if the argument arrays are the same
(same dimension, shape, data type, pointer, strides). It is used
to determine if copying must be performed in case of overlap
to avoid a race condition.
---
 dpctl/tensor/libtensor/source/tensor_py.cpp | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp
index 4b36dea534..1833c2d770 100644
--- a/dpctl/tensor/libtensor/source/tensor_py.cpp
+++ b/dpctl/tensor/libtensor/source/tensor_py.cpp
@@ -60,6 +60,7 @@ using dpctl::tensor::c_contiguous_strides;
 using dpctl::tensor::f_contiguous_strides;
 
 using dpctl::tensor::overlap::MemoryOverlap;
+using dpctl::tensor::overlap::SameLogicalTensors;
 
 using dpctl::tensor::py_internal::copy_usm_ndarray_into_usm_ndarray;
 
@@ -338,6 +339,15 @@ PYBIND11_MODULE(_tensor_impl, m)
           "Determines if the memory regions indexed by each array overlap",
           py::arg("array1"), py::arg("array2"));
 
+    auto same_logical_tensors = [](dpctl::tensor::usm_ndarray x1,
+                                   dpctl::tensor::usm_ndarray x2) -> bool {
+        auto const &same_logical_tensors = SameLogicalTensors();
+        return same_logical_tensors(x1, x2);
+    };
+    m.def("_same_logical_tensors", same_logical_tensors,
+          "Determines if the memory regions indexed by each array are the same",
+          py::arg("array1"), py::arg("array2"));
+
     m.def("_place", &py_place, "", py::arg("dst"), py::arg("cumsum"),
           py::arg("axis_start"), py::arg("axis_end"), py::arg("rhs"),
           py::arg("sycl_queue"), py::arg("depends") = py::list());

From 09cd171a31ebaedfda8d5cd7788d8992d131ddbd Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Mon, 17 Jul 2023 12:27:25 -0500
Subject: [PATCH 5/6] Improve race condition check for unary functions

If the out array is logically the same as the input array, there is no
race condition, so avoid performing the temporary copy.
---
 dpctl/tensor/_elementwise_common.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/dpctl/tensor/_elementwise_common.py b/dpctl/tensor/_elementwise_common.py
index 1137fad2e8..55c95f5360 100644
--- a/dpctl/tensor/_elementwise_common.py
+++ b/dpctl/tensor/_elementwise_common.py
@@ -79,7 +79,11 @@ def __call__(self, x, out=None, order="K"):
                     f" got {out.dtype}"
                 )
 
-            if buf_dt is None and ti._array_overlap(x, out):
+            if (
+                buf_dt is None
+                and ti._array_overlap(x, out)
+                and not ti._same_logical_tensors(x, out)
+            ):
                 # Allocate a temporary buffer to avoid memory overlapping.
                 # Note if `buf_dt` is not None, a temporary copy of `x` will be
                 # created, so the array overlap check isn't needed.

From 03a46e14c2aabae4408139673889d6dd7a0a6f87 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Mon, 17 Jul 2023 13:01:07 -0500
Subject: [PATCH 6/6] Require Cython<3 until source code is updated

---
 .github/workflows/generate-coverage.yaml | 2 +-
 .github/workflows/generate-docs.yml      | 2 +-
 .github/workflows/os-llvm-sycl-build.yml | 2 +-
 conda-recipe/meta.yaml                   | 2 +-
 setup.py                                 | 6 +++---
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/generate-coverage.yaml b/.github/workflows/generate-coverage.yaml
index 5975837d55..3de1427654 100644
--- a/.github/workflows/generate-coverage.yaml
+++ b/.github/workflows/generate-coverage.yaml
@@ -79,7 +79,7 @@ jobs:
       - name: Install dpctl dependencies
         shell: bash -l {0}
         run: |
-          pip install numpy cython setuptools pytest pytest-cov scikit-build cmake coverage[toml]
+          pip install numpy cython"<3" setuptools pytest pytest-cov scikit-build cmake coverage[toml]
 
       - name: Build dpctl with coverage
         shell: bash -l {0}
diff --git a/.github/workflows/generate-docs.yml b/.github/workflows/generate-docs.yml
index 768d958e02..a72741c67f 100644
--- a/.github/workflows/generate-docs.yml
+++ b/.github/workflows/generate-docs.yml
@@ -49,7 +49,7 @@ jobs:
         if: ${{ !github.event.pull_request || github.event.action != 'closed' }}
         shell: bash -l {0}
         run: |
-          pip install numpy cython setuptools scikit-build cmake sphinx sphinx_rtd_theme pydot graphviz sphinxcontrib-programoutput sphinxcontrib-googleanalytics
+          pip install numpy cython"<3" setuptools scikit-build cmake sphinx sphinx_rtd_theme pydot graphviz sphinxcontrib-programoutput sphinxcontrib-googleanalytics
       - name: Checkout repo
         uses: actions/checkout@v3
         with:
diff --git a/.github/workflows/os-llvm-sycl-build.yml b/.github/workflows/os-llvm-sycl-build.yml
index 1aae32d4d9..e1a390aad8 100644
--- a/.github/workflows/os-llvm-sycl-build.yml
+++ b/.github/workflows/os-llvm-sycl-build.yml
@@ -108,7 +108,7 @@ jobs:
       - name: Install dpctl dependencies
         shell: bash -l {0}
         run: |
-          pip install numpy cython setuptools pytest scikit-build cmake
+          pip install numpy cython"<3" setuptools pytest scikit-build cmake
 
       - name: Checkout repo
         uses: actions/checkout@v3
diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml
index 56958d3355..aad850b060 100644
--- a/conda-recipe/meta.yaml
+++ b/conda-recipe/meta.yaml
@@ -20,7 +20,7 @@ requirements:
         - cmake  >=3.21
         - ninja
         - git
-        - cython
+        - cython  <3
         - python
         - scikit-build
         - numpy
diff --git a/setup.py b/setup.py
index 6eda8f29f0..2ec9dbbde9 100644
--- a/setup.py
+++ b/setup.py
@@ -149,20 +149,20 @@ def _get_cmdclass():
     package_data={"dpctl": ["tests/*.*", "tests/helper/*.py"]},
     include_package_data=True,
     zip_safe=False,
-    setup_requires=["Cython"],
+    setup_requires=["Cython<3"],
     install_requires=[
         "numpy",
     ],
     extras_require={
         "docs": [
-            "Cython",
+            "Cython<3",
             "sphinx",
             "sphinx_rtd_theme",
             "pydot",
             "graphviz",
             "sphinxcontrib-programoutput",
         ],
-        "coverage": ["Cython", "pytest", "pytest-cov", "coverage", "tomli"],
+        "coverage": ["Cython<3", "pytest", "pytest-cov", "coverage", "tomli"],
     },
     keywords="dpctl",
     classifiers=[