From a0e32c0cea94c742d87fed56aad85f351d1d8931 Mon Sep 17 00:00:00 2001
From: Anton Volkov
Date: Mon, 12 Aug 2024 16:47:02 +0200
Subject: [PATCH 1/2] Adopt dpnp to DLPack v1.0

---
 dpnp/dpnp_array.py | 50 ++++++++++++++++++++++++++++++++++++++--------
 dpnp/dpnp_iface.py | 35 +++++++++++++++++++++++++++++---
 2 files changed, 74 insertions(+), 11 deletions(-)

diff --git a/dpnp/dpnp_array.py b/dpnp/dpnp_array.py
index f7b0085b3d9b..7fa2803b5cef 100644
--- a/dpnp/dpnp_array.py
+++ b/dpnp/dpnp_array.py
@@ -184,27 +184,61 @@ def __copy__(self):
     # '__divmod__',
     # '__doc__',

-    def __dlpack__(self, stream=None):
+    def __dlpack__(
+        self, *, stream=None, max_version=None, dl_device=None, copy=None
+    ):
         """
         Produces DLPack capsule.

         Parameters
         ----------
         stream : {:class:`dpctl.SyclQueue`, None}, optional
-            Execution queue to synchronize with. If ``None``,
-            synchronization is not performed.
+            Execution queue to synchronize with. If ``None``, synchronization
+            is not performed.
+            Default: ``None``.
+        max_version : {tuple of ints, None}, optional
+            The maximum DLPack version the consumer (caller of ``__dlpack__``)
+            supports. As ``__dlpack__`` may not always return a DLPack capsule
+            with version `max_version`, the consumer must verify the version
+            even if this argument is passed.
+            Default: ``None``.
+        dl_device : {tuple, None}, optional
+            The device the returned DLPack capsule will be placed on. The
+            device must be a 2-tuple matching the format of the output of the
+            ``__dlpack_device__`` method: an integer enumerator representing
+            the device type followed by an integer representing the index of
+            the device.
+            Default: ``None``.
+        copy : {bool, None}, optional
+            Boolean indicating whether or not to copy the input.
+
+            * If `copy` is ``True``, the input will always be copied.
+            * If ``False``, a ``BufferError`` will be raised if a copy is
+              deemed necessary.
+            * If ``None``, a copy will be made only if deemed necessary,
+              otherwise, the existing memory buffer will be reused.
+
+            Default: ``None``.

         Raises
         ------
-        MemoryError
+        MemoryError:
             when host memory can not be allocated.
-        DLPackCreationError - when array is allocated on a partitioned
-            SYCL device, or with a non-default context.
+        DLPackCreationError:
+            when array is allocated on a partitioned SYCL device, or with
+            a non-default context.
+        BufferError:
+            when a copy is deemed necessary but `copy` is ``False`` or when
+            the provided `dl_device` cannot be handled.

         """
-        return self._array_obj.__dlpack__(stream=stream)
+        return self._array_obj.__dlpack__(
+            stream=stream,
+            max_version=max_version,
+            dl_device=dl_device,
+            copy=copy,
+        )

     def __dlpack_device__(self):
         """
diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py
index b2891c06adc3..15b68aa658b4 100644
--- a/dpnp/dpnp_iface.py
+++ b/dpnp/dpnp_iface.py
@@ -464,7 +464,7 @@ def default_float_type(device=None, sycl_queue=None):
     return map_dtype_to_device(float64, _sycl_queue.sycl_device)


-def from_dlpack(obj, /):
+def from_dlpack(obj, /, *, device=None, copy=None):
     """
     Create a dpnp array from a Python object implementing the
     ``__dlpack__`` protocol.
@@ -476,6 +476,28 @@ def from_dlpack(obj, /):
     obj : object
         A Python object representing an array that implements the
         ``__dlpack__`` and ``__dlpack_device__`` methods.
+    device : {:class:`dpctl.SyclDevice`, :class:`dpctl.SyclQueue`,
+              :class:`dpctl.tensor.Device`, tuple, None}, optional
+        Array API concept of a device where the output array is to be placed.
+        ``device`` can be ``None``, a oneAPI filter selector string,
+        an instance of :class:`dpctl.SyclDevice` corresponding to
+        a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`,
+        a :class:`dpctl.tensor.Device` object returned by
+        :attr:`dpctl.tensor.usm_ndarray.device`, or a 2-tuple matching
+        the format of the output of the ``__dlpack_device__`` method:
+        an integer enumerator representing the device type followed by
+        an integer representing the index of the device.
+        Default: ``None``.
+    copy : {bool, None}, optional
+        Boolean indicating whether or not to copy the input.
+
+        * If `copy` is ``True``, the input will always be copied.
+        * If ``False``, a ``BufferError`` will be raised if a copy is deemed
+          necessary.
+        * If ``None``, a copy will be made only if deemed necessary, otherwise,
+          the existing memory buffer will be reused.
+
+        Default: ``None``.

     Returns
     -------
     out : dpnp_array
         Returns a new dpnp array containing the data from another array
         (obj) with the ``__dlpack__`` method on the same device as object.

+    Raises
+    ------
+    TypeError:
+        if `obj` does not implement the ``__dlpack__`` method.
+    ValueError:
+        if the input array resides on an unsupported device.
+
     """

-    usm_ary = dpt.from_dlpack(obj)
-    return dpnp_array._create_from_usm_ndarray(usm_ary)
+    usm_res = dpt.from_dlpack(obj, device=device, copy=copy)
+    return dpnp_array._create_from_usm_ndarray(usm_res)


 def get_dpnp_descriptor(

From c9d2f5bcd64a119983358a66b84eae3c72978120 Mon Sep 17 00:00:00 2001
From: Anton Volkov
Date: Mon, 12 Aug 2024 16:47:36 +0200
Subject: [PATCH 2/2] Add more tests to cover different use cases

---
 tests/test_dlpack.py                  |  74 +++++++++++
 .../cupy/core_tests/test_dlpack.py    | 120 ++++++++++++++++++
 2 files changed, 194 insertions(+)
 create mode 100644 tests/test_dlpack.py
 create mode 100644 tests/third_party/cupy/core_tests/test_dlpack.py

diff --git a/tests/test_dlpack.py b/tests/test_dlpack.py
new file mode 100644
index 000000000000..25090d397cb4
--- /dev/null
+++ b/tests/test_dlpack.py
@@ -0,0 +1,74 @@
+import numpy
+import pytest
+from numpy.testing import assert_array_equal
+
+import dpnp
+
+from .helper import (
+    get_all_dtypes,
+)
+
+device_oneAPI = 14  # DLDeviceType.kDLOneAPI
+
+
+class TestDLPack:
+    @pytest.mark.parametrize("stream", [None, 1])
+    def test_stream(self, stream):
+        x = dpnp.arange(5)
+        x.__dlpack__(stream=stream)
+
+    @pytest.mark.parametrize("copy", [True, None, False])
+    def test_copy(self, copy):
+        x = dpnp.arange(5)
+        x.__dlpack__(copy=copy)
+
+    def test_wrong_copy(self):
+        x = dpnp.arange(5)
+        x.__dlpack__(copy=dpnp.array([1, 2, 3]))
+
+    @pytest.mark.parametrize("xp", [dpnp, numpy])
+    @pytest.mark.parametrize("dt", get_all_dtypes(no_none=True))
+    def test_dtype_passthrough(self, xp, dt):
+        x = xp.arange(5).astype(dt)
+        y = xp.from_dlpack(x)
+
+        assert y.dtype == x.dtype
+        assert_array_equal(x, y)
+
+    @pytest.mark.parametrize("xp", [dpnp, numpy])
+    def test_non_contiguous(self, xp):
+        x = xp.arange(25).reshape((5, 5))
+
+        y1 = x[0]
+        assert_array_equal(y1, xp.from_dlpack(y1))
+
+        y2 = x[:, 0]
+        assert_array_equal(y2, xp.from_dlpack(y2))
+
+        y3 = x[1, :]
+        assert_array_equal(y3, xp.from_dlpack(y3))
+
+        y4 = x[1]
+        assert_array_equal(y4, xp.from_dlpack(y4))
+
+        y5 = xp.diagonal(x).copy()
+        assert_array_equal(y5, xp.from_dlpack(y5))
+
+    def test_device(self):
+        x = dpnp.arange(5)
+        assert x.__dlpack_device__()[0] == device_oneAPI
+        y = dpnp.from_dlpack(x)
+        assert y.__dlpack_device__()[0] == device_oneAPI
+        z = y[::2]
+        assert z.__dlpack_device__()[0] == device_oneAPI
+
+    def test_ndim0(self):
+        x = dpnp.array(1.0)
+        y = dpnp.from_dlpack(x)
+        assert_array_equal(x, y)
+
+    def test_from_dlpack_device(self):
+        x = dpnp.arange(5)
+        y = dpnp.from_dlpack(x, device=x.__dlpack_device__())
+        assert x.device == y.device
+        assert x.get_array()._pointer == y.get_array()._pointer
diff --git a/tests/third_party/cupy/core_tests/test_dlpack.py b/tests/third_party/cupy/core_tests/test_dlpack.py
new file mode 100644
index 000000000000..18eba5574093
--- /dev/null
+++ b/tests/third_party/cupy/core_tests/test_dlpack.py
@@ -0,0 +1,120 @@
+import unittest
+
+import dpctl
+import dpctl.tensor._dlpack as dlp
+import numpy
+import pytest
+
+import dpnp as cupy
+from tests.third_party.cupy import testing
+
+
+def _gen_array(dtype, alloc_q=None):
+    if cupy.issubdtype(dtype, numpy.unsignedinteger):
+        array = cupy.random.randint(
+            0, 10, size=(2, 3), sycl_queue=alloc_q
+        ).astype(dtype)
+    elif cupy.issubdtype(dtype, cupy.integer):
+        array = cupy.random.randint(
+            -10, 10, size=(2, 3), sycl_queue=alloc_q
+        ).astype(dtype)
+    elif cupy.issubdtype(dtype, cupy.floating):
+        array = cupy.random.rand(2, 3, sycl_queue=alloc_q).astype(dtype)
+    elif cupy.issubdtype(dtype, cupy.complexfloating):
+        array = cupy.random.random((2, 3), sycl_queue=alloc_q).astype(dtype)
+    elif dtype == cupy.bool_:
+        array = cupy.random.randint(
+            0, 2, size=(2, 3), sycl_queue=alloc_q
+        ).astype(cupy.bool_)
+    else:
+        assert False, f"unrecognized dtype: {dtype}"
+    return array
+
+
+class TestDLPackConversion(unittest.TestCase):
+    @testing.for_all_dtypes(no_bool=False)
+    def test_conversion(self, dtype):
+        orig_array = _gen_array(dtype)
+        tensor = orig_array.__dlpack__()
+        out_array = dlp.from_dlpack_capsule(tensor)
+        testing.assert_array_equal(orig_array, out_array)
+        assert orig_array.get_array()._pointer == out_array._pointer
+
+
+@testing.parameterize(*testing.product({"memory": ("device", "managed")}))
+class TestNewDLPackConversion(unittest.TestCase):
+    def _get_stream(self, stream_name):
+        if stream_name == "null":
+            return dpctl.SyclQueue()
+        return dpctl.SyclQueue()
+
+    @testing.for_all_dtypes(no_bool=False)
+    def test_conversion(self, dtype):
+        orig_array = _gen_array(dtype)
+        out_array = cupy.from_dlpack(orig_array)
+        testing.assert_array_equal(orig_array, out_array)
+        assert orig_array.get_array()._pointer == out_array.get_array()._pointer
+
+    def test_stream(self):
+        allowed_streams = ["null", True]
+
+        # stream order is automatically established via DLPack protocol
+        for src_s in [self._get_stream(s) for s in allowed_streams]:
+            for dst_s in [self._get_stream(s) for s in allowed_streams]:
+                orig_array = _gen_array(cupy.float32, alloc_q=src_s)
+                dltensor = orig_array.__dlpack__(stream=dst_s)
+
+                out_array = dlp.from_dlpack_capsule(dltensor)
+                out_array = cupy.from_dlpack(out_array, device=dst_s)
+                testing.assert_array_equal(orig_array, out_array)
+                assert (
+                    orig_array.get_array()._pointer
+                    == out_array.get_array()._pointer
+                )
+
+
+class TestDLTensorMemory(unittest.TestCase):
+    # def setUp(self):
+    #     self.old_pool = cupy.get_default_memory_pool()
+    #     self.pool = cupy.cuda.MemoryPool()
+    #     cupy.cuda.set_allocator(self.pool.malloc)
+
+    # def tearDown(self):
+    #     self.pool.free_all_blocks()
+    #     cupy.cuda.set_allocator(self.old_pool.malloc)
+
+    def test_deleter(self):
+        # memory is freed when tensor is deleted, as it's not consumed
+        array = cupy.empty(10)
+        tensor = array.__dlpack__()
+        # str(tensor): <capsule object "dltensor" at 0x...>
+        assert '"dltensor"' in str(tensor)
+        # assert self.pool.n_free_blocks() == 0
+        # del array
+        # assert self.pool.n_free_blocks() == 0
+        # del tensor
+        # assert self.pool.n_free_blocks() == 1
+
+    def test_deleter2(self):
+        # memory is freed when array2 is deleted, as tensor is consumed
+        array = cupy.empty(10)
+        tensor = array.__dlpack__()
+        assert '"dltensor"' in str(tensor)
+        array2 = dlp.from_dlpack_capsule(tensor)
+        assert '"used_dltensor"' in str(tensor)
+        # assert self.pool.n_free_blocks() == 0
+        # del array
+        # assert self.pool.n_free_blocks() == 0
+        # del array2
+        # assert self.pool.n_free_blocks() == 1
+        # del tensor
+        # assert self.pool.n_free_blocks() == 1
+
+    def test_multiple_consumption_error(self):
+        # Prevent segfault, see #3611
+        array = cupy.empty(10)
+        tensor = array.__dlpack__()
+        array2 = dlp.from_dlpack_capsule(tensor)
+        with pytest.raises(ValueError) as e:
+            array3 = dlp.from_dlpack_capsule(tensor)
+        assert "consumed multiple times" in str(e.value)
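
A minimal usage sketch of the keywords added by this series (not part of the patch; it assumes a default SYCL device is available and a dpctl build with DLPack v1.0 support, and it reuses the same private ``get_array()._pointer`` check as the new tests):

    import dpnp

    x = dpnp.arange(5)

    # Producer side: request a capsule with the DLPack v1.0 keywords.  The
    # consumer must still check which capsule version it actually received.
    capsule = x.__dlpack__(max_version=(1, 0), copy=None)

    # Consumer side: with copy=None the existing memory buffer is reused
    # whenever possible, so the result aliases the input.
    y = dpnp.from_dlpack(x, copy=None)
    assert x.get_array()._pointer == y.get_array()._pointer

    # The target device may be given as the 2-tuple reported by
    # __dlpack_device__ (device type enumerator, device index).
    z = dpnp.from_dlpack(x, device=x.__dlpack_device__())
    assert x.device == z.device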