From 2661f517848c9c2e651880886c4ec0bdc262cb68 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Wed, 7 Aug 2024 08:55:30 -0500
Subject: [PATCH] Support copy-via-host in from_dlpack

For `arr` that supports DLPack, version (1, 0) or legacy, support

```
from_dlpack(arr, device=target_dev)
```

where `target_dev` is `(kDLCPU, 0)` for transfer to host, a value
recognized by the device keyword of other functions in dpctl.tensor,
or `(kDLOneAPI, dev_id)`. A usage sketch is included at the end of
this message.

To support transfer via host, `arr` must support
`__dlpack__(max_version=(1, 0), dl_device=(1, 0))`.

For array objects with legacy `__dlpack__` support only, the supported
inputs are those residing on the kDLCPU device, or those from a
kDLOneAPI device only.

---

This is a combination of 17 commits squashed into one:

Combine two validation checks into one, improving coverage

Only fall back to __dlpack__() if requested device does not change

Simplify branching: only fall back to a no-arg call to __dlpack__
if dl_device is None or the same as reported for the input

Changed from_dlpack to copy via host if needed.
This enables dpt.from_dlpack(numpy_array, device="opencl:cpu")

Add a test to exercise copy via host

Handle possibilities for TypeError and BufferError.
These may be hard to test

Change exception raised by __dlpack__ if dl_device is unsupported.
It used to raise NotImplementedError, now raises BufferError

Add case of dlpack test to expand coverage

Removed comment, add NotImplementedError to the except clause

To ensure same validation across branches, compute host_blob by
round-tripping it through dlpack

Test from_dlpack on numpy input with strides not a multiple of the
element size

Refined from_dlpack docstrings, reorganized implementation of
from_dlpack. Used try/except/else/finally to avoid raising an
exception while another one is in flight (confusing UX). The device
keyword is only allowed to be (kDLCPU, 0) or (kDLOneAPI, num). The
device keyword value is used to create the output array, rather than
the device_id deduced from it.

Adjusted test per change in implementation

Expand applicability of fall-back behavior.
When `from_dlpack(arr, device=dev)` is called for an `arr` object
that supports only the legacy DLPack interface (max_version,
dl_device, copy are not supported), we now support `arr` residing on
host, that is (kDLCPU, 0), as well as on (kDLOneAPI,
different_device_id). Support for this last case is being added in
this commit, as per review comment.

Add symmetric support for containers with legacy DLPack support.
For legacy containers, support device=(kDLCPU, 0) as well as a oneAPI
device.

Add tests for importing generic legacy and generic modern containers

Fix typos in comments

Add test for legacy container holding a numpy array.
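Usage sketch (illustrative only; it assumes an "opencl:cpu" device is
available -- any device selector string or Device object works the same
way):

```
import numpy as np
import dpctl.tensor as dpt

x_np = np.arange(10, dtype="i4")

# copy from host to a oneAPI device; NumPy's __dlpack__ exports
# kDLCPU data, so the transfer goes via host
x = dpt.from_dlpack(x_np, device="opencl:cpu")

# copy back to host by targeting (kDLCPU, 0)
y_np = dpt.from_dlpack(x, device=(dpt.DLDeviceType.kDLCPU, 0))
assert isinstance(y_np, np.ndarray)
```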
---
 dpctl/tensor/_dlpack.pyx               | 178 +++++++++++++++++++++----
 dpctl/tensor/_usmarray.pyx             |   2 +-
 dpctl/tests/test_usm_ndarray_dlpack.py | 129 ++++++++++++++++++
 3 files changed, 281 insertions(+), 28 deletions(-)

diff --git a/dpctl/tensor/_dlpack.pyx b/dpctl/tensor/_dlpack.pyx
index 4741854a84..098003ead2 100644
--- a/dpctl/tensor/_dlpack.pyx
+++ b/dpctl/tensor/_dlpack.pyx
@@ -168,7 +168,7 @@ cdef void _managed_tensor_versioned_deleter(DLManagedTensorVersioned *dlmv_tenso
     stdlib.free(dlmv_tensor)
 
 
-cdef object _get_default_context(c_dpctl.SyclDevice dev) except *:
+cdef object _get_default_context(c_dpctl.SyclDevice dev):
     try:
         default_context = dev.sycl_platform.default_context
     except RuntimeError:
@@ -178,7 +178,7 @@ cdef object _get_default_context(c_dpctl.SyclDevice dev) except *:
     return default_context
 
 
-cdef int get_parent_device_ordinal_id(c_dpctl.SyclDevice dev) except *:
+cdef int get_parent_device_ordinal_id(c_dpctl.SyclDevice dev) except -1:
     cdef DPCTLSyclDeviceRef pDRef = NULL
     cdef DPCTLSyclDeviceRef tDRef = NULL
     cdef c_dpctl.SyclDevice p_dev
@@ -201,7 +201,7 @@ cdef int get_parent_device_ordinal_id(c_dpctl.SyclDevice dev) except *:
 
 cdef int get_array_dlpack_device_id(
     usm_ndarray usm_ary
-) except *:
+) except -1:
     """Finds ordinal number of the parent of device where array
     was allocated.
     """
@@ -935,6 +935,32 @@ cpdef object from_dlpack_capsule(object py_caps):
         "The DLPack tensor resides on unsupported device."
     )
 
+
+cdef usm_ndarray _to_usm_ary_from_host_blob(object host_blob, dev : Device):
+    q = dev.sycl_queue
+    np_ary = np.asarray(host_blob)
+    dt = np_ary.dtype
+    if dt.char in "dD" and q.sycl_device.has_aspect_fp64 is False:
+        Xusm_dtype = (
+            "float32" if dt.char == "d" else "complex64"
+        )
+    else:
+        Xusm_dtype = dt
+    usm_mem = dpmem.MemoryUSMDevice(np_ary.nbytes, queue=q)
+    usm_ary = usm_ndarray(np_ary.shape, dtype=Xusm_dtype, buffer=usm_mem)
+    usm_mem.copy_from_host(np.reshape(np_ary.view(dtype="u1"), -1))
+    return usm_ary
+
+
+# only cdef to make it private
+cdef object _create_device(object device, object dl_device):
+    if isinstance(device, Device):
+        return device
+    elif isinstance(device, dpctl.SyclDevice):
+        return Device.create_device(device)
+    else:
+        root_device = dpctl.SyclDevice(str(dl_device[1]))
+        return Device.create_device(root_device)
+
 
 def from_dlpack(x, /, *, device=None, copy=None):
     """ from_dlpack(x, /, *, device=None, copy=None)
@@ -943,7 +969,7 @@
     object ``x`` that implements ``__dlpack__`` protocol.
 
     Args:
-        x (Python object):
+        x (object):
            A Python object representing an array that supports
            ``__dlpack__`` protocol.
        device (Optional[str,
@@ -959,7 +985,8 @@
            returned by :attr:`dpctl.tensor.usm_ndarray.device`, or a 2-tuple
            matching the format of the output of the ``__dlpack_device__``
            method, an integer enumerator representing the device type followed by
-           an integer representing the index of the device.
+           an integer representing the index of the device. The only supported
+           :enum:`dpctl.tensor.DLDeviceType` types are "kDLCPU" and "kDLOneAPI".
            Default: ``None``.
        copy (bool, optional):
            Boolean indicating whether or not to copy the input.
@@ -1008,33 +1035,130 @@
 
        C = Container(dpt.linspace(0, 100, num=20, dtype="int16"))
        X = dpt.from_dlpack(C)
+       Y = dpt.from_dlpack(C, device=(dpt.DLDeviceType.kDLCPU, 0))
 
    """
-    if not hasattr(x, "__dlpack__"):
-        raise TypeError(
-            f"The argument of type {type(x)} does not implement "
-            "`__dlpack__` method."
-        )
-    dlpack_attr = getattr(x, "__dlpack__")
-    if not callable(dlpack_attr):
+    dlpack_attr = getattr(x, "__dlpack__", None)
+    dlpack_dev_attr = getattr(x, "__dlpack_device__", None)
+    if not callable(dlpack_attr) or not callable(dlpack_dev_attr):
         raise TypeError(
             f"The argument of type {type(x)} does not implement "
-            "`__dlpack__` method."
+            "`__dlpack__` and `__dlpack_device__` methods."
         )
-    try:
-        # device is converted to a dlpack_device if necessary
-        dl_device = None
-        if device:
-            if isinstance(device, tuple):
-                dl_device = device
+    # device is converted to a dlpack_device if necessary
+    dl_device = None
+    if device:
+        if isinstance(device, tuple):
+            dl_device = device
+            if len(dl_device) != 2:
+                raise ValueError(
+                    "Argument `device` specified as a tuple must have length 2"
+                )
+        else:
+            if not isinstance(device, dpctl.SyclDevice):
+                device = Device.create_device(device)
+                d = device.sycl_device
             else:
-                if not isinstance(device, dpctl.SyclDevice):
-                    d = Device.create_device(device).sycl_device
-                    dl_device = (device_OneAPI, get_parent_device_ordinal_id(d))
-                else:
-                    dl_device = (device_OneAPI, get_parent_device_ordinal_id(device))
-        dlpack_capsule = dlpack_attr(max_version=get_build_dlpack_version(), dl_device=dl_device, copy=copy)
-        return from_dlpack_capsule(dlpack_capsule)
+                d = device
+            dl_device = (device_OneAPI, get_parent_device_ordinal_id(d))
+    if dl_device is not None:
+        if (dl_device[0] not in [device_OneAPI, device_CPU]):
+            raise ValueError(
+                f"Argument `device`={device} is not supported."
+            )
+    got_type_error = False
+    got_buffer_error = False
+    got_other_error = False
+    saved_exception = None
+    # First DLPack version supporting dl_device, and copy
+    requested_ver = (1, 0)
+    cpu_dev = (device_CPU, 0)
+    try:
+        # setting max_version to minimal version that supports dl_device/copy keywords
+        dlpack_capsule = dlpack_attr(
+            max_version=requested_ver,
+            dl_device=dl_device,
+            copy=copy
+        )
     except TypeError:
-        dlpack_capsule = dlpack_attr()
+        # exporter does not support max_version keyword
+        got_type_error = True
+    except (BufferError, NotImplementedError):
+        # Either dl_device or copy cannot be satisfied
+        got_buffer_error = True
+    except Exception as e:
+        got_other_error = True
+        saved_exception = e
+    else:
+        # execution did not raise exceptions
         return from_dlpack_capsule(dlpack_capsule)
+    finally:
+        if got_type_error:
+            # max_version/dl_device/copy keywords are not supported by __dlpack__
+            x_dldev = dlpack_dev_attr()
+            if (dl_device is None) or (dl_device == x_dldev):
+                dlpack_capsule = dlpack_attr()
+                return from_dlpack_capsule(dlpack_capsule)
+            # must copy via host
+            if copy is False:
+                raise BufferError(
+                    "Importing data via DLPack requires copying, but copy=False was provided"
+                )
+            # when max_version/dl_device/copy are not supported,
+            # we can only support importing to oneAPI devices
+            # from host, or from another oneAPI device
+            is_supported_x_dldev = (
+                x_dldev == cpu_dev or
+                (x_dldev[0] == device_OneAPI)
+            )
+            is_supported_dl_device = (
+                dl_device == cpu_dev or
+                dl_device[0] == device_OneAPI
+            )
+            if is_supported_x_dldev and is_supported_dl_device:
+                dlpack_capsule = dlpack_attr()
+                blob = from_dlpack_capsule(dlpack_capsule)
+            else:
+                raise BufferError(f"Can not import to requested device {dl_device}")
+            dev = _create_device(device, dl_device)
+            if x_dldev == cpu_dev and dl_device == cpu_dev:
+                # both source and destination are CPU
+                return blob
+            elif x_dldev == cpu_dev:
+                # source is CPU, destination is oneAPI
+                return _to_usm_ary_from_host_blob(blob, dev)
+            elif dl_device == cpu_dev:
+                # source is oneAPI, destination is CPU
+                cpu_caps = blob.__dlpack__(
+                    max_version=get_build_dlpack_version(),
+                    dl_device=cpu_dev
+                )
+                return from_dlpack_capsule(cpu_caps)
+            else:
+                import dpctl.tensor as dpt
+                return dpt.asarray(blob, device=dev)
+        elif got_buffer_error:
+            # we are here because dlpack_attr could not deal with requested dl_device,
+            # or copying was required
+            if copy is False:
+                raise BufferError(
+                    "Importing data via DLPack requires copying, but copy=False was provided"
+                )
+            # must copy via host
+            if dl_device[0] != device_OneAPI:
+                raise BufferError(f"Can not import to requested device {dl_device}")
+            x_dldev = dlpack_dev_attr()
+            if x_dldev == cpu_dev:
+                dlpack_capsule = dlpack_attr()
+                host_blob = from_dlpack_capsule(dlpack_capsule)
+            else:
+                dlpack_capsule = dlpack_attr(
+                    max_version=requested_ver,
+                    dl_device=cpu_dev,
+                    copy=copy
+                )
+                host_blob = from_dlpack_capsule(dlpack_capsule)
+            dev = _create_device(device, dl_device)
+            return _to_usm_ary_from_host_blob(host_blob, dev)
+        elif got_other_error:
+            raise saved_exception
diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx
index cbe164c6d3..e806dcc956 100644
--- a/dpctl/tensor/_usmarray.pyx
+++ b/dpctl/tensor/_usmarray.pyx
@@ -1242,7 +1242,7 @@ cdef class usm_ndarray:
                 _arr.flags["W"] = self.flags["W"]
                 return c_dlpack.numpy_to_dlpack_versioned_capsule(_arr, True)
             else:
-                raise NotImplementedError(
+                raise BufferError(
                     f"targeting `dl_device` {dl_device} with `__dlpack__` is not "
implemented" ) diff --git a/dpctl/tests/test_usm_ndarray_dlpack.py b/dpctl/tests/test_usm_ndarray_dlpack.py index 344080e2ae..2f07abf12a 100644 --- a/dpctl/tests/test_usm_ndarray_dlpack.py +++ b/dpctl/tests/test_usm_ndarray_dlpack.py @@ -696,3 +696,132 @@ def test_dlpack_size_0_on_kdlcpu(): cap = x_np.__dlpack__() y = _dlp.from_dlpack_capsule(cap) assert y.ctypes.data == x_np.ctypes.data + + +def test_copy_via_host(): + get_queue_or_skip() + x = dpt.ones(1, dtype="i4") + x_np = np.ones(1, dtype="i4") + x_dl_dev = x.__dlpack_device__() + y = dpt.from_dlpack(x_np, device=x_dl_dev) + assert isinstance(y, dpt.usm_ndarray) + assert y.sycl_device == x.sycl_device + assert y.usm_type == "device" + + with pytest.raises(ValueError): + # uncorrect length of tuple + dpt.from_dlpack(x_np, device=(1, 0, 0)) + with pytest.raises(ValueError): + # only kDLCPU and kDLOneAPI are supported + dpt.from_dlpack(x, device=(2, 0)) + + num_devs = dpctl.get_num_devices() + if num_devs > 1: + j = [i for i in range(num_devs) if i != x_dl_dev[1]][0] + z = dpt.from_dlpack(x, device=(x_dl_dev[0], j)) + assert isinstance(z, dpt.usm_ndarray) + assert z.usm_type == "device" + + +def test_copy_via_host_gh_1789(): + "Test based on review example from gh-1789" + get_queue_or_skip() + x_np = np.ones((10, 10), dtype="i4") + # strides are no longer multiple of itemsize + x_np.strides = (x_np.strides[0] - 1, x_np.strides[1]) + with pytest.raises(BufferError): + dpt.from_dlpack(x_np) + with pytest.raises(BufferError): + dpt.from_dlpack(x_np, device=(14, 0)) + + +class LegacyContainer: + "Helper class implementing legacy `__dlpack__` protocol" + + def __init__(self, array): + self._array = array + + def __dlpack__(self, stream=None): + return self._array.__dlpack__(stream=stream) + + def __dlpack_device__(self): + return self._array.__dlpack_device__() + + +class Container: + "Helper class implementing legacy `__dlpack__` protocol" + + def __init__(self, array): + self._array = array + + def __dlpack__( + self, max_version=None, dl_device=None, copy=None, stream=None + ): + return self._array.__dlpack__( + max_version=max_version, + dl_device=dl_device, + copy=copy, + stream=stream, + ) + + def __dlpack_device__(self): + return self._array.__dlpack_device__() + + +def test_generic_container_legacy(): + get_queue_or_skip() + C = LegacyContainer(dpt.linspace(0, 100, num=20, dtype="int16")) + + X = dpt.from_dlpack(C) + assert isinstance(X, dpt.usm_ndarray) + assert X._pointer == C._array._pointer + assert X.sycl_device == C._array.sycl_device + assert X.dtype == C._array.dtype + + Y = dpt.from_dlpack(C, device=(dpt.DLDeviceType.kDLCPU, 0)) + assert isinstance(Y, np.ndarray) + assert Y.dtype == X.dtype + + Z = dpt.from_dlpack(C, device=X.device) + assert isinstance(Z, dpt.usm_ndarray) + assert Z._pointer == X._pointer + assert Z.device == X.device + + +def test_generic_container_legacy_np(): + get_queue_or_skip() + C = LegacyContainer(np.linspace(0, 100, num=20, dtype="int16")) + + X = dpt.from_dlpack(C) + assert isinstance(X, np.ndarray) + assert X.ctypes.data == C._array.ctypes.data + assert X.dtype == C._array.dtype + + Y = dpt.from_dlpack(C, device=(dpt.DLDeviceType.kDLCPU, 0)) + assert isinstance(Y, np.ndarray) + assert Y.dtype == X.dtype + + dev = dpt.Device.create_device() + Z = dpt.from_dlpack(C, device=dev) + assert isinstance(Z, dpt.usm_ndarray) + assert Z.device == dev + + +def test_generic_container(): + get_queue_or_skip() + C = Container(dpt.linspace(0, 100, num=20, dtype="int16")) + + X = dpt.from_dlpack(C) + assert 
+    assert isinstance(X, dpt.usm_ndarray)
+    assert X._pointer == C._array._pointer
+    assert X.sycl_device == C._array.sycl_device
+    assert X.dtype == C._array.dtype
+
+    Y = dpt.from_dlpack(C, device=(dpt.DLDeviceType.kDLCPU, 0))
+    assert isinstance(Y, np.ndarray)
+    assert Y.dtype == X.dtype
+
+    Z = dpt.from_dlpack(C, device=X.device)
+    assert isinstance(Z, dpt.usm_ndarray)
+    assert Z._pointer == X._pointer
+    assert Z.device == X.device