From 831a5cb36eba038092e451164610492892c80d35 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 10 Feb 2023 22:16:11 -0600 Subject: [PATCH 01/57] Fixed typo in docstring --- dpctl/tensor/_slicing.pxi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpctl/tensor/_slicing.pxi b/dpctl/tensor/_slicing.pxi index 69ed454bd0..f8eb6aaa31 100755 --- a/dpctl/tensor/_slicing.pxi +++ b/dpctl/tensor/_slicing.pxi @@ -38,7 +38,7 @@ cdef object _basic_slice_meta(object ind, tuple shape, tuple strides, Py_ssize_t offset): """ Give basic slicing index `ind` and array layout information produce - a tuple (resulting_shape, resulting_strides, resultin_offset) + a tuple (resulting_shape, resulting_strides, resulting_offset) used to contruct a view into underlying array. Raises IndexError for invalid index `ind`, and NotImplementedError From 402c1d60cfd83ec4041077716bf3c2dfeea38001 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 13 Feb 2023 18:40:05 -0600 Subject: [PATCH 02/57] Fixed TODO in utility _zero_like --- dpctl/tensor/_usmarray.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx index a708418746..9a4fab8af3 100644 --- a/dpctl/tensor/_usmarray.pyx +++ b/dpctl/tensor/_usmarray.pyx @@ -1258,12 +1258,13 @@ cdef usm_ndarray _zero_like(usm_ndarray ary): Make C-contiguous array of zero elements with same shape and type as ary. """ + cdef dt = _make_typestr(ary.typenum_) cdef usm_ndarray r = usm_ndarray( _make_int_tuple(ary.nd_, ary.shape_), - dtype=_make_typestr(ary.typenum_), + dtype=dt, buffer=ary.base_.get_usm_type() ) - # TODO: call function to set array elements to zero + r.base_.memset() return r From d7fc400f50e126277adfda24ace3e386425f5ea7 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 13 Feb 2023 18:40:47 -0600 Subject: [PATCH 03/57] Extended _basic_slice_meta to process advanced indexing specs --- dpctl/tensor/_slicing.pxi | 124 +++++++++++++++++++++++++++++--------- 1 file changed, 97 insertions(+), 27 deletions(-) diff --git a/dpctl/tensor/_slicing.pxi b/dpctl/tensor/_slicing.pxi index f8eb6aaa31..6689502955 100755 --- a/dpctl/tensor/_slicing.pxi +++ b/dpctl/tensor/_slicing.pxi @@ -34,23 +34,41 @@ cdef Py_ssize_t _slice_len( return 1 + ((sl_stop - sl_start + 1) // sl_step) -cdef object _basic_slice_meta(object ind, tuple shape, - tuple strides, Py_ssize_t offset): +cdef bint _is_integral(object x) except *: + """Gives True if x is an integral slice spec""" + if isinstance(x, (int, numbers.Integral)): + return True + if isinstance(x, usm_ndarray): + if x.ndim > 0: + return False + if x.dtype.kind not in "ui": + return False + return True + if callable(getattr(x, "__index__", None)): + try: + x.__index__() + except (TypeError, ValueError): + return False + return True + return False + + +def _basic_slice_meta(ind, shape : tuple, strides : tuple, offset : int): """ Give basic slicing index `ind` and array layout information produce - a tuple (resulting_shape, resulting_strides, resulting_offset) - used to contruct a view into underlying array. + a 5-tuple (resulting_shape, resulting_strides, resulting_offset, + advanced_ind, resulting_advanced_ind_pos) + used to contruct a view into underlying array over which advanced + indexing, if any, is to be performed. - Raises IndexError for invalid index `ind`, and NotImplementedError - if `ind` is an array. + Raises IndexError for invalid index `ind`. 
""" - is_integral = lambda x: ( - isinstance(x, numbers.Integral) or callable(getattr(x, "__index__", None)) - ) + _no_advanced_ind = tuple() + _no_advanced_pos = -1 if ind is Ellipsis: - return (shape, strides, offset) + return (shape, strides, offset, _no_advanced_ind, _no_advanced_pos) elif ind is None: - return ((1,) + shape, (0,) + strides, offset) + return ((1,) + shape, (0,) + strides, offset, _no_advanced_ind, _no_advanced_pos) elif isinstance(ind, slice): sl_start, sl_stop, sl_step = ind.indices(shape[0]) sh0 = _slice_len(sl_start, sl_stop, sl_step) @@ -60,38 +78,70 @@ cdef object _basic_slice_meta(object ind, tuple shape, return ( (sh0, ) + shape[1:], new_strides, - new_offset + new_offset, + _no_advanced_ind, + _no_advanced_pos ) - elif is_integral(ind): + elif _is_integral(ind): ind = ind.__index__() if 0 <= ind < shape[0]: - return (shape[1:], strides[1:], offset + ind * strides[0]) + return (shape[1:], strides[1:], offset + ind * strides[0], _no_advanced_ind, _no_advanced_pos) elif -shape[0] <= ind < 0: return (shape[1:], strides[1:], - offset + (shape[0] + ind) * strides[0]) + offset + (shape[0] + ind) * strides[0], _no_advanced_ind, _no_advanced_pos) else: raise IndexError( "Index {0} is out of range for axes 0 with " "size {1}".format(ind, shape[0])) - elif isinstance(ind, list): - raise NotImplemented + elif isinstance(ind, usm_ndarray): + return (shape, strides, 0, (ind,), 0) elif isinstance(ind, tuple): axes_referenced = 0 ellipses_count = 0 newaxis_count = 0 explicit_index = 0 + array_count = 0 + seen_arrays_yet = False + array_streak_started = False + array_streak_interrupted = False for i in ind: if i is None: - newaxis_count = newaxis_count + 1 + newaxis_count += 1 + if array_streak_started: + array_streak_interrupted = True elif i is Ellipsis: - ellipses_count = ellipses_count + 1 + ellipses_count += 1 + if array_streak_started: + array_streak_interrupted = True elif isinstance(i, slice): - axes_referenced = axes_referenced + 1 - elif is_integral(i): - explicit_index = explicit_index + 1 - axes_referenced = axes_referenced + 1 - elif isinstance(i, list): - raise NotImplemented + axes_referenced += 1 + if array_streak_started: + array_streak_interrupted = True + elif _is_integral(i): + explicit_index += 1 + axes_referenced += 1 + if array_streak_started: + array_streak_interrupted = True + elif isinstance(i, usm_ndarray): + if not seen_arrays_yet: + seen_arrays_yet = True + array_streak_started = True + array_streak_interrupted = False + if array_streak_interrupted: + raise IndexError( + "Advanced indexing array specs may not be " + "separated by basic slicing specs." 
+ ) + dt_k = i.dtype.kind + if dt_k == "b": + axes_referenced += i.ndim + elif dt_k in "ui": + axes_referenced += 1 + else: + raise IndexError( + "arrays used as indices must be of integer (or boolean) type" + ) + array_count += 1 else: raise TypeError if ellipses_count > 1: @@ -108,7 +158,10 @@ cdef object _basic_slice_meta(object ind, tuple shape, + axes_referenced - explicit_index) new_shape = list() new_strides = list() + new_advanced_ind = list() k = 0 + new_advanced_start_pos = -1 + advanced_start_pos_set = False new_offset = offset is_empty = False for i in range(len(ind)): @@ -133,7 +186,7 @@ cdef object _basic_slice_meta(object ind, tuple shape, if sh_i == 0: is_empty = True k = k_new - elif is_integral(ind_i): + elif _is_integral(ind_i): ind_i = ind_i.__index__() if 0 <= ind_i < shape[k]: k_new = k + 1 @@ -149,8 +202,25 @@ cdef object _basic_slice_meta(object ind, tuple shape, raise IndexError( ("Index {0} is out of range for " "axes {1} with size {2}").format(ind_i, k, shape[k])) + elif isinstance(ind_i, usm_ndarray): + if not advanced_start_pos_set: + new_advanced_start_pos = len(new_shape) + advanced_start_pos_set = True + new_advanced_ind.append(ind_i) + dt_k = ind_i.dtype.kind + if dt_k == "b": + k_new = k + ind_i.ndim + else: + k_new = k + 1 + new_shape.extend(shape[k:k_new]) + new_strides.extend(strides[k:k_new]) + k = k_new new_shape.extend(shape[k:]) new_strides.extend(strides[k:]) - return (tuple(new_shape), tuple(new_strides), new_offset) + new_shape_len += len(shape) - k +# assert len(new_shape) == new_shape_len, f"{len(new_shape)} vs {new_shape_len}" +# assert len(new_strides) == new_shape_len, f"{len(new_strides)} vs {new_shape_len}" +# assert len(new_advanced_ind) == array_count + return (tuple(new_shape), tuple(new_strides), new_offset, tuple(new_advanced_ind), new_advanced_start_pos) else: raise TypeError From bcc305bdeec56c2dbb1b129f1657f7d12744d56e Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 13 Feb 2023 18:45:39 -0600 Subject: [PATCH 04/57] Added prototype with mock implementation of extract/place/take/put --- proto/advanced.py | 462 +++++++++++++++++++++++++++++++++++++++++ proto/test_advanced.py | 405 ++++++++++++++++++++++++++++++++++++ 2 files changed, 867 insertions(+) create mode 100644 proto/advanced.py create mode 100644 proto/test_advanced.py diff --git a/proto/advanced.py b/proto/advanced.py new file mode 100644 index 0000000000..8721214d73 --- /dev/null +++ b/proto/advanced.py @@ -0,0 +1,462 @@ +import numbers + +import dpctl.tensor as dpt +import dpctl.utils +from dpctl.tensor import usm_ndarray + +""" +Advanced slicing meta-infomation extraction +""" + + +class ExecutionPlacementError(Exception): + pass + + +def _slice_len(sl_start: int, sl_stop: int, sl_step: int): + """ + Compute len(range(sl_start, sl_stop, sl_step)) + """ + if sl_start == sl_stop: + return 0 + if sl_step > 0: + # 1 + argmax k such htat sl_start + sl_step*k < sl_stop + return 1 + ((sl_stop - sl_start - 1) // sl_step) + else: + return 1 + ((sl_stop - sl_start + 1) // sl_step) + + +def _is_integral(x): + """Gives True if x is an integral slice spec""" + if isinstance(x, (int, numbers.Integral)): + return True + if isinstance(x, usm_ndarray): + if x.ndim > 0: + return False + if x.dtype.kind not in "ui": + return False + return True + if callable(getattr(x, "__index__", None)): + try: + x.__index__() + except (TypeError, ValueError): + return False + return True + return False + + +def _basic_slice_meta(ind, shape: tuple, strides: tuple, offset: int): + """ + Give 
basic slicing index `ind` and array layout information produce + a 5-tuple (resulting_shape, resulting_strides, resulting_offset, + advanced_ind, resulting_advanced_ind_pos) + used to contruct a view into underlying array over which advanced + indexing, if any, is to be performed. + + Raises IndexError for invalid index `ind`. + """ + _no_advanced_ind = tuple() + _no_advanced_pos = -1 + if ind is Ellipsis: + return (shape, strides, offset, _no_advanced_ind, _no_advanced_pos) + elif ind is None: + return ( + (1,) + shape, + (0,) + strides, + offset, + _no_advanced_ind, + _no_advanced_pos, + ) + elif isinstance(ind, slice): + sl_start, sl_stop, sl_step = ind.indices(shape[0]) + sh0 = _slice_len(sl_start, sl_stop, sl_step) + str0 = sl_step * strides[0] + new_strides = ( + strides if (sl_step == 1 or sh0 == 0) else (str0,) + strides[1:] + ) + new_offset = offset if sh0 == 0 else offset + sl_start * strides[0] + return ( + (sh0,) + shape[1:], + new_strides, + new_offset, + _no_advanced_ind, + _no_advanced_pos, + ) + elif _is_integral(ind): + ind = ind.__index__() + if 0 <= ind < shape[0]: + return ( + shape[1:], + strides[1:], + offset + ind * strides[0], + _no_advanced_ind, + _no_advanced_pos, + ) + elif -shape[0] <= ind < 0: + return ( + shape[1:], + strides[1:], + offset + (shape[0] + ind) * strides[0], + _no_advanced_ind, + _no_advanced_pos, + ) + else: + raise IndexError( + "Index {0} is out of range for axes 0 with " + "size {1}".format(ind, shape[0]) + ) + elif isinstance(ind, usm_ndarray): + return (shape, strides, 0, (ind,), 0) + elif isinstance(ind, tuple): + axes_referenced = 0 + ellipses_count = 0 + newaxis_count = 0 + explicit_index = 0 + array_count = 0 + seen_arrays_yet = False + array_streak_started = False + array_streak_interrupted = False + for i in ind: + if i is None: + newaxis_count += 1 + if array_streak_started: + array_streak_interrupted = True + elif i is Ellipsis: + ellipses_count += 1 + if array_streak_started: + array_streak_interrupted = True + elif isinstance(i, slice): + axes_referenced += 1 + if array_streak_started: + array_streak_interrupted = True + elif _is_integral(i): + explicit_index += 1 + axes_referenced += 1 + if array_streak_started: + array_streak_interrupted = True + elif isinstance(i, usm_ndarray): + if not seen_arrays_yet: + seen_arrays_yet = True + array_streak_started = True + array_streak_interrupted = False + if array_streak_interrupted: + raise IndexError( + "Advanced indexing array specs may not be " + "separated by basic slicing specs." 
+ ) + dt_k = i.dtype.kind + if dt_k == "b": + axes_referenced += i.ndim + elif dt_k in "ui": + axes_referenced += 1 + else: + raise IndexError( + "arrays used as indices must be of integer " + "(or boolean) type" + ) + array_count += 1 + else: + raise TypeError + if ellipses_count > 1: + raise IndexError("an index can only have a sinlge ellipsis ('...')") + if axes_referenced > len(shape): + raise IndexError( + "too many indices for an array, array is " + "{0}-dimensional, but {1} were indexed".format( + len(shape), axes_referenced + ) + ) + if ellipses_count: + ellipses_count = len(shape) - axes_referenced + new_shape_len = ( + newaxis_count + ellipses_count + axes_referenced - explicit_index + ) + new_shape = list() + new_strides = list() + new_advanced_ind = list() + k = 0 + new_advanced_start_pos = -1 + advanced_start_pos_set = False + new_offset = offset + is_empty = False + for i in range(len(ind)): + ind_i = ind[i] + if ind_i is Ellipsis: + k_new = k + ellipses_count + new_shape.extend(shape[k:k_new]) + new_strides.extend(strides[k:k_new]) + k = k_new + elif ind_i is None: + new_shape.append(1) + new_strides.append(0) + elif isinstance(ind_i, slice): + k_new = k + 1 + sl_start, sl_stop, sl_step = ind_i.indices(shape[k]) + sh_i = _slice_len(sl_start, sl_stop, sl_step) + str_i = (1 if sh_i == 0 else sl_step) * strides[k] + new_shape.append(sh_i) + new_strides.append(str_i) + if sh_i > 0 and not is_empty: + new_offset = new_offset + sl_start * strides[k] + if sh_i == 0: + is_empty = True + k = k_new + elif _is_integral(ind_i): + ind_i = ind_i.__index__() + if 0 <= ind_i < shape[k]: + k_new = k + 1 + if not is_empty: + new_offset = new_offset + ind_i * strides[k] + k = k_new + elif -shape[k] <= ind_i < 0: + k_new = k + 1 + if not is_empty: + new_offset = ( + new_offset + (shape[k] + ind_i) * strides[k] + ) + k = k_new + else: + raise IndexError( + ( + "Index {0} is out of range for " + "axes {1} with size {2}" + ).format(ind_i, k, shape[k]) + ) + elif isinstance(ind_i, usm_ndarray): + if not advanced_start_pos_set: + new_advanced_start_pos = len(new_shape) + advanced_start_pos_set = True + new_advanced_ind.append(ind_i) + dt_k = ind_i.dtype.kind + if dt_k == "b": + k_new = k + ind_i.ndim + else: + k_new = k + 1 + new_shape.extend(shape[k:k_new]) + new_strides.extend(strides[k:k_new]) + k = k_new + new_shape.extend(shape[k:]) + new_strides.extend(strides[k:]) + debug = True + if debug: + new_shape_len += len(shape) - k + assert ( + len(new_shape) == new_shape_len + ), f"{len(new_shape)} vs {new_shape_len}" + assert ( + len(new_strides) == new_shape_len + ), f"{len(new_strides)} vs {new_shape_len}" + assert len(new_advanced_ind) == array_count + return ( + tuple(new_shape), + tuple(new_strides), + new_offset, + tuple(new_advanced_ind), + new_advanced_start_pos, + ) + else: + raise TypeError + + +def _mock_extract(ary, ary_mask, p): + exec_q = dpctl.utils.get_execution_queue( + ( + ary.sycl_queue, + ary_mask.sycl_queue, + ) + ) + if exec_q is None: + raise ExecutionPlacementError( + "Can not automatically determine where to allocate the " + "result or performance execution. " + "Use `usm_ndarray.to_device` method to migrate data to " + "be associated with the same queue." 
+ ) + + res_usm_type = dpctl.utils.get_coerced_usm_type( + ( + ary.usm_type, + ary_mask.usm_type, + ) + ) + ary_np = dpt.asnumpy(ary) + mask_np = dpt.asnumpy(ary_mask) + res_np = ary_np[(slice(None),) * p + (mask_np,)] + res = dpt.empty( + res_np.shape, dtype=ary.dtype, usm_type=res_usm_type, sycl_queue=exec_q + ) + res[...] = res_np + return res + + +def _mock_nonzero(ary): + if not isinstance(ary, usm_ndarray): + raise TypeError + q = ary.sycl_queue + usm_type = ary.usm_type + ary_np = dpt.asnumpy(ary) + nz = ary_np.nonzero() + return tuple(dpt.asarray(i, usm_type=usm_type, sycl_queue=q) for i in nz) + + +def _mock_take_multi_index(ary, inds, p): + queues_ = [ + ary.sycl_queue, + ] + usm_types_ = [ + ary.usm_type, + ] + all_integers = True + for ind in inds: + queues_.append(ind.sycl_queue) + usm_types_.append(ind.usm_type) + if all_integers: + all_integers = ind.dtype.kind in "ui" + exec_q = dpctl.utils.get_execution_queue(queues_) + if exec_q is None: + raise ExecutionPlacementError("") + if not all_integers: + print(inds) + raise IndexError( + "arrays used as indices must be of integer (or boolean) type" + ) + ary_np = dpt.asnumpy(ary) + ind_np = (slice(None),) * p + tuple(dpt.asnumpy(ind) for ind in inds) + res_np = ary_np[ind_np] + res_usm_type = dpctl.utils.get_coerced_usm_type(usm_types_) + res = dpt.empty( + res_np.shape, dtype=ary.dtype, usm_type=res_usm_type, sycl_queue=exec_q + ) + res[...] = res_np + return res + + +def get_item(ary, ind): + suai = ary.__sycl_usm_array_interface__ + _meta = _basic_slice_meta( + ind, ary.shape, ary.strides, suai.get("offset", 0) + ) + + if len(_meta) < 5: + raise RuntimeError + + res = usm_ndarray.__new__( + usm_ndarray, + _meta[0], + dtype=ary.dtype, # _make_typestr(ary.dtype.num), + strides=_meta[1], + buffer=ary.usm_data, # self.base_, + offset=_meta[2], + ) + # set flags and namespace + # res.flags_ |= (ary.flags_ & USM_ARRAY_WRITABLE) + # res.array_namespace_ = self.array_namespace_ + adv_ind = _meta[3] + adv_ind_start_p = _meta[4] + + if adv_ind_start_p < 0: + return res + + if len(adv_ind) == 1 and adv_ind[0].dtype == dpt.bool: + return _mock_extract(res, adv_ind[0], adv_ind_start_p) + + if any(ind.dtype == dpt.bool for ind in adv_ind): + adv_ind_int = list() + for ind in adv_ind: + if ind.dtype == dpt.bool: + adv_ind_int.extend(_mock_nonzero(ind)) + else: + adv_ind_int.append(ind) + return _mock_take_multi_index(res, tuple(adv_ind_int), adv_ind_start_p) + + return _mock_take_multi_index(res, adv_ind, adv_ind_start_p) + + +def _mock_place(ary, ary_mask, p, vals): + exec_q = dpctl.utils.get_execution_queue( + (ary.sycl_queue, ary_mask.sycl_queue, vals.sycl_queue) + ) + if exec_q is None: + raise ExecutionPlacementError( + "Can not automatically determine where to allocate the " + "result or performance execution. " + "Use `usm_ndarray.to_device` method to migrate data to " + "be associated with the same queue." + ) + + ary_np = dpt.asnumpy(ary) + mask_np = dpt.asnumpy(ary_mask) + vals_np = dpt.asnumpy(vals) + ary_np[(slice(None),) * p + (mask_np,)] = vals_np + ary[...] 
= ary_np + return + + +def _mock_put_multi_index(ary, inds, p, vals): + queues_ = [ary.sycl_queue, vals.sycl_queue] + usm_types_ = [ary.usm_type, vals.usm_type] + all_integers = True + for ind in inds: + queues_.append(ind.sycl_queue) + usm_types_.append(ind.usm_type) + if all_integers: + all_integers = ind.dtype.kind in "ui" + exec_q = dpctl.utils.get_execution_queue(queues_) + if exec_q is None: + raise ExecutionPlacementError("") + if not all_integers: + print(inds) + raise IndexError( + "arrays used as indices must be of integer (or boolean) type" + ) + ary_np = dpt.asnumpy(ary) + vals_np = dpt.asnumpy(vals) + ind_np = (slice(None),) * p + tuple(dpt.asnumpy(ind) for ind in inds) + ary_np[ind_np] = vals_np + ary[...] = ary_np + return + + +def set_item(ary, ind, rhs): + suai = ary.__sycl_usm_array_interface__ + _meta = _basic_slice_meta( + ind, ary.shape, ary.strides, suai.get("offset", 0) + ) + + if len(_meta) < 5: + raise RuntimeError + + res = usm_ndarray.__new__( + usm_ndarray, + _meta[0], + dtype=ary.dtype, # _make_typestr(ary.dtype.num), + strides=_meta[1], + buffer=ary.usm_data, # self.base_, + offset=_meta[2], + ) + # set flags and namespace + # res.flags_ |= (ary.flags_ & USM_ARRAY_WRITABLE) + # res.array_namespace_ = self.array_namespace_ + adv_ind = _meta[3] + adv_ind_start_p = _meta[4] + + if adv_ind_start_p < 0: + res[...] = rhs + return + + if len(adv_ind) == 1 and adv_ind[0].dtype == dpt.bool: + _mock_place(res, adv_ind[0], adv_ind_start_p, rhs) + return + + if any(ind.dtype == dpt.bool for ind in adv_ind): + adv_ind_int = list() + for ind in adv_ind: + if ind.dtype == dpt.bool: + adv_ind_int.extend(_mock_nonzero(ind)) + else: + adv_ind_int.append(ind) + _mock_put_multi_index(res, tuple(adv_ind_int), adv_ind_start_p, rhs) + return + + _mock_put_multi_index(res, adv_ind, adv_ind_start_p, rhs) + return diff --git a/proto/test_advanced.py b/proto/test_advanced.py new file mode 100644 index 0000000000..7cfb44c3d6 --- /dev/null +++ b/proto/test_advanced.py @@ -0,0 +1,405 @@ +import advanced +import numpy as np +import pytest + +import dpctl.tensor as dpt + + +def test_basic_slice1(): + res = advanced._basic_slice_meta((0,), (1,), (1,), 0) + assert res == (tuple(), tuple(), 0, tuple(), -1) + + +def test_basic_slice1a(): + res = advanced._basic_slice_meta(0, (1,), (1,), 0) + assert res == (tuple(), tuple(), 0, tuple(), -1) + + +def test_basic_slice2(): + res = advanced._basic_slice_meta((slice(None),), (1,), (1,), 0) + assert res == ((1,), (1,), 0, tuple(), -1) + + +def test_basic_slice3(): + res = advanced._basic_slice_meta((slice(None, None, -1),), (1,), (1,), 0) + assert res == ((1,), (-1,), 0, tuple(), -1) + + +def test_basic_slice4(): + res = advanced._basic_slice_meta( + (slice(None, None, -1),), + ( + 5, + 3, + ), + ( + 3, + 1, + ), + 0, + ) + assert res == ((5, 3), (-3, 1), (5 - 1) * 3, tuple(), -1) + + +def test_basic_slice5(): + res = advanced._basic_slice_meta( + ( + slice(None), + slice(None, None, -1), + ), + ( + 4, + 3, + ), + ( + 3, + 1, + ), + 0, + ) + assert res == ((4, 3), (3, -1), 3 - 1, tuple(), -1) + + +def test_basic_slice6(): + res = advanced._basic_slice_meta( + ( + 2, + slice(None, None, -1), + ), + ( + 4, + 3, + ), + ( + 3, + 1, + ), + 0, + ) + assert res == ((3,), (-1,), 2 * 3 + 3 - 1, tuple(), -1) + + +def test_basic_slice7(): + res = advanced._basic_slice_meta( + ( + Ellipsis, + slice(None, None, -1), + ), + ( + 4, + 3, + ), + ( + 3, + 1, + ), + 0, + ) + assert res == ((4, 3), (3, -1), 3 - 1, tuple(), -1) + + +def test_basic_slice8(): + res = 
advanced._basic_slice_meta( + (Ellipsis, None), + ( + 4, + 3, + ), + ( + 3, + 1, + ), + 0, + ) + assert res == ((4, 3, 1), (3, 1, 0), 0, tuple(), -1) + + +def test_basic_slice9(): + res = advanced._basic_slice_meta( + ( + None, + Ellipsis, + ), + ( + 4, + 3, + ), + ( + 3, + 1, + ), + 0, + ) + assert res == ( + ( + 1, + 4, + 3, + ), + (0, 3, 1), + 0, + tuple(), + -1, + ) + + +def test_basic_slice10(): + res = advanced._basic_slice_meta( + (None, Ellipsis, slice(None)), (4, 3, 5), (30, 5, 1), 0 + ) + assert res == ((1, 4, 3, 5), (0, 30, 5, 1), 0, tuple(), -1) + + +def test_advanced_slice1(): + ii = dpt.asarray([0, 1]) + res = advanced._basic_slice_meta((ii,), (10,), (1,), 0) + assert res == ((10,), (1,), 0, (ii,), 0) + + res = advanced._basic_slice_meta(ii, (10,), (1,), 0) + assert res == ((10,), (1,), 0, (ii,), 0) + + +def test_advanced_slice2(): + ii = dpt.asarray([0, 1]) + res = advanced._basic_slice_meta((ii, None), (10,), (1,), 0) + assert res == ((10, 1), (1, 0), 0, (ii,), 0) + + +def test_advanced_slice3(): + ii = dpt.asarray([0, 1]) + res = advanced._basic_slice_meta((None, ii), (10,), (1,), 0) + assert res == ( + ( + 1, + 10, + ), + ( + 0, + 1, + ), + 0, + (ii,), + 1, + ) + + +def test_advanced_slice4(): + ii = dpt.asarray([0, 1]) + res = advanced._basic_slice_meta( + (ii, ii, ii), + (10, 10, 10), + ( + 100, + 10, + 1, + ), + 0, + ) + assert res == ( + (10, 10, 10), + ( + 100, + 10, + 1, + ), + 0, + (ii, ii, ii), + 0, + ) + + +def test_advanced_slice5(): + ii = dpt.asarray([0, 1]) + with pytest.raises(IndexError): + advanced._basic_slice_meta( + (ii, slice(None), ii), + (10, 10, 10), + ( + 100, + 10, + 1, + ), + 0, + ) + + +def test_advanced_slice6(): + ii = dpt.asarray([0, 1]) + res = advanced._basic_slice_meta( + ( + slice(None), + ii, + ii, + ), + (10, 10, 10), + ( + 100, + 10, + 1, + ), + 0, + ) + assert res == ( + ( + 10, + 10, + 10, + ), + (100, 10, 1), + 0, + ( + ii, + ii, + ), + 1, + ) + + +def test_advanced_slice7(): + x = dpt.reshape( + dpt.arange(3 * 3 * 3, dtype="i8"), + ( + 3, + 3, + 3, + ), + ) + mask = dpt.asarray( + [ + [[True, True, False], [False, True, True], [True, False, True]], + [[True, False, False], [False, False, True], [False, True, False]], + [[True, True, True], [False, False, False], [False, False, True]], + ] + ) + res = advanced.get_item(x, mask) + res_expected = np.array([0, 1, 4, 5, 6, 8, 9, 14, 16, 18, 19, 20, 26]) + assert np.array_equal(dpt.asnumpy(res), res_expected) + res2 = advanced.get_item(x, (mask,)) + assert np.array_equal(dpt.asnumpy(res2), res_expected) + + +def test_advanced_slice8(): + x = dpt.reshape( + dpt.arange(3 * 3 * 3, dtype="i8"), + ( + 3, + 3, + 3, + ), + ) + mask = dpt.asarray( + [[True, False, False], [False, True, False], [False, True, False]] + ) + res = advanced.get_item(x, mask) + res_expected = np.array([[0, 1, 2], [12, 13, 14], [21, 22, 23]]) + assert np.array_equal(dpt.asnumpy(res), res_expected) + res2 = advanced.get_item(x, (mask,)) + assert np.array_equal(dpt.asnumpy(res2), res_expected) + + +def test_advanced_slice9(): + x = dpt.reshape( + dpt.arange(3 * 3 * 3, dtype="i8"), + ( + 3, + 3, + 3, + ), + ) + mask = dpt.asarray( + [[True, False, False], [False, True, False], [False, True, False]] + ) + res = advanced.get_item( + x, + ( + slice(None, None, None), + mask, + ), + ) + res_expected = np.array([[0, 4, 7], [9, 13, 16], [18, 22, 25]]) + assert np.array_equal(dpt.asnumpy(res), res_expected) + + +def lin_id(i, j, k): + return 9 * i + 3 * j + k + + +def test_advanced_slice10(): + x = dpt.reshape( + dpt.arange(3 * 
3 * 3, dtype="i8"), + ( + 3, + 3, + 3, + ), + ) + i0 = dpt.asarray([0, 1, 1]) + i1 = dpt.asarray([1, 1, 2]) + i2 = dpt.asarray([2, 0, 1]) + res = advanced.get_item(x, (i0, i1, i2)) + res_expected = np.array( + [ + lin_id(0, 1, 2), + lin_id(1, 1, 0), + lin_id(1, 2, 1), + ] + ) + assert np.array_equal(dpt.asnumpy(res), res_expected) + + +def test_advanced_slice11(): + x = dpt.reshape( + dpt.arange(3 * 3 * 3, dtype="i8"), + ( + 3, + 3, + 3, + ), + ) + i0 = dpt.asarray([0, 1, 1]) + i2 = dpt.asarray([2, 0, 1]) + with pytest.raises(IndexError): + advanced.get_item(x, (i0, slice(None, None, None), i2)) + + +def test_advanced_slice12(): + x = dpt.reshape( + dpt.arange(3 * 3 * 3, dtype="i8"), + ( + 3, + 3, + 3, + ), + ) + i1 = dpt.asarray([1, 1, 2]) + i2 = dpt.asarray([2, 0, 1]) + res = advanced.get_item(x, (slice(None), None, i1, i2, None)) + res_expected = np.array( + [ + [[[lin_id(0, 1, 2)], [lin_id(0, 1, 0)], [lin_id(0, 2, 1)]]], + [[[lin_id(1, 1, 2)], [lin_id(1, 1, 0)], [lin_id(1, 2, 1)]]], + [[[lin_id(2, 1, 2)], [lin_id(2, 1, 0)], [lin_id(2, 2, 1)]]], + ] + ) + assert np.array_equal(dpt.asnumpy(res), res_expected) + + +def test_advanced_slice13(): + x = dpt.reshape( + dpt.arange(3 * 3 * 3, dtype="i8"), + ( + 3, + 3, + 3, + ), + ) + i1 = dpt.asarray([[1], [2]]) + i2 = dpt.asarray([[0, 1]]) + res = advanced.get_item(x, (i1, i2, 0)) + res_expected = np.array( + [ + [lin_id(1, 0, 0), lin_id(1, 1, 0)], + [lin_id(2, 0, 0), lin_id(2, 1, 0)], + ] + ) + assert np.array_equal(dpt.asnumpy(res), res_expected) From a1791e28ff45278df797ea73340391859452286a Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 14 Feb 2023 08:53:28 -0600 Subject: [PATCH 05/57] Change mod of _slicing.pxi to be non-executable --- dpctl/tensor/_slicing.pxi | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 dpctl/tensor/_slicing.pxi diff --git a/dpctl/tensor/_slicing.pxi b/dpctl/tensor/_slicing.pxi old mode 100755 new mode 100644 From ea339eb179c4c01f4e6f22ba18689ff93fb4a0e9 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 14 Feb 2023 08:54:00 -0600 Subject: [PATCH 06/57] Added ExecutionPlacementError --- dpctl/utils/__init__.py | 2 ++ dpctl/utils/_compute_follows_data.pyx | 13 ++++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/dpctl/utils/__init__.py b/dpctl/utils/__init__.py index 7589f9de9f..671564cda5 100644 --- a/dpctl/utils/__init__.py +++ b/dpctl/utils/__init__.py @@ -19,6 +19,7 @@ """ from ._compute_follows_data import ( + ExecutionPlacementError, get_coerced_usm_type, get_execution_queue, validate_usm_type, @@ -30,4 +31,5 @@ "get_coerced_usm_type", "validate_usm_type", "onetrace_enabled", + "ExecutionPlacementError", ] diff --git a/dpctl/utils/_compute_follows_data.pyx b/dpctl/utils/_compute_follows_data.pyx index f61cebc90c..179fb6f875 100644 --- a/dpctl/utils/_compute_follows_data.pyx +++ b/dpctl/utils/_compute_follows_data.pyx @@ -28,7 +28,18 @@ import dpctl from .._sycl_queue cimport SyclQueue -__all__ = ["get_execution_queue", "get_coerced_usm_type"] +__all__ = ["get_execution_queue", "get_coerced_usm_type", "ExecutionPlacementError"] + + +class ExecutionPlacementError(Exception): + """Exception raised when execution placement target can be determined + from input arrays. + + Make sure that input arrays are associated with the same SyclQueue, + or migrate data to the same SyclQueue using usm_ndarray.to_device + method. 
+ """ + pass cdef bint queue_equiv(SyclQueue q1, SyclQueue q2): From 483a423b01fef875111605fdac8cba478ac3009d Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 14 Feb 2023 09:20:41 -0600 Subject: [PATCH 07/57] Factored out dpt.dtype and dpt.bool, etc. definitions into dedicated file --- dpctl/tensor/__init__.py | 34 ++++++++++++------------- dpctl/tensor/_data_types.py | 50 +++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 17 deletions(-) create mode 100644 dpctl/tensor/_data_types.py diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index bc6ae52564..cc4e31adf5 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -21,8 +21,6 @@ """ -from numpy import dtype - from dpctl.tensor._copy_utils import asnumpy, astype, copy, from_numpy, to_numpy from dpctl.tensor._ctors import ( arange, @@ -41,6 +39,23 @@ zeros, zeros_like, ) +from dpctl.tensor._data_types import ( + bool, + complex64, + complex128, + dtype, + float16, + float32, + float64, + int8, + int16, + int32, + int64, + uint8, + uint16, + uint32, + uint64, +) from dpctl.tensor._device import Device from dpctl.tensor._dlpack import from_dlpack from dpctl.tensor._manipulation_functions import ( @@ -68,21 +83,6 @@ from dpctl.tensor._reshape import reshape from dpctl.tensor._usmarray import usm_ndarray -bool = dtype("bool") -int8 = dtype("int8") -int16 = dtype("int16") -int32 = dtype("int32") -int64 = dtype("int64") -uint8 = dtype("uint8") -uint16 = dtype("uint16") -uint32 = dtype("uint32") -uint64 = dtype("uint64") -float16 = dtype("float16") -float32 = dtype("float32") -float64 = dtype("float64") -complex64 = dtype("complex64") -complex128 = dtype("complex128") - __all__ = [ "Device", "usm_ndarray", diff --git a/dpctl/tensor/_data_types.py b/dpctl/tensor/_data_types.py new file mode 100644 index 0000000000..c97afe37be --- /dev/null +++ b/dpctl/tensor/_data_types.py @@ -0,0 +1,50 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
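+
+# Data type objects exported as dpctl.tensor.<name>; each alias below is a NumPy dtype instance.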
+ +from numpy import dtype + +bool = dtype("bool") +int8 = dtype("int8") +int16 = dtype("int16") +int32 = dtype("int32") +int64 = dtype("int64") +uint8 = dtype("uint8") +uint16 = dtype("uint16") +uint32 = dtype("uint32") +uint64 = dtype("uint64") +float16 = dtype("float16") +float32 = dtype("float32") +float64 = dtype("float64") +complex64 = dtype("complex64") +complex128 = dtype("complex128") + +__all__ = [ + "dtype", + "bool", + "int8", + "uint8", + "int16", + "uint16", + "int32", + "uint32", + "int64", + "uint64", + "float16", + "float32", + "float64", + "complex64", + "complex128", +] From efcd9cb01c7621dfde131b3b4a58c25f0e2e75e5 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 14 Feb 2023 09:21:29 -0600 Subject: [PATCH 08/57] Deployed lazy implementation of advanced indexing to develop tests --- dpctl/tensor/_copy_utils.py | 123 ++++++++++++++++++++++++++++++++ dpctl/tensor/_usmarray.pyx | 136 ++++++++++++++++++++++++++++-------- 2 files changed, 230 insertions(+), 29 deletions(-) diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py index f006582cd3..41a2fd3203 100644 --- a/dpctl/tensor/_copy_utils.py +++ b/dpctl/tensor/_copy_utils.py @@ -15,9 +15,11 @@ # limitations under the License. import numpy as np +import dpctl import dpctl.memory as dpm import dpctl.tensor as dpt import dpctl.tensor._tensor_impl as ti +import dpctl.utils from dpctl.tensor._device import normalize_queue_device __doc__ = ( @@ -382,3 +384,124 @@ def astype(usm_ary, newdtype, order="K", casting="unsafe", copy=True): ) _copy_from_usm_ndarray_to_usm_ndarray(R, usm_ary) return R + + +def _mock_extract(ary, ary_mask, p): + exec_q = dpctl.utils.get_execution_queue( + ( + ary.sycl_queue, + ary_mask.sycl_queue, + ) + ) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError( + "Can not automatically determine where to allocate the " + "result or performance execution. " + "Use `usm_ndarray.to_device` method to migrate data to " + "be associated with the same queue." + ) + + res_usm_type = dpctl.utils.get_coerced_usm_type( + ( + ary.usm_type, + ary_mask.usm_type, + ) + ) + ary_np = dpt.asnumpy(ary) + mask_np = dpt.asnumpy(ary_mask) + res_np = ary_np[(slice(None),) * p + (mask_np,)] + res = dpt.empty( + res_np.shape, dtype=ary.dtype, usm_type=res_usm_type, sycl_queue=exec_q + ) + res[...] = res_np + return res + + +def _mock_nonzero(ary): + if not isinstance(ary, dpt.usm_ndarray): + raise TypeError + q = ary.sycl_queue + usm_type = ary.usm_type + ary_np = dpt.asnumpy(ary) + nz = ary_np.nonzero() + return tuple(dpt.asarray(i, usm_type=usm_type, sycl_queue=q) for i in nz) + + +def _mock_take_multi_index(ary, inds, p): + queues_ = [ + ary.sycl_queue, + ] + usm_types_ = [ + ary.usm_type, + ] + all_integers = True + for ind in inds: + queues_.append(ind.sycl_queue) + usm_types_.append(ind.usm_type) + if all_integers: + all_integers = ind.dtype.kind in "ui" + exec_q = dpctl.utils.get_execution_queue(queues_) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError("") + if not all_integers: + raise IndexError( + "arrays used as indices must be of integer (or boolean) type" + ) + ary_np = dpt.asnumpy(ary) + ind_np = (slice(None),) * p + tuple(dpt.asnumpy(ind) for ind in inds) + res_np = ary_np[ind_np] + res_usm_type = dpctl.utils.get_coerced_usm_type(usm_types_) + res = dpt.empty( + res_np.shape, dtype=ary.dtype, usm_type=res_usm_type, sycl_queue=exec_q + ) + res[...] 
= res_np + return res + + +def _mock_place(ary, ary_mask, p, vals): + exec_q = dpctl.utils.get_execution_queue( + (ary.sycl_queue, ary_mask.sycl_queue, vals.sycl_queue) + ) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError( + "Can not automatically determine where to allocate the " + "result or performance execution. " + "Use `usm_ndarray.to_device` method to migrate data to " + "be associated with the same queue." + ) + + ary_np = dpt.asnumpy(ary) + mask_np = dpt.asnumpy(ary_mask) + vals_np = dpt.asnumpy(vals) + ary_np[(slice(None),) * p + (mask_np,)] = vals_np + ary[...] = ary_np + return + + +def _mock_put_multi_index(ary, inds, p, vals): + queues_ = [ary.sycl_queue, vals.sycl_queue] + usm_types_ = [ary.usm_type, vals.usm_type] + all_integers = True + for ind in inds: + queues_.append(ind.sycl_queue) + usm_types_.append(ind.usm_type) + if all_integers: + all_integers = ind.dtype.kind in "ui" + exec_q = dpctl.utils.get_execution_queue(queues_) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError( + "Can not automatically determine where to allocate the " + "result or performance execution. " + "Use `usm_ndarray.to_device` method to migrate data to " + "be associated with the same queue." + ) + if not all_integers: + raise IndexError( + "arrays used as indices must be of integer (or boolean) type" + ) + ary_np = dpt.asnumpy(ary) + vals_np = dpt.asnumpy(vals) + ind_np = (slice(None),) * p + tuple(dpt.asnumpy(ind) for ind in inds) + ary_np[ind_np] = vals_np + ary[...] = ary_np + return diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx index 9a4fab8af3..3c42c96dd5 100644 --- a/dpctl/tensor/_usmarray.pyx +++ b/dpctl/tensor/_usmarray.pyx @@ -25,6 +25,7 @@ import numpy as np import dpctl import dpctl.memory as dpmem +from ._data_types import bool as dpt_bool from ._device import Device from ._print import usm_ndarray_repr, usm_ndarray_str @@ -34,6 +35,7 @@ from cpython.tuple cimport PyTuple_New, PyTuple_SetItem cimport dpctl as c_dpctl cimport dpctl.memory as c_dpmem cimport dpctl.tensor._dlpack as c_dlpack + import dpctl.tensor._flags as _flags include "_stride_utils.pxi" @@ -648,6 +650,9 @@ cdef class usm_ndarray: self.get_offset()) cdef usm_ndarray res + if len(_meta) < 5: + raise RuntimeError + res = usm_ndarray.__new__( usm_ndarray, _meta[0], @@ -658,7 +663,32 @@ cdef class usm_ndarray: ) res.flags_ |= (self.flags_ & USM_ARRAY_WRITABLE) res.array_namespace_ = self.array_namespace_ - return res + + adv_ind = _meta[3] + adv_ind_start_p = _meta[4] + + if adv_ind_start_p < 0: + return res + + from ._copy_utils import ( + _mock_extract, + _mock_nonzero, + _mock_take_multi_index, + ) + if len(adv_ind) == 1 and adv_ind[0].dtype == dpt_bool: + return _mock_extract(res, adv_ind[0], adv_ind_start_p) + + if any(ind.dtype == dpt_bool for ind in adv_ind): + adv_ind_int = list() + for ind in adv_ind: + if ind.dtype == dpt_bool: + adv_ind_int.extend(_mock_nonzero(ind)) + else: + adv_ind_int.append(ind) + return _mock_take_multi_index(res, tuple(adv_ind_int), adv_ind_start_p) + + return _mock_take_multi_index(res, adv_ind, adv_ind_start_p) + def to_device(self, target): """ @@ -959,39 +989,87 @@ cdef class usm_ndarray: return _dispatch_binary_elementwise2(first, "right_shift", other) return NotImplemented - def __setitem__(self, key, val): - try: - Xv = self.__getitem__(key) - except (ValueError, IndexError) as e: - raise e + def __setitem__(self, key, rhs): + cdef tuple _meta + cdef usm_ndarray Xv + + if (self.flags_ & USM_ARRAY_WRITABLE) == 0: + raise 
ValueError("Can not modify read-only array.") + + _meta = _basic_slice_meta( + key, (self).shape, ( self).strides, + self.get_offset() + ) + + if len(_meta) < 5: + raise RuntimeError + + Xv = usm_ndarray.__new__( + usm_ndarray, + _meta[0], + dtype=_make_typestr(self.typenum_), + strides=_meta[1], + buffer=self.base_, + offset=_meta[2], + ) + # set flags and namespace + Xv.flags_ |= (self.flags_ & USM_ARRAY_WRITABLE) + Xv.array_namespace_ = self.array_namespace_ + from ._copy_utils import ( _copy_from_numpy_into, _copy_from_usm_ndarray_to_usm_ndarray, + _mock_nonzero, + _mock_place, + _mock_put_multi_index, ) - if (( Xv).flags_ & USM_ARRAY_WRITABLE) == 0: - raise ValueError("Can not modify read-only array.") - if isinstance(val, usm_ndarray): - _copy_from_usm_ndarray_to_usm_ndarray(Xv, val) - else: - if hasattr(val, "__sycl_usm_array_interface__"): - from dpctl.tensor import asarray - try: - val_ar = asarray(val) - _copy_from_usm_ndarray_to_usm_ndarray(Xv, val_ar) - except Exception: - raise ValueError( - f"Input of type {type(val)} could not be " - "converted to usm_ndarray" - ) + + adv_ind = _meta[3] + adv_ind_start_p = _meta[4] + + if adv_ind_start_p < 0: + # basic slicing + if isinstance(rhs, usm_ndarray): + _copy_from_usm_ndarray_to_usm_ndarray(Xv, rhs) else: - try: - val_np = np.asarray(val) - _copy_from_numpy_into(Xv, val_np) - except Exception: - raise ValueError( - f"Input of type {type(val)} could not be " - "converted to usm_ndarray" - ) + if hasattr(rhs, "__sycl_usm_array_interface__"): + from dpctl.tensor import asarray + try: + rhs_ar = asarray(rhs) + _copy_from_usm_ndarray_to_usm_ndarray(Xv, rhs_ar) + except Exception: + raise ValueError( + f"Input of type {type(rhs)} could not be " + "converted to usm_ndarray" + ) + else: + try: + rhs_np = np.asarray(rhs) + _copy_from_numpy_into(Xv, rhs_np) + except Exception: + raise ValueError( + f"Input of type {type(rhs)} could not be " + "converted to usm_ndarray" + ) + return + + if len(adv_ind) == 1 and adv_ind[0].dtype == dpt_bool: + _mock_place(Xv, adv_ind[0], adv_ind_start_p, rhs) + return + + if any(ind.dtype == dpt_bool for ind in adv_ind): + adv_ind_int = list() + for ind in adv_ind: + if ind.dtype == dpt_bool: + adv_ind_int.extend(_mock_nonzero(ind)) + else: + adv_ind_int.append(ind) + _mock_put_multi_index(Xv, tuple(adv_ind_int), adv_ind_start_p, rhs) + return + + _mock_put_multi_index(Xv, adv_ind, adv_ind_start_p, rhs) + return + def __sub__(first, other): "See comment in __add__" From 04e4b5128ba650ebaa4578eab544ef69e1b72e35 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 14 Feb 2023 11:12:08 -0600 Subject: [PATCH 09/57] Ensure that rhs can be a scalar or numpy array --- dpctl/tensor/_copy_utils.py | 38 ++++++++++++++++++++++++++++++++----- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py index 41a2fd3203..f83ecdbd74 100644 --- a/dpctl/tensor/_copy_utils.py +++ b/dpctl/tensor/_copy_utils.py @@ -428,6 +428,8 @@ def _mock_nonzero(ary): def _mock_take_multi_index(ary, inds, p): + if not isinstance(ary, dpt.usm_ndarray): + raise TypeError queues_ = [ ary.sycl_queue, ] @@ -459,9 +461,15 @@ def _mock_take_multi_index(ary, inds, p): def _mock_place(ary, ary_mask, p, vals): + if not isinstance(ary, dpt.usm_ndarray): + raise TypeError + if not isinstance(ary_mask, dpt.usm_ndarray): + raise TypeError exec_q = dpctl.utils.get_execution_queue( - (ary.sycl_queue, ary_mask.sycl_queue, vals.sycl_queue) + (ary.sycl_queue, ary_mask.sycl_queue) ) + if exec_q is 
not None and isinstance(vals, dpt.usm_ndarray): + exec_q = dpctl.utils.get_execution_queue((exec_q, vals.sycl_queue)) if exec_q is None: raise dpctl.utils.ExecutionPlacementError( "Can not automatically determine where to allocate the " @@ -472,17 +480,32 @@ def _mock_place(ary, ary_mask, p, vals): ary_np = dpt.asnumpy(ary) mask_np = dpt.asnumpy(ary_mask) - vals_np = dpt.asnumpy(vals) + if isinstance(vals, dpt.usm_ndarray) or hasattr( + vals, "__sycl_usm_array_interface__" + ): + vals_np = dpt.asnumpy(vals) + else: + vals_np = vals ary_np[(slice(None),) * p + (mask_np,)] = vals_np ary[...] = ary_np return def _mock_put_multi_index(ary, inds, p, vals): - queues_ = [ary.sycl_queue, vals.sycl_queue] - usm_types_ = [ary.usm_type, vals.usm_type] + if isinstance(vals, dpt.ums_ndarray): + queues_ = [ary.sycl_queue, vals.sycl_queue] + usm_types_ = [ary.usm_type, vals.usm_type] + else: + queues_ = [ + ary.sycl_queue, + ] + usm_types_ = [ + ary.usm_type, + ] all_integers = True for ind in inds: + if not isinstance(ind, dpt.usm_ndarray): + raise TypeError queues_.append(ind.sycl_queue) usm_types_.append(ind.usm_type) if all_integers: @@ -500,7 +523,12 @@ def _mock_put_multi_index(ary, inds, p, vals): "arrays used as indices must be of integer (or boolean) type" ) ary_np = dpt.asnumpy(ary) - vals_np = dpt.asnumpy(vals) + if isinstance(vals, dpt.usm_ndarray) or hasattr( + vals, "__sycl_usm_array_interface__" + ): + vals_np = dpt.asnumpy(vals) + else: + vals_np = vals ind_np = (slice(None),) * p + tuple(dpt.asnumpy(ind) for ind in inds) ary_np[ind_np] = vals_np ary[...] = ary_np From 6589dfa05b6355cf42919a4028d48f8787d59523 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 14 Feb 2023 11:22:29 -0600 Subject: [PATCH 10/57] Added new test file to test indexing --- dpctl/tests/test_usm_ndarray_indexing.py | 60 ++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 dpctl/tests/test_usm_ndarray_indexing.py diff --git a/dpctl/tests/test_usm_ndarray_indexing.py b/dpctl/tests/test_usm_ndarray_indexing.py new file mode 100644 index 0000000000..2441a663cf --- /dev/null +++ b/dpctl/tests/test_usm_ndarray_indexing.py @@ -0,0 +1,60 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2022 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
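+
+# Unit tests for usm_ndarray indexing behavior.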
+ + +# import numpy as np +# import pytest +from helper import get_queue_or_skip + +# import dpctl +import dpctl.tensor as dpt + +# from helper import skip_if_dtype_not_supported + + +def test_basic_slice1(): + q = get_queue_or_skip() + x = dpt.empty(10, dtype="u2", sycl_queue=q) + y = x[0] + assert isinstance(y, dpt.usm_ndarray) + assert y.ndim == 0 + assert y.shape == tuple() + assert y.strides == tuple() + + +def test_basic_slice2(): + q = get_queue_or_skip() + x = dpt.empty(10, dtype="i2", sycl_queue=q) + y = x[(0,)] + assert isinstance(y, dpt.usm_ndarray) + assert y.ndim == 0 + assert y.shape == tuple() + assert y.strides == tuple() + + +def test_basic_slice3(): + q = get_queue_or_skip() + x = dpt.empty(10, dtype="i2", sycl_queue=q) + y = x[:] + assert isinstance(y, dpt.usm_ndarray) + assert y.ndim == x.ndim + assert y.shape == x.shape + assert y.strides == x.strides + y = x[(slice(None, None, None),)] + assert isinstance(y, dpt.usm_ndarray) + assert y.ndim == x.ndim + assert y.shape == x.shape + assert y.strides == x.strides From d66c4940a9fecec47f77abac546756c6b10ca391 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 15 Feb 2023 09:53:25 -0600 Subject: [PATCH 11/57] Added _constants, and extended advanced indexing tests --- dpctl/tensor/__init__.py | 7 ++ dpctl/tensor/_constants.py | 24 ++++++ dpctl/tests/test_usm_ndarray_indexing.py | 99 ++++++++++++++++++++++++ 3 files changed, 130 insertions(+) create mode 100644 dpctl/tensor/_constants.py diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index cc4e31adf5..7f2a6a9962 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -83,6 +83,8 @@ from dpctl.tensor._reshape import reshape from dpctl.tensor._usmarray import usm_ndarray +from ._constants import e, inf, nan, newaxis, pi + __all__ = [ "Device", "usm_ndarray", @@ -141,4 +143,9 @@ "print_options", "usm_ndarray_repr", "usm_ndarray_str", + "newaxis", + "e", + "pi", + "nan", + "inf", ] diff --git a/dpctl/tensor/_constants.py b/dpctl/tensor/_constants.py new file mode 100644 index 0000000000..88c516364d --- /dev/null +++ b/dpctl/tensor/_constants.py @@ -0,0 +1,24 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
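+
+# Constants exposed in the dpctl.tensor namespace: the newaxis alias and
+# floating-point constants (pi, e, nan, inf) taken from NumPy.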
+ +import numpy as np + +newaxis = None + +pi = np.pi +e = np.e +nan = np.nan +inf = np.inf diff --git a/dpctl/tests/test_usm_ndarray_indexing.py b/dpctl/tests/test_usm_ndarray_indexing.py index 2441a663cf..152c1aefdd 100644 --- a/dpctl/tests/test_usm_ndarray_indexing.py +++ b/dpctl/tests/test_usm_ndarray_indexing.py @@ -58,3 +58,102 @@ def test_basic_slice3(): assert y.ndim == x.ndim assert y.shape == x.shape assert y.strides == x.strides + + +def test_basic_slice4(): + q = get_queue_or_skip() + n0, n1 = 5, 3 + x = dpt.empty((n0, n1), dtype="f4", sycl_queue=q) + y = x[::-1] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == x.shape + assert y.strides == (-x.strides[0], x.strides[1]) + actual_offset = y.__sycl_usm_array_interface__["offset"] + assert actual_offset == (n0 - 1) * n1 + + +def test_basic_slice5(): + q = get_queue_or_skip() + n0, n1 = 5, 3 + x = dpt.empty((n0, n1), dtype="c8", sycl_queue=q) + y = x[:, ::-1] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == x.shape + assert y.strides == (x.strides[0], -x.strides[1]) + actual_offset = y.__sycl_usm_array_interface__["offset"] + assert actual_offset == (n1 - 1) + + +def test_basic_slice6(): + q = get_queue_or_skip() + i0, n0, n1 = 2, 4, 3 + x = dpt.empty((n0, n1), dtype="c8", sycl_queue=q) + y = x[i0, ::-1] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == (x.shape[1],) + assert y.strides == (-x.strides[1],) + actual_offset = y.__sycl_usm_array_interface__["offset"] + expected_offset = i0 * x.strides[0] + (n1 - 1) * x.strides[1] + assert actual_offset == expected_offset + + +def test_basic_slice7(): + q = get_queue_or_skip() + n0, n1, n2 = 5, 3, 2 + x = dpt.empty((n0, n1, n2), dtype="?", sycl_queue=q) + y = x[..., ::-1] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == x.shape + assert y.strides == ( + x.strides[0], + x.strides[1], + -x.strides[2], + ) + actual_offset = y.__sycl_usm_array_interface__["offset"] + expected_offset = (n2 - 1) * x.strides[2] + assert actual_offset == expected_offset + + +def test_basic_slice8(): + q = get_queue_or_skip() + n0, n1 = 3, 7 + x = dpt.empty((n0, n1), dtype="u1", sycl_queue=q) + y = x[..., dpt.newaxis] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == (n0, n1, 1) + assert y.strides == (n1, 1, 0) + + +def test_basic_slice9(): + q = get_queue_or_skip() + n0, n1 = 3, 7 + x = dpt.empty((n0, n1), dtype="u8", sycl_queue=q) + y = x[dpt.newaxis, ...] 
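+    # dpt.newaxis (an alias of None) prepends a length-1 axis with stride 0;
+    # the result is a view, no data are copied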
+ assert isinstance(y, dpt.usm_ndarray) + assert y.shape == (1, n0, n1) + assert y.strides == (0, n1, 1) + + +def test_basic_slice10(): + q = get_queue_or_skip() + n0, n1, n2 = 3, 7, 5 + x = dpt.empty((n0, n1, n2), dtype="u1", sycl_queue=q) + y = x[dpt.newaxis, ..., :] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == (1, n0, n1, n2) + assert y.strides == (0, n1 * n2, n2, 1) + + +def test_advanced_slice1(): + q = get_queue_or_skip() + ii = dpt.asarray([1, 2], sycl_queue=q) + x = dpt.arange(10, dtype="i4", sycl_queue=q) + y = x[ii] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == ii.shape + assert y.strides == (1,) + # FIXME, once usm_ndarray.__equal__ is implemented, + # use of asnumpy should be removed + assert all( + dpt.asnumpy(x[ii[k]]) == dpt.asnumpy(y[k]) for k in range(ii.shape[0]) + ) From 61917a5efee44cdf3c5f08eadd35a06169cc2d81 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 15 Feb 2023 13:46:55 -0600 Subject: [PATCH 12/57] More tests for advanced indexing --- dpctl/tests/test_usm_ndarray_indexing.py | 212 ++++++++++++++++++++++- 1 file changed, 209 insertions(+), 3 deletions(-) diff --git a/dpctl/tests/test_usm_ndarray_indexing.py b/dpctl/tests/test_usm_ndarray_indexing.py index 152c1aefdd..e6c7271ab1 100644 --- a/dpctl/tests/test_usm_ndarray_indexing.py +++ b/dpctl/tests/test_usm_ndarray_indexing.py @@ -16,7 +16,7 @@ # import numpy as np -# import pytest +import pytest from helper import get_queue_or_skip # import dpctl @@ -144,6 +144,10 @@ def test_basic_slice10(): assert y.strides == (0, n1 * n2, n2, 1) +def _all_equal(it1, it2): + return all(dpt.asnumpy(x) == dpt.asnumpy(y) for x, y in zip(it1, it2)) + + def test_advanced_slice1(): q = get_queue_or_skip() ii = dpt.asarray([1, 2], sycl_queue=q) @@ -154,6 +158,208 @@ def test_advanced_slice1(): assert y.strides == (1,) # FIXME, once usm_ndarray.__equal__ is implemented, # use of asnumpy should be removed - assert all( - dpt.asnumpy(x[ii[k]]) == dpt.asnumpy(y[k]) for k in range(ii.shape[0]) + assert _all_equal( + (x[ii[k]] for k in range(ii.shape[0])), + (y[k] for k in range(ii.shape[0])), + ) + y = x[(ii,)] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == ii.shape + assert y.strides == (1,) + # FIXME, once usm_ndarray.__equal__ is implemented, + # use of asnumpy should be removed + assert _all_equal( + (x[ii[k]] for k in range(ii.shape[0])), + (y[k] for k in range(ii.shape[0])), ) + + +def test_advanced_slice2(): + q = get_queue_or_skip() + ii = dpt.asarray([1, 2], sycl_queue=q) + x = dpt.arange(10, dtype="i4", sycl_queue=q) + y = x[ii, dpt.newaxis] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == ii.shape + (1,) + assert y.flags["C"] + + +def test_advanced_slice3(): + q = get_queue_or_skip() + ii = dpt.asarray([1, 2], sycl_queue=q) + x = dpt.arange(10, dtype="i4", sycl_queue=q) + y = x[dpt.newaxis, ii] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == (1,) + ii.shape + assert y.flags["C"] + + +def _make_3d(dt, q): + return dpt.reshape( + dpt.arange(3 * 3 * 3, dtype=dt, sycl_queue=q), + ( + 3, + 3, + 3, + ), + ) + + +def test_advanced_slice4(): + q = get_queue_or_skip() + ii = dpt.asarray([1, 2], sycl_queue=q) + x = _make_3d("i4", q) + y = x[ii, ii, ii] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == ii.shape + assert _all_equal( + (x[ii[k], ii[k], ii[k]] for k in range(ii.shape[0])), + (y[k] for k in range(ii.shape[0])), + ) + + +def test_advanced_slice5(): + q = get_queue_or_skip() + ii = dpt.asarray([1, 2], sycl_queue=q) + x = _make_3d("i4", q) + 
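+    # an integer index placed between two array indices interrupts the
+    # advanced-indexing streak, which this implementation rejects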
with pytest.raises(IndexError): + x[ii, 0, ii] + + +def test_advanced_slice6(): + q = get_queue_or_skip() + ii = dpt.asarray([1, 2], sycl_queue=q) + x = _make_3d("i4", q) + y = x[:, ii, ii] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == ( + x.shape[0], + ii.shape[0], + ) + assert _all_equal( + ( + x[i, ii[k], ii[k]] + for i in range(x.shape[0]) + for k in range(ii.shape[0]) + ), + (y[i, k] for i in range(x.shape[0]) for k in range(ii.shape[0])), + ) + + +def test_advanced_slice7(): + q = get_queue_or_skip() + mask = dpt.asarray( + [ + [[True, True, False], [False, True, True], [True, False, True]], + [[True, False, False], [False, False, True], [False, True, False]], + [[True, True, True], [False, False, False], [False, False, True]], + ], + sycl_queue=q, + ) + x = _make_3d("i2", q) + y = x[mask] + expected = [0, 1, 4, 5, 6, 8, 9, 14, 16, 18, 19, 20, 26] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == (len(expected),) + assert all(dpt.asnumpy(y[k]) == expected[k] for k in range(len(expected))) + + +def test_advanced_slice8(): + q = get_queue_or_skip() + mask = dpt.asarray( + [[True, False, False], [False, True, False], [False, True, False]], + sycl_queue=q, + ) + x = _make_3d("u2", q) + y = x[mask] + expected = dpt.asarray( + [[0, 1, 2], [12, 13, 14], [21, 22, 23]], sycl_queue=q + ) + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == expected.shape + assert (dpt.asnumpy(y) == dpt.asnumpy(expected)).all() + + +def test_advanced_slice9(): + q = get_queue_or_skip() + mask = dpt.asarray( + [[True, False, False], [False, True, False], [False, True, False]], + sycl_queue=q, + ) + x = _make_3d("u4", q) + y = x[:, mask] + expected = dpt.asarray([[0, 4, 7], [9, 13, 16], [18, 22, 25]], sycl_queue=q) + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == expected.shape + assert (dpt.asnumpy(y) == dpt.asnumpy(expected)).all() + + +def lin_id(i, j, k): + """global_linear_id for (3,3,3) range traversed in C-contiguous order""" + return 9 * i + 3 * j + k + + +def test_advanced_slice10(): + q = get_queue_or_skip() + x = _make_3d("u8", q) + i0 = dpt.asarray([0, 1, 1], device=x.device) + i1 = dpt.asarray([1, 1, 2], device=x.device) + i2 = dpt.asarray([2, 0, 1], device=x.device) + y = x[i0, i1, i2] + res_expected = dpt.asarray( + [ + lin_id(0, 1, 2), + lin_id(1, 1, 0), + lin_id(1, 2, 1), + ], + sycl_queue=q, + ) + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == res_expected.shape + assert (dpt.asnumpy(y) == dpt.asnumpy(res_expected)).all() + + +def test_advanced_slice11(): + q = get_queue_or_skip() + x = _make_3d("u8", q) + i0 = dpt.asarray([0, 1, 1], device=x.device) + i2 = dpt.asarray([2, 0, 1], device=x.device) + with pytest.raises(IndexError): + x[i0, :, i2] + + +def test_advanced_slice12(): + q = get_queue_or_skip() + x = _make_3d("u8", q) + i1 = dpt.asarray([1, 1, 2], device=x.device) + i2 = dpt.asarray([2, 0, 1], device=x.device) + y = x[:, dpt.newaxis, i1, i2, dpt.newaxis] + res_expected = dpt.asarray( + [ + [[[lin_id(0, 1, 2)], [lin_id(0, 1, 0)], [lin_id(0, 2, 1)]]], + [[[lin_id(1, 1, 2)], [lin_id(1, 1, 0)], [lin_id(1, 2, 1)]]], + [[[lin_id(2, 1, 2)], [lin_id(2, 1, 0)], [lin_id(2, 2, 1)]]], + ], + sycl_queue=q, + ) + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == res_expected.shape + assert (dpt.asnumpy(y) == dpt.asnumpy(res_expected)).all() + + +def test_advanced_slice13(): + q = get_queue_or_skip() + x = _make_3d("u8", q) + i1 = dpt.asarray([[1], [2]], device=x.device) + i2 = dpt.asarray([[0, 1]], device=x.device) + y = x[i1, i2, 0] + 
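+    # i1 has shape (2, 1) and i2 has shape (1, 2); the index arrays
+    # broadcast to a (2, 2) result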
expected = dpt.asarray( + [ + [lin_id(1, 0, 0), lin_id(1, 1, 0)], + [lin_id(2, 0, 0), lin_id(2, 1, 0)], + ], + device=x.device, + ) + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == expected.shape + assert (dpt.asnumpy(y) == dpt.asnumpy(expected)).all() From c2d7928bb440675be1dac7ef2fe1a98d1bcd2593 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 15 Feb 2023 14:06:16 -0600 Subject: [PATCH 13/57] Removed proto/ folder --- proto/advanced.py | 462 ----------------------------------------- proto/test_advanced.py | 405 ------------------------------------ 2 files changed, 867 deletions(-) delete mode 100644 proto/advanced.py delete mode 100644 proto/test_advanced.py diff --git a/proto/advanced.py b/proto/advanced.py deleted file mode 100644 index 8721214d73..0000000000 --- a/proto/advanced.py +++ /dev/null @@ -1,462 +0,0 @@ -import numbers - -import dpctl.tensor as dpt -import dpctl.utils -from dpctl.tensor import usm_ndarray - -""" -Advanced slicing meta-infomation extraction -""" - - -class ExecutionPlacementError(Exception): - pass - - -def _slice_len(sl_start: int, sl_stop: int, sl_step: int): - """ - Compute len(range(sl_start, sl_stop, sl_step)) - """ - if sl_start == sl_stop: - return 0 - if sl_step > 0: - # 1 + argmax k such htat sl_start + sl_step*k < sl_stop - return 1 + ((sl_stop - sl_start - 1) // sl_step) - else: - return 1 + ((sl_stop - sl_start + 1) // sl_step) - - -def _is_integral(x): - """Gives True if x is an integral slice spec""" - if isinstance(x, (int, numbers.Integral)): - return True - if isinstance(x, usm_ndarray): - if x.ndim > 0: - return False - if x.dtype.kind not in "ui": - return False - return True - if callable(getattr(x, "__index__", None)): - try: - x.__index__() - except (TypeError, ValueError): - return False - return True - return False - - -def _basic_slice_meta(ind, shape: tuple, strides: tuple, offset: int): - """ - Give basic slicing index `ind` and array layout information produce - a 5-tuple (resulting_shape, resulting_strides, resulting_offset, - advanced_ind, resulting_advanced_ind_pos) - used to contruct a view into underlying array over which advanced - indexing, if any, is to be performed. - - Raises IndexError for invalid index `ind`. 
- """ - _no_advanced_ind = tuple() - _no_advanced_pos = -1 - if ind is Ellipsis: - return (shape, strides, offset, _no_advanced_ind, _no_advanced_pos) - elif ind is None: - return ( - (1,) + shape, - (0,) + strides, - offset, - _no_advanced_ind, - _no_advanced_pos, - ) - elif isinstance(ind, slice): - sl_start, sl_stop, sl_step = ind.indices(shape[0]) - sh0 = _slice_len(sl_start, sl_stop, sl_step) - str0 = sl_step * strides[0] - new_strides = ( - strides if (sl_step == 1 or sh0 == 0) else (str0,) + strides[1:] - ) - new_offset = offset if sh0 == 0 else offset + sl_start * strides[0] - return ( - (sh0,) + shape[1:], - new_strides, - new_offset, - _no_advanced_ind, - _no_advanced_pos, - ) - elif _is_integral(ind): - ind = ind.__index__() - if 0 <= ind < shape[0]: - return ( - shape[1:], - strides[1:], - offset + ind * strides[0], - _no_advanced_ind, - _no_advanced_pos, - ) - elif -shape[0] <= ind < 0: - return ( - shape[1:], - strides[1:], - offset + (shape[0] + ind) * strides[0], - _no_advanced_ind, - _no_advanced_pos, - ) - else: - raise IndexError( - "Index {0} is out of range for axes 0 with " - "size {1}".format(ind, shape[0]) - ) - elif isinstance(ind, usm_ndarray): - return (shape, strides, 0, (ind,), 0) - elif isinstance(ind, tuple): - axes_referenced = 0 - ellipses_count = 0 - newaxis_count = 0 - explicit_index = 0 - array_count = 0 - seen_arrays_yet = False - array_streak_started = False - array_streak_interrupted = False - for i in ind: - if i is None: - newaxis_count += 1 - if array_streak_started: - array_streak_interrupted = True - elif i is Ellipsis: - ellipses_count += 1 - if array_streak_started: - array_streak_interrupted = True - elif isinstance(i, slice): - axes_referenced += 1 - if array_streak_started: - array_streak_interrupted = True - elif _is_integral(i): - explicit_index += 1 - axes_referenced += 1 - if array_streak_started: - array_streak_interrupted = True - elif isinstance(i, usm_ndarray): - if not seen_arrays_yet: - seen_arrays_yet = True - array_streak_started = True - array_streak_interrupted = False - if array_streak_interrupted: - raise IndexError( - "Advanced indexing array specs may not be " - "separated by basic slicing specs." 
- ) - dt_k = i.dtype.kind - if dt_k == "b": - axes_referenced += i.ndim - elif dt_k in "ui": - axes_referenced += 1 - else: - raise IndexError( - "arrays used as indices must be of integer " - "(or boolean) type" - ) - array_count += 1 - else: - raise TypeError - if ellipses_count > 1: - raise IndexError("an index can only have a sinlge ellipsis ('...')") - if axes_referenced > len(shape): - raise IndexError( - "too many indices for an array, array is " - "{0}-dimensional, but {1} were indexed".format( - len(shape), axes_referenced - ) - ) - if ellipses_count: - ellipses_count = len(shape) - axes_referenced - new_shape_len = ( - newaxis_count + ellipses_count + axes_referenced - explicit_index - ) - new_shape = list() - new_strides = list() - new_advanced_ind = list() - k = 0 - new_advanced_start_pos = -1 - advanced_start_pos_set = False - new_offset = offset - is_empty = False - for i in range(len(ind)): - ind_i = ind[i] - if ind_i is Ellipsis: - k_new = k + ellipses_count - new_shape.extend(shape[k:k_new]) - new_strides.extend(strides[k:k_new]) - k = k_new - elif ind_i is None: - new_shape.append(1) - new_strides.append(0) - elif isinstance(ind_i, slice): - k_new = k + 1 - sl_start, sl_stop, sl_step = ind_i.indices(shape[k]) - sh_i = _slice_len(sl_start, sl_stop, sl_step) - str_i = (1 if sh_i == 0 else sl_step) * strides[k] - new_shape.append(sh_i) - new_strides.append(str_i) - if sh_i > 0 and not is_empty: - new_offset = new_offset + sl_start * strides[k] - if sh_i == 0: - is_empty = True - k = k_new - elif _is_integral(ind_i): - ind_i = ind_i.__index__() - if 0 <= ind_i < shape[k]: - k_new = k + 1 - if not is_empty: - new_offset = new_offset + ind_i * strides[k] - k = k_new - elif -shape[k] <= ind_i < 0: - k_new = k + 1 - if not is_empty: - new_offset = ( - new_offset + (shape[k] + ind_i) * strides[k] - ) - k = k_new - else: - raise IndexError( - ( - "Index {0} is out of range for " - "axes {1} with size {2}" - ).format(ind_i, k, shape[k]) - ) - elif isinstance(ind_i, usm_ndarray): - if not advanced_start_pos_set: - new_advanced_start_pos = len(new_shape) - advanced_start_pos_set = True - new_advanced_ind.append(ind_i) - dt_k = ind_i.dtype.kind - if dt_k == "b": - k_new = k + ind_i.ndim - else: - k_new = k + 1 - new_shape.extend(shape[k:k_new]) - new_strides.extend(strides[k:k_new]) - k = k_new - new_shape.extend(shape[k:]) - new_strides.extend(strides[k:]) - debug = True - if debug: - new_shape_len += len(shape) - k - assert ( - len(new_shape) == new_shape_len - ), f"{len(new_shape)} vs {new_shape_len}" - assert ( - len(new_strides) == new_shape_len - ), f"{len(new_strides)} vs {new_shape_len}" - assert len(new_advanced_ind) == array_count - return ( - tuple(new_shape), - tuple(new_strides), - new_offset, - tuple(new_advanced_ind), - new_advanced_start_pos, - ) - else: - raise TypeError - - -def _mock_extract(ary, ary_mask, p): - exec_q = dpctl.utils.get_execution_queue( - ( - ary.sycl_queue, - ary_mask.sycl_queue, - ) - ) - if exec_q is None: - raise ExecutionPlacementError( - "Can not automatically determine where to allocate the " - "result or performance execution. " - "Use `usm_ndarray.to_device` method to migrate data to " - "be associated with the same queue." 
- ) - - res_usm_type = dpctl.utils.get_coerced_usm_type( - ( - ary.usm_type, - ary_mask.usm_type, - ) - ) - ary_np = dpt.asnumpy(ary) - mask_np = dpt.asnumpy(ary_mask) - res_np = ary_np[(slice(None),) * p + (mask_np,)] - res = dpt.empty( - res_np.shape, dtype=ary.dtype, usm_type=res_usm_type, sycl_queue=exec_q - ) - res[...] = res_np - return res - - -def _mock_nonzero(ary): - if not isinstance(ary, usm_ndarray): - raise TypeError - q = ary.sycl_queue - usm_type = ary.usm_type - ary_np = dpt.asnumpy(ary) - nz = ary_np.nonzero() - return tuple(dpt.asarray(i, usm_type=usm_type, sycl_queue=q) for i in nz) - - -def _mock_take_multi_index(ary, inds, p): - queues_ = [ - ary.sycl_queue, - ] - usm_types_ = [ - ary.usm_type, - ] - all_integers = True - for ind in inds: - queues_.append(ind.sycl_queue) - usm_types_.append(ind.usm_type) - if all_integers: - all_integers = ind.dtype.kind in "ui" - exec_q = dpctl.utils.get_execution_queue(queues_) - if exec_q is None: - raise ExecutionPlacementError("") - if not all_integers: - print(inds) - raise IndexError( - "arrays used as indices must be of integer (or boolean) type" - ) - ary_np = dpt.asnumpy(ary) - ind_np = (slice(None),) * p + tuple(dpt.asnumpy(ind) for ind in inds) - res_np = ary_np[ind_np] - res_usm_type = dpctl.utils.get_coerced_usm_type(usm_types_) - res = dpt.empty( - res_np.shape, dtype=ary.dtype, usm_type=res_usm_type, sycl_queue=exec_q - ) - res[...] = res_np - return res - - -def get_item(ary, ind): - suai = ary.__sycl_usm_array_interface__ - _meta = _basic_slice_meta( - ind, ary.shape, ary.strides, suai.get("offset", 0) - ) - - if len(_meta) < 5: - raise RuntimeError - - res = usm_ndarray.__new__( - usm_ndarray, - _meta[0], - dtype=ary.dtype, # _make_typestr(ary.dtype.num), - strides=_meta[1], - buffer=ary.usm_data, # self.base_, - offset=_meta[2], - ) - # set flags and namespace - # res.flags_ |= (ary.flags_ & USM_ARRAY_WRITABLE) - # res.array_namespace_ = self.array_namespace_ - adv_ind = _meta[3] - adv_ind_start_p = _meta[4] - - if adv_ind_start_p < 0: - return res - - if len(adv_ind) == 1 and adv_ind[0].dtype == dpt.bool: - return _mock_extract(res, adv_ind[0], adv_ind_start_p) - - if any(ind.dtype == dpt.bool for ind in adv_ind): - adv_ind_int = list() - for ind in adv_ind: - if ind.dtype == dpt.bool: - adv_ind_int.extend(_mock_nonzero(ind)) - else: - adv_ind_int.append(ind) - return _mock_take_multi_index(res, tuple(adv_ind_int), adv_ind_start_p) - - return _mock_take_multi_index(res, adv_ind, adv_ind_start_p) - - -def _mock_place(ary, ary_mask, p, vals): - exec_q = dpctl.utils.get_execution_queue( - (ary.sycl_queue, ary_mask.sycl_queue, vals.sycl_queue) - ) - if exec_q is None: - raise ExecutionPlacementError( - "Can not automatically determine where to allocate the " - "result or performance execution. " - "Use `usm_ndarray.to_device` method to migrate data to " - "be associated with the same queue." - ) - - ary_np = dpt.asnumpy(ary) - mask_np = dpt.asnumpy(ary_mask) - vals_np = dpt.asnumpy(vals) - ary_np[(slice(None),) * p + (mask_np,)] = vals_np - ary[...] 
= ary_np - return - - -def _mock_put_multi_index(ary, inds, p, vals): - queues_ = [ary.sycl_queue, vals.sycl_queue] - usm_types_ = [ary.usm_type, vals.usm_type] - all_integers = True - for ind in inds: - queues_.append(ind.sycl_queue) - usm_types_.append(ind.usm_type) - if all_integers: - all_integers = ind.dtype.kind in "ui" - exec_q = dpctl.utils.get_execution_queue(queues_) - if exec_q is None: - raise ExecutionPlacementError("") - if not all_integers: - print(inds) - raise IndexError( - "arrays used as indices must be of integer (or boolean) type" - ) - ary_np = dpt.asnumpy(ary) - vals_np = dpt.asnumpy(vals) - ind_np = (slice(None),) * p + tuple(dpt.asnumpy(ind) for ind in inds) - ary_np[ind_np] = vals_np - ary[...] = ary_np - return - - -def set_item(ary, ind, rhs): - suai = ary.__sycl_usm_array_interface__ - _meta = _basic_slice_meta( - ind, ary.shape, ary.strides, suai.get("offset", 0) - ) - - if len(_meta) < 5: - raise RuntimeError - - res = usm_ndarray.__new__( - usm_ndarray, - _meta[0], - dtype=ary.dtype, # _make_typestr(ary.dtype.num), - strides=_meta[1], - buffer=ary.usm_data, # self.base_, - offset=_meta[2], - ) - # set flags and namespace - # res.flags_ |= (ary.flags_ & USM_ARRAY_WRITABLE) - # res.array_namespace_ = self.array_namespace_ - adv_ind = _meta[3] - adv_ind_start_p = _meta[4] - - if adv_ind_start_p < 0: - res[...] = rhs - return - - if len(adv_ind) == 1 and adv_ind[0].dtype == dpt.bool: - _mock_place(res, adv_ind[0], adv_ind_start_p, rhs) - return - - if any(ind.dtype == dpt.bool for ind in adv_ind): - adv_ind_int = list() - for ind in adv_ind: - if ind.dtype == dpt.bool: - adv_ind_int.extend(_mock_nonzero(ind)) - else: - adv_ind_int.append(ind) - _mock_put_multi_index(res, tuple(adv_ind_int), adv_ind_start_p, rhs) - return - - _mock_put_multi_index(res, adv_ind, adv_ind_start_p, rhs) - return diff --git a/proto/test_advanced.py b/proto/test_advanced.py deleted file mode 100644 index 7cfb44c3d6..0000000000 --- a/proto/test_advanced.py +++ /dev/null @@ -1,405 +0,0 @@ -import advanced -import numpy as np -import pytest - -import dpctl.tensor as dpt - - -def test_basic_slice1(): - res = advanced._basic_slice_meta((0,), (1,), (1,), 0) - assert res == (tuple(), tuple(), 0, tuple(), -1) - - -def test_basic_slice1a(): - res = advanced._basic_slice_meta(0, (1,), (1,), 0) - assert res == (tuple(), tuple(), 0, tuple(), -1) - - -def test_basic_slice2(): - res = advanced._basic_slice_meta((slice(None),), (1,), (1,), 0) - assert res == ((1,), (1,), 0, tuple(), -1) - - -def test_basic_slice3(): - res = advanced._basic_slice_meta((slice(None, None, -1),), (1,), (1,), 0) - assert res == ((1,), (-1,), 0, tuple(), -1) - - -def test_basic_slice4(): - res = advanced._basic_slice_meta( - (slice(None, None, -1),), - ( - 5, - 3, - ), - ( - 3, - 1, - ), - 0, - ) - assert res == ((5, 3), (-3, 1), (5 - 1) * 3, tuple(), -1) - - -def test_basic_slice5(): - res = advanced._basic_slice_meta( - ( - slice(None), - slice(None, None, -1), - ), - ( - 4, - 3, - ), - ( - 3, - 1, - ), - 0, - ) - assert res == ((4, 3), (3, -1), 3 - 1, tuple(), -1) - - -def test_basic_slice6(): - res = advanced._basic_slice_meta( - ( - 2, - slice(None, None, -1), - ), - ( - 4, - 3, - ), - ( - 3, - 1, - ), - 0, - ) - assert res == ((3,), (-1,), 2 * 3 + 3 - 1, tuple(), -1) - - -def test_basic_slice7(): - res = advanced._basic_slice_meta( - ( - Ellipsis, - slice(None, None, -1), - ), - ( - 4, - 3, - ), - ( - 3, - 1, - ), - 0, - ) - assert res == ((4, 3), (3, -1), 3 - 1, tuple(), -1) - - -def test_basic_slice8(): - res = 
advanced._basic_slice_meta( - (Ellipsis, None), - ( - 4, - 3, - ), - ( - 3, - 1, - ), - 0, - ) - assert res == ((4, 3, 1), (3, 1, 0), 0, tuple(), -1) - - -def test_basic_slice9(): - res = advanced._basic_slice_meta( - ( - None, - Ellipsis, - ), - ( - 4, - 3, - ), - ( - 3, - 1, - ), - 0, - ) - assert res == ( - ( - 1, - 4, - 3, - ), - (0, 3, 1), - 0, - tuple(), - -1, - ) - - -def test_basic_slice10(): - res = advanced._basic_slice_meta( - (None, Ellipsis, slice(None)), (4, 3, 5), (30, 5, 1), 0 - ) - assert res == ((1, 4, 3, 5), (0, 30, 5, 1), 0, tuple(), -1) - - -def test_advanced_slice1(): - ii = dpt.asarray([0, 1]) - res = advanced._basic_slice_meta((ii,), (10,), (1,), 0) - assert res == ((10,), (1,), 0, (ii,), 0) - - res = advanced._basic_slice_meta(ii, (10,), (1,), 0) - assert res == ((10,), (1,), 0, (ii,), 0) - - -def test_advanced_slice2(): - ii = dpt.asarray([0, 1]) - res = advanced._basic_slice_meta((ii, None), (10,), (1,), 0) - assert res == ((10, 1), (1, 0), 0, (ii,), 0) - - -def test_advanced_slice3(): - ii = dpt.asarray([0, 1]) - res = advanced._basic_slice_meta((None, ii), (10,), (1,), 0) - assert res == ( - ( - 1, - 10, - ), - ( - 0, - 1, - ), - 0, - (ii,), - 1, - ) - - -def test_advanced_slice4(): - ii = dpt.asarray([0, 1]) - res = advanced._basic_slice_meta( - (ii, ii, ii), - (10, 10, 10), - ( - 100, - 10, - 1, - ), - 0, - ) - assert res == ( - (10, 10, 10), - ( - 100, - 10, - 1, - ), - 0, - (ii, ii, ii), - 0, - ) - - -def test_advanced_slice5(): - ii = dpt.asarray([0, 1]) - with pytest.raises(IndexError): - advanced._basic_slice_meta( - (ii, slice(None), ii), - (10, 10, 10), - ( - 100, - 10, - 1, - ), - 0, - ) - - -def test_advanced_slice6(): - ii = dpt.asarray([0, 1]) - res = advanced._basic_slice_meta( - ( - slice(None), - ii, - ii, - ), - (10, 10, 10), - ( - 100, - 10, - 1, - ), - 0, - ) - assert res == ( - ( - 10, - 10, - 10, - ), - (100, 10, 1), - 0, - ( - ii, - ii, - ), - 1, - ) - - -def test_advanced_slice7(): - x = dpt.reshape( - dpt.arange(3 * 3 * 3, dtype="i8"), - ( - 3, - 3, - 3, - ), - ) - mask = dpt.asarray( - [ - [[True, True, False], [False, True, True], [True, False, True]], - [[True, False, False], [False, False, True], [False, True, False]], - [[True, True, True], [False, False, False], [False, False, True]], - ] - ) - res = advanced.get_item(x, mask) - res_expected = np.array([0, 1, 4, 5, 6, 8, 9, 14, 16, 18, 19, 20, 26]) - assert np.array_equal(dpt.asnumpy(res), res_expected) - res2 = advanced.get_item(x, (mask,)) - assert np.array_equal(dpt.asnumpy(res2), res_expected) - - -def test_advanced_slice8(): - x = dpt.reshape( - dpt.arange(3 * 3 * 3, dtype="i8"), - ( - 3, - 3, - 3, - ), - ) - mask = dpt.asarray( - [[True, False, False], [False, True, False], [False, True, False]] - ) - res = advanced.get_item(x, mask) - res_expected = np.array([[0, 1, 2], [12, 13, 14], [21, 22, 23]]) - assert np.array_equal(dpt.asnumpy(res), res_expected) - res2 = advanced.get_item(x, (mask,)) - assert np.array_equal(dpt.asnumpy(res2), res_expected) - - -def test_advanced_slice9(): - x = dpt.reshape( - dpt.arange(3 * 3 * 3, dtype="i8"), - ( - 3, - 3, - 3, - ), - ) - mask = dpt.asarray( - [[True, False, False], [False, True, False], [False, True, False]] - ) - res = advanced.get_item( - x, - ( - slice(None, None, None), - mask, - ), - ) - res_expected = np.array([[0, 4, 7], [9, 13, 16], [18, 22, 25]]) - assert np.array_equal(dpt.asnumpy(res), res_expected) - - -def lin_id(i, j, k): - return 9 * i + 3 * j + k - - -def test_advanced_slice10(): - x = dpt.reshape( - dpt.arange(3 * 
3 * 3, dtype="i8"), - ( - 3, - 3, - 3, - ), - ) - i0 = dpt.asarray([0, 1, 1]) - i1 = dpt.asarray([1, 1, 2]) - i2 = dpt.asarray([2, 0, 1]) - res = advanced.get_item(x, (i0, i1, i2)) - res_expected = np.array( - [ - lin_id(0, 1, 2), - lin_id(1, 1, 0), - lin_id(1, 2, 1), - ] - ) - assert np.array_equal(dpt.asnumpy(res), res_expected) - - -def test_advanced_slice11(): - x = dpt.reshape( - dpt.arange(3 * 3 * 3, dtype="i8"), - ( - 3, - 3, - 3, - ), - ) - i0 = dpt.asarray([0, 1, 1]) - i2 = dpt.asarray([2, 0, 1]) - with pytest.raises(IndexError): - advanced.get_item(x, (i0, slice(None, None, None), i2)) - - -def test_advanced_slice12(): - x = dpt.reshape( - dpt.arange(3 * 3 * 3, dtype="i8"), - ( - 3, - 3, - 3, - ), - ) - i1 = dpt.asarray([1, 1, 2]) - i2 = dpt.asarray([2, 0, 1]) - res = advanced.get_item(x, (slice(None), None, i1, i2, None)) - res_expected = np.array( - [ - [[[lin_id(0, 1, 2)], [lin_id(0, 1, 0)], [lin_id(0, 2, 1)]]], - [[[lin_id(1, 1, 2)], [lin_id(1, 1, 0)], [lin_id(1, 2, 1)]]], - [[[lin_id(2, 1, 2)], [lin_id(2, 1, 0)], [lin_id(2, 2, 1)]]], - ] - ) - assert np.array_equal(dpt.asnumpy(res), res_expected) - - -def test_advanced_slice13(): - x = dpt.reshape( - dpt.arange(3 * 3 * 3, dtype="i8"), - ( - 3, - 3, - 3, - ), - ) - i1 = dpt.asarray([[1], [2]]) - i2 = dpt.asarray([[0, 1]]) - res = advanced.get_item(x, (i1, i2, 0)) - res_expected = np.array( - [ - [lin_id(1, 0, 0), lin_id(1, 1, 0)], - [lin_id(2, 0, 0), lin_id(2, 1, 0)], - ] - ) - assert np.array_equal(dpt.asnumpy(res), res_expected) From 832a981f1bbda43a872eced15322ce90ae6ca5d9 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Wed, 22 Feb 2023 15:40:44 -0800 Subject: [PATCH 14/57] Implemented advanced indexing kernels - Kernels for _take, _put - Python API functions for take, put --- dpctl/tensor/CMakeLists.txt | 1 + dpctl/tensor/__init__.py | 3 + dpctl/tensor/_copy_utils.py | 57 +- dpctl/tensor/_indexing_functions.py | 171 +++ .../include/kernels/advanced_indexing.hpp | 417 ++++++ .../libtensor/source/advanced_indexing.cpp | 1142 +++++++++++++++++ .../libtensor/source/advanced_indexing.hpp | 62 + dpctl/tensor/libtensor/source/tensor_py.cpp | 24 + 8 files changed, 1862 insertions(+), 15 deletions(-) create mode 100644 dpctl/tensor/_indexing_functions.py create mode 100644 dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp create mode 100644 dpctl/tensor/libtensor/source/advanced_indexing.cpp create mode 100644 dpctl/tensor/libtensor/source/advanced_indexing.hpp diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index b496ecfbd8..83db95805e 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -23,6 +23,7 @@ pybind11_add_module(${python_module_name} MODULE ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/advanced_indexing.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index 7f2a6a9962..d21958b4fa 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -58,6 +58,7 @@ ) from dpctl.tensor._device import Device from dpctl.tensor._dlpack import from_dlpack +from dpctl.tensor._indexing_functions import put, take from dpctl.tensor._manipulation_functions 
import ( broadcast_arrays, broadcast_to, @@ -112,6 +113,8 @@ "expand_dims", "permute_dims", "squeeze", + "take", + "put", "from_numpy", "to_numpy", "asnumpy", diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py index f83ecdbd74..382d92bb79 100644 --- a/dpctl/tensor/_copy_utils.py +++ b/dpctl/tensor/_copy_utils.py @@ -13,7 +13,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import operator + import numpy as np +from numpy.core.numeric import normalize_axis_index import dpctl import dpctl.memory as dpm @@ -449,14 +452,25 @@ def _mock_take_multi_index(ary, inds, p): raise IndexError( "arrays used as indices must be of integer (or boolean) type" ) - ary_np = dpt.asnumpy(ary) - ind_np = (slice(None),) * p + tuple(dpt.asnumpy(ind) for ind in inds) - res_np = ary_np[ind_np] + inds = dpt.broadcast_arrays(*inds) + ary_ndim = ary.ndim + if ary_ndim > 0: + p = operator.index(p) + p = normalize_axis_index(p, ary_ndim) + + res_shape = ary.shape[:p] + inds[0].shape + ary.shape[p + len(inds) :] + else: + res_shape = inds[0].shape res_usm_type = dpctl.utils.get_coerced_usm_type(usm_types_) res = dpt.empty( - res_np.shape, dtype=ary.dtype, usm_type=res_usm_type, sycl_queue=exec_q + res_shape, dtype=ary.dtype, usm_type=res_usm_type, sycl_queue=exec_q ) - res[...] = res_np + + hev, _ = ti._take( + src=ary, ind=inds, dst=res, axis_start=p, mode=0, sycl_queue=exec_q + ) + hev.wait() + return res @@ -492,7 +506,7 @@ def _mock_place(ary, ary_mask, p, vals): def _mock_put_multi_index(ary, inds, p, vals): - if isinstance(vals, dpt.ums_ndarray): + if isinstance(vals, dpt.usm_ndarray): queues_ = [ary.sycl_queue, vals.sycl_queue] usm_types_ = [ary.usm_type, vals.usm_type] else: @@ -522,14 +536,27 @@ def _mock_put_multi_index(ary, inds, p, vals): raise IndexError( "arrays used as indices must be of integer (or boolean) type" ) - ary_np = dpt.asnumpy(ary) - if isinstance(vals, dpt.usm_ndarray) or hasattr( - vals, "__sycl_usm_array_interface__" - ): - vals_np = dpt.asnumpy(vals) + + inds = dpt.broadcast_arrays(*inds) + ary_ndim = ary.ndim + if ary_ndim > 0: + p = operator.index(p) + p = normalize_axis_index(p, ary_ndim) + vals_shape = ary.shape[:p] + inds[0].shape + ary.shape[p + len(inds) :] else: - vals_np = vals - ind_np = (slice(None),) * p + tuple(dpt.asnumpy(ind) for ind in inds) - ary_np[ind_np] = vals_np - ary[...] = ary_np + vals_shape = inds[0].shape + + vals_usm_type = dpctl.utils.get_coerced_usm_type(usm_types_) + if not isinstance(vals, dpt.usm_ndarray): + vals = dpt.asarray( + vals, ary.dtype, usm_type=vals_usm_type, sycl_queue=exec_q + ) + + vals = dpt.broadcast_to(vals, vals_shape) + + hev, _ = ti._put( + dst=ary, ind=inds, val=vals, axis_start=p, mode=0, sycl_queue=exec_q + ) + hev.wait() + return diff --git a/dpctl/tensor/_indexing_functions.py b/dpctl/tensor/_indexing_functions.py new file mode 100644 index 0000000000..c3562de8f8 --- /dev/null +++ b/dpctl/tensor/_indexing_functions.py @@ -0,0 +1,171 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2022 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import operator + +import numpy as np +from numpy.core.numeric import normalize_axis_index + +import dpctl +import dpctl.tensor as dpt +from dpctl.tensor._tensor_impl import _put, _take + + +def take(x, indices, /, *, axis=None, mode="clip"): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expected instance of `dpt.usm_ndarray`, got `{}`.".format(type(x)) + ) + + if not isinstance(indices, list) and not isinstance(indices, tuple): + indices = (indices,) + + queues_ = [ + x.sycl_queue, + ] + usm_types_ = [ + x.usm_type, + ] + + for i in indices: + if not isinstance(i, dpt.usm_ndarray): + raise TypeError( + "`indices` expected `dpt.usm_ndarray`, got `{}`.".format( + type(i) + ) + ) + if not np.issubdtype(i.dtype, np.integer): + raise TypeError( + "`indices` expected integer data type, got `{}`".format(i.dtype) + ) + queues_.append(i.sycl_queue) + usm_types_.append(i.usm_type) + exec_q = dpctl.utils.get_execution_queue(queues_) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError( + "Can not automatically determine where to allocate the " + "result or performance execution. " + "Use `usm_ndarray.to_device` method to migrate data to " + "be associated with the same queue." + ) + res_usm_type = dpctl.utils.get_coerced_usm_type(usm_types_) + + modes = {"clip": 0, "wrap": 1} + try: + mode = modes[mode] + except KeyError: + raise ValueError("`mode` must be `clip` or `wrap`.") + + x_ndim = x.ndim + if axis is None: + if x_ndim > 1: + raise ValueError( + "`axis` cannot be `None` for array of dimension `{}`".format( + x_ndim + ) + ) + axis = 0 + + indices = dpt.broadcast_arrays(*indices) + if x_ndim > 0: + axis = operator.index(axis) + axis = normalize_axis_index(axis, x_ndim) + res_shape = ( + x.shape[:axis] + indices[0].shape + x.shape[axis + len(indices) :] + ) + else: + res_shape = indices[0].shape + + res = dpt.empty( + res_shape, dtype=x.dtype, usm_type=res_usm_type, sycl_queue=exec_q + ) + + hev, _ = _take(x, indices, res, axis, mode, sycl_queue=exec_q) + hev.wait() + + return res + + +def put(x, indices, vals, /, *, axis=None, mode="clip"): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expected instance of `dpt.usm_ndarray`, got `{}`.".format(type(x)) + ) + queues_ = [ + x.sycl_queue, + ] + usm_types_ = [ + x.usm_type, + ] + + if not isinstance(indices, list) and not isinstance(indices, tuple): + indices = (indices,) + + for i in indices: + if not isinstance(i, dpt.usm_ndarray): + raise TypeError( + "`indices` expected `dpt.usm_ndarray`, got `{}`.".format( + type(i) + ) + ) + if not np.issubdtype(i.dtype, np.integer): + raise TypeError( + "`indices` expected integer data type, got `{}`".format(i.dtype) + ) + queues_.append(i.sycl_queue) + usm_types_.append(i.usm_type) + exec_q = dpctl.utils.get_execution_queue(queues_) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError( + "Can not automatically determine where to allocate the " + "result or performance execution. " + "Use `usm_ndarray.to_device` method to migrate data to " + "be associated with the same queue." 
+ ) + val_usm_type = dpctl.utils.get_coerced_usm_type(usm_types_) + + modes = {"clip": 0, "wrap": 1} + try: + mode = modes[mode] + except KeyError: + raise ValueError("`mode` must be `wrap`, or `clip`.") + + # when axis is none, array is treated as 1D + if axis is None: + x = dpt.reshape(x, (x.size,), copy=False) + axis = 0 + + indices = dpt.broadcast_arrays(*indices) + x_ndim = x.ndim + if x_ndim > 0: + axis = operator.index(axis) + axis = normalize_axis_index(axis, x_ndim) + + val_shape = ( + x.shape[:axis] + indices[0].shape + x.shape[axis + len(indices) :] + ) + else: + val_shape = indices[0].shape + + if not isinstance(vals, dpt.usm_ndarray): + vals = dpt.asarray( + vals, dtype=x.dtype, usm_type=val_usm_type, sycl_queue=exec_q + ) + + vals = dpt.broadcast_to(vals, val_shape) + + hev, _ = _put(x, indices, vals, axis, mode, sycl_queue=exec_q) + hev.wait() diff --git a/dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp new file mode 100644 index 0000000000..77234296ff --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp @@ -0,0 +1,417 @@ +//=== indexing.hpp - Implementation of indexing kernels ---*-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for advanced tensor index operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include "utils/strided_iters.hpp" +#include "utils/type_utils.hpp" +#include +#include +#include +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace indexing +{ + +namespace py = pybind11; + +template class take_kernel; +template class put_kernel; + +template class ClipIndex +{ +public: + ClipIndex() = default; + + void operator()(py::ssize_t max_item, indT &ind) const + { + max_item = (max_item > 0) ? max_item : 1; + py::ssize_t clip_ind = static_cast(ind); + ind = (ind < 0) ? 0 : (clip_ind >= max_item) ? (max_item - 1) : ind; + return; + } +}; + +template class WrapIndex +{ +public: + WrapIndex() = default; + + void operator()(py::ssize_t max_item, indT &ind) const + { + max_item = (max_item > 0) ? max_item : 1; + py::ssize_t wrap_ind = static_cast(ind); + ind = (ind < 0) ? max_item - (-wrap_ind % max_item) + : (wrap_ind >= max_item) ? 
wrap_ind % max_item + : ind; + return; + } +}; + +template class TakeFunctor +{ +private: + const char *src_ = nullptr; + char *dst_ = nullptr; + char **ind_ = nullptr; + int nd_ = 0; + int ind_nd_ = 0; + int k_ = 0; + size_t ind_nelems_ = 0; + const py::ssize_t *orthog_shape_and_strides_ = nullptr; + const py::ssize_t *axes_shape_and_strides_ = nullptr; + const py::ssize_t *ind_shape_and_strides_ = nullptr; + py::ssize_t src_offset_ = 0; + py::ssize_t dst_offset_ = 0; + const py::ssize_t *ind_offsets_ = nullptr; + +public: + TakeFunctor(const char *src_cp, + char *dst_cp, + char **ind_cp, + int nd, + int ind_nd, + int k, + size_t ind_nelems, + const py::ssize_t *orthog_shape_and_strides, + const py::ssize_t *axes_shape_and_strides, + const py::ssize_t *ind_shape_and_strides, + py::ssize_t src_offset, + py::ssize_t dst_offset, + const py::ssize_t *ind_offsets) + : src_(src_cp), dst_(dst_cp), ind_(ind_cp), nd_(nd), ind_nd_(ind_nd), + k_(k), ind_nelems_(ind_nelems), + orthog_shape_and_strides_(orthog_shape_and_strides), + axes_shape_and_strides_(axes_shape_and_strides), + ind_shape_and_strides_(ind_shape_and_strides), + src_offset_(src_offset), dst_offset_(dst_offset), + ind_offsets_(ind_offsets) + { + } + + void operator()(sycl::id<1> id) const + { + const T *src = reinterpret_cast(src_); + T *dst = reinterpret_cast(dst_); + + py::ssize_t i_orthog = id / ind_nelems_; + py::ssize_t i_along = id - (i_orthog * ind_nelems_); + + py::ssize_t src_orthog_idx(0); + py::ssize_t dst_orthog_idx(0); + CIndexer_vector indxr(nd_); + indxr.get_displacement( + static_cast(i_orthog), + orthog_shape_and_strides_, // common shape + orthog_shape_and_strides_ + nd_, // src strides + orthog_shape_and_strides_ + 2 * nd_, // dst strides + src_orthog_idx, // modified by reference + dst_orthog_idx); + + ProjectorT proj{}; + py::ssize_t ind_arr_idx(0); + CIndexer_vector ind_indxr(ind_nd_); + for (int axis_idx = 0; axis_idx < k_; ++axis_idx) { + ind_indxr.get_displacement( + static_cast(i_along), ind_shape_and_strides_, + ind_shape_and_strides_ + ((axis_idx + 1) * ind_nd_), + ind_arr_idx); + indT *ind_data = reinterpret_cast(ind_[axis_idx]); + indT i = ind_data[ind_arr_idx + ind_offsets_[axis_idx]]; + proj(axes_shape_and_strides_[axis_idx], i); + src_orthog_idx += i * axes_shape_and_strides_[k_ + axis_idx]; + } + py::ssize_t ind_dst_idx(0); + ind_indxr.get_displacement( + static_cast(i_along), ind_shape_and_strides_, + axes_shape_and_strides_ + (2 * k_), ind_dst_idx); + + dst[dst_orthog_idx + ind_dst_idx + dst_offset_] = + src[src_orthog_idx + src_offset_]; + } +}; + +typedef sycl::event (*take_fn_ptr_t)(sycl::queue, + size_t, + size_t, + int, + int, + int, + const py::ssize_t *, + const py::ssize_t *, + const py::ssize_t *, + const char *, + char *, + char **, + py::ssize_t, + py::ssize_t, + const py::ssize_t *, + const std::vector &); + +template +sycl::event take_impl(sycl::queue q, + size_t orthog_nelems, + size_t ind_nelems, + int nd, + int ind_nd, + int k, + const py::ssize_t *orthog_shape_and_strides, + const py::ssize_t *axes_shape_and_strides, + const py::ssize_t *ind_shape_and_strides, + const char *src_p, + char *dst_p, + char **ind_p, + py::ssize_t src_offset, + py::ssize_t dst_offset, + const py::ssize_t *ind_offsets, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event take_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const size_t gws = orthog_nelems * ind_nelems; + + cgh.parallel_for>( + sycl::range<1>(gws), + TakeFunctor( + 
src_p, dst_p, ind_p, nd, ind_nd, k, ind_nelems, + orthog_shape_and_strides, axes_shape_and_strides, + ind_shape_and_strides, src_offset, dst_offset, ind_offsets)); + }); + + return take_ev; +} + +template class PutFunctor +{ +private: + char *dst_ = nullptr; + const char *val_ = nullptr; + char **ind_ = nullptr; + int nd_ = 0; + int ind_nd_ = 0; + int k_ = 0; + size_t ind_nelems_ = 0; + const py::ssize_t *orthog_shape_and_strides_ = nullptr; + const py::ssize_t *axes_shape_and_strides_ = nullptr; + const py::ssize_t *ind_shape_and_strides_ = nullptr; + py::ssize_t dst_offset_ = 0; + py::ssize_t val_offset_ = 0; + const py::ssize_t *ind_offsets_ = nullptr; + +public: + PutFunctor(char *dst_cp, + const char *val_cp, + char **ind_cp, + int nd, + int ind_nd, + int k, + size_t ind_nelems, + const py::ssize_t *orthog_shape_and_strides, + const py::ssize_t *axes_shape_and_strides, + const py::ssize_t *ind_shape_and_strides, + py::ssize_t dst_offset, + py::ssize_t val_offset, + const py::ssize_t *ind_offsets) + : dst_(dst_cp), val_(val_cp), ind_(ind_cp), nd_(nd), ind_nd_(ind_nd), + k_(k), ind_nelems_(ind_nelems), + orthog_shape_and_strides_(orthog_shape_and_strides), + axes_shape_and_strides_(axes_shape_and_strides), + ind_shape_and_strides_(ind_shape_and_strides), + dst_offset_(dst_offset), val_offset_(val_offset), + ind_offsets_(ind_offsets) + { + } + + void operator()(sycl::id<1> id) const + { + T *dst = reinterpret_cast(dst_); + const T *val = reinterpret_cast(val_); + + py::ssize_t i_orthog = id / ind_nelems_; + py::ssize_t i_along = id - (i_orthog * ind_nelems_); + + py::ssize_t dst_orthog_idx(0); + py::ssize_t val_orthog_idx(0); + CIndexer_vector indxr(nd_); + indxr.get_displacement( + static_cast(i_orthog), + orthog_shape_and_strides_, // common shape + orthog_shape_and_strides_ + nd_, // dst strides + orthog_shape_and_strides_ + 2 * nd_, // val strides + dst_orthog_idx, // modified by reference + val_orthog_idx); + + ProjectorT proj{}; + py::ssize_t ind_arr_idx(0); + CIndexer_vector ind_indxr(ind_nd_); + for (int axis_idx = 0; axis_idx < k_; ++axis_idx) { + ind_indxr.get_displacement( + static_cast(i_along), ind_shape_and_strides_, + ind_shape_and_strides_ + ((axis_idx + 1) * ind_nd_), + ind_arr_idx); + indT *ind_data = reinterpret_cast(ind_[axis_idx]); + indT i = ind_data[ind_arr_idx + ind_offsets_[axis_idx]]; + proj(axes_shape_and_strides_[axis_idx], i); + dst_orthog_idx += i * axes_shape_and_strides_[k_ + axis_idx]; + } + py::ssize_t ind_val_idx(0); + ind_indxr.get_displacement( + static_cast(i_along), ind_shape_and_strides_, + axes_shape_and_strides_ + (2 * k_), ind_val_idx); + + dst[dst_orthog_idx + dst_offset_] = + val[val_orthog_idx + ind_val_idx + val_offset_]; + } +}; + +typedef sycl::event (*put_fn_ptr_t)(sycl::queue, + size_t, + size_t, + int, + int, + int, + const py::ssize_t *, + const py::ssize_t *, + const py::ssize_t *, + char *, + const char *, + char **, + py::ssize_t, + py::ssize_t, + const py::ssize_t *, + const std::vector &); + +template +sycl::event put_impl(sycl::queue q, + size_t orthog_nelems, + size_t ind_nelems, + int nd, + int ind_nd, + int k, + const py::ssize_t *orthog_shape_and_strides, + const py::ssize_t *axes_shape_and_strides, + const py::ssize_t *ind_shape_and_strides, + char *dst_p, + const char *val_p, + char **ind_p, + py::ssize_t dst_offset, + py::ssize_t val_offset, + const py::ssize_t *ind_offsets, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event put_ev = q.submit([&](sycl::handler &cgh) { 
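+        // one work-item is launched per (orthogonal element, index element)
+        // pair; PutFunctor splits the flat id back into those two coordinates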
+ cgh.depends_on(depends); + + const size_t gws = orthog_nelems * ind_nelems; + + cgh.parallel_for>( + sycl::range<1>(gws), + PutFunctor( + dst_p, val_p, ind_p, nd, ind_nd, k, ind_nelems, + orthog_shape_and_strides, axes_shape_and_strides, + ind_shape_and_strides, dst_offset, val_offset, ind_offsets)); + }); + + return put_ev; +} + +template struct TakeWrapFactory +{ + fnT get() + { + if constexpr (std::is_integral::value && + !std::is_same::value) { + fnT fn = take_impl, T, indT>; + return fn; + } + else { + fnT fn = nullptr; + return fn; + } + } +}; + +template struct TakeClipFactory +{ + fnT get() + { + if constexpr (std::is_integral::value && + !std::is_same::value) { + fnT fn = take_impl, T, indT>; + return fn; + } + else { + fnT fn = nullptr; + return fn; + } + } +}; + +template struct PutWrapFactory +{ + fnT get() + { + if constexpr (std::is_integral::value && + !std::is_same::value) { + fnT fn = put_impl, T, indT>; + return fn; + } + else { + fnT fn = nullptr; + return fn; + } + } +}; + +template struct PutClipFactory +{ + fnT get() + { + if constexpr (std::is_integral::value && + !std::is_same::value) { + fnT fn = put_impl, T, indT>; + return fn; + } + else { + fnT fn = nullptr; + return fn; + } + } +}; + +} // namespace indexing +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/advanced_indexing.cpp b/dpctl/tensor/libtensor/source/advanced_indexing.cpp new file mode 100644 index 0000000000..5f043db7bc --- /dev/null +++ b/dpctl/tensor/libtensor/source/advanced_indexing.cpp @@ -0,0 +1,1142 @@ +//===-- take_kernel_impl.cpp - Implementation of take --*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines implementation functions of dpctl.tensor.take and +/// dpctl.tensor.put +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include "kernels/advanced_indexing.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#define INDEXING_MODES 2 +#define CLIP_MODE 0 +#define WRAP_MODE 1 + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace _ns = dpctl::tensor::detail; + +using dpctl::tensor::kernels::indexing::put_fn_ptr_t; +using dpctl::tensor::kernels::indexing::take_fn_ptr_t; + +static take_fn_ptr_t take_dispatch_table[INDEXING_MODES][_ns::num_types] + [_ns::num_types]; + +static put_fn_ptr_t put_dispatch_table[INDEXING_MODES][_ns::num_types] + [_ns::num_types]; + +namespace py = pybind11; + +using dpctl::tensor::c_contiguous_strides; +using dpctl::tensor::f_contiguous_strides; + +using dpctl::utils::keep_args_alive; + +std::vector _populate_packed_shapes_strides_for_indexing( + sycl::queue exec_q, + py::ssize_t *device_orthog_shapes_strides, + py::ssize_t *device_axes_shapes_strides, + const py::ssize_t *inp_shape, + const py::ssize_t *inp_strides, + bool is_inp_c_contig, + bool is_inp_f_contig, + const py::ssize_t *arr_shape, + const py::ssize_t *arr_strides, + bool is_arr_c_contig, + bool is_arr_f_contig, + int axis_start, + int k, + int ind_nd, + int inp_nd, + int arr_nd) +{ + + int orthog_sh_elems = (inp_nd > 1) ? inp_nd - k : 1; + int along_sh_elems = (ind_nd > 1) ? ind_nd : 1; + + using usm_host_allocatorT = + sycl::usm_allocator; + using shT = std::vector; + + usm_host_allocatorT allocator(exec_q); + std::shared_ptr packed_host_shapes_strides_shp = + std::make_shared(3 * orthog_sh_elems, allocator); + + std::shared_ptr packed_host_axes_shapes_strides_shp = + std::make_shared(2 * k + along_sh_elems, allocator); + + // can be made more efficient by checking if inp_nd > 1, then performing + // same treatment of orthog_sh_elems as for 0D (orthog will not exist) + if (inp_nd > 0) { + std::copy(inp_shape, inp_shape + axis_start, + packed_host_shapes_strides_shp->begin()); + std::copy(inp_shape + axis_start + k, inp_shape + inp_nd, + packed_host_shapes_strides_shp->begin() + axis_start); + std::copy(inp_shape + axis_start, inp_shape + axis_start + k, + packed_host_axes_shapes_strides_shp->begin()); + + // contract axes by using two copies + if (inp_strides == nullptr) { + if (is_inp_c_contig) { + const auto &inp_contig_strides = + c_contiguous_strides(inp_nd, inp_shape); + std::copy(inp_contig_strides.begin(), + inp_contig_strides.begin() + axis_start, + packed_host_shapes_strides_shp->begin() + + orthog_sh_elems); + std::copy(inp_contig_strides.begin() + axis_start + k, + inp_contig_strides.end(), + packed_host_shapes_strides_shp->begin() + + orthog_sh_elems + axis_start); + std::copy(inp_contig_strides.begin() + axis_start, + inp_contig_strides.begin() + axis_start + k, + packed_host_axes_shapes_strides_shp->begin() + k); + } + else if (is_inp_f_contig) { + const auto &inp_contig_strides = + f_contiguous_strides(inp_nd, inp_shape); + std::copy(inp_contig_strides.begin(), + inp_contig_strides.begin() + axis_start, + packed_host_shapes_strides_shp->begin() + + orthog_sh_elems); + std::copy(inp_contig_strides.begin() + axis_start + k, + inp_contig_strides.end(), + 
packed_host_shapes_strides_shp->begin() + + orthog_sh_elems + axis_start); + std::copy(inp_contig_strides.begin() + axis_start, + inp_contig_strides.begin() + axis_start + k, + packed_host_axes_shapes_strides_shp->begin() + k); + } + else { + sycl::free(device_orthog_shapes_strides, exec_q); + throw std::runtime_error("Invalid array encountered"); + } + } + else { + std::copy(inp_strides, inp_strides + axis_start, + packed_host_shapes_strides_shp->begin() + + orthog_sh_elems); + std::copy(inp_strides + axis_start + k, inp_strides + inp_nd, + packed_host_shapes_strides_shp->begin() + + orthog_sh_elems + axis_start); + std::copy(inp_strides + axis_start, inp_strides + axis_start + k, + packed_host_axes_shapes_strides_shp->begin() + k); + } + + if (arr_strides == nullptr) { + if (is_arr_c_contig) { + const auto &arr_contig_strides = + c_contiguous_strides(arr_nd, arr_shape); + std::copy(arr_contig_strides.begin(), + arr_contig_strides.begin() + axis_start, + packed_host_shapes_strides_shp->begin() + + 2 * orthog_sh_elems); + std::copy(arr_contig_strides.begin() + axis_start + ind_nd, + arr_contig_strides.end(), + packed_host_shapes_strides_shp->begin() + + 2 * orthog_sh_elems + axis_start); + std::copy(arr_contig_strides.begin() + axis_start, + arr_contig_strides.begin() + axis_start + ind_nd, + packed_host_axes_shapes_strides_shp->begin() + 2 * k); + } + else if (is_arr_f_contig) { + const auto &arr_contig_strides = + f_contiguous_strides(arr_nd, arr_shape); + std::copy(arr_contig_strides.begin(), + arr_contig_strides.begin() + axis_start, + packed_host_shapes_strides_shp->begin() + + 2 * orthog_sh_elems); + std::copy(arr_contig_strides.begin() + axis_start + ind_nd, + arr_contig_strides.end(), + packed_host_shapes_strides_shp->begin() + + 2 * orthog_sh_elems + axis_start); + std::copy(arr_contig_strides.begin() + axis_start, + arr_contig_strides.begin() + axis_start + ind_nd, + packed_host_axes_shapes_strides_shp->begin() + 2 * k); + } + else { + sycl::free(device_orthog_shapes_strides, exec_q); + throw std::runtime_error("Invalid array encountered"); + } + } + else { + std::copy(arr_strides, arr_strides + axis_start, + packed_host_shapes_strides_shp->begin() + + 2 * orthog_sh_elems); + std::copy(arr_strides + axis_start + ind_nd, arr_strides + inp_nd, + packed_host_shapes_strides_shp->begin() + + 2 * orthog_sh_elems + axis_start); + std::copy(arr_strides + axis_start, + arr_strides + axis_start + ind_nd, + packed_host_axes_shapes_strides_shp->begin() + 2 * k); + } + + // copy packed shapes and strides from host to devices + sycl::event device_orthog_shapes_strides_copy_ev = + exec_q.copy(packed_host_shapes_strides_shp->data(), + device_orthog_shapes_strides, + packed_host_shapes_strides_shp->size()); + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(device_orthog_shapes_strides_copy_ev); + cgh.host_task([packed_host_shapes_strides_shp] {}); + }); + + sycl::event device_axes_shapes_strides_copy_ev = + exec_q.copy( + packed_host_axes_shapes_strides_shp->data(), + device_axes_shapes_strides, + packed_host_axes_shapes_strides_shp->size()); + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(device_axes_shapes_strides_copy_ev); + cgh.host_task([packed_host_axes_shapes_strides_shp]() {}); + }); + + std::vector v = {device_orthog_shapes_strides_copy_ev, + device_axes_shapes_strides_copy_ev}; + return v; + } + else { + // no orthogonal dimensions + sycl::event device_orthog_shapes_strides_fill_ev = + exec_q.fill(device_orthog_shapes_strides, + py::ssize_t(0), 3); + + 
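+        // a 0-d input has no orthogonal subspace: its orthogonal
+        // shape/stride entries are zero-filled, and only the `arr` strides
+        // along the indexed axes still need to be packed below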
packed_host_axes_shapes_strides_shp->insert( + packed_host_axes_shapes_strides_shp->end(), py::ssize_t(0), 2); + if (arr_strides == nullptr) { + if (is_arr_c_contig) { + const auto &arr_contig_strides = + c_contiguous_strides(arr_nd, arr_shape); + std::copy(arr_contig_strides.begin() + axis_start, + arr_contig_strides.begin() + axis_start + ind_nd, + packed_host_axes_shapes_strides_shp->begin() + 2); + } + else if (is_arr_f_contig) { + const auto &arr_contig_strides = + f_contiguous_strides(arr_nd, arr_shape); + std::copy(arr_contig_strides.begin() + axis_start, + arr_contig_strides.begin() + axis_start + ind_nd, + packed_host_axes_shapes_strides_shp->begin() + 2); + } + else { + sycl::free(device_orthog_shapes_strides, exec_q); + throw std::runtime_error("Invalid array encountered"); + } + } + else { + std::copy(arr_strides + axis_start, + arr_strides + axis_start + ind_nd, + packed_host_axes_shapes_strides_shp->begin() + 2); + } + + sycl::event device_axes_shapes_strides_copy_ev = + exec_q.copy( + packed_host_axes_shapes_strides_shp->data(), + device_axes_shapes_strides, + packed_host_axes_shapes_strides_shp->size()); + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(device_axes_shapes_strides_copy_ev); + cgh.host_task([packed_host_axes_shapes_strides_shp]() {}); + }); + + std::vector v = {device_orthog_shapes_strides_fill_ev, + device_axes_shapes_strides_copy_ev}; + return v; + } +} + +std::pair +usm_ndarray_take(dpctl::tensor::usm_ndarray src, + std::vector ind, + dpctl::tensor::usm_ndarray dst, + int axis_start, + uint8_t mode, + sycl::queue exec_q, + const std::vector &depends = {}) +{ + int k = ind.size(); + + if (k == 0) { + // no indices to take from + return std::make_pair(sycl::event{}, sycl::event{}); + } + + if (axis_start < 0) { + throw py::value_error("Axis cannot be negative."); + } + + const dpctl::tensor::usm_ndarray ind_rep = ind[0]; + + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + int ind_nd = ind_rep.get_ndim(); + + auto sh_elems = (src_nd > 0) ? 
src_nd : 1; + + if (axis_start + k > sh_elems) { + throw py::value_error("Axes are out of range for array of dimension " + + std::to_string(src_nd)); + } + if (src_nd == 0) { + if (dst_nd != ind_nd) { + throw py::value_error( + "Destination is not of appropriate dimension for take kernel."); + } + } + else { + if (dst_nd != (src_nd - k + ind_nd)) { + throw py::value_error( + "Destination is not of appropriate dimension for take kernel."); + } + } + + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + + bool orthog_shapes_equal(true); + size_t orthog_nelems(1); + for (int i = 0; i < axis_start; ++i) { + orthog_nelems *= static_cast(src_shape[i]); + orthog_shapes_equal = + orthog_shapes_equal && (src_shape[i] == dst_shape[i]); + } + + for (int i = (axis_start + k), j = (axis_start + ind_nd); + (i < src_nd && j < dst_nd); ++i, ++j) + { + orthog_nelems *= static_cast(src_shape[i]); + orthog_shapes_equal = + orthog_shapes_equal && (src_shape[i] == dst_shape[j]); + } + + if (!orthog_shapes_equal) { + throw py::value_error( + "Axes of basic indices are not of matching shapes."); + } + + if (orthog_nelems == 0) { + return std::make_pair(sycl::event{}, sycl::event{}); + } + + char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + auto src_offsets = src.get_minmax_offsets(); + auto dst_offsets = dst.get_minmax_offsets(); + int src_elem_size = src.get_elemsize(); + int dst_elem_size = dst.get_elemsize(); + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + py::ssize_t src_offset = py::ssize_t(0); + py::ssize_t dst_offset = py::ssize_t(0); + + if (!dst.is_writable()) { + throw py::value_error("Output array is read-only."); + } + + bool memory_overlap = + ((dst_data - src_data > src_offsets.second * src_elem_size - + dst_offsets.first * dst_elem_size) && + (src_data - dst_data > dst_offsets.second * dst_elem_size - + src_offsets.first * src_elem_size)); + if (memory_overlap) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto array_types = dpctl::tensor::detail::usm_ndarray_types(); + int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::type_error("Array data types are not the same."); + } + + const py::ssize_t *ind_shape = ind_rep.get_shape_raw(); + + int ind_typenum = ind_rep.get_typenum(); + int ind_type_id = array_types.typenum_to_lookup_id(ind_typenum); + + size_t ind_nelems(1); + for (int i = 0; i < ind_nd; ++i) { + ind_nelems *= static_cast(ind_shape[i]); + + if (!(ind_shape[i] == dst_shape[axis_start + i])) { + throw py::value_error( + "Indices shape does not match shape of axis in destination."); + } + } + + auto ind_sh_elems = (ind_nd > 0) ? 
ind_nd : 1; + + char **packed_ind_ptrs = sycl::malloc_device(k, exec_q); + + if (packed_ind_ptrs == nullptr) { + throw std::runtime_error( + "Unable to allocate packed_ind_ptrs device memory"); + } + + // packed_ind_shapes_strides = [ind_shape, + // ind[0] strides, + // ..., + // ind[k] strides] + py::ssize_t *packed_ind_shapes_strides = + sycl::malloc_device((k + 1) * ind_sh_elems, exec_q); + + if (packed_ind_shapes_strides == nullptr) { + throw std::runtime_error( + "Unable to allocate packed_ind_shapes_strides device memory"); + } + + py::ssize_t *packed_ind_offsets = + sycl::malloc_device(k, exec_q); + + if (packed_ind_offsets == nullptr) { + throw std::runtime_error( + "Unable to allocate packed_ind_offsets device memory"); + } + + using usm_host_allocator_T = + sycl::usm_allocator; + using ptrT = std::vector; + + usm_host_allocator_T ptr_allocator(exec_q); + std::shared_ptr host_ind_ptrs_shp = + std::make_shared(k, ptr_allocator); + + using usm_host_allocatorT = + sycl::usm_allocator; + using shT = std::vector; + + usm_host_allocatorT ind_allocator(exec_q); + std::shared_ptr host_ind_shapes_strides_shp = + std::make_shared(ind_sh_elems * (k + 1), ind_allocator); + + // shape can be copied now (must be the same for every array) + if (ind_nd > 0) { + std::copy(ind_shape, ind_shape + ind_nd, + host_ind_shapes_strides_shp->begin()); + } + else { + // all strides are 0 for 0D array + host_ind_shapes_strides_shp->insert(host_ind_shapes_strides_shp->end(), + (k + 1), 0); + } + + std::shared_ptr host_ind_offsets_shp = + std::make_shared(k, ind_allocator); + + std::vector ind_ptrs; + ind_ptrs.reserve(k); + std::vector ind_offsets; + ind_offsets.reserve(k); + for (int i = 0; i < k; ++i) { + dpctl::tensor::usm_ndarray ind_ = ind[i]; + + if (!dpctl::utils::queues_are_compatible(exec_q, {ind_})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // ndim, type, and shape are checked against the first array + if (i > 0) { + if (!(ind_.get_ndim() == ind_nd)) { + throw py::value_error("Index dimensions are not the same"); + } + + if (!(ind_type_id == + array_types.typenum_to_lookup_id(ind_.get_typenum()))) { + throw py::type_error( + "Indices array data types are not all the same."); + } + + const py::ssize_t *ind_shape_ = ind_.get_shape_raw(); + for (int dim = 0; dim < ind_nd; ++dim) { + if (!(ind_shape[dim] == ind_shape_[dim])) { + throw py::value_error("Indices shapes are not all equal."); + } + } + } + + // check for overlap with destination + int ind_elem_size = ind_.get_elemsize(); + auto ind_mem_offsets = ind_.get_minmax_offsets(); + char *ind_data = ind_.get_data(); + bool ind_memory_overlap = + ((dst_data - ind_data > ind_mem_offsets.second * ind_elem_size - + dst_offsets.first * dst_elem_size) && + (ind_data - dst_data > dst_offsets.second * dst_elem_size - + ind_mem_offsets.first * ind_elem_size)); + + if (ind_memory_overlap) { + throw py::value_error( + "Arrays index overlapping segments of memory"); + } + + // strides are initialized to 0 for 0D indices, so skip here + if (ind_nd > 0) { + const py::ssize_t *ind_strides = ind_.get_strides_raw(); + if (ind_strides == nullptr) { + if (ind_.is_c_contiguous()) { + const auto &ind_contig_strides_ = + c_contiguous_strides(ind_nd, ind_shape); + std::copy(ind_contig_strides_.begin(), + ind_contig_strides_.end(), + host_ind_shapes_strides_shp->begin() + + (i + 1) * ind_nd); + } + else if (ind_.is_f_contiguous()) { + const auto &ind_contig_strides_ = + f_contiguous_strides(ind_nd, ind_shape); + 
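+                    // strides of the i-th index array are packed after the
+                    // shared index shape, at offset (i + 1) * ind_nd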
std::copy(ind_contig_strides_.begin(), + ind_contig_strides_.end(), + host_ind_shapes_strides_shp->begin() + + (i + 1) * ind_nd); + } + else { + throw std::runtime_error( + "Invalid ind array encountered in: take function"); + } + } + else { + std::copy(ind_strides, ind_strides + ind_nd, + host_ind_shapes_strides_shp->begin() + + (i + 1) * ind_nd); + } + } + + ind_ptrs.push_back(ind_data); + ind_offsets.push_back(py::ssize_t(0)); + } + + std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin()); + std::copy(ind_offsets.begin(), ind_offsets.end(), + host_ind_offsets_shp->begin()); + + sycl::event device_ind_ptrs_copy_ev = exec_q.copy( + host_ind_ptrs_shp->data(), packed_ind_ptrs, host_ind_ptrs_shp->size()); + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(device_ind_ptrs_copy_ev); + cgh.host_task([host_ind_ptrs_shp]() {}); + }); + + sycl::event device_ind_shapes_strides_copy_ev = exec_q.copy( + host_ind_shapes_strides_shp->data(), packed_ind_shapes_strides, + host_ind_shapes_strides_shp->size()); + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(device_ind_shapes_strides_copy_ev); + cgh.host_task([host_ind_shapes_strides_shp]() {}); + }); + + sycl::event device_ind_offsets_copy_ev = exec_q.copy( + host_ind_offsets_shp->data(), packed_ind_offsets, + host_ind_offsets_shp->size()); + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(device_ind_offsets_copy_ev); + cgh.host_task([host_ind_offsets_shp]() {}); + }); + + std::vector ind_pack_depends = { + device_ind_ptrs_copy_ev, device_ind_shapes_strides_copy_ev, + device_ind_offsets_copy_ev}; + + bool is_src_c_contig = src.is_c_contiguous(); + bool is_src_f_contig = src.is_f_contiguous(); + + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_dst_f_contig = dst.is_f_contiguous(); + + const py::ssize_t *src_strides = src.get_strides_raw(); + const py::ssize_t *dst_strides = dst.get_strides_raw(); + + // destination must be ample enough to accomodate all elements + { + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if ((range + 1) < (orthog_nelems * ind_nelems)) { + throw py::value_error( + "Destination array can not accomodate all the " + "elements of source array."); + } + } + + // packed_shapes_strides = [src_shape[:axis] + src_shape[:axis+1], + // src_strides[:axis] + src_strides[:axis+1], + // dst_strides[:axis] + dst_strides[:axis+1]] + py::ssize_t *packed_shapes_strides = + sycl::malloc_device(3 * sh_elems, exec_q); + + if (packed_shapes_strides == nullptr) { + throw std::runtime_error( + "Unable to allocate packed_shapes_strides device memory"); + } + + // packed_axes_shapes_strides = [src_shape[axis:k], + // src_strides[axis:k, + // dst_strides[axis:ind.ndim]] + py::ssize_t *packed_axes_shapes_strides = + sycl::malloc_device((2 * k) + ind_sh_elems, exec_q); + + if (packed_axes_shapes_strides == nullptr) { + throw std::runtime_error( + "Unable to allocate packed_axes_shapes_strides device memory"); + } + + std::vector src_dst_pack_deps = + _populate_packed_shapes_strides_for_indexing( + exec_q, packed_shapes_strides, packed_axes_shapes_strides, + src_shape, src_strides, is_src_c_contig, is_src_f_contig, dst_shape, + dst_strides, is_dst_c_contig, is_dst_f_contig, axis_start, k, + ind_nd, src_nd, dst_nd); + + std::vector all_deps(depends.size() + ind_pack_depends.size() + + src_dst_pack_deps.size()); + all_deps.insert(std::end(all_deps), std::begin(ind_pack_depends), + std::end(ind_pack_depends)); + all_deps.insert(std::end(all_deps), std::begin(src_dst_pack_deps), + 
std::end(src_dst_pack_deps)); + all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); + + auto fn = take_dispatch_table[mode][src_type_id][ind_type_id]; + + if (fn == nullptr) { + throw std::runtime_error("Indices must be integer type, got " + + std::to_string(ind_type_id)); + } + + int orthog_nd = ((src_nd - k) > 0) ? src_nd - k : 1; + + sycl::event take_generic_ev = + fn(exec_q, orthog_nelems, ind_nelems, orthog_nd, ind_nd, k, + packed_shapes_strides, packed_axes_shapes_strides, + packed_ind_shapes_strides, src_data, dst_data, packed_ind_ptrs, + src_offset, dst_offset, packed_ind_offsets, all_deps); + + // free packed_shapes_strides temporary + + auto ctx = exec_q.get_context(); + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(take_generic_ev); + cgh.host_task([packed_shapes_strides, ctx]() { + sycl::free(packed_shapes_strides, ctx); + }); + }); + + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(take_generic_ev); + cgh.host_task([packed_axes_shapes_strides, ctx]() { + sycl::free(packed_axes_shapes_strides, ctx); + }); + }); + + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(take_generic_ev); + cgh.host_task([packed_ind_shapes_strides, ctx]() { + sycl::free(packed_ind_shapes_strides, ctx); + }); + }); + + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(take_generic_ev); + cgh.host_task( + [packed_ind_ptrs, ctx]() { sycl::free(packed_ind_ptrs, ctx); }); + }); + + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(take_generic_ev); + cgh.host_task([packed_ind_offsets, ctx]() { + sycl::free(packed_ind_offsets, ctx); + }); + }); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {take_generic_ev}), + take_generic_ev); +} + +std::pair +usm_ndarray_put(dpctl::tensor::usm_ndarray dst, + std::vector ind, + dpctl::tensor::usm_ndarray val, + int axis_start, + uint8_t mode, + sycl::queue exec_q, + const std::vector &depends = {}) +{ + // check compatibility of execution queue and allocation queue + int k = ind.size(); + + if (k == 0) { + // no indices to write to + return std::make_pair(sycl::event{}, sycl::event{}); + } + + if (axis_start < 0) { + throw py::value_error("Axis cannot be negative."); + } + + const dpctl::tensor::usm_ndarray ind_rep = ind[0]; + + int dst_nd = dst.get_ndim(); + int val_nd = val.get_ndim(); + int ind_nd = ind_rep.get_ndim(); + + auto sh_elems = (dst_nd > 0) ? 
dst_nd : 1; + + if (axis_start + k > sh_elems) { + throw py::value_error("Axes are out of range for array of dimension " + + std::to_string(dst_nd)); + } + if (dst_nd == 0) { + if (val_nd != ind_nd) { + throw py::value_error("Destination is not of appropriate dimension " + "for put function."); + } + } + else { + if (val_nd != (dst_nd - k + ind_nd)) { + throw py::value_error("Destination is not of appropriate dimension " + "for put function."); + } + } + + size_t dst_nelems = dst.get_size(); + + const py::ssize_t *dst_shape = dst.get_shape_raw(); + const py::ssize_t *val_shape = val.get_shape_raw(); + + bool orthog_shapes_equal(true); + size_t orthog_nelems(1); + for (int i = 0; i < axis_start; ++i) { + orthog_nelems *= static_cast(dst_shape[i]); + orthog_shapes_equal = + orthog_shapes_equal && (dst_shape[i] == val_shape[i]); + } + + for (int i = (axis_start + k), j = (axis_start + ind_nd); + (i < dst_nd && j < val_nd); ++i, ++j) + { + orthog_nelems *= static_cast(dst_shape[i]); + orthog_shapes_equal = + orthog_shapes_equal && (dst_shape[i] == val_shape[j]); + } + + if (!orthog_shapes_equal) { + throw py::value_error( + "Axes of basic indices are not of matching shapes."); + } + + if (orthog_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + char *dst_data = dst.get_data(); + char *val_data = val.get_data(); + + auto dst_offsets = dst.get_minmax_offsets(); + auto val_offsets = val.get_minmax_offsets(); + int dst_elem_size = dst.get_elemsize(); + int val_elem_size = val.get_elemsize(); + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst, val})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + py::ssize_t dst_offset = py::ssize_t(0); + py::ssize_t val_offset = py::ssize_t(0); + + if (!dst.is_writable()) { + throw py::value_error("Output array is read-only."); + } + + bool memory_overlap = + ((val_data - dst_data > dst_offsets.second * dst_elem_size - + val_offsets.first * val_elem_size) && + (dst_data - val_data > val_offsets.second * val_elem_size - + dst_offsets.first * dst_elem_size)); + if (memory_overlap) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + int dst_typenum = dst.get_typenum(); + int val_typenum = val.get_typenum(); + + auto array_types = dpctl::tensor::detail::usm_ndarray_types(); + int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + int val_type_id = array_types.typenum_to_lookup_id(val_typenum); + + if (dst_type_id != val_type_id) { + throw py::type_error("Array data types are not the same."); + } + + const py::ssize_t *ind_shape = ind_rep.get_shape_raw(); + + int ind_typenum = ind_rep.get_typenum(); + int ind_type_id = array_types.typenum_to_lookup_id(ind_typenum); + + size_t ind_nelems(1); + for (int i = 0; i < ind_nd; ++i) { + ind_nelems *= static_cast(ind_shape[i]); + + if (!(ind_shape[i] == val_shape[axis_start + i])) { + throw py::value_error( + "Indices shapes does not match shape of axis in vals."); + } + } + + auto ind_sh_elems = (ind_nd > 0) ? 
ind_nd : 1; + + char **packed_ind_ptrs = sycl::malloc_device(k, exec_q); + + if (packed_ind_ptrs == nullptr) { + throw std::runtime_error( + "Unable to allocate packed_ind_ptrs device memory"); + } + + // packed_ind_shapes_strides = [ind_shape, + // ind[0] strides, + // ..., + // ind[k] strides] + py::ssize_t *packed_ind_shapes_strides = + sycl::malloc_device((k + 1) * ind_sh_elems, exec_q); + + if (packed_ind_shapes_strides == nullptr) { + throw std::runtime_error( + "Unable to allocate packed_ind_shapes_strides device memory"); + } + + py::ssize_t *packed_ind_offsets = + sycl::malloc_device(k, exec_q); + + if (packed_ind_offsets == nullptr) { + throw std::runtime_error( + "Unable to allocate packed_ind_offsets device memory"); + } + + using usm_host_allocator_T = + sycl::usm_allocator; + using ptrT = std::vector; + + usm_host_allocator_T ptr_allocator(exec_q); + std::shared_ptr host_ind_ptrs_shp = + std::make_shared(k, ptr_allocator); + + using usm_host_allocatorT = + sycl::usm_allocator; + using shT = std::vector; + + usm_host_allocatorT ind_allocator(exec_q); + std::shared_ptr host_ind_shapes_strides_shp = + std::make_shared(ind_sh_elems * (k + 1), ind_allocator); + + // shape can be copied now (must be the same for every array) + if (ind_nd > 0) { + std::copy(ind_shape, ind_shape + ind_nd, + host_ind_shapes_strides_shp->begin()); + } + else { + // all strides are 0 for 0D array + host_ind_shapes_strides_shp->insert(host_ind_shapes_strides_shp->end(), + (k + 1), 0); + } + + std::shared_ptr host_ind_offsets_shp = + std::make_shared(k, ind_allocator); + + std::vector ind_ptrs; + ind_ptrs.reserve(k); + std::vector ind_offsets; + ind_offsets.reserve(k); + for (int i = 0; i < k; ++i) { + dpctl::tensor::usm_ndarray ind_ = ind[i]; + + if (!dpctl::utils::queues_are_compatible(exec_q, {ind_})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // ndim, type, and shape are checked against the first array + if (i > 0) { + if (!(ind_.get_ndim() == ind_nd)) { + throw py::value_error("Index dimensions are not the same"); + } + + if (!(ind_type_id == + array_types.typenum_to_lookup_id(ind_.get_typenum()))) { + throw py::type_error( + "Indices array data types are not all the same."); + } + + const py::ssize_t *ind_shape_ = ind_.get_shape_raw(); + for (int dim = 0; dim < ind_nd; ++dim) { + if (!(ind_shape[dim] == ind_shape_[dim])) { + throw py::value_error("Indices shapes are not all equal."); + } + } + } + + // check for overlap with destination + int ind_elem_size = ind_.get_elemsize(); + auto ind_mem_offsets = ind_.get_minmax_offsets(); + char *ind_data = ind_.get_data(); + bool ind_memory_overlap = + ((val_data - ind_data > ind_mem_offsets.second * ind_elem_size - + val_offsets.first * val_elem_size) && + (ind_data - val_data > val_offsets.second * val_elem_size - + ind_mem_offsets.first * ind_elem_size)); + + if (ind_memory_overlap) { + throw py::value_error( + "Arrays index overlapping segments of memory"); + } + + // strides are initialized to 0 for 0D indices, so skip here + if (ind_nd > 0) { + const py::ssize_t *ind_strides = ind_.get_strides_raw(); + if (ind_strides == nullptr) { + if (ind_.is_c_contiguous()) { + const auto &ind_contig_strides_ = + c_contiguous_strides(ind_nd, ind_shape); + std::copy(ind_contig_strides_.begin(), + ind_contig_strides_.end(), + host_ind_shapes_strides_shp->begin() + + (i + 1) * ind_nd); + } + else if (ind_.is_f_contiguous()) { + const auto &ind_contig_strides_ = + f_contiguous_strides(ind_nd, ind_shape); + 
std::copy(ind_contig_strides_.begin(), + ind_contig_strides_.end(), + host_ind_shapes_strides_shp->begin() + + (i + 1) * ind_nd); + } + else { + throw std::runtime_error( + "Invalid ind array encountered in: take function"); + } + } + else { + std::copy(ind_strides, ind_strides + ind_nd, + host_ind_shapes_strides_shp->begin() + + (i + 1) * ind_nd); + } + } + + ind_ptrs.push_back(ind_data); + ind_offsets.push_back(py::ssize_t(0)); + } + + std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin()); + std::copy(ind_offsets.begin(), ind_offsets.end(), + host_ind_offsets_shp->begin()); + + sycl::event device_ind_ptrs_copy_ev = exec_q.copy( + host_ind_ptrs_shp->data(), packed_ind_ptrs, host_ind_ptrs_shp->size()); + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(device_ind_ptrs_copy_ev); + cgh.host_task([host_ind_ptrs_shp]() { + // Capturing shared pointer ensures that the underlying vector is + // not destroyed until after its data are copied into packed USM + // vector + }); + }); + + sycl::event device_ind_shapes_strides_copy_ev = exec_q.copy( + host_ind_shapes_strides_shp->data(), packed_ind_shapes_strides, + host_ind_shapes_strides_shp->size()); + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(device_ind_shapes_strides_copy_ev); + cgh.host_task([host_ind_shapes_strides_shp]() {}); + }); + + sycl::event device_ind_offsets_copy_ev = exec_q.copy( + host_ind_offsets_shp->data(), packed_ind_offsets, + host_ind_offsets_shp->size()); + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(device_ind_offsets_copy_ev); + cgh.host_task([host_ind_offsets_shp]() {}); + }); + + std::vector ind_pack_depends = { + device_ind_ptrs_copy_ev, device_ind_shapes_strides_copy_ev, + device_ind_offsets_copy_ev}; + + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_dst_f_contig = dst.is_f_contiguous(); + + bool is_val_c_contig = val.is_c_contiguous(); + bool is_val_f_contig = val.is_f_contiguous(); + + const py::ssize_t *dst_strides = dst.get_strides_raw(); + const py::ssize_t *val_strides = val.get_strides_raw(); + + // destination must be ample enough to accomodate all possible elements + { + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if ((range + 1) < dst_nelems) { + throw py::value_error( + "Destination array can not accomodate all the " + "elements of source array."); + } + } + + // packed_shapes_strides = [dst_shape[:axis] + dst_shape[:axis+1], + // dst_strides[:axis] + dst_strides[:axis+1], + // val_strides[:axis] + val_strides[:axis+1]] + py::ssize_t *packed_shapes_strides = + sycl::malloc_device(3 * sh_elems, exec_q); + + if (packed_shapes_strides == nullptr) { + throw std::runtime_error( + "Unable to allocate packed_shapes_strides device memory"); + } + + // packed_axes_shapes_strides = [dst_shape[axis:k], + // dst_strides[axis:k, + // val_strides[axis:ind.ndim]] + py::ssize_t *packed_axes_shapes_strides = + sycl::malloc_device((2 * k) + ind_sh_elems, exec_q); + + if (packed_axes_shapes_strides == nullptr) { + throw std::runtime_error( + "Unable to allocate packed_axes_shapes_strides device memory"); + } + + std::vector copy_shapes_strides_deps = + _populate_packed_shapes_strides_for_indexing( + exec_q, packed_shapes_strides, packed_axes_shapes_strides, + dst_shape, dst_strides, is_dst_c_contig, is_dst_f_contig, val_shape, + val_strides, is_val_c_contig, is_val_f_contig, axis_start, k, + ind_nd, dst_nd, val_nd); + + std::vector all_deps(depends.size() + + copy_shapes_strides_deps.size() + + ind_pack_depends.size()); + 
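The packed_shapes_strides layout assembled just above is easier to see with a tiny Python sketch (illustrative values only; just the shared orthogonal shape and the dst strides are shown, the val strides follow the same pattern):

    # dst of shape (5, 7, 3), one indexed axis (k = 1) starting at axis_start = 1
    dst_shape = (5, 7, 3)
    dst_strides = (21, 3, 1)          # C-contiguous, in elements
    axis_start, k = 1, 1

    orthog = [i for i in range(len(dst_shape))
              if not (axis_start <= i < axis_start + k)]   # axes 0 and 2
    packed = ([dst_shape[i] for i in orthog]        # shared orthogonal shape
              + [dst_strides[i] for i in orthog])   # dst strides over those axes
    assert packed == [5, 3, 21, 1]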
all_deps.insert(std::end(all_deps), std::begin(copy_shapes_strides_deps), + std::end(copy_shapes_strides_deps)); + all_deps.insert(std::end(all_deps), std::begin(ind_pack_depends), + std::end(ind_pack_depends)); + all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); + + auto fn = put_dispatch_table[mode][dst_type_id][ind_type_id]; + + if (fn == nullptr) { + throw std::runtime_error("Indices must be integer type, got " + + std::to_string(ind_type_id)); + } + + int orthog_nd = ((dst_nd - k) > 0) ? dst_nd - k : 1; + + sycl::event put_generic_ev = + fn(exec_q, orthog_nelems, ind_nelems, orthog_nd, ind_nd, k, + packed_shapes_strides, packed_axes_shapes_strides, + packed_ind_shapes_strides, dst_data, val_data, packed_ind_ptrs, + dst_offset, val_offset, packed_ind_offsets, all_deps); + + // free packed_shapes_strides temporary + + auto ctx = exec_q.get_context(); + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(put_generic_ev); + cgh.host_task([packed_shapes_strides, ctx]() { + sycl::free(packed_shapes_strides, ctx); + }); + }); + + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(put_generic_ev); + cgh.host_task([packed_axes_shapes_strides, ctx]() { + sycl::free(packed_axes_shapes_strides, ctx); + }); + }); + + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(put_generic_ev); + cgh.host_task([packed_ind_shapes_strides, ctx]() { + sycl::free(packed_ind_shapes_strides, ctx); + }); + }); + + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(put_generic_ev); + cgh.host_task( + [packed_ind_ptrs, ctx]() { sycl::free(packed_ind_ptrs, ctx); }); + }); + + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(put_generic_ev); + cgh.host_task([packed_ind_offsets, ctx]() { + sycl::free(packed_ind_offsets, ctx); + }); + }); + + return std::make_pair(keep_args_alive(exec_q, {dst, val}, {put_generic_ev}), + put_generic_ev); +} + +void init_advanced_indexing_dispatch_tables(void) +{ + using namespace dpctl::tensor::detail; + + using dpctl::tensor::kernels::indexing::TakeClipFactory; + DispatchTableBuilder + dtb_takeclip; + dtb_takeclip.populate_dispatch_table(take_dispatch_table[CLIP_MODE]); + + using dpctl::tensor::kernels::indexing::TakeWrapFactory; + DispatchTableBuilder + dtb_takewrap; + dtb_takewrap.populate_dispatch_table(take_dispatch_table[WRAP_MODE]); + + using dpctl::tensor::kernels::indexing::PutClipFactory; + DispatchTableBuilder dtb_putclip; + dtb_putclip.populate_dispatch_table(put_dispatch_table[CLIP_MODE]); + + using dpctl::tensor::kernels::indexing::PutWrapFactory; + DispatchTableBuilder dtb_putwrap; + dtb_putwrap.populate_dispatch_table(put_dispatch_table[WRAP_MODE]); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/advanced_indexing.hpp b/dpctl/tensor/libtensor/source/advanced_indexing.hpp new file mode 100644 index 0000000000..d99d4f1828 --- /dev/null +++ b/dpctl/tensor/libtensor/source/advanced_indexing.hpp @@ -0,0 +1,62 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair +usm_ndarray_take(dpctl::tensor::usm_ndarray src, + std::vector ind, + dpctl::tensor::usm_ndarray dst, + int axis_start, + uint8_t mode, + sycl::queue exec_q, + const std::vector &depends = {}); + +extern std::pair +usm_ndarray_put(dpctl::tensor::usm_ndarray dst, + std::vector ind, + dpctl::tensor::usm_ndarray val, + int axis_start, + uint8_t mode, + sycl::queue exec_q, + const std::vector &depends = {}); + +extern void init_advanced_indexing_dispatch_tables(); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp index aa8634ecf4..94458bccf9 100644 --- a/dpctl/tensor/libtensor/source/tensor_py.cpp +++ b/dpctl/tensor/libtensor/source/tensor_py.cpp @@ -33,6 +33,7 @@ #include "dpctl4pybind11.hpp" +#include "advanced_indexing.hpp" #include "copy_and_cast_usm_to_usm.hpp" #include "copy_for_reshape.hpp" #include "copy_numpy_ndarray_into_usm_ndarray.hpp" @@ -70,6 +71,10 @@ using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step; using dpctl::tensor::py_internal::usm_ndarray_full; +/* ============== Advanced Indexing ============= */ +using dpctl::tensor::py_internal::usm_ndarray_put; +using dpctl::tensor::py_internal::usm_ndarray_take; + /* ================ Eye ================== */ using dpctl::tensor::py_internal::usm_ndarray_eye; @@ -85,6 +90,7 @@ void init_dispatch_tables(void) init_copy_and_cast_usm_to_usm_dispatch_tables(); init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(); + init_advanced_indexing_dispatch_tables(); return; } @@ -179,6 +185,24 @@ PYBIND11_MODULE(_tensor_impl, m) py::arg("fill_value"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_take", &usm_ndarray_take, + "Takes elements at usm_ndarray indices `ind` and axes starting " + "at axis `axis_start` from array `src` and copies them " + "into usm_ndarray `dst` synchronously." + "Returns a tuple of events: (hev, ev)", + py::arg("src"), py::arg("ind"), py::arg("dst"), py::arg("axis_start"), + py::arg("mode"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + m.def("_put", &usm_ndarray_put, + "Puts elements at usm_ndarray indices `ind` and axes starting " + "at axis `axis_start` into array `dst` from " + "usm_ndarray `val` synchronously." 
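For orientation, here is a minimal Python sketch (not part of the patch) of how these bindings are meant to be driven. Keyword names follow the py::arg declarations; the shapes, dtypes and the mode value are illustrative assumptions, and the dpt.take/dpt.put wrappers exercised by the tests later in this series are the intended user-facing entry points:

    import dpctl
    import dpctl.tensor as dpt
    import dpctl.tensor._tensor_impl as ti

    q = dpctl.SyclQueue()
    x = dpt.reshape(dpt.arange(12, dtype="i4", sycl_queue=q), (3, 4))
    ind = dpt.asarray([0, 2], dtype="i8", sycl_queue=q)
    # dst must be preallocated: the indexed axis 0 is replaced by ind.shape
    y = dpt.empty((2, 4), dtype="i4", sycl_queue=q)

    # mode is 0 or 1 and selects clip/wrap handling of out-of-range indices;
    # the mapping to CLIP_MODE/WRAP_MODE is defined elsewhere in the module
    hev, ev = ti._take(src=x, ind=[ind], dst=y, axis_start=0, mode=0, sycl_queue=q)
    hev.wait()   # host-task event keeping the arguments alive until the kernel is done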
+ "Returns a tuple of events: (hev, ev)", + py::arg("dst"), py::arg("ind"), py::arg("val"), py::arg("axis_start"), + py::arg("mode"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_eye", &usm_ndarray_eye, "Fills input 2D contiguous usm_ndarray `dst` with " "zeros outside of the diagonal " From 6239eb7d631713051211c4870883e9ff219d965d Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Fri, 24 Feb 2023 09:52:36 -0800 Subject: [PATCH 15/57] Changes to advanced indexing - Clipping now clips indices to -n <= i < n for n = axis size - Fixed a segfault caused by a typo when copying strides --- dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp | 4 +++- dpctl/tensor/libtensor/source/advanced_indexing.cpp | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp index 77234296ff..8ccad8db28 100644 --- a/dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp @@ -54,7 +54,9 @@ template class ClipIndex { max_item = (max_item > 0) ? max_item : 1; py::ssize_t clip_ind = static_cast(ind); - ind = (ind < 0) ? 0 : (clip_ind >= max_item) ? (max_item - 1) : ind; + ind = (ind < 0) ? (clip_ind <= -max_item) ? (0) : (clip_ind + max_item) + : (clip_ind >= max_item) ? (max_item - 1) + : ind; return; } }; diff --git a/dpctl/tensor/libtensor/source/advanced_indexing.cpp b/dpctl/tensor/libtensor/source/advanced_indexing.cpp index 5f043db7bc..3dc6f47904 100644 --- a/dpctl/tensor/libtensor/source/advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/advanced_indexing.cpp @@ -197,7 +197,7 @@ std::vector _populate_packed_shapes_strides_for_indexing( std::copy(arr_strides, arr_strides + axis_start, packed_host_shapes_strides_shp->begin() + 2 * orthog_sh_elems); - std::copy(arr_strides + axis_start + ind_nd, arr_strides + inp_nd, + std::copy(arr_strides + axis_start + ind_nd, arr_strides + arr_nd, packed_host_shapes_strides_shp->begin() + 2 * orthog_sh_elems + axis_start); std::copy(arr_strides + axis_start, From a0895be431e0827263955ea3e24b724eec4145c0 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Fri, 24 Feb 2023 12:33:57 -0800 Subject: [PATCH 16/57] Changes to advanced_indexing.cpp - Moved indices validation to avoid memory leaks - Refactored for loop over orthogonal elements of shapes - Direct initialization of sycl::event vectors --- .../libtensor/source/advanced_indexing.cpp | 434 ++++++++---------- 1 file changed, 183 insertions(+), 251 deletions(-) diff --git a/dpctl/tensor/libtensor/source/advanced_indexing.cpp b/dpctl/tensor/libtensor/source/advanced_indexing.cpp index 3dc6f47904..fed5d543ed 100644 --- a/dpctl/tensor/libtensor/source/advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/advanced_indexing.cpp @@ -85,7 +85,7 @@ std::vector _populate_packed_shapes_strides_for_indexing( int arr_nd) { - int orthog_sh_elems = (inp_nd > 1) ? inp_nd - k : 1; + int orthog_sh_elems = ((inp_nd - k) > 1) ? (inp_nd - k) : 1; int along_sh_elems = (ind_nd > 1) ? 
ind_nd : 1; using usm_host_allocatorT = @@ -291,14 +291,17 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, int k = ind.size(); if (k == 0) { - // no indices to take from - return std::make_pair(sycl::event{}, sycl::event{}); + throw py::value_error("List of indices is empty."); } if (axis_start < 0) { throw py::value_error("Axis cannot be negative."); } + if (mode != 0 && mode != 1) { + throw py::value_error("Mode must be 0 or 1."); + } + const dpctl::tensor::usm_ndarray ind_rep = ind[0]; int src_nd = src.get_ndim(); @@ -327,20 +330,17 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, const py::ssize_t *src_shape = src.get_shape_raw(); const py::ssize_t *dst_shape = dst.get_shape_raw(); + int orthog_nd = ((src_nd - k) > 0) ? src_nd - k : 1; + bool orthog_shapes_equal(true); size_t orthog_nelems(1); - for (int i = 0; i < axis_start; ++i) { - orthog_nelems *= static_cast(src_shape[i]); - orthog_shapes_equal = - orthog_shapes_equal && (src_shape[i] == dst_shape[i]); - } + for (int i = 0; i < (src_nd - k); ++i) { + auto idx1 = (i < axis_start) ? i : i + k; + auto idx2 = (i < axis_start) ? i : i + ind_nd; - for (int i = (axis_start + k), j = (axis_start + ind_nd); - (i < src_nd && j < dst_nd); ++i, ++j) - { - orthog_nelems *= static_cast(src_shape[i]); + orthog_nelems *= static_cast(src_shape[idx1]); orthog_shapes_equal = - orthog_shapes_equal && (src_shape[i] == dst_shape[j]); + orthog_shapes_equal && (src_shape[idx1] == dst_shape[idx2]); } if (!orthog_shapes_equal) { @@ -355,29 +355,26 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, char *src_data = src.get_data(); char *dst_data = dst.get_data(); + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + auto src_offsets = src.get_minmax_offsets(); auto dst_offsets = dst.get_minmax_offsets(); int src_elem_size = src.get_elemsize(); int dst_elem_size = dst.get_elemsize(); - if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { - throw py::value_error( - "Execution queue is not compatible with allocation queues"); - } py::ssize_t src_offset = py::ssize_t(0); py::ssize_t dst_offset = py::ssize_t(0); - if (!dst.is_writable()) { - throw py::value_error("Output array is read-only."); - } - bool memory_overlap = ((dst_data - src_data > src_offsets.second * src_elem_size - dst_offsets.first * dst_elem_size) && (src_data - dst_data > dst_offsets.second * dst_elem_size - src_offsets.first * src_elem_size)); if (memory_overlap) { - throw py::value_error("Arrays index overlapping segments of memory"); + throw py::value_error("Array memory overlap."); } int src_typenum = src.get_typenum(); @@ -408,67 +405,16 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, auto ind_sh_elems = (ind_nd > 0) ? 
ind_nd : 1; - char **packed_ind_ptrs = sycl::malloc_device(k, exec_q); - - if (packed_ind_ptrs == nullptr) { - throw std::runtime_error( - "Unable to allocate packed_ind_ptrs device memory"); - } - - // packed_ind_shapes_strides = [ind_shape, - // ind[0] strides, - // ..., - // ind[k] strides] - py::ssize_t *packed_ind_shapes_strides = - sycl::malloc_device((k + 1) * ind_sh_elems, exec_q); - - if (packed_ind_shapes_strides == nullptr) { - throw std::runtime_error( - "Unable to allocate packed_ind_shapes_strides device memory"); - } - - py::ssize_t *packed_ind_offsets = - sycl::malloc_device(k, exec_q); - - if (packed_ind_offsets == nullptr) { - throw std::runtime_error( - "Unable to allocate packed_ind_offsets device memory"); - } - - using usm_host_allocator_T = - sycl::usm_allocator; - using ptrT = std::vector; - - usm_host_allocator_T ptr_allocator(exec_q); - std::shared_ptr host_ind_ptrs_shp = - std::make_shared(k, ptr_allocator); - - using usm_host_allocatorT = - sycl::usm_allocator; - using shT = std::vector; - - usm_host_allocatorT ind_allocator(exec_q); - std::shared_ptr host_ind_shapes_strides_shp = - std::make_shared(ind_sh_elems * (k + 1), ind_allocator); - - // shape can be copied now (must be the same for every array) - if (ind_nd > 0) { - std::copy(ind_shape, ind_shape + ind_nd, - host_ind_shapes_strides_shp->begin()); - } - else { - // all strides are 0 for 0D array - host_ind_shapes_strides_shp->insert(host_ind_shapes_strides_shp->end(), - (k + 1), 0); - } - - std::shared_ptr host_ind_offsets_shp = - std::make_shared(k, ind_allocator); - std::vector ind_ptrs; ind_ptrs.reserve(k); + std::vector ind_offsets; ind_offsets.reserve(k); + + std::vector ind_sh_sts((k + 1) * ind_sh_elems, py::ssize_t(0)); + if (ind_nd > 0) { + std::copy(ind_shape, ind_shape + ind_sh_elems, ind_sh_sts.begin()); + } for (int i = 0; i < k; ++i) { dpctl::tensor::usm_ndarray ind_ = ind[i]; @@ -521,16 +467,14 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, c_contiguous_strides(ind_nd, ind_shape); std::copy(ind_contig_strides_.begin(), ind_contig_strides_.end(), - host_ind_shapes_strides_shp->begin() + - (i + 1) * ind_nd); + ind_sh_sts.begin() + (i + 1) * ind_nd); } else if (ind_.is_f_contiguous()) { const auto &ind_contig_strides_ = f_contiguous_strides(ind_nd, ind_shape); std::copy(ind_contig_strides_.begin(), ind_contig_strides_.end(), - host_ind_shapes_strides_shp->begin() + - (i + 1) * ind_nd); + ind_sh_sts.begin() + (i + 1) * ind_nd); } else { throw std::runtime_error( @@ -539,8 +483,7 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, } else { std::copy(ind_strides, ind_strides + ind_nd, - host_ind_shapes_strides_shp->begin() + - (i + 1) * ind_nd); + ind_sh_sts.begin() + (i + 1) * ind_nd); } } @@ -548,36 +491,85 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, ind_offsets.push_back(py::ssize_t(0)); } + char **packed_ind_ptrs = sycl::malloc_device(k, exec_q); + + if (packed_ind_ptrs == nullptr) { + throw std::runtime_error( + "Unable to allocate packed_ind_ptrs device memory"); + } + + // rearrange to past where indices shapes are checked + // packed_ind_shapes_strides = [ind_shape, + // ind[0] strides, + // ..., + // ind[k] strides] + py::ssize_t *packed_ind_shapes_strides = + sycl::malloc_device((k + 1) * ind_sh_elems, exec_q); + + if (packed_ind_shapes_strides == nullptr) { + throw std::runtime_error( + "Unable to allocate packed_ind_shapes_strides device memory"); + } + + py::ssize_t *packed_ind_offsets = + sycl::malloc_device(k, exec_q); + + if (packed_ind_offsets == nullptr) { + throw 
std::runtime_error( + "Unable to allocate packed_ind_offsets device memory"); + } + + using usm_host_allocator_T = + sycl::usm_allocator; + using ptrT = std::vector; + + usm_host_allocator_T ptr_allocator(exec_q); + std::shared_ptr host_ind_ptrs_shp = + std::make_shared(k, ptr_allocator); + + using usm_host_allocatorT = + sycl::usm_allocator; + using shT = std::vector; + + usm_host_allocatorT ind_allocator(exec_q); + std::shared_ptr host_ind_shapes_strides_shp = + std::make_shared(ind_sh_elems * (k + 1), ind_allocator); + + std::shared_ptr host_ind_offsets_shp = + std::make_shared(k, ind_allocator); + + std::copy(ind_sh_sts.begin(), ind_sh_sts.end(), + host_ind_shapes_strides_shp->begin()); std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin()); std::copy(ind_offsets.begin(), ind_offsets.end(), host_ind_offsets_shp->begin()); - sycl::event device_ind_ptrs_copy_ev = exec_q.copy( + sycl::event packed_ind_ptrs_copy_ev = exec_q.copy( host_ind_ptrs_shp->data(), packed_ind_ptrs, host_ind_ptrs_shp->size()); exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(device_ind_ptrs_copy_ev); + cgh.depends_on(packed_ind_ptrs_copy_ev); cgh.host_task([host_ind_ptrs_shp]() {}); }); - sycl::event device_ind_shapes_strides_copy_ev = exec_q.copy( + sycl::event packed_ind_shapes_strides_copy_ev = exec_q.copy( host_ind_shapes_strides_shp->data(), packed_ind_shapes_strides, host_ind_shapes_strides_shp->size()); exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(device_ind_shapes_strides_copy_ev); + cgh.depends_on(packed_ind_shapes_strides_copy_ev); cgh.host_task([host_ind_shapes_strides_shp]() {}); }); - sycl::event device_ind_offsets_copy_ev = exec_q.copy( + sycl::event packed_ind_offsets_copy_ev = exec_q.copy( host_ind_offsets_shp->data(), packed_ind_offsets, host_ind_offsets_shp->size()); exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(device_ind_offsets_copy_ev); + cgh.depends_on(packed_ind_offsets_copy_ev); cgh.host_task([host_ind_offsets_shp]() {}); }); - std::vector ind_pack_depends = { - device_ind_ptrs_copy_ev, device_ind_shapes_strides_copy_ev, - device_ind_offsets_copy_ev}; + std::vector ind_pack_depends{packed_ind_ptrs_copy_ev, + packed_ind_shapes_strides_copy_ev, + packed_ind_offsets_copy_ev}; bool is_src_c_contig = src.is_c_contiguous(); bool is_src_f_contig = src.is_f_contiguous(); @@ -588,20 +580,20 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, const py::ssize_t *src_strides = src.get_strides_raw(); const py::ssize_t *dst_strides = dst.get_strides_raw(); - // destination must be ample enough to accomodate all elements + // destination must be ample enough to accommodate all elements { size_t range = static_cast(dst_offsets.second - dst_offsets.first); if ((range + 1) < (orthog_nelems * ind_nelems)) { throw py::value_error( - "Destination array can not accomodate all the " + "Destination array can not accommodate all the " "elements of source array."); } } - // packed_shapes_strides = [src_shape[:axis] + src_shape[:axis+1], - // src_strides[:axis] + src_strides[:axis+1], - // dst_strides[:axis] + dst_strides[:axis+1]] + // packed_shapes_strides = [src_shape[:axis] + src_shape[axis+k:], + // src_strides[:axis] + src_strides[axis+k:], + // dst_strides[:axis] + dst_strides[axis+k:]] py::ssize_t *packed_shapes_strides = sycl::malloc_device(3 * sh_elems, exec_q); @@ -610,8 +602,8 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, "Unable to allocate packed_shapes_strides device memory"); } - // packed_axes_shapes_strides = [src_shape[axis:k], - // 
src_strides[axis:k, + // packed_axes_shapes_strides = [src_shape[axis:axis+k], + // src_strides[axis:axis+k, // dst_strides[axis:ind.ndim]] py::ssize_t *packed_axes_shapes_strides = sycl::malloc_device((2 * k) + ind_sh_elems, exec_q); @@ -643,47 +635,23 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, std::to_string(ind_type_id)); } - int orthog_nd = ((src_nd - k) > 0) ? src_nd - k : 1; - sycl::event take_generic_ev = fn(exec_q, orthog_nelems, ind_nelems, orthog_nd, ind_nd, k, packed_shapes_strides, packed_axes_shapes_strides, packed_ind_shapes_strides, src_data, dst_data, packed_ind_ptrs, src_offset, dst_offset, packed_ind_offsets, all_deps); - // free packed_shapes_strides temporary - + // free packed temporaries auto ctx = exec_q.get_context(); exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(take_generic_ev); - cgh.host_task([packed_shapes_strides, ctx]() { + cgh.host_task([packed_shapes_strides, packed_axes_shapes_strides, + packed_ind_shapes_strides, packed_ind_ptrs, + packed_ind_offsets, ctx]() { sycl::free(packed_shapes_strides, ctx); - }); - }); - - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(take_generic_ev); - cgh.host_task([packed_axes_shapes_strides, ctx]() { sycl::free(packed_axes_shapes_strides, ctx); - }); - }); - - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(take_generic_ev); - cgh.host_task([packed_ind_shapes_strides, ctx]() { sycl::free(packed_ind_shapes_strides, ctx); - }); - }); - - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(take_generic_ev); - cgh.host_task( - [packed_ind_ptrs, ctx]() { sycl::free(packed_ind_ptrs, ctx); }); - }); - - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(take_generic_ev); - cgh.host_task([packed_ind_offsets, ctx]() { + sycl::free(packed_ind_ptrs, ctx); sycl::free(packed_ind_offsets, ctx); }); }); @@ -702,18 +670,25 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, sycl::queue exec_q, const std::vector &depends = {}) { - // check compatibility of execution queue and allocation queue int k = ind.size(); if (k == 0) { // no indices to write to - return std::make_pair(sycl::event{}, sycl::event{}); + throw py::value_error("List of indices is empty."); } if (axis_start < 0) { throw py::value_error("Axis cannot be negative."); } + if (mode != 0 && mode != 1) { + throw py::value_error("Mode must be 0 or 1."); + } + + if (!dst.is_writable()) { + throw py::value_error("Output array is read-only."); + } + const dpctl::tensor::usm_ndarray ind_rep = ind[0]; int dst_nd = dst.get_ndim(); @@ -744,20 +719,17 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, const py::ssize_t *dst_shape = dst.get_shape_raw(); const py::ssize_t *val_shape = val.get_shape_raw(); + int orthog_nd = ((dst_nd - k) > 0) ? dst_nd - k : 1; + bool orthog_shapes_equal(true); size_t orthog_nelems(1); - for (int i = 0; i < axis_start; ++i) { - orthog_nelems *= static_cast(dst_shape[i]); - orthog_shapes_equal = - orthog_shapes_equal && (dst_shape[i] == val_shape[i]); - } + for (int i = 0; i < (dst_nd - k); ++i) { + auto idx1 = (i < axis_start) ? i : i + k; + auto idx2 = (i < axis_start) ? 
i : i + ind_nd; - for (int i = (axis_start + k), j = (axis_start + ind_nd); - (i < dst_nd && j < val_nd); ++i, ++j) - { - orthog_nelems *= static_cast(dst_shape[i]); + orthog_nelems *= static_cast(dst_shape[idx1]); orthog_shapes_equal = - orthog_shapes_equal && (dst_shape[i] == val_shape[j]); + orthog_shapes_equal && (dst_shape[idx1] == val_shape[idx2]); } if (!orthog_shapes_equal) { @@ -784,10 +756,6 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, py::ssize_t dst_offset = py::ssize_t(0); py::ssize_t val_offset = py::ssize_t(0); - if (!dst.is_writable()) { - throw py::value_error("Output array is read-only."); - } - bool memory_overlap = ((val_data - dst_data > dst_offsets.second * dst_elem_size - val_offsets.first * val_elem_size) && @@ -825,67 +793,14 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, auto ind_sh_elems = (ind_nd > 0) ? ind_nd : 1; - char **packed_ind_ptrs = sycl::malloc_device(k, exec_q); - - if (packed_ind_ptrs == nullptr) { - throw std::runtime_error( - "Unable to allocate packed_ind_ptrs device memory"); - } - - // packed_ind_shapes_strides = [ind_shape, - // ind[0] strides, - // ..., - // ind[k] strides] - py::ssize_t *packed_ind_shapes_strides = - sycl::malloc_device((k + 1) * ind_sh_elems, exec_q); - - if (packed_ind_shapes_strides == nullptr) { - throw std::runtime_error( - "Unable to allocate packed_ind_shapes_strides device memory"); - } - - py::ssize_t *packed_ind_offsets = - sycl::malloc_device(k, exec_q); - - if (packed_ind_offsets == nullptr) { - throw std::runtime_error( - "Unable to allocate packed_ind_offsets device memory"); - } - - using usm_host_allocator_T = - sycl::usm_allocator; - using ptrT = std::vector; - - usm_host_allocator_T ptr_allocator(exec_q); - std::shared_ptr host_ind_ptrs_shp = - std::make_shared(k, ptr_allocator); - - using usm_host_allocatorT = - sycl::usm_allocator; - using shT = std::vector; - - usm_host_allocatorT ind_allocator(exec_q); - std::shared_ptr host_ind_shapes_strides_shp = - std::make_shared(ind_sh_elems * (k + 1), ind_allocator); - - // shape can be copied now (must be the same for every array) - if (ind_nd > 0) { - std::copy(ind_shape, ind_shape + ind_nd, - host_ind_shapes_strides_shp->begin()); - } - else { - // all strides are 0 for 0D array - host_ind_shapes_strides_shp->insert(host_ind_shapes_strides_shp->end(), - (k + 1), 0); - } - - std::shared_ptr host_ind_offsets_shp = - std::make_shared(k, ind_allocator); - std::vector ind_ptrs; ind_ptrs.reserve(k); std::vector ind_offsets; ind_offsets.reserve(k); + std::vector ind_sh_sts((k + 1) * ind_sh_elems, py::ssize_t(0)); + if (ind_nd > 0) { + std::copy(ind_shape, ind_shape + ind_sh_elems, ind_sh_sts.begin()); + } for (int i = 0; i < k; ++i) { dpctl::tensor::usm_ndarray ind_ = ind[i]; @@ -938,16 +853,14 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, c_contiguous_strides(ind_nd, ind_shape); std::copy(ind_contig_strides_.begin(), ind_contig_strides_.end(), - host_ind_shapes_strides_shp->begin() + - (i + 1) * ind_nd); + ind_sh_sts.begin() + (i + 1) * ind_nd); } else if (ind_.is_f_contiguous()) { const auto &ind_contig_strides_ = f_contiguous_strides(ind_nd, ind_shape); std::copy(ind_contig_strides_.begin(), ind_contig_strides_.end(), - host_ind_shapes_strides_shp->begin() + - (i + 1) * ind_nd); + ind_sh_sts.begin() + (i + 1) * ind_nd); } else { throw std::runtime_error( @@ -956,8 +869,7 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, } else { std::copy(ind_strides, ind_strides + ind_nd, - host_ind_shapes_strides_shp->begin() + - (i + 1) * ind_nd); + 
ind_sh_sts.begin() + (i + 1) * ind_nd); } } @@ -965,6 +877,54 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, ind_offsets.push_back(py::ssize_t(0)); } + char **packed_ind_ptrs = sycl::malloc_device(k, exec_q); + + if (packed_ind_ptrs == nullptr) { + throw std::runtime_error( + "Unable to allocate packed_ind_ptrs device memory"); + } + + // packed_ind_shapes_strides = [ind_shape, + // ind[0] strides, + // ..., + // ind[k] strides] + py::ssize_t *packed_ind_shapes_strides = + sycl::malloc_device((k + 1) * ind_sh_elems, exec_q); + + if (packed_ind_shapes_strides == nullptr) { + throw std::runtime_error( + "Unable to allocate packed_ind_shapes_strides device memory"); + } + + py::ssize_t *packed_ind_offsets = + sycl::malloc_device(k, exec_q); + + if (packed_ind_offsets == nullptr) { + throw std::runtime_error( + "Unable to allocate packed_ind_offsets device memory"); + } + + using usm_host_allocator_T = + sycl::usm_allocator; + using ptrT = std::vector; + + usm_host_allocator_T ptr_allocator(exec_q); + std::shared_ptr host_ind_ptrs_shp = + std::make_shared(k, ptr_allocator); + + using usm_host_allocatorT = + sycl::usm_allocator; + using shT = std::vector; + + usm_host_allocatorT ind_allocator(exec_q); + std::shared_ptr host_ind_shapes_strides_shp = + std::make_shared(ind_sh_elems * (k + 1), ind_allocator); + + std::shared_ptr host_ind_offsets_shp = + std::make_shared(k, ind_allocator); + + std::copy(ind_sh_sts.begin(), ind_sh_sts.end(), + host_ind_shapes_strides_shp->begin()); std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin()); std::copy(ind_offsets.begin(), ind_offsets.end(), host_ind_offsets_shp->begin()); @@ -973,11 +933,7 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, host_ind_ptrs_shp->data(), packed_ind_ptrs, host_ind_ptrs_shp->size()); exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(device_ind_ptrs_copy_ev); - cgh.host_task([host_ind_ptrs_shp]() { - // Capturing shared pointer ensures that the underlying vector is - // not destroyed until after its data are copied into packed USM - // vector - }); + cgh.host_task([host_ind_ptrs_shp]() {}); }); sycl::event device_ind_shapes_strides_copy_ev = exec_q.copy( @@ -996,9 +952,9 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, cgh.host_task([host_ind_offsets_shp]() {}); }); - std::vector ind_pack_depends = { - device_ind_ptrs_copy_ev, device_ind_shapes_strides_copy_ev, - device_ind_offsets_copy_ev}; + std::vector ind_pack_depends{device_ind_ptrs_copy_ev, + device_ind_shapes_strides_copy_ev, + device_ind_offsets_copy_ev}; bool is_dst_c_contig = dst.is_c_contiguous(); bool is_dst_f_contig = dst.is_f_contiguous(); @@ -1009,20 +965,20 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, const py::ssize_t *dst_strides = dst.get_strides_raw(); const py::ssize_t *val_strides = val.get_strides_raw(); - // destination must be ample enough to accomodate all possible elements + // destination must be ample enough to accommodate all possible elements { size_t range = static_cast(dst_offsets.second - dst_offsets.first); if ((range + 1) < dst_nelems) { throw py::value_error( - "Destination array can not accomodate all the " + "Destination array can not accommodate all the " "elements of source array."); } } - // packed_shapes_strides = [dst_shape[:axis] + dst_shape[:axis+1], - // dst_strides[:axis] + dst_strides[:axis+1], - // val_strides[:axis] + val_strides[:axis+1]] + // packed_shapes_strides = [dst_shape[:axis] + dst_shape[axis+k:], + // dst_strides[:axis] + dst_strides[axis+k:], + // val_strides[:axis] + 
val_strides[axis+k:]] py::ssize_t *packed_shapes_strides = sycl::malloc_device(3 * sh_elems, exec_q); @@ -1065,47 +1021,23 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, std::to_string(ind_type_id)); } - int orthog_nd = ((dst_nd - k) > 0) ? dst_nd - k : 1; - sycl::event put_generic_ev = fn(exec_q, orthog_nelems, ind_nelems, orthog_nd, ind_nd, k, packed_shapes_strides, packed_axes_shapes_strides, packed_ind_shapes_strides, dst_data, val_data, packed_ind_ptrs, dst_offset, val_offset, packed_ind_offsets, all_deps); - // free packed_shapes_strides temporary - + // free packed temporaries auto ctx = exec_q.get_context(); exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(put_generic_ev); - cgh.host_task([packed_shapes_strides, ctx]() { + cgh.host_task([packed_shapes_strides, packed_axes_shapes_strides, + packed_ind_shapes_strides, packed_ind_ptrs, + packed_ind_offsets, ctx]() { sycl::free(packed_shapes_strides, ctx); - }); - }); - - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(put_generic_ev); - cgh.host_task([packed_axes_shapes_strides, ctx]() { sycl::free(packed_axes_shapes_strides, ctx); - }); - }); - - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(put_generic_ev); - cgh.host_task([packed_ind_shapes_strides, ctx]() { sycl::free(packed_ind_shapes_strides, ctx); - }); - }); - - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(put_generic_ev); - cgh.host_task( - [packed_ind_ptrs, ctx]() { sycl::free(packed_ind_ptrs, ctx); }); - }); - - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(put_generic_ev); - cgh.host_task([packed_ind_offsets, ctx]() { + sycl::free(packed_ind_ptrs, ctx); sycl::free(packed_ind_offsets, ctx); }); }); From 728b8e69f29d9fbd0ed339a7f31d59c250dd1f94 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Sat, 25 Feb 2023 13:24:30 -0800 Subject: [PATCH 17/57] Fixed missing cast for indices clip/wrap --- .../include/kernels/advanced_indexing.hpp | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp index 8ccad8db28..1e205c658c 100644 --- a/dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp @@ -45,34 +45,32 @@ namespace py = pybind11; template class take_kernel; template class put_kernel; -template class ClipIndex +class ClipIndex { public: ClipIndex() = default; - void operator()(py::ssize_t max_item, indT &ind) const + void operator()(py::ssize_t max_item, py::ssize_t &ind) const { max_item = (max_item > 0) ? max_item : 1; - py::ssize_t clip_ind = static_cast(ind); - ind = (ind < 0) ? (clip_ind <= -max_item) ? (0) : (clip_ind + max_item) - : (clip_ind >= max_item) ? (max_item - 1) - : ind; + ind = (ind < 0) ? (ind <= -max_item) ? (0) : (ind + max_item) + : (ind >= max_item) ? (max_item - 1) + : ind; return; } }; -template class WrapIndex +class WrapIndex { public: WrapIndex() = default; - void operator()(py::ssize_t max_item, indT &ind) const + void operator()(py::ssize_t max_item, py::ssize_t &ind) const { max_item = (max_item > 0) ? max_item : 1; - py::ssize_t wrap_ind = static_cast(ind); - ind = (ind < 0) ? max_item - (-wrap_ind % max_item) - : (wrap_ind >= max_item) ? wrap_ind % max_item - : ind; + ind = (ind < 0) ? max_item - (-ind % max_item) + : (ind >= max_item) ? 
ind % max_item + : ind; return; } }; @@ -146,7 +144,8 @@ template class TakeFunctor ind_shape_and_strides_ + ((axis_idx + 1) * ind_nd_), ind_arr_idx); indT *ind_data = reinterpret_cast(ind_[axis_idx]); - indT i = ind_data[ind_arr_idx + ind_offsets_[axis_idx]]; + py::ssize_t i = static_cast( + ind_data[ind_arr_idx + ind_offsets_[axis_idx]]); proj(axes_shape_and_strides_[axis_idx], i); src_orthog_idx += i * axes_shape_and_strides_[k_ + axis_idx]; } @@ -282,7 +281,8 @@ template class PutFunctor ind_shape_and_strides_ + ((axis_idx + 1) * ind_nd_), ind_arr_idx); indT *ind_data = reinterpret_cast(ind_[axis_idx]); - indT i = ind_data[ind_arr_idx + ind_offsets_[axis_idx]]; + py::ssize_t i = static_cast( + ind_data[ind_arr_idx + ind_offsets_[axis_idx]]); proj(axes_shape_and_strides_[axis_idx], i); dst_orthog_idx += i * axes_shape_and_strides_[k_ + axis_idx]; } @@ -355,7 +355,7 @@ template struct TakeWrapFactory { if constexpr (std::is_integral::value && !std::is_same::value) { - fnT fn = take_impl, T, indT>; + fnT fn = take_impl; return fn; } else { @@ -371,7 +371,7 @@ template struct TakeClipFactory { if constexpr (std::is_integral::value && !std::is_same::value) { - fnT fn = take_impl, T, indT>; + fnT fn = take_impl; return fn; } else { @@ -387,7 +387,7 @@ template struct PutWrapFactory { if constexpr (std::is_integral::value && !std::is_same::value) { - fnT fn = put_impl, T, indT>; + fnT fn = put_impl; return fn; } else { @@ -403,7 +403,7 @@ template struct PutClipFactory { if constexpr (std::is_integral::value && !std::is_same::value) { - fnT fn = put_impl, T, indT>; + fnT fn = put_impl; return fn; } else { From 333f9e64b0bc1f6b70f328a5db4740e45722b49e Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sat, 25 Feb 2023 21:31:44 -0600 Subject: [PATCH 18/57] Fixed error from dpt.flip(dpt.arange(5))[dpt.arange(2)] --- dpctl/tensor/_slicing.pxi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpctl/tensor/_slicing.pxi b/dpctl/tensor/_slicing.pxi index 6689502955..10b5c58395 100644 --- a/dpctl/tensor/_slicing.pxi +++ b/dpctl/tensor/_slicing.pxi @@ -94,7 +94,7 @@ def _basic_slice_meta(ind, shape : tuple, strides : tuple, offset : int): "Index {0} is out of range for axes 0 with " "size {1}".format(ind, shape[0])) elif isinstance(ind, usm_ndarray): - return (shape, strides, 0, (ind,), 0) + return (shape, strides, offset, (ind,), 0) elif isinstance(ind, tuple): axes_referenced = 0 ellipses_count = 0 From ab79d843eab9baa166655a63beb39a32204a7384 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sat, 25 Feb 2023 21:33:05 -0600 Subject: [PATCH 19/57] More tests for advanced indexing --- dpctl/tests/test_usm_ndarray_indexing.py | 74 +++++++++++++++++++++++- 1 file changed, 73 insertions(+), 1 deletion(-) diff --git a/dpctl/tests/test_usm_ndarray_indexing.py b/dpctl/tests/test_usm_ndarray_indexing.py index e6c7271ab1..4281938577 100644 --- a/dpctl/tests/test_usm_ndarray_indexing.py +++ b/dpctl/tests/test_usm_ndarray_indexing.py @@ -15,7 +15,7 @@ # limitations under the License. 
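To make the two out-of-range policies concrete, here is a pure-Python sketch (illustrative, not part of the patch) of what the wrap and clip modes are meant to compute for an axis of size n. It follows the earlier commit message ("clips indices to -n <= i < n") and Python's own modular arithmetic rather than restating the exact C++ expressions:

    def wrap_index(i, n):
        # "wrap" mode: modular indexing, so -1 refers to the last element
        return i % n

    def clip_index(i, n):
        # "clip" mode: one round of negative indexing, then clamp into [0, n - 1]
        if i < 0:
            i += n
        return min(max(i, 0), n - 1)

    n = 5
    assert wrap_index(-1, n) == 4 and wrap_index(7, n) == 2
    assert clip_index(-2, n) == 3 and clip_index(-7, n) == 0 and clip_index(7, n) == 4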
-# import numpy as np +import numpy as np import pytest from helper import get_queue_or_skip @@ -174,6 +174,22 @@ def test_advanced_slice1(): ) +def test_advanced_slice1_negative_strides(): + q = get_queue_or_skip() + ii = dpt.asarray([0, 1], sycl_queue=q) + x = dpt.flip(dpt.arange(5, dtype="i4", sycl_queue=q)) + y = x[ii] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == ii.shape + assert y.strides == (1,) + # FIXME, once usm_ndarray.__equal__ is implemented, + # use of asnumpy should be removed + assert _all_equal( + (x[ii[k]] for k in range(ii.shape[0])), + (y[k] for k in range(ii.shape[0])), + ) + + def test_advanced_slice2(): q = get_queue_or_skip() ii = dpt.asarray([1, 2], sycl_queue=q) @@ -363,3 +379,59 @@ def test_advanced_slice13(): assert isinstance(y, dpt.usm_ndarray) assert y.shape == expected.shape assert (dpt.asnumpy(y) == dpt.asnumpy(expected)).all() + + +def test_integer_indexing_1d(): + get_queue_or_skip() + x = dpt.arange(10, dtype="i4") + ind_1d = dpt.asarray([7, 3, 1], dtype="u2") + ind_2d = dpt.asarray([[2, 3, 4], [3, 4, 5], [5, 6, 7]], dtype="i4") + + y1 = x[ind_1d] + assert y1.shape == ind_1d.shape + y2 = x[ind_2d] + assert y2.shape == ind_2d.shape + assert (dpt.asnumpy(y1) == np.array([7, 3, 1], dtype="i4")).all() + assert ( + dpt.asnumpy(y2) + == np.array([[2, 3, 4], [3, 4, 5], [5, 6, 7]], dtype="i4") + ).all() + + +def test_integer_indexing_2d(): + get_queue_or_skip() + n0, n1 = 5, 7 + x = dpt.reshape( + dpt.arange(n0 * n1, dtype="i4"), + ( + n0, + n1, + ), + ) + ind0 = dpt.arange(n0) + ind1 = dpt.arange(n1) + + y = x[ind0[:2, dpt.newaxis], ind1[dpt.newaxis, -2:]] + assert y.dtype == x.dtype + assert (dpt.asnumpy(y) == np.array([[5, 6], [12, 13]])).all() + + +def test_integer_strided_indexing(): + get_queue_or_skip() + n0, n1 = 5, 7 + x = dpt.reshape( + dpt.arange(2 * n0 * n1, dtype="i4"), + ( + 2 * n0, + n1, + ), + ) + ind0 = dpt.arange(n0) + ind1 = dpt.arange(n1) + + z = x[::-2, :] + y = z[ind0[:2, dpt.newaxis], ind1[dpt.newaxis, -2:]] + assert y.dtype == x.dtype + zc = dpt.copy(z, order="C") + yc = zc[ind0[:2, dpt.newaxis], ind1[dpt.newaxis, -2:]] + assert (dpt.asnumpy(y) == dpt.asnumpy(yc)).all() From a966830bc720c37715fc2bbd1d19cb383116c765 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sat, 25 Feb 2023 21:31:44 -0600 Subject: [PATCH 20/57] Fixed error from dpt.flip(dpt.arange(5))[dpt.arange(2)] --- dpctl/tensor/_slicing.pxi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpctl/tensor/_slicing.pxi b/dpctl/tensor/_slicing.pxi index 6689502955..10b5c58395 100644 --- a/dpctl/tensor/_slicing.pxi +++ b/dpctl/tensor/_slicing.pxi @@ -94,7 +94,7 @@ def _basic_slice_meta(ind, shape : tuple, strides : tuple, offset : int): "Index {0} is out of range for axes 0 with " "size {1}".format(ind, shape[0])) elif isinstance(ind, usm_ndarray): - return (shape, strides, 0, (ind,), 0) + return (shape, strides, offset, (ind,), 0) elif isinstance(ind, tuple): axes_referenced = 0 ellipses_count = 0 From 8523d8e3e0be5229045632bc739c32b838b6edaf Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sat, 25 Feb 2023 21:33:05 -0600 Subject: [PATCH 21/57] More tests for advanced indexing --- dpctl/tests/test_usm_ndarray_indexing.py | 74 +++++++++++++++++++++++- 1 file changed, 73 insertions(+), 1 deletion(-) diff --git a/dpctl/tests/test_usm_ndarray_indexing.py b/dpctl/tests/test_usm_ndarray_indexing.py index e6c7271ab1..4281938577 100644 --- a/dpctl/tests/test_usm_ndarray_indexing.py +++ b/dpctl/tests/test_usm_ndarray_indexing.py @@ -15,7 +15,7 @@ # 
limitations under the License. -# import numpy as np +import numpy as np import pytest from helper import get_queue_or_skip @@ -174,6 +174,22 @@ def test_advanced_slice1(): ) +def test_advanced_slice1_negative_strides(): + q = get_queue_or_skip() + ii = dpt.asarray([0, 1], sycl_queue=q) + x = dpt.flip(dpt.arange(5, dtype="i4", sycl_queue=q)) + y = x[ii] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == ii.shape + assert y.strides == (1,) + # FIXME, once usm_ndarray.__equal__ is implemented, + # use of asnumpy should be removed + assert _all_equal( + (x[ii[k]] for k in range(ii.shape[0])), + (y[k] for k in range(ii.shape[0])), + ) + + def test_advanced_slice2(): q = get_queue_or_skip() ii = dpt.asarray([1, 2], sycl_queue=q) @@ -363,3 +379,59 @@ def test_advanced_slice13(): assert isinstance(y, dpt.usm_ndarray) assert y.shape == expected.shape assert (dpt.asnumpy(y) == dpt.asnumpy(expected)).all() + + +def test_integer_indexing_1d(): + get_queue_or_skip() + x = dpt.arange(10, dtype="i4") + ind_1d = dpt.asarray([7, 3, 1], dtype="u2") + ind_2d = dpt.asarray([[2, 3, 4], [3, 4, 5], [5, 6, 7]], dtype="i4") + + y1 = x[ind_1d] + assert y1.shape == ind_1d.shape + y2 = x[ind_2d] + assert y2.shape == ind_2d.shape + assert (dpt.asnumpy(y1) == np.array([7, 3, 1], dtype="i4")).all() + assert ( + dpt.asnumpy(y2) + == np.array([[2, 3, 4], [3, 4, 5], [5, 6, 7]], dtype="i4") + ).all() + + +def test_integer_indexing_2d(): + get_queue_or_skip() + n0, n1 = 5, 7 + x = dpt.reshape( + dpt.arange(n0 * n1, dtype="i4"), + ( + n0, + n1, + ), + ) + ind0 = dpt.arange(n0) + ind1 = dpt.arange(n1) + + y = x[ind0[:2, dpt.newaxis], ind1[dpt.newaxis, -2:]] + assert y.dtype == x.dtype + assert (dpt.asnumpy(y) == np.array([[5, 6], [12, 13]])).all() + + +def test_integer_strided_indexing(): + get_queue_or_skip() + n0, n1 = 5, 7 + x = dpt.reshape( + dpt.arange(2 * n0 * n1, dtype="i4"), + ( + 2 * n0, + n1, + ), + ) + ind0 = dpt.arange(n0) + ind1 = dpt.arange(n1) + + z = x[::-2, :] + y = z[ind0[:2, dpt.newaxis], ind1[dpt.newaxis, -2:]] + assert y.dtype == x.dtype + zc = dpt.copy(z, order="C") + yc = zc[ind0[:2, dpt.newaxis], ind1[dpt.newaxis, -2:]] + assert (dpt.asnumpy(y) == dpt.asnumpy(yc)).all() From 81ba473a38accbcd3841d85b378144228b9eb5d8 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sun, 26 Feb 2023 10:15:43 -0600 Subject: [PATCH 22/57] Adding basic take, and basic put tests --- dpctl/tests/test_usm_ndarray_indexing.py | 83 +++++++++++++++++++++++- 1 file changed, 80 insertions(+), 3 deletions(-) diff --git a/dpctl/tests/test_usm_ndarray_indexing.py b/dpctl/tests/test_usm_ndarray_indexing.py index 4281938577..e088d4d00b 100644 --- a/dpctl/tests/test_usm_ndarray_indexing.py +++ b/dpctl/tests/test_usm_ndarray_indexing.py @@ -17,13 +17,11 @@ import numpy as np import pytest -from helper import get_queue_or_skip +from helper import get_queue_or_skip, skip_if_dtype_not_supported # import dpctl import dpctl.tensor as dpt -# from helper import skip_if_dtype_not_supported - def test_basic_slice1(): q = get_queue_or_skip() @@ -435,3 +433,82 @@ def test_integer_strided_indexing(): zc = dpt.copy(z, order="C") yc = zc[ind0[:2, dpt.newaxis], ind1[dpt.newaxis, -2:]] assert (dpt.asnumpy(y) == dpt.asnumpy(yc)).all() + + +@pytest.mark.parametrize( + "data_dt", + ["u1", "i1", "u2", "i2", "u4", "i4", "u8", "i8", "e", "f", "d", "F", "D"], +) +@pytest.mark.parametrize( + "ind_dt", ["u1", "i1", "u2", "i2", "u4", "i4", "u8", "i8"] +) +def test_take_basic(data_dt, ind_dt): + q = get_queue_or_skip() + 
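The multi-axis take these tests build up to can be summarized with a small NumPy sketch (illustrative only): all index arrays passed for the k consecutive axes starting at axis_start must share one shape, and every other axis of the source is carried over unchanged:

    import numpy as np

    src = np.arange(2 * 3 * 4).reshape(2, 3, 4)
    ind0 = np.array([0, 2])   # indexes axis 1
    ind1 = np.array([1, 3])   # indexes axis 2

    # take over axes 1..2 (k = 2, axis_start = 1); axis 0 stays "orthogonal"
    expected = src[:, ind0, ind1]             # shape (2, 2)

    out = np.empty((2, 2), dtype=src.dtype)
    for i in range(2):        # orthogonal index over the untouched axis
        for j in range(2):    # position within the common index shape
            out[i, j] = src[i, ind0[j], ind1[j]]
    assert (out == expected).all()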
skip_if_dtype_not_supported(data_dt, q) + + x = dpt.arange(10, dtype=data_dt) + ind = dpt.arange(2, 5, dtype=ind_dt) + y = dpt.take(x, ind) + assert y.dtype == x.dtype + assert (dpt.asnumpy(y) == np.arange(2, 5, dtype=data_dt)).all() + + +@pytest.mark.parametrize( + "data_dt", + ["u1", "i1", "u2", "i2", "u4", "i4", "u8", "i8", "e", "f", "d", "F", "D"], +) +@pytest.mark.parametrize( + "ind_dt", ["u1", "i1", "u2", "i2", "u4", "i4", "u8", "i8"] +) +def test_put_basic(data_dt, ind_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(data_dt, q) + + x = dpt.arange(10, dtype=data_dt) + ind = dpt.arange(2, 5, dtype=ind_dt) + val = dpt.ones(3, dtype=data_dt) + dpt.put(x, ind, val) + assert ( + dpt.asnumpy(x) + == np.array([0, 1, 1, 1, 1, 5, 6, 7, 8, 9], dtype=data_dt) + ).all() + + +def test_take_basic_axis(): + get_queue_or_skip() + + n0, n1 = 5, 7 + x = dpt.reshape( + dpt.arange(n0 * n1, dtype="i4"), + ( + n0, + n1, + ), + ) + ind = dpt.arange(2, 4) + y0 = dpt.take(x, ind, axis=0) + y1 = dpt.take(x, ind, axis=1) + assert y0.shape == (2, n1) + assert y1.shape == (n0, 2) + + +def test_put_basic_axis(): + get_queue_or_skip() + + n0, n1 = 5, 7 + x = dpt.reshape( + dpt.arange(n0 * n1, dtype="i4"), + ( + n0, + n1, + ), + ) + ind = dpt.arange(2, 4) + v0 = dpt.zeros((2, n1), dtype=x.dtype) + v1 = dpt.zeros((n0, 2), dtype=x.dtype) + dpt.put(x, ind, v0, axis=0) + dpt.put(x, ind, v1, axis=1) + expected = np.arange(n0 * n1, dtype="i4").reshape((n0, n1)) + expected[[2, 3], :] = 0 + expected[:, [2, 3]] = 0 + assert (expected == dpt.asnumpy(x)).all() From 7c0c6f025fd67b3285f370ec2db9859a0ca3ef8e Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sun, 26 Feb 2023 16:12:21 -0600 Subject: [PATCH 23/57] Turn debugging on for test_windows test run --- .github/workflows/conda-package.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml index 4cb3f3ad99..7bbc5fffd1 100644 --- a/.github/workflows/conda-package.yml +++ b/.github/workflows/conda-package.yml @@ -170,7 +170,7 @@ jobs: # echo "libintelocl.so" | tee /etc/OpenCL/vendors/intel-cpu.icd export OCL_ICD_FILENAMES=libintelocl.so # clinfo -l - python -m pytest -p no:faulthandler --pyargs $MODULE_NAME + python -m pytest --pyargs $MODULE_NAME test_windows: needs: build_windows @@ -296,8 +296,10 @@ jobs: conda activate dpctl_test && python -m dpctl -f - name: Run tests shell: cmd /C CALL {0} + env: + DPCTL_VERBOSITY: error run: >- - conda activate dpctl_test && python -m pytest -p no:faulthandler --pyargs ${{ env.MODULE_NAME }} + conda activate dpctl_test && python -m pytest -v -s --pyargs ${{ env.MODULE_NAME }} upload_linux: needs: test_linux From 156f7f0a74f8ffeea3c3f2f1d59dad46072c0327 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Sun, 26 Feb 2023 20:11:28 -0800 Subject: [PATCH 24/57] Added several array indexing tests - Tests include - strided data for take and put - strided indices for take and put - indexing compute follows data - indexing argument validation --- dpctl/tests/test_usm_ndarray_indexing.py | 440 ++++++++++++++++++++++- 1 file changed, 435 insertions(+), 5 deletions(-) diff --git a/dpctl/tests/test_usm_ndarray_indexing.py b/dpctl/tests/test_usm_ndarray_indexing.py index e088d4d00b..840dfb931f 100644 --- a/dpctl/tests/test_usm_ndarray_indexing.py +++ b/dpctl/tests/test_usm_ndarray_indexing.py @@ -18,9 +18,28 @@ import numpy as np import pytest from helper import get_queue_or_skip, skip_if_dtype_not_supported +from numpy.testing import 
assert_array_equal -# import dpctl import dpctl.tensor as dpt +from dpctl.utils import ExecutionPlacementError + +_all_dtypes = [ + "u1", + "i1", + "u2", + "i2", + "u4", + "i4", + "u8", + "i8", + "e", + "f", + "d", + "F", + "D", +] + +_all_int_dtypes = ["u1", "i1", "u2", "i2", "u4", "i4", "u8", "i8"] def test_basic_slice1(): @@ -437,10 +456,11 @@ def test_integer_strided_indexing(): @pytest.mark.parametrize( "data_dt", - ["u1", "i1", "u2", "i2", "u4", "i4", "u8", "i8", "e", "f", "d", "F", "D"], + _all_dtypes, ) @pytest.mark.parametrize( - "ind_dt", ["u1", "i1", "u2", "i2", "u4", "i4", "u8", "i8"] + "ind_dt", + _all_int_dtypes, ) def test_take_basic(data_dt, ind_dt): q = get_queue_or_skip() @@ -455,10 +475,11 @@ def test_take_basic(data_dt, ind_dt): @pytest.mark.parametrize( "data_dt", - ["u1", "i1", "u2", "i2", "u4", "i4", "u8", "i8", "e", "f", "d", "F", "D"], + _all_dtypes, ) @pytest.mark.parametrize( - "ind_dt", ["u1", "i1", "u2", "i2", "u4", "i4", "u8", "i8"] + "ind_dt", + _all_int_dtypes, ) def test_put_basic(data_dt, ind_dt): q = get_queue_or_skip() @@ -512,3 +533,412 @@ def test_put_basic_axis(): expected[[2, 3], :] = 0 expected[:, [2, 3]] = 0 assert (expected == dpt.asnumpy(x)).all() + + +@pytest.mark.parametrize( + "data_dt", + _all_dtypes, +) +def test_take_0d_data(data_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(data_dt, q) + + x = dpt.asarray(0, dtype=data_dt) + ind = dpt.arange(5) + + y = dpt.take(x, ind) + assert ( + dpt.asnumpy(y) + == np.broadcast_to(np.asarray(0, dtype=data_dt), ind.shape) + ).all() + + +@pytest.mark.parametrize( + "data_dt", + _all_dtypes, +) +def test_put_0d_data(data_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(data_dt, q) + + x = dpt.asarray(0, dtype=data_dt) + ind = dpt.arange(5) + val = dpt.asarray(2, dtype=data_dt) + + dpt.put(x, ind, val) + assert ( + dpt.asnumpy(x) + == np.broadcast_to(np.asarray(2, dtype=data_dt), ind.shape) + ).all() + + +@pytest.mark.parametrize( + "ind_dt", + _all_int_dtypes, +) +def test_take_0d_ind(ind_dt): + get_queue_or_skip() + + x = dpt.arange(5, dtype=ind_dt) + ind = dpt.asarray(3) + + y = dpt.take(x, ind) + assert dpt.asnumpy(x[3]) == dpt.asnumpy(y) + + +@pytest.mark.parametrize( + "ind_dt", + _all_int_dtypes, +) +def test_put_0d_ind(ind_dt): + get_queue_or_skip() + + x = dpt.arange(5, dtype=ind_dt) + ind = dpt.asarray(3) + val = dpt.asarray(5, dtype=ind_dt) + + dpt.put(x, ind, val) + assert dpt.asnumpy(x[3]) == dpt.asnumpy(val) + + +@pytest.mark.parametrize( + "data_dt", + _all_dtypes, +) +def test_take_strided_1d_source(data_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(data_dt, q) + + x = dpt.arange(27, dtype=data_dt, sycl_queue=q) + ind = dpt.arange(4, 9, dtype=np.intp, sycl_queue=q) + + x_np = dpt.asnumpy(x) + ind_np = dpt.asnumpy(ind) + + for s in ( + slice(None, None, 2), + slice(None, None, -2), + ): + assert_array_equal( + np.take(x_np[s], ind_np, axis=0), + dpt.asnumpy(dpt.take(x[s], ind, axis=0)), + ) + + # 0-strided + x = dpt.usm_ndarray( + (27,), + dtype=data_dt, + strides=(0,), + buffer_ctor_kwargs={"queue": q}, + ) + x[0] = x_np[0] + assert_array_equal( + np.broadcast_to(x_np[0], ind.shape), + dpt.asnumpy(dpt.take(x, ind, axis=0)), + ) + + +@pytest.mark.parametrize( + "data_dt", + _all_dtypes, +) +@pytest.mark.parametrize("order", ["C", "F"]) +def test_take_strided(data_dt, order): + q = get_queue_or_skip() + skip_if_dtype_not_supported(data_dt, q) + + x = dpt.reshape(_make_3d(data_dt, q), (9, 3), order=order) + ind = dpt.arange(2, dtype=np.intp, 
sycl_queue=q) + + x_np = dpt.asnumpy(x) + ind_np = dpt.asnumpy(ind) + + for s in ( + slice(None, None, 2), + slice(None, None, -2), + ): + for sgn in (-1, 1): + xs = x[s, ::sgn] + xs_np = x_np[s, ::sgn] + assert_array_equal( + np.take(xs_np, ind_np, axis=0), + dpt.asnumpy(dpt.take(xs, ind, axis=0)), + ) + assert_array_equal( + np.take(xs_np, ind_np, axis=1), + dpt.asnumpy(dpt.take(xs, ind, axis=1)), + ) + assert_array_equal( + xs_np[ind_np, ind_np], + dpt.asnumpy(dpt.take(xs, [ind, ind], axis=0)), + ) + + +@pytest.mark.parametrize( + "ind_dt", + _all_int_dtypes, +) +def test_take_strided_1d_indices(ind_dt): + q = get_queue_or_skip() + + x = dpt.arange(27, dtype="i4", sycl_queue=q) + ind = dpt.arange(12, 24, dtype=ind_dt, sycl_queue=q) + + x_np = dpt.asnumpy(x) + ind_np = dpt.asnumpy(ind).astype(np.intp) + + for s in ( + slice(None, None, 2), + slice(None, None, -2), + ): + assert_array_equal( + np.take(x_np, ind_np[s], axis=0), + dpt.asnumpy(dpt.take(x, ind[s], axis=0)), + ) + + # 0-strided + ind = dpt.usm_ndarray( + (12,), + dtype=ind_dt, + strides=(0,), + buffer_ctor_kwargs={"queue": q}, + ) + ind[0] = ind_np[0] + assert_array_equal( + np.broadcast_to(x_np[ind_np[0]], ind.shape), + dpt.asnumpy(dpt.take(x, ind, axis=0)), + ) + + +@pytest.mark.parametrize( + "ind_dt", + _all_int_dtypes, +) +@pytest.mark.parametrize("order", ["C", "F"]) +def test_take_strided_indices(ind_dt, order): + q = get_queue_or_skip() + + x = dpt.arange(27, dtype="i4", sycl_queue=q) + ind = dpt.reshape( + dpt.arange(12, 24, dtype=ind_dt, sycl_queue=q), (4, 3), order=order + ) + + x_np = dpt.asnumpy(x) + ind_np = dpt.asnumpy(ind).astype(np.intp) + + for s in ( + slice(None, None, 2), + slice(None, None, -2), + ): + for sgn in [-1, 1]: + inds = ind[s, ::sgn] + inds_np = ind_np[s, ::sgn] + assert_array_equal( + np.take(x_np, inds_np, axis=0), + dpt.asnumpy(dpt.take(x, inds, axis=0)), + ) + + +@pytest.mark.parametrize( + "data_dt", + _all_dtypes, +) +@pytest.mark.parametrize("order", ["C", "F"]) +def test_put_strided_1d_destination(data_dt, order): + q = get_queue_or_skip() + skip_if_dtype_not_supported(data_dt, q) + + x = dpt.arange(27, dtype=data_dt, sycl_queue=q) + ind = dpt.arange(4, 9, dtype=np.intp, sycl_queue=q) + val = dpt.asarray(9, dtype=data_dt, sycl_queue=q) + + x_np = dpt.asnumpy(x) + ind_np = dpt.asnumpy(ind) + val_np = dpt.asnumpy(val) + + for s in ( + slice(None, None, 2), + slice(None, None, -2), + ): + x_np1 = x_np.copy() + x_np1[s][ind_np] = val_np + + x1 = dpt.copy(x) + dpt.put(x1[s], ind, val, axis=0) + + assert_array_equal(x_np1, dpt.asnumpy(x1)) + + +@pytest.mark.parametrize( + "data_dt", + _all_dtypes, +) +@pytest.mark.parametrize("order", ["C", "F"]) +def test_put_strided_destination(data_dt, order): + q = get_queue_or_skip() + skip_if_dtype_not_supported(data_dt, q) + + x = dpt.reshape(_make_3d(data_dt, q), (9, 3), order=order) + ind = dpt.arange(2, dtype=np.intp, sycl_queue=q) + val = dpt.asarray(9, dtype=data_dt, sycl_queue=q) + + x_np = dpt.asnumpy(x) + ind_np = dpt.asnumpy(ind) + val_np = dpt.asnumpy(val) + + for s in ( + slice(None, None, 2), + slice(None, None, -2), + ): + for sgn in [-1, 1]: + xs = x[s, ::sgn] + xs_np = x_np[s, ::sgn] + + x_np1 = xs_np.copy() + x_np1[ind_np] = val_np + + x1 = dpt.copy(xs) + dpt.put(x1, ind, val, axis=0) + assert_array_equal(x_np1, dpt.asnumpy(x1)) + + x_np1 = xs_np.copy() + x_np1[:, ind_np] = val_np + + x1 = dpt.copy(xs) + dpt.put(x1, ind, val, axis=1) + assert_array_equal(x_np1, dpt.asnumpy(x1)) + + x_np1 = xs_np.copy() + x_np1[ind_np, ind_np] = val_np + 
+ x1 = dpt.copy(xs) + dpt.put(x1, [ind, ind], val, axis=0) + assert_array_equal(x_np1, dpt.asnumpy(x1)) + + +@pytest.mark.parametrize( + "ind_dt", + _all_int_dtypes, +) +def test_put_strided_1d_indices(ind_dt): + q = get_queue_or_skip() + + x = dpt.arange(27, dtype="i4", sycl_queue=q) + ind = dpt.arange(12, 24, dtype=ind_dt, sycl_queue=q) + val = dpt.asarray(-1, dtype="i4", sycl_queue=q) + + x_np = dpt.asnumpy(x) + ind_np = dpt.asnumpy(ind).astype(np.intp) + val_np = dpt.asnumpy(val) + + for s in ( + slice(None, None, 2), + slice(None, None, -2), + ): + x_copy = dpt.copy(x) + dpt.put(x_copy, ind[s], val, axis=0) + + x_np_copy = x_np.copy() + x_np_copy[ind_np[s]] = val_np + + assert_array_equal(x_np_copy, dpt.asnumpy(x_copy)) + + +@pytest.mark.parametrize( + "ind_dt", + _all_int_dtypes, +) +@pytest.mark.parametrize("order", ["C", "F"]) +def test_put_strided_indices(ind_dt, order): + q = get_queue_or_skip() + + x = dpt.arange(27, dtype="i4", sycl_queue=q) + ind = dpt.reshape( + dpt.arange(12, 24, dtype=ind_dt, sycl_queue=q), (4, 3), order=order + ) + val = dpt.asarray(-1, sycl_queue=q, dtype=x.dtype) + + x_np = dpt.asnumpy(x) + ind_np = dpt.asnumpy(ind).astype(np.intp) + val_np = dpt.asnumpy(val) + + for s in ( + slice(None, None, 2), + slice(None, None, -2), + ): + for sgn in [-1, 1]: + inds = ind[s, ::sgn] + inds_np = ind_np[s, ::sgn] + + x_copy = dpt.copy(x) + dpt.put(x_copy, inds, val, axis=0) + + x_np_copy = x_np.copy() + x_np_copy[inds_np] = val_np + + assert_array_equal(x_np_copy, dpt.asnumpy(x_copy)) + + +def test_take_arg_validation(): + get_queue_or_skip() + + x = dpt.arange(4) + ind0 = dpt.arange(2) + ind1 = dpt.arange(2.0) + + with pytest.raises(TypeError): + dpt.take(dict(), ind0, axis=0) + with pytest.raises(TypeError): + dpt.take(x, dict(), axis=0) + with pytest.raises(TypeError): + dpt.take(x, ind1, axis=0) + + with pytest.raises(ValueError): + dpt.take(x, ind0, mode=0) + with pytest.raises(ValueError): + dpt.take(dpt.reshape(x, (2, 2)), ind0, axis=None) + + +def test_put_arg_validation(): + get_queue_or_skip() + + x = dpt.arange(4) + ind0 = dpt.arange(2) + ind1 = dpt.arange(2.0) + val = dpt.asarray(2) + + with pytest.raises(TypeError): + dpt.put(dict(), ind0, val, axis=0) + with pytest.raises(TypeError): + dpt.put(x, dict(), val, axis=0) + with pytest.raises(TypeError): + dpt.put(x, ind1, val, axis=0) + with pytest.raises(TypeError): + dpt.put(x, ind0, dict(), axis=0) + + with pytest.raises(ValueError): + dpt.put(x, ind0, val, mode=0) + + +def test_advanced_indexing_compute_follows_data(): + q1 = get_queue_or_skip() + q2 = get_queue_or_skip() + + x = dpt.arange(4, sycl_queue=q1) + ind0 = dpt.asarray([0], sycl_queue=q1) + ind1 = dpt.asarray([0], sycl_queue=q2) + val0 = dpt.asarray(2, dtype=x.dtype, sycl_queue=q1) + val1 = dpt.asarray(2, dtype=x.dtype, sycl_queue=q2) + + with pytest.raises(ExecutionPlacementError): + dpt.take(x, ind1, axis=0) + with pytest.raises(ExecutionPlacementError): + x[ind1] + with pytest.raises(ExecutionPlacementError): + dpt.put(x, ind1, val0) + with pytest.raises(ExecutionPlacementError): + x[ind1] = val0 + with pytest.raises(ExecutionPlacementError): + dpt.put(x, ind0, val1) + with pytest.raises(ExecutionPlacementError): + x[ind0] = val1 From d42b019c5f9875d40cb58c507eb99ea42bd3433c Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Sun, 26 Feb 2023 20:48:21 -0800 Subject: [PATCH 25/57] Put calls in tests corrected, organized put logic --- dpctl/tensor/_indexing_functions.py | 16 ++++++++++------ dpctl/tests/test_usm_ndarray_indexing.py | 10 
++++++---- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/dpctl/tensor/_indexing_functions.py b/dpctl/tensor/_indexing_functions.py index c3562de8f8..864eb6924b 100644 --- a/dpctl/tensor/_indexing_functions.py +++ b/dpctl/tensor/_indexing_functions.py @@ -104,12 +104,16 @@ def put(x, indices, vals, /, *, axis=None, mode="clip"): raise TypeError( "Expected instance of `dpt.usm_ndarray`, got `{}`.".format(type(x)) ) - queues_ = [ - x.sycl_queue, - ] - usm_types_ = [ - x.usm_type, - ] + if isinstance(vals, dpt.usm_ndarray): + queues_ = [x.sycl_queue, vals.sycl_queue] + usm_types_ = [x.usm_type, vals.usm_type] + else: + queues_ = [ + x.sycl_queue, + ] + usm_types_ = [ + x.usm_type, + ] if not isinstance(indices, list) and not isinstance(indices, tuple): indices = (indices,) diff --git a/dpctl/tests/test_usm_ndarray_indexing.py b/dpctl/tests/test_usm_ndarray_indexing.py index 840dfb931f..523810c811 100644 --- a/dpctl/tests/test_usm_ndarray_indexing.py +++ b/dpctl/tests/test_usm_ndarray_indexing.py @@ -565,7 +565,7 @@ def test_put_0d_data(data_dt): ind = dpt.arange(5) val = dpt.asarray(2, dtype=data_dt) - dpt.put(x, ind, val) + dpt.put(x, ind, val, axis=0) assert ( dpt.asnumpy(x) == np.broadcast_to(np.asarray(2, dtype=data_dt), ind.shape) @@ -597,7 +597,7 @@ def test_put_0d_ind(ind_dt): ind = dpt.asarray(3) val = dpt.asarray(5, dtype=ind_dt) - dpt.put(x, ind, val) + dpt.put(x, ind, val, axis=0) assert dpt.asnumpy(x[3]) == dpt.asnumpy(val) @@ -886,6 +886,8 @@ def test_take_arg_validation(): ind0 = dpt.arange(2) ind1 = dpt.arange(2.0) + with pytest.raises(ValueError): + dpt.take(dpt.reshape(x, (2, 2)), ind0) with pytest.raises(TypeError): dpt.take(dict(), ind0, axis=0) with pytest.raises(TypeError): @@ -935,10 +937,10 @@ def test_advanced_indexing_compute_follows_data(): with pytest.raises(ExecutionPlacementError): x[ind1] with pytest.raises(ExecutionPlacementError): - dpt.put(x, ind1, val0) + dpt.put(x, ind1, val0, axis=0) with pytest.raises(ExecutionPlacementError): x[ind1] = val0 with pytest.raises(ExecutionPlacementError): - dpt.put(x, ind0, val1) + dpt.put(x, ind0, val1, axis=0) with pytest.raises(ExecutionPlacementError): x[ind0] = val1 From 877c3c75bfa7d4fd8ac02a436603127135254d4b Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Mon, 27 Feb 2023 00:12:01 -0800 Subject: [PATCH 26/57] Test fixes - Error for non-integer usm_ndarrays used as indices changed to IndexError --- dpctl/tensor/_indexing_functions.py | 4 +- dpctl/tests/test_usm_ndarray_indexing.py | 77 ++++++++++++++++-------- 2 files changed, 53 insertions(+), 28 deletions(-) diff --git a/dpctl/tensor/_indexing_functions.py b/dpctl/tensor/_indexing_functions.py index 864eb6924b..90718f4559 100644 --- a/dpctl/tensor/_indexing_functions.py +++ b/dpctl/tensor/_indexing_functions.py @@ -48,7 +48,7 @@ def take(x, indices, /, *, axis=None, mode="clip"): ) ) if not np.issubdtype(i.dtype, np.integer): - raise TypeError( + raise IndexError( "`indices` expected integer data type, got `{}`".format(i.dtype) ) queues_.append(i.sycl_queue) @@ -126,7 +126,7 @@ def put(x, indices, vals, /, *, axis=None, mode="clip"): ) ) if not np.issubdtype(i.dtype, np.integer): - raise TypeError( + raise IndexError( "`indices` expected integer data type, got `{}`".format(i.dtype) ) queues_.append(i.sycl_queue) diff --git a/dpctl/tests/test_usm_ndarray_indexing.py b/dpctl/tests/test_usm_ndarray_indexing.py index 523810c811..45501afbac 100644 --- a/dpctl/tests/test_usm_ndarray_indexing.py +++ b/dpctl/tests/test_usm_ndarray_indexing.py @@ -535,6 
+535,21 @@ def test_put_basic_axis(): assert (expected == dpt.asnumpy(x)).all() +@pytest.mark.parametrize("data_dt", _all_dtypes) +def test_put_0d_val(data_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(data_dt, q) + + x = dpt.arange(5, dtype=data_dt, sycl_queue=q) + ind = dpt.asarray([0], dtype=np.intp, sycl_queue=q) + x[ind] = 2 + assert_array_equal(np.asarray(2, dtype=data_dt), dpt.asnumpy(x[0])) + + x = dpt.asarray(5, dtype=data_dt, sycl_queue=q) + x[ind] = 2 + assert_array_equal(np.asarray(2, dtype=data_dt), dpt.asnumpy(x)) + + @pytest.mark.parametrize( "data_dt", _all_dtypes, @@ -543,8 +558,8 @@ def test_take_0d_data(data_dt): q = get_queue_or_skip() skip_if_dtype_not_supported(data_dt, q) - x = dpt.asarray(0, dtype=data_dt) - ind = dpt.arange(5) + x = dpt.asarray(0, dtype=data_dt, sycl_queue=q) + ind = dpt.arange(5, dtype=np.intp, sycl_queue=q) y = dpt.take(x, ind) assert ( @@ -561,9 +576,9 @@ def test_put_0d_data(data_dt): q = get_queue_or_skip() skip_if_dtype_not_supported(data_dt, q) - x = dpt.asarray(0, dtype=data_dt) - ind = dpt.arange(5) - val = dpt.asarray(2, dtype=data_dt) + x = dpt.asarray(0, dtype=data_dt, sycl_queue=q) + ind = dpt.arange(5, dtype=np.intp, sycl_queue=q) + val = dpt.asarray(2, dtype=data_dt, sycl_queue=q) dpt.put(x, ind, val, axis=0) assert ( @@ -577,10 +592,10 @@ def test_put_0d_data(data_dt): _all_int_dtypes, ) def test_take_0d_ind(ind_dt): - get_queue_or_skip() + q = get_queue_or_skip() - x = dpt.arange(5, dtype=ind_dt) - ind = dpt.asarray(3) + x = dpt.arange(5, dtype="i4", sycl_queue=q) + ind = dpt.asarray(3, dtype=ind_dt, sycl_queue=q) y = dpt.take(x, ind) assert dpt.asnumpy(x[3]) == dpt.asnumpy(y) @@ -591,11 +606,11 @@ def test_take_0d_ind(ind_dt): _all_int_dtypes, ) def test_put_0d_ind(ind_dt): - get_queue_or_skip() + q = get_queue_or_skip() - x = dpt.arange(5, dtype=ind_dt) - ind = dpt.asarray(3) - val = dpt.asarray(5, dtype=ind_dt) + x = dpt.arange(5, dtype="i4", sycl_queue=q) + ind = dpt.asarray(3, dtype=ind_dt, sycl_queue=q) + val = dpt.asarray(5, dtype=x.dtype, sycl_queue=q) dpt.put(x, ind, val, axis=0) assert dpt.asnumpy(x[3]) == dpt.asnumpy(val) @@ -750,7 +765,7 @@ def test_put_strided_1d_destination(data_dt, order): x = dpt.arange(27, dtype=data_dt, sycl_queue=q) ind = dpt.arange(4, 9, dtype=np.intp, sycl_queue=q) - val = dpt.asarray(9, dtype=data_dt, sycl_queue=q) + val = dpt.asarray(9, dtype=x.dtype, sycl_queue=q) x_np = dpt.asnumpy(x) ind_np = dpt.asnumpy(ind) @@ -780,7 +795,7 @@ def test_put_strided_destination(data_dt, order): x = dpt.reshape(_make_3d(data_dt, q), (9, 3), order=order) ind = dpt.arange(2, dtype=np.intp, sycl_queue=q) - val = dpt.asarray(9, dtype=data_dt, sycl_queue=q) + val = dpt.asarray(9, dtype=x.dtype, sycl_queue=q) x_np = dpt.asnumpy(x) ind_np = dpt.asnumpy(ind) @@ -825,7 +840,7 @@ def test_put_strided_1d_indices(ind_dt): x = dpt.arange(27, dtype="i4", sycl_queue=q) ind = dpt.arange(12, 24, dtype=ind_dt, sycl_queue=q) - val = dpt.asarray(-1, dtype="i4", sycl_queue=q) + val = dpt.asarray(-1, dtype=x.dtype, sycl_queue=q) x_np = dpt.asnumpy(x) ind_np = dpt.asnumpy(ind).astype(np.intp) @@ -880,21 +895,25 @@ def test_put_strided_indices(ind_dt, order): def test_take_arg_validation(): - get_queue_or_skip() + q = get_queue_or_skip() - x = dpt.arange(4) - ind0 = dpt.arange(2) - ind1 = dpt.arange(2.0) + x = dpt.arange(4, dtype="i4", sycl_queue=q) + ind0 = dpt.arange(2, dtype=np.intp, sycl_queue=q) + ind1 = dpt.arange(2.0, dtype="f", sycl_queue=q) - with pytest.raises(ValueError): - dpt.take(dpt.reshape(x, (2, 
2)), ind0) with pytest.raises(TypeError): dpt.take(dict(), ind0, axis=0) with pytest.raises(TypeError): dpt.take(x, dict(), axis=0) with pytest.raises(TypeError): + x[[]] + with pytest.raises(IndexError): dpt.take(x, ind1, axis=0) + with pytest.raises(IndexError): + x[ind1] + with pytest.raises(ValueError): + dpt.take(dpt.reshape(x, (2, 2)), ind0) with pytest.raises(ValueError): dpt.take(x, ind0, mode=0) with pytest.raises(ValueError): @@ -902,21 +921,27 @@ def test_take_arg_validation(): def test_put_arg_validation(): - get_queue_or_skip() + q = get_queue_or_skip() - x = dpt.arange(4) - ind0 = dpt.arange(2) - ind1 = dpt.arange(2.0) - val = dpt.asarray(2) + x = dpt.arange(4, dtype="i4", sycl_queue=q) + ind0 = dpt.arange(2, dtype=np.intp, sycl_queue=q) + ind1 = dpt.arange(2.0, dtype="f", sycl_queue=q) + val = dpt.asarray(2, x.dtype, sycl_queue=q) with pytest.raises(TypeError): dpt.put(dict(), ind0, val, axis=0) with pytest.raises(TypeError): dpt.put(x, dict(), val, axis=0) with pytest.raises(TypeError): + x[[]] = val + with pytest.raises(IndexError): dpt.put(x, ind1, val, axis=0) + with pytest.raises(IndexError): + x[ind1] = val with pytest.raises(TypeError): dpt.put(x, ind0, dict(), axis=0) + with pytest.raises(TypeError): + x[ind0] = dict() with pytest.raises(ValueError): dpt.put(x, ind0, val, mode=0) From e296d873229ef8e2cf1af0b3f7b1f5301cc1e4b4 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Mon, 27 Feb 2023 01:51:59 -0800 Subject: [PATCH 27/57] Moved advanced_indexing pointer range validation --- .../libtensor/source/advanced_indexing.cpp | 46 +++++++++---------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/dpctl/tensor/libtensor/source/advanced_indexing.cpp b/dpctl/tensor/libtensor/source/advanced_indexing.cpp index fed5d543ed..39f62a501a 100644 --- a/dpctl/tensor/libtensor/source/advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/advanced_indexing.cpp @@ -99,8 +99,6 @@ std::vector _populate_packed_shapes_strides_for_indexing( std::shared_ptr packed_host_axes_shapes_strides_shp = std::make_shared(2 * k + along_sh_elems, allocator); - // can be made more efficient by checking if inp_nd > 1, then performing - // same treatment of orthog_sh_elems as for 0D (orthog will not exist) if (inp_nd > 0) { std::copy(inp_shape, inp_shape + axis_start, packed_host_shapes_strides_shp->begin()); @@ -403,6 +401,17 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, } } + // destination must be ample enough to accommodate all elements + { + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if ((range + 1) < (orthog_nelems * ind_nelems)) { + throw py::value_error( + "Destination array can not accommodate all the " + "elements of source array."); + } + } + auto ind_sh_elems = (ind_nd > 0) ? 
ind_nd : 1; std::vector ind_ptrs; @@ -580,17 +589,6 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, const py::ssize_t *src_strides = src.get_strides_raw(); const py::ssize_t *dst_strides = dst.get_strides_raw(); - // destination must be ample enough to accommodate all elements - { - size_t range = - static_cast(dst_offsets.second - dst_offsets.first); - if ((range + 1) < (orthog_nelems * ind_nelems)) { - throw py::value_error( - "Destination array can not accommodate all the " - "elements of source array."); - } - } - // packed_shapes_strides = [src_shape[:axis] + src_shape[axis+k:], // src_strides[:axis] + src_strides[axis+k:], // dst_strides[:axis] + dst_strides[axis+k:]] @@ -765,6 +763,17 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, throw py::value_error("Arrays index overlapping segments of memory"); } + // destination must be ample enough to accommodate all possible elements + { + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if ((range + 1) < dst_nelems) { + throw py::value_error( + "Destination array can not accommodate all the " + "elements of source array."); + } + } + int dst_typenum = dst.get_typenum(); int val_typenum = val.get_typenum(); @@ -965,17 +974,6 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, const py::ssize_t *dst_strides = dst.get_strides_raw(); const py::ssize_t *val_strides = val.get_strides_raw(); - // destination must be ample enough to accommodate all possible elements - { - size_t range = - static_cast(dst_offsets.second - dst_offsets.first); - if ((range + 1) < dst_nelems) { - throw py::value_error( - "Destination array can not accommodate all the " - "elements of source array."); - } - } - // packed_shapes_strides = [dst_shape[:axis] + dst_shape[axis+k:], // dst_strides[:axis] + dst_strides[axis+k:], // val_strides[:axis] + val_strides[axis+k:]] From 0cf7ba4cabeca0bdfad95f6fa7455f70432061cd Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Mon, 27 Feb 2023 23:13:14 -0800 Subject: [PATCH 28/57] Fixed typo in advanced_indexing kernels --- dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp index 1e205c658c..093d88706f 100644 --- a/dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp @@ -150,7 +150,7 @@ template class TakeFunctor src_orthog_idx += i * axes_shape_and_strides_[k_ + axis_idx]; } py::ssize_t ind_dst_idx(0); - ind_indxr.get_displacement( + ind_indxr.get_displacement( static_cast(i_along), ind_shape_and_strides_, axes_shape_and_strides_ + (2 * k_), ind_dst_idx); @@ -287,7 +287,7 @@ template class PutFunctor dst_orthog_idx += i * axes_shape_and_strides_[k_ + axis_idx]; } py::ssize_t ind_val_idx(0); - ind_indxr.get_displacement( + ind_indxr.get_displacement( static_cast(i_along), ind_shape_and_strides_, axes_shape_and_strides_ + (2 * k_), ind_val_idx); From fc46303dd64b2700c4c3d26b2440e604878492b9 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 27 Feb 2023 09:28:59 -0600 Subject: [PATCH 29/57] Renamed advance_indexing.*pp into integer_advanced_indexing.*pp Streamlined call operator implementation for projection classes. Added missing includes. 
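
For reference, a minimal sketch of the clip/wrap projection semantics the
streamlined call operators implement (illustrative stand-alone helpers with
assumed names, not code taken from this patch):

    #include <algorithm>
    #include <cstdint>

    // "clip" clamps into [0, n-1], accepting negative (from-the-end)
    // indices in [-n, -1]; "wrap" is one way to realize modulo-style
    // wrap-around of the index onto the axis length.
    inline std::int64_t clip_index(std::int64_t ind, std::int64_t n)
    {
        n = std::max<std::int64_t>(n, 1);
        ind = std::clamp<std::int64_t>(ind, -n, n - 1);
        return (ind < 0) ? ind + n : ind;
    }

    inline std::int64_t wrap_index(std::int64_t ind, std::int64_t n)
    {
        n = std::max<std::int64_t>(n, 1);
        std::int64_t r = ind % n;      // may be negative in C++
        return (r < 0) ? r + n : r;    // shift into [0, n - 1]
    }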
--- dpctl/tensor/CMakeLists.txt | 2 +- ...xing.hpp => integer_advanced_indexing.hpp} | 16 +++++++-------- ...xing.cpp => integer_advanced_indexing.cpp} | 20 ++++++++++++------- ...xing.hpp => integer_advanced_indexing.hpp} | 0 dpctl/tensor/libtensor/source/tensor_py.cpp | 2 +- 5 files changed, 22 insertions(+), 18 deletions(-) rename dpctl/tensor/libtensor/include/kernels/{advanced_indexing.hpp => integer_advanced_indexing.hpp} (96%) rename dpctl/tensor/libtensor/source/{advanced_indexing.cpp => integer_advanced_indexing.cpp} (98%) rename dpctl/tensor/libtensor/source/{advanced_indexing.hpp => integer_advanced_indexing.hpp} (100%) diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index 622473a0d8..3f5780cd75 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -31,7 +31,7 @@ pybind11_add_module(${python_module_name} MODULE ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/advanced_indexing.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp diff --git a/dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp similarity index 96% rename from dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp rename to dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp index 093d88706f..a239691c80 100644 --- a/dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp @@ -26,6 +26,7 @@ #include "utils/strided_iters.hpp" #include "utils/type_utils.hpp" #include +#include #include #include #include @@ -52,10 +53,9 @@ class ClipIndex void operator()(py::ssize_t max_item, py::ssize_t &ind) const { - max_item = (max_item > 0) ? max_item : 1; - ind = (ind < 0) ? (ind <= -max_item) ? (0) : (ind + max_item) - : (ind >= max_item) ? (max_item - 1) - : ind; + max_item = std::max(max_item, 1); + ind = std::clamp(ind, -max_item, max_item - 1); + ind = (ind < 0) ? ind + max_item : ind; return; } }; @@ -67,10 +67,8 @@ class WrapIndex void operator()(py::ssize_t max_item, py::ssize_t &ind) const { - max_item = (max_item > 0) ? max_item : 1; - ind = (ind < 0) ? max_item - (-ind % max_item) - : (ind >= max_item) ? 
ind % max_item - : ind; + max_item = std::max(max_item, 1); + ind = ind % max_item; return; } }; @@ -136,9 +134,9 @@ template class TakeFunctor dst_orthog_idx); ProjectorT proj{}; - py::ssize_t ind_arr_idx(0); CIndexer_vector ind_indxr(ind_nd_); for (int axis_idx = 0; axis_idx < k_; ++axis_idx) { + py::ssize_t ind_arr_idx(0); ind_indxr.get_displacement( static_cast(i_along), ind_shape_and_strides_, ind_shape_and_strides_ + ((axis_idx + 1) * ind_nd_), diff --git a/dpctl/tensor/libtensor/source/advanced_indexing.cpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp similarity index 98% rename from dpctl/tensor/libtensor/source/advanced_indexing.cpp rename to dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp index 39f62a501a..9c9840e0de 100644 --- a/dpctl/tensor/libtensor/source/advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -27,16 +27,19 @@ #include #include #include +#include #include #include #include #include #include "dpctl4pybind11.hpp" -#include "kernels/advanced_indexing.hpp" +#include "kernels/integer_advanced_indexing.hpp" #include "utils/type_dispatch.hpp" #include "utils/type_utils.hpp" +#include "integer_advanced_indexing.hpp" + #define INDEXING_MODES 2 #define CLIP_MODE 0 #define WRAP_MODE 1 @@ -85,8 +88,8 @@ std::vector _populate_packed_shapes_strides_for_indexing( int arr_nd) { - int orthog_sh_elems = ((inp_nd - k) > 1) ? (inp_nd - k) : 1; - int along_sh_elems = (ind_nd > 1) ? ind_nd : 1; + int orthog_sh_elems = std::max(inp_nd - k, 1); + int along_sh_elems = std::max(ind_nd, 1); using usm_host_allocatorT = sycl::usm_allocator; @@ -284,7 +287,7 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, int axis_start, uint8_t mode, sycl::queue exec_q, - const std::vector &depends = {}) + const std::vector &depends) { int k = ind.size(); @@ -328,7 +331,7 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, const py::ssize_t *src_shape = src.get_shape_raw(); const py::ssize_t *dst_shape = dst.get_shape_raw(); - int orthog_nd = ((src_nd - k) > 0) ? src_nd - k : 1; + int orthog_nd = std::max(src_nd - k, 1); bool orthog_shapes_equal(true); size_t orthog_nelems(1); @@ -412,7 +415,7 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, } } - auto ind_sh_elems = (ind_nd > 0) ? 
ind_nd : 1; + int ind_sh_elems = std::max(ind_nd, 1); std::vector ind_ptrs; ind_ptrs.reserve(k); @@ -633,12 +636,15 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, std::to_string(ind_type_id)); } + std::cout << "Submitting take" << std::endl; sycl::event take_generic_ev = fn(exec_q, orthog_nelems, ind_nelems, orthog_nd, ind_nd, k, packed_shapes_strides, packed_axes_shapes_strides, packed_ind_shapes_strides, src_data, dst_data, packed_ind_ptrs, src_offset, dst_offset, packed_ind_offsets, all_deps); + std::cout << "Submitting take clean-up host task" << std::endl; + // free packed temporaries auto ctx = exec_q.get_context(); exec_q.submit([&](sycl::handler &cgh) { @@ -666,7 +672,7 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, int axis_start, uint8_t mode, sycl::queue exec_q, - const std::vector &depends = {}) + const std::vector &depends) { int k = ind.size(); diff --git a/dpctl/tensor/libtensor/source/advanced_indexing.hpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.hpp similarity index 100% rename from dpctl/tensor/libtensor/source/advanced_indexing.hpp rename to dpctl/tensor/libtensor/source/integer_advanced_indexing.hpp diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp index 94458bccf9..e164be2421 100644 --- a/dpctl/tensor/libtensor/source/tensor_py.cpp +++ b/dpctl/tensor/libtensor/source/tensor_py.cpp @@ -33,13 +33,13 @@ #include "dpctl4pybind11.hpp" -#include "advanced_indexing.hpp" #include "copy_and_cast_usm_to_usm.hpp" #include "copy_for_reshape.hpp" #include "copy_numpy_ndarray_into_usm_ndarray.hpp" #include "device_support_queries.hpp" #include "eye_ctor.hpp" #include "full_ctor.hpp" +#include "integer_advanced_indexing.hpp" #include "linear_sequences.hpp" #include "triul_ctor.hpp" #include "utils/strided_iters.hpp" From 51e0fbbac997421572cdacf909e892239484950a Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 27 Feb 2023 11:48:04 -0600 Subject: [PATCH 30/57] Initialize packed shape+strides data with zeros --- dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp index 9c9840e0de..6b053d7f77 100644 --- a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -97,10 +97,10 @@ std::vector _populate_packed_shapes_strides_for_indexing( usm_host_allocatorT allocator(exec_q); std::shared_ptr packed_host_shapes_strides_shp = - std::make_shared(3 * orthog_sh_elems, allocator); + std::make_shared(3 * orthog_sh_elems, 0, allocator); std::shared_ptr packed_host_axes_shapes_strides_shp = - std::make_shared(2 * k + along_sh_elems, allocator); + std::make_shared(2 * k + along_sh_elems, 0, allocator); if (inp_nd > 0) { std::copy(inp_shape, inp_shape + axis_start, From 84ba81a47c16d7d40eecb6fa6a7e779432c3a09d Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 27 Feb 2023 15:41:56 -0600 Subject: [PATCH 31/57] Ensure that indices are also kept alive --- .../source/integer_advanced_indexing.cpp | 51 +++++++++++++++---- .../source/integer_advanced_indexing.hpp | 30 +++++------ 2 files changed, 57 insertions(+), 24 deletions(-) diff --git a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp index 6b053d7f77..983eaa9b6f 100644 --- 
a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include @@ -280,15 +279,50 @@ std::vector _populate_packed_shapes_strides_for_indexing( } } +/* Utility to parse python object py_ind into vector of `usm_ndarray`s */ +std::vector parse_py_ind(const sycl::queue &q, + py::object py_ind) +{ + size_t ind_count = py::len(py_ind); + std::vector res; + res.reserve(ind_count); + + bool acquired = false; + int nd = -1; + for (size_t i = 0; i < ind_count; ++i) { + auto el_i = py_ind[py::cast(i)]; + auto arr_i = py::cast(el_i); + if (!dpctl::utils::queues_are_compatible(q, {arr_i})) { + throw py::value_error("Index allocation queue is not compatible " + "with execution queue"); + } + if (acquired) { + if (nd != arr_i.get_ndim()) { + throw py::value_error( + "Indices must have the same number of dimensions."); + } + } + else { + acquired = true; + nd = arr_i.get_ndim(); + } + res.push_back(arr_i); + } + + return res; +} + std::pair usm_ndarray_take(dpctl::tensor::usm_ndarray src, - std::vector ind, + py::object py_ind, dpctl::tensor::usm_ndarray dst, int axis_start, uint8_t mode, sycl::queue exec_q, const std::vector &depends) { + std::vector ind = parse_py_ind(exec_q, py_ind); + int k = ind.size(); if (k == 0) { @@ -636,15 +670,12 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, std::to_string(ind_type_id)); } - std::cout << "Submitting take" << std::endl; sycl::event take_generic_ev = fn(exec_q, orthog_nelems, ind_nelems, orthog_nd, ind_nd, k, packed_shapes_strides, packed_axes_shapes_strides, packed_ind_shapes_strides, src_data, dst_data, packed_ind_ptrs, src_offset, dst_offset, packed_ind_offsets, all_deps); - std::cout << "Submitting take clean-up host task" << std::endl; - // free packed temporaries auto ctx = exec_q.get_context(); exec_q.submit([&](sycl::handler &cgh) { @@ -661,19 +692,20 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, }); return std::make_pair( - keep_args_alive(exec_q, {src, dst}, {take_generic_ev}), + keep_args_alive(exec_q, {src, py_ind, dst}, {take_generic_ev}), take_generic_ev); } std::pair usm_ndarray_put(dpctl::tensor::usm_ndarray dst, - std::vector ind, + py::object py_ind, dpctl::tensor::usm_ndarray val, int axis_start, uint8_t mode, sycl::queue exec_q, const std::vector &depends) { + std::vector ind = parse_py_ind(exec_q, py_ind); int k = ind.size(); if (k == 0) { @@ -1046,8 +1078,9 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, }); }); - return std::make_pair(keep_args_alive(exec_q, {dst, val}, {put_generic_ev}), - put_generic_ev); + return std::make_pair( + keep_args_alive(exec_q, {dst, py_ind, val}, {put_generic_ev}), + put_generic_ev); } void init_advanced_indexing_dispatch_tables(void) diff --git a/dpctl/tensor/libtensor/source/integer_advanced_indexing.hpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.hpp index d99d4f1828..c6d5ed74b8 100644 --- a/dpctl/tensor/libtensor/source/integer_advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/source/integer_advanced_indexing.hpp @@ -38,24 +38,24 @@ namespace py_internal { extern std::pair -usm_ndarray_take(dpctl::tensor::usm_ndarray src, - std::vector ind, - dpctl::tensor::usm_ndarray dst, - int axis_start, - uint8_t mode, - sycl::queue exec_q, - const std::vector &depends = {}); +usm_ndarray_take(dpctl::tensor::usm_ndarray, + py::object, + dpctl::tensor::usm_ndarray, + int, + uint8_t, + sycl::queue, + const std::vector & = {}); extern std::pair 
-usm_ndarray_put(dpctl::tensor::usm_ndarray dst, - std::vector ind, - dpctl::tensor::usm_ndarray val, - int axis_start, - uint8_t mode, - sycl::queue exec_q, - const std::vector &depends = {}); +usm_ndarray_put(dpctl::tensor::usm_ndarray, + py::object, + dpctl::tensor::usm_ndarray, + int, + uint8_t, + sycl::queue, + const std::vector & = {}); -extern void init_advanced_indexing_dispatch_tables(); +extern void init_advanced_indexing_dispatch_tables(void); } // namespace py_internal } // namespace tensor From 56bb65fc6c7a7ba026397d5d42ddccfa2e42a5a8 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 28 Feb 2023 12:32:50 -0600 Subject: [PATCH 32/57] Moved ctx creation into host-task-dispatching handler function. --- .../libtensor/source/integer_advanced_indexing.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp index 983eaa9b6f..510ee2c554 100644 --- a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -677,9 +677,9 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, src_offset, dst_offset, packed_ind_offsets, all_deps); // free packed temporaries - auto ctx = exec_q.get_context(); exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(take_generic_ev); + auto ctx = exec_q.get_context(); cgh.host_task([packed_shapes_strides, packed_axes_shapes_strides, packed_ind_shapes_strides, packed_ind_ptrs, packed_ind_offsets, ctx]() { @@ -691,9 +691,10 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, }); }); - return std::make_pair( - keep_args_alive(exec_q, {src, py_ind, dst}, {take_generic_ev}), - take_generic_ev); + sycl::event host_task_ev = + keep_args_alive(exec_q, {src, py_ind, dst}, {take_generic_ev}); + + return std::make_pair(host_task_ev, take_generic_ev); } std::pair From 24d7839ce74e8893b1dc6f3566658a919f4ceef5 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Wed, 1 Mar 2023 09:46:55 -0800 Subject: [PATCH 33/57] Prevent dangling host tasks in indexing functions - Host tasks are now collected and kept alive --- .../source/integer_advanced_indexing.cpp | 73 ++++++++++++------- 1 file changed, 45 insertions(+), 28 deletions(-) diff --git a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp index 510ee2c554..7649fcfba2 100644 --- a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -70,6 +70,7 @@ using dpctl::utils::keep_args_alive; std::vector _populate_packed_shapes_strides_for_indexing( sycl::queue exec_q, + std::vector &host_task_events, py::ssize_t *device_orthog_shapes_strides, py::ssize_t *device_axes_shapes_strides, const py::ssize_t *inp_shape, @@ -210,20 +211,21 @@ std::vector _populate_packed_shapes_strides_for_indexing( exec_q.copy(packed_host_shapes_strides_shp->data(), device_orthog_shapes_strides, packed_host_shapes_strides_shp->size()); - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(device_orthog_shapes_strides_copy_ev); - cgh.host_task([packed_host_shapes_strides_shp] {}); - }); sycl::event device_axes_shapes_strides_copy_ev = exec_q.copy( packed_host_axes_shapes_strides_shp->data(), device_axes_shapes_strides, packed_host_axes_shapes_strides_shp->size()); - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(device_axes_shapes_strides_copy_ev); - 
cgh.host_task([packed_host_axes_shapes_strides_shp]() {}); - }); + + sycl::event clean_up_host_task_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(device_axes_shapes_strides_copy_ev); + cgh.depends_on(device_orthog_shapes_strides_copy_ev); + cgh.host_task([packed_host_axes_shapes_strides_shp, + packed_host_shapes_strides_shp]() {}); + }); + host_task_events.push_back(clean_up_host_task_ev); std::vector v = {device_orthog_shapes_strides_copy_ev, device_axes_shapes_strides_copy_ev}; @@ -268,10 +270,13 @@ std::vector _populate_packed_shapes_strides_for_indexing( packed_host_axes_shapes_strides_shp->data(), device_axes_shapes_strides, packed_host_axes_shapes_strides_shp->size()); - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(device_axes_shapes_strides_copy_ev); - cgh.host_task([packed_host_axes_shapes_strides_shp]() {}); - }); + + sycl::event clean_up_host_task_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(device_axes_shapes_strides_copy_ev); + cgh.host_task([packed_host_axes_shapes_strides_shp]() {}); + }); + host_task_events.push_back(clean_up_host_task_ev); std::vector v = {device_orthog_shapes_strides_fill_ev, device_axes_shapes_strides_copy_ev}; @@ -590,28 +595,33 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, std::copy(ind_offsets.begin(), ind_offsets.end(), host_ind_offsets_shp->begin()); + std::vector host_task_events(5); + sycl::event packed_ind_ptrs_copy_ev = exec_q.copy( host_ind_ptrs_shp->data(), packed_ind_ptrs, host_ind_ptrs_shp->size()); - exec_q.submit([&](sycl::handler &cgh) { + sycl::event ind_ptrs_host_task = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(packed_ind_ptrs_copy_ev); cgh.host_task([host_ind_ptrs_shp]() {}); }); + host_task_events.push_back(ind_ptrs_host_task); sycl::event packed_ind_shapes_strides_copy_ev = exec_q.copy( host_ind_shapes_strides_shp->data(), packed_ind_shapes_strides, host_ind_shapes_strides_shp->size()); - exec_q.submit([&](sycl::handler &cgh) { + sycl::event ind_sh_st_host_task = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(packed_ind_shapes_strides_copy_ev); cgh.host_task([host_ind_shapes_strides_shp]() {}); }); + host_task_events.push_back(ind_sh_st_host_task); sycl::event packed_ind_offsets_copy_ev = exec_q.copy( host_ind_offsets_shp->data(), packed_ind_offsets, host_ind_offsets_shp->size()); - exec_q.submit([&](sycl::handler &cgh) { + sycl::event ind_offsets_host_task = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(packed_ind_offsets_copy_ev); cgh.host_task([host_ind_offsets_shp]() {}); }); + host_task_events.push_back(ind_offsets_host_task); std::vector ind_pack_depends{packed_ind_ptrs_copy_ev, packed_ind_shapes_strides_copy_ev, @@ -650,10 +660,10 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, std::vector src_dst_pack_deps = _populate_packed_shapes_strides_for_indexing( - exec_q, packed_shapes_strides, packed_axes_shapes_strides, - src_shape, src_strides, is_src_c_contig, is_src_f_contig, dst_shape, - dst_strides, is_dst_c_contig, is_dst_f_contig, axis_start, k, - ind_nd, src_nd, dst_nd); + exec_q, host_task_events, packed_shapes_strides, + packed_axes_shapes_strides, src_shape, src_strides, is_src_c_contig, + is_src_f_contig, dst_shape, dst_strides, is_dst_c_contig, + is_dst_f_contig, axis_start, k, ind_nd, src_nd, dst_nd); std::vector all_deps(depends.size() + ind_pack_depends.size() + src_dst_pack_deps.size()); @@ -690,9 +700,10 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, sycl::free(packed_ind_offsets, ctx); }); }); + 
host_task_events.push_back(take_generic_ev); sycl::event host_task_ev = - keep_args_alive(exec_q, {src, py_ind, dst}, {take_generic_ev}); + keep_args_alive(exec_q, {src, py_ind, dst}, host_task_events); return std::make_pair(host_task_ev, take_generic_ev); } @@ -977,28 +988,33 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, std::copy(ind_offsets.begin(), ind_offsets.end(), host_ind_offsets_shp->begin()); + std::vector host_task_events(5); + sycl::event device_ind_ptrs_copy_ev = exec_q.copy( host_ind_ptrs_shp->data(), packed_ind_ptrs, host_ind_ptrs_shp->size()); - exec_q.submit([&](sycl::handler &cgh) { + sycl::event ind_ptrs_host_task = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(device_ind_ptrs_copy_ev); cgh.host_task([host_ind_ptrs_shp]() {}); }); + host_task_events.push_back(ind_ptrs_host_task); sycl::event device_ind_shapes_strides_copy_ev = exec_q.copy( host_ind_shapes_strides_shp->data(), packed_ind_shapes_strides, host_ind_shapes_strides_shp->size()); - exec_q.submit([&](sycl::handler &cgh) { + sycl::event ind_sh_st_host_task = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(device_ind_shapes_strides_copy_ev); cgh.host_task([host_ind_shapes_strides_shp]() {}); }); + host_task_events.push_back(ind_sh_st_host_task); sycl::event device_ind_offsets_copy_ev = exec_q.copy( host_ind_offsets_shp->data(), packed_ind_offsets, host_ind_offsets_shp->size()); - exec_q.submit([&](sycl::handler &cgh) { + sycl::event ind_offsets_host_task = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(device_ind_offsets_copy_ev); cgh.host_task([host_ind_offsets_shp]() {}); }); + host_task_events.push_back(ind_offsets_host_task); std::vector ind_pack_depends{device_ind_ptrs_copy_ev, device_ind_shapes_strides_copy_ev, @@ -1037,10 +1053,10 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, std::vector copy_shapes_strides_deps = _populate_packed_shapes_strides_for_indexing( - exec_q, packed_shapes_strides, packed_axes_shapes_strides, - dst_shape, dst_strides, is_dst_c_contig, is_dst_f_contig, val_shape, - val_strides, is_val_c_contig, is_val_f_contig, axis_start, k, - ind_nd, dst_nd, val_nd); + exec_q, host_task_events, packed_shapes_strides, + packed_axes_shapes_strides, dst_shape, dst_strides, is_dst_c_contig, + is_dst_f_contig, val_shape, val_strides, is_val_c_contig, + is_val_f_contig, axis_start, k, ind_nd, dst_nd, val_nd); std::vector all_deps(depends.size() + copy_shapes_strides_deps.size() + @@ -1078,9 +1094,10 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, sycl::free(packed_ind_offsets, ctx); }); }); + host_task_events.push_back(put_generic_ev); return std::make_pair( - keep_args_alive(exec_q, {dst, py_ind, val}, {put_generic_ev}), + keep_args_alive(exec_q, {dst, py_ind, val}, host_task_events), put_generic_ev); } From f84239fce8636c0330f750d9750be960630adec5 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 1 Mar 2023 16:11:29 -0600 Subject: [PATCH 34/57] Use py::gil_scoped_acquire instead of PyGILState_Ensure. 
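
The motivation: a host_task that touches CPython reference counts must
hold the GIL, and pybind11's RAII guard handles acquisition and release
without manual PyGILState bookkeeping.  A minimal sketch of the pattern
(simplified, with a hypothetical helper name, not the actual
keep_args_alive signature):

    #include <memory>
    #include <CL/sycl.hpp>
    #include <pybind11/pybind11.h>

    namespace py = pybind11;

    // Keep `obj` alive until `dep` completes, then drop the reference
    // from a host task while holding the GIL.
    sycl::event release_when_done(sycl::queue q, py::object obj, sycl::event dep)
    {
        auto h = std::make_shared<py::handle>(obj.release());
        return q.submit([&](sycl::handler &cgh) {
            cgh.depends_on(dep);
            cgh.host_task([h]() {
                py::gil_scoped_acquire acquire;
                h->dec_ref();
            });
        });
    }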
--- dpctl/apis/include/dpctl4pybind11.hpp | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/dpctl/apis/include/dpctl4pybind11.hpp b/dpctl/apis/include/dpctl4pybind11.hpp index 921f231aa1..04b9f5d919 100644 --- a/dpctl/apis/include/dpctl4pybind11.hpp +++ b/dpctl/apis/include/dpctl4pybind11.hpp @@ -1000,14 +1000,10 @@ sycl::event keep_args_alive(sycl::queue q, shp_arr[i]->inc_ref(); } cgh.host_task([=]() { - bool guard = (Py_IsInitialized() && !_Py_IsFinalizing()); - if (guard) { - PyGILState_STATE gstate; - gstate = PyGILState_Ensure(); - for (std::size_t i = 0; i < num; ++i) { - shp_arr[i]->dec_ref(); - } - PyGILState_Release(gstate); + py::gil_scoped_acquire acquire; + + for (std::size_t i = 0; i < num; ++i) { + shp_arr[i]->dec_ref(); } }); }); From b69a415d4ccec75e61d0b1e957f4031020c5bb58 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 1 Mar 2023 16:14:18 -0600 Subject: [PATCH 35/57] Make both _take and _put effectively synchronous They still return a pair of events, but those are always in a compelte state. --- .../source/integer_advanced_indexing.cpp | 123 +++++++++++++----- 1 file changed, 88 insertions(+), 35 deletions(-) diff --git a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp index 7649fcfba2..77c34f0a0b 100644 --- a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -143,6 +143,8 @@ std::vector _populate_packed_shapes_strides_for_indexing( packed_host_axes_shapes_strides_shp->begin() + k); } else { + // FIXME: this pointer was not allocated in this function + // the caller should be freeing it sycl::free(device_orthog_shapes_strides, exec_q); throw std::runtime_error("Invalid array encountered"); } @@ -190,6 +192,8 @@ std::vector _populate_packed_shapes_strides_for_indexing( packed_host_axes_shapes_strides_shp->begin() + 2 * k); } else { + // FIXME: this pointer was not allocated in this function + // the caller should be freeing it sycl::free(device_orthog_shapes_strides, exec_q); throw std::runtime_error("Invalid array encountered"); } @@ -255,6 +259,8 @@ std::vector _populate_packed_shapes_strides_for_indexing( packed_host_axes_shapes_strides_shp->begin() + 2); } else { + // FIXME: memory was not allocated in this function + // it should be freed by the caller sycl::free(device_orthog_shapes_strides, exec_q); throw std::runtime_error("Invalid array encountered"); } @@ -292,23 +298,24 @@ std::vector parse_py_ind(const sycl::queue &q, std::vector res; res.reserve(ind_count); - bool acquired = false; + bool nd_is_known = false; int nd = -1; for (size_t i = 0; i < ind_count; ++i) { - auto el_i = py_ind[py::cast(i)]; - auto arr_i = py::cast(el_i); + py::object el_i = py_ind[py::cast(i)]; + dpctl::tensor::usm_ndarray arr_i = + py::cast(el_i); if (!dpctl::utils::queues_are_compatible(q, {arr_i})) { throw py::value_error("Index allocation queue is not compatible " "with execution queue"); } - if (acquired) { + if (nd_is_known) { if (nd != arr_i.get_ndim()) { throw py::value_error( "Indices must have the same number of dimensions."); } } else { - acquired = true; + nd_is_known = true; nd = arr_i.get_ndim(); } res.push_back(arr_i); @@ -558,6 +565,7 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, sycl::malloc_device((k + 1) * ind_sh_elems, exec_q); if (packed_ind_shapes_strides == nullptr) { + sycl::free(packed_ind_ptrs, exec_q); throw std::runtime_error( "Unable to allocate 
packed_ind_shapes_strides device memory"); } @@ -566,6 +574,8 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, sycl::malloc_device(k, exec_q); if (packed_ind_offsets == nullptr) { + sycl::free(packed_ind_ptrs, exec_q); + sycl::free(packed_ind_shapes_strides, exec_q); throw std::runtime_error( "Unable to allocate packed_ind_offsets device memory"); } @@ -595,33 +605,29 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, std::copy(ind_offsets.begin(), ind_offsets.end(), host_ind_offsets_shp->begin()); - std::vector host_task_events(5); + std::vector host_task_events; + host_task_events.reserve(5); sycl::event packed_ind_ptrs_copy_ev = exec_q.copy( host_ind_ptrs_shp->data(), packed_ind_ptrs, host_ind_ptrs_shp->size()); - sycl::event ind_ptrs_host_task = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(packed_ind_ptrs_copy_ev); - cgh.host_task([host_ind_ptrs_shp]() {}); - }); - host_task_events.push_back(ind_ptrs_host_task); sycl::event packed_ind_shapes_strides_copy_ev = exec_q.copy( host_ind_shapes_strides_shp->data(), packed_ind_shapes_strides, host_ind_shapes_strides_shp->size()); - sycl::event ind_sh_st_host_task = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(packed_ind_shapes_strides_copy_ev); - cgh.host_task([host_ind_shapes_strides_shp]() {}); - }); - host_task_events.push_back(ind_sh_st_host_task); sycl::event packed_ind_offsets_copy_ev = exec_q.copy( host_ind_offsets_shp->data(), packed_ind_offsets, host_ind_offsets_shp->size()); - sycl::event ind_offsets_host_task = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(packed_ind_offsets_copy_ev); - cgh.host_task([host_ind_offsets_shp]() {}); - }); - host_task_events.push_back(ind_offsets_host_task); + + sycl::event shared_ptr_cleanup_host_task = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on({packed_ind_offsets_copy_ev, + packed_ind_shapes_strides_copy_ev, + packed_ind_ptrs_copy_ev}); + cgh.host_task([host_ind_offsets_shp, host_ind_shapes_strides_shp, + host_ind_ptrs_shp]() {}); + }); + host_task_events.push_back(shared_ptr_cleanup_host_task); std::vector ind_pack_depends{packed_ind_ptrs_copy_ev, packed_ind_shapes_strides_copy_ev, @@ -643,6 +649,10 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, sycl::malloc_device(3 * sh_elems, exec_q); if (packed_shapes_strides == nullptr) { + sycl::event::wait(host_task_events); + sycl::free(packed_ind_ptrs, exec_q); + sycl::free(packed_ind_shapes_strides, exec_q); + sycl::free(packed_ind_offsets, exec_q); throw std::runtime_error( "Unable to allocate packed_shapes_strides device memory"); } @@ -654,6 +664,11 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, sycl::malloc_device((2 * k) + ind_sh_elems, exec_q); if (packed_axes_shapes_strides == nullptr) { + sycl::event::wait(host_task_events); + sycl::free(packed_ind_ptrs, exec_q); + sycl::free(packed_ind_shapes_strides, exec_q); + sycl::free(packed_ind_offsets, exec_q); + sycl::free(packed_shapes_strides, exec_q); throw std::runtime_error( "Unable to allocate packed_axes_shapes_strides device memory"); } @@ -665,8 +680,9 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, is_src_f_contig, dst_shape, dst_strides, is_dst_c_contig, is_dst_f_contig, axis_start, k, ind_nd, src_nd, dst_nd); - std::vector all_deps(depends.size() + ind_pack_depends.size() + - src_dst_pack_deps.size()); + std::vector all_deps; + all_deps.reserve(depends.size() + ind_pack_depends.size() + + src_dst_pack_deps.size()); all_deps.insert(std::end(all_deps), std::begin(ind_pack_depends), std::end(ind_pack_depends)); 
all_deps.insert(std::end(all_deps), std::begin(src_dst_pack_deps), @@ -676,6 +692,12 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, auto fn = take_dispatch_table[mode][src_type_id][ind_type_id]; if (fn == nullptr) { + sycl::event::wait(host_task_events); + sycl::free(packed_ind_ptrs, exec_q); + sycl::free(packed_ind_shapes_strides, exec_q); + sycl::free(packed_ind_offsets, exec_q); + sycl::free(packed_shapes_strides, exec_q); + sycl::free(packed_axes_shapes_strides, exec_q); throw std::runtime_error("Indices must be integer type, got " + std::to_string(ind_type_id)); } @@ -687,7 +709,7 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, src_offset, dst_offset, packed_ind_offsets, all_deps); // free packed temporaries - exec_q.submit([&](sycl::handler &cgh) { + sycl::event temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(take_generic_ev); auto ctx = exec_q.get_context(); cgh.host_task([packed_shapes_strides, packed_axes_shapes_strides, @@ -700,12 +722,16 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, sycl::free(packed_ind_offsets, ctx); }); }); - host_task_events.push_back(take_generic_ev); - sycl::event host_task_ev = - keep_args_alive(exec_q, {src, py_ind, dst}, host_task_events); + sycl::event::wait(host_task_events); + sycl::event::wait({take_generic_ev, temporaries_cleanup_ev}); + + /* + sycl::event host_task_ev = keep_args_alive(exec_q, {src, py_ind, dst}, + {temporaries_cleanup_ev}); + */ - return std::make_pair(host_task_ev, take_generic_ev); + return std::make_pair(sycl::event(), temporaries_cleanup_ev); } std::pair @@ -951,6 +977,7 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, sycl::malloc_device((k + 1) * ind_sh_elems, exec_q); if (packed_ind_shapes_strides == nullptr) { + sycl::free(packed_ind_ptrs, exec_q); throw std::runtime_error( "Unable to allocate packed_ind_shapes_strides device memory"); } @@ -959,6 +986,8 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, sycl::malloc_device(k, exec_q); if (packed_ind_offsets == nullptr) { + sycl::free(packed_ind_ptrs, exec_q); + sycl::free(packed_ind_shapes_strides, exec_q); throw std::runtime_error( "Unable to allocate packed_ind_offsets device memory"); } @@ -988,7 +1017,8 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, std::copy(ind_offsets.begin(), ind_offsets.end(), host_ind_offsets_shp->begin()); - std::vector host_task_events(5); + std::vector host_task_events; + host_task_events.reserve(7); sycl::event device_ind_ptrs_copy_ev = exec_q.copy( host_ind_ptrs_shp->data(), packed_ind_ptrs, host_ind_ptrs_shp->size()); @@ -1036,6 +1066,10 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, sycl::malloc_device(3 * sh_elems, exec_q); if (packed_shapes_strides == nullptr) { + sycl::event::wait(host_task_events); + sycl::free(packed_ind_ptrs, exec_q); + sycl::free(packed_ind_shapes_strides, exec_q); + sycl::free(packed_ind_offsets, exec_q); throw std::runtime_error( "Unable to allocate packed_shapes_strides device memory"); } @@ -1047,6 +1081,11 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, sycl::malloc_device((2 * k) + ind_sh_elems, exec_q); if (packed_axes_shapes_strides == nullptr) { + sycl::event::wait(host_task_events); + sycl::free(packed_shapes_strides, exec_q); + sycl::free(packed_ind_ptrs, exec_q); + sycl::free(packed_ind_shapes_strides, exec_q); + sycl::free(packed_ind_offsets, exec_q); throw std::runtime_error( "Unable to allocate packed_axes_shapes_strides device memory"); } @@ -1070,6 +1109,13 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, auto fn = 
put_dispatch_table[mode][dst_type_id][ind_type_id]; if (fn == nullptr) { + sycl::event::wait(host_task_events); + sycl::free(packed_shapes_strides, exec_q); + sycl::free(packed_axes_shapes_strides, exec_q); + sycl::free(packed_ind_shapes_strides, exec_q); + sycl::free(packed_ind_ptrs, exec_q); + sycl::free(packed_ind_offsets, exec_q); + throw std::runtime_error("Indices must be integer type, got " + std::to_string(ind_type_id)); } @@ -1081,9 +1127,10 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, dst_offset, val_offset, packed_ind_offsets, all_deps); // free packed temporaries - auto ctx = exec_q.get_context(); - exec_q.submit([&](sycl::handler &cgh) { + + sycl::event temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(put_generic_ev); + auto ctx = exec_q.get_context(); cgh.host_task([packed_shapes_strides, packed_axes_shapes_strides, packed_ind_shapes_strides, packed_ind_ptrs, packed_ind_offsets, ctx]() { @@ -1094,11 +1141,17 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, sycl::free(packed_ind_offsets, ctx); }); }); - host_task_events.push_back(put_generic_ev); - return std::make_pair( - keep_args_alive(exec_q, {dst, py_ind, val}, host_task_events), - put_generic_ev); + sycl::event::wait(host_task_events); + sycl::event::wait({put_generic_ev, temporaries_cleanup_ev}); + + /* + sycl::event py_obj_cleanup_ev = + keep_args_alive(exec_q, {dst, py_ind, val}, + {put_generic_ev, temporaries_cleanup_ev}); + */ + + return std::make_pair(sycl::event(), temporaries_cleanup_ev); } void init_advanced_indexing_dispatch_tables(void) From f35734be1bdb6f670fc4377f9e60418b2350983a Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Wed, 1 Mar 2023 15:35:17 -0800 Subject: [PATCH 36/57] Simplified host_tasks in _put --- .../source/integer_advanced_indexing.cpp | 49 ++++++++----------- 1 file changed, 21 insertions(+), 28 deletions(-) diff --git a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp index 77c34f0a0b..b6ce79f4d0 100644 --- a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -599,22 +599,21 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, std::shared_ptr host_ind_offsets_shp = std::make_shared(k, ind_allocator); - std::copy(ind_sh_sts.begin(), ind_sh_sts.end(), - host_ind_shapes_strides_shp->begin()); - std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin()); - std::copy(ind_offsets.begin(), ind_offsets.end(), - host_ind_offsets_shp->begin()); - std::vector host_task_events; host_task_events.reserve(5); + std::copy(ind_sh_sts.begin(), ind_sh_sts.end(), + host_ind_shapes_strides_shp->begin()); sycl::event packed_ind_ptrs_copy_ev = exec_q.copy( host_ind_ptrs_shp->data(), packed_ind_ptrs, host_ind_ptrs_shp->size()); + std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin()); sycl::event packed_ind_shapes_strides_copy_ev = exec_q.copy( host_ind_shapes_strides_shp->data(), packed_ind_shapes_strides, host_ind_shapes_strides_shp->size()); + std::copy(ind_offsets.begin(), ind_offsets.end(), + host_ind_offsets_shp->begin()); sycl::event packed_ind_offsets_copy_ev = exec_q.copy( host_ind_offsets_shp->data(), packed_ind_offsets, host_ind_offsets_shp->size()); @@ -1011,40 +1010,34 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, std::shared_ptr host_ind_offsets_shp = std::make_shared(k, ind_allocator); - std::copy(ind_sh_sts.begin(), ind_sh_sts.end(), - 
host_ind_shapes_strides_shp->begin()); - std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin()); - std::copy(ind_offsets.begin(), ind_offsets.end(), - host_ind_offsets_shp->begin()); - std::vector host_task_events; - host_task_events.reserve(7); + host_task_events.reserve(5); + std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin()); sycl::event device_ind_ptrs_copy_ev = exec_q.copy( host_ind_ptrs_shp->data(), packed_ind_ptrs, host_ind_ptrs_shp->size()); - sycl::event ind_ptrs_host_task = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(device_ind_ptrs_copy_ev); - cgh.host_task([host_ind_ptrs_shp]() {}); - }); - host_task_events.push_back(ind_ptrs_host_task); + std::copy(ind_sh_sts.begin(), ind_sh_sts.end(), + host_ind_shapes_strides_shp->begin()); sycl::event device_ind_shapes_strides_copy_ev = exec_q.copy( host_ind_shapes_strides_shp->data(), packed_ind_shapes_strides, host_ind_shapes_strides_shp->size()); - sycl::event ind_sh_st_host_task = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(device_ind_shapes_strides_copy_ev); - cgh.host_task([host_ind_shapes_strides_shp]() {}); - }); - host_task_events.push_back(ind_sh_st_host_task); + std::copy(ind_offsets.begin(), ind_offsets.end(), + host_ind_offsets_shp->begin()); sycl::event device_ind_offsets_copy_ev = exec_q.copy( host_ind_offsets_shp->data(), packed_ind_offsets, host_ind_offsets_shp->size()); - sycl::event ind_offsets_host_task = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(device_ind_offsets_copy_ev); - cgh.host_task([host_ind_offsets_shp]() {}); - }); - host_task_events.push_back(ind_offsets_host_task); + + sycl::event shared_ptr_cleanup_host_task = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(device_ind_ptrs_copy_ev); + cgh.depends_on(device_ind_shapes_strides_copy_ev); + cgh.depends_on(device_ind_offsets_copy_ev); + cgh.host_task([host_ind_ptrs_shp, host_ind_shapes_strides_shp, + host_ind_offsets_shp]() {}); + }); + host_task_events.push_back(shared_ptr_cleanup_host_task); std::vector ind_pack_depends{device_ind_ptrs_copy_ev, device_ind_shapes_strides_copy_ev, From 7fee9e4867a22cf6770f9dc50740a32749bb7208 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Wed, 1 Mar 2023 17:05:27 -0800 Subject: [PATCH 37/57] Reordered copies in _take and _put - Segmentation fault occurred with other ordering --- .../source/integer_advanced_indexing.cpp | 44 ++++++++++--------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp index b6ce79f4d0..4a1b4e8ef0 100644 --- a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -599,21 +599,22 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, std::shared_ptr host_ind_offsets_shp = std::make_shared(k, ind_allocator); + std::copy(ind_sh_sts.begin(), ind_sh_sts.end(), + host_ind_shapes_strides_shp->begin()); + std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin()); + std::copy(ind_offsets.begin(), ind_offsets.end(), + host_ind_offsets_shp->begin()); + std::vector host_task_events; host_task_events.reserve(5); - std::copy(ind_sh_sts.begin(), ind_sh_sts.end(), - host_ind_shapes_strides_shp->begin()); sycl::event packed_ind_ptrs_copy_ev = exec_q.copy( host_ind_ptrs_shp->data(), packed_ind_ptrs, host_ind_ptrs_shp->size()); - std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin()); 
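
An aside on the hunk above, not part of the patch itself: the staging vectors such as host_ind_ptrs_shp are USM-host buffers that exec_q.copy reads asynchronously, so they must be fully populated before the copy is submitted; interleaving the std::copy calls after the submissions is the likely cause of the segmentation fault this commit message mentions. A minimal sketch of the intended pattern is below; it assumes the SYCL 2020 queue::copy and host_task APIs, and the names staged_copy, src and device_buf are illustrative only.

#include <sycl/sycl.hpp> // may be <CL/sycl.hpp> on older DPC++ toolchains
#include <algorithm>
#include <cstdint>
#include <memory>
#include <vector>

// Sketch of the copy-staging pattern used in _take/_put: fill the USM-host
// staging vector first, then submit the asynchronous host->device copy,
// and keep the vector alive with a host_task that depends on the copy.
void staged_copy(sycl::queue exec_q,
                 const std::vector<std::int64_t> &src,
                 std::int64_t *device_buf)
{
    using alloc_t = sycl::usm_allocator<std::int64_t, sycl::usm::alloc::host>;
    using vec_t = std::vector<std::int64_t, alloc_t>;

    auto host_buf = std::make_shared<vec_t>(src.size(), alloc_t(exec_q));
    std::copy(src.begin(), src.end(), host_buf->begin()); // 1. populate first
    sycl::event copy_ev = exec_q.copy<std::int64_t>(
        host_buf->data(), device_buf, host_buf->size());  // 2. then submit copy
    exec_q.submit([&](sycl::handler &cgh) {                // 3. lifetime guard
        cgh.depends_on(copy_ev);
        cgh.host_task([host_buf]() {}); // holds shared_ptr until copy completes
    });
}
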
sycl::event packed_ind_shapes_strides_copy_ev = exec_q.copy( host_ind_shapes_strides_shp->data(), packed_ind_shapes_strides, host_ind_shapes_strides_shp->size()); - std::copy(ind_offsets.begin(), ind_offsets.end(), - host_ind_offsets_shp->begin()); sycl::event packed_ind_offsets_copy_ev = exec_q.copy( host_ind_offsets_shp->data(), packed_ind_offsets, host_ind_offsets_shp->size()); @@ -1010,38 +1011,39 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, std::shared_ptr host_ind_offsets_shp = std::make_shared(k, ind_allocator); + std::copy(ind_sh_sts.begin(), ind_sh_sts.end(), + host_ind_shapes_strides_shp->begin()); + std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin()); + std::copy(ind_offsets.begin(), ind_offsets.end(), + host_ind_offsets_shp->begin()); + std::vector host_task_events; host_task_events.reserve(5); - std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin()); - sycl::event device_ind_ptrs_copy_ev = exec_q.copy( + sycl::event packed_ind_ptrs_copy_ev = exec_q.copy( host_ind_ptrs_shp->data(), packed_ind_ptrs, host_ind_ptrs_shp->size()); - std::copy(ind_sh_sts.begin(), ind_sh_sts.end(), - host_ind_shapes_strides_shp->begin()); - sycl::event device_ind_shapes_strides_copy_ev = exec_q.copy( + sycl::event packed_ind_shapes_strides_copy_ev = exec_q.copy( host_ind_shapes_strides_shp->data(), packed_ind_shapes_strides, host_ind_shapes_strides_shp->size()); - std::copy(ind_offsets.begin(), ind_offsets.end(), - host_ind_offsets_shp->begin()); - sycl::event device_ind_offsets_copy_ev = exec_q.copy( + sycl::event packed_ind_offsets_copy_ev = exec_q.copy( host_ind_offsets_shp->data(), packed_ind_offsets, host_ind_offsets_shp->size()); sycl::event shared_ptr_cleanup_host_task = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(device_ind_ptrs_copy_ev); - cgh.depends_on(device_ind_shapes_strides_copy_ev); - cgh.depends_on(device_ind_offsets_copy_ev); - cgh.host_task([host_ind_ptrs_shp, host_ind_shapes_strides_shp, - host_ind_offsets_shp]() {}); + cgh.depends_on({packed_ind_offsets_copy_ev, + packed_ind_shapes_strides_copy_ev, + packed_ind_ptrs_copy_ev}); + cgh.host_task([host_ind_offsets_shp, host_ind_shapes_strides_shp, + host_ind_ptrs_shp]() {}); }); host_task_events.push_back(shared_ptr_cleanup_host_task); - std::vector ind_pack_depends{device_ind_ptrs_copy_ev, - device_ind_shapes_strides_copy_ev, - device_ind_offsets_copy_ev}; + std::vector ind_pack_depends{packed_ind_ptrs_copy_ev, + packed_ind_shapes_strides_copy_ev, + packed_ind_offsets_copy_ev}; bool is_dst_c_contig = dst.is_c_contiguous(); bool is_dst_f_contig = dst.is_f_contiguous(); From d5a49c286754e23b2cac4d146643bac8cf9b717a Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 1 Mar 2023 20:58:44 -0600 Subject: [PATCH 38/57] Reordered waits --- dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp index 4a1b4e8ef0..440dd5a332 100644 --- a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -723,8 +723,8 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, }); }); - sycl::event::wait(host_task_events); sycl::event::wait({take_generic_ev, temporaries_cleanup_ev}); + sycl::event::wait(host_task_events); /* sycl::event host_task_ev = keep_args_alive(exec_q, {src, py_ind, dst}, @@ -1137,8 +1137,8 @@ 
usm_ndarray_put(dpctl::tensor::usm_ndarray dst, }); }); - sycl::event::wait(host_task_events); sycl::event::wait({put_generic_ev, temporaries_cleanup_ev}); + sycl::event::wait(host_task_events); /* sycl::event py_obj_cleanup_ev = From f06dde5bfacf2c6fea4ce819f27c86ac17ba2d91 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 1 Mar 2023 22:32:19 -0600 Subject: [PATCH 39/57] Add wait for every host task submitted. --- dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp index 440dd5a332..53450d2a8a 100644 --- a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -229,6 +229,7 @@ std::vector _populate_packed_shapes_strides_for_indexing( cgh.host_task([packed_host_axes_shapes_strides_shp, packed_host_shapes_strides_shp]() {}); }); + clean_up_host_task_ev.wait(); host_task_events.push_back(clean_up_host_task_ev); std::vector v = {device_orthog_shapes_strides_copy_ev, @@ -282,6 +283,7 @@ std::vector _populate_packed_shapes_strides_for_indexing( cgh.depends_on(device_axes_shapes_strides_copy_ev); cgh.host_task([packed_host_axes_shapes_strides_shp]() {}); }); + clean_up_host_task_ev.wait(); host_task_events.push_back(clean_up_host_task_ev); std::vector v = {device_orthog_shapes_strides_fill_ev, @@ -627,6 +629,7 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, cgh.host_task([host_ind_offsets_shp, host_ind_shapes_strides_shp, host_ind_ptrs_shp]() {}); }); + shared_ptr_cleanup_host_task.wait(); host_task_events.push_back(shared_ptr_cleanup_host_task); std::vector ind_pack_depends{packed_ind_ptrs_copy_ev, @@ -1039,6 +1042,7 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, cgh.host_task([host_ind_offsets_shp, host_ind_shapes_strides_shp, host_ind_ptrs_shp]() {}); }); + shared_ptr_cleanup_host_task.wait(); host_task_events.push_back(shared_ptr_cleanup_host_task); std::vector ind_pack_depends{packed_ind_ptrs_copy_ev, From 1387634077e37062b3f7b572dea8d84c12b02ffd Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Tue, 28 Feb 2023 15:14:20 -0800 Subject: [PATCH 40/57] Advanced indices don't broadcast if 1 array passed - _mock removed from indexing methods --- dpctl/tensor/_copy_utils.py | 15 ++++++++++----- dpctl/tensor/_indexing_functions.py | 14 +++++++++----- dpctl/tensor/_usmarray.pyx | 12 ++++++------ 3 files changed, 25 insertions(+), 16 deletions(-) diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py index 382d92bb79..079f02fe52 100644 --- a/dpctl/tensor/_copy_utils.py +++ b/dpctl/tensor/_copy_utils.py @@ -430,7 +430,7 @@ def _mock_nonzero(ary): return tuple(dpt.asarray(i, usm_type=usm_type, sycl_queue=q) for i in nz) -def _mock_take_multi_index(ary, inds, p): +def _take_multi_index(ary, inds, p): if not isinstance(ary, dpt.usm_ndarray): raise TypeError queues_ = [ @@ -439,6 +439,8 @@ def _mock_take_multi_index(ary, inds, p): usm_types_ = [ ary.usm_type, ] + if not isinstance(inds, list) and not isinstance(inds, tuple): + inds = (inds,) all_integers = True for ind in inds: queues_.append(ind.sycl_queue) @@ -452,7 +454,8 @@ def _mock_take_multi_index(ary, inds, p): raise IndexError( "arrays used as indices must be of integer (or boolean) type" ) - inds = dpt.broadcast_arrays(*inds) + if (len(inds) > 1): + inds = dpt.broadcast_arrays(*inds) ary_ndim = ary.ndim if ary_ndim > 0: p = operator.index(p) @@ -505,7 
+508,7 @@ def _mock_place(ary, ary_mask, p, vals): return -def _mock_put_multi_index(ary, inds, p, vals): +def _put_multi_index(ary, inds, p, vals): if isinstance(vals, dpt.usm_ndarray): queues_ = [ary.sycl_queue, vals.sycl_queue] usm_types_ = [ary.usm_type, vals.usm_type] @@ -516,6 +519,8 @@ def _mock_put_multi_index(ary, inds, p, vals): usm_types_ = [ ary.usm_type, ] + if not isinstance(inds, list) and not isinstance(inds, tuple): + inds = (inds,) all_integers = True for ind in inds: if not isinstance(ind, dpt.usm_ndarray): @@ -536,8 +541,8 @@ def _mock_put_multi_index(ary, inds, p, vals): raise IndexError( "arrays used as indices must be of integer (or boolean) type" ) - - inds = dpt.broadcast_arrays(*inds) + if (len(inds) > 1): + inds = dpt.broadcast_arrays(*inds) ary_ndim = ary.ndim if ary_ndim > 0: p = operator.index(p) diff --git a/dpctl/tensor/_indexing_functions.py b/dpctl/tensor/_indexing_functions.py index 90718f4559..23d2c4d637 100644 --- a/dpctl/tensor/_indexing_functions.py +++ b/dpctl/tensor/_indexing_functions.py @@ -79,7 +79,8 @@ def take(x, indices, /, *, axis=None, mode="clip"): ) axis = 0 - indices = dpt.broadcast_arrays(*indices) + if len(indices) > 1: + indices = dpt.broadcast_arrays(*indices) if x_ndim > 0: axis = operator.index(axis) axis = normalize_axis_index(axis, x_ndim) @@ -149,10 +150,13 @@ def put(x, indices, vals, /, *, axis=None, mode="clip"): # when axis is none, array is treated as 1D if axis is None: - x = dpt.reshape(x, (x.size,), copy=False) - axis = 0 - - indices = dpt.broadcast_arrays(*indices) + try: + x = dpt.reshape(x, (x.size,), copy=False) + axis = 0 + except ValueError: + raise ValueError("Cannot create 1D view of array") + if len(indices) > 1: + indices = dpt.broadcast_arrays(*indices) x_ndim = x.ndim if x_ndim > 0: axis = operator.index(axis) diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx index 3c42c96dd5..5d83a86c62 100644 --- a/dpctl/tensor/_usmarray.pyx +++ b/dpctl/tensor/_usmarray.pyx @@ -673,7 +673,7 @@ cdef class usm_ndarray: from ._copy_utils import ( _mock_extract, _mock_nonzero, - _mock_take_multi_index, + _take_multi_index, ) if len(adv_ind) == 1 and adv_ind[0].dtype == dpt_bool: return _mock_extract(res, adv_ind[0], adv_ind_start_p) @@ -685,9 +685,9 @@ cdef class usm_ndarray: adv_ind_int.extend(_mock_nonzero(ind)) else: adv_ind_int.append(ind) - return _mock_take_multi_index(res, tuple(adv_ind_int), adv_ind_start_p) + return _take_multi_index(res, tuple(adv_ind_int), adv_ind_start_p) - return _mock_take_multi_index(res, adv_ind, adv_ind_start_p) + return _take_multi_index(res, adv_ind, adv_ind_start_p) def to_device(self, target): @@ -1021,7 +1021,7 @@ cdef class usm_ndarray: _copy_from_usm_ndarray_to_usm_ndarray, _mock_nonzero, _mock_place, - _mock_put_multi_index, + _put_multi_index, ) adv_ind = _meta[3] @@ -1064,10 +1064,10 @@ cdef class usm_ndarray: adv_ind_int.extend(_mock_nonzero(ind)) else: adv_ind_int.append(ind) - _mock_put_multi_index(Xv, tuple(adv_ind_int), adv_ind_start_p, rhs) + _put_multi_index(Xv, tuple(adv_ind_int), adv_ind_start_p, rhs) return - _mock_put_multi_index(Xv, adv_ind, adv_ind_start_p, rhs) + _put_multi_index(Xv, adv_ind, adv_ind_start_p, rhs) return From d0eb7cff395e6c88e337eae40630e56e44073ba5 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 2 Mar 2023 09:17:10 -0800 Subject: [PATCH 41/57] Take and put tweaks - take_multi_index and put_multi_index logic for 0D arrays removed, adjusted a test accordingly - take, put, take_multi_index, and put_multi_index axis type check 
and normalization only reassigns axis once --- dpctl/tensor/_copy_utils.py | 21 +++++++-------------- dpctl/tensor/_indexing_functions.py | 8 +++----- dpctl/tests/test_usm_ndarray_indexing.py | 3 ++- 3 files changed, 12 insertions(+), 20 deletions(-) diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py index 079f02fe52..597db87c49 100644 --- a/dpctl/tensor/_copy_utils.py +++ b/dpctl/tensor/_copy_utils.py @@ -454,16 +454,12 @@ def _take_multi_index(ary, inds, p): raise IndexError( "arrays used as indices must be of integer (or boolean) type" ) - if (len(inds) > 1): + if len(inds) > 1: inds = dpt.broadcast_arrays(*inds) ary_ndim = ary.ndim - if ary_ndim > 0: - p = operator.index(p) - p = normalize_axis_index(p, ary_ndim) + p = normalize_axis_index(operator.index(p), ary_ndim) - res_shape = ary.shape[:p] + inds[0].shape + ary.shape[p + len(inds) :] - else: - res_shape = inds[0].shape + res_shape = ary.shape[:p] + inds[0].shape + ary.shape[p + len(inds) :] res_usm_type = dpctl.utils.get_coerced_usm_type(usm_types_) res = dpt.empty( res_shape, dtype=ary.dtype, usm_type=res_usm_type, sycl_queue=exec_q @@ -541,15 +537,12 @@ def _put_multi_index(ary, inds, p, vals): raise IndexError( "arrays used as indices must be of integer (or boolean) type" ) - if (len(inds) > 1): + if len(inds) > 1: inds = dpt.broadcast_arrays(*inds) ary_ndim = ary.ndim - if ary_ndim > 0: - p = operator.index(p) - p = normalize_axis_index(p, ary_ndim) - vals_shape = ary.shape[:p] + inds[0].shape + ary.shape[p + len(inds) :] - else: - vals_shape = inds[0].shape + + p = normalize_axis_index(operator.index(p), ary_ndim) + vals_shape = ary.shape[:p] + inds[0].shape + ary.shape[p + len(inds) :] vals_usm_type = dpctl.utils.get_coerced_usm_type(usm_types_) if not isinstance(vals, dpt.usm_ndarray): diff --git a/dpctl/tensor/_indexing_functions.py b/dpctl/tensor/_indexing_functions.py index 23d2c4d637..12f7b2d72e 100644 --- a/dpctl/tensor/_indexing_functions.py +++ b/dpctl/tensor/_indexing_functions.py @@ -82,8 +82,7 @@ def take(x, indices, /, *, axis=None, mode="clip"): if len(indices) > 1: indices = dpt.broadcast_arrays(*indices) if x_ndim > 0: - axis = operator.index(axis) - axis = normalize_axis_index(axis, x_ndim) + axis = normalize_axis_index(operator.index(axis), x_ndim) res_shape = ( x.shape[:axis] + indices[0].shape + x.shape[axis + len(indices) :] ) @@ -154,13 +153,12 @@ def put(x, indices, vals, /, *, axis=None, mode="clip"): x = dpt.reshape(x, (x.size,), copy=False) axis = 0 except ValueError: - raise ValueError("Cannot create 1D view of array") + raise ValueError("Cannot create 1D view of input array") if len(indices) > 1: indices = dpt.broadcast_arrays(*indices) x_ndim = x.ndim if x_ndim > 0: - axis = operator.index(axis) - axis = normalize_axis_index(axis, x_ndim) + axis = normalize_axis_index(operator.index(axis), x_ndim) val_shape = ( x.shape[:axis] + indices[0].shape + x.shape[axis + len(indices) :] diff --git a/dpctl/tests/test_usm_ndarray_indexing.py b/dpctl/tests/test_usm_ndarray_indexing.py index 45501afbac..98bb674b21 100644 --- a/dpctl/tests/test_usm_ndarray_indexing.py +++ b/dpctl/tests/test_usm_ndarray_indexing.py @@ -546,7 +546,8 @@ def test_put_0d_val(data_dt): assert_array_equal(np.asarray(2, dtype=data_dt), dpt.asnumpy(x[0])) x = dpt.asarray(5, dtype=data_dt, sycl_queue=q) - x[ind] = 2 + val = 2 + dpt.put(x, ind, val) assert_array_equal(np.asarray(2, dtype=data_dt), dpt.asnumpy(x)) From 1e6794308b760e2282856076351b49ebbb349115 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 2 
Mar 2023 09:21:47 -0800 Subject: [PATCH 42/57] Fixed WrapIndex class returning negative indices --- .../libtensor/include/kernels/integer_advanced_indexing.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp index a239691c80..5258b3e4f5 100644 --- a/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp @@ -68,7 +68,7 @@ class WrapIndex void operator()(py::ssize_t max_item, py::ssize_t &ind) const { max_item = std::max(max_item, 1); - ind = ind % max_item; + ind = (ind < 0) ? ind % max_item + max_item : ind % max_item; return; } }; From ac9072f63061f0a79c6e8cc77d62080c17e3d8bb Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 2 Mar 2023 09:35:33 -0800 Subject: [PATCH 43/57] Import formatting corrected in usm_ndarray getitem --- dpctl/tensor/_usmarray.pyx | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx index 5d83a86c62..70c8eadeda 100644 --- a/dpctl/tensor/_usmarray.pyx +++ b/dpctl/tensor/_usmarray.pyx @@ -670,11 +670,7 @@ cdef class usm_ndarray: if adv_ind_start_p < 0: return res - from ._copy_utils import ( - _mock_extract, - _mock_nonzero, - _take_multi_index, - ) ++ from ._copy_utils import _mock_extract, _mock_nonzero, _take_multi_index if len(adv_ind) == 1 and adv_ind[0].dtype == dpt_bool: return _mock_extract(res, adv_ind[0], adv_ind_start_p) From d47fbf03107a44b05fd531c92b4f98bf4d7df8e4 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 2 Mar 2023 09:45:14 -0800 Subject: [PATCH 44/57] Whitespace in usm_ndarray getitem imports --- dpctl/tensor/_usmarray.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx index 70c8eadeda..64a492065f 100644 --- a/dpctl/tensor/_usmarray.pyx +++ b/dpctl/tensor/_usmarray.pyx @@ -670,7 +670,7 @@ cdef class usm_ndarray: if adv_ind_start_p < 0: return res -+ from ._copy_utils import _mock_extract, _mock_nonzero, _take_multi_index + from ._copy_utils import _mock_extract, _mock_nonzero, _take_multi_index if len(adv_ind) == 1 and adv_ind[0].dtype == dpt_bool: return _mock_extract(res, adv_ind[0], adv_ind_start_p) From db84c42b133effcb5eafd04dee3f72ecdce81c1a Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 2 Mar 2023 12:00:36 -0800 Subject: [PATCH 45/57] Refactored advanced_indexing to 1 host_task --- .../source/integer_advanced_indexing.cpp | 619 +++++------------- 1 file changed, 172 insertions(+), 447 deletions(-) diff --git a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp index 53450d2a8a..dfc74c12f0 100644 --- a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -68,228 +68,135 @@ using dpctl::tensor::f_contiguous_strides; using dpctl::utils::keep_args_alive; -std::vector _populate_packed_shapes_strides_for_indexing( - sycl::queue exec_q, - std::vector &host_task_events, - py::ssize_t *device_orthog_shapes_strides, - py::ssize_t *device_axes_shapes_strides, - const py::ssize_t *inp_shape, - const py::ssize_t *inp_strides, - bool is_inp_c_contig, - bool is_inp_f_contig, - const py::ssize_t *arr_shape, - const py::ssize_t *arr_strides, - bool is_arr_c_contig, - bool is_arr_f_contig, - 
int axis_start, - int k, - int ind_nd, - int inp_nd, - int arr_nd) +std::vector +_populate_kernel_params(sycl::queue exec_q, + std::vector &host_task_events, + char **device_ind_ptrs, + py::ssize_t *device_ind_sh_st, + py::ssize_t *device_ind_offsets, + py::ssize_t *device_orthog_sh_st, + py::ssize_t *device_along_sh_st, + const py::ssize_t *inp_shape, + std::vector &inp_strides, + std::vector &arr_strides, + std::vector &ind_sh_sts, + std::vector &ind_ptrs, + std::vector &ind_offsets, + int axis_start, + int k, + int ind_nd, + int inp_nd, + int orthog_sh_elems, + int ind_sh_elems) { - int orthog_sh_elems = std::max(inp_nd - k, 1); - int along_sh_elems = std::max(ind_nd, 1); + using usm_host_allocator_T = + sycl::usm_allocator; + using ptrT = std::vector; + + usm_host_allocator_T ptr_allocator(exec_q); + std::shared_ptr host_ind_ptrs_shp = + std::make_shared(k, ptr_allocator); using usm_host_allocatorT = sycl::usm_allocator; using shT = std::vector; - usm_host_allocatorT allocator(exec_q); - std::shared_ptr packed_host_shapes_strides_shp = - std::make_shared(3 * orthog_sh_elems, 0, allocator); + usm_host_allocatorT sz_allocator(exec_q); + std::shared_ptr host_ind_sh_st_shp = + std::make_shared(ind_sh_elems * (k + 1), sz_allocator); - std::shared_ptr packed_host_axes_shapes_strides_shp = - std::make_shared(2 * k + along_sh_elems, 0, allocator); + std::shared_ptr host_ind_offsets_shp = + std::make_shared(k, sz_allocator); - if (inp_nd > 0) { - std::copy(inp_shape, inp_shape + axis_start, - packed_host_shapes_strides_shp->begin()); - std::copy(inp_shape + axis_start + k, inp_shape + inp_nd, - packed_host_shapes_strides_shp->begin() + axis_start); - std::copy(inp_shape + axis_start, inp_shape + axis_start + k, - packed_host_axes_shapes_strides_shp->begin()); - - // contract axes by using two copies - if (inp_strides == nullptr) { - if (is_inp_c_contig) { - const auto &inp_contig_strides = - c_contiguous_strides(inp_nd, inp_shape); - std::copy(inp_contig_strides.begin(), - inp_contig_strides.begin() + axis_start, - packed_host_shapes_strides_shp->begin() + - orthog_sh_elems); - std::copy(inp_contig_strides.begin() + axis_start + k, - inp_contig_strides.end(), - packed_host_shapes_strides_shp->begin() + - orthog_sh_elems + axis_start); - std::copy(inp_contig_strides.begin() + axis_start, - inp_contig_strides.begin() + axis_start + k, - packed_host_axes_shapes_strides_shp->begin() + k); - } - else if (is_inp_f_contig) { - const auto &inp_contig_strides = - f_contiguous_strides(inp_nd, inp_shape); - std::copy(inp_contig_strides.begin(), - inp_contig_strides.begin() + axis_start, - packed_host_shapes_strides_shp->begin() + - orthog_sh_elems); - std::copy(inp_contig_strides.begin() + axis_start + k, - inp_contig_strides.end(), - packed_host_shapes_strides_shp->begin() + - orthog_sh_elems + axis_start); - std::copy(inp_contig_strides.begin() + axis_start, - inp_contig_strides.begin() + axis_start + k, - packed_host_axes_shapes_strides_shp->begin() + k); - } - else { - // FIXME: this pointer was not allocated in this function - // the caller should be freeing it - sycl::free(device_orthog_shapes_strides, exec_q); - throw std::runtime_error("Invalid array encountered"); - } - } - else { - std::copy(inp_strides, inp_strides + axis_start, - packed_host_shapes_strides_shp->begin() + - orthog_sh_elems); - std::copy(inp_strides + axis_start + k, inp_strides + inp_nd, - packed_host_shapes_strides_shp->begin() + - orthog_sh_elems + axis_start); - std::copy(inp_strides + axis_start, inp_strides + axis_start + 
k, - packed_host_axes_shapes_strides_shp->begin() + k); - } + std::shared_ptr host_orthog_sh_st_shp = + std::make_shared(3 * orthog_sh_elems, sz_allocator); - if (arr_strides == nullptr) { - if (is_arr_c_contig) { - const auto &arr_contig_strides = - c_contiguous_strides(arr_nd, arr_shape); - std::copy(arr_contig_strides.begin(), - arr_contig_strides.begin() + axis_start, - packed_host_shapes_strides_shp->begin() + - 2 * orthog_sh_elems); - std::copy(arr_contig_strides.begin() + axis_start + ind_nd, - arr_contig_strides.end(), - packed_host_shapes_strides_shp->begin() + - 2 * orthog_sh_elems + axis_start); - std::copy(arr_contig_strides.begin() + axis_start, - arr_contig_strides.begin() + axis_start + ind_nd, - packed_host_axes_shapes_strides_shp->begin() + 2 * k); - } - else if (is_arr_f_contig) { - const auto &arr_contig_strides = - f_contiguous_strides(arr_nd, arr_shape); - std::copy(arr_contig_strides.begin(), - arr_contig_strides.begin() + axis_start, - packed_host_shapes_strides_shp->begin() + - 2 * orthog_sh_elems); - std::copy(arr_contig_strides.begin() + axis_start + ind_nd, - arr_contig_strides.end(), - packed_host_shapes_strides_shp->begin() + - 2 * orthog_sh_elems + axis_start); - std::copy(arr_contig_strides.begin() + axis_start, - arr_contig_strides.begin() + axis_start + ind_nd, - packed_host_axes_shapes_strides_shp->begin() + 2 * k); - } - else { - // FIXME: this pointer was not allocated in this function - // the caller should be freeing it - sycl::free(device_orthog_shapes_strides, exec_q); - throw std::runtime_error("Invalid array encountered"); - } + std::shared_ptr host_along_sh_st_shp = + std::make_shared(2 * k + ind_sh_elems, sz_allocator); + + std::copy(ind_sh_sts.begin(), ind_sh_sts.end(), + host_ind_sh_st_shp->begin()); + std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin()); + std::copy(ind_offsets.begin(), ind_offsets.end(), + host_ind_offsets_shp->begin()); + + sycl::event device_ind_ptrs_copy_ev = exec_q.copy( + host_ind_ptrs_shp->data(), device_ind_ptrs, host_ind_ptrs_shp->size()); + + sycl::event device_ind_sh_st_copy_ev = + exec_q.copy(host_ind_sh_st_shp->data(), device_ind_sh_st, + host_ind_sh_st_shp->size()); + + sycl::event device_ind_offsets_copy_ev = exec_q.copy( + host_ind_offsets_shp->data(), device_ind_offsets, + host_ind_offsets_shp->size()); + + int orthog_nd = inp_nd - k; + + if (orthog_nd > 0) { + if (axis_start > 0) { + std::copy(inp_shape, inp_shape + axis_start, + host_orthog_sh_st_shp->begin()); + std::copy(inp_strides.begin(), inp_strides.begin() + axis_start, + host_orthog_sh_st_shp->begin() + orthog_sh_elems); + std::copy(arr_strides.begin(), arr_strides.begin() + axis_start, + host_orthog_sh_st_shp->begin() + 2 * orthog_sh_elems); } - else { - std::copy(arr_strides, arr_strides + axis_start, - packed_host_shapes_strides_shp->begin() + - 2 * orthog_sh_elems); - std::copy(arr_strides + axis_start + ind_nd, arr_strides + arr_nd, - packed_host_shapes_strides_shp->begin() + - 2 * orthog_sh_elems + axis_start); - std::copy(arr_strides + axis_start, - arr_strides + axis_start + ind_nd, - packed_host_axes_shapes_strides_shp->begin() + 2 * k); + if (inp_nd > (axis_start + k)) { + std::copy(inp_shape + axis_start + k, inp_shape + inp_nd, + host_orthog_sh_st_shp->begin() + axis_start); + std::copy(inp_strides.begin() + axis_start + k, inp_strides.end(), + host_orthog_sh_st_shp->begin() + orthog_sh_elems + + axis_start); + + std::copy(arr_strides.begin() + axis_start + ind_nd, + arr_strides.end(), + host_orthog_sh_st_shp->begin() + 
2 * orthog_sh_elems + + axis_start); } + } + + if (inp_nd > 0) { + std::copy(inp_shape + axis_start, inp_shape + axis_start + k, + host_along_sh_st_shp->begin()); - // copy packed shapes and strides from host to devices - sycl::event device_orthog_shapes_strides_copy_ev = - exec_q.copy(packed_host_shapes_strides_shp->data(), - device_orthog_shapes_strides, - packed_host_shapes_strides_shp->size()); - - sycl::event device_axes_shapes_strides_copy_ev = - exec_q.copy( - packed_host_axes_shapes_strides_shp->data(), - device_axes_shapes_strides, - packed_host_axes_shapes_strides_shp->size()); - - sycl::event clean_up_host_task_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(device_axes_shapes_strides_copy_ev); - cgh.depends_on(device_orthog_shapes_strides_copy_ev); - cgh.host_task([packed_host_axes_shapes_strides_shp, - packed_host_shapes_strides_shp]() {}); - }); - clean_up_host_task_ev.wait(); - host_task_events.push_back(clean_up_host_task_ev); - - std::vector v = {device_orthog_shapes_strides_copy_ev, - device_axes_shapes_strides_copy_ev}; - return v; + std::copy(inp_strides.begin() + axis_start, + inp_strides.begin() + axis_start + k, + host_along_sh_st_shp->begin() + k); } - else { - // no orthogonal dimensions - sycl::event device_orthog_shapes_strides_fill_ev = - exec_q.fill(device_orthog_shapes_strides, - py::ssize_t(0), 3); - - packed_host_axes_shapes_strides_shp->insert( - packed_host_axes_shapes_strides_shp->end(), py::ssize_t(0), 2); - if (arr_strides == nullptr) { - if (is_arr_c_contig) { - const auto &arr_contig_strides = - c_contiguous_strides(arr_nd, arr_shape); - std::copy(arr_contig_strides.begin() + axis_start, - arr_contig_strides.begin() + axis_start + ind_nd, - packed_host_axes_shapes_strides_shp->begin() + 2); - } - else if (is_arr_f_contig) { - const auto &arr_contig_strides = - f_contiguous_strides(arr_nd, arr_shape); - std::copy(arr_contig_strides.begin() + axis_start, - arr_contig_strides.begin() + axis_start + ind_nd, - packed_host_axes_shapes_strides_shp->begin() + 2); - } - else { - // FIXME: memory was not allocated in this function - // it should be freed by the caller - sycl::free(device_orthog_shapes_strides, exec_q); - throw std::runtime_error("Invalid array encountered"); - } - } - else { - std::copy(arr_strides + axis_start, - arr_strides + axis_start + ind_nd, - packed_host_axes_shapes_strides_shp->begin() + 2); - } - sycl::event device_axes_shapes_strides_copy_ev = - exec_q.copy( - packed_host_axes_shapes_strides_shp->data(), - device_axes_shapes_strides, - packed_host_axes_shapes_strides_shp->size()); - - sycl::event clean_up_host_task_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(device_axes_shapes_strides_copy_ev); - cgh.host_task([packed_host_axes_shapes_strides_shp]() {}); - }); - clean_up_host_task_ev.wait(); - host_task_events.push_back(clean_up_host_task_ev); - - std::vector v = {device_orthog_shapes_strides_fill_ev, - device_axes_shapes_strides_copy_ev}; - return v; + if (ind_nd > 0) { + std::copy(arr_strides.begin() + axis_start, + arr_strides.begin() + axis_start + ind_nd, + host_along_sh_st_shp->begin() + 2 * k); } + + sycl::event device_orthog_sh_st_copy_ev = exec_q.copy( + host_orthog_sh_st_shp->data(), device_orthog_sh_st, + host_orthog_sh_st_shp->size()); + + sycl::event device_along_sh_st_copy_ev = exec_q.copy( + host_along_sh_st_shp->data(), device_along_sh_st, + host_along_sh_st_shp->size()); + + sycl::event shared_ptr_cleanup_host_task = + exec_q.submit([&](sycl::handler &cgh) { + 
cgh.depends_on({device_along_sh_st_copy_ev, + device_orthog_sh_st_copy_ev, + device_ind_offsets_copy_ev, + device_ind_sh_st_copy_ev, device_ind_ptrs_copy_ev}); + cgh.host_task([host_ind_offsets_shp, host_ind_sh_st_shp, + host_ind_ptrs_shp, host_orthog_sh_st_shp, + host_along_sh_st_shp]() {}); + }); + host_task_events.push_back(shared_ptr_cleanup_host_task); + + std::vector sh_st_pack_deps{ + device_ind_ptrs_copy_ev, device_ind_sh_st_copy_ev, + device_ind_offsets_copy_ev, device_orthog_sh_st_copy_ev, + device_along_sh_st_copy_ev}; + return sh_st_pack_deps; } /* Utility to parse python object py_ind into vector of `usm_ndarray`s */ @@ -357,7 +264,7 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, int dst_nd = dst.get_ndim(); int ind_nd = ind_rep.get_ndim(); - auto sh_elems = (src_nd > 0) ? src_nd : 1; + auto sh_elems = std::max(src_nd, 1); if (axis_start + k > sh_elems) { throw py::value_error("Axes are out of range for array of dimension " + @@ -379,8 +286,6 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, const py::ssize_t *src_shape = src.get_shape_raw(); const py::ssize_t *dst_shape = dst.get_shape_raw(); - int orthog_nd = std::max(src_nd - k, 1); - bool orthog_shapes_equal(true); size_t orthog_nelems(1); for (int i = 0; i < (src_nd - k); ++i) { @@ -471,9 +376,9 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, std::vector ind_offsets; ind_offsets.reserve(k); - std::vector ind_sh_sts((k + 1) * ind_sh_elems, py::ssize_t(0)); + std::vector ind_sh_sts((k + 1) * ind_sh_elems, 0); if (ind_nd > 0) { - std::copy(ind_shape, ind_shape + ind_sh_elems, ind_sh_sts.begin()); + std::copy(ind_shape, ind_shape + ind_nd, ind_sh_sts.begin()); } for (int i = 0; i < k; ++i) { dpctl::tensor::usm_ndarray ind_ = ind[i]; @@ -520,31 +425,9 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, // strides are initialized to 0 for 0D indices, so skip here if (ind_nd > 0) { - const py::ssize_t *ind_strides = ind_.get_strides_raw(); - if (ind_strides == nullptr) { - if (ind_.is_c_contiguous()) { - const auto &ind_contig_strides_ = - c_contiguous_strides(ind_nd, ind_shape); - std::copy(ind_contig_strides_.begin(), - ind_contig_strides_.end(), - ind_sh_sts.begin() + (i + 1) * ind_nd); - } - else if (ind_.is_f_contiguous()) { - const auto &ind_contig_strides_ = - f_contiguous_strides(ind_nd, ind_shape); - std::copy(ind_contig_strides_.begin(), - ind_contig_strides_.end(), - ind_sh_sts.begin() + (i + 1) * ind_nd); - } - else { - throw std::runtime_error( - "Invalid ind array encountered in: take function"); - } - } - else { - std::copy(ind_strides, ind_strides + ind_nd, - ind_sh_sts.begin() + (i + 1) * ind_nd); - } + auto ind_strides = ind_.get_strides_vector(); + std::copy(ind_strides.begin(), ind_strides.end(), + ind_sh_sts.begin() + (i + 1) * ind_nd); } ind_ptrs.push_back(ind_data); @@ -582,77 +465,15 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, "Unable to allocate packed_ind_offsets device memory"); } - using usm_host_allocator_T = - sycl::usm_allocator; - using ptrT = std::vector; - - usm_host_allocator_T ptr_allocator(exec_q); - std::shared_ptr host_ind_ptrs_shp = - std::make_shared(k, ptr_allocator); - - using usm_host_allocatorT = - sycl::usm_allocator; - using shT = std::vector; - - usm_host_allocatorT ind_allocator(exec_q); - std::shared_ptr host_ind_shapes_strides_shp = - std::make_shared(ind_sh_elems * (k + 1), ind_allocator); - - std::shared_ptr host_ind_offsets_shp = - std::make_shared(k, ind_allocator); - - std::copy(ind_sh_sts.begin(), ind_sh_sts.end(), - 
host_ind_shapes_strides_shp->begin()); - std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin()); - std::copy(ind_offsets.begin(), ind_offsets.end(), - host_ind_offsets_shp->begin()); - - std::vector host_task_events; - host_task_events.reserve(5); - - sycl::event packed_ind_ptrs_copy_ev = exec_q.copy( - host_ind_ptrs_shp->data(), packed_ind_ptrs, host_ind_ptrs_shp->size()); - - sycl::event packed_ind_shapes_strides_copy_ev = exec_q.copy( - host_ind_shapes_strides_shp->data(), packed_ind_shapes_strides, - host_ind_shapes_strides_shp->size()); - - sycl::event packed_ind_offsets_copy_ev = exec_q.copy( - host_ind_offsets_shp->data(), packed_ind_offsets, - host_ind_offsets_shp->size()); - - sycl::event shared_ptr_cleanup_host_task = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on({packed_ind_offsets_copy_ev, - packed_ind_shapes_strides_copy_ev, - packed_ind_ptrs_copy_ev}); - cgh.host_task([host_ind_offsets_shp, host_ind_shapes_strides_shp, - host_ind_ptrs_shp]() {}); - }); - shared_ptr_cleanup_host_task.wait(); - host_task_events.push_back(shared_ptr_cleanup_host_task); - - std::vector ind_pack_depends{packed_ind_ptrs_copy_ev, - packed_ind_shapes_strides_copy_ev, - packed_ind_offsets_copy_ev}; - - bool is_src_c_contig = src.is_c_contiguous(); - bool is_src_f_contig = src.is_f_contiguous(); - - bool is_dst_c_contig = dst.is_c_contiguous(); - bool is_dst_f_contig = dst.is_f_contiguous(); - - const py::ssize_t *src_strides = src.get_strides_raw(); - const py::ssize_t *dst_strides = dst.get_strides_raw(); + int orthog_sh_elems = std::max(src_nd - k, 1); // packed_shapes_strides = [src_shape[:axis] + src_shape[axis+k:], // src_strides[:axis] + src_strides[axis+k:], // dst_strides[:axis] + dst_strides[axis+k:]] py::ssize_t *packed_shapes_strides = - sycl::malloc_device(3 * sh_elems, exec_q); + sycl::malloc_device(3 * orthog_sh_elems, exec_q); if (packed_shapes_strides == nullptr) { - sycl::event::wait(host_task_events); sycl::free(packed_ind_ptrs, exec_q); sycl::free(packed_ind_shapes_strides, exec_q); sycl::free(packed_ind_offsets, exec_q); @@ -667,7 +488,6 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, sycl::malloc_device((2 * k) + ind_sh_elems, exec_q); if (packed_axes_shapes_strides == nullptr) { - sycl::event::wait(host_task_events); sycl::free(packed_ind_ptrs, exec_q); sycl::free(packed_ind_shapes_strides, exec_q); sycl::free(packed_ind_offsets, exec_q); @@ -676,20 +496,22 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, "Unable to allocate packed_axes_shapes_strides device memory"); } - std::vector src_dst_pack_deps = - _populate_packed_shapes_strides_for_indexing( - exec_q, host_task_events, packed_shapes_strides, - packed_axes_shapes_strides, src_shape, src_strides, is_src_c_contig, - is_src_f_contig, dst_shape, dst_strides, is_dst_c_contig, - is_dst_f_contig, axis_start, k, ind_nd, src_nd, dst_nd); + auto src_strides = src.get_strides_vector(); + auto dst_strides = dst.get_strides_vector(); + + std::vector host_task_events; + host_task_events.reserve(2); + + std::vector pack_deps = _populate_kernel_params( + exec_q, host_task_events, packed_ind_ptrs, packed_ind_shapes_strides, + packed_ind_offsets, packed_shapes_strides, packed_axes_shapes_strides, + src_shape, src_strides, dst_strides, ind_sh_sts, ind_ptrs, ind_offsets, + axis_start, k, ind_nd, src_nd, orthog_sh_elems, ind_sh_elems); std::vector all_deps; - all_deps.reserve(depends.size() + ind_pack_depends.size() + - src_dst_pack_deps.size()); - all_deps.insert(std::end(all_deps), 
std::begin(ind_pack_depends), - std::end(ind_pack_depends)); - all_deps.insert(std::end(all_deps), std::begin(src_dst_pack_deps), - std::end(src_dst_pack_deps)); + all_deps.reserve(depends.size() + pack_deps.size()); + all_deps.insert(std::end(all_deps), std::begin(pack_deps), + std::end(pack_deps)); all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); auto fn = take_dispatch_table[mode][src_type_id][ind_type_id]; @@ -706,7 +528,7 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, } sycl::event take_generic_ev = - fn(exec_q, orthog_nelems, ind_nelems, orthog_nd, ind_nd, k, + fn(exec_q, orthog_nelems, ind_nelems, orthog_sh_elems, ind_sh_elems, k, packed_shapes_strides, packed_axes_shapes_strides, packed_ind_shapes_strides, src_data, dst_data, packed_ind_ptrs, src_offset, dst_offset, packed_ind_offsets, all_deps); @@ -726,15 +548,10 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, }); }); - sycl::event::wait({take_generic_ev, temporaries_cleanup_ev}); - sycl::event::wait(host_task_events); + sycl::event host_task_ev = keep_args_alive( + exec_q, {src, py_ind, dst}, {take_generic_ev, temporaries_cleanup_ev}); - /* - sycl::event host_task_ev = keep_args_alive(exec_q, {src, py_ind, dst}, - {temporaries_cleanup_ev}); - */ - - return std::make_pair(sycl::event(), temporaries_cleanup_ev); + return std::make_pair(host_task_ev, take_generic_ev); } std::pair @@ -772,7 +589,7 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, int val_nd = val.get_ndim(); int ind_nd = ind_rep.get_ndim(); - auto sh_elems = (dst_nd > 0) ? dst_nd : 1; + auto sh_elems = std::max(dst_nd, 1); if (axis_start + k > sh_elems) { throw py::value_error("Axes are out of range for array of dimension " + @@ -796,8 +613,6 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, const py::ssize_t *dst_shape = dst.get_shape_raw(); const py::ssize_t *val_shape = val.get_shape_raw(); - int orthog_nd = ((dst_nd - k) > 0) ? dst_nd - k : 1; - bool orthog_shapes_equal(true); size_t orthog_nelems(1); for (int i = 0; i < (dst_nd - k); ++i) { @@ -879,7 +694,7 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, } } - auto ind_sh_elems = (ind_nd > 0) ? 
ind_nd : 1; + auto ind_sh_elems = std::max(ind_nd, 1); std::vector ind_ptrs; ind_ptrs.reserve(k); @@ -934,31 +749,9 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, // strides are initialized to 0 for 0D indices, so skip here if (ind_nd > 0) { - const py::ssize_t *ind_strides = ind_.get_strides_raw(); - if (ind_strides == nullptr) { - if (ind_.is_c_contiguous()) { - const auto &ind_contig_strides_ = - c_contiguous_strides(ind_nd, ind_shape); - std::copy(ind_contig_strides_.begin(), - ind_contig_strides_.end(), - ind_sh_sts.begin() + (i + 1) * ind_nd); - } - else if (ind_.is_f_contiguous()) { - const auto &ind_contig_strides_ = - f_contiguous_strides(ind_nd, ind_shape); - std::copy(ind_contig_strides_.begin(), - ind_contig_strides_.end(), - ind_sh_sts.begin() + (i + 1) * ind_nd); - } - else { - throw std::runtime_error( - "Invalid ind array encountered in: take function"); - } - } - else { - std::copy(ind_strides, ind_strides + ind_nd, - ind_sh_sts.begin() + (i + 1) * ind_nd); - } + auto ind_strides = ind_.get_strides_vector(); + std::copy(ind_strides.begin(), ind_strides.end(), + ind_sh_sts.begin() + (i + 1) * ind_nd); } ind_ptrs.push_back(ind_data); @@ -995,77 +788,15 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, "Unable to allocate packed_ind_offsets device memory"); } - using usm_host_allocator_T = - sycl::usm_allocator; - using ptrT = std::vector; - - usm_host_allocator_T ptr_allocator(exec_q); - std::shared_ptr host_ind_ptrs_shp = - std::make_shared(k, ptr_allocator); - - using usm_host_allocatorT = - sycl::usm_allocator; - using shT = std::vector; - - usm_host_allocatorT ind_allocator(exec_q); - std::shared_ptr host_ind_shapes_strides_shp = - std::make_shared(ind_sh_elems * (k + 1), ind_allocator); - - std::shared_ptr host_ind_offsets_shp = - std::make_shared(k, ind_allocator); - - std::copy(ind_sh_sts.begin(), ind_sh_sts.end(), - host_ind_shapes_strides_shp->begin()); - std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin()); - std::copy(ind_offsets.begin(), ind_offsets.end(), - host_ind_offsets_shp->begin()); - - std::vector host_task_events; - host_task_events.reserve(5); - - sycl::event packed_ind_ptrs_copy_ev = exec_q.copy( - host_ind_ptrs_shp->data(), packed_ind_ptrs, host_ind_ptrs_shp->size()); - - sycl::event packed_ind_shapes_strides_copy_ev = exec_q.copy( - host_ind_shapes_strides_shp->data(), packed_ind_shapes_strides, - host_ind_shapes_strides_shp->size()); - - sycl::event packed_ind_offsets_copy_ev = exec_q.copy( - host_ind_offsets_shp->data(), packed_ind_offsets, - host_ind_offsets_shp->size()); - - sycl::event shared_ptr_cleanup_host_task = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on({packed_ind_offsets_copy_ev, - packed_ind_shapes_strides_copy_ev, - packed_ind_ptrs_copy_ev}); - cgh.host_task([host_ind_offsets_shp, host_ind_shapes_strides_shp, - host_ind_ptrs_shp]() {}); - }); - shared_ptr_cleanup_host_task.wait(); - host_task_events.push_back(shared_ptr_cleanup_host_task); - - std::vector ind_pack_depends{packed_ind_ptrs_copy_ev, - packed_ind_shapes_strides_copy_ev, - packed_ind_offsets_copy_ev}; - - bool is_dst_c_contig = dst.is_c_contiguous(); - bool is_dst_f_contig = dst.is_f_contiguous(); - - bool is_val_c_contig = val.is_c_contiguous(); - bool is_val_f_contig = val.is_f_contiguous(); - - const py::ssize_t *dst_strides = dst.get_strides_raw(); - const py::ssize_t *val_strides = val.get_strides_raw(); + int orthog_sh_elems = std::max(dst_nd - k, 1); // packed_shapes_strides = [dst_shape[:axis] + dst_shape[axis+k:], // 
dst_strides[:axis] + dst_strides[axis+k:], // val_strides[:axis] + val_strides[axis+k:]] py::ssize_t *packed_shapes_strides = - sycl::malloc_device(3 * sh_elems, exec_q); + sycl::malloc_device(3 * orthog_sh_elems, exec_q); if (packed_shapes_strides == nullptr) { - sycl::event::wait(host_task_events); sycl::free(packed_ind_ptrs, exec_q); sycl::free(packed_ind_shapes_strides, exec_q); sycl::free(packed_ind_offsets, exec_q); @@ -1073,54 +804,54 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, "Unable to allocate packed_shapes_strides device memory"); } - // packed_axes_shapes_strides = [dst_shape[axis:k], - // dst_strides[axis:k, + // packed_axes_shapes_strides = [dst_shape[axis:axis+k], + // dst_strides[axis:axis+k, // val_strides[axis:ind.ndim]] py::ssize_t *packed_axes_shapes_strides = sycl::malloc_device((2 * k) + ind_sh_elems, exec_q); if (packed_axes_shapes_strides == nullptr) { - sycl::event::wait(host_task_events); - sycl::free(packed_shapes_strides, exec_q); sycl::free(packed_ind_ptrs, exec_q); sycl::free(packed_ind_shapes_strides, exec_q); sycl::free(packed_ind_offsets, exec_q); + sycl::free(packed_shapes_strides, exec_q); throw std::runtime_error( "Unable to allocate packed_axes_shapes_strides device memory"); } - std::vector copy_shapes_strides_deps = - _populate_packed_shapes_strides_for_indexing( - exec_q, host_task_events, packed_shapes_strides, - packed_axes_shapes_strides, dst_shape, dst_strides, is_dst_c_contig, - is_dst_f_contig, val_shape, val_strides, is_val_c_contig, - is_val_f_contig, axis_start, k, ind_nd, dst_nd, val_nd); - - std::vector all_deps(depends.size() + - copy_shapes_strides_deps.size() + - ind_pack_depends.size()); - all_deps.insert(std::end(all_deps), std::begin(copy_shapes_strides_deps), - std::end(copy_shapes_strides_deps)); - all_deps.insert(std::end(all_deps), std::begin(ind_pack_depends), - std::end(ind_pack_depends)); + auto dst_strides = dst.get_strides_vector(); + auto val_strides = val.get_strides_vector(); + + std::vector host_task_events; + host_task_events.reserve(2); + + std::vector pack_deps = _populate_kernel_params( + exec_q, host_task_events, packed_ind_ptrs, packed_ind_shapes_strides, + packed_ind_offsets, packed_shapes_strides, packed_axes_shapes_strides, + dst_shape, dst_strides, val_strides, ind_sh_sts, ind_ptrs, ind_offsets, + axis_start, k, ind_nd, dst_nd, orthog_sh_elems, ind_sh_elems); + + std::vector all_deps; + all_deps.reserve(depends.size() + pack_deps.size()); + all_deps.insert(std::end(all_deps), std::begin(pack_deps), + std::end(pack_deps)); all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); auto fn = put_dispatch_table[mode][dst_type_id][ind_type_id]; if (fn == nullptr) { sycl::event::wait(host_task_events); - sycl::free(packed_shapes_strides, exec_q); - sycl::free(packed_axes_shapes_strides, exec_q); - sycl::free(packed_ind_shapes_strides, exec_q); sycl::free(packed_ind_ptrs, exec_q); + sycl::free(packed_ind_shapes_strides, exec_q); sycl::free(packed_ind_offsets, exec_q); - + sycl::free(packed_shapes_strides, exec_q); + sycl::free(packed_axes_shapes_strides, exec_q); throw std::runtime_error("Indices must be integer type, got " + std::to_string(ind_type_id)); } sycl::event put_generic_ev = - fn(exec_q, orthog_nelems, ind_nelems, orthog_nd, ind_nd, k, + fn(exec_q, orthog_nelems, ind_nelems, orthog_sh_elems, ind_sh_elems, k, packed_shapes_strides, packed_axes_shapes_strides, packed_ind_shapes_strides, dst_data, val_data, packed_ind_ptrs, dst_offset, val_offset, packed_ind_offsets, all_deps); @@ 
-1141,16 +872,10 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, }); }); - sycl::event::wait({put_generic_ev, temporaries_cleanup_ev}); - sycl::event::wait(host_task_events); - - /* - sycl::event py_obj_cleanup_ev = - keep_args_alive(exec_q, {dst, py_ind, val}, - {put_generic_ev, temporaries_cleanup_ev}); - */ + sycl::event py_obj_cleanup_ev = keep_args_alive( + exec_q, {dst, py_ind, val}, {put_generic_ev, temporaries_cleanup_ev}); - return std::make_pair(sycl::event(), temporaries_cleanup_ev); + return std::make_pair(temporaries_cleanup_ev, put_generic_ev); } void init_advanced_indexing_dispatch_tables(void) From 2446b00354533e69723e581d140da6ed542ea61d Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 2 Mar 2023 19:35:18 -0600 Subject: [PATCH 46/57] Implements place, extract, nonzero kernels, and Python API for them Implemented mask_positions, _extract, _place, _nonzero and _array_overlap APIs. --- dpctl/tensor/CMakeLists.txt | 1 + .../kernels/boolean_advanced_indexing.hpp | 948 ++++++++++++++ .../source/boolean_advanced_indexing.cpp | 1085 +++++++++++++++++ .../source/boolean_advanced_indexing.hpp | 84 ++ .../source/simplify_iteration_space.cpp | 269 ++++ .../source/simplify_iteration_space.hpp | 36 + dpctl/tensor/libtensor/source/tensor_py.cpp | 31 + 7 files changed, 2454 insertions(+) create mode 100644 dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp create mode 100644 dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp create mode 100644 dpctl/tensor/libtensor/source/boolean_advanced_indexing.hpp diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index 3f5780cd75..300baa98c9 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -32,6 +32,7 @@ pybind11_add_module(${python_module_name} MODULE ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp new file mode 100644 index 0000000000..b42b7869d2 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp @@ -0,0 +1,948 @@ +//=== boolean_advance_indexing.hpp - ---*-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for advanced tensor index operations. 
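
A brief orientation note, not part of the added header: the mask_positions kernels defined further below compute an inclusive scan of the boolean mask, and the last element of that scan is the number of selected elements; the masked extract/place functors then treat position i as set when cumsum[i] == cumsum[i-1] + 1 (or cumsum[0] == 1 for the first element). A host-side reference sketch of that computation follows; the function name is illustrative and not taken from the patch.

#include <cstddef>
#include <vector>

// Reference (host-side) version of what mask_positions computes:
// cumsum[j] = number of set mask elements among mask[0..j];
// the returned value (last entry) is the total number of selected items.
std::size_t mask_positions_reference(const std::vector<bool> &mask,
                                     std::vector<std::size_t> &cumsum)
{
    cumsum.resize(mask.size());
    std::size_t running = 0;
    for (std::size_t i = 0; i < mask.size(); ++i) {
        running += mask[i] ? 1 : 0;
        cumsum[i] = running;
    }
    return mask.empty() ? 0 : cumsum.back();
}

// Example: mask = {0, 1, 1, 0, 1}  ->  cumsum = {0, 1, 2, 2, 3}, returns 3;
// positions 1, 2 and 4 satisfy cumsum[i] == cumsum[i-1] + 1, i.e. mask is set.
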
+//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include "utils/strided_iters.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace indexing +{ + +namespace py = pybind11; + +template T ceiling_quotient(T n, T m) +{ + return (n + m - 1) / m; +} +template T1 ceiling_quotient(T1 n, T2 m) +{ + return ceiling_quotient(n, static_cast(m)); +} + +template +class inclusive_scan_rec_local_scan_krn; + +template +class inclusive_scan_rec_chunk_update_krn; + +struct NoOpIndexer +{ + size_t operator()(size_t gid) const + { + return gid; + } +}; + +struct StridedIndexer +{ + StridedIndexer(int _nd, + py::ssize_t _offset, + py::ssize_t const *_packed_shape_strides) + : nd(_nd), starting_offset(_offset), + shape_strides(_packed_shape_strides) + { + } + + size_t operator()(size_t gid) const + { + CIndexer_vector _ind(nd); + py::ssize_t relative_offset(0); + _ind.get_displacement( + static_cast(gid), + shape_strides, // shape ptr + shape_strides + nd, // strides ptr + relative_offset); + return starting_offset + relative_offset; + } + +private: + int nd; + py::ssize_t starting_offset; + py::ssize_t const *shape_strides; +}; + +struct Strided1DIndexer +{ + Strided1DIndexer(py::ssize_t _offset, py::ssize_t _size, py::ssize_t _step) + : offset(_offset), size(static_cast(_size)), step(_step) + { + } + + size_t operator()(size_t gid) const + { + return static_cast(offset + std::min(gid, size) * step); + } + +private: + py::ssize_t offset = 0; + size_t size = 1; + py::ssize_t step = 1; +}; + +template struct ZeroChecker +{ + + ZeroChecker(_IndexerFn _indexer) : indexer_fn(_indexer) {} + + template + bool operator()(dataT const *data, size_t gid) const + { + constexpr dataT _zero(0); + + return data[indexer_fn(gid)] == _zero; + } + +private: + _IndexerFn indexer_fn; +}; + +/* + * for integer type maskT, + * output[j] = sum( input[s0 + i * s1], 0 <= i <= j) + * for 0 <= j < n_elems + */ +template +sycl::event inclusive_scan_rec(sycl::queue exec_q, + size_t n_elems, + size_t wg_size, + const inputT *input, + outputT *output, + size_t s0, + size_t s1, + IndexerT indexer, + std::vector const &depends = {}) +{ + size_t n_groups = ceiling_quotient(n_elems, n_wi * wg_size); + + sycl::event inc_scan_phase1_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using slmT = sycl::local_accessor; + + auto lws = sycl::range<1>(wg_size); + auto gws = sycl::range<1>(n_groups * wg_size); + + slmT slm_iscan_tmp(lws, cgh); + + ZeroChecker is_zero_fn(indexer); + + cgh.parallel_for, n_wi>>( + sycl::nd_range<1>(gws, lws), + [=](sycl::nd_item<1> it) + { + auto chunk_gid = it.get_global_id(0); + auto lid = it.get_local_id(0); + + std::array local_isum; + + size_t i = chunk_gid * n_wi; + for (size_t m_wi = 0; m_wi < n_wi; ++m_wi) { + constexpr outputT out_zero(0); + constexpr outputT out_one(1); + local_isum[m_wi] = + (i + m_wi < n_elems) + ? (is_zero_fn(input, s0 + s1 * (i + m_wi)) ? 
out_zero + : out_one) + : out_zero; + } + +// local_isum is now result of +// inclusive scan of locally stored mask indicators +#pragma unroll + for (size_t m_wi = 1; m_wi < n_wi; ++m_wi) { + local_isum[m_wi] += local_isum[m_wi - 1]; + } + + size_t wg_iscan_val = + sycl::inclusive_scan_over_group(it.get_group(), + local_isum.back(), + sycl::plus(), + size_t(0)); + + slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val; + it.barrier(sycl::access::fence_space::local_space); + size_t addand = (lid == 0) ? 0 : slm_iscan_tmp[lid]; + it.barrier(sycl::access::fence_space::local_space); + +#pragma unroll + for (size_t m_wi = 0; m_wi < n_wi; ++m_wi) { + local_isum[m_wi] += addand; + } + + for (size_t m_wi = 0; m_wi < n_wi && i + m_wi < n_elems; ++m_wi) { + output[i + m_wi] = local_isum[m_wi]; + } + } + ); + }); + + sycl::event out_event = inc_scan_phase1_ev; + if (n_groups > 1) { + outputT *temp = sycl::malloc_device(n_groups - 1, exec_q); + + auto chunk_size = wg_size * n_wi; + + NoOpIndexer _no_op_indexer{}; + auto e2 = inclusive_scan_rec( + exec_q, n_groups - 1, wg_size, output, temp, chunk_size - 1, + chunk_size, _no_op_indexer, {inc_scan_phase1_ev}); + + // output[ chunk_size * (i + 1) + j] += temp[i] + auto e3 = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(e2); + cgh.parallel_for>( + {n_elems}, + [=](auto wiid) + { + auto gid = wiid[0]; + auto i = (gid / chunk_size); + output[gid] += (i > 0) ? temp[i - 1] : 0; + } + ); + }); + + // dangling task to free the temporary + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(e3); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, temp]() { sycl::free(temp, ctx); }); + }); + + out_event = e3; + } + + return out_event; +} + +template struct TwoOffsets +{ + TwoOffsets() : first_offset(0), second_offset(0) {} + TwoOffsets(displacementT first_offset_, displacementT second_offset_) + : first_offset(first_offset_), second_offset(second_offset_) + { + } + + displacementT get_first_offset() const + { + return first_offset; + } + displacementT get_second_offset() const + { + return second_offset; + } + +private: + displacementT first_offset = 0; + displacementT second_offset = 0; +}; + +struct TwoOffsets_StridedIndexer +{ + TwoOffsets_StridedIndexer(int common_nd, + py::ssize_t first_offset_, + py::ssize_t second_offset_, + py::ssize_t const *_packed_shape_strides) + : nd(common_nd), starting_first_offset(first_offset_), + starting_second_offset(second_offset_), + shape_strides(_packed_shape_strides) + { + } + + TwoOffsets operator()(py::ssize_t gid) const + { + CIndexer_vector _ind(nd); + py::ssize_t relative_first_offset(0); + py::ssize_t relative_second_offset(0); + _ind.get_displacement( + gid, + shape_strides, // shape ptr + shape_strides + nd, // src strides ptr + shape_strides + 2 * nd, // src strides ptr + relative_first_offset, relative_second_offset); + return TwoOffsets( + starting_first_offset + relative_first_offset, + starting_second_offset + relative_second_offset); + } + +private: + int nd; + py::ssize_t starting_first_offset; + py::ssize_t starting_second_offset; + py::ssize_t const *shape_strides; +}; + +struct TwoZeroOffsets_Indexer +{ + TwoZeroOffsets_Indexer() {} + + TwoOffsets operator()(py::ssize_t) const + { + return TwoOffsets(); + } +}; + +template +struct MaskedExtractStridedFunctor +{ + MaskedExtractStridedFunctor(const char *src_data_p, + const char *cumsum_data_p, + char *dst_data_p, + size_t orthog_iter_size, + size_t masked_iter_size, + OrthogIndexerT orthog_src_dst_indexer_, + MaskedSrcIndexerT 
masked_src_indexer_, + MaskedDstIndexerT masked_dst_indexer_) + : src_cp(src_data_p), cumsum_cp(cumsum_data_p), dst_cp(dst_data_p), + orthog_nelems(orthog_iter_size), masked_nelems(masked_iter_size), + orthog_src_dst_indexer(orthog_src_dst_indexer_), + masked_src_indexer(masked_src_indexer_), + masked_dst_indexer(masked_dst_indexer_) + { + } + + void operator()(sycl::id<1> idx) const + { + const dataT *src_data = reinterpret_cast(src_cp); + dataT *dst_data = reinterpret_cast(dst_cp); + const indT *cumsum_data = reinterpret_cast(cumsum_cp); + + size_t global_i = idx[0]; + size_t orthog_i = global_i / masked_nelems; + size_t masked_i = global_i - masked_nelems * orthog_i; + + indT current_running_count = cumsum_data[masked_i]; + bool mask_set = + (masked_i == 0) + ? (current_running_count == 1) + : (current_running_count == cumsum_data[masked_i - 1] + 1); + + // dst[cumsum[i], j] - 1 = src[i, j] if cumsum[i] == ((i > 0) ? + // cumsum[i-1] + // + 1 : 1) + if (mask_set) { + auto orthog_offsets = + orthog_src_dst_indexer(static_cast(orthog_i)); + + size_t total_src_offset = masked_src_indexer(masked_i) + + orthog_offsets.get_first_offset(); + size_t total_dst_offset = + masked_dst_indexer(current_running_count - 1) + + orthog_offsets.get_second_offset(); + + dst_data[total_dst_offset] = src_data[total_src_offset]; + } + } + +private: + const char *src_cp = nullptr; + const char *cumsum_cp = nullptr; + char *dst_cp = nullptr; + size_t orthog_nelems = 0; + size_t masked_nelems = 0; + OrthogIndexerT + orthog_src_dst_indexer; // has nd, shape, src_strides, dst_strides for + // dimensions that ARE NOT masked + MaskedSrcIndexerT masked_src_indexer; // has nd, shape, src_strides for + // dimensions that ARE masked + MaskedDstIndexerT + masked_dst_indexer; // has 1, dst_strides for dimensions that ARE masked +}; + +template +struct MaskedPlaceStridedFunctor +{ + MaskedPlaceStridedFunctor(char *dst_data_p, + const char *cumsum_data_p, + const char *rhs_data_p, + size_t orthog_iter_size, + size_t masked_iter_size, + OrthogIndexerT orthog_dst_rhs_indexer_, + MaskedDstIndexerT masked_dst_indexer_, + MaskedRhsIndexerT masked_rhs_indexer_) + : dst_cp(dst_data_p), cumsum_cp(cumsum_data_p), rhs_cp(rhs_data_p), + orthog_nelems(orthog_iter_size), masked_nelems(masked_iter_size), + orthog_dst_rhs_indexer(orthog_dst_rhs_indexer_), + masked_dst_indexer(masked_dst_indexer_), + masked_rhs_indexer(masked_rhs_indexer_) + { + } + + void operator()(sycl::id<1> idx) const + { + dataT *dst_data = reinterpret_cast(dst_cp); + const indT *cumsum_data = reinterpret_cast(cumsum_cp); + const dataT *rhs_data = reinterpret_cast(rhs_cp); + + size_t global_i = idx[0]; + size_t orthog_i = global_i / masked_nelems; + size_t masked_i = global_i - masked_nelems * orthog_i; + + indT current_running_count = cumsum_data[masked_i]; + bool mask_set = + (masked_i == 0) + ? (current_running_count == 1) + : (current_running_count == cumsum_data[masked_i - 1] + 1); + + // src[i, j] = rhs[cumsum[i] - 1, j] if cumsum[i] == ((i > 0) ? 
+ // cumsum[i-1] + // + 1 : 1) + if (mask_set) { + auto orthog_offsets = + orthog_dst_rhs_indexer(static_cast(orthog_i)); + + size_t total_dst_offset = masked_dst_indexer(masked_i) + + orthog_offsets.get_first_offset(); + size_t total_rhs_offset = + masked_rhs_indexer(current_running_count - 1) + + orthog_offsets.get_second_offset(); + + dst_data[total_dst_offset] = rhs_data[total_rhs_offset]; + } + } + +private: + char *dst_cp = nullptr; + const char *cumsum_cp = nullptr; + const char *rhs_cp = nullptr; + size_t orthog_nelems = 0; + size_t masked_nelems = 0; + OrthogIndexerT + orthog_dst_rhs_indexer; // has nd, shape, dst_strides, rhs_strides for + // dimensions that ARE NOT masked + MaskedDstIndexerT masked_dst_indexer; // has nd, shape, dst_strides for + // dimensions that ARE masked + MaskedRhsIndexerT + masked_rhs_indexer; // has 1, rhs_strides for dimensions that ARE masked +}; + +// mask positions + +typedef size_t (*mask_positions_contig_impl_fn_ptr_t)( + sycl::queue, + size_t, + const char *, + char *, + std::vector const &); + +template +size_t mask_positions_contig_impl(sycl::queue q, + size_t n_elems, + const char *mask, + char *cumsum, + std::vector const &depends = {}) +{ + constexpr int n_wi = 8; + const maskT *mask_data_ptr = reinterpret_cast(mask); + cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); + size_t wg_size = 128; + + NoOpIndexer flat_indexer{}; + + sycl::event comp_ev = inclusive_scan_rec( + q, n_elems, wg_size, mask_data_ptr, cumsum_data_ptr, 0, 1, flat_indexer, + depends); + + cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); + + cumsumT *last_elem_host_usm = sycl::malloc_host(1, q); + + if (last_elem_host_usm == nullptr) { + throw std::bad_alloc(); + } + sycl::event copy_e = + q.copy(last_elem, last_elem_host_usm, 1, {comp_ev}); + copy_e.wait(); + size_t return_val = static_cast(*last_elem_host_usm); + sycl::free(last_elem_host_usm, q); + + return return_val; +} + +template struct MaskPositionsContigFactory +{ + fnT get() + { + fnT fn = mask_positions_contig_impl; + return fn; + } +}; + +typedef size_t (*mask_positions_strided_impl_fn_ptr_t)( + sycl::queue, + size_t, + const char *, + int, + py::ssize_t, + const py::ssize_t *, + char *, + std::vector const &); + +template +size_t mask_positions_strided_impl(sycl::queue q, + size_t n_elems, + const char *mask, + int nd, + py::ssize_t input_offset, + const py::ssize_t *shape_strides, + char *cumsum, + std::vector const &depends = {}) +{ + constexpr int n_wi = 8; + const maskT *mask_data_ptr = reinterpret_cast(mask); + cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); + size_t wg_size = 128; + + StridedIndexer strided_indexer{nd, input_offset, shape_strides}; + + sycl::event comp_ev = + inclusive_scan_rec( + q, n_elems, wg_size, mask_data_ptr, cumsum_data_ptr, 0, 1, + strided_indexer, depends); + + cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); + + cumsumT *last_elem_host_usm = sycl::malloc_host(1, q); + + if (last_elem_host_usm == nullptr) { + throw std::bad_alloc(); + } + sycl::event copy_e = + q.copy(last_elem, last_elem_host_usm, 1, {comp_ev}); + copy_e.wait(); + size_t return_val = static_cast(*last_elem_host_usm); + sycl::free(last_elem_host_usm, q); + + return return_val; +} + +template struct MaskPositionsStridedFactory +{ + fnT get() + { + fnT fn = mask_positions_strided_impl; + return fn; + } +}; + +// ======= Masked extraction ================================ + +template +class masked_extract_all_slices_strided_impl_krn; + +typedef sycl::event 
(*masked_extract_all_slices_strided_impl_fn_ptr_t)( + sycl::queue, + py::ssize_t, + const char *, + const char *, + char *, + int, + py::ssize_t const *, + py::ssize_t, + py::ssize_t, + const std::vector &); + +template +sycl::event masked_extract_all_slices_strided_impl( + sycl::queue exec_q, + py::ssize_t iteration_size, + const char *src_p, + const char *cumsum_p, + char *dst_p, + int nd, + const py::ssize_t + *packed_src_shape_strides, // [src_shape, src_strides], length 2*nd + py::ssize_t dst_size, // dst is 1D + py::ssize_t dst_stride, + const std::vector &depends = {}) +{ + // using MaskedExtractStridedFunctor; + // using Strided1DIndexer; + // using StridedIndexer; + // using TwoZeroOffsets_Indexer; + + TwoZeroOffsets_Indexer orthog_src_dst_indexer{}; + + /* StridedIndexer(int _nd, py::ssize_t _offset, py::ssize_t const + * *_packed_shape_strides) */ + StridedIndexer masked_src_indexer(nd, 0, packed_src_shape_strides); + Strided1DIndexer masked_dst_indexer(0, dst_size, dst_stride); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.parallel_for>( + sycl::range<1>(static_cast(iteration_size)), + MaskedExtractStridedFunctor( + src_p, cumsum_p, dst_p, 1, iteration_size, + orthog_src_dst_indexer, masked_src_indexer, + masked_dst_indexer)); + }); + + return comp_ev; +} + +typedef sycl::event (*masked_extract_some_slices_strided_impl_fn_ptr_t)( + sycl::queue, + py::ssize_t, + py::ssize_t, + const char *, + const char *, + char *, + int, + py::ssize_t const *, + py::ssize_t, + py::ssize_t, + int, + py::ssize_t const *, + py::ssize_t, + py::ssize_t, + const std::vector &); + +template +class masked_extract_some_slices_strided_impl_krn; + +template +sycl::event masked_extract_some_slices_strided_impl( + sycl::queue exec_q, + py::ssize_t orthog_nelems, + py::ssize_t masked_nelems, + const char *src_p, + const char *cumsum_p, + char *dst_p, + int orthog_nd, + const py::ssize_t + *packed_ortho_src_dst_shape_strides, // [ortho_shape, ortho_src_strides, + // ortho_dst_strides], length + // 3*ortho_nd + py::ssize_t ortho_src_offset, + py::ssize_t ortho_dst_offset, + int masked_nd, + const py::ssize_t *packed_masked_src_shape_strides, // [masked_src_shape, + // masked_src_strides], + // length 2*masked_nd + py::ssize_t masked_dst_size, // mask_dst is 1D + py::ssize_t masked_dst_stride, + const std::vector &depends = {}) +{ + // using MaskedExtractStridedFunctor; + // using Strided1DIndexer; + // using StridedIndexer; + // using TwoOffsets_StridedIndexer; + + TwoOffsets_StridedIndexer orthog_src_dst_indexer{ + orthog_nd, ortho_src_offset, ortho_dst_offset, + packed_ortho_src_dst_shape_strides}; + + StridedIndexer masked_src_indexer{masked_nd, 0, + packed_masked_src_shape_strides}; + Strided1DIndexer masked_dst_indexer{0, masked_dst_size, masked_dst_stride}; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.parallel_for>( + sycl::range<1>(static_cast(orthog_nelems * masked_nelems)), + MaskedExtractStridedFunctor( + src_p, cumsum_p, dst_p, orthog_nelems, masked_nelems, + orthog_src_dst_indexer, masked_src_indexer, + masked_dst_indexer)); + }); + + return comp_ev; +} + +template struct MaskExtractAllSlicesStridedFactory +{ + fnT get() + { + fnT fn = masked_extract_all_slices_strided_impl; + return fn; + } +}; + +template struct MaskExtractSomeSlicesStridedFactory +{ + fnT get() + { + fnT fn = masked_extract_some_slices_strided_impl; + return fn; + } +}; + +// Masked placement + +template +class 
masked_place_all_slices_strided_impl_krn; + +typedef sycl::event (*masked_place_all_slices_strided_impl_fn_ptr_t)( + sycl::queue, + py::ssize_t, + char *, + const char *, + const char *, + int, + py::ssize_t const *, + py::ssize_t, + py::ssize_t, + const std::vector &); + +template +sycl::event masked_place_all_slices_strided_impl( + sycl::queue exec_q, + py::ssize_t iteration_size, + char *dst_p, + const char *cumsum_p, + const char *rhs_p, + int nd, + const py::ssize_t + *packed_dst_shape_strides, // [dst_shape, dst_strides], length 2*nd + py::ssize_t rhs_size, // rhs is 1D + py::ssize_t rhs_stride, + const std::vector &depends = {}) +{ + // using MaskedPlaceStridedFunctor; + // using Strided1DIndexer; + // using StridedIndexer; + // using TwoZeroOffsets_Indexer; + + TwoZeroOffsets_Indexer orthog_dst_rhs_indexer{}; + + /* StridedIndexer(int _nd, py::ssize_t _offset, py::ssize_t const + * *_packed_shape_strides) */ + StridedIndexer masked_dst_indexer(nd, 0, packed_dst_shape_strides); + Strided1DIndexer masked_rhs_indexer(0, rhs_size, rhs_stride); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.parallel_for>( + sycl::range<1>(static_cast(iteration_size)), + MaskedPlaceStridedFunctor( + dst_p, cumsum_p, rhs_p, 1, iteration_size, + orthog_dst_rhs_indexer, masked_dst_indexer, + masked_rhs_indexer)); + }); + + return comp_ev; +} + +typedef sycl::event (*masked_place_some_slices_strided_impl_fn_ptr_t)( + sycl::queue, + py::ssize_t, + py::ssize_t, + char *, + const char *, + const char *, + int, + py::ssize_t const *, + py::ssize_t, + py::ssize_t, + int, + py::ssize_t const *, + py::ssize_t, + py::ssize_t, + const std::vector &); + +template +class masked_place_some_slices_strided_impl_krn; + +template +sycl::event masked_place_some_slices_strided_impl( + sycl::queue exec_q, + py::ssize_t orthog_nelems, + py::ssize_t masked_nelems, + char *dst_p, + const char *cumsum_p, + const char *rhs_p, + int orthog_nd, + const py::ssize_t + *packed_ortho_dst_rhs_shape_strides, // [ortho_shape, ortho_dst_strides, + // ortho_rhs_strides], length + // 3*ortho_nd + py::ssize_t ortho_dst_offset, + py::ssize_t ortho_rhs_offset, + int masked_nd, + const py::ssize_t *packed_masked_dst_shape_strides, // [masked_dst_shape, + // masked_dst_strides], + // length 2*masked_nd + py::ssize_t masked_rhs_size, // mask_dst is 1D + py::ssize_t masked_rhs_stride, + const std::vector &depends = {}) +{ + // using MaskedPlaceStridedFunctor; + // using Strided1DIndexer; + // using StridedIndexer; + // using TwoOffsets_StridedIndexer; + + TwoOffsets_StridedIndexer orthog_dst_rhs_indexer{ + orthog_nd, ortho_dst_offset, ortho_rhs_offset, + packed_ortho_dst_rhs_shape_strides}; + + /* StridedIndexer(int _nd, py::ssize_t _offset, py::ssize_t const + * *_packed_shape_strides) */ + StridedIndexer masked_dst_indexer{masked_nd, 0, + packed_masked_dst_shape_strides}; + Strided1DIndexer masked_rhs_indexer{0, masked_rhs_size, masked_rhs_stride}; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.parallel_for>( + sycl::range<1>(static_cast(orthog_nelems * masked_nelems)), + MaskedPlaceStridedFunctor( + dst_p, cumsum_p, rhs_p, orthog_nelems, masked_nelems, + orthog_dst_rhs_indexer, masked_dst_indexer, + masked_rhs_indexer)); + }); + + return comp_ev; +} + +static masked_place_all_slices_strided_impl_fn_ptr_t + masked_place_all_slices_strided_impl_dispatch_vector + [dpctl::tensor::detail::num_types]; + +template struct 
MaskPlaceAllSlicesStridedFactory +{ + fnT get() + { + fnT fn = masked_place_all_slices_strided_impl; + return fn; + } +}; + +static masked_place_some_slices_strided_impl_fn_ptr_t + masked_place_some_slices_strided_impl_dispatch_vector + [dpctl::tensor::detail::num_types]; + +template struct MaskPlaceSomeSlicesStridedFactory +{ + fnT get() + { + fnT fn = masked_place_some_slices_strided_impl; + return fn; + } +}; + +// Non-zero + +class non_zero_indexes_krn; + +template +sycl::event non_zero_indexes_impl(sycl::queue exec_q, + py::ssize_t iter_size, + py::ssize_t nz_elems, + int nd, + const char *cumsum_cp, + char *indexes_cp, + const py::ssize_t *mask_shape, + std::vector const &depends) +{ + const indT1 *cumsum_data = reinterpret_cast(cumsum_cp); + indT2 *indexes_data = reinterpret_cast(indexes_cp); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.parallel_for( + sycl::range<1>(iter_size), [=](sycl::id<1> idx) { + auto i = idx[0]; + + auto cs_curr_val = cumsum_data[i] - 1; + auto cs_prev_val = (i > 0) ? cumsum_data[i - 1] : indT1(0); + bool cond = (cs_curr_val == cs_prev_val); + + py::ssize_t i_ = static_cast(i); + for (int dim = nd; --dim > 0;) { + auto sd = mask_shape[dim]; + py::ssize_t q = i_ / sd; + py::ssize_t r = (i_ - q * sd); + if (cond) { + indexes_data[cs_curr_val + dim * nz_elems] = + static_cast(r); + } + i_ = q; + } + if (cond) { + indexes_data[cs_curr_val] = static_cast(i_); + } + }); + }); + + return comp_ev; +} + +} // namespace indexing +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp new file mode 100644 index 0000000000..1534b38391 --- /dev/null +++ b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp @@ -0,0 +1,1085 @@ +//===-- boolean_advanced_indexing.cpp - --*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines implementation functions of dpctl.tensor.place and +/// dpctl.tensor.extract, dpctl.tensor.nonzero +//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include +#include +#include + +#include "boolean_advanced_indexing.hpp" +#include "kernels/boolean_advanced_indexing.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +struct sink_t +{ + sink_t(){}; + template sink_t(T &&){}; +}; + +template std::size_t accumulate_size(std::size_t &s, V &&v) +{ + return s += v.size(); +} + +template sink_t inserter(V &lhs, U &&rhs) +{ + lhs.insert(lhs.end(), rhs.begin(), rhs.end()); + return {}; +} + +template +std::vector concat(std::vector lhs, Vs &&... vs) +{ + std::size_t s = lhs.size(); + { + // limited scope ensures array is freed + [[maybe_unused]] sink_t tmp[] = {accumulate_size(s, vs)..., 0}; + } + lhs.reserve(s); + { + // array of no-data objects ensures ordering of calls to inserter + [[maybe_unused]] sink_t tmp[] = {inserter(lhs, std::forward(vs))..., + 0}; + } + + return std::move(lhs); // prevent return-value optimization +} + +template +std::tuple +device_allocate_and_pack(sycl::queue q, + std::vector &host_task_events, + Vs &&... vs) +{ + + // memory transfer optimization, use USM-host for temporary speeds up + // tranfer to device, especially on dGPUs + using usm_host_allocatorT = + sycl::usm_allocator; + using shT = std::vector; + + usm_host_allocatorT usm_host_allocator(q); + shT empty{0, usm_host_allocator}; + shT packed_shape_strides = concat(empty, vs...); + + auto packed_shape_strides_owner = + std::make_shared(std::move(packed_shape_strides)); + + auto sz = packed_shape_strides_owner->size(); + indT *shape_strides = sycl::malloc_device(sz, q); + + sycl::event copy_ev = + q.copy(packed_shape_strides_owner->data(), shape_strides, sz); + + sycl::event cleanup_host_task_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(copy_ev); + cgh.host_task([packed_shape_strides_owner] { + // increment shared pointer ref-count to keep it alive + // till copy operation completes; + }); + }); + host_task_events.push_back(cleanup_host_task_ev); + + return std::make_tuple(shape_strides, sz, copy_ev); +} + +/* @brief check for overlap of memory regions behind arrays. + +Presenty assume that array occupies all bytes between smallest and largest +displaced elements. + +TODO: Write proper Frobenius solver to account for holes, e.g. + overlap( x_contig[::2], x_contig[1::2]) should give False, + while this implementation gives True. 
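+
+For example (element counts made up for illustration): if ar1 and ar2 share
+the same base pointer, both have 8-byte elements, ar1 spans element offsets
+[0, 9] and ar2 spans [10, 19], then byte_distance == 0,
+x1_minus_y0 == 8 + 9*8 - 10*8 == 0 and y1_minus_x0 == 8 + 19*8 - 0 == 160,
+so (x1_minus_y0 > 0) fails and no overlap is reported.  Shifting ar2 to span
+[9, 18] gives x1_minus_y0 == 8 > 0 and y1_minus_x0 == 152 > 0, i.e. both
+regions address the bytes of element 9 and overlap is reported.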
+*/ +bool overlap(dpctl::tensor::usm_ndarray ar1, dpctl::tensor::usm_ndarray ar2) +{ + const char *ar1_data = ar1.get_data(); + + const auto &ar1_offsets = ar1.get_minmax_offsets(); + py::ssize_t ar1_elem_size = static_cast(ar1.get_elemsize()); + + const char *ar2_data = ar2.get_data(); + const auto &ar2_offsets = ar2.get_minmax_offsets(); + py::ssize_t ar2_elem_size = static_cast(ar2.get_elemsize()); + + /* Memory of array1 extends from */ + /* [ar1_data + ar1_offsets.first * ar1_elem_size, ar1_data + + * ar1_offsets.second * ar1_elem_size + ar1_elem_size] */ + /* Memory of array2 extends from */ + /* [ar2_data + ar2_offsets.first * ar2_elem_size, ar2_data + + * ar2_offsets.second * ar2_elem_size + ar2_elem_size] */ + + /* Intervals [x0, x1] and [y0, y1] do not overlap if (x0 <= x1) && (y0 <= + * y1) + * && (x1 <=y0 || y1 <= x0 ) */ + /* Given that x0 <= x1 and y0 <= y1 are true by construction, the condition + * for overlap us (x1 > y0) && (y1 > x0) */ + + /* Applying: + (ar1_data + ar1_offsets.second * ar1_elem_size + ar1_elem_size > + ar2_data + + ar2_offsets.first * ar2_elem_size) && (ar2_data + ar2_offsets.second * + ar2_elem_size + ar2_elem_size > ar1_data + ar1_offsets.first * + ar1_elem_size) + */ + + auto byte_distance = static_cast(ar2_data - ar1_data); + + py::ssize_t x1_minus_y0 = + (-byte_distance + + (ar1_elem_size + (ar1_offsets.second * ar1_elem_size) - + (ar2_offsets.first * ar2_elem_size))); + + py::ssize_t y1_minus_x0 = + (byte_distance + (ar2_elem_size + (ar2_offsets.second * ar2_elem_size) - + (ar1_offsets.first * ar1_elem_size))); + + bool memory_overlap = (x1_minus_y0 > 0) && (y1_minus_x0 > 0); + + return memory_overlap; +} + +/* @brief Split shape/strides into dir1 (complementary to axis_start <= i < + * axis_end) and dir2 (along given set of axes) + */ +template +void _split_iteration_space(const shT &shape_vec, + const shT &strides_vec, + int axis_start, + int axis_end, + shT &dir1_shape_vec, + shT &dir2_shape_vec, + shT &dir1_strides_vec, + shT &dir2_strides_vec) +{ + int nd = static_cast(shape_vec.size()); + int dir2_sz = axis_end - axis_start; + int dir1_sz = nd - dir2_sz; + + assert(dir1_sz > 0); + assert(dir2_sz > 0); + + dir1_shape_vec.resize(dir1_sz); + dir2_shape_vec.resize(dir2_sz); + + std::copy(shape_vec.begin(), shape_vec.begin() + axis_start, + dir1_shape_vec.begin()); + std::copy(shape_vec.begin() + axis_end, shape_vec.end(), + dir1_shape_vec.begin() + axis_start); + + std::copy(shape_vec.begin() + axis_start, shape_vec.begin() + axis_end, + dir2_shape_vec.begin()); + + dir1_strides_vec.resize(dir1_sz); + dir2_strides_vec.resize(dir2_sz); + + std::copy(strides_vec.begin(), strides_vec.begin() + axis_start, + dir1_strides_vec.begin()); + std::copy(strides_vec.begin() + axis_end, strides_vec.end(), + dir1_strides_vec.begin() + axis_start); + + std::copy(strides_vec.begin() + axis_start, strides_vec.begin() + axis_end, + dir2_strides_vec.begin()); + + return; +} + +// Computation of positions of masked elements + +using dpctl::tensor::kernels::indexing::mask_positions_contig_impl_fn_ptr_t; +static mask_positions_contig_impl_fn_ptr_t + mask_positions_contig_dispatch_vector[dpctl::tensor::detail::num_types]; + +using dpctl::tensor::kernels::indexing::mask_positions_strided_impl_fn_ptr_t; +static mask_positions_strided_impl_fn_ptr_t + mask_positions_strided_dispatch_vector[dpctl::tensor::detail::num_types]; + +void populate_mask_positions_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::indexing::MaskPositionsContigFactory; + 
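+    // DispatchVectorBuilder instantiates the implementation template once per
+    // supported element type and records the resulting function pointers in a
+    // vector indexed by type id.  At run time py_mask_positions selects the
+    // kernel with a single lookup, e.g.
+    //     auto fn = mask_positions_contig_dispatch_vector[mask_typeid];
+    //     return fn(exec_q, mask_size, mask_data, cumsum_data, depends);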
dpctl::tensor::detail::DispatchVectorBuilder< + mask_positions_contig_impl_fn_ptr_t, MaskPositionsContigFactory, + dpctl::tensor::detail::num_types> + dvb1; + dvb1.populate_dispatch_vector(mask_positions_contig_dispatch_vector); + + using dpctl::tensor::kernels::indexing::MaskPositionsStridedFactory; + dpctl::tensor::detail::DispatchVectorBuilder< + mask_positions_strided_impl_fn_ptr_t, MaskPositionsStridedFactory, + dpctl::tensor::detail::num_types> + dvb2; + dvb2.populate_dispatch_vector(mask_positions_strided_dispatch_vector); + + return; +} + +size_t py_mask_positions(dpctl::tensor::usm_ndarray mask, + dpctl::tensor::usm_ndarray cumsum, + sycl::queue exec_q, + std::vector const &depends) +{ + // cumsum is 1D + if (cumsum.get_ndim() != 1) { + throw py::value_error("Result array must be one-dimensional."); + } + + if (!cumsum.is_c_contiguous()) { + throw py::value_error("Expecting `cumsum` array must be C-contiguous."); + } + + // cumsum.shape == (mask.size,) + auto mask_size = mask.get_size(); + auto cumsum_size = cumsum.get_shape(0); + if (cumsum_size != mask_size) { + throw py::value_error("Inconsistent dimensions"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {mask, cumsum})) { + // FIXME: use ExecutionPlacementError + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if (mask_size == 0) { + return 0; + } + + int mask_typenum = mask.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + // mask can be any type + const char *mask_data = mask.get_data(); + char *cumsum_data = cumsum.get_data(); + + auto const &array_types = dpctl::tensor::detail::usm_ndarray_types(); + + int mask_typeid = array_types.typenum_to_lookup_id(mask_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + // cumsum must be int64_t only + constexpr int int64_typeid = + static_cast(dpctl::tensor::detail::typenum_t::INT64); + if (cumsum_typeid != int64_typeid) { + throw py::value_error( + "Cumulative sum array must have int64 data-type."); + } + + if (mask.is_c_contiguous()) { + auto fn = mask_positions_contig_dispatch_vector[mask_typeid]; + + return fn(exec_q, mask_size, mask_data, cumsum_data, depends); + } + + const py::ssize_t *shape = mask.get_shape_raw(); + const py::ssize_t *strides = mask.get_strides_raw(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_strides; + py::ssize_t offset(0); + + int mask_nd = mask.get_ndim(); + int nd = mask_nd; + + constexpr py::ssize_t itemsize = 1; // in elements + bool is_c_contig = mask.is_c_contiguous(); + bool is_f_contig = mask.is_f_contiguous(); + + dpctl::tensor::py_internal::simplify_iteration_space_1( + nd, shape, strides, itemsize, is_c_contig, is_f_contig, + simplified_shape, simplified_strides, offset); + + if (nd == 1 && simplified_strides[0] == 1) { + auto fn = mask_positions_contig_dispatch_vector[mask_typeid]; + + return fn(exec_q, mask_size, mask_data, cumsum_data, depends); + } + + // Strided implementation + auto strided_fn = mask_positions_strided_dispatch_vector[mask_typeid]; + std::vector host_task_events; + + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_strides); + py::ssize_t *shape_strides = std::get<0>(ptr_size_event_tuple); + sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + + if (2 * static_cast(nd) != std::get<1>(ptr_size_event_tuple)) { + copy_shape_ev.wait(); + sycl::event::wait(host_task_events); + sycl::free(shape_strides, exec_q); + throw 
std::runtime_error("Unexacted error"); + } + + std::vector dependent_events; + dependent_events.reserve(depends.size() + 1); + dependent_events.insert(dependent_events.end(), copy_shape_ev); + dependent_events.insert(dependent_events.end(), depends.begin(), + depends.end()); + + size_t total_set = strided_fn(exec_q, mask_size, mask_data, nd, offset, + shape_strides, cumsum_data, dependent_events); + + sycl::event::wait(host_task_events); + sycl::free(shape_strides, exec_q); + + return total_set; +} + +// Masked extraction + +using dpctl::tensor::kernels::indexing:: + masked_extract_all_slices_strided_impl_fn_ptr_t; + +static masked_extract_all_slices_strided_impl_fn_ptr_t + masked_extract_all_slices_strided_impl_dispatch_vector + [dpctl::tensor::detail::num_types]; + +using dpctl::tensor::kernels::indexing:: + masked_extract_some_slices_strided_impl_fn_ptr_t; + +static masked_extract_some_slices_strided_impl_fn_ptr_t + masked_extract_some_slices_strided_impl_dispatch_vector + [dpctl::tensor::detail::num_types]; + +void populate_masked_extract_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::indexing::MaskExtractAllSlicesStridedFactory; + dpctl::tensor::detail::DispatchVectorBuilder< + masked_extract_all_slices_strided_impl_fn_ptr_t, + MaskExtractAllSlicesStridedFactory, dpctl::tensor::detail::num_types> + dvb1; + dvb1.populate_dispatch_vector( + masked_extract_all_slices_strided_impl_dispatch_vector); + + using dpctl::tensor::kernels::indexing::MaskExtractSomeSlicesStridedFactory; + dpctl::tensor::detail::DispatchVectorBuilder< + masked_extract_some_slices_strided_impl_fn_ptr_t, + MaskExtractSomeSlicesStridedFactory, dpctl::tensor::detail::num_types> + dvb2; + dvb2.populate_dispatch_vector( + masked_extract_some_slices_strided_impl_dispatch_vector); +} + +std::pair +py_extract(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray cumsum, + int axis_start, // axis_start <= mask_i < axis_end + int axis_end, + dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + std::vector const &depends) +{ + int src_nd = src.get_ndim(); + if ((axis_start < 0 || axis_end > src_nd || axis_start >= axis_end)) { + throw py::value_error("Specified axes_start and axes_end are invalid."); + } + int mask_span_sz = axis_end - axis_start; + + int dst_nd = dst.get_ndim(); + if (src_nd != dst_nd + (mask_span_sz - 1)) { + throw py::value_error("Number of dimensions of source and destination " + "arrays is not consistent"); + } + + if (!cumsum.is_c_contiguous() || cumsum.get_ndim() != 1) { + throw py::value_error("cumsum array must be a C-contiguous vector"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, cumsum, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + py::ssize_t cumsum_sz = cumsum.get_size(); + + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + bool same_ortho_dims(true); + size_t ortho_nelems(1); // number of orthogonal iterations + + for (auto i = 0; i < axis_start; ++i) { + auto src_sh_i = src_shape[i]; + ortho_nelems *= src_sh_i; + same_ortho_dims = same_ortho_dims && (src_sh_i == dst_shape[i]); + } + for (auto i = axis_end; i < src_nd; ++i) { + auto src_sh_i = src_shape[i]; + ortho_nelems *= src_sh_i; + same_ortho_dims = + same_ortho_dims && (src_sh_i == dst_shape[i - (mask_span_sz - 1)]); + } + + size_t masked_src_nelems(1); + size_t masked_dst_nelems(dst_shape[axis_start]); + for (auto i = axis_start; i < axis_end; ++i) { + masked_src_nelems *= 
src_shape[i]; + } + + // masked_dst_nelems is number of set elements in the mask, or last element + // in cumsum + if (!same_ortho_dims || + (masked_src_nelems != static_cast(cumsum_sz))) { + throw py::value_error("Inconsistent array dimensions"); + } + + // ensure that dst is sufficiently ample + auto dst_offsets = dst.get_minmax_offsets(); + // destination must be ample enough to accomodate all elements + { + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < static_cast(ortho_nelems * masked_dst_nelems)) { + throw py::value_error( + "Memory addressed by the destination array can not " + "accomodate all the " + "array elements."); + } + } + + // check that dst does not intersect with src, not with cumsum. + if (overlap(dst, cumsum) || overlap(dst, src)) { + throw py::value_error("Destination array overlaps with inputs"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + auto const &array_types = dpctl::tensor::detail::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + constexpr int int64_typeid = + static_cast(dpctl::tensor::detail::typenum_t::INT64); + if (cumsum_typeid != int64_typeid) { + throw py::value_error( + "Unexact data type of cumsum array, expecting 'int64'"); + } + + if (src_typeid != dst_typeid) { + throw py::value_error( + "Destination array must have the same elemental data types"); + } + + char *src_data_p = src.get_data(); + char *dst_data_p = dst.get_data(); + char *cumsum_data_p = cumsum.get_data(); + + auto src_shape_vec = src.get_shape_vector(); + auto src_strides_vec = src.get_strides_vector(); + + auto dst_shape_vec = dst.get_shape_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + sycl::event extract_ev; + std::vector host_task_events{}; + if (axis_start == 0 && axis_end == src_nd) { + // empty orthogonal directions + auto fn = + masked_extract_all_slices_strided_impl_dispatch_vector[src_typeid]; + + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + py::ssize_t *packed_src_shape_strides = + std::get<0>(ptr_size_event_tuple1); + sycl::event copy_src_shape_strides_ev = + std::get<2>(ptr_size_event_tuple1); + + assert(dst_shape_vec.size() == 1); + assert(dst_strides_vec.size() == 1); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_src_shape_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + extract_ev = fn(exec_q, cumsum_sz, src_data_p, cumsum_data_p, + dst_data_p, src_nd, packed_src_shape_strides, + dst_shape_vec[0], dst_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(extract_ev); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, packed_src_shape_strides] { + sycl::free(packed_src_shape_strides, ctx); + }); + }); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + else { + // non-empty othogonal directions + auto fn = + masked_extract_some_slices_strided_impl_dispatch_vector[src_typeid]; + + int masked_src_nd = mask_span_sz; + int ortho_nd = src_nd - masked_src_nd; + + using shT = std::vector; + + shT ortho_src_shape; + shT masked_src_shape; + shT ortho_src_strides; 
+ shT masked_src_strides; + _split_iteration_space(src_shape_vec, src_strides_vec, axis_start, + axis_end, ortho_src_shape, + masked_src_shape, // 4 vectors modified + ortho_src_strides, masked_src_strides); + + shT ortho_dst_shape; + shT masked_dst_shape; + shT ortho_dst_strides; + shT masked_dst_strides; + _split_iteration_space(dst_shape_vec, dst_strides_vec, axis_start, + axis_start + 1, ortho_dst_shape, + masked_dst_shape, // 4 vectors modified + ortho_dst_strides, masked_dst_strides); + + assert(ortho_src_shape.size() == static_cast(ortho_nd)); + assert(ortho_dst_shape.size() == static_cast(ortho_nd)); + assert(std::equal(ortho_src_shape.begin(), ortho_src_shape.end(), + ortho_dst_shape.begin())); + + std::vector simplified_ortho_shape; + std::vector simplified_ortho_src_strides; + std::vector simplified_ortho_dst_strides; + + const py::ssize_t *_shape = ortho_src_shape.data(); + const py::ssize_t *_src_strides = ortho_src_strides.data(); + const py::ssize_t *_dst_strides = ortho_dst_strides.data(); + constexpr py::ssize_t _itemsize = 1; // in elements + + constexpr bool is_c_contig = false; + constexpr bool is_f_contig = false; + + py::ssize_t ortho_src_offset(0); + py::ssize_t ortho_dst_offset(0); + + dpctl::tensor::py_internal::simplify_iteration_space( + ortho_nd, _shape, _src_strides, _itemsize, is_c_contig, is_f_contig, + _dst_strides, _itemsize, is_c_contig, is_f_contig, + simplified_ortho_shape, simplified_ortho_src_strides, + simplified_ortho_dst_strides, ortho_src_offset, ortho_dst_offset); + + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, simplified_ortho_shape, + simplified_ortho_src_strides, simplified_ortho_dst_strides); + py::ssize_t *packed_ortho_src_dst_shape_strides = + std::get<0>(ptr_size_event_tuple1); + sycl::event copy_shape_strides_ev1 = std::get<2>(ptr_size_event_tuple1); + + auto ptr_size_event_tuple2 = device_allocate_and_pack( + exec_q, host_task_events, masked_src_shape, masked_src_strides); + py::ssize_t *packed_masked_src_shape_strides = + std::get<0>(ptr_size_event_tuple2); + sycl::event copy_shape_strides_ev2 = std::get<2>(ptr_size_event_tuple2); + + assert(masked_dst_shape.size() == 1); + assert(masked_dst_strides.size() == 1); + + std::vector all_deps; + all_deps.reserve(depends.size() + 2); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shape_strides_ev1); + all_deps.push_back(copy_shape_strides_ev2); + + assert(all_deps.size() == depends.size() + 2); + + // OrthogIndexerT orthog_src_dst_indexer_, MaskedIndexerT + // masked_src_indexer_, MaskedIndexerT masked_dst_indexer_ + extract_ev = fn(exec_q, ortho_nelems, masked_src_nelems, src_data_p, + cumsum_data_p, dst_data_p, + // data to build orthog_src_dst_indexer + ortho_nd, packed_ortho_src_dst_shape_strides, + ortho_src_offset, ortho_dst_offset, + // data to build masked_src_indexer + masked_src_nd, packed_masked_src_shape_strides, + // data to build masked_dst_indexer, + masked_dst_shape[0], masked_dst_strides[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(extract_ev); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, packed_ortho_src_dst_shape_strides, + packed_masked_src_shape_strides] { + sycl::free(packed_ortho_src_dst_shape_strides, ctx); + sycl::free(packed_masked_src_shape_strides, ctx); + }); + }); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + + host_task_events.push_back(extract_ev); + + sycl::event 
py_obj_management_host_task_ev = dpctl::utils::keep_args_alive( + exec_q, {src, cumsum, dst}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, extract_ev); +} + +// Masked placement + +using dpctl::tensor::kernels::indexing:: + masked_place_all_slices_strided_impl_fn_ptr_t; + +static masked_place_all_slices_strided_impl_fn_ptr_t + masked_place_all_slices_strided_impl_dispatch_vector + [dpctl::tensor::detail::num_types]; + +using dpctl::tensor::kernels::indexing:: + masked_place_some_slices_strided_impl_fn_ptr_t; + +static masked_place_some_slices_strided_impl_fn_ptr_t + masked_place_some_slices_strided_impl_dispatch_vector + [dpctl::tensor::detail::num_types]; + +void populate_masked_place_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::indexing::MaskPlaceAllSlicesStridedFactory; + dpctl::tensor::detail::DispatchVectorBuilder< + masked_place_all_slices_strided_impl_fn_ptr_t, + MaskPlaceAllSlicesStridedFactory, dpctl::tensor::detail::num_types> + dvb1; + dvb1.populate_dispatch_vector( + masked_place_all_slices_strided_impl_dispatch_vector); + + using dpctl::tensor::kernels::indexing::MaskPlaceSomeSlicesStridedFactory; + dpctl::tensor::detail::DispatchVectorBuilder< + masked_place_some_slices_strided_impl_fn_ptr_t, + MaskPlaceSomeSlicesStridedFactory, dpctl::tensor::detail::num_types> + dvb2; + dvb2.populate_dispatch_vector( + masked_place_some_slices_strided_impl_dispatch_vector); +} + +/* + * @brief Copy dst[i, ortho_id] = rhs[cumsum[i] - 1, ortho_id] if cumsum[i] == + * ((i > 0) ? cumsum[i-1] + 1 : 1) + */ +std::pair +py_place(dpctl::tensor::usm_ndarray dst, + dpctl::tensor::usm_ndarray cumsum, + int axis_start, // axis_start <= mask_i < axis_end + int axis_end, + dpctl::tensor::usm_ndarray rhs, + sycl::queue exec_q, + std::vector const &depends) +{ + int dst_nd = dst.get_ndim(); + if ((axis_start < 0 || axis_end > dst_nd || axis_start >= axis_end)) { + throw py::value_error("Specified axes_start and axes_end are invalid."); + } + int mask_span_sz = axis_end - axis_start; + + int rhs_nd = rhs.get_ndim(); + if (dst_nd != rhs_nd + (mask_span_sz - 1)) { + throw py::value_error("Number of dimensions of source and destination " + "arrays is not consistent"); + } + + if (!cumsum.is_c_contiguous() || cumsum.get_ndim() != 1) { + throw py::value_error("cumsum array must be a C-contiguous vector"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst, cumsum, rhs})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + py::ssize_t cumsum_sz = cumsum.get_size(); + + const py::ssize_t *dst_shape = dst.get_shape_raw(); + const py::ssize_t *rhs_shape = rhs.get_shape_raw(); + bool same_ortho_dims(true); + size_t ortho_nelems(1); // number of orthogonal iterations + + for (auto i = 0; i < axis_start; ++i) { + auto dst_sh_i = dst_shape[i]; + ortho_nelems *= dst_sh_i; + same_ortho_dims = same_ortho_dims && (dst_sh_i == rhs_shape[i]); + } + for (auto i = axis_end; i < dst_nd; ++i) { + auto dst_sh_i = dst_shape[i]; + ortho_nelems *= dst_sh_i; + same_ortho_dims = + same_ortho_dims && (dst_sh_i == rhs_shape[i - (mask_span_sz - 1)]); + } + + size_t masked_dst_nelems(1); + for (auto i = axis_start; i < axis_end; ++i) { + masked_dst_nelems *= dst_shape[i]; + } + + if (!same_ortho_dims || + (masked_dst_nelems != static_cast(cumsum_sz))) { + throw py::value_error("Inconsistent array dimensions"); + } + + // ensure that dst is sufficiently ample + auto dst_offsets = dst.get_minmax_offsets(); + // destination must be ample 
enough to accomodate all elements + { + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < static_cast(ortho_nelems * masked_dst_nelems)) { + throw py::value_error( + "Memory addressed by the destination array can not " + "accomodate all the " + "array elements."); + } + } + + // check that dst does not intersect with src, not with cumsum. + if (overlap(dst, rhs) || overlap(dst, cumsum)) { + throw py::value_error("Destination array overlaps with inputs"); + } + + int dst_typenum = dst.get_typenum(); + int rhs_typenum = rhs.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + auto const &array_types = dpctl::tensor::detail::usm_ndarray_types(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + int rhs_typeid = array_types.typenum_to_lookup_id(rhs_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + constexpr int int64_typeid = + static_cast(dpctl::tensor::detail::typenum_t::INT64); + if (cumsum_typeid != int64_typeid) { + throw py::value_error( + "Unexact data type of cumsum array, expecting 'int64'"); + } + + // FIXME: should types be the same? + if (dst_typeid != rhs_typeid) { + throw py::value_error( + "Destination array must have the same elemental data types"); + } + + char *dst_data_p = dst.get_data(); + char *rhs_data_p = rhs.get_data(); + char *cumsum_data_p = cumsum.get_data(); + + auto dst_shape_vec = dst.get_shape_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + auto rhs_shape_vec = rhs.get_shape_vector(); + auto rhs_strides_vec = rhs.get_strides_vector(); + + sycl::event extract_ev; + std::vector host_task_events{}; + if (axis_start == 0 && axis_end == dst_nd) { + // empty orthogonal directions + auto fn = + masked_place_all_slices_strided_impl_dispatch_vector[dst_typeid]; + + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, dst_shape_vec, dst_strides_vec); + py::ssize_t *packed_dst_shape_strides = + std::get<0>(ptr_size_event_tuple1); + sycl::event copy_dst_shape_strides_ev = + std::get<2>(ptr_size_event_tuple1); + + assert(rhs_shape_vec.size() == 1); + assert(rhs_strides_vec.size() == 1); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_dst_shape_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + extract_ev = fn(exec_q, cumsum_sz, dst_data_p, cumsum_data_p, + rhs_data_p, dst_nd, packed_dst_shape_strides, + rhs_shape_vec[0], rhs_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(extract_ev); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, packed_dst_shape_strides] { + sycl::free(packed_dst_shape_strides, ctx); + }); + }); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + else { + // non-empty othogonal directions + auto fn = + masked_place_some_slices_strided_impl_dispatch_vector[dst_typeid]; + + int masked_dst_nd = mask_span_sz; + int ortho_nd = dst_nd - masked_dst_nd; + + using shT = std::vector; + + shT ortho_dst_shape; + shT masked_dst_shape; + shT ortho_dst_strides; + shT masked_dst_strides; + _split_iteration_space(dst_shape_vec, dst_strides_vec, axis_start, + axis_end, ortho_dst_shape, + masked_dst_shape, // 4 vectors modified + ortho_dst_strides, masked_dst_strides); + + shT ortho_rhs_shape; + shT masked_rhs_shape; + shT ortho_rhs_strides; + shT masked_rhs_strides; + 
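+        // rhs has a single dimension in place of the masked span of dst
+        // (rhs_nd == dst_nd - (mask_span_sz - 1)), so its iteration space is
+        // split at [axis_start, axis_start + 1); everything else is treated
+        // as orthogonal directions.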
_split_iteration_space(rhs_shape_vec, rhs_strides_vec, axis_start, + axis_start + 1, ortho_rhs_shape, + masked_rhs_shape, // 4 vectors modified + ortho_rhs_strides, masked_rhs_strides); + + assert(ortho_dst_shape.size() == static_cast(ortho_nd)); + assert(ortho_rhs_shape.size() == static_cast(ortho_nd)); + assert(std::equal(ortho_dst_shape.begin(), ortho_dst_shape.end(), + ortho_rhs_shape.begin())); + + std::vector simplified_ortho_shape; + std::vector simplified_ortho_dst_strides; + std::vector simplified_ortho_rhs_strides; + + const py::ssize_t *_shape = ortho_dst_shape.data(); + const py::ssize_t *_dst_strides = ortho_dst_strides.data(); + const py::ssize_t *_rhs_strides = ortho_rhs_strides.data(); + constexpr py::ssize_t _itemsize = 1; // in elements + + constexpr bool is_c_contig = false; + constexpr bool is_f_contig = false; + + py::ssize_t ortho_dst_offset(0); + py::ssize_t ortho_rhs_offset(0); + + dpctl::tensor::py_internal::simplify_iteration_space( + ortho_nd, _shape, _dst_strides, _itemsize, is_c_contig, is_f_contig, + _rhs_strides, _itemsize, is_c_contig, is_f_contig, + simplified_ortho_shape, simplified_ortho_dst_strides, + simplified_ortho_rhs_strides, ortho_dst_offset, ortho_rhs_offset); + + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, simplified_ortho_shape, + simplified_ortho_dst_strides, simplified_ortho_rhs_strides); + py::ssize_t *packed_ortho_dst_rhs_shape_strides = + std::get<0>(ptr_size_event_tuple1); + sycl::event copy_shape_strides_ev1 = std::get<2>(ptr_size_event_tuple1); + + auto ptr_size_event_tuple2 = device_allocate_and_pack( + exec_q, host_task_events, masked_dst_shape, masked_dst_strides); + py::ssize_t *packed_masked_dst_shape_strides = + std::get<0>(ptr_size_event_tuple2); + sycl::event copy_shape_strides_ev2 = std::get<2>(ptr_size_event_tuple2); + + assert(masked_rhs_shape.size() == 1); + assert(masked_rhs_strides.size() == 1); + + std::vector all_deps; + all_deps.reserve(depends.size() + 2); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shape_strides_ev1); + all_deps.push_back(copy_shape_strides_ev2); + + assert(all_deps.size() == depends.size() + 2); + + extract_ev = fn(exec_q, ortho_nelems, masked_dst_nelems, dst_data_p, + cumsum_data_p, rhs_data_p, + // data to build orthog_dst_rhs_indexer + ortho_nd, packed_ortho_dst_rhs_shape_strides, + ortho_dst_offset, ortho_rhs_offset, + // data to build masked_dst_indexer + masked_dst_nd, packed_masked_dst_shape_strides, + // data to build masked_dst_indexer, + masked_rhs_shape[0], masked_rhs_strides[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(extract_ev); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, packed_ortho_dst_rhs_shape_strides, + packed_masked_dst_shape_strides] { + sycl::free(packed_ortho_dst_rhs_shape_strides, ctx); + sycl::free(packed_masked_dst_shape_strides, ctx); + }); + }); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + + host_task_events.push_back(extract_ev); + + sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive( + exec_q, {dst, cumsum, rhs}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, extract_ev); +} + +// Non-zero + +std::pair py_nonzero( + dpctl::tensor::usm_ndarray cumsum, // int64 input array, 1D, C-contiguous + dpctl::tensor::usm_ndarray indexes, // int64 2D output array, C-contiguous + std::vector + mask_shape, // shape of array from which cumsum 
was computed + sycl::queue exec_q, + std::vector const &depends) +{ + if (!dpctl::utils::queues_are_compatible(exec_q, {cumsum, indexes})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + int cumsum_nd = cumsum.get_ndim(); + if (cumsum_nd != 1 || !cumsum.is_c_contiguous()) { + throw py::value_error("Cumsum array must be a C-contiguous vector"); + } + + int indexes_nd = indexes.get_ndim(); + if (indexes_nd != 2 || !indexes.is_c_contiguous()) { + throw py::value_error("Index array must be a C-contiguous matrix"); + } + + size_t _ndim = mask_shape.size(); + if (_ndim > std::numeric_limits::max()) { + throw py::value_error("Shape is too large"); + } + int ndim = static_cast(_ndim); + + const py::ssize_t *indexes_shape = indexes.get_shape_raw(); + + if (ndim != indexes_shape[0]) { + throw py::value_error( + "Length of shape must equal width of index matrix"); + } + + auto cumsum_sz = cumsum.get_size(); + py::ssize_t shape_nelems = + std::accumulate(mask_shape.begin(), mask_shape.end(), py::ssize_t(1), + std::multiplies()); + + if (cumsum_sz != shape_nelems) { + throw py::value_error("Shape and cumsum size are not constent"); + } + + py::ssize_t nz_elems = indexes_shape[1]; + + int indexes_typenum = indexes.get_typenum(); + auto const &array_types = dpctl::tensor::detail::usm_ndarray_types(); + int indexes_typeid = array_types.typenum_to_lookup_id(indexes_typenum); + + int cumsum_typenum = cumsum.get_typenum(); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + // cumsum must be int64_t only + constexpr int int64_typeid = + static_cast(dpctl::tensor::detail::typenum_t::INT64); + if (cumsum_typeid != int64_typeid || indexes_typeid != int64_typeid) { + throw py::value_error( + "Cumulative sum array and index array must have int64 data-type"); + } + + if (cumsum_sz == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + if (overlap(cumsum, indexes)) { + throw py::value_error("Arrays are expected to ave no memory overlap"); + } + + // ensure that dst is sufficiently ample + auto indexes_offsets = indexes.get_minmax_offsets(); + // destination must be ample enough to accomodate all elements + { + size_t range = + static_cast(indexes_offsets.second - indexes_offsets.first); + if (range + 1 < static_cast(nz_elems * _ndim)) { + throw py::value_error( + "Memory addressed by the destination array can not " + "accomodate all the array elements."); + } + } + + std::vector host_task_events; + host_task_events.reserve(2); + + auto mask_shape_copying_tuple = device_allocate_and_pack( + exec_q, host_task_events, mask_shape); + py::ssize_t *src_shape_device_ptr = std::get<0>(mask_shape_copying_tuple); + sycl::event copy_ev = std::get<2>(mask_shape_copying_tuple); + + if (src_shape_device_ptr == nullptr) { + sycl::event::wait(host_task_events); + throw std::runtime_error("Device allocation failed"); + } + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_ev); + + using dpctl::tensor::kernels::indexing::non_zero_indexes_impl; + + sycl::event non_zero_indexes_ev = + non_zero_indexes_impl( + exec_q, cumsum_sz, nz_elems, ndim, cumsum.get_data(), + indexes.get_data(), src_shape_device_ptr, all_deps); + + sycl::event temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(non_zero_indexes_ev); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, src_shape_device_ptr] { + 
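+            // runs on the host only after non_zero_indexes_ev has completed,
+            // so the packed shape buffer is released once the kernel that
+            // reads it is done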
sycl::free(src_shape_device_ptr, ctx); + }); + }); + host_task_events.push_back(temporaries_cleanup_ev); + + sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive( + exec_q, {cumsum, indexes}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, + temporaries_cleanup_ev); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.hpp new file mode 100644 index 0000000000..f165fe5118 --- /dev/null +++ b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.hpp @@ -0,0 +1,84 @@ +//===-- boolean_advanced_indexing.hpp - --*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file declares Python API for implementation functions of +/// dpctl.tensor.place, dpctl.tensor.extract, and dpctl.tensor.nonzero +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void populate_mask_positions_dispatch_vectors(void); + +extern size_t py_mask_positions(dpctl::tensor::usm_ndarray mask, + dpctl::tensor::usm_ndarray cumsum, + sycl::queue exec_q, + std::vector const &depends = {}); + +extern std::pair +py_extract(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray cumsum, + int axis_start, // axis_start <= mask_i < axis_end + int axis_end, + dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + std::vector const &depends = {}); + +extern void populate_masked_extract_dispatch_vectors(void); + +extern std::pair +py_place(dpctl::tensor::usm_ndarray dst, + dpctl::tensor::usm_ndarray cumsum, + int axis_start, // axis_start <= mask_i < axis_end + int axis_end, + dpctl::tensor::usm_ndarray rhs, + sycl::queue exec_q, + std::vector const &depends = {}); + +extern void populate_masked_place_dispatch_vectors(void); + +extern std::pair py_nonzero( + dpctl::tensor::usm_ndarray cumsum, // int64 input array, 1D, C-contiguous + dpctl::tensor::usm_ndarray indexes, // int64 2D output array, C-contiguous + std::vector + mask_shape, // shape of array from which cumsum was computed + sycl::queue exec_q, + std::vector const &depends = {}); + +/* @brief Check if memory regions underlying two arrays have an overlap */ +extern bool overlap(dpctl::tensor::usm_ndarray ar1, + dpctl::tensor::usm_ndarray ar2); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp b/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp index be4a35fb90..7eb7c8f8d6 100644 --- a/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp +++ 
b/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp @@ -39,6 +39,86 @@ namespace py = pybind11; using dpctl::tensor::c_contiguous_strides; using dpctl::tensor::f_contiguous_strides; +void simplify_iteration_space_1(int &nd, + const py::ssize_t *&shape, + const py::ssize_t *&strides, + py::ssize_t itemsize, + bool is_c_contig, + bool is_f_contig, + std::vector &simplified_shape, + std::vector &simplified_strides, + py::ssize_t &offset) +{ + if (nd > 1) { + // Simplify iteration space to reduce dimensionality + // and improve access pattern + simplified_shape.reserve(nd); + for (int i = 0; i < nd; ++i) { + simplified_shape.push_back(shape[i]); + } + + simplified_strides.reserve(nd); + if (strides == nullptr) { + if (is_c_contig) { + simplified_strides = c_contiguous_strides(nd, shape, itemsize); + } + else if (is_f_contig) { + simplified_strides = f_contiguous_strides(nd, shape, itemsize); + } + else { + throw std::runtime_error( + "Array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + for (int i = 0; i < nd; ++i) { + simplified_strides.push_back(strides[i]); + } + } + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_strides.size() == static_cast(nd)); + int contracted_nd = simplify_iteration_stride( + nd, simplified_shape.data(), simplified_strides.data(), + offset // modified by reference + ); + simplified_shape.resize(contracted_nd); + simplified_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + // Populate vectors + simplified_shape.reserve(nd); + simplified_shape.push_back(shape[0]); + + simplified_strides.reserve(nd); + + if (strides == nullptr) { + if (is_c_contig) { + simplified_strides.push_back(itemsize); + } + else if (is_f_contig) { + simplified_strides.push_back(itemsize); + } + else { + throw std::runtime_error( + "Array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + simplified_strides.push_back(strides[0]); + } + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_strides.size() == static_cast(nd)); + } + shape = const_cast(simplified_shape.data()); + strides = const_cast(simplified_strides.data()); +} + void simplify_iteration_space(int &nd, const py::ssize_t *&shape, const py::ssize_t *&src_strides, @@ -173,6 +253,195 @@ void simplify_iteration_space(int &nd, const_cast(simplified_dst_strides.data()); } +void simplify_iteration_space_3( + int &nd, + const py::ssize_t *&shape, + // src1 + const py::ssize_t *&src1_strides, + py::ssize_t src1_itemsize, + bool is_src1_c_contig, + bool is_src1_f_contig, + // src2 + const py::ssize_t *&src2_strides, + py::ssize_t src2_itemsize, + bool is_src2_c_contig, + bool is_src2_f_contig, + // dst + const py::ssize_t *&dst_strides, + py::ssize_t dst_itemsize, + bool is_dst_c_contig, + bool is_dst_f_contig, + // output + std::vector &simplified_shape, + std::vector &simplified_src1_strides, + std::vector &simplified_src2_strides, + std::vector &simplified_dst_strides, + py::ssize_t &src1_offset, + py::ssize_t &src2_offset, + py::ssize_t &dst_offset) +{ + if (nd > 1) { + // Simplify iteration space to reduce dimensionality + // and improve access pattern + simplified_shape.reserve(nd); + for (int i = 0; i < nd; ++i) { + simplified_shape.push_back(shape[i]); + } + + simplified_src1_strides.reserve(nd); + simplified_src2_strides.reserve(nd); + simplified_dst_strides.reserve(nd); + if (src1_strides == nullptr) { + if (is_src1_c_contig) { + simplified_src1_strides = + 
c_contiguous_strides(nd, shape, src1_itemsize); + } + else if (is_src1_f_contig) { + simplified_src1_strides = + f_contiguous_strides(nd, shape, src1_itemsize); + } + else { + throw std::runtime_error( + "Source array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + for (int i = 0; i < nd; ++i) { + simplified_src1_strides.push_back(src1_strides[i]); + } + } + if (src2_strides == nullptr) { + if (is_src2_c_contig) { + simplified_src2_strides = + c_contiguous_strides(nd, shape, src2_itemsize); + } + else if (is_src2_f_contig) { + simplified_src2_strides = + f_contiguous_strides(nd, shape, src2_itemsize); + } + else { + throw std::runtime_error( + "Source array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + for (int i = 0; i < nd; ++i) { + simplified_src2_strides.push_back(src2_strides[i]); + } + } + if (dst_strides == nullptr) { + if (is_dst_c_contig) { + simplified_dst_strides = + c_contiguous_strides(nd, shape, dst_itemsize); + } + else if (is_dst_f_contig) { + simplified_dst_strides = + f_contiguous_strides(nd, shape, dst_itemsize); + } + else { + throw std::runtime_error( + "Destination array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + for (int i = 0; i < nd; ++i) { + simplified_dst_strides.push_back(dst_strides[i]); + } + } + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_src1_strides.size() == static_cast(nd)); + assert(simplified_src2_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + int contracted_nd = simplify_iteration_three_strides( + nd, simplified_shape.data(), simplified_src1_strides.data(), + simplified_src2_strides.data(), simplified_dst_strides.data(), + src1_offset, // modified by reference + src2_offset, // modified by reference + dst_offset // modified by reference + ); + simplified_shape.resize(contracted_nd); + simplified_src1_strides.resize(contracted_nd); + simplified_src2_strides.resize(contracted_nd); + simplified_dst_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + // Populate vectors + simplified_shape.reserve(nd); + simplified_shape.push_back(shape[0]); + + simplified_src1_strides.reserve(nd); + simplified_src2_strides.reserve(nd); + simplified_dst_strides.reserve(nd); + + if (src1_strides == nullptr) { + if (is_src1_c_contig) { + simplified_src1_strides.push_back(src1_itemsize); + } + else if (is_src1_f_contig) { + simplified_src1_strides.push_back(src1_itemsize); + } + else { + throw std::runtime_error( + "Source array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + simplified_src1_strides.push_back(src1_strides[0]); + } + if (src2_strides == nullptr) { + if (is_src2_c_contig) { + simplified_src2_strides.push_back(src2_itemsize); + } + else if (is_src2_f_contig) { + simplified_src2_strides.push_back(src2_itemsize); + } + else { + throw std::runtime_error( + "Source array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + simplified_src2_strides.push_back(src2_strides[0]); + } + if (dst_strides == nullptr) { + if (is_dst_c_contig) { + simplified_dst_strides.push_back(dst_itemsize); + } + else if (is_dst_f_contig) { + simplified_dst_strides.push_back(dst_itemsize); + } + else { + throw std::runtime_error( + "Destination array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + 
simplified_dst_strides.push_back(dst_strides[0]); + } + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_src1_strides.size() == static_cast(nd)); + assert(simplified_src2_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + } + shape = const_cast(simplified_shape.data()); + src1_strides = + const_cast(simplified_src1_strides.data()); + src2_strides = + const_cast(simplified_src2_strides.data()); + dst_strides = + const_cast(simplified_dst_strides.data()); +} + } // namespace py_internal } // namespace tensor } // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp b/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp index 515e795d20..ec0cc286d4 100644 --- a/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp +++ b/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp @@ -36,6 +36,16 @@ namespace py_internal namespace py = pybind11; +void simplify_iteration_space_1(int &, + const py::ssize_t *&, + const py::ssize_t *&, + py::ssize_t, + bool, + bool, + std::vector &, + std::vector &, + py::ssize_t &); + void simplify_iteration_space(int &, const py::ssize_t *&, const py::ssize_t *&, @@ -52,6 +62,32 @@ void simplify_iteration_space(int &, py::ssize_t &, py::ssize_t &); +void simplify_iteration_space_3(int &, + const py::ssize_t *&, + // src1 + const py::ssize_t *&, + py::ssize_t, + bool, + bool, + // src2 + const py::ssize_t *&, + py::ssize_t, + bool, + bool, + // dst + const py::ssize_t *&, + py::ssize_t, + bool, + bool, + // output + std::vector &, + std::vector &, + std::vector &, + std::vector &, + py::ssize_t &, + py::ssize_t &, + py::ssize_t &); + } // namespace py_internal } // namespace tensor } // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp index e164be2421..2e9e981a37 100644 --- a/dpctl/tensor/libtensor/source/tensor_py.cpp +++ b/dpctl/tensor/libtensor/source/tensor_py.cpp @@ -33,6 +33,7 @@ #include "dpctl4pybind11.hpp" +#include "boolean_advanced_indexing.hpp" #include "copy_and_cast_usm_to_usm.hpp" #include "copy_for_reshape.hpp" #include "copy_numpy_ndarray_into_usm_ndarray.hpp" @@ -75,6 +76,12 @@ using dpctl::tensor::py_internal::usm_ndarray_full; using dpctl::tensor::py_internal::usm_ndarray_put; using dpctl::tensor::py_internal::usm_ndarray_take; +using dpctl::tensor::py_internal::overlap; +using dpctl::tensor::py_internal::py_extract; +using dpctl::tensor::py_internal::py_mask_positions; +using dpctl::tensor::py_internal::py_nonzero; +using dpctl::tensor::py_internal::py_place; + /* ================ Eye ================== */ using dpctl::tensor::py_internal::usm_ndarray_eye; @@ -105,6 +112,10 @@ void init_dispatch_vectors(void) init_eye_ctor_dispatch_vectors(); init_triul_ctor_dispatch_vectors(); + populate_mask_positions_dispatch_vectors(); + populate_masked_extract_dispatch_vectors(); + populate_masked_place_dispatch_vectors(); + return; } @@ -252,4 +263,24 @@ PYBIND11_MODULE(_tensor_impl, m) m.def("_triu", triu_fn, "Triu helper function.", py::arg("src"), py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), py::arg("depends") = py::list()); + + m.def("mask_positions", &py_mask_positions, "", py::arg("mask"), + py::arg("cumsum"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + m.def("_extract", &py_extract, "", py::arg("src"), py::arg("cumsum"), + py::arg("axis_start"), py::arg("axis_end"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = 
py::list()); + + m.def("_array_overlap", &overlap, + "Determines if the memory regions indexed by each array overlap", + py::arg("array1"), py::arg("array2")); + + m.def("_place", &py_place, "", py::arg("dst"), py::arg("cumsum"), + py::arg("axis_start"), py::arg("axis_end"), py::arg("rhs"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + m.def("_nonzero", &py_nonzero, "", py::arg("cumsum"), py::arg("indexes"), + py::arg("mask_shape"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); } From c1f00812b35becd29c8af54df2e049dd7033f136 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Mar 2023 10:31:52 -0600 Subject: [PATCH 47/57] Added missing include --- .../libtensor/include/kernels/boolean_advanced_indexing.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp index b42b7869d2..71313e9a27 100644 --- a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp @@ -32,6 +32,7 @@ #include #include "utils/strided_iters.hpp" +#include "utils/type_dispatch.hpp" namespace dpctl { From 849a3ea2025277c7128b4e72145b4308be68153b Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Mar 2023 11:18:44 -0600 Subject: [PATCH 48/57] Hooked up boolean indexing, first attempt --- dpctl/tensor/__init__.py | 5 +- dpctl/tensor/_copy_utils.py | 147 +++++++++++++++++++--------- dpctl/tensor/_indexing_functions.py | 118 ++++++++++++++++++++++ dpctl/tensor/_usmarray.pyx | 14 +-- 4 files changed, 229 insertions(+), 55 deletions(-) diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index d21958b4fa..2a2afd60a4 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -58,7 +58,7 @@ ) from dpctl.tensor._device import Device from dpctl.tensor._dlpack import from_dlpack -from dpctl.tensor._indexing_functions import put, take +from dpctl.tensor._indexing_functions import extract, nonzero, place, put, take from dpctl.tensor._manipulation_functions import ( broadcast_arrays, broadcast_to, @@ -115,6 +115,9 @@ "squeeze", "take", "put", + "extract", + "place", + "nonzero", "from_numpy", "to_numpy", "asnumpy", diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py index 597db87c49..72b9e0a021 100644 --- a/dpctl/tensor/_copy_utils.py +++ b/dpctl/tensor/_copy_utils.py @@ -389,45 +389,75 @@ def astype(usm_ary, newdtype, order="K", casting="unsafe", copy=True): return R -def _mock_extract(ary, ary_mask, p): - exec_q = dpctl.utils.get_execution_queue( - ( - ary.sycl_queue, - ary_mask.sycl_queue, +def _extract_impl(ary, ary_mask, axis=0): + """Extract elements of ary by applying mask starting from slot + dimension axis""" + if not isinstance(ary, dpt.usm_ndarray): + raise TypeError( + f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}" + ) + if not isinstance(ary_mask, dpt.usm_ndarray): + raise TypeError( + f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary_mask)}" ) + exec_q = dpctl.utils.get_execution_queue( + (ary.sycl_queue, ary_mask.sycl_queue) ) if exec_q is None: raise dpctl.utils.ExecutionPlacementError( - "Can not automatically determine where to allocate the " - "result or performance execution. " - "Use `usm_ndarray.to_device` method to migrate data to " - "be associated with the same queue." + "arrays have different associated queues. " + "Use `Y.to_device(X.device)` to migrate." 
) - - res_usm_type = dpctl.utils.get_coerced_usm_type( - ( - ary.usm_type, - ary_mask.usm_type, + ary_nd = ary.ndim + pp = normalize_axis_index(operator.index(axis), ary_nd) + mask_nd = ary_mask.ndim + if pp < 0 or pp + mask_nd > ary_nd: + raise ValueError( + "Parameter p is inconsistent with input array dimensions" ) + mask_nelems = ary_mask.size + cumsum = dpt.empty(mask_nelems, dtype=dpt.int64, device=ary_mask.device) + exec_q = cumsum.sycl_queue + mask_count = ti.mask_positions(ary_mask, cumsum, sycl_queue=exec_q) + dst_shape = ary.shape[:pp] + (mask_count,) + ary.shape[pp + mask_nd :] + dst = dpt.empty( + dst_shape, dtype=ary.dtype, usm_type=ary.usm_type, device=ary.device ) - ary_np = dpt.asnumpy(ary) - mask_np = dpt.asnumpy(ary_mask) - res_np = ary_np[(slice(None),) * p + (mask_np,)] - res = dpt.empty( - res_np.shape, dtype=ary.dtype, usm_type=res_usm_type, sycl_queue=exec_q + hev, _ = ti._extract( + src=ary, + cumsum=cumsum, + axis_start=pp, + axis_end=pp + mask_nd, + dst=dst, + sycl_queue=exec_q, ) - res[...] = res_np - return res + hev.wait() + return dst -def _mock_nonzero(ary): +def _nonzero_impl(ary): if not isinstance(ary, dpt.usm_ndarray): - raise TypeError - q = ary.sycl_queue + raise TypeError( + f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}" + ) + exec_q = ary.sycl_queue usm_type = ary.usm_type - ary_np = dpt.asnumpy(ary) - nz = ary_np.nonzero() - return tuple(dpt.asarray(i, usm_type=usm_type, sycl_queue=q) for i in nz) + mask_nelems = ary.size + cumsum = dpt.empty( + mask_nelems, dtype=dpt.int64, sycl_queue=exec_q, order="C" + ) + mask_count = ti.mask_positions(ary, cumsum, sycl_queue=exec_q) + indexes = dpt.empty( + (ary.ndim, mask_count), + dtype=cumsum.dtype, + usm_type=usm_type, + sycl_queue=exec_q, + order="C", + ) + hev, _ = ti._nonzero(cumsum, indexes, ary.shape, exec_q) + res = tuple(indexes[i, :] for i in range(ary.ndim)) + hev.wait() + return res def _take_multi_index(ary, inds, p): @@ -473,34 +503,57 @@ def _take_multi_index(ary, inds, p): return res -def _mock_place(ary, ary_mask, p, vals): +def _place_impl(ary, ary_mask, vals, axis=0): + """Extract elements of ary by applying mask starting from slot + dimension axis""" if not isinstance(ary, dpt.usm_ndarray): - raise TypeError + raise TypeError( + f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}" + ) if not isinstance(ary_mask, dpt.usm_ndarray): - raise TypeError + raise TypeError( + f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary_mask)}" + ) + if not isinstance(vals, dpt.usm_ndarray): + raise TypeError( + f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary_mask)}" + ) exec_q = dpctl.utils.get_execution_queue( - (ary.sycl_queue, ary_mask.sycl_queue) + (ary.sycl_queue, ary_mask.sycl_queue, vals.sycl_queue) ) - if exec_q is not None and isinstance(vals, dpt.usm_ndarray): - exec_q = dpctl.utils.get_execution_queue((exec_q, vals.sycl_queue)) if exec_q is None: raise dpctl.utils.ExecutionPlacementError( - "Can not automatically determine where to allocate the " - "result or performance execution. " - "Use `usm_ndarray.to_device` method to migrate data to " - "be associated with the same queue." + "arrays have different associated queues. " + "Use `Y.to_device(X.device)` to migrate." 
) - - ary_np = dpt.asnumpy(ary) - mask_np = dpt.asnumpy(ary_mask) - if isinstance(vals, dpt.usm_ndarray) or hasattr( - vals, "__sycl_usm_array_interface__" - ): - vals_np = dpt.asnumpy(vals) + ary_nd = ary.ndim + pp = normalize_axis_index(operator.index(axis), ary_nd) + mask_nd = ary_mask.ndim + if pp < 0 or pp + mask_nd > ary_nd: + raise ValueError( + "Parameter p is inconsistent with input array dimensions" + ) + mask_nelems = ary_mask.size + cumsum = dpt.empty(mask_nelems, dtype=dpt.int64, device=ary_mask.device) + exec_q = cumsum.sycl_queue + mask_count = ti.mask_positions(ary_mask, cumsum, sycl_queue=exec_q) + expected_vals_shape = ( + ary.shape[:pp] + (mask_count,) + ary.shape[pp + mask_nd :] + ) + if vals.dtype == ary.dtype: + rhs = vals else: - vals_np = vals - ary_np[(slice(None),) * p + (mask_np,)] = vals_np - ary[...] = ary_np + rhs = dpt.astype(vals, ary.dtype) + rhs = dpt.broadcast_to(rhs, expected_vals_shape) + hev, _ = ti._place( + dst=ary, + cumsum=cumsum, + axis_start=pp, + axis_end=pp + mask_nd, + rhs=rhs, + sycl_queue=exec_q, + ) + hev.wait() return diff --git a/dpctl/tensor/_indexing_functions.py b/dpctl/tensor/_indexing_functions.py index 12f7b2d72e..01f1a2370a 100644 --- a/dpctl/tensor/_indexing_functions.py +++ b/dpctl/tensor/_indexing_functions.py @@ -23,6 +23,8 @@ import dpctl.tensor as dpt from dpctl.tensor._tensor_impl import _put, _take +from ._copy_utils import _extract_impl, _nonzero_impl, _place_impl + def take(x, indices, /, *, axis=None, mode="clip"): if not isinstance(x, dpt.usm_ndarray): @@ -175,3 +177,119 @@ def put(x, indices, vals, /, *, axis=None, mode="clip"): hev, _ = _put(x, indices, vals, axis, mode, sycl_queue=exec_q) hev.wait() + + +def extract(condition, arr): + """extract(condition, arr) + + Returns the elements of an array that satisfies the condition. + + If `condition` is boolean :func:``dpctl.tensor.extract`` is + equivalent to ``arr[condition]``. + + Note that :func:``dpctl.tensor.place`` does the opposite of + :func:``dpctl.tensor.extract``. + + Args: + conditions: usm_ndarray + An array whose non-zero or True entries indicate the element + of `arr` to extract. + arr: usm_ndarray + Input array of the same size as `condition`. + + Returns: + extract: usm_ndarray + Rank 1 array of values from `arr` where `condition` is True. + """ + if not isinstance(condition, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(condition)}" + ) + if not isinstance(arr, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(arr)}" + ) + exec_q = dpctl.utils.get_execution_queue( + ( + condition.sycl_queue, + arr.sycl_queue, + ) + ) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError + if condition.shape != arr.shape: + raise ValueError("Arrays are not of the same size") + return _extract_impl(arr, condition) + + +def place(arr, mask, vals): + """place(arr, mask, vals) + + Change elements of an array based on conditional and input values. + + If `mask` is boolean :func:``dpctl.tensor.place`` is + equivalent to ``arr[condition] = vals``. + + Args: + arr: usm_ndarray + Array to put data into. + mask: usm_ndarray + Boolean mask array. Must have the same size as `arr`. + vals: usm_ndarray + Values to put into `arr`. Only the first N elements are + used, where N is the number of True values in `mask`. If + `vals` is smaller than N, it will be repeated, and if + elements of `arr` are to be masked, this sequence must be + non-empty. Array `vals` must be one dimensional. 
+ """ + if not isinstance(arr, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(arr)}" + ) + if not isinstance(mask, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(mask)}" + ) + if not isinstance(vals, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(vals)}" + ) + exec_q = dpctl.utils.get_execution_queue( + ( + arr.sycl_queue, + mask.sycl_queue, + vals.sycl_queue, + ) + ) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError + if arr.shape != mask.shape or vals.ndim != 1: + raise ValueError("Array sizes are not as required") + # FIXME + _place_impl(arr, mask, vals, axis=0) + + +def nonzero(arr): + """nonzero(arr) + + Return the indices of non-zero elements. + + Returns the tuple of usm_narrays, one for each dimension + of `arr`, containing the indices of the non-zero elements + in that dimension. The values of `arr` are always tested in + row-major, C-style order. + + Args: + arr: usm_ndarray + Input array, which has non-zero array rank. + Returns: + tuple_of_usm_ndarrays: tuple + Indices of non-zero array elements. + """ + if not isinstance(arr, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(arr)}" + ) + if arr.ndim == 0: + raise ValueError("Array of positive rank is exepcted") + return _nonzero_impl(arr) diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx index 64a492065f..1abc1e88ac 100644 --- a/dpctl/tensor/_usmarray.pyx +++ b/dpctl/tensor/_usmarray.pyx @@ -670,15 +670,15 @@ cdef class usm_ndarray: if adv_ind_start_p < 0: return res - from ._copy_utils import _mock_extract, _mock_nonzero, _take_multi_index + from ._copy_utils import _extract_impl, _nonzero_impl, _take_multi_index if len(adv_ind) == 1 and adv_ind[0].dtype == dpt_bool: - return _mock_extract(res, adv_ind[0], adv_ind_start_p) + return _extract_impl(res, adv_ind[0], axis=adv_ind_start_p) if any(ind.dtype == dpt_bool for ind in adv_ind): adv_ind_int = list() for ind in adv_ind: if ind.dtype == dpt_bool: - adv_ind_int.extend(_mock_nonzero(ind)) + adv_ind_int.extend(_nonzero_impl(ind)) else: adv_ind_int.append(ind) return _take_multi_index(res, tuple(adv_ind_int), adv_ind_start_p) @@ -1015,8 +1015,8 @@ cdef class usm_ndarray: from ._copy_utils import ( _copy_from_numpy_into, _copy_from_usm_ndarray_to_usm_ndarray, - _mock_nonzero, - _mock_place, + _nonzero_impl, + _place_impl, _put_multi_index, ) @@ -1050,14 +1050,14 @@ cdef class usm_ndarray: return if len(adv_ind) == 1 and adv_ind[0].dtype == dpt_bool: - _mock_place(Xv, adv_ind[0], adv_ind_start_p, rhs) + _place_impl(Xv, adv_ind[0], rhs, axis=adv_ind_start_p) return if any(ind.dtype == dpt_bool for ind in adv_ind): adv_ind_int = list() for ind in adv_ind: if ind.dtype == dpt_bool: - adv_ind_int.extend(_mock_nonzero(ind)) + adv_ind_int.extend(_nonzero_impl(ind)) else: adv_ind_int.append(ind) _put_multi_index(Xv, tuple(adv_ind_int), adv_ind_start_p, rhs) From ed279d63b6f6b43ef5f3b4791696216536639ca1 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Mar 2023 11:43:28 -0600 Subject: [PATCH 49/57] Changes per clang-format 11 --- dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp index 1534b38391..9689612b8a 100644 --- 
a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp @@ -62,7 +62,7 @@ template sink_t inserter(V &lhs, U &&rhs) } template -std::vector concat(std::vector lhs, Vs &&... vs) +std::vector concat(std::vector lhs, Vs &&...vs) { std::size_t s = lhs.size(); { @@ -83,7 +83,7 @@ template std::tuple device_allocate_and_pack(sycl::queue q, std::vector &host_task_events, - Vs &&... vs) + Vs &&...vs) { // memory transfer optimization, use USM-host for temporary speeds up From 3ced89a095650fbc40688294b8e343d44857f495 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Mar 2023 12:42:07 -0600 Subject: [PATCH 50/57] Used Strided1DCyclingIndexer in place implementations This allows to implement behavior of place which cycles over values of val array if that is shorter than the number of non-zero elements in the mask. --- .../kernels/boolean_advanced_indexing.hpp | 53 +++++++++++-------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp index 71313e9a27..aa0a90ce70 100644 --- a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp @@ -1,4 +1,4 @@ -//=== boolean_advance_indexing.hpp - ---*-C++-*--/===// +//=== boolean_advance_indexing.hpp - ------*-C++-*--/===// // // Data Parallel Control (dpctl) // @@ -16,11 +16,11 @@ // See the License for the specific language governing permissions and // limitations under the License. // -//===----------------------------------------------------------------------===// +//===---------------------------------------------------------------------===// /// /// \file /// This file defines kernels for advanced tensor index operations. 
-//===----------------------------------------------------------------------===// +//===---------------------------------------------------------------------===// #pragma once #include @@ -114,6 +114,26 @@ struct Strided1DIndexer py::ssize_t step = 1; }; +struct Strided1DCyclicIndexer +{ + Strided1DCyclicIndexer(py::ssize_t _offset, + py::ssize_t _size, + py::ssize_t _step) + : offset(_offset), size(static_cast(_size)), step(_step) + { + } + + size_t operator()(size_t gid) const + { + return static_cast(offset + (gid % size) * step); + } + +private: + py::ssize_t offset = 0; + size_t size = 1; + py::ssize_t step = 1; +}; + template struct ZeroChecker { @@ -762,27 +782,22 @@ sycl::event masked_place_all_slices_strided_impl( py::ssize_t rhs_stride, const std::vector &depends = {}) { - // using MaskedPlaceStridedFunctor; - // using Strided1DIndexer; - // using StridedIndexer; - // using TwoZeroOffsets_Indexer; - TwoZeroOffsets_Indexer orthog_dst_rhs_indexer{}; /* StridedIndexer(int _nd, py::ssize_t _offset, py::ssize_t const * *_packed_shape_strides) */ StridedIndexer masked_dst_indexer(nd, 0, packed_dst_shape_strides); - Strided1DIndexer masked_rhs_indexer(0, rhs_size, rhs_stride); + Strided1DCyclicIndexer masked_rhs_indexer(0, rhs_size, rhs_stride); sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); cgh.parallel_for>( + TwoZeroOffsets_Indexer, StridedIndexer, Strided1DCyclicIndexer, + dataT, indT>>( sycl::range<1>(static_cast(iteration_size)), MaskedPlaceStridedFunctor( + Strided1DCyclicIndexer, dataT, indT>( dst_p, cumsum_p, rhs_p, 1, iteration_size, orthog_dst_rhs_indexer, masked_dst_indexer, masked_rhs_indexer)); @@ -838,11 +853,6 @@ sycl::event masked_place_some_slices_strided_impl( py::ssize_t masked_rhs_stride, const std::vector &depends = {}) { - // using MaskedPlaceStridedFunctor; - // using Strided1DIndexer; - // using StridedIndexer; - // using TwoOffsets_StridedIndexer; - TwoOffsets_StridedIndexer orthog_dst_rhs_indexer{ orthog_nd, ortho_dst_offset, ortho_rhs_offset, packed_ortho_dst_rhs_shape_strides}; @@ -851,17 +861,18 @@ sycl::event masked_place_some_slices_strided_impl( * *_packed_shape_strides) */ StridedIndexer masked_dst_indexer{masked_nd, 0, packed_masked_dst_shape_strides}; - Strided1DIndexer masked_rhs_indexer{0, masked_rhs_size, masked_rhs_stride}; + Strided1DCyclicIndexer masked_rhs_indexer{0, masked_rhs_size, + masked_rhs_stride}; sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); cgh.parallel_for>( + TwoOffsets_StridedIndexer, StridedIndexer, Strided1DCyclicIndexer, + dataT, indT>>( sycl::range<1>(static_cast(orthog_nelems * masked_nelems)), MaskedPlaceStridedFunctor( + Strided1DCyclicIndexer, dataT, indT>( dst_p, cumsum_p, rhs_p, orthog_nelems, masked_nelems, orthog_dst_rhs_indexer, masked_dst_indexer, masked_rhs_indexer)); From 19691ca89ff4ee3324c3402a19a4be669ee8e138 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Mar 2023 12:44:49 -0600 Subject: [PATCH 51/57] Implemented dpctl.tensor.place as per documented behavior. 
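A minimal sketch of the cycling semantics this patch implements, assuming a
default SYCL queue is available (the same behavior is exercised by the
test_place_cycling case added later in this series):

    import dpctl.tensor as dpt

    x = dpt.zeros(10, dtype="f4")
    sel = dpt.ones(10, dtype="?")           # every element is selected
    vals = dpt.asarray([2, 3], dtype="f4")  # shorter than the selection

    # vals is reused cyclically over the selected elements
    dpt.place(x, sel, vals)
    # x is now [2, 3, 2, 3, 2, 3, 2, 3, 2, 3]
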
--- dpctl/tensor/_indexing_functions.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/dpctl/tensor/_indexing_functions.py b/dpctl/tensor/_indexing_functions.py index 01f1a2370a..20c4a22786 100644 --- a/dpctl/tensor/_indexing_functions.py +++ b/dpctl/tensor/_indexing_functions.py @@ -21,9 +21,9 @@ import dpctl import dpctl.tensor as dpt -from dpctl.tensor._tensor_impl import _put, _take +import dpctl.tensor._tensor_impl as ti -from ._copy_utils import _extract_impl, _nonzero_impl, _place_impl +from ._copy_utils import _extract_impl, _nonzero_impl def take(x, indices, /, *, axis=None, mode="clip"): @@ -95,7 +95,7 @@ def take(x, indices, /, *, axis=None, mode="clip"): res_shape, dtype=x.dtype, usm_type=res_usm_type, sycl_queue=exec_q ) - hev, _ = _take(x, indices, res, axis, mode, sycl_queue=exec_q) + hev, _ = ti._take(x, indices, res, axis, mode, sycl_queue=exec_q) hev.wait() return res @@ -175,7 +175,7 @@ def put(x, indices, vals, /, *, axis=None, mode="clip"): vals = dpt.broadcast_to(vals, val_shape) - hev, _ = _put(x, indices, vals, axis, mode, sycl_queue=exec_q) + hev, _ = ti._put(x, indices, vals, axis, mode, sycl_queue=exec_q) hev.wait() @@ -265,8 +265,23 @@ def place(arr, mask, vals): raise dpctl.utils.ExecutionPlacementError if arr.shape != mask.shape or vals.ndim != 1: raise ValueError("Array sizes are not as required") - # FIXME - _place_impl(arr, mask, vals, axis=0) + cumsum = dpt.empty(mask.size, dtype="i8", sycl_queue=exec_q) + nz_count = ti.mask_positions(mask, cumsum, sycl_queue=exec_q) + if nz_count == 0: + return + if vals.dtype == arr.dtype: + rhs = vals + else: + rhs = dpt.astype(vals, arr.dtype) + hev, _ = ti._place( + dst=arr, + cumsum=cumsum, + axis_start=0, + axis_end=mask.ndim, + rhs=rhs, + sycl_queue=exec_q, + ) + hev.wait() def nonzero(arr): From 03c48222e07201fa37567a4f1337f699f87cba1e Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 2 Mar 2023 17:24:22 -0800 Subject: [PATCH 52/57] _take and _put returned event changes - Host_tasks now collected and used as dependencies for dec_ref of py arguments - Return temporaries deallocation event to further prevent dangling host_tasks --- .../source/integer_advanced_indexing.cpp | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp index dfc74c12f0..ed0f749add 100644 --- a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -548,10 +548,12 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, }); }); - sycl::event host_task_ev = keep_args_alive( - exec_q, {src, py_ind, dst}, {take_generic_ev, temporaries_cleanup_ev}); + host_task_events.push_back(temporaries_cleanup_ev); - return std::make_pair(host_task_ev, take_generic_ev); + sycl::event arg_cleanup_ev = + keep_args_alive(exec_q, {src, py_ind, dst}, host_task_events); + + return std::make_pair(arg_cleanup_ev, temporaries_cleanup_ev); } std::pair @@ -857,7 +859,6 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, dst_offset, val_offset, packed_ind_offsets, all_deps); // free packed temporaries - sycl::event temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(put_generic_ev); auto ctx = exec_q.get_context(); @@ -872,10 +873,12 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, }); }); - sycl::event py_obj_cleanup_ev = keep_args_alive( - exec_q, {dst, py_ind, val}, 
{put_generic_ev, temporaries_cleanup_ev}); + host_task_events.push_back(temporaries_cleanup_ev); + + sycl::event arg_cleanup_ev = + keep_args_alive(exec_q, {dst, py_ind, val}, host_task_events); - return std::make_pair(temporaries_cleanup_ev, put_generic_ev); + return std::make_pair(arg_cleanup_ev, temporaries_cleanup_ev); } void init_advanced_indexing_dispatch_tables(void) From f75723b143c67f109bcf27bdb8837cd30a7d81cb Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Mar 2023 14:47:59 -0600 Subject: [PATCH 53/57] Added tests to test_usm_ndarray_indexing --- dpctl/tests/test_usm_ndarray_indexing.py | 220 +++++++++++++++++++++++ 1 file changed, 220 insertions(+) diff --git a/dpctl/tests/test_usm_ndarray_indexing.py b/dpctl/tests/test_usm_ndarray_indexing.py index 98bb674b21..aec71def7d 100644 --- a/dpctl/tests/test_usm_ndarray_indexing.py +++ b/dpctl/tests/test_usm_ndarray_indexing.py @@ -970,3 +970,223 @@ def test_advanced_indexing_compute_follows_data(): dpt.put(x, ind0, val1, axis=0) with pytest.raises(ExecutionPlacementError): x[ind0] = val1 + + +####### + + +def test_extract_all_1d(): + x = dpt.arange(30, dtype="i4") + sel = dpt.ones(30, dtype="?") + sel[::2] = False + + res = x[sel] + expected_res = dpt.asnumpy(x)[dpt.asnumpy(sel)] + assert (dpt.asnumpy(res) == expected_res).all() + + res2 = dpt.extract(sel, x) + assert (dpt.asnumpy(res2) == expected_res).all() + + +def test_extract_all_2d(): + x = dpt.reshape(dpt.arange(30, dtype="i4"), (5, 6)) + sel = dpt.ones(30, dtype="?") + sel[::2] = False + sel = dpt.reshape(sel, x.shape) + + res = x[sel] + expected_res = dpt.asnumpy(x)[dpt.asnumpy(sel)] + assert (dpt.asnumpy(res) == expected_res).all() + + res2 = dpt.extract(sel, x) + assert (dpt.asnumpy(res2) == expected_res).all() + + +def test_extract_2D_axis0(): + x = dpt.reshape(dpt.arange(30, dtype="i4"), (5, 6)) + sel = dpt.ones(x.shape[0], dtype="?") + sel[::2] = False + + res = x[sel] + expected_res = dpt.asnumpy(x)[dpt.asnumpy(sel)] + assert (dpt.asnumpy(res) == expected_res).all() + + +def test_extract_2D_axis1(): + x = dpt.reshape(dpt.arange(30, dtype="i4"), (5, 6)) + sel = dpt.ones(x.shape[1], dtype="?") + sel[::2] = False + + res = x[:, sel] + expected = dpt.asnumpy(x)[:, dpt.asnumpy(sel)] + assert (dpt.asnumpy(res) == expected).all() + + +def test_extract_begin(): + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((3, 3), dtype="?") + sel[0, 0] = True + sel[1, 1] = True + z = y[sel] + expected = dpt.asnumpy(y)[[0, 1], [0, 1]] + assert (dpt.asnumpy(z) == expected).all() + + +def test_extract_end(): + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((4, 4), dtype="?") + sel[0, 0] = True + z = y[..., sel] + expected = dpt.asnumpy(y)[..., [0], [0]] + assert (dpt.asnumpy(z) == expected).all() + + +def test_extract_middle(): + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((3, 4), dtype="?") + sel[0, 0] = True + z = y[:, sel] + expected = dpt.asnumpy(y)[:, [0], [0], :] + assert (dpt.asnumpy(z) == expected).all() + + +def test_extract_empty_result(): + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((3, 4), dtype="?") + z = y[:, sel] + assert z.shape == ( + y.shape[0], + 0, + y.shape[3], + ) + + +def test_place_all_1d(): + x = dpt.arange(10, dtype="i2") + sel = 
dpt.zeros(10, dtype="?") + sel[0::2] = True + val = dpt.zeros(5, dtype=x.dtype) + x[sel] = val + assert (dpt.asnumpy(x) == np.array([0, 1, 0, 3, 0, 5, 0, 7, 0, 9])).all() + dpt.place(x, sel, dpt.asarray(2)) + assert (dpt.asnumpy(x) == np.array([2, 1, 2, 3, 2, 5, 2, 7, 2, 9])).all() + + +def test_place_2d_axis0(): + x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) + sel = dpt.asarray([True, False, True]) + val = dpt.zeros((2, 4), dtype=x.dtype) + x[sel] = val + expected_x = np.stack( + ( + np.zeros(4, dtype="i2"), + np.arange(4, 8, dtype="i2"), + np.zeros(4, dtype="i2"), + ) + ) + assert (dpt.asnumpy(x) == expected_x).all() + + +def test_place_2d_axis1(): + x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) + sel = dpt.asarray([True, False, True, False]) + val = dpt.zeros((3, 2), dtype=x.dtype) + x[:, sel] = val + expected_x = np.array( + [[0, 1, 0, 3], [0, 5, 0, 7], [0, 9, 0, 11]], dtype="i2" + ) + assert (dpt.asnumpy(x) == expected_x).all() + + +def test_place_2d_axis1_scalar(): + x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) + sel = dpt.asarray([True, False, True, False]) + val = dpt.zeros(tuple(), dtype=x.dtype) + x[:, sel] = val + expected_x = np.array( + [[0, 1, 0, 3], [0, 5, 0, 7], [0, 9, 0, 11]], dtype="i2" + ) + assert (dpt.asnumpy(x) == expected_x).all() + + +def test_place_all_slices(): + x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) + sel = dpt.asarray( + [ + [False, True, True, False], + [True, True, False, False], + [False, False, True, True], + ], + dtype="?", + ) + y = dpt.ones_like(x) + y[sel] = x[sel] + + +def test_place_some_slices_begin(): + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((3, 3), dtype="?") + sel[0, 0] = True + sel[1, 1] = True + z = y[sel] + w = dpt.zeros_like(y) + w[sel] = z + + +def test_place_some_slices_mid(): + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((3, 4), dtype="?") + sel[0, 0] = True + sel[1, 1] = True + z = y[:, sel] + w = dpt.zeros_like(y) + w[:, sel] = z + + +def test_place_some_slices_end(): + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((4, 4), dtype="?") + sel[0, 0] = True + sel[1, 1] = True + z = y[:, :, sel] + w = dpt.zeros_like(y) + w[:, :, sel] = z + + +def test_place_cycling(): + x = dpt.zeros(10, dtype="f4") + y = dpt.asarray([2, 3]) + sel = dpt.ones(x.size, dtype="?") + dpt.place(x, sel, y) + expected = np.array( + [ + 2, + 3, + ] + * 5, + dtype=x.dtype, + ) + assert (dpt.asnumpy(x) == expected).all() + + +def test_place_subset(): + x = dpt.zeros(10, dtype="f4") + y = dpt.ones_like(x) + sel = dpt.ones(x.size, dtype="?") + sel[::2] = False + dpt.place(x, sel, y) + expected = np.array([1, 3, 5, 7, 9], dtype=x.dtype) + assert (dpt.asnumpy(x) == expected).all() + + +def test_nonzero(): + x = dpt.concat((dpt.zeros(3), dpt.ones(4), dpt.zeros(3))) + (i,) = dpt.nonzero(x) + assert dpt.asnumpy(i) == np.array([3, 4, 5, 6]).all() From cab00351ba9ba9018528dd9cf3563c0aeb31f861 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Mar 2023 15:43:17 -0600 Subject: [PATCH 54/57] Fixed tests for boolean indexing --- dpctl/tests/test_usm_ndarray_indexing.py | 28 +++++++++++++++++++----- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/dpctl/tests/test_usm_ndarray_indexing.py b/dpctl/tests/test_usm_ndarray_indexing.py index aec71def7d..bcc1fdbb60 100644 --- 
a/dpctl/tests/test_usm_ndarray_indexing.py +++ b/dpctl/tests/test_usm_ndarray_indexing.py @@ -972,10 +972,8 @@ def test_advanced_indexing_compute_follows_data(): x[ind0] = val1 -####### - - def test_extract_all_1d(): + get_queue_or_skip() x = dpt.arange(30, dtype="i4") sel = dpt.ones(30, dtype="?") sel[::2] = False @@ -989,6 +987,7 @@ def test_extract_all_1d(): def test_extract_all_2d(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(30, dtype="i4"), (5, 6)) sel = dpt.ones(30, dtype="?") sel[::2] = False @@ -1003,6 +1002,7 @@ def test_extract_all_2d(): def test_extract_2D_axis0(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(30, dtype="i4"), (5, 6)) sel = dpt.ones(x.shape[0], dtype="?") sel[::2] = False @@ -1013,6 +1013,7 @@ def test_extract_2D_axis0(): def test_extract_2D_axis1(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(30, dtype="i4"), (5, 6)) sel = dpt.ones(x.shape[1], dtype="?") sel[::2] = False @@ -1023,6 +1024,7 @@ def test_extract_2D_axis1(): def test_extract_begin(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) y = dpt.permute_dims(x, (2, 0, 3, 1)) sel = dpt.zeros((3, 3), dtype="?") @@ -1034,6 +1036,7 @@ def test_extract_begin(): def test_extract_end(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) y = dpt.permute_dims(x, (2, 0, 3, 1)) sel = dpt.zeros((4, 4), dtype="?") @@ -1044,6 +1047,7 @@ def test_extract_end(): def test_extract_middle(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) y = dpt.permute_dims(x, (2, 0, 3, 1)) sel = dpt.zeros((3, 4), dtype="?") @@ -1054,6 +1058,7 @@ def test_extract_middle(): def test_extract_empty_result(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) y = dpt.permute_dims(x, (2, 0, 3, 1)) sel = dpt.zeros((3, 4), dtype="?") @@ -1066,17 +1071,19 @@ def test_extract_empty_result(): def test_place_all_1d(): + get_queue_or_skip() x = dpt.arange(10, dtype="i2") sel = dpt.zeros(10, dtype="?") sel[0::2] = True val = dpt.zeros(5, dtype=x.dtype) x[sel] = val assert (dpt.asnumpy(x) == np.array([0, 1, 0, 3, 0, 5, 0, 7, 0, 9])).all() - dpt.place(x, sel, dpt.asarray(2)) + dpt.place(x, sel, dpt.asarray([2])) assert (dpt.asnumpy(x) == np.array([2, 1, 2, 3, 2, 5, 2, 7, 2, 9])).all() def test_place_2d_axis0(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) sel = dpt.asarray([True, False, True]) val = dpt.zeros((2, 4), dtype=x.dtype) @@ -1092,6 +1099,7 @@ def test_place_2d_axis0(): def test_place_2d_axis1(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) sel = dpt.asarray([True, False, True, False]) val = dpt.zeros((3, 2), dtype=x.dtype) @@ -1103,6 +1111,7 @@ def test_place_2d_axis1(): def test_place_2d_axis1_scalar(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) sel = dpt.asarray([True, False, True, False]) val = dpt.zeros(tuple(), dtype=x.dtype) @@ -1114,6 +1123,7 @@ def test_place_2d_axis1_scalar(): def test_place_all_slices(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) sel = dpt.asarray( [ @@ -1128,6 +1138,7 @@ def test_place_all_slices(): def test_place_some_slices_begin(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) y = dpt.permute_dims(x, (2, 0, 3, 1)) sel = dpt.zeros((3, 3), dtype="?") @@ -1139,6 +1150,7 @@ def test_place_some_slices_begin(): def test_place_some_slices_mid(): + get_queue_or_skip() x = 
dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) y = dpt.permute_dims(x, (2, 0, 3, 1)) sel = dpt.zeros((3, 4), dtype="?") @@ -1150,6 +1162,7 @@ def test_place_some_slices_mid(): def test_place_some_slices_end(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) y = dpt.permute_dims(x, (2, 0, 3, 1)) sel = dpt.zeros((4, 4), dtype="?") @@ -1161,6 +1174,7 @@ def test_place_some_slices_end(): def test_place_cycling(): + get_queue_or_skip() x = dpt.zeros(10, dtype="f4") y = dpt.asarray([2, 3]) sel = dpt.ones(x.size, dtype="?") @@ -1177,16 +1191,18 @@ def test_place_cycling(): def test_place_subset(): + get_queue_or_skip() x = dpt.zeros(10, dtype="f4") y = dpt.ones_like(x) sel = dpt.ones(x.size, dtype="?") sel[::2] = False dpt.place(x, sel, y) - expected = np.array([1, 3, 5, 7, 9], dtype=x.dtype) + expected = np.array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1], dtype=x.dtype) assert (dpt.asnumpy(x) == expected).all() def test_nonzero(): + get_queue_or_skip() x = dpt.concat((dpt.zeros(3), dpt.ones(4), dpt.zeros(3))) (i,) = dpt.nonzero(x) - assert dpt.asnumpy(i) == np.array([3, 4, 5, 6]).all() + assert (dpt.asnumpy(i) == np.array([3, 4, 5, 6])).all() From cb32c6fd2f5096040f6001e5c71ff6002013668c Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Mar 2023 18:05:54 -0600 Subject: [PATCH 55/57] Tweaks to docstrings of extract, place, nonzero --- dpctl/tensor/_indexing_functions.py | 34 ++++++++++++++--------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/dpctl/tensor/_indexing_functions.py b/dpctl/tensor/_indexing_functions.py index 20c4a22786..6f19dc3bd4 100644 --- a/dpctl/tensor/_indexing_functions.py +++ b/dpctl/tensor/_indexing_functions.py @@ -192,14 +192,14 @@ def extract(condition, arr): Args: conditions: usm_ndarray - An array whose non-zero or True entries indicate the element - of `arr` to extract. + An array whose non-zero or True entries indicate the element + of `arr` to extract. arr: usm_ndarray - Input array of the same size as `condition`. + Input array of the same size as `condition`. Returns: - extract: usm_ndarray - Rank 1 array of values from `arr` where `condition` is True. + usm_ndarray + Rank 1 array of values from `arr` where `condition` is True. """ if not isinstance(condition, dpt.usm_ndarray): raise TypeError( @@ -231,16 +231,16 @@ def place(arr, mask, vals): equivalent to ``arr[condition] = vals``. Args: - arr: usm_ndarray - Array to put data into. + arr: usm_ndarray + Array to put data into. mask: usm_ndarray - Boolean mask array. Must have the same size as `arr`. + Boolean mask array. Must have the same size as `arr`. vals: usm_ndarray - Values to put into `arr`. Only the first N elements are - used, where N is the number of True values in `mask`. If - `vals` is smaller than N, it will be repeated, and if - elements of `arr` are to be masked, this sequence must be - non-empty. Array `vals` must be one dimensional. + Values to put into `arr`. Only the first N elements are + used, where N is the number of True values in `mask`. If + `vals` is smaller than N, it will be repeated, and if + elements of `arr` are to be masked, this sequence must be + non-empty. Array `vals` must be one dimensional. """ if not isinstance(arr, dpt.usm_ndarray): raise TypeError( @@ -295,11 +295,11 @@ def nonzero(arr): row-major, C-style order. Args: - arr: usm_ndarray - Input array, which has non-zero array rank. + arr: usm_ndarray + Input array, which has non-zero array rank. 
Returns: - tuple_of_usm_ndarrays: tuple - Indices of non-zero array elements. + Tuple[usm_ndarray] + Indices of non-zero array elements. """ if not isinstance(arr, dpt.usm_ndarray): raise TypeError( From 0a7ea0c2674362b6be70b0094d1ef4a395d2cea9 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Fri, 3 Mar 2023 17:21:59 -0800 Subject: [PATCH 56/57] dpt.take and dpt.put changes - Improved conformity to array API standard - Added docstrings --- dpctl/tensor/_indexing_functions.py | 178 +++++++++++++---------- dpctl/tests/test_usm_ndarray_indexing.py | 28 ++-- 2 files changed, 117 insertions(+), 89 deletions(-) diff --git a/dpctl/tensor/_indexing_functions.py b/dpctl/tensor/_indexing_functions.py index 6f19dc3bd4..c312d9e2b9 100644 --- a/dpctl/tensor/_indexing_functions.py +++ b/dpctl/tensor/_indexing_functions.py @@ -27,43 +27,56 @@ def take(x, indices, /, *, axis=None, mode="clip"): + """take(x, indices, axis=None, mode="clip") + + Takes elements from array along a given axis. + + Args: + x: usm_ndarray + The array that elements will be taken from. + indices: usm_ndarray + One-dimensional array of indices. + axis: + The axis over which the values will be selected. + If x is one-dimensional, this argument is optional. + mode: + How out-of-bounds indices will be handled. + "Clip" - clamps indices to (-n <= i < n), then wraps + negative indices. + "Wrap" - wraps both negative and positive indices. + + Returns: + out: usm_ndarray + Array with shape x.shape[:axis] + indices.shape + x.shape[axis + 1:] + filled with elements . + """ if not isinstance(x, dpt.usm_ndarray): raise TypeError( "Expected instance of `dpt.usm_ndarray`, got `{}`.".format(type(x)) ) - if not isinstance(indices, list) and not isinstance(indices, tuple): - indices = (indices,) - - queues_ = [ - x.sycl_queue, - ] - usm_types_ = [ - x.usm_type, - ] - - for i in indices: - if not isinstance(i, dpt.usm_ndarray): - raise TypeError( - "`indices` expected `dpt.usm_ndarray`, got `{}`.".format( - type(i) - ) + if not isinstance(indices, dpt.usm_ndarray): + raise TypeError( + "`indices` expected `dpt.usm_ndarray`, got `{}`.".format( + type(indices) ) - if not np.issubdtype(i.dtype, np.integer): - raise IndexError( - "`indices` expected integer data type, got `{}`".format(i.dtype) + ) + if not np.issubdtype(indices.dtype, np.integer): + raise IndexError( + "`indices` expected integer data type, got `{}`".format( + indices.dtype ) - queues_.append(i.sycl_queue) - usm_types_.append(i.usm_type) - exec_q = dpctl.utils.get_execution_queue(queues_) - if exec_q is None: - raise dpctl.utils.ExecutionPlacementError( - "Can not automatically determine where to allocate the " - "result or performance execution. " - "Use `usm_ndarray.to_device` method to migrate data to " - "be associated with the same queue." 
) - res_usm_type = dpctl.utils.get_coerced_usm_type(usm_types_) + if indices.ndim != 1: + raise ValueError( + "`indices` expected a 1D array, got `{}`".format(indices.ndim) + ) + exec_q = dpctl.utils.get_execution_queue([x.sycl_queue, indices.sycl_queue]) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError + res_usm_type = dpctl.utils.get_coerced_usm_type( + [x.usm_type, indices.usm_type] + ) modes = {"clip": 0, "wrap": 1} try: @@ -81,27 +94,47 @@ def take(x, indices, /, *, axis=None, mode="clip"): ) axis = 0 - if len(indices) > 1: - indices = dpt.broadcast_arrays(*indices) if x_ndim > 0: axis = normalize_axis_index(operator.index(axis), x_ndim) - res_shape = ( - x.shape[:axis] + indices[0].shape + x.shape[axis + len(indices) :] - ) + res_shape = x.shape[:axis] + indices.shape + x.shape[axis + 1 :] else: - res_shape = indices[0].shape + if axis != 0: + raise ValueError("`axis` must be 0 for an array of dimension 0.") + res_shape = indices.shape res = dpt.empty( res_shape, dtype=x.dtype, usm_type=res_usm_type, sycl_queue=exec_q ) - hev, _ = ti._take(x, indices, res, axis, mode, sycl_queue=exec_q) + hev, _ = ti._take(x, (indices,), res, axis, mode, sycl_queue=exec_q) hev.wait() return res def put(x, indices, vals, /, *, axis=None, mode="clip"): + """put(x, indices, vals, axis=None, mode="clip") + + Puts values of an array into another array + along a given axis. + + Args: + x: usm_ndarray + The array the values will be put into. + indices: usm_ndarray + One-dimensional array of indices. + vals: + Array of values to be put into `x`. + Must be broadcastable to the shape of `indices`. + axis: + The axis over which the values will be placed. + If x is one-dimensional, this argument is optional. + mode: + How out-of-bounds indices will be handled. + "Clip" - clamps indices to (-axis_size <= i < axis_size), + then wraps negative indices. + "Wrap" - wraps both negative and positive indices. + """ if not isinstance(x, dpt.usm_ndarray): raise TypeError( "Expected instance of `dpt.usm_ndarray`, got `{}`.".format(type(x)) @@ -116,66 +149,61 @@ def put(x, indices, vals, /, *, axis=None, mode="clip"): usm_types_ = [ x.usm_type, ] - - if not isinstance(indices, list) and not isinstance(indices, tuple): - indices = (indices,) - - for i in indices: - if not isinstance(i, dpt.usm_ndarray): - raise TypeError( - "`indices` expected `dpt.usm_ndarray`, got `{}`.".format( - type(i) - ) + if not isinstance(indices, dpt.usm_ndarray): + raise TypeError( + "`indices` expected `dpt.usm_ndarray`, got `{}`.".format( + type(indices) ) - if not np.issubdtype(i.dtype, np.integer): - raise IndexError( - "`indices` expected integer data type, got `{}`".format(i.dtype) + ) + if indices.ndim != 1: + raise ValueError( + "`indices` expected a 1D array, got `{}`".format(indices.ndim) + ) + if not np.issubdtype(indices.dtype, np.integer): + raise IndexError( + "`indices` expected integer data type, got `{}`".format( + indices.dtype ) - queues_.append(i.sycl_queue) - usm_types_.append(i.usm_type) + ) + queues_.append(indices.sycl_queue) + usm_types_.append(indices.usm_type) exec_q = dpctl.utils.get_execution_queue(queues_) if exec_q is None: - raise dpctl.utils.ExecutionPlacementError( - "Can not automatically determine where to allocate the " - "result or performance execution. " - "Use `usm_ndarray.to_device` method to migrate data to " - "be associated with the same queue." 
- ) - val_usm_type = dpctl.utils.get_coerced_usm_type(usm_types_) - + raise dpctl.utils.ExecutionPlacementError + vals_usm_type = dpctl.utils.get_coerced_usm_type(usm_types_) modes = {"clip": 0, "wrap": 1} try: mode = modes[mode] except KeyError: - raise ValueError("`mode` must be `wrap`, or `clip`.") + raise ValueError("`mode` must be `clip` or `wrap`.") - # when axis is none, array is treated as 1D - if axis is None: - try: - x = dpt.reshape(x, (x.size,), copy=False) - axis = 0 - except ValueError: - raise ValueError("Cannot create 1D view of input array") - if len(indices) > 1: - indices = dpt.broadcast_arrays(*indices) x_ndim = x.ndim + if axis is None: + if x_ndim > 1: + raise ValueError( + "`axis` cannot be `None` for array of dimension `{}`".format( + x_ndim + ) + ) + axis = 0 + if x_ndim > 0: axis = normalize_axis_index(operator.index(axis), x_ndim) - val_shape = ( - x.shape[:axis] + indices[0].shape + x.shape[axis + len(indices) :] - ) + val_shape = x.shape[:axis] + indices.shape + x.shape[axis + 1 :] else: - val_shape = indices[0].shape + if axis != 0: + raise ValueError("`axis` must be 0 for an array of dimension 0.") + val_shape = indices.shape if not isinstance(vals, dpt.usm_ndarray): vals = dpt.asarray( - vals, dtype=x.dtype, usm_type=val_usm_type, sycl_queue=exec_q + vals, dtype=x.dtype, usm_type=vals_usm_type, sycl_queue=exec_q ) vals = dpt.broadcast_to(vals, val_shape) - hev, _ = ti._put(x, indices, vals, axis, mode, sycl_queue=exec_q) + hev, _ = ti._put(x, (indices,), vals, axis, mode, sycl_queue=exec_q) hev.wait() diff --git a/dpctl/tests/test_usm_ndarray_indexing.py b/dpctl/tests/test_usm_ndarray_indexing.py index bcc1fdbb60..7201357c7d 100644 --- a/dpctl/tests/test_usm_ndarray_indexing.py +++ b/dpctl/tests/test_usm_ndarray_indexing.py @@ -542,11 +542,11 @@ def test_put_0d_val(data_dt): x = dpt.arange(5, dtype=data_dt, sycl_queue=q) ind = dpt.asarray([0], dtype=np.intp, sycl_queue=q) - x[ind] = 2 + val = dpt.asarray(2, dtype=x.dtype, sycl_queue=q) + x[ind] = val assert_array_equal(np.asarray(2, dtype=data_dt), dpt.asnumpy(x[0])) x = dpt.asarray(5, dtype=data_dt, sycl_queue=q) - val = 2 dpt.put(x, ind, val) assert_array_equal(np.asarray(2, dtype=data_dt), dpt.asnumpy(x)) @@ -592,13 +592,13 @@ def test_put_0d_data(data_dt): "ind_dt", _all_int_dtypes, ) -def test_take_0d_ind(ind_dt): +def test_indexing_0d_ind(ind_dt): q = get_queue_or_skip() x = dpt.arange(5, dtype="i4", sycl_queue=q) ind = dpt.asarray(3, dtype=ind_dt, sycl_queue=q) - y = dpt.take(x, ind) + y = x[ind] assert dpt.asnumpy(x[3]) == dpt.asnumpy(y) @@ -613,7 +613,7 @@ def test_put_0d_ind(ind_dt): ind = dpt.asarray(3, dtype=ind_dt, sycl_queue=q) val = dpt.asarray(5, dtype=x.dtype, sycl_queue=q) - dpt.put(x, ind, val, axis=0) + x[ind] = val assert dpt.asnumpy(x[3]) == dpt.asnumpy(val) @@ -684,10 +684,6 @@ def test_take_strided(data_dt, order): np.take(xs_np, ind_np, axis=1), dpt.asnumpy(dpt.take(xs, ind, axis=1)), ) - assert_array_equal( - xs_np[ind_np, ind_np], - dpt.asnumpy(dpt.take(xs, [ind, ind], axis=0)), - ) @pytest.mark.parametrize( @@ -751,7 +747,7 @@ def test_take_strided_indices(ind_dt, order): inds_np = ind_np[s, ::sgn] assert_array_equal( np.take(x_np, inds_np, axis=0), - dpt.asnumpy(dpt.take(x, inds, axis=0)), + dpt.asnumpy(x[inds]), ) @@ -828,7 +824,7 @@ def test_put_strided_destination(data_dt, order): x_np1[ind_np, ind_np] = val_np x1 = dpt.copy(xs) - dpt.put(x1, [ind, ind], val, axis=0) + x1[ind, ind] = val assert_array_equal(x_np1, dpt.asnumpy(x1)) @@ -887,7 +883,7 @@ def 
test_put_strided_indices(ind_dt, order): inds_np = ind_np[s, ::sgn] x_copy = dpt.copy(x) - dpt.put(x_copy, inds, val, axis=0) + x_copy[inds] = val x_np_copy = x_np.copy() x_np_copy[inds_np] = val_np @@ -899,7 +895,7 @@ def test_take_arg_validation(): q = get_queue_or_skip() x = dpt.arange(4, dtype="i4", sycl_queue=q) - ind0 = dpt.arange(2, dtype=np.intp, sycl_queue=q) + ind0 = dpt.arange(4, dtype=np.intp, sycl_queue=q) ind1 = dpt.arange(2.0, dtype="f", sycl_queue=q) with pytest.raises(TypeError): @@ -919,13 +915,15 @@ def test_take_arg_validation(): dpt.take(x, ind0, mode=0) with pytest.raises(ValueError): dpt.take(dpt.reshape(x, (2, 2)), ind0, axis=None) + with pytest.raises(ValueError): + dpt.take(x, dpt.reshape(ind0, (2, 2))) def test_put_arg_validation(): q = get_queue_or_skip() x = dpt.arange(4, dtype="i4", sycl_queue=q) - ind0 = dpt.arange(2, dtype=np.intp, sycl_queue=q) + ind0 = dpt.arange(4, dtype=np.intp, sycl_queue=q) ind1 = dpt.arange(2.0, dtype="f", sycl_queue=q) val = dpt.asarray(2, x.dtype, sycl_queue=q) @@ -946,6 +944,8 @@ def test_put_arg_validation(): with pytest.raises(ValueError): dpt.put(x, ind0, val, mode=0) + with pytest.raises(ValueError): + dpt.put(x, dpt.reshape(ind0, (2, 2)), val) def test_advanced_indexing_compute_follows_data(): From 13c5db754e42fd446e9de46f908e15dc0a9c8c2d Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Mar 2023 23:22:11 -0600 Subject: [PATCH 57/57] Fixed rst in docstrings of extract/place --- dpctl/tensor/_indexing_functions.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dpctl/tensor/_indexing_functions.py b/dpctl/tensor/_indexing_functions.py index c312d9e2b9..e585d6bf69 100644 --- a/dpctl/tensor/_indexing_functions.py +++ b/dpctl/tensor/_indexing_functions.py @@ -212,11 +212,11 @@ def extract(condition, arr): Returns the elements of an array that satisfies the condition. - If `condition` is boolean :func:``dpctl.tensor.extract`` is + If `condition` is boolean ``dpctl.tensor.extract`` is equivalent to ``arr[condition]``. - Note that :func:``dpctl.tensor.place`` does the opposite of - :func:``dpctl.tensor.extract``. + Note that ``dpctl.tensor.place`` does the opposite of + ``dpctl.tensor.extract``. Args: conditions: usm_ndarray @@ -255,7 +255,7 @@ def place(arr, mask, vals): Change elements of an array based on conditional and input values. - If `mask` is boolean :func:``dpctl.tensor.place`` is + If `mask` is boolean ``dpctl.tensor.place`` is equivalent to ``arr[condition] = vals``. Args: