From 831a5cb36eba038092e451164610492892c80d35 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 10 Feb 2023 22:16:11 -0600 Subject: [PATCH 01/57] Fixed typo in docstring --- dpctl/tensor/_slicing.pxi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpctl/tensor/_slicing.pxi b/dpctl/tensor/_slicing.pxi index 69ed454bd0..f8eb6aaa31 100755 --- a/dpctl/tensor/_slicing.pxi +++ b/dpctl/tensor/_slicing.pxi @@ -38,7 +38,7 @@ cdef object _basic_slice_meta(object ind, tuple shape, tuple strides, Py_ssize_t offset): """ Give basic slicing index `ind` and array layout information produce - a tuple (resulting_shape, resulting_strides, resultin_offset) + a tuple (resulting_shape, resulting_strides, resulting_offset) used to contruct a view into underlying array. Raises IndexError for invalid index `ind`, and NotImplementedError From 402c1d60cfd83ec4041077716bf3c2dfeea38001 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 13 Feb 2023 18:40:05 -0600 Subject: [PATCH 02/57] Fixed TODO in utility _zero_like --- dpctl/tensor/_usmarray.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx index a708418746..9a4fab8af3 100644 --- a/dpctl/tensor/_usmarray.pyx +++ b/dpctl/tensor/_usmarray.pyx @@ -1258,12 +1258,13 @@ cdef usm_ndarray _zero_like(usm_ndarray ary): Make C-contiguous array of zero elements with same shape and type as ary. """ + cdef dt = _make_typestr(ary.typenum_) cdef usm_ndarray r = usm_ndarray( _make_int_tuple(ary.nd_, ary.shape_), - dtype=_make_typestr(ary.typenum_), + dtype=dt, buffer=ary.base_.get_usm_type() ) - # TODO: call function to set array elements to zero + r.base_.memset() return r From d7fc400f50e126277adfda24ace3e386425f5ea7 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 13 Feb 2023 18:40:47 -0600 Subject: [PATCH 03/57] Extended _basic_slice_meta to process advanced indexing specs --- dpctl/tensor/_slicing.pxi | 124 +++++++++++++++++++++++++++++--------- 1 file changed, 97 insertions(+), 27 deletions(-) diff --git a/dpctl/tensor/_slicing.pxi b/dpctl/tensor/_slicing.pxi index f8eb6aaa31..6689502955 100755 --- a/dpctl/tensor/_slicing.pxi +++ b/dpctl/tensor/_slicing.pxi @@ -34,23 +34,41 @@ cdef Py_ssize_t _slice_len( return 1 + ((sl_stop - sl_start + 1) // sl_step) -cdef object _basic_slice_meta(object ind, tuple shape, - tuple strides, Py_ssize_t offset): +cdef bint _is_integral(object x) except *: + """Gives True if x is an integral slice spec""" + if isinstance(x, (int, numbers.Integral)): + return True + if isinstance(x, usm_ndarray): + if x.ndim > 0: + return False + if x.dtype.kind not in "ui": + return False + return True + if callable(getattr(x, "__index__", None)): + try: + x.__index__() + except (TypeError, ValueError): + return False + return True + return False + + +def _basic_slice_meta(ind, shape : tuple, strides : tuple, offset : int): """ Give basic slicing index `ind` and array layout information produce - a tuple (resulting_shape, resulting_strides, resulting_offset) - used to contruct a view into underlying array. + a 5-tuple (resulting_shape, resulting_strides, resulting_offset, + advanced_ind, resulting_advanced_ind_pos) + used to contruct a view into underlying array over which advanced + indexing, if any, is to be performed. - Raises IndexError for invalid index `ind`, and NotImplementedError - if `ind` is an array. + Raises IndexError for invalid index `ind`. 
""" - is_integral = lambda x: ( - isinstance(x, numbers.Integral) or callable(getattr(x, "__index__", None)) - ) + _no_advanced_ind = tuple() + _no_advanced_pos = -1 if ind is Ellipsis: - return (shape, strides, offset) + return (shape, strides, offset, _no_advanced_ind, _no_advanced_pos) elif ind is None: - return ((1,) + shape, (0,) + strides, offset) + return ((1,) + shape, (0,) + strides, offset, _no_advanced_ind, _no_advanced_pos) elif isinstance(ind, slice): sl_start, sl_stop, sl_step = ind.indices(shape[0]) sh0 = _slice_len(sl_start, sl_stop, sl_step) @@ -60,38 +78,70 @@ cdef object _basic_slice_meta(object ind, tuple shape, return ( (sh0, ) + shape[1:], new_strides, - new_offset + new_offset, + _no_advanced_ind, + _no_advanced_pos ) - elif is_integral(ind): + elif _is_integral(ind): ind = ind.__index__() if 0 <= ind < shape[0]: - return (shape[1:], strides[1:], offset + ind * strides[0]) + return (shape[1:], strides[1:], offset + ind * strides[0], _no_advanced_ind, _no_advanced_pos) elif -shape[0] <= ind < 0: return (shape[1:], strides[1:], - offset + (shape[0] + ind) * strides[0]) + offset + (shape[0] + ind) * strides[0], _no_advanced_ind, _no_advanced_pos) else: raise IndexError( "Index {0} is out of range for axes 0 with " "size {1}".format(ind, shape[0])) - elif isinstance(ind, list): - raise NotImplemented + elif isinstance(ind, usm_ndarray): + return (shape, strides, 0, (ind,), 0) elif isinstance(ind, tuple): axes_referenced = 0 ellipses_count = 0 newaxis_count = 0 explicit_index = 0 + array_count = 0 + seen_arrays_yet = False + array_streak_started = False + array_streak_interrupted = False for i in ind: if i is None: - newaxis_count = newaxis_count + 1 + newaxis_count += 1 + if array_streak_started: + array_streak_interrupted = True elif i is Ellipsis: - ellipses_count = ellipses_count + 1 + ellipses_count += 1 + if array_streak_started: + array_streak_interrupted = True elif isinstance(i, slice): - axes_referenced = axes_referenced + 1 - elif is_integral(i): - explicit_index = explicit_index + 1 - axes_referenced = axes_referenced + 1 - elif isinstance(i, list): - raise NotImplemented + axes_referenced += 1 + if array_streak_started: + array_streak_interrupted = True + elif _is_integral(i): + explicit_index += 1 + axes_referenced += 1 + if array_streak_started: + array_streak_interrupted = True + elif isinstance(i, usm_ndarray): + if not seen_arrays_yet: + seen_arrays_yet = True + array_streak_started = True + array_streak_interrupted = False + if array_streak_interrupted: + raise IndexError( + "Advanced indexing array specs may not be " + "separated by basic slicing specs." 
+ ) + dt_k = i.dtype.kind + if dt_k == "b": + axes_referenced += i.ndim + elif dt_k in "ui": + axes_referenced += 1 + else: + raise IndexError( + "arrays used as indices must be of integer (or boolean) type" + ) + array_count += 1 else: raise TypeError if ellipses_count > 1: @@ -108,7 +158,10 @@ cdef object _basic_slice_meta(object ind, tuple shape, + axes_referenced - explicit_index) new_shape = list() new_strides = list() + new_advanced_ind = list() k = 0 + new_advanced_start_pos = -1 + advanced_start_pos_set = False new_offset = offset is_empty = False for i in range(len(ind)): @@ -133,7 +186,7 @@ cdef object _basic_slice_meta(object ind, tuple shape, if sh_i == 0: is_empty = True k = k_new - elif is_integral(ind_i): + elif _is_integral(ind_i): ind_i = ind_i.__index__() if 0 <= ind_i < shape[k]: k_new = k + 1 @@ -149,8 +202,25 @@ cdef object _basic_slice_meta(object ind, tuple shape, raise IndexError( ("Index {0} is out of range for " "axes {1} with size {2}").format(ind_i, k, shape[k])) + elif isinstance(ind_i, usm_ndarray): + if not advanced_start_pos_set: + new_advanced_start_pos = len(new_shape) + advanced_start_pos_set = True + new_advanced_ind.append(ind_i) + dt_k = ind_i.dtype.kind + if dt_k == "b": + k_new = k + ind_i.ndim + else: + k_new = k + 1 + new_shape.extend(shape[k:k_new]) + new_strides.extend(strides[k:k_new]) + k = k_new new_shape.extend(shape[k:]) new_strides.extend(strides[k:]) - return (tuple(new_shape), tuple(new_strides), new_offset) + new_shape_len += len(shape) - k +# assert len(new_shape) == new_shape_len, f"{len(new_shape)} vs {new_shape_len}" +# assert len(new_strides) == new_shape_len, f"{len(new_strides)} vs {new_shape_len}" +# assert len(new_advanced_ind) == array_count + return (tuple(new_shape), tuple(new_strides), new_offset, tuple(new_advanced_ind), new_advanced_start_pos) else: raise TypeError From bcc305bdeec56c2dbb1b129f1657f7d12744d56e Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 13 Feb 2023 18:45:39 -0600 Subject: [PATCH 04/57] Added prototype with mock implementation of extract/place/take/put --- proto/advanced.py | 462 +++++++++++++++++++++++++++++++++++++++++ proto/test_advanced.py | 405 ++++++++++++++++++++++++++++++++++++ 2 files changed, 867 insertions(+) create mode 100644 proto/advanced.py create mode 100644 proto/test_advanced.py diff --git a/proto/advanced.py b/proto/advanced.py new file mode 100644 index 0000000000..8721214d73 --- /dev/null +++ b/proto/advanced.py @@ -0,0 +1,462 @@ +import numbers + +import dpctl.tensor as dpt +import dpctl.utils +from dpctl.tensor import usm_ndarray + +""" +Advanced slicing meta-infomation extraction +""" + + +class ExecutionPlacementError(Exception): + pass + + +def _slice_len(sl_start: int, sl_stop: int, sl_step: int): + """ + Compute len(range(sl_start, sl_stop, sl_step)) + """ + if sl_start == sl_stop: + return 0 + if sl_step > 0: + # 1 + argmax k such htat sl_start + sl_step*k < sl_stop + return 1 + ((sl_stop - sl_start - 1) // sl_step) + else: + return 1 + ((sl_stop - sl_start + 1) // sl_step) + + +def _is_integral(x): + """Gives True if x is an integral slice spec""" + if isinstance(x, (int, numbers.Integral)): + return True + if isinstance(x, usm_ndarray): + if x.ndim > 0: + return False + if x.dtype.kind not in "ui": + return False + return True + if callable(getattr(x, "__index__", None)): + try: + x.__index__() + except (TypeError, ValueError): + return False + return True + return False + + +def _basic_slice_meta(ind, shape: tuple, strides: tuple, offset: int): + """ + Give 
basic slicing index `ind` and array layout information produce + a 5-tuple (resulting_shape, resulting_strides, resulting_offset, + advanced_ind, resulting_advanced_ind_pos) + used to contruct a view into underlying array over which advanced + indexing, if any, is to be performed. + + Raises IndexError for invalid index `ind`. + """ + _no_advanced_ind = tuple() + _no_advanced_pos = -1 + if ind is Ellipsis: + return (shape, strides, offset, _no_advanced_ind, _no_advanced_pos) + elif ind is None: + return ( + (1,) + shape, + (0,) + strides, + offset, + _no_advanced_ind, + _no_advanced_pos, + ) + elif isinstance(ind, slice): + sl_start, sl_stop, sl_step = ind.indices(shape[0]) + sh0 = _slice_len(sl_start, sl_stop, sl_step) + str0 = sl_step * strides[0] + new_strides = ( + strides if (sl_step == 1 or sh0 == 0) else (str0,) + strides[1:] + ) + new_offset = offset if sh0 == 0 else offset + sl_start * strides[0] + return ( + (sh0,) + shape[1:], + new_strides, + new_offset, + _no_advanced_ind, + _no_advanced_pos, + ) + elif _is_integral(ind): + ind = ind.__index__() + if 0 <= ind < shape[0]: + return ( + shape[1:], + strides[1:], + offset + ind * strides[0], + _no_advanced_ind, + _no_advanced_pos, + ) + elif -shape[0] <= ind < 0: + return ( + shape[1:], + strides[1:], + offset + (shape[0] + ind) * strides[0], + _no_advanced_ind, + _no_advanced_pos, + ) + else: + raise IndexError( + "Index {0} is out of range for axes 0 with " + "size {1}".format(ind, shape[0]) + ) + elif isinstance(ind, usm_ndarray): + return (shape, strides, 0, (ind,), 0) + elif isinstance(ind, tuple): + axes_referenced = 0 + ellipses_count = 0 + newaxis_count = 0 + explicit_index = 0 + array_count = 0 + seen_arrays_yet = False + array_streak_started = False + array_streak_interrupted = False + for i in ind: + if i is None: + newaxis_count += 1 + if array_streak_started: + array_streak_interrupted = True + elif i is Ellipsis: + ellipses_count += 1 + if array_streak_started: + array_streak_interrupted = True + elif isinstance(i, slice): + axes_referenced += 1 + if array_streak_started: + array_streak_interrupted = True + elif _is_integral(i): + explicit_index += 1 + axes_referenced += 1 + if array_streak_started: + array_streak_interrupted = True + elif isinstance(i, usm_ndarray): + if not seen_arrays_yet: + seen_arrays_yet = True + array_streak_started = True + array_streak_interrupted = False + if array_streak_interrupted: + raise IndexError( + "Advanced indexing array specs may not be " + "separated by basic slicing specs." 
+ ) + dt_k = i.dtype.kind + if dt_k == "b": + axes_referenced += i.ndim + elif dt_k in "ui": + axes_referenced += 1 + else: + raise IndexError( + "arrays used as indices must be of integer " + "(or boolean) type" + ) + array_count += 1 + else: + raise TypeError + if ellipses_count > 1: + raise IndexError("an index can only have a sinlge ellipsis ('...')") + if axes_referenced > len(shape): + raise IndexError( + "too many indices for an array, array is " + "{0}-dimensional, but {1} were indexed".format( + len(shape), axes_referenced + ) + ) + if ellipses_count: + ellipses_count = len(shape) - axes_referenced + new_shape_len = ( + newaxis_count + ellipses_count + axes_referenced - explicit_index + ) + new_shape = list() + new_strides = list() + new_advanced_ind = list() + k = 0 + new_advanced_start_pos = -1 + advanced_start_pos_set = False + new_offset = offset + is_empty = False + for i in range(len(ind)): + ind_i = ind[i] + if ind_i is Ellipsis: + k_new = k + ellipses_count + new_shape.extend(shape[k:k_new]) + new_strides.extend(strides[k:k_new]) + k = k_new + elif ind_i is None: + new_shape.append(1) + new_strides.append(0) + elif isinstance(ind_i, slice): + k_new = k + 1 + sl_start, sl_stop, sl_step = ind_i.indices(shape[k]) + sh_i = _slice_len(sl_start, sl_stop, sl_step) + str_i = (1 if sh_i == 0 else sl_step) * strides[k] + new_shape.append(sh_i) + new_strides.append(str_i) + if sh_i > 0 and not is_empty: + new_offset = new_offset + sl_start * strides[k] + if sh_i == 0: + is_empty = True + k = k_new + elif _is_integral(ind_i): + ind_i = ind_i.__index__() + if 0 <= ind_i < shape[k]: + k_new = k + 1 + if not is_empty: + new_offset = new_offset + ind_i * strides[k] + k = k_new + elif -shape[k] <= ind_i < 0: + k_new = k + 1 + if not is_empty: + new_offset = ( + new_offset + (shape[k] + ind_i) * strides[k] + ) + k = k_new + else: + raise IndexError( + ( + "Index {0} is out of range for " + "axes {1} with size {2}" + ).format(ind_i, k, shape[k]) + ) + elif isinstance(ind_i, usm_ndarray): + if not advanced_start_pos_set: + new_advanced_start_pos = len(new_shape) + advanced_start_pos_set = True + new_advanced_ind.append(ind_i) + dt_k = ind_i.dtype.kind + if dt_k == "b": + k_new = k + ind_i.ndim + else: + k_new = k + 1 + new_shape.extend(shape[k:k_new]) + new_strides.extend(strides[k:k_new]) + k = k_new + new_shape.extend(shape[k:]) + new_strides.extend(strides[k:]) + debug = True + if debug: + new_shape_len += len(shape) - k + assert ( + len(new_shape) == new_shape_len + ), f"{len(new_shape)} vs {new_shape_len}" + assert ( + len(new_strides) == new_shape_len + ), f"{len(new_strides)} vs {new_shape_len}" + assert len(new_advanced_ind) == array_count + return ( + tuple(new_shape), + tuple(new_strides), + new_offset, + tuple(new_advanced_ind), + new_advanced_start_pos, + ) + else: + raise TypeError + + +def _mock_extract(ary, ary_mask, p): + exec_q = dpctl.utils.get_execution_queue( + ( + ary.sycl_queue, + ary_mask.sycl_queue, + ) + ) + if exec_q is None: + raise ExecutionPlacementError( + "Can not automatically determine where to allocate the " + "result or performance execution. " + "Use `usm_ndarray.to_device` method to migrate data to " + "be associated with the same queue." 
+ ) + + res_usm_type = dpctl.utils.get_coerced_usm_type( + ( + ary.usm_type, + ary_mask.usm_type, + ) + ) + ary_np = dpt.asnumpy(ary) + mask_np = dpt.asnumpy(ary_mask) + res_np = ary_np[(slice(None),) * p + (mask_np,)] + res = dpt.empty( + res_np.shape, dtype=ary.dtype, usm_type=res_usm_type, sycl_queue=exec_q + ) + res[...] = res_np + return res + + +def _mock_nonzero(ary): + if not isinstance(ary, usm_ndarray): + raise TypeError + q = ary.sycl_queue + usm_type = ary.usm_type + ary_np = dpt.asnumpy(ary) + nz = ary_np.nonzero() + return tuple(dpt.asarray(i, usm_type=usm_type, sycl_queue=q) for i in nz) + + +def _mock_take_multi_index(ary, inds, p): + queues_ = [ + ary.sycl_queue, + ] + usm_types_ = [ + ary.usm_type, + ] + all_integers = True + for ind in inds: + queues_.append(ind.sycl_queue) + usm_types_.append(ind.usm_type) + if all_integers: + all_integers = ind.dtype.kind in "ui" + exec_q = dpctl.utils.get_execution_queue(queues_) + if exec_q is None: + raise ExecutionPlacementError("") + if not all_integers: + print(inds) + raise IndexError( + "arrays used as indices must be of integer (or boolean) type" + ) + ary_np = dpt.asnumpy(ary) + ind_np = (slice(None),) * p + tuple(dpt.asnumpy(ind) for ind in inds) + res_np = ary_np[ind_np] + res_usm_type = dpctl.utils.get_coerced_usm_type(usm_types_) + res = dpt.empty( + res_np.shape, dtype=ary.dtype, usm_type=res_usm_type, sycl_queue=exec_q + ) + res[...] = res_np + return res + + +def get_item(ary, ind): + suai = ary.__sycl_usm_array_interface__ + _meta = _basic_slice_meta( + ind, ary.shape, ary.strides, suai.get("offset", 0) + ) + + if len(_meta) < 5: + raise RuntimeError + + res = usm_ndarray.__new__( + usm_ndarray, + _meta[0], + dtype=ary.dtype, # _make_typestr(ary.dtype.num), + strides=_meta[1], + buffer=ary.usm_data, # self.base_, + offset=_meta[2], + ) + # set flags and namespace + # res.flags_ |= (ary.flags_ & USM_ARRAY_WRITABLE) + # res.array_namespace_ = self.array_namespace_ + adv_ind = _meta[3] + adv_ind_start_p = _meta[4] + + if adv_ind_start_p < 0: + return res + + if len(adv_ind) == 1 and adv_ind[0].dtype == dpt.bool: + return _mock_extract(res, adv_ind[0], adv_ind_start_p) + + if any(ind.dtype == dpt.bool for ind in adv_ind): + adv_ind_int = list() + for ind in adv_ind: + if ind.dtype == dpt.bool: + adv_ind_int.extend(_mock_nonzero(ind)) + else: + adv_ind_int.append(ind) + return _mock_take_multi_index(res, tuple(adv_ind_int), adv_ind_start_p) + + return _mock_take_multi_index(res, adv_ind, adv_ind_start_p) + + +def _mock_place(ary, ary_mask, p, vals): + exec_q = dpctl.utils.get_execution_queue( + (ary.sycl_queue, ary_mask.sycl_queue, vals.sycl_queue) + ) + if exec_q is None: + raise ExecutionPlacementError( + "Can not automatically determine where to allocate the " + "result or performance execution. " + "Use `usm_ndarray.to_device` method to migrate data to " + "be associated with the same queue." + ) + + ary_np = dpt.asnumpy(ary) + mask_np = dpt.asnumpy(ary_mask) + vals_np = dpt.asnumpy(vals) + ary_np[(slice(None),) * p + (mask_np,)] = vals_np + ary[...] 
= ary_np + return + + +def _mock_put_multi_index(ary, inds, p, vals): + queues_ = [ary.sycl_queue, vals.sycl_queue] + usm_types_ = [ary.usm_type, vals.usm_type] + all_integers = True + for ind in inds: + queues_.append(ind.sycl_queue) + usm_types_.append(ind.usm_type) + if all_integers: + all_integers = ind.dtype.kind in "ui" + exec_q = dpctl.utils.get_execution_queue(queues_) + if exec_q is None: + raise ExecutionPlacementError("") + if not all_integers: + print(inds) + raise IndexError( + "arrays used as indices must be of integer (or boolean) type" + ) + ary_np = dpt.asnumpy(ary) + vals_np = dpt.asnumpy(vals) + ind_np = (slice(None),) * p + tuple(dpt.asnumpy(ind) for ind in inds) + ary_np[ind_np] = vals_np + ary[...] = ary_np + return + + +def set_item(ary, ind, rhs): + suai = ary.__sycl_usm_array_interface__ + _meta = _basic_slice_meta( + ind, ary.shape, ary.strides, suai.get("offset", 0) + ) + + if len(_meta) < 5: + raise RuntimeError + + res = usm_ndarray.__new__( + usm_ndarray, + _meta[0], + dtype=ary.dtype, # _make_typestr(ary.dtype.num), + strides=_meta[1], + buffer=ary.usm_data, # self.base_, + offset=_meta[2], + ) + # set flags and namespace + # res.flags_ |= (ary.flags_ & USM_ARRAY_WRITABLE) + # res.array_namespace_ = self.array_namespace_ + adv_ind = _meta[3] + adv_ind_start_p = _meta[4] + + if adv_ind_start_p < 0: + res[...] = rhs + return + + if len(adv_ind) == 1 and adv_ind[0].dtype == dpt.bool: + _mock_place(res, adv_ind[0], adv_ind_start_p, rhs) + return + + if any(ind.dtype == dpt.bool for ind in adv_ind): + adv_ind_int = list() + for ind in adv_ind: + if ind.dtype == dpt.bool: + adv_ind_int.extend(_mock_nonzero(ind)) + else: + adv_ind_int.append(ind) + _mock_put_multi_index(res, tuple(adv_ind_int), adv_ind_start_p, rhs) + return + + _mock_put_multi_index(res, adv_ind, adv_ind_start_p, rhs) + return diff --git a/proto/test_advanced.py b/proto/test_advanced.py new file mode 100644 index 0000000000..7cfb44c3d6 --- /dev/null +++ b/proto/test_advanced.py @@ -0,0 +1,405 @@ +import advanced +import numpy as np +import pytest + +import dpctl.tensor as dpt + + +def test_basic_slice1(): + res = advanced._basic_slice_meta((0,), (1,), (1,), 0) + assert res == (tuple(), tuple(), 0, tuple(), -1) + + +def test_basic_slice1a(): + res = advanced._basic_slice_meta(0, (1,), (1,), 0) + assert res == (tuple(), tuple(), 0, tuple(), -1) + + +def test_basic_slice2(): + res = advanced._basic_slice_meta((slice(None),), (1,), (1,), 0) + assert res == ((1,), (1,), 0, tuple(), -1) + + +def test_basic_slice3(): + res = advanced._basic_slice_meta((slice(None, None, -1),), (1,), (1,), 0) + assert res == ((1,), (-1,), 0, tuple(), -1) + + +def test_basic_slice4(): + res = advanced._basic_slice_meta( + (slice(None, None, -1),), + ( + 5, + 3, + ), + ( + 3, + 1, + ), + 0, + ) + assert res == ((5, 3), (-3, 1), (5 - 1) * 3, tuple(), -1) + + +def test_basic_slice5(): + res = advanced._basic_slice_meta( + ( + slice(None), + slice(None, None, -1), + ), + ( + 4, + 3, + ), + ( + 3, + 1, + ), + 0, + ) + assert res == ((4, 3), (3, -1), 3 - 1, tuple(), -1) + + +def test_basic_slice6(): + res = advanced._basic_slice_meta( + ( + 2, + slice(None, None, -1), + ), + ( + 4, + 3, + ), + ( + 3, + 1, + ), + 0, + ) + assert res == ((3,), (-1,), 2 * 3 + 3 - 1, tuple(), -1) + + +def test_basic_slice7(): + res = advanced._basic_slice_meta( + ( + Ellipsis, + slice(None, None, -1), + ), + ( + 4, + 3, + ), + ( + 3, + 1, + ), + 0, + ) + assert res == ((4, 3), (3, -1), 3 - 1, tuple(), -1) + + +def test_basic_slice8(): + res = 
advanced._basic_slice_meta( + (Ellipsis, None), + ( + 4, + 3, + ), + ( + 3, + 1, + ), + 0, + ) + assert res == ((4, 3, 1), (3, 1, 0), 0, tuple(), -1) + + +def test_basic_slice9(): + res = advanced._basic_slice_meta( + ( + None, + Ellipsis, + ), + ( + 4, + 3, + ), + ( + 3, + 1, + ), + 0, + ) + assert res == ( + ( + 1, + 4, + 3, + ), + (0, 3, 1), + 0, + tuple(), + -1, + ) + + +def test_basic_slice10(): + res = advanced._basic_slice_meta( + (None, Ellipsis, slice(None)), (4, 3, 5), (30, 5, 1), 0 + ) + assert res == ((1, 4, 3, 5), (0, 30, 5, 1), 0, tuple(), -1) + + +def test_advanced_slice1(): + ii = dpt.asarray([0, 1]) + res = advanced._basic_slice_meta((ii,), (10,), (1,), 0) + assert res == ((10,), (1,), 0, (ii,), 0) + + res = advanced._basic_slice_meta(ii, (10,), (1,), 0) + assert res == ((10,), (1,), 0, (ii,), 0) + + +def test_advanced_slice2(): + ii = dpt.asarray([0, 1]) + res = advanced._basic_slice_meta((ii, None), (10,), (1,), 0) + assert res == ((10, 1), (1, 0), 0, (ii,), 0) + + +def test_advanced_slice3(): + ii = dpt.asarray([0, 1]) + res = advanced._basic_slice_meta((None, ii), (10,), (1,), 0) + assert res == ( + ( + 1, + 10, + ), + ( + 0, + 1, + ), + 0, + (ii,), + 1, + ) + + +def test_advanced_slice4(): + ii = dpt.asarray([0, 1]) + res = advanced._basic_slice_meta( + (ii, ii, ii), + (10, 10, 10), + ( + 100, + 10, + 1, + ), + 0, + ) + assert res == ( + (10, 10, 10), + ( + 100, + 10, + 1, + ), + 0, + (ii, ii, ii), + 0, + ) + + +def test_advanced_slice5(): + ii = dpt.asarray([0, 1]) + with pytest.raises(IndexError): + advanced._basic_slice_meta( + (ii, slice(None), ii), + (10, 10, 10), + ( + 100, + 10, + 1, + ), + 0, + ) + + +def test_advanced_slice6(): + ii = dpt.asarray([0, 1]) + res = advanced._basic_slice_meta( + ( + slice(None), + ii, + ii, + ), + (10, 10, 10), + ( + 100, + 10, + 1, + ), + 0, + ) + assert res == ( + ( + 10, + 10, + 10, + ), + (100, 10, 1), + 0, + ( + ii, + ii, + ), + 1, + ) + + +def test_advanced_slice7(): + x = dpt.reshape( + dpt.arange(3 * 3 * 3, dtype="i8"), + ( + 3, + 3, + 3, + ), + ) + mask = dpt.asarray( + [ + [[True, True, False], [False, True, True], [True, False, True]], + [[True, False, False], [False, False, True], [False, True, False]], + [[True, True, True], [False, False, False], [False, False, True]], + ] + ) + res = advanced.get_item(x, mask) + res_expected = np.array([0, 1, 4, 5, 6, 8, 9, 14, 16, 18, 19, 20, 26]) + assert np.array_equal(dpt.asnumpy(res), res_expected) + res2 = advanced.get_item(x, (mask,)) + assert np.array_equal(dpt.asnumpy(res2), res_expected) + + +def test_advanced_slice8(): + x = dpt.reshape( + dpt.arange(3 * 3 * 3, dtype="i8"), + ( + 3, + 3, + 3, + ), + ) + mask = dpt.asarray( + [[True, False, False], [False, True, False], [False, True, False]] + ) + res = advanced.get_item(x, mask) + res_expected = np.array([[0, 1, 2], [12, 13, 14], [21, 22, 23]]) + assert np.array_equal(dpt.asnumpy(res), res_expected) + res2 = advanced.get_item(x, (mask,)) + assert np.array_equal(dpt.asnumpy(res2), res_expected) + + +def test_advanced_slice9(): + x = dpt.reshape( + dpt.arange(3 * 3 * 3, dtype="i8"), + ( + 3, + 3, + 3, + ), + ) + mask = dpt.asarray( + [[True, False, False], [False, True, False], [False, True, False]] + ) + res = advanced.get_item( + x, + ( + slice(None, None, None), + mask, + ), + ) + res_expected = np.array([[0, 4, 7], [9, 13, 16], [18, 22, 25]]) + assert np.array_equal(dpt.asnumpy(res), res_expected) + + +def lin_id(i, j, k): + return 9 * i + 3 * j + k + + +def test_advanced_slice10(): + x = dpt.reshape( + dpt.arange(3 * 
3 * 3, dtype="i8"), + ( + 3, + 3, + 3, + ), + ) + i0 = dpt.asarray([0, 1, 1]) + i1 = dpt.asarray([1, 1, 2]) + i2 = dpt.asarray([2, 0, 1]) + res = advanced.get_item(x, (i0, i1, i2)) + res_expected = np.array( + [ + lin_id(0, 1, 2), + lin_id(1, 1, 0), + lin_id(1, 2, 1), + ] + ) + assert np.array_equal(dpt.asnumpy(res), res_expected) + + +def test_advanced_slice11(): + x = dpt.reshape( + dpt.arange(3 * 3 * 3, dtype="i8"), + ( + 3, + 3, + 3, + ), + ) + i0 = dpt.asarray([0, 1, 1]) + i2 = dpt.asarray([2, 0, 1]) + with pytest.raises(IndexError): + advanced.get_item(x, (i0, slice(None, None, None), i2)) + + +def test_advanced_slice12(): + x = dpt.reshape( + dpt.arange(3 * 3 * 3, dtype="i8"), + ( + 3, + 3, + 3, + ), + ) + i1 = dpt.asarray([1, 1, 2]) + i2 = dpt.asarray([2, 0, 1]) + res = advanced.get_item(x, (slice(None), None, i1, i2, None)) + res_expected = np.array( + [ + [[[lin_id(0, 1, 2)], [lin_id(0, 1, 0)], [lin_id(0, 2, 1)]]], + [[[lin_id(1, 1, 2)], [lin_id(1, 1, 0)], [lin_id(1, 2, 1)]]], + [[[lin_id(2, 1, 2)], [lin_id(2, 1, 0)], [lin_id(2, 2, 1)]]], + ] + ) + assert np.array_equal(dpt.asnumpy(res), res_expected) + + +def test_advanced_slice13(): + x = dpt.reshape( + dpt.arange(3 * 3 * 3, dtype="i8"), + ( + 3, + 3, + 3, + ), + ) + i1 = dpt.asarray([[1], [2]]) + i2 = dpt.asarray([[0, 1]]) + res = advanced.get_item(x, (i1, i2, 0)) + res_expected = np.array( + [ + [lin_id(1, 0, 0), lin_id(1, 1, 0)], + [lin_id(2, 0, 0), lin_id(2, 1, 0)], + ] + ) + assert np.array_equal(dpt.asnumpy(res), res_expected) From a1791e28ff45278df797ea73340391859452286a Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 14 Feb 2023 08:53:28 -0600 Subject: [PATCH 05/57] Change mod of _slicing.pxi to be non-executable --- dpctl/tensor/_slicing.pxi | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 dpctl/tensor/_slicing.pxi diff --git a/dpctl/tensor/_slicing.pxi b/dpctl/tensor/_slicing.pxi old mode 100755 new mode 100644 From ea339eb179c4c01f4e6f22ba18689ff93fb4a0e9 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 14 Feb 2023 08:54:00 -0600 Subject: [PATCH 06/57] Added ExecutionPlacementError --- dpctl/utils/__init__.py | 2 ++ dpctl/utils/_compute_follows_data.pyx | 13 ++++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/dpctl/utils/__init__.py b/dpctl/utils/__init__.py index 7589f9de9f..671564cda5 100644 --- a/dpctl/utils/__init__.py +++ b/dpctl/utils/__init__.py @@ -19,6 +19,7 @@ """ from ._compute_follows_data import ( + ExecutionPlacementError, get_coerced_usm_type, get_execution_queue, validate_usm_type, @@ -30,4 +31,5 @@ "get_coerced_usm_type", "validate_usm_type", "onetrace_enabled", + "ExecutionPlacementError", ] diff --git a/dpctl/utils/_compute_follows_data.pyx b/dpctl/utils/_compute_follows_data.pyx index f61cebc90c..179fb6f875 100644 --- a/dpctl/utils/_compute_follows_data.pyx +++ b/dpctl/utils/_compute_follows_data.pyx @@ -28,7 +28,18 @@ import dpctl from .._sycl_queue cimport SyclQueue -__all__ = ["get_execution_queue", "get_coerced_usm_type"] +__all__ = ["get_execution_queue", "get_coerced_usm_type", "ExecutionPlacementError"] + + +class ExecutionPlacementError(Exception): + """Exception raised when execution placement target can be determined + from input arrays. + + Make sure that input arrays are associated with the same SyclQueue, + or migrate data to the same SyclQueue using usm_ndarray.to_device + method. 
+ """ + pass cdef bint queue_equiv(SyclQueue q1, SyclQueue q2): From 483a423b01fef875111605fdac8cba478ac3009d Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 14 Feb 2023 09:20:41 -0600 Subject: [PATCH 07/57] Factored out dpt.dtype and dpt.bool, etc. definitions into dedicated file --- dpctl/tensor/__init__.py | 34 ++++++++++++------------- dpctl/tensor/_data_types.py | 50 +++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 17 deletions(-) create mode 100644 dpctl/tensor/_data_types.py diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index bc6ae52564..cc4e31adf5 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -21,8 +21,6 @@ """ -from numpy import dtype - from dpctl.tensor._copy_utils import asnumpy, astype, copy, from_numpy, to_numpy from dpctl.tensor._ctors import ( arange, @@ -41,6 +39,23 @@ zeros, zeros_like, ) +from dpctl.tensor._data_types import ( + bool, + complex64, + complex128, + dtype, + float16, + float32, + float64, + int8, + int16, + int32, + int64, + uint8, + uint16, + uint32, + uint64, +) from dpctl.tensor._device import Device from dpctl.tensor._dlpack import from_dlpack from dpctl.tensor._manipulation_functions import ( @@ -68,21 +83,6 @@ from dpctl.tensor._reshape import reshape from dpctl.tensor._usmarray import usm_ndarray -bool = dtype("bool") -int8 = dtype("int8") -int16 = dtype("int16") -int32 = dtype("int32") -int64 = dtype("int64") -uint8 = dtype("uint8") -uint16 = dtype("uint16") -uint32 = dtype("uint32") -uint64 = dtype("uint64") -float16 = dtype("float16") -float32 = dtype("float32") -float64 = dtype("float64") -complex64 = dtype("complex64") -complex128 = dtype("complex128") - __all__ = [ "Device", "usm_ndarray", diff --git a/dpctl/tensor/_data_types.py b/dpctl/tensor/_data_types.py new file mode 100644 index 0000000000..c97afe37be --- /dev/null +++ b/dpctl/tensor/_data_types.py @@ -0,0 +1,50 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
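+
+# Data type objects exported as dpctl.tensor.<name>; each alias below is a NumPy dtype instance.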
+ +from numpy import dtype + +bool = dtype("bool") +int8 = dtype("int8") +int16 = dtype("int16") +int32 = dtype("int32") +int64 = dtype("int64") +uint8 = dtype("uint8") +uint16 = dtype("uint16") +uint32 = dtype("uint32") +uint64 = dtype("uint64") +float16 = dtype("float16") +float32 = dtype("float32") +float64 = dtype("float64") +complex64 = dtype("complex64") +complex128 = dtype("complex128") + +__all__ = [ + "dtype", + "bool", + "int8", + "uint8", + "int16", + "uint16", + "int32", + "uint32", + "int64", + "uint64", + "float16", + "float32", + "float64", + "complex64", + "complex128", +] From efcd9cb01c7621dfde131b3b4a58c25f0e2e75e5 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 14 Feb 2023 09:21:29 -0600 Subject: [PATCH 08/57] Deployed lazy implementation of advanced indexing to develop tests --- dpctl/tensor/_copy_utils.py | 123 ++++++++++++++++++++++++++++++++ dpctl/tensor/_usmarray.pyx | 136 ++++++++++++++++++++++++++++-------- 2 files changed, 230 insertions(+), 29 deletions(-) diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py index f006582cd3..41a2fd3203 100644 --- a/dpctl/tensor/_copy_utils.py +++ b/dpctl/tensor/_copy_utils.py @@ -15,9 +15,11 @@ # limitations under the License. import numpy as np +import dpctl import dpctl.memory as dpm import dpctl.tensor as dpt import dpctl.tensor._tensor_impl as ti +import dpctl.utils from dpctl.tensor._device import normalize_queue_device __doc__ = ( @@ -382,3 +384,124 @@ def astype(usm_ary, newdtype, order="K", casting="unsafe", copy=True): ) _copy_from_usm_ndarray_to_usm_ndarray(R, usm_ary) return R + + +def _mock_extract(ary, ary_mask, p): + exec_q = dpctl.utils.get_execution_queue( + ( + ary.sycl_queue, + ary_mask.sycl_queue, + ) + ) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError( + "Can not automatically determine where to allocate the " + "result or performance execution. " + "Use `usm_ndarray.to_device` method to migrate data to " + "be associated with the same queue." + ) + + res_usm_type = dpctl.utils.get_coerced_usm_type( + ( + ary.usm_type, + ary_mask.usm_type, + ) + ) + ary_np = dpt.asnumpy(ary) + mask_np = dpt.asnumpy(ary_mask) + res_np = ary_np[(slice(None),) * p + (mask_np,)] + res = dpt.empty( + res_np.shape, dtype=ary.dtype, usm_type=res_usm_type, sycl_queue=exec_q + ) + res[...] = res_np + return res + + +def _mock_nonzero(ary): + if not isinstance(ary, dpt.usm_ndarray): + raise TypeError + q = ary.sycl_queue + usm_type = ary.usm_type + ary_np = dpt.asnumpy(ary) + nz = ary_np.nonzero() + return tuple(dpt.asarray(i, usm_type=usm_type, sycl_queue=q) for i in nz) + + +def _mock_take_multi_index(ary, inds, p): + queues_ = [ + ary.sycl_queue, + ] + usm_types_ = [ + ary.usm_type, + ] + all_integers = True + for ind in inds: + queues_.append(ind.sycl_queue) + usm_types_.append(ind.usm_type) + if all_integers: + all_integers = ind.dtype.kind in "ui" + exec_q = dpctl.utils.get_execution_queue(queues_) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError("") + if not all_integers: + raise IndexError( + "arrays used as indices must be of integer (or boolean) type" + ) + ary_np = dpt.asnumpy(ary) + ind_np = (slice(None),) * p + tuple(dpt.asnumpy(ind) for ind in inds) + res_np = ary_np[ind_np] + res_usm_type = dpctl.utils.get_coerced_usm_type(usm_types_) + res = dpt.empty( + res_np.shape, dtype=ary.dtype, usm_type=res_usm_type, sycl_queue=exec_q + ) + res[...] 
= res_np + return res + + +def _mock_place(ary, ary_mask, p, vals): + exec_q = dpctl.utils.get_execution_queue( + (ary.sycl_queue, ary_mask.sycl_queue, vals.sycl_queue) + ) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError( + "Can not automatically determine where to allocate the " + "result or performance execution. " + "Use `usm_ndarray.to_device` method to migrate data to " + "be associated with the same queue." + ) + + ary_np = dpt.asnumpy(ary) + mask_np = dpt.asnumpy(ary_mask) + vals_np = dpt.asnumpy(vals) + ary_np[(slice(None),) * p + (mask_np,)] = vals_np + ary[...] = ary_np + return + + +def _mock_put_multi_index(ary, inds, p, vals): + queues_ = [ary.sycl_queue, vals.sycl_queue] + usm_types_ = [ary.usm_type, vals.usm_type] + all_integers = True + for ind in inds: + queues_.append(ind.sycl_queue) + usm_types_.append(ind.usm_type) + if all_integers: + all_integers = ind.dtype.kind in "ui" + exec_q = dpctl.utils.get_execution_queue(queues_) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError( + "Can not automatically determine where to allocate the " + "result or performance execution. " + "Use `usm_ndarray.to_device` method to migrate data to " + "be associated with the same queue." + ) + if not all_integers: + raise IndexError( + "arrays used as indices must be of integer (or boolean) type" + ) + ary_np = dpt.asnumpy(ary) + vals_np = dpt.asnumpy(vals) + ind_np = (slice(None),) * p + tuple(dpt.asnumpy(ind) for ind in inds) + ary_np[ind_np] = vals_np + ary[...] = ary_np + return diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx index 9a4fab8af3..3c42c96dd5 100644 --- a/dpctl/tensor/_usmarray.pyx +++ b/dpctl/tensor/_usmarray.pyx @@ -25,6 +25,7 @@ import numpy as np import dpctl import dpctl.memory as dpmem +from ._data_types import bool as dpt_bool from ._device import Device from ._print import usm_ndarray_repr, usm_ndarray_str @@ -34,6 +35,7 @@ from cpython.tuple cimport PyTuple_New, PyTuple_SetItem cimport dpctl as c_dpctl cimport dpctl.memory as c_dpmem cimport dpctl.tensor._dlpack as c_dlpack + import dpctl.tensor._flags as _flags include "_stride_utils.pxi" @@ -648,6 +650,9 @@ cdef class usm_ndarray: self.get_offset()) cdef usm_ndarray res + if len(_meta) < 5: + raise RuntimeError + res = usm_ndarray.__new__( usm_ndarray, _meta[0], @@ -658,7 +663,32 @@ cdef class usm_ndarray: ) res.flags_ |= (self.flags_ & USM_ARRAY_WRITABLE) res.array_namespace_ = self.array_namespace_ - return res + + adv_ind = _meta[3] + adv_ind_start_p = _meta[4] + + if adv_ind_start_p < 0: + return res + + from ._copy_utils import ( + _mock_extract, + _mock_nonzero, + _mock_take_multi_index, + ) + if len(adv_ind) == 1 and adv_ind[0].dtype == dpt_bool: + return _mock_extract(res, adv_ind[0], adv_ind_start_p) + + if any(ind.dtype == dpt_bool for ind in adv_ind): + adv_ind_int = list() + for ind in adv_ind: + if ind.dtype == dpt_bool: + adv_ind_int.extend(_mock_nonzero(ind)) + else: + adv_ind_int.append(ind) + return _mock_take_multi_index(res, tuple(adv_ind_int), adv_ind_start_p) + + return _mock_take_multi_index(res, adv_ind, adv_ind_start_p) + def to_device(self, target): """ @@ -959,39 +989,87 @@ cdef class usm_ndarray: return _dispatch_binary_elementwise2(first, "right_shift", other) return NotImplemented - def __setitem__(self, key, val): - try: - Xv = self.__getitem__(key) - except (ValueError, IndexError) as e: - raise e + def __setitem__(self, key, rhs): + cdef tuple _meta + cdef usm_ndarray Xv + + if (self.flags_ & USM_ARRAY_WRITABLE) == 0: + raise 
ValueError("Can not modify read-only array.") + + _meta = _basic_slice_meta( + key, (self).shape, ( self).strides, + self.get_offset() + ) + + if len(_meta) < 5: + raise RuntimeError + + Xv = usm_ndarray.__new__( + usm_ndarray, + _meta[0], + dtype=_make_typestr(self.typenum_), + strides=_meta[1], + buffer=self.base_, + offset=_meta[2], + ) + # set flags and namespace + Xv.flags_ |= (self.flags_ & USM_ARRAY_WRITABLE) + Xv.array_namespace_ = self.array_namespace_ + from ._copy_utils import ( _copy_from_numpy_into, _copy_from_usm_ndarray_to_usm_ndarray, + _mock_nonzero, + _mock_place, + _mock_put_multi_index, ) - if (( Xv).flags_ & USM_ARRAY_WRITABLE) == 0: - raise ValueError("Can not modify read-only array.") - if isinstance(val, usm_ndarray): - _copy_from_usm_ndarray_to_usm_ndarray(Xv, val) - else: - if hasattr(val, "__sycl_usm_array_interface__"): - from dpctl.tensor import asarray - try: - val_ar = asarray(val) - _copy_from_usm_ndarray_to_usm_ndarray(Xv, val_ar) - except Exception: - raise ValueError( - f"Input of type {type(val)} could not be " - "converted to usm_ndarray" - ) + + adv_ind = _meta[3] + adv_ind_start_p = _meta[4] + + if adv_ind_start_p < 0: + # basic slicing + if isinstance(rhs, usm_ndarray): + _copy_from_usm_ndarray_to_usm_ndarray(Xv, rhs) else: - try: - val_np = np.asarray(val) - _copy_from_numpy_into(Xv, val_np) - except Exception: - raise ValueError( - f"Input of type {type(val)} could not be " - "converted to usm_ndarray" - ) + if hasattr(rhs, "__sycl_usm_array_interface__"): + from dpctl.tensor import asarray + try: + rhs_ar = asarray(rhs) + _copy_from_usm_ndarray_to_usm_ndarray(Xv, rhs_ar) + except Exception: + raise ValueError( + f"Input of type {type(rhs)} could not be " + "converted to usm_ndarray" + ) + else: + try: + rhs_np = np.asarray(rhs) + _copy_from_numpy_into(Xv, rhs_np) + except Exception: + raise ValueError( + f"Input of type {type(rhs)} could not be " + "converted to usm_ndarray" + ) + return + + if len(adv_ind) == 1 and adv_ind[0].dtype == dpt_bool: + _mock_place(Xv, adv_ind[0], adv_ind_start_p, rhs) + return + + if any(ind.dtype == dpt_bool for ind in adv_ind): + adv_ind_int = list() + for ind in adv_ind: + if ind.dtype == dpt_bool: + adv_ind_int.extend(_mock_nonzero(ind)) + else: + adv_ind_int.append(ind) + _mock_put_multi_index(Xv, tuple(adv_ind_int), adv_ind_start_p, rhs) + return + + _mock_put_multi_index(Xv, adv_ind, adv_ind_start_p, rhs) + return + def __sub__(first, other): "See comment in __add__" From 04e4b5128ba650ebaa4578eab544ef69e1b72e35 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 14 Feb 2023 11:12:08 -0600 Subject: [PATCH 09/57] Ensure that rhs can be a scalar or numpy array --- dpctl/tensor/_copy_utils.py | 38 ++++++++++++++++++++++++++++++++----- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py index 41a2fd3203..f83ecdbd74 100644 --- a/dpctl/tensor/_copy_utils.py +++ b/dpctl/tensor/_copy_utils.py @@ -428,6 +428,8 @@ def _mock_nonzero(ary): def _mock_take_multi_index(ary, inds, p): + if not isinstance(ary, dpt.usm_ndarray): + raise TypeError queues_ = [ ary.sycl_queue, ] @@ -459,9 +461,15 @@ def _mock_take_multi_index(ary, inds, p): def _mock_place(ary, ary_mask, p, vals): + if not isinstance(ary, dpt.usm_ndarray): + raise TypeError + if not isinstance(ary_mask, dpt.usm_ndarray): + raise TypeError exec_q = dpctl.utils.get_execution_queue( - (ary.sycl_queue, ary_mask.sycl_queue, vals.sycl_queue) + (ary.sycl_queue, ary_mask.sycl_queue) ) + if exec_q is 
not None and isinstance(vals, dpt.usm_ndarray): + exec_q = dpctl.utils.get_execution_queue((exec_q, vals.sycl_queue)) if exec_q is None: raise dpctl.utils.ExecutionPlacementError( "Can not automatically determine where to allocate the " @@ -472,17 +480,32 @@ def _mock_place(ary, ary_mask, p, vals): ary_np = dpt.asnumpy(ary) mask_np = dpt.asnumpy(ary_mask) - vals_np = dpt.asnumpy(vals) + if isinstance(vals, dpt.usm_ndarray) or hasattr( + vals, "__sycl_usm_array_interface__" + ): + vals_np = dpt.asnumpy(vals) + else: + vals_np = vals ary_np[(slice(None),) * p + (mask_np,)] = vals_np ary[...] = ary_np return def _mock_put_multi_index(ary, inds, p, vals): - queues_ = [ary.sycl_queue, vals.sycl_queue] - usm_types_ = [ary.usm_type, vals.usm_type] + if isinstance(vals, dpt.ums_ndarray): + queues_ = [ary.sycl_queue, vals.sycl_queue] + usm_types_ = [ary.usm_type, vals.usm_type] + else: + queues_ = [ + ary.sycl_queue, + ] + usm_types_ = [ + ary.usm_type, + ] all_integers = True for ind in inds: + if not isinstance(ind, dpt.usm_ndarray): + raise TypeError queues_.append(ind.sycl_queue) usm_types_.append(ind.usm_type) if all_integers: @@ -500,7 +523,12 @@ def _mock_put_multi_index(ary, inds, p, vals): "arrays used as indices must be of integer (or boolean) type" ) ary_np = dpt.asnumpy(ary) - vals_np = dpt.asnumpy(vals) + if isinstance(vals, dpt.usm_ndarray) or hasattr( + vals, "__sycl_usm_array_interface__" + ): + vals_np = dpt.asnumpy(vals) + else: + vals_np = vals ind_np = (slice(None),) * p + tuple(dpt.asnumpy(ind) for ind in inds) ary_np[ind_np] = vals_np ary[...] = ary_np From 6589dfa05b6355cf42919a4028d48f8787d59523 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 14 Feb 2023 11:22:29 -0600 Subject: [PATCH 10/57] Added new test file to test indexing --- dpctl/tests/test_usm_ndarray_indexing.py | 60 ++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 dpctl/tests/test_usm_ndarray_indexing.py diff --git a/dpctl/tests/test_usm_ndarray_indexing.py b/dpctl/tests/test_usm_ndarray_indexing.py new file mode 100644 index 0000000000..2441a663cf --- /dev/null +++ b/dpctl/tests/test_usm_ndarray_indexing.py @@ -0,0 +1,60 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2022 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
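+
+# Unit tests for usm_ndarray indexing behavior.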
+ + +# import numpy as np +# import pytest +from helper import get_queue_or_skip + +# import dpctl +import dpctl.tensor as dpt + +# from helper import skip_if_dtype_not_supported + + +def test_basic_slice1(): + q = get_queue_or_skip() + x = dpt.empty(10, dtype="u2", sycl_queue=q) + y = x[0] + assert isinstance(y, dpt.usm_ndarray) + assert y.ndim == 0 + assert y.shape == tuple() + assert y.strides == tuple() + + +def test_basic_slice2(): + q = get_queue_or_skip() + x = dpt.empty(10, dtype="i2", sycl_queue=q) + y = x[(0,)] + assert isinstance(y, dpt.usm_ndarray) + assert y.ndim == 0 + assert y.shape == tuple() + assert y.strides == tuple() + + +def test_basic_slice3(): + q = get_queue_or_skip() + x = dpt.empty(10, dtype="i2", sycl_queue=q) + y = x[:] + assert isinstance(y, dpt.usm_ndarray) + assert y.ndim == x.ndim + assert y.shape == x.shape + assert y.strides == x.strides + y = x[(slice(None, None, None),)] + assert isinstance(y, dpt.usm_ndarray) + assert y.ndim == x.ndim + assert y.shape == x.shape + assert y.strides == x.strides From d66c4940a9fecec47f77abac546756c6b10ca391 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 15 Feb 2023 09:53:25 -0600 Subject: [PATCH 11/57] Added _constants, and extended advanced indexing tests --- dpctl/tensor/__init__.py | 7 ++ dpctl/tensor/_constants.py | 24 ++++++ dpctl/tests/test_usm_ndarray_indexing.py | 99 ++++++++++++++++++++++++ 3 files changed, 130 insertions(+) create mode 100644 dpctl/tensor/_constants.py diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index cc4e31adf5..7f2a6a9962 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -83,6 +83,8 @@ from dpctl.tensor._reshape import reshape from dpctl.tensor._usmarray import usm_ndarray +from ._constants import e, inf, nan, newaxis, pi + __all__ = [ "Device", "usm_ndarray", @@ -141,4 +143,9 @@ "print_options", "usm_ndarray_repr", "usm_ndarray_str", + "newaxis", + "e", + "pi", + "nan", + "inf", ] diff --git a/dpctl/tensor/_constants.py b/dpctl/tensor/_constants.py new file mode 100644 index 0000000000..88c516364d --- /dev/null +++ b/dpctl/tensor/_constants.py @@ -0,0 +1,24 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
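+
+# Constants exposed in the dpctl.tensor namespace: the newaxis alias and
+# floating-point constants (pi, e, nan, inf) taken from NumPy.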
+ +import numpy as np + +newaxis = None + +pi = np.pi +e = np.e +nan = np.nan +inf = np.inf diff --git a/dpctl/tests/test_usm_ndarray_indexing.py b/dpctl/tests/test_usm_ndarray_indexing.py index 2441a663cf..152c1aefdd 100644 --- a/dpctl/tests/test_usm_ndarray_indexing.py +++ b/dpctl/tests/test_usm_ndarray_indexing.py @@ -58,3 +58,102 @@ def test_basic_slice3(): assert y.ndim == x.ndim assert y.shape == x.shape assert y.strides == x.strides + + +def test_basic_slice4(): + q = get_queue_or_skip() + n0, n1 = 5, 3 + x = dpt.empty((n0, n1), dtype="f4", sycl_queue=q) + y = x[::-1] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == x.shape + assert y.strides == (-x.strides[0], x.strides[1]) + actual_offset = y.__sycl_usm_array_interface__["offset"] + assert actual_offset == (n0 - 1) * n1 + + +def test_basic_slice5(): + q = get_queue_or_skip() + n0, n1 = 5, 3 + x = dpt.empty((n0, n1), dtype="c8", sycl_queue=q) + y = x[:, ::-1] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == x.shape + assert y.strides == (x.strides[0], -x.strides[1]) + actual_offset = y.__sycl_usm_array_interface__["offset"] + assert actual_offset == (n1 - 1) + + +def test_basic_slice6(): + q = get_queue_or_skip() + i0, n0, n1 = 2, 4, 3 + x = dpt.empty((n0, n1), dtype="c8", sycl_queue=q) + y = x[i0, ::-1] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == (x.shape[1],) + assert y.strides == (-x.strides[1],) + actual_offset = y.__sycl_usm_array_interface__["offset"] + expected_offset = i0 * x.strides[0] + (n1 - 1) * x.strides[1] + assert actual_offset == expected_offset + + +def test_basic_slice7(): + q = get_queue_or_skip() + n0, n1, n2 = 5, 3, 2 + x = dpt.empty((n0, n1, n2), dtype="?", sycl_queue=q) + y = x[..., ::-1] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == x.shape + assert y.strides == ( + x.strides[0], + x.strides[1], + -x.strides[2], + ) + actual_offset = y.__sycl_usm_array_interface__["offset"] + expected_offset = (n2 - 1) * x.strides[2] + assert actual_offset == expected_offset + + +def test_basic_slice8(): + q = get_queue_or_skip() + n0, n1 = 3, 7 + x = dpt.empty((n0, n1), dtype="u1", sycl_queue=q) + y = x[..., dpt.newaxis] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == (n0, n1, 1) + assert y.strides == (n1, 1, 0) + + +def test_basic_slice9(): + q = get_queue_or_skip() + n0, n1 = 3, 7 + x = dpt.empty((n0, n1), dtype="u8", sycl_queue=q) + y = x[dpt.newaxis, ...] 
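+    # dpt.newaxis (an alias of None) prepends a length-1 axis with stride 0;
+    # the result is a view, no data are copied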
+ assert isinstance(y, dpt.usm_ndarray) + assert y.shape == (1, n0, n1) + assert y.strides == (0, n1, 1) + + +def test_basic_slice10(): + q = get_queue_or_skip() + n0, n1, n2 = 3, 7, 5 + x = dpt.empty((n0, n1, n2), dtype="u1", sycl_queue=q) + y = x[dpt.newaxis, ..., :] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == (1, n0, n1, n2) + assert y.strides == (0, n1 * n2, n2, 1) + + +def test_advanced_slice1(): + q = get_queue_or_skip() + ii = dpt.asarray([1, 2], sycl_queue=q) + x = dpt.arange(10, dtype="i4", sycl_queue=q) + y = x[ii] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == ii.shape + assert y.strides == (1,) + # FIXME, once usm_ndarray.__equal__ is implemented, + # use of asnumpy should be removed + assert all( + dpt.asnumpy(x[ii[k]]) == dpt.asnumpy(y[k]) for k in range(ii.shape[0]) + ) From 61917a5efee44cdf3c5f08eadd35a06169cc2d81 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 15 Feb 2023 13:46:55 -0600 Subject: [PATCH 12/57] More tests for advanced indexing --- dpctl/tests/test_usm_ndarray_indexing.py | 212 ++++++++++++++++++++++- 1 file changed, 209 insertions(+), 3 deletions(-) diff --git a/dpctl/tests/test_usm_ndarray_indexing.py b/dpctl/tests/test_usm_ndarray_indexing.py index 152c1aefdd..e6c7271ab1 100644 --- a/dpctl/tests/test_usm_ndarray_indexing.py +++ b/dpctl/tests/test_usm_ndarray_indexing.py @@ -16,7 +16,7 @@ # import numpy as np -# import pytest +import pytest from helper import get_queue_or_skip # import dpctl @@ -144,6 +144,10 @@ def test_basic_slice10(): assert y.strides == (0, n1 * n2, n2, 1) +def _all_equal(it1, it2): + return all(dpt.asnumpy(x) == dpt.asnumpy(y) for x, y in zip(it1, it2)) + + def test_advanced_slice1(): q = get_queue_or_skip() ii = dpt.asarray([1, 2], sycl_queue=q) @@ -154,6 +158,208 @@ def test_advanced_slice1(): assert y.strides == (1,) # FIXME, once usm_ndarray.__equal__ is implemented, # use of asnumpy should be removed - assert all( - dpt.asnumpy(x[ii[k]]) == dpt.asnumpy(y[k]) for k in range(ii.shape[0]) + assert _all_equal( + (x[ii[k]] for k in range(ii.shape[0])), + (y[k] for k in range(ii.shape[0])), + ) + y = x[(ii,)] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == ii.shape + assert y.strides == (1,) + # FIXME, once usm_ndarray.__equal__ is implemented, + # use of asnumpy should be removed + assert _all_equal( + (x[ii[k]] for k in range(ii.shape[0])), + (y[k] for k in range(ii.shape[0])), ) + + +def test_advanced_slice2(): + q = get_queue_or_skip() + ii = dpt.asarray([1, 2], sycl_queue=q) + x = dpt.arange(10, dtype="i4", sycl_queue=q) + y = x[ii, dpt.newaxis] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == ii.shape + (1,) + assert y.flags["C"] + + +def test_advanced_slice3(): + q = get_queue_or_skip() + ii = dpt.asarray([1, 2], sycl_queue=q) + x = dpt.arange(10, dtype="i4", sycl_queue=q) + y = x[dpt.newaxis, ii] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == (1,) + ii.shape + assert y.flags["C"] + + +def _make_3d(dt, q): + return dpt.reshape( + dpt.arange(3 * 3 * 3, dtype=dt, sycl_queue=q), + ( + 3, + 3, + 3, + ), + ) + + +def test_advanced_slice4(): + q = get_queue_or_skip() + ii = dpt.asarray([1, 2], sycl_queue=q) + x = _make_3d("i4", q) + y = x[ii, ii, ii] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == ii.shape + assert _all_equal( + (x[ii[k], ii[k], ii[k]] for k in range(ii.shape[0])), + (y[k] for k in range(ii.shape[0])), + ) + + +def test_advanced_slice5(): + q = get_queue_or_skip() + ii = dpt.asarray([1, 2], sycl_queue=q) + x = _make_3d("i4", q) + 
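+    # an integer index placed between two array indices interrupts the
+    # advanced-indexing streak, which this implementation rejects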
with pytest.raises(IndexError): + x[ii, 0, ii] + + +def test_advanced_slice6(): + q = get_queue_or_skip() + ii = dpt.asarray([1, 2], sycl_queue=q) + x = _make_3d("i4", q) + y = x[:, ii, ii] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == ( + x.shape[0], + ii.shape[0], + ) + assert _all_equal( + ( + x[i, ii[k], ii[k]] + for i in range(x.shape[0]) + for k in range(ii.shape[0]) + ), + (y[i, k] for i in range(x.shape[0]) for k in range(ii.shape[0])), + ) + + +def test_advanced_slice7(): + q = get_queue_or_skip() + mask = dpt.asarray( + [ + [[True, True, False], [False, True, True], [True, False, True]], + [[True, False, False], [False, False, True], [False, True, False]], + [[True, True, True], [False, False, False], [False, False, True]], + ], + sycl_queue=q, + ) + x = _make_3d("i2", q) + y = x[mask] + expected = [0, 1, 4, 5, 6, 8, 9, 14, 16, 18, 19, 20, 26] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == (len(expected),) + assert all(dpt.asnumpy(y[k]) == expected[k] for k in range(len(expected))) + + +def test_advanced_slice8(): + q = get_queue_or_skip() + mask = dpt.asarray( + [[True, False, False], [False, True, False], [False, True, False]], + sycl_queue=q, + ) + x = _make_3d("u2", q) + y = x[mask] + expected = dpt.asarray( + [[0, 1, 2], [12, 13, 14], [21, 22, 23]], sycl_queue=q + ) + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == expected.shape + assert (dpt.asnumpy(y) == dpt.asnumpy(expected)).all() + + +def test_advanced_slice9(): + q = get_queue_or_skip() + mask = dpt.asarray( + [[True, False, False], [False, True, False], [False, True, False]], + sycl_queue=q, + ) + x = _make_3d("u4", q) + y = x[:, mask] + expected = dpt.asarray([[0, 4, 7], [9, 13, 16], [18, 22, 25]], sycl_queue=q) + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == expected.shape + assert (dpt.asnumpy(y) == dpt.asnumpy(expected)).all() + + +def lin_id(i, j, k): + """global_linear_id for (3,3,3) range traversed in C-contiguous order""" + return 9 * i + 3 * j + k + + +def test_advanced_slice10(): + q = get_queue_or_skip() + x = _make_3d("u8", q) + i0 = dpt.asarray([0, 1, 1], device=x.device) + i1 = dpt.asarray([1, 1, 2], device=x.device) + i2 = dpt.asarray([2, 0, 1], device=x.device) + y = x[i0, i1, i2] + res_expected = dpt.asarray( + [ + lin_id(0, 1, 2), + lin_id(1, 1, 0), + lin_id(1, 2, 1), + ], + sycl_queue=q, + ) + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == res_expected.shape + assert (dpt.asnumpy(y) == dpt.asnumpy(res_expected)).all() + + +def test_advanced_slice11(): + q = get_queue_or_skip() + x = _make_3d("u8", q) + i0 = dpt.asarray([0, 1, 1], device=x.device) + i2 = dpt.asarray([2, 0, 1], device=x.device) + with pytest.raises(IndexError): + x[i0, :, i2] + + +def test_advanced_slice12(): + q = get_queue_or_skip() + x = _make_3d("u8", q) + i1 = dpt.asarray([1, 1, 2], device=x.device) + i2 = dpt.asarray([2, 0, 1], device=x.device) + y = x[:, dpt.newaxis, i1, i2, dpt.newaxis] + res_expected = dpt.asarray( + [ + [[[lin_id(0, 1, 2)], [lin_id(0, 1, 0)], [lin_id(0, 2, 1)]]], + [[[lin_id(1, 1, 2)], [lin_id(1, 1, 0)], [lin_id(1, 2, 1)]]], + [[[lin_id(2, 1, 2)], [lin_id(2, 1, 0)], [lin_id(2, 2, 1)]]], + ], + sycl_queue=q, + ) + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == res_expected.shape + assert (dpt.asnumpy(y) == dpt.asnumpy(res_expected)).all() + + +def test_advanced_slice13(): + q = get_queue_or_skip() + x = _make_3d("u8", q) + i1 = dpt.asarray([[1], [2]], device=x.device) + i2 = dpt.asarray([[0, 1]], device=x.device) + y = x[i1, i2, 0] + 
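+    # i1 has shape (2, 1) and i2 has shape (1, 2); the index arrays
+    # broadcast to a (2, 2) result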
expected = dpt.asarray( + [ + [lin_id(1, 0, 0), lin_id(1, 1, 0)], + [lin_id(2, 0, 0), lin_id(2, 1, 0)], + ], + device=x.device, + ) + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == expected.shape + assert (dpt.asnumpy(y) == dpt.asnumpy(expected)).all() From c2d7928bb440675be1dac7ef2fe1a98d1bcd2593 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 15 Feb 2023 14:06:16 -0600 Subject: [PATCH 13/57] Removed proto/ folder --- proto/advanced.py | 462 ----------------------------------------- proto/test_advanced.py | 405 ------------------------------------ 2 files changed, 867 deletions(-) delete mode 100644 proto/advanced.py delete mode 100644 proto/test_advanced.py diff --git a/proto/advanced.py b/proto/advanced.py deleted file mode 100644 index 8721214d73..0000000000 --- a/proto/advanced.py +++ /dev/null @@ -1,462 +0,0 @@ -import numbers - -import dpctl.tensor as dpt -import dpctl.utils -from dpctl.tensor import usm_ndarray - -""" -Advanced slicing meta-infomation extraction -""" - - -class ExecutionPlacementError(Exception): - pass - - -def _slice_len(sl_start: int, sl_stop: int, sl_step: int): - """ - Compute len(range(sl_start, sl_stop, sl_step)) - """ - if sl_start == sl_stop: - return 0 - if sl_step > 0: - # 1 + argmax k such htat sl_start + sl_step*k < sl_stop - return 1 + ((sl_stop - sl_start - 1) // sl_step) - else: - return 1 + ((sl_stop - sl_start + 1) // sl_step) - - -def _is_integral(x): - """Gives True if x is an integral slice spec""" - if isinstance(x, (int, numbers.Integral)): - return True - if isinstance(x, usm_ndarray): - if x.ndim > 0: - return False - if x.dtype.kind not in "ui": - return False - return True - if callable(getattr(x, "__index__", None)): - try: - x.__index__() - except (TypeError, ValueError): - return False - return True - return False - - -def _basic_slice_meta(ind, shape: tuple, strides: tuple, offset: int): - """ - Give basic slicing index `ind` and array layout information produce - a 5-tuple (resulting_shape, resulting_strides, resulting_offset, - advanced_ind, resulting_advanced_ind_pos) - used to contruct a view into underlying array over which advanced - indexing, if any, is to be performed. - - Raises IndexError for invalid index `ind`. 
- """ - _no_advanced_ind = tuple() - _no_advanced_pos = -1 - if ind is Ellipsis: - return (shape, strides, offset, _no_advanced_ind, _no_advanced_pos) - elif ind is None: - return ( - (1,) + shape, - (0,) + strides, - offset, - _no_advanced_ind, - _no_advanced_pos, - ) - elif isinstance(ind, slice): - sl_start, sl_stop, sl_step = ind.indices(shape[0]) - sh0 = _slice_len(sl_start, sl_stop, sl_step) - str0 = sl_step * strides[0] - new_strides = ( - strides if (sl_step == 1 or sh0 == 0) else (str0,) + strides[1:] - ) - new_offset = offset if sh0 == 0 else offset + sl_start * strides[0] - return ( - (sh0,) + shape[1:], - new_strides, - new_offset, - _no_advanced_ind, - _no_advanced_pos, - ) - elif _is_integral(ind): - ind = ind.__index__() - if 0 <= ind < shape[0]: - return ( - shape[1:], - strides[1:], - offset + ind * strides[0], - _no_advanced_ind, - _no_advanced_pos, - ) - elif -shape[0] <= ind < 0: - return ( - shape[1:], - strides[1:], - offset + (shape[0] + ind) * strides[0], - _no_advanced_ind, - _no_advanced_pos, - ) - else: - raise IndexError( - "Index {0} is out of range for axes 0 with " - "size {1}".format(ind, shape[0]) - ) - elif isinstance(ind, usm_ndarray): - return (shape, strides, 0, (ind,), 0) - elif isinstance(ind, tuple): - axes_referenced = 0 - ellipses_count = 0 - newaxis_count = 0 - explicit_index = 0 - array_count = 0 - seen_arrays_yet = False - array_streak_started = False - array_streak_interrupted = False - for i in ind: - if i is None: - newaxis_count += 1 - if array_streak_started: - array_streak_interrupted = True - elif i is Ellipsis: - ellipses_count += 1 - if array_streak_started: - array_streak_interrupted = True - elif isinstance(i, slice): - axes_referenced += 1 - if array_streak_started: - array_streak_interrupted = True - elif _is_integral(i): - explicit_index += 1 - axes_referenced += 1 - if array_streak_started: - array_streak_interrupted = True - elif isinstance(i, usm_ndarray): - if not seen_arrays_yet: - seen_arrays_yet = True - array_streak_started = True - array_streak_interrupted = False - if array_streak_interrupted: - raise IndexError( - "Advanced indexing array specs may not be " - "separated by basic slicing specs." 
- ) - dt_k = i.dtype.kind - if dt_k == "b": - axes_referenced += i.ndim - elif dt_k in "ui": - axes_referenced += 1 - else: - raise IndexError( - "arrays used as indices must be of integer " - "(or boolean) type" - ) - array_count += 1 - else: - raise TypeError - if ellipses_count > 1: - raise IndexError("an index can only have a sinlge ellipsis ('...')") - if axes_referenced > len(shape): - raise IndexError( - "too many indices for an array, array is " - "{0}-dimensional, but {1} were indexed".format( - len(shape), axes_referenced - ) - ) - if ellipses_count: - ellipses_count = len(shape) - axes_referenced - new_shape_len = ( - newaxis_count + ellipses_count + axes_referenced - explicit_index - ) - new_shape = list() - new_strides = list() - new_advanced_ind = list() - k = 0 - new_advanced_start_pos = -1 - advanced_start_pos_set = False - new_offset = offset - is_empty = False - for i in range(len(ind)): - ind_i = ind[i] - if ind_i is Ellipsis: - k_new = k + ellipses_count - new_shape.extend(shape[k:k_new]) - new_strides.extend(strides[k:k_new]) - k = k_new - elif ind_i is None: - new_shape.append(1) - new_strides.append(0) - elif isinstance(ind_i, slice): - k_new = k + 1 - sl_start, sl_stop, sl_step = ind_i.indices(shape[k]) - sh_i = _slice_len(sl_start, sl_stop, sl_step) - str_i = (1 if sh_i == 0 else sl_step) * strides[k] - new_shape.append(sh_i) - new_strides.append(str_i) - if sh_i > 0 and not is_empty: - new_offset = new_offset + sl_start * strides[k] - if sh_i == 0: - is_empty = True - k = k_new - elif _is_integral(ind_i): - ind_i = ind_i.__index__() - if 0 <= ind_i < shape[k]: - k_new = k + 1 - if not is_empty: - new_offset = new_offset + ind_i * strides[k] - k = k_new - elif -shape[k] <= ind_i < 0: - k_new = k + 1 - if not is_empty: - new_offset = ( - new_offset + (shape[k] + ind_i) * strides[k] - ) - k = k_new - else: - raise IndexError( - ( - "Index {0} is out of range for " - "axes {1} with size {2}" - ).format(ind_i, k, shape[k]) - ) - elif isinstance(ind_i, usm_ndarray): - if not advanced_start_pos_set: - new_advanced_start_pos = len(new_shape) - advanced_start_pos_set = True - new_advanced_ind.append(ind_i) - dt_k = ind_i.dtype.kind - if dt_k == "b": - k_new = k + ind_i.ndim - else: - k_new = k + 1 - new_shape.extend(shape[k:k_new]) - new_strides.extend(strides[k:k_new]) - k = k_new - new_shape.extend(shape[k:]) - new_strides.extend(strides[k:]) - debug = True - if debug: - new_shape_len += len(shape) - k - assert ( - len(new_shape) == new_shape_len - ), f"{len(new_shape)} vs {new_shape_len}" - assert ( - len(new_strides) == new_shape_len - ), f"{len(new_strides)} vs {new_shape_len}" - assert len(new_advanced_ind) == array_count - return ( - tuple(new_shape), - tuple(new_strides), - new_offset, - tuple(new_advanced_ind), - new_advanced_start_pos, - ) - else: - raise TypeError - - -def _mock_extract(ary, ary_mask, p): - exec_q = dpctl.utils.get_execution_queue( - ( - ary.sycl_queue, - ary_mask.sycl_queue, - ) - ) - if exec_q is None: - raise ExecutionPlacementError( - "Can not automatically determine where to allocate the " - "result or performance execution. " - "Use `usm_ndarray.to_device` method to migrate data to " - "be associated with the same queue." 
- ) - - res_usm_type = dpctl.utils.get_coerced_usm_type( - ( - ary.usm_type, - ary_mask.usm_type, - ) - ) - ary_np = dpt.asnumpy(ary) - mask_np = dpt.asnumpy(ary_mask) - res_np = ary_np[(slice(None),) * p + (mask_np,)] - res = dpt.empty( - res_np.shape, dtype=ary.dtype, usm_type=res_usm_type, sycl_queue=exec_q - ) - res[...] = res_np - return res - - -def _mock_nonzero(ary): - if not isinstance(ary, usm_ndarray): - raise TypeError - q = ary.sycl_queue - usm_type = ary.usm_type - ary_np = dpt.asnumpy(ary) - nz = ary_np.nonzero() - return tuple(dpt.asarray(i, usm_type=usm_type, sycl_queue=q) for i in nz) - - -def _mock_take_multi_index(ary, inds, p): - queues_ = [ - ary.sycl_queue, - ] - usm_types_ = [ - ary.usm_type, - ] - all_integers = True - for ind in inds: - queues_.append(ind.sycl_queue) - usm_types_.append(ind.usm_type) - if all_integers: - all_integers = ind.dtype.kind in "ui" - exec_q = dpctl.utils.get_execution_queue(queues_) - if exec_q is None: - raise ExecutionPlacementError("") - if not all_integers: - print(inds) - raise IndexError( - "arrays used as indices must be of integer (or boolean) type" - ) - ary_np = dpt.asnumpy(ary) - ind_np = (slice(None),) * p + tuple(dpt.asnumpy(ind) for ind in inds) - res_np = ary_np[ind_np] - res_usm_type = dpctl.utils.get_coerced_usm_type(usm_types_) - res = dpt.empty( - res_np.shape, dtype=ary.dtype, usm_type=res_usm_type, sycl_queue=exec_q - ) - res[...] = res_np - return res - - -def get_item(ary, ind): - suai = ary.__sycl_usm_array_interface__ - _meta = _basic_slice_meta( - ind, ary.shape, ary.strides, suai.get("offset", 0) - ) - - if len(_meta) < 5: - raise RuntimeError - - res = usm_ndarray.__new__( - usm_ndarray, - _meta[0], - dtype=ary.dtype, # _make_typestr(ary.dtype.num), - strides=_meta[1], - buffer=ary.usm_data, # self.base_, - offset=_meta[2], - ) - # set flags and namespace - # res.flags_ |= (ary.flags_ & USM_ARRAY_WRITABLE) - # res.array_namespace_ = self.array_namespace_ - adv_ind = _meta[3] - adv_ind_start_p = _meta[4] - - if adv_ind_start_p < 0: - return res - - if len(adv_ind) == 1 and adv_ind[0].dtype == dpt.bool: - return _mock_extract(res, adv_ind[0], adv_ind_start_p) - - if any(ind.dtype == dpt.bool for ind in adv_ind): - adv_ind_int = list() - for ind in adv_ind: - if ind.dtype == dpt.bool: - adv_ind_int.extend(_mock_nonzero(ind)) - else: - adv_ind_int.append(ind) - return _mock_take_multi_index(res, tuple(adv_ind_int), adv_ind_start_p) - - return _mock_take_multi_index(res, adv_ind, adv_ind_start_p) - - -def _mock_place(ary, ary_mask, p, vals): - exec_q = dpctl.utils.get_execution_queue( - (ary.sycl_queue, ary_mask.sycl_queue, vals.sycl_queue) - ) - if exec_q is None: - raise ExecutionPlacementError( - "Can not automatically determine where to allocate the " - "result or performance execution. " - "Use `usm_ndarray.to_device` method to migrate data to " - "be associated with the same queue." - ) - - ary_np = dpt.asnumpy(ary) - mask_np = dpt.asnumpy(ary_mask) - vals_np = dpt.asnumpy(vals) - ary_np[(slice(None),) * p + (mask_np,)] = vals_np - ary[...] 
= ary_np - return - - -def _mock_put_multi_index(ary, inds, p, vals): - queues_ = [ary.sycl_queue, vals.sycl_queue] - usm_types_ = [ary.usm_type, vals.usm_type] - all_integers = True - for ind in inds: - queues_.append(ind.sycl_queue) - usm_types_.append(ind.usm_type) - if all_integers: - all_integers = ind.dtype.kind in "ui" - exec_q = dpctl.utils.get_execution_queue(queues_) - if exec_q is None: - raise ExecutionPlacementError("") - if not all_integers: - print(inds) - raise IndexError( - "arrays used as indices must be of integer (or boolean) type" - ) - ary_np = dpt.asnumpy(ary) - vals_np = dpt.asnumpy(vals) - ind_np = (slice(None),) * p + tuple(dpt.asnumpy(ind) for ind in inds) - ary_np[ind_np] = vals_np - ary[...] = ary_np - return - - -def set_item(ary, ind, rhs): - suai = ary.__sycl_usm_array_interface__ - _meta = _basic_slice_meta( - ind, ary.shape, ary.strides, suai.get("offset", 0) - ) - - if len(_meta) < 5: - raise RuntimeError - - res = usm_ndarray.__new__( - usm_ndarray, - _meta[0], - dtype=ary.dtype, # _make_typestr(ary.dtype.num), - strides=_meta[1], - buffer=ary.usm_data, # self.base_, - offset=_meta[2], - ) - # set flags and namespace - # res.flags_ |= (ary.flags_ & USM_ARRAY_WRITABLE) - # res.array_namespace_ = self.array_namespace_ - adv_ind = _meta[3] - adv_ind_start_p = _meta[4] - - if adv_ind_start_p < 0: - res[...] = rhs - return - - if len(adv_ind) == 1 and adv_ind[0].dtype == dpt.bool: - _mock_place(res, adv_ind[0], adv_ind_start_p, rhs) - return - - if any(ind.dtype == dpt.bool for ind in adv_ind): - adv_ind_int = list() - for ind in adv_ind: - if ind.dtype == dpt.bool: - adv_ind_int.extend(_mock_nonzero(ind)) - else: - adv_ind_int.append(ind) - _mock_put_multi_index(res, tuple(adv_ind_int), adv_ind_start_p, rhs) - return - - _mock_put_multi_index(res, adv_ind, adv_ind_start_p, rhs) - return diff --git a/proto/test_advanced.py b/proto/test_advanced.py deleted file mode 100644 index 7cfb44c3d6..0000000000 --- a/proto/test_advanced.py +++ /dev/null @@ -1,405 +0,0 @@ -import advanced -import numpy as np -import pytest - -import dpctl.tensor as dpt - - -def test_basic_slice1(): - res = advanced._basic_slice_meta((0,), (1,), (1,), 0) - assert res == (tuple(), tuple(), 0, tuple(), -1) - - -def test_basic_slice1a(): - res = advanced._basic_slice_meta(0, (1,), (1,), 0) - assert res == (tuple(), tuple(), 0, tuple(), -1) - - -def test_basic_slice2(): - res = advanced._basic_slice_meta((slice(None),), (1,), (1,), 0) - assert res == ((1,), (1,), 0, tuple(), -1) - - -def test_basic_slice3(): - res = advanced._basic_slice_meta((slice(None, None, -1),), (1,), (1,), 0) - assert res == ((1,), (-1,), 0, tuple(), -1) - - -def test_basic_slice4(): - res = advanced._basic_slice_meta( - (slice(None, None, -1),), - ( - 5, - 3, - ), - ( - 3, - 1, - ), - 0, - ) - assert res == ((5, 3), (-3, 1), (5 - 1) * 3, tuple(), -1) - - -def test_basic_slice5(): - res = advanced._basic_slice_meta( - ( - slice(None), - slice(None, None, -1), - ), - ( - 4, - 3, - ), - ( - 3, - 1, - ), - 0, - ) - assert res == ((4, 3), (3, -1), 3 - 1, tuple(), -1) - - -def test_basic_slice6(): - res = advanced._basic_slice_meta( - ( - 2, - slice(None, None, -1), - ), - ( - 4, - 3, - ), - ( - 3, - 1, - ), - 0, - ) - assert res == ((3,), (-1,), 2 * 3 + 3 - 1, tuple(), -1) - - -def test_basic_slice7(): - res = advanced._basic_slice_meta( - ( - Ellipsis, - slice(None, None, -1), - ), - ( - 4, - 3, - ), - ( - 3, - 1, - ), - 0, - ) - assert res == ((4, 3), (3, -1), 3 - 1, tuple(), -1) - - -def test_basic_slice8(): - res = 
advanced._basic_slice_meta( - (Ellipsis, None), - ( - 4, - 3, - ), - ( - 3, - 1, - ), - 0, - ) - assert res == ((4, 3, 1), (3, 1, 0), 0, tuple(), -1) - - -def test_basic_slice9(): - res = advanced._basic_slice_meta( - ( - None, - Ellipsis, - ), - ( - 4, - 3, - ), - ( - 3, - 1, - ), - 0, - ) - assert res == ( - ( - 1, - 4, - 3, - ), - (0, 3, 1), - 0, - tuple(), - -1, - ) - - -def test_basic_slice10(): - res = advanced._basic_slice_meta( - (None, Ellipsis, slice(None)), (4, 3, 5), (30, 5, 1), 0 - ) - assert res == ((1, 4, 3, 5), (0, 30, 5, 1), 0, tuple(), -1) - - -def test_advanced_slice1(): - ii = dpt.asarray([0, 1]) - res = advanced._basic_slice_meta((ii,), (10,), (1,), 0) - assert res == ((10,), (1,), 0, (ii,), 0) - - res = advanced._basic_slice_meta(ii, (10,), (1,), 0) - assert res == ((10,), (1,), 0, (ii,), 0) - - -def test_advanced_slice2(): - ii = dpt.asarray([0, 1]) - res = advanced._basic_slice_meta((ii, None), (10,), (1,), 0) - assert res == ((10, 1), (1, 0), 0, (ii,), 0) - - -def test_advanced_slice3(): - ii = dpt.asarray([0, 1]) - res = advanced._basic_slice_meta((None, ii), (10,), (1,), 0) - assert res == ( - ( - 1, - 10, - ), - ( - 0, - 1, - ), - 0, - (ii,), - 1, - ) - - -def test_advanced_slice4(): - ii = dpt.asarray([0, 1]) - res = advanced._basic_slice_meta( - (ii, ii, ii), - (10, 10, 10), - ( - 100, - 10, - 1, - ), - 0, - ) - assert res == ( - (10, 10, 10), - ( - 100, - 10, - 1, - ), - 0, - (ii, ii, ii), - 0, - ) - - -def test_advanced_slice5(): - ii = dpt.asarray([0, 1]) - with pytest.raises(IndexError): - advanced._basic_slice_meta( - (ii, slice(None), ii), - (10, 10, 10), - ( - 100, - 10, - 1, - ), - 0, - ) - - -def test_advanced_slice6(): - ii = dpt.asarray([0, 1]) - res = advanced._basic_slice_meta( - ( - slice(None), - ii, - ii, - ), - (10, 10, 10), - ( - 100, - 10, - 1, - ), - 0, - ) - assert res == ( - ( - 10, - 10, - 10, - ), - (100, 10, 1), - 0, - ( - ii, - ii, - ), - 1, - ) - - -def test_advanced_slice7(): - x = dpt.reshape( - dpt.arange(3 * 3 * 3, dtype="i8"), - ( - 3, - 3, - 3, - ), - ) - mask = dpt.asarray( - [ - [[True, True, False], [False, True, True], [True, False, True]], - [[True, False, False], [False, False, True], [False, True, False]], - [[True, True, True], [False, False, False], [False, False, True]], - ] - ) - res = advanced.get_item(x, mask) - res_expected = np.array([0, 1, 4, 5, 6, 8, 9, 14, 16, 18, 19, 20, 26]) - assert np.array_equal(dpt.asnumpy(res), res_expected) - res2 = advanced.get_item(x, (mask,)) - assert np.array_equal(dpt.asnumpy(res2), res_expected) - - -def test_advanced_slice8(): - x = dpt.reshape( - dpt.arange(3 * 3 * 3, dtype="i8"), - ( - 3, - 3, - 3, - ), - ) - mask = dpt.asarray( - [[True, False, False], [False, True, False], [False, True, False]] - ) - res = advanced.get_item(x, mask) - res_expected = np.array([[0, 1, 2], [12, 13, 14], [21, 22, 23]]) - assert np.array_equal(dpt.asnumpy(res), res_expected) - res2 = advanced.get_item(x, (mask,)) - assert np.array_equal(dpt.asnumpy(res2), res_expected) - - -def test_advanced_slice9(): - x = dpt.reshape( - dpt.arange(3 * 3 * 3, dtype="i8"), - ( - 3, - 3, - 3, - ), - ) - mask = dpt.asarray( - [[True, False, False], [False, True, False], [False, True, False]] - ) - res = advanced.get_item( - x, - ( - slice(None, None, None), - mask, - ), - ) - res_expected = np.array([[0, 4, 7], [9, 13, 16], [18, 22, 25]]) - assert np.array_equal(dpt.asnumpy(res), res_expected) - - -def lin_id(i, j, k): - return 9 * i + 3 * j + k - - -def test_advanced_slice10(): - x = dpt.reshape( - dpt.arange(3 * 
3 * 3, dtype="i8"), - ( - 3, - 3, - 3, - ), - ) - i0 = dpt.asarray([0, 1, 1]) - i1 = dpt.asarray([1, 1, 2]) - i2 = dpt.asarray([2, 0, 1]) - res = advanced.get_item(x, (i0, i1, i2)) - res_expected = np.array( - [ - lin_id(0, 1, 2), - lin_id(1, 1, 0), - lin_id(1, 2, 1), - ] - ) - assert np.array_equal(dpt.asnumpy(res), res_expected) - - -def test_advanced_slice11(): - x = dpt.reshape( - dpt.arange(3 * 3 * 3, dtype="i8"), - ( - 3, - 3, - 3, - ), - ) - i0 = dpt.asarray([0, 1, 1]) - i2 = dpt.asarray([2, 0, 1]) - with pytest.raises(IndexError): - advanced.get_item(x, (i0, slice(None, None, None), i2)) - - -def test_advanced_slice12(): - x = dpt.reshape( - dpt.arange(3 * 3 * 3, dtype="i8"), - ( - 3, - 3, - 3, - ), - ) - i1 = dpt.asarray([1, 1, 2]) - i2 = dpt.asarray([2, 0, 1]) - res = advanced.get_item(x, (slice(None), None, i1, i2, None)) - res_expected = np.array( - [ - [[[lin_id(0, 1, 2)], [lin_id(0, 1, 0)], [lin_id(0, 2, 1)]]], - [[[lin_id(1, 1, 2)], [lin_id(1, 1, 0)], [lin_id(1, 2, 1)]]], - [[[lin_id(2, 1, 2)], [lin_id(2, 1, 0)], [lin_id(2, 2, 1)]]], - ] - ) - assert np.array_equal(dpt.asnumpy(res), res_expected) - - -def test_advanced_slice13(): - x = dpt.reshape( - dpt.arange(3 * 3 * 3, dtype="i8"), - ( - 3, - 3, - 3, - ), - ) - i1 = dpt.asarray([[1], [2]]) - i2 = dpt.asarray([[0, 1]]) - res = advanced.get_item(x, (i1, i2, 0)) - res_expected = np.array( - [ - [lin_id(1, 0, 0), lin_id(1, 1, 0)], - [lin_id(2, 0, 0), lin_id(2, 1, 0)], - ] - ) - assert np.array_equal(dpt.asnumpy(res), res_expected) From 832a981f1bbda43a872eced15322ce90ae6ca5d9 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Wed, 22 Feb 2023 15:40:44 -0800 Subject: [PATCH 14/57] Implemented advanced indexing kernels - Kernels for _take, _put - Python API functions for take, put --- dpctl/tensor/CMakeLists.txt | 1 + dpctl/tensor/__init__.py | 3 + dpctl/tensor/_copy_utils.py | 57 +- dpctl/tensor/_indexing_functions.py | 171 +++ .../include/kernels/advanced_indexing.hpp | 417 ++++++ .../libtensor/source/advanced_indexing.cpp | 1142 +++++++++++++++++ .../libtensor/source/advanced_indexing.hpp | 62 + dpctl/tensor/libtensor/source/tensor_py.cpp | 24 + 8 files changed, 1862 insertions(+), 15 deletions(-) create mode 100644 dpctl/tensor/_indexing_functions.py create mode 100644 dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp create mode 100644 dpctl/tensor/libtensor/source/advanced_indexing.cpp create mode 100644 dpctl/tensor/libtensor/source/advanced_indexing.hpp diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index b496ecfbd8..83db95805e 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -23,6 +23,7 @@ pybind11_add_module(${python_module_name} MODULE ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/advanced_indexing.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index 7f2a6a9962..d21958b4fa 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -58,6 +58,7 @@ ) from dpctl.tensor._device import Device from dpctl.tensor._dlpack import from_dlpack +from dpctl.tensor._indexing_functions import put, take from dpctl.tensor._manipulation_functions 
import ( broadcast_arrays, broadcast_to, @@ -112,6 +113,8 @@ "expand_dims", "permute_dims", "squeeze", + "take", + "put", "from_numpy", "to_numpy", "asnumpy", diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py index f83ecdbd74..382d92bb79 100644 --- a/dpctl/tensor/_copy_utils.py +++ b/dpctl/tensor/_copy_utils.py @@ -13,7 +13,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import operator + import numpy as np +from numpy.core.numeric import normalize_axis_index import dpctl import dpctl.memory as dpm @@ -449,14 +452,25 @@ def _mock_take_multi_index(ary, inds, p): raise IndexError( "arrays used as indices must be of integer (or boolean) type" ) - ary_np = dpt.asnumpy(ary) - ind_np = (slice(None),) * p + tuple(dpt.asnumpy(ind) for ind in inds) - res_np = ary_np[ind_np] + inds = dpt.broadcast_arrays(*inds) + ary_ndim = ary.ndim + if ary_ndim > 0: + p = operator.index(p) + p = normalize_axis_index(p, ary_ndim) + + res_shape = ary.shape[:p] + inds[0].shape + ary.shape[p + len(inds) :] + else: + res_shape = inds[0].shape res_usm_type = dpctl.utils.get_coerced_usm_type(usm_types_) res = dpt.empty( - res_np.shape, dtype=ary.dtype, usm_type=res_usm_type, sycl_queue=exec_q + res_shape, dtype=ary.dtype, usm_type=res_usm_type, sycl_queue=exec_q ) - res[...] = res_np + + hev, _ = ti._take( + src=ary, ind=inds, dst=res, axis_start=p, mode=0, sycl_queue=exec_q + ) + hev.wait() + return res @@ -492,7 +506,7 @@ def _mock_place(ary, ary_mask, p, vals): def _mock_put_multi_index(ary, inds, p, vals): - if isinstance(vals, dpt.ums_ndarray): + if isinstance(vals, dpt.usm_ndarray): queues_ = [ary.sycl_queue, vals.sycl_queue] usm_types_ = [ary.usm_type, vals.usm_type] else: @@ -522,14 +536,27 @@ def _mock_put_multi_index(ary, inds, p, vals): raise IndexError( "arrays used as indices must be of integer (or boolean) type" ) - ary_np = dpt.asnumpy(ary) - if isinstance(vals, dpt.usm_ndarray) or hasattr( - vals, "__sycl_usm_array_interface__" - ): - vals_np = dpt.asnumpy(vals) + + inds = dpt.broadcast_arrays(*inds) + ary_ndim = ary.ndim + if ary_ndim > 0: + p = operator.index(p) + p = normalize_axis_index(p, ary_ndim) + vals_shape = ary.shape[:p] + inds[0].shape + ary.shape[p + len(inds) :] else: - vals_np = vals - ind_np = (slice(None),) * p + tuple(dpt.asnumpy(ind) for ind in inds) - ary_np[ind_np] = vals_np - ary[...] = ary_np + vals_shape = inds[0].shape + + vals_usm_type = dpctl.utils.get_coerced_usm_type(usm_types_) + if not isinstance(vals, dpt.usm_ndarray): + vals = dpt.asarray( + vals, ary.dtype, usm_type=vals_usm_type, sycl_queue=exec_q + ) + + vals = dpt.broadcast_to(vals, vals_shape) + + hev, _ = ti._put( + dst=ary, ind=inds, val=vals, axis_start=p, mode=0, sycl_queue=exec_q + ) + hev.wait() + return diff --git a/dpctl/tensor/_indexing_functions.py b/dpctl/tensor/_indexing_functions.py new file mode 100644 index 0000000000..c3562de8f8 --- /dev/null +++ b/dpctl/tensor/_indexing_functions.py @@ -0,0 +1,171 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2022 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import operator + +import numpy as np +from numpy.core.numeric import normalize_axis_index + +import dpctl +import dpctl.tensor as dpt +from dpctl.tensor._tensor_impl import _put, _take + + +def take(x, indices, /, *, axis=None, mode="clip"): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expected instance of `dpt.usm_ndarray`, got `{}`.".format(type(x)) + ) + + if not isinstance(indices, list) and not isinstance(indices, tuple): + indices = (indices,) + + queues_ = [ + x.sycl_queue, + ] + usm_types_ = [ + x.usm_type, + ] + + for i in indices: + if not isinstance(i, dpt.usm_ndarray): + raise TypeError( + "`indices` expected `dpt.usm_ndarray`, got `{}`.".format( + type(i) + ) + ) + if not np.issubdtype(i.dtype, np.integer): + raise TypeError( + "`indices` expected integer data type, got `{}`".format(i.dtype) + ) + queues_.append(i.sycl_queue) + usm_types_.append(i.usm_type) + exec_q = dpctl.utils.get_execution_queue(queues_) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError( + "Can not automatically determine where to allocate the " + "result or performance execution. " + "Use `usm_ndarray.to_device` method to migrate data to " + "be associated with the same queue." + ) + res_usm_type = dpctl.utils.get_coerced_usm_type(usm_types_) + + modes = {"clip": 0, "wrap": 1} + try: + mode = modes[mode] + except KeyError: + raise ValueError("`mode` must be `clip` or `wrap`.") + + x_ndim = x.ndim + if axis is None: + if x_ndim > 1: + raise ValueError( + "`axis` cannot be `None` for array of dimension `{}`".format( + x_ndim + ) + ) + axis = 0 + + indices = dpt.broadcast_arrays(*indices) + if x_ndim > 0: + axis = operator.index(axis) + axis = normalize_axis_index(axis, x_ndim) + res_shape = ( + x.shape[:axis] + indices[0].shape + x.shape[axis + len(indices) :] + ) + else: + res_shape = indices[0].shape + + res = dpt.empty( + res_shape, dtype=x.dtype, usm_type=res_usm_type, sycl_queue=exec_q + ) + + hev, _ = _take(x, indices, res, axis, mode, sycl_queue=exec_q) + hev.wait() + + return res + + +def put(x, indices, vals, /, *, axis=None, mode="clip"): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expected instance of `dpt.usm_ndarray`, got `{}`.".format(type(x)) + ) + queues_ = [ + x.sycl_queue, + ] + usm_types_ = [ + x.usm_type, + ] + + if not isinstance(indices, list) and not isinstance(indices, tuple): + indices = (indices,) + + for i in indices: + if not isinstance(i, dpt.usm_ndarray): + raise TypeError( + "`indices` expected `dpt.usm_ndarray`, got `{}`.".format( + type(i) + ) + ) + if not np.issubdtype(i.dtype, np.integer): + raise TypeError( + "`indices` expected integer data type, got `{}`".format(i.dtype) + ) + queues_.append(i.sycl_queue) + usm_types_.append(i.usm_type) + exec_q = dpctl.utils.get_execution_queue(queues_) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError( + "Can not automatically determine where to allocate the " + "result or performance execution. " + "Use `usm_ndarray.to_device` method to migrate data to " + "be associated with the same queue." 
+ ) + val_usm_type = dpctl.utils.get_coerced_usm_type(usm_types_) + + modes = {"clip": 0, "wrap": 1} + try: + mode = modes[mode] + except KeyError: + raise ValueError("`mode` must be `wrap`, or `clip`.") + + # when axis is none, array is treated as 1D + if axis is None: + x = dpt.reshape(x, (x.size,), copy=False) + axis = 0 + + indices = dpt.broadcast_arrays(*indices) + x_ndim = x.ndim + if x_ndim > 0: + axis = operator.index(axis) + axis = normalize_axis_index(axis, x_ndim) + + val_shape = ( + x.shape[:axis] + indices[0].shape + x.shape[axis + len(indices) :] + ) + else: + val_shape = indices[0].shape + + if not isinstance(vals, dpt.usm_ndarray): + vals = dpt.asarray( + vals, dtype=x.dtype, usm_type=val_usm_type, sycl_queue=exec_q + ) + + vals = dpt.broadcast_to(vals, val_shape) + + hev, _ = _put(x, indices, vals, axis, mode, sycl_queue=exec_q) + hev.wait() diff --git a/dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp new file mode 100644 index 0000000000..77234296ff --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp @@ -0,0 +1,417 @@ +//=== indexing.hpp - Implementation of indexing kernels ---*-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for advanced tensor index operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include "utils/strided_iters.hpp" +#include "utils/type_utils.hpp" +#include +#include +#include +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace indexing +{ + +namespace py = pybind11; + +template class take_kernel; +template class put_kernel; + +template class ClipIndex +{ +public: + ClipIndex() = default; + + void operator()(py::ssize_t max_item, indT &ind) const + { + max_item = (max_item > 0) ? max_item : 1; + py::ssize_t clip_ind = static_cast(ind); + ind = (ind < 0) ? 0 : (clip_ind >= max_item) ? (max_item - 1) : ind; + return; + } +}; + +template class WrapIndex +{ +public: + WrapIndex() = default; + + void operator()(py::ssize_t max_item, indT &ind) const + { + max_item = (max_item > 0) ? max_item : 1; + py::ssize_t wrap_ind = static_cast(ind); + ind = (ind < 0) ? max_item - (-wrap_ind % max_item) + : (wrap_ind >= max_item) ? 
wrap_ind % max_item + : ind; + return; + } +}; + +template class TakeFunctor +{ +private: + const char *src_ = nullptr; + char *dst_ = nullptr; + char **ind_ = nullptr; + int nd_ = 0; + int ind_nd_ = 0; + int k_ = 0; + size_t ind_nelems_ = 0; + const py::ssize_t *orthog_shape_and_strides_ = nullptr; + const py::ssize_t *axes_shape_and_strides_ = nullptr; + const py::ssize_t *ind_shape_and_strides_ = nullptr; + py::ssize_t src_offset_ = 0; + py::ssize_t dst_offset_ = 0; + const py::ssize_t *ind_offsets_ = nullptr; + +public: + TakeFunctor(const char *src_cp, + char *dst_cp, + char **ind_cp, + int nd, + int ind_nd, + int k, + size_t ind_nelems, + const py::ssize_t *orthog_shape_and_strides, + const py::ssize_t *axes_shape_and_strides, + const py::ssize_t *ind_shape_and_strides, + py::ssize_t src_offset, + py::ssize_t dst_offset, + const py::ssize_t *ind_offsets) + : src_(src_cp), dst_(dst_cp), ind_(ind_cp), nd_(nd), ind_nd_(ind_nd), + k_(k), ind_nelems_(ind_nelems), + orthog_shape_and_strides_(orthog_shape_and_strides), + axes_shape_and_strides_(axes_shape_and_strides), + ind_shape_and_strides_(ind_shape_and_strides), + src_offset_(src_offset), dst_offset_(dst_offset), + ind_offsets_(ind_offsets) + { + } + + void operator()(sycl::id<1> id) const + { + const T *src = reinterpret_cast(src_); + T *dst = reinterpret_cast(dst_); + + py::ssize_t i_orthog = id / ind_nelems_; + py::ssize_t i_along = id - (i_orthog * ind_nelems_); + + py::ssize_t src_orthog_idx(0); + py::ssize_t dst_orthog_idx(0); + CIndexer_vector indxr(nd_); + indxr.get_displacement( + static_cast(i_orthog), + orthog_shape_and_strides_, // common shape + orthog_shape_and_strides_ + nd_, // src strides + orthog_shape_and_strides_ + 2 * nd_, // dst strides + src_orthog_idx, // modified by reference + dst_orthog_idx); + + ProjectorT proj{}; + py::ssize_t ind_arr_idx(0); + CIndexer_vector ind_indxr(ind_nd_); + for (int axis_idx = 0; axis_idx < k_; ++axis_idx) { + ind_indxr.get_displacement( + static_cast(i_along), ind_shape_and_strides_, + ind_shape_and_strides_ + ((axis_idx + 1) * ind_nd_), + ind_arr_idx); + indT *ind_data = reinterpret_cast(ind_[axis_idx]); + indT i = ind_data[ind_arr_idx + ind_offsets_[axis_idx]]; + proj(axes_shape_and_strides_[axis_idx], i); + src_orthog_idx += i * axes_shape_and_strides_[k_ + axis_idx]; + } + py::ssize_t ind_dst_idx(0); + ind_indxr.get_displacement( + static_cast(i_along), ind_shape_and_strides_, + axes_shape_and_strides_ + (2 * k_), ind_dst_idx); + + dst[dst_orthog_idx + ind_dst_idx + dst_offset_] = + src[src_orthog_idx + src_offset_]; + } +}; + +typedef sycl::event (*take_fn_ptr_t)(sycl::queue, + size_t, + size_t, + int, + int, + int, + const py::ssize_t *, + const py::ssize_t *, + const py::ssize_t *, + const char *, + char *, + char **, + py::ssize_t, + py::ssize_t, + const py::ssize_t *, + const std::vector &); + +template +sycl::event take_impl(sycl::queue q, + size_t orthog_nelems, + size_t ind_nelems, + int nd, + int ind_nd, + int k, + const py::ssize_t *orthog_shape_and_strides, + const py::ssize_t *axes_shape_and_strides, + const py::ssize_t *ind_shape_and_strides, + const char *src_p, + char *dst_p, + char **ind_p, + py::ssize_t src_offset, + py::ssize_t dst_offset, + const py::ssize_t *ind_offsets, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event take_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const size_t gws = orthog_nelems * ind_nelems; + + cgh.parallel_for>( + sycl::range<1>(gws), + TakeFunctor( + 
src_p, dst_p, ind_p, nd, ind_nd, k, ind_nelems, + orthog_shape_and_strides, axes_shape_and_strides, + ind_shape_and_strides, src_offset, dst_offset, ind_offsets)); + }); + + return take_ev; +} + +template class PutFunctor +{ +private: + char *dst_ = nullptr; + const char *val_ = nullptr; + char **ind_ = nullptr; + int nd_ = 0; + int ind_nd_ = 0; + int k_ = 0; + size_t ind_nelems_ = 0; + const py::ssize_t *orthog_shape_and_strides_ = nullptr; + const py::ssize_t *axes_shape_and_strides_ = nullptr; + const py::ssize_t *ind_shape_and_strides_ = nullptr; + py::ssize_t dst_offset_ = 0; + py::ssize_t val_offset_ = 0; + const py::ssize_t *ind_offsets_ = nullptr; + +public: + PutFunctor(char *dst_cp, + const char *val_cp, + char **ind_cp, + int nd, + int ind_nd, + int k, + size_t ind_nelems, + const py::ssize_t *orthog_shape_and_strides, + const py::ssize_t *axes_shape_and_strides, + const py::ssize_t *ind_shape_and_strides, + py::ssize_t dst_offset, + py::ssize_t val_offset, + const py::ssize_t *ind_offsets) + : dst_(dst_cp), val_(val_cp), ind_(ind_cp), nd_(nd), ind_nd_(ind_nd), + k_(k), ind_nelems_(ind_nelems), + orthog_shape_and_strides_(orthog_shape_and_strides), + axes_shape_and_strides_(axes_shape_and_strides), + ind_shape_and_strides_(ind_shape_and_strides), + dst_offset_(dst_offset), val_offset_(val_offset), + ind_offsets_(ind_offsets) + { + } + + void operator()(sycl::id<1> id) const + { + T *dst = reinterpret_cast(dst_); + const T *val = reinterpret_cast(val_); + + py::ssize_t i_orthog = id / ind_nelems_; + py::ssize_t i_along = id - (i_orthog * ind_nelems_); + + py::ssize_t dst_orthog_idx(0); + py::ssize_t val_orthog_idx(0); + CIndexer_vector indxr(nd_); + indxr.get_displacement( + static_cast(i_orthog), + orthog_shape_and_strides_, // common shape + orthog_shape_and_strides_ + nd_, // dst strides + orthog_shape_and_strides_ + 2 * nd_, // val strides + dst_orthog_idx, // modified by reference + val_orthog_idx); + + ProjectorT proj{}; + py::ssize_t ind_arr_idx(0); + CIndexer_vector ind_indxr(ind_nd_); + for (int axis_idx = 0; axis_idx < k_; ++axis_idx) { + ind_indxr.get_displacement( + static_cast(i_along), ind_shape_and_strides_, + ind_shape_and_strides_ + ((axis_idx + 1) * ind_nd_), + ind_arr_idx); + indT *ind_data = reinterpret_cast(ind_[axis_idx]); + indT i = ind_data[ind_arr_idx + ind_offsets_[axis_idx]]; + proj(axes_shape_and_strides_[axis_idx], i); + dst_orthog_idx += i * axes_shape_and_strides_[k_ + axis_idx]; + } + py::ssize_t ind_val_idx(0); + ind_indxr.get_displacement( + static_cast(i_along), ind_shape_and_strides_, + axes_shape_and_strides_ + (2 * k_), ind_val_idx); + + dst[dst_orthog_idx + dst_offset_] = + val[val_orthog_idx + ind_val_idx + val_offset_]; + } +}; + +typedef sycl::event (*put_fn_ptr_t)(sycl::queue, + size_t, + size_t, + int, + int, + int, + const py::ssize_t *, + const py::ssize_t *, + const py::ssize_t *, + char *, + const char *, + char **, + py::ssize_t, + py::ssize_t, + const py::ssize_t *, + const std::vector &); + +template +sycl::event put_impl(sycl::queue q, + size_t orthog_nelems, + size_t ind_nelems, + int nd, + int ind_nd, + int k, + const py::ssize_t *orthog_shape_and_strides, + const py::ssize_t *axes_shape_and_strides, + const py::ssize_t *ind_shape_and_strides, + char *dst_p, + const char *val_p, + char **ind_p, + py::ssize_t dst_offset, + py::ssize_t val_offset, + const py::ssize_t *ind_offsets, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event put_ev = q.submit([&](sycl::handler &cgh) { 
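+        // one work-item is launched per (orthogonal element, index element)
+        // pair; PutFunctor splits the flat id back into those two coordinates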
+ cgh.depends_on(depends); + + const size_t gws = orthog_nelems * ind_nelems; + + cgh.parallel_for>( + sycl::range<1>(gws), + PutFunctor( + dst_p, val_p, ind_p, nd, ind_nd, k, ind_nelems, + orthog_shape_and_strides, axes_shape_and_strides, + ind_shape_and_strides, dst_offset, val_offset, ind_offsets)); + }); + + return put_ev; +} + +template struct TakeWrapFactory +{ + fnT get() + { + if constexpr (std::is_integral::value && + !std::is_same::value) { + fnT fn = take_impl, T, indT>; + return fn; + } + else { + fnT fn = nullptr; + return fn; + } + } +}; + +template struct TakeClipFactory +{ + fnT get() + { + if constexpr (std::is_integral::value && + !std::is_same::value) { + fnT fn = take_impl, T, indT>; + return fn; + } + else { + fnT fn = nullptr; + return fn; + } + } +}; + +template struct PutWrapFactory +{ + fnT get() + { + if constexpr (std::is_integral::value && + !std::is_same::value) { + fnT fn = put_impl, T, indT>; + return fn; + } + else { + fnT fn = nullptr; + return fn; + } + } +}; + +template struct PutClipFactory +{ + fnT get() + { + if constexpr (std::is_integral::value && + !std::is_same::value) { + fnT fn = put_impl, T, indT>; + return fn; + } + else { + fnT fn = nullptr; + return fn; + } + } +}; + +} // namespace indexing +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/advanced_indexing.cpp b/dpctl/tensor/libtensor/source/advanced_indexing.cpp new file mode 100644 index 0000000000..5f043db7bc --- /dev/null +++ b/dpctl/tensor/libtensor/source/advanced_indexing.cpp @@ -0,0 +1,1142 @@ +//===-- take_kernel_impl.cpp - Implementation of take --*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines implementation functions of dpctl.tensor.take and +/// dpctl.tensor.put +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include "kernels/advanced_indexing.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#define INDEXING_MODES 2 +#define CLIP_MODE 0 +#define WRAP_MODE 1 + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace _ns = dpctl::tensor::detail; + +using dpctl::tensor::kernels::indexing::put_fn_ptr_t; +using dpctl::tensor::kernels::indexing::take_fn_ptr_t; + +static take_fn_ptr_t take_dispatch_table[INDEXING_MODES][_ns::num_types] + [_ns::num_types]; + +static put_fn_ptr_t put_dispatch_table[INDEXING_MODES][_ns::num_types] + [_ns::num_types]; + +namespace py = pybind11; + +using dpctl::tensor::c_contiguous_strides; +using dpctl::tensor::f_contiguous_strides; + +using dpctl::utils::keep_args_alive; + +std::vector _populate_packed_shapes_strides_for_indexing( + sycl::queue exec_q, + py::ssize_t *device_orthog_shapes_strides, + py::ssize_t *device_axes_shapes_strides, + const py::ssize_t *inp_shape, + const py::ssize_t *inp_strides, + bool is_inp_c_contig, + bool is_inp_f_contig, + const py::ssize_t *arr_shape, + const py::ssize_t *arr_strides, + bool is_arr_c_contig, + bool is_arr_f_contig, + int axis_start, + int k, + int ind_nd, + int inp_nd, + int arr_nd) +{ + + int orthog_sh_elems = (inp_nd > 1) ? inp_nd - k : 1; + int along_sh_elems = (ind_nd > 1) ? ind_nd : 1; + + using usm_host_allocatorT = + sycl::usm_allocator; + using shT = std::vector; + + usm_host_allocatorT allocator(exec_q); + std::shared_ptr packed_host_shapes_strides_shp = + std::make_shared(3 * orthog_sh_elems, allocator); + + std::shared_ptr packed_host_axes_shapes_strides_shp = + std::make_shared(2 * k + along_sh_elems, allocator); + + // can be made more efficient by checking if inp_nd > 1, then performing + // same treatment of orthog_sh_elems as for 0D (orthog will not exist) + if (inp_nd > 0) { + std::copy(inp_shape, inp_shape + axis_start, + packed_host_shapes_strides_shp->begin()); + std::copy(inp_shape + axis_start + k, inp_shape + inp_nd, + packed_host_shapes_strides_shp->begin() + axis_start); + std::copy(inp_shape + axis_start, inp_shape + axis_start + k, + packed_host_axes_shapes_strides_shp->begin()); + + // contract axes by using two copies + if (inp_strides == nullptr) { + if (is_inp_c_contig) { + const auto &inp_contig_strides = + c_contiguous_strides(inp_nd, inp_shape); + std::copy(inp_contig_strides.begin(), + inp_contig_strides.begin() + axis_start, + packed_host_shapes_strides_shp->begin() + + orthog_sh_elems); + std::copy(inp_contig_strides.begin() + axis_start + k, + inp_contig_strides.end(), + packed_host_shapes_strides_shp->begin() + + orthog_sh_elems + axis_start); + std::copy(inp_contig_strides.begin() + axis_start, + inp_contig_strides.begin() + axis_start + k, + packed_host_axes_shapes_strides_shp->begin() + k); + } + else if (is_inp_f_contig) { + const auto &inp_contig_strides = + f_contiguous_strides(inp_nd, inp_shape); + std::copy(inp_contig_strides.begin(), + inp_contig_strides.begin() + axis_start, + packed_host_shapes_strides_shp->begin() + + orthog_sh_elems); + std::copy(inp_contig_strides.begin() + axis_start + k, + inp_contig_strides.end(), + 
packed_host_shapes_strides_shp->begin() + + orthog_sh_elems + axis_start); + std::copy(inp_contig_strides.begin() + axis_start, + inp_contig_strides.begin() + axis_start + k, + packed_host_axes_shapes_strides_shp->begin() + k); + } + else { + sycl::free(device_orthog_shapes_strides, exec_q); + throw std::runtime_error("Invalid array encountered"); + } + } + else { + std::copy(inp_strides, inp_strides + axis_start, + packed_host_shapes_strides_shp->begin() + + orthog_sh_elems); + std::copy(inp_strides + axis_start + k, inp_strides + inp_nd, + packed_host_shapes_strides_shp->begin() + + orthog_sh_elems + axis_start); + std::copy(inp_strides + axis_start, inp_strides + axis_start + k, + packed_host_axes_shapes_strides_shp->begin() + k); + } + + if (arr_strides == nullptr) { + if (is_arr_c_contig) { + const auto &arr_contig_strides = + c_contiguous_strides(arr_nd, arr_shape); + std::copy(arr_contig_strides.begin(), + arr_contig_strides.begin() + axis_start, + packed_host_shapes_strides_shp->begin() + + 2 * orthog_sh_elems); + std::copy(arr_contig_strides.begin() + axis_start + ind_nd, + arr_contig_strides.end(), + packed_host_shapes_strides_shp->begin() + + 2 * orthog_sh_elems + axis_start); + std::copy(arr_contig_strides.begin() + axis_start, + arr_contig_strides.begin() + axis_start + ind_nd, + packed_host_axes_shapes_strides_shp->begin() + 2 * k); + } + else if (is_arr_f_contig) { + const auto &arr_contig_strides = + f_contiguous_strides(arr_nd, arr_shape); + std::copy(arr_contig_strides.begin(), + arr_contig_strides.begin() + axis_start, + packed_host_shapes_strides_shp->begin() + + 2 * orthog_sh_elems); + std::copy(arr_contig_strides.begin() + axis_start + ind_nd, + arr_contig_strides.end(), + packed_host_shapes_strides_shp->begin() + + 2 * orthog_sh_elems + axis_start); + std::copy(arr_contig_strides.begin() + axis_start, + arr_contig_strides.begin() + axis_start + ind_nd, + packed_host_axes_shapes_strides_shp->begin() + 2 * k); + } + else { + sycl::free(device_orthog_shapes_strides, exec_q); + throw std::runtime_error("Invalid array encountered"); + } + } + else { + std::copy(arr_strides, arr_strides + axis_start, + packed_host_shapes_strides_shp->begin() + + 2 * orthog_sh_elems); + std::copy(arr_strides + axis_start + ind_nd, arr_strides + inp_nd, + packed_host_shapes_strides_shp->begin() + + 2 * orthog_sh_elems + axis_start); + std::copy(arr_strides + axis_start, + arr_strides + axis_start + ind_nd, + packed_host_axes_shapes_strides_shp->begin() + 2 * k); + } + + // copy packed shapes and strides from host to devices + sycl::event device_orthog_shapes_strides_copy_ev = + exec_q.copy(packed_host_shapes_strides_shp->data(), + device_orthog_shapes_strides, + packed_host_shapes_strides_shp->size()); + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(device_orthog_shapes_strides_copy_ev); + cgh.host_task([packed_host_shapes_strides_shp] {}); + }); + + sycl::event device_axes_shapes_strides_copy_ev = + exec_q.copy( + packed_host_axes_shapes_strides_shp->data(), + device_axes_shapes_strides, + packed_host_axes_shapes_strides_shp->size()); + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(device_axes_shapes_strides_copy_ev); + cgh.host_task([packed_host_axes_shapes_strides_shp]() {}); + }); + + std::vector v = {device_orthog_shapes_strides_copy_ev, + device_axes_shapes_strides_copy_ev}; + return v; + } + else { + // no orthogonal dimensions + sycl::event device_orthog_shapes_strides_fill_ev = + exec_q.fill(device_orthog_shapes_strides, + py::ssize_t(0), 3); + + 
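+        // a 0-d input has no orthogonal subspace: its orthogonal
+        // shape/stride entries are zero-filled, and only the `arr` strides
+        // along the indexed axes still need to be packed below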
packed_host_axes_shapes_strides_shp->insert( + packed_host_axes_shapes_strides_shp->end(), py::ssize_t(0), 2); + if (arr_strides == nullptr) { + if (is_arr_c_contig) { + const auto &arr_contig_strides = + c_contiguous_strides(arr_nd, arr_shape); + std::copy(arr_contig_strides.begin() + axis_start, + arr_contig_strides.begin() + axis_start + ind_nd, + packed_host_axes_shapes_strides_shp->begin() + 2); + } + else if (is_arr_f_contig) { + const auto &arr_contig_strides = + f_contiguous_strides(arr_nd, arr_shape); + std::copy(arr_contig_strides.begin() + axis_start, + arr_contig_strides.begin() + axis_start + ind_nd, + packed_host_axes_shapes_strides_shp->begin() + 2); + } + else { + sycl::free(device_orthog_shapes_strides, exec_q); + throw std::runtime_error("Invalid array encountered"); + } + } + else { + std::copy(arr_strides + axis_start, + arr_strides + axis_start + ind_nd, + packed_host_axes_shapes_strides_shp->begin() + 2); + } + + sycl::event device_axes_shapes_strides_copy_ev = + exec_q.copy( + packed_host_axes_shapes_strides_shp->data(), + device_axes_shapes_strides, + packed_host_axes_shapes_strides_shp->size()); + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(device_axes_shapes_strides_copy_ev); + cgh.host_task([packed_host_axes_shapes_strides_shp]() {}); + }); + + std::vector v = {device_orthog_shapes_strides_fill_ev, + device_axes_shapes_strides_copy_ev}; + return v; + } +} + +std::pair +usm_ndarray_take(dpctl::tensor::usm_ndarray src, + std::vector ind, + dpctl::tensor::usm_ndarray dst, + int axis_start, + uint8_t mode, + sycl::queue exec_q, + const std::vector &depends = {}) +{ + int k = ind.size(); + + if (k == 0) { + // no indices to take from + return std::make_pair(sycl::event{}, sycl::event{}); + } + + if (axis_start < 0) { + throw py::value_error("Axis cannot be negative."); + } + + const dpctl::tensor::usm_ndarray ind_rep = ind[0]; + + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + int ind_nd = ind_rep.get_ndim(); + + auto sh_elems = (src_nd > 0) ? 
src_nd : 1; + + if (axis_start + k > sh_elems) { + throw py::value_error("Axes are out of range for array of dimension " + + std::to_string(src_nd)); + } + if (src_nd == 0) { + if (dst_nd != ind_nd) { + throw py::value_error( + "Destination is not of appropriate dimension for take kernel."); + } + } + else { + if (dst_nd != (src_nd - k + ind_nd)) { + throw py::value_error( + "Destination is not of appropriate dimension for take kernel."); + } + } + + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + + bool orthog_shapes_equal(true); + size_t orthog_nelems(1); + for (int i = 0; i < axis_start; ++i) { + orthog_nelems *= static_cast(src_shape[i]); + orthog_shapes_equal = + orthog_shapes_equal && (src_shape[i] == dst_shape[i]); + } + + for (int i = (axis_start + k), j = (axis_start + ind_nd); + (i < src_nd && j < dst_nd); ++i, ++j) + { + orthog_nelems *= static_cast(src_shape[i]); + orthog_shapes_equal = + orthog_shapes_equal && (src_shape[i] == dst_shape[j]); + } + + if (!orthog_shapes_equal) { + throw py::value_error( + "Axes of basic indices are not of matching shapes."); + } + + if (orthog_nelems == 0) { + return std::make_pair(sycl::event{}, sycl::event{}); + } + + char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + auto src_offsets = src.get_minmax_offsets(); + auto dst_offsets = dst.get_minmax_offsets(); + int src_elem_size = src.get_elemsize(); + int dst_elem_size = dst.get_elemsize(); + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + py::ssize_t src_offset = py::ssize_t(0); + py::ssize_t dst_offset = py::ssize_t(0); + + if (!dst.is_writable()) { + throw py::value_error("Output array is read-only."); + } + + bool memory_overlap = + ((dst_data - src_data > src_offsets.second * src_elem_size - + dst_offsets.first * dst_elem_size) && + (src_data - dst_data > dst_offsets.second * dst_elem_size - + src_offsets.first * src_elem_size)); + if (memory_overlap) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto array_types = dpctl::tensor::detail::usm_ndarray_types(); + int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::type_error("Array data types are not the same."); + } + + const py::ssize_t *ind_shape = ind_rep.get_shape_raw(); + + int ind_typenum = ind_rep.get_typenum(); + int ind_type_id = array_types.typenum_to_lookup_id(ind_typenum); + + size_t ind_nelems(1); + for (int i = 0; i < ind_nd; ++i) { + ind_nelems *= static_cast(ind_shape[i]); + + if (!(ind_shape[i] == dst_shape[axis_start + i])) { + throw py::value_error( + "Indices shape does not match shape of axis in destination."); + } + } + + auto ind_sh_elems = (ind_nd > 0) ? 
ind_nd : 1; + + char **packed_ind_ptrs = sycl::malloc_device(k, exec_q); + + if (packed_ind_ptrs == nullptr) { + throw std::runtime_error( + "Unable to allocate packed_ind_ptrs device memory"); + } + + // packed_ind_shapes_strides = [ind_shape, + // ind[0] strides, + // ..., + // ind[k] strides] + py::ssize_t *packed_ind_shapes_strides = + sycl::malloc_device((k + 1) * ind_sh_elems, exec_q); + + if (packed_ind_shapes_strides == nullptr) { + throw std::runtime_error( + "Unable to allocate packed_ind_shapes_strides device memory"); + } + + py::ssize_t *packed_ind_offsets = + sycl::malloc_device(k, exec_q); + + if (packed_ind_offsets == nullptr) { + throw std::runtime_error( + "Unable to allocate packed_ind_offsets device memory"); + } + + using usm_host_allocator_T = + sycl::usm_allocator; + using ptrT = std::vector; + + usm_host_allocator_T ptr_allocator(exec_q); + std::shared_ptr host_ind_ptrs_shp = + std::make_shared(k, ptr_allocator); + + using usm_host_allocatorT = + sycl::usm_allocator; + using shT = std::vector; + + usm_host_allocatorT ind_allocator(exec_q); + std::shared_ptr host_ind_shapes_strides_shp = + std::make_shared(ind_sh_elems * (k + 1), ind_allocator); + + // shape can be copied now (must be the same for every array) + if (ind_nd > 0) { + std::copy(ind_shape, ind_shape + ind_nd, + host_ind_shapes_strides_shp->begin()); + } + else { + // all strides are 0 for 0D array + host_ind_shapes_strides_shp->insert(host_ind_shapes_strides_shp->end(), + (k + 1), 0); + } + + std::shared_ptr host_ind_offsets_shp = + std::make_shared(k, ind_allocator); + + std::vector ind_ptrs; + ind_ptrs.reserve(k); + std::vector ind_offsets; + ind_offsets.reserve(k); + for (int i = 0; i < k; ++i) { + dpctl::tensor::usm_ndarray ind_ = ind[i]; + + if (!dpctl::utils::queues_are_compatible(exec_q, {ind_})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // ndim, type, and shape are checked against the first array + if (i > 0) { + if (!(ind_.get_ndim() == ind_nd)) { + throw py::value_error("Index dimensions are not the same"); + } + + if (!(ind_type_id == + array_types.typenum_to_lookup_id(ind_.get_typenum()))) { + throw py::type_error( + "Indices array data types are not all the same."); + } + + const py::ssize_t *ind_shape_ = ind_.get_shape_raw(); + for (int dim = 0; dim < ind_nd; ++dim) { + if (!(ind_shape[dim] == ind_shape_[dim])) { + throw py::value_error("Indices shapes are not all equal."); + } + } + } + + // check for overlap with destination + int ind_elem_size = ind_.get_elemsize(); + auto ind_mem_offsets = ind_.get_minmax_offsets(); + char *ind_data = ind_.get_data(); + bool ind_memory_overlap = + ((dst_data - ind_data > ind_mem_offsets.second * ind_elem_size - + dst_offsets.first * dst_elem_size) && + (ind_data - dst_data > dst_offsets.second * dst_elem_size - + ind_mem_offsets.first * ind_elem_size)); + + if (ind_memory_overlap) { + throw py::value_error( + "Arrays index overlapping segments of memory"); + } + + // strides are initialized to 0 for 0D indices, so skip here + if (ind_nd > 0) { + const py::ssize_t *ind_strides = ind_.get_strides_raw(); + if (ind_strides == nullptr) { + if (ind_.is_c_contiguous()) { + const auto &ind_contig_strides_ = + c_contiguous_strides(ind_nd, ind_shape); + std::copy(ind_contig_strides_.begin(), + ind_contig_strides_.end(), + host_ind_shapes_strides_shp->begin() + + (i + 1) * ind_nd); + } + else if (ind_.is_f_contiguous()) { + const auto &ind_contig_strides_ = + f_contiguous_strides(ind_nd, ind_shape); + 
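+                    // strides of the i-th index array are packed after the
+                    // shared index shape, at offset (i + 1) * ind_nd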
std::copy(ind_contig_strides_.begin(), + ind_contig_strides_.end(), + host_ind_shapes_strides_shp->begin() + + (i + 1) * ind_nd); + } + else { + throw std::runtime_error( + "Invalid ind array encountered in: take function"); + } + } + else { + std::copy(ind_strides, ind_strides + ind_nd, + host_ind_shapes_strides_shp->begin() + + (i + 1) * ind_nd); + } + } + + ind_ptrs.push_back(ind_data); + ind_offsets.push_back(py::ssize_t(0)); + } + + std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin()); + std::copy(ind_offsets.begin(), ind_offsets.end(), + host_ind_offsets_shp->begin()); + + sycl::event device_ind_ptrs_copy_ev = exec_q.copy( + host_ind_ptrs_shp->data(), packed_ind_ptrs, host_ind_ptrs_shp->size()); + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(device_ind_ptrs_copy_ev); + cgh.host_task([host_ind_ptrs_shp]() {}); + }); + + sycl::event device_ind_shapes_strides_copy_ev = exec_q.copy( + host_ind_shapes_strides_shp->data(), packed_ind_shapes_strides, + host_ind_shapes_strides_shp->size()); + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(device_ind_shapes_strides_copy_ev); + cgh.host_task([host_ind_shapes_strides_shp]() {}); + }); + + sycl::event device_ind_offsets_copy_ev = exec_q.copy( + host_ind_offsets_shp->data(), packed_ind_offsets, + host_ind_offsets_shp->size()); + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(device_ind_offsets_copy_ev); + cgh.host_task([host_ind_offsets_shp]() {}); + }); + + std::vector ind_pack_depends = { + device_ind_ptrs_copy_ev, device_ind_shapes_strides_copy_ev, + device_ind_offsets_copy_ev}; + + bool is_src_c_contig = src.is_c_contiguous(); + bool is_src_f_contig = src.is_f_contiguous(); + + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_dst_f_contig = dst.is_f_contiguous(); + + const py::ssize_t *src_strides = src.get_strides_raw(); + const py::ssize_t *dst_strides = dst.get_strides_raw(); + + // destination must be ample enough to accomodate all elements + { + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if ((range + 1) < (orthog_nelems * ind_nelems)) { + throw py::value_error( + "Destination array can not accomodate all the " + "elements of source array."); + } + } + + // packed_shapes_strides = [src_shape[:axis] + src_shape[:axis+1], + // src_strides[:axis] + src_strides[:axis+1], + // dst_strides[:axis] + dst_strides[:axis+1]] + py::ssize_t *packed_shapes_strides = + sycl::malloc_device(3 * sh_elems, exec_q); + + if (packed_shapes_strides == nullptr) { + throw std::runtime_error( + "Unable to allocate packed_shapes_strides device memory"); + } + + // packed_axes_shapes_strides = [src_shape[axis:k], + // src_strides[axis:k, + // dst_strides[axis:ind.ndim]] + py::ssize_t *packed_axes_shapes_strides = + sycl::malloc_device((2 * k) + ind_sh_elems, exec_q); + + if (packed_axes_shapes_strides == nullptr) { + throw std::runtime_error( + "Unable to allocate packed_axes_shapes_strides device memory"); + } + + std::vector src_dst_pack_deps = + _populate_packed_shapes_strides_for_indexing( + exec_q, packed_shapes_strides, packed_axes_shapes_strides, + src_shape, src_strides, is_src_c_contig, is_src_f_contig, dst_shape, + dst_strides, is_dst_c_contig, is_dst_f_contig, axis_start, k, + ind_nd, src_nd, dst_nd); + + std::vector all_deps(depends.size() + ind_pack_depends.size() + + src_dst_pack_deps.size()); + all_deps.insert(std::end(all_deps), std::begin(ind_pack_depends), + std::end(ind_pack_depends)); + all_deps.insert(std::end(all_deps), std::begin(src_dst_pack_deps), + 
std::end(src_dst_pack_deps)); + all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); + + auto fn = take_dispatch_table[mode][src_type_id][ind_type_id]; + + if (fn == nullptr) { + throw std::runtime_error("Indices must be integer type, got " + + std::to_string(ind_type_id)); + } + + int orthog_nd = ((src_nd - k) > 0) ? src_nd - k : 1; + + sycl::event take_generic_ev = + fn(exec_q, orthog_nelems, ind_nelems, orthog_nd, ind_nd, k, + packed_shapes_strides, packed_axes_shapes_strides, + packed_ind_shapes_strides, src_data, dst_data, packed_ind_ptrs, + src_offset, dst_offset, packed_ind_offsets, all_deps); + + // free packed_shapes_strides temporary + + auto ctx = exec_q.get_context(); + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(take_generic_ev); + cgh.host_task([packed_shapes_strides, ctx]() { + sycl::free(packed_shapes_strides, ctx); + }); + }); + + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(take_generic_ev); + cgh.host_task([packed_axes_shapes_strides, ctx]() { + sycl::free(packed_axes_shapes_strides, ctx); + }); + }); + + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(take_generic_ev); + cgh.host_task([packed_ind_shapes_strides, ctx]() { + sycl::free(packed_ind_shapes_strides, ctx); + }); + }); + + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(take_generic_ev); + cgh.host_task( + [packed_ind_ptrs, ctx]() { sycl::free(packed_ind_ptrs, ctx); }); + }); + + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(take_generic_ev); + cgh.host_task([packed_ind_offsets, ctx]() { + sycl::free(packed_ind_offsets, ctx); + }); + }); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {take_generic_ev}), + take_generic_ev); +} + +std::pair +usm_ndarray_put(dpctl::tensor::usm_ndarray dst, + std::vector ind, + dpctl::tensor::usm_ndarray val, + int axis_start, + uint8_t mode, + sycl::queue exec_q, + const std::vector &depends = {}) +{ + // check compatibility of execution queue and allocation queue + int k = ind.size(); + + if (k == 0) { + // no indices to write to + return std::make_pair(sycl::event{}, sycl::event{}); + } + + if (axis_start < 0) { + throw py::value_error("Axis cannot be negative."); + } + + const dpctl::tensor::usm_ndarray ind_rep = ind[0]; + + int dst_nd = dst.get_ndim(); + int val_nd = val.get_ndim(); + int ind_nd = ind_rep.get_ndim(); + + auto sh_elems = (dst_nd > 0) ? 
dst_nd : 1; + + if (axis_start + k > sh_elems) { + throw py::value_error("Axes are out of range for array of dimension " + + std::to_string(dst_nd)); + } + if (dst_nd == 0) { + if (val_nd != ind_nd) { + throw py::value_error("Destination is not of appropriate dimension " + "for put function."); + } + } + else { + if (val_nd != (dst_nd - k + ind_nd)) { + throw py::value_error("Destination is not of appropriate dimension " + "for put function."); + } + } + + size_t dst_nelems = dst.get_size(); + + const py::ssize_t *dst_shape = dst.get_shape_raw(); + const py::ssize_t *val_shape = val.get_shape_raw(); + + bool orthog_shapes_equal(true); + size_t orthog_nelems(1); + for (int i = 0; i < axis_start; ++i) { + orthog_nelems *= static_cast(dst_shape[i]); + orthog_shapes_equal = + orthog_shapes_equal && (dst_shape[i] == val_shape[i]); + } + + for (int i = (axis_start + k), j = (axis_start + ind_nd); + (i < dst_nd && j < val_nd); ++i, ++j) + { + orthog_nelems *= static_cast(dst_shape[i]); + orthog_shapes_equal = + orthog_shapes_equal && (dst_shape[i] == val_shape[j]); + } + + if (!orthog_shapes_equal) { + throw py::value_error( + "Axes of basic indices are not of matching shapes."); + } + + if (orthog_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + char *dst_data = dst.get_data(); + char *val_data = val.get_data(); + + auto dst_offsets = dst.get_minmax_offsets(); + auto val_offsets = val.get_minmax_offsets(); + int dst_elem_size = dst.get_elemsize(); + int val_elem_size = val.get_elemsize(); + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst, val})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + py::ssize_t dst_offset = py::ssize_t(0); + py::ssize_t val_offset = py::ssize_t(0); + + if (!dst.is_writable()) { + throw py::value_error("Output array is read-only."); + } + + bool memory_overlap = + ((val_data - dst_data > dst_offsets.second * dst_elem_size - + val_offsets.first * val_elem_size) && + (dst_data - val_data > val_offsets.second * val_elem_size - + dst_offsets.first * dst_elem_size)); + if (memory_overlap) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + int dst_typenum = dst.get_typenum(); + int val_typenum = val.get_typenum(); + + auto array_types = dpctl::tensor::detail::usm_ndarray_types(); + int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + int val_type_id = array_types.typenum_to_lookup_id(val_typenum); + + if (dst_type_id != val_type_id) { + throw py::type_error("Array data types are not the same."); + } + + const py::ssize_t *ind_shape = ind_rep.get_shape_raw(); + + int ind_typenum = ind_rep.get_typenum(); + int ind_type_id = array_types.typenum_to_lookup_id(ind_typenum); + + size_t ind_nelems(1); + for (int i = 0; i < ind_nd; ++i) { + ind_nelems *= static_cast(ind_shape[i]); + + if (!(ind_shape[i] == val_shape[axis_start + i])) { + throw py::value_error( + "Indices shapes does not match shape of axis in vals."); + } + } + + auto ind_sh_elems = (ind_nd > 0) ? 
ind_nd : 1; + + char **packed_ind_ptrs = sycl::malloc_device(k, exec_q); + + if (packed_ind_ptrs == nullptr) { + throw std::runtime_error( + "Unable to allocate packed_ind_ptrs device memory"); + } + + // packed_ind_shapes_strides = [ind_shape, + // ind[0] strides, + // ..., + // ind[k] strides] + py::ssize_t *packed_ind_shapes_strides = + sycl::malloc_device((k + 1) * ind_sh_elems, exec_q); + + if (packed_ind_shapes_strides == nullptr) { + throw std::runtime_error( + "Unable to allocate packed_ind_shapes_strides device memory"); + } + + py::ssize_t *packed_ind_offsets = + sycl::malloc_device(k, exec_q); + + if (packed_ind_offsets == nullptr) { + throw std::runtime_error( + "Unable to allocate packed_ind_offsets device memory"); + } + + using usm_host_allocator_T = + sycl::usm_allocator; + using ptrT = std::vector; + + usm_host_allocator_T ptr_allocator(exec_q); + std::shared_ptr host_ind_ptrs_shp = + std::make_shared(k, ptr_allocator); + + using usm_host_allocatorT = + sycl::usm_allocator; + using shT = std::vector; + + usm_host_allocatorT ind_allocator(exec_q); + std::shared_ptr host_ind_shapes_strides_shp = + std::make_shared(ind_sh_elems * (k + 1), ind_allocator); + + // shape can be copied now (must be the same for every array) + if (ind_nd > 0) { + std::copy(ind_shape, ind_shape + ind_nd, + host_ind_shapes_strides_shp->begin()); + } + else { + // all strides are 0 for 0D array + host_ind_shapes_strides_shp->insert(host_ind_shapes_strides_shp->end(), + (k + 1), 0); + } + + std::shared_ptr host_ind_offsets_shp = + std::make_shared(k, ind_allocator); + + std::vector ind_ptrs; + ind_ptrs.reserve(k); + std::vector ind_offsets; + ind_offsets.reserve(k); + for (int i = 0; i < k; ++i) { + dpctl::tensor::usm_ndarray ind_ = ind[i]; + + if (!dpctl::utils::queues_are_compatible(exec_q, {ind_})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // ndim, type, and shape are checked against the first array + if (i > 0) { + if (!(ind_.get_ndim() == ind_nd)) { + throw py::value_error("Index dimensions are not the same"); + } + + if (!(ind_type_id == + array_types.typenum_to_lookup_id(ind_.get_typenum()))) { + throw py::type_error( + "Indices array data types are not all the same."); + } + + const py::ssize_t *ind_shape_ = ind_.get_shape_raw(); + for (int dim = 0; dim < ind_nd; ++dim) { + if (!(ind_shape[dim] == ind_shape_[dim])) { + throw py::value_error("Indices shapes are not all equal."); + } + } + } + + // check for overlap with destination + int ind_elem_size = ind_.get_elemsize(); + auto ind_mem_offsets = ind_.get_minmax_offsets(); + char *ind_data = ind_.get_data(); + bool ind_memory_overlap = + ((val_data - ind_data > ind_mem_offsets.second * ind_elem_size - + val_offsets.first * val_elem_size) && + (ind_data - val_data > val_offsets.second * val_elem_size - + ind_mem_offsets.first * ind_elem_size)); + + if (ind_memory_overlap) { + throw py::value_error( + "Arrays index overlapping segments of memory"); + } + + // strides are initialized to 0 for 0D indices, so skip here + if (ind_nd > 0) { + const py::ssize_t *ind_strides = ind_.get_strides_raw(); + if (ind_strides == nullptr) { + if (ind_.is_c_contiguous()) { + const auto &ind_contig_strides_ = + c_contiguous_strides(ind_nd, ind_shape); + std::copy(ind_contig_strides_.begin(), + ind_contig_strides_.end(), + host_ind_shapes_strides_shp->begin() + + (i + 1) * ind_nd); + } + else if (ind_.is_f_contiguous()) { + const auto &ind_contig_strides_ = + f_contiguous_strides(ind_nd, ind_shape); + 
std::copy(ind_contig_strides_.begin(), + ind_contig_strides_.end(), + host_ind_shapes_strides_shp->begin() + + (i + 1) * ind_nd); + } + else { + throw std::runtime_error( + "Invalid ind array encountered in: take function"); + } + } + else { + std::copy(ind_strides, ind_strides + ind_nd, + host_ind_shapes_strides_shp->begin() + + (i + 1) * ind_nd); + } + } + + ind_ptrs.push_back(ind_data); + ind_offsets.push_back(py::ssize_t(0)); + } + + std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin()); + std::copy(ind_offsets.begin(), ind_offsets.end(), + host_ind_offsets_shp->begin()); + + sycl::event device_ind_ptrs_copy_ev = exec_q.copy( + host_ind_ptrs_shp->data(), packed_ind_ptrs, host_ind_ptrs_shp->size()); + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(device_ind_ptrs_copy_ev); + cgh.host_task([host_ind_ptrs_shp]() { + // Capturing shared pointer ensures that the underlying vector is + // not destroyed until after its data are copied into packed USM + // vector + }); + }); + + sycl::event device_ind_shapes_strides_copy_ev = exec_q.copy( + host_ind_shapes_strides_shp->data(), packed_ind_shapes_strides, + host_ind_shapes_strides_shp->size()); + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(device_ind_shapes_strides_copy_ev); + cgh.host_task([host_ind_shapes_strides_shp]() {}); + }); + + sycl::event device_ind_offsets_copy_ev = exec_q.copy( + host_ind_offsets_shp->data(), packed_ind_offsets, + host_ind_offsets_shp->size()); + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(device_ind_offsets_copy_ev); + cgh.host_task([host_ind_offsets_shp]() {}); + }); + + std::vector ind_pack_depends = { + device_ind_ptrs_copy_ev, device_ind_shapes_strides_copy_ev, + device_ind_offsets_copy_ev}; + + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_dst_f_contig = dst.is_f_contiguous(); + + bool is_val_c_contig = val.is_c_contiguous(); + bool is_val_f_contig = val.is_f_contiguous(); + + const py::ssize_t *dst_strides = dst.get_strides_raw(); + const py::ssize_t *val_strides = val.get_strides_raw(); + + // destination must be ample enough to accomodate all possible elements + { + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if ((range + 1) < dst_nelems) { + throw py::value_error( + "Destination array can not accomodate all the " + "elements of source array."); + } + } + + // packed_shapes_strides = [dst_shape[:axis] + dst_shape[:axis+1], + // dst_strides[:axis] + dst_strides[:axis+1], + // val_strides[:axis] + val_strides[:axis+1]] + py::ssize_t *packed_shapes_strides = + sycl::malloc_device(3 * sh_elems, exec_q); + + if (packed_shapes_strides == nullptr) { + throw std::runtime_error( + "Unable to allocate packed_shapes_strides device memory"); + } + + // packed_axes_shapes_strides = [dst_shape[axis:k], + // dst_strides[axis:k, + // val_strides[axis:ind.ndim]] + py::ssize_t *packed_axes_shapes_strides = + sycl::malloc_device((2 * k) + ind_sh_elems, exec_q); + + if (packed_axes_shapes_strides == nullptr) { + throw std::runtime_error( + "Unable to allocate packed_axes_shapes_strides device memory"); + } + + std::vector copy_shapes_strides_deps = + _populate_packed_shapes_strides_for_indexing( + exec_q, packed_shapes_strides, packed_axes_shapes_strides, + dst_shape, dst_strides, is_dst_c_contig, is_dst_f_contig, val_shape, + val_strides, is_val_c_contig, is_val_f_contig, axis_start, k, + ind_nd, dst_nd, val_nd); + + std::vector all_deps(depends.size() + + copy_shapes_strides_deps.size() + + ind_pack_depends.size()); + 
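The packed_shapes_strides layout assembled just above is easier to see with a tiny Python sketch (illustrative values only; just the shared orthogonal shape and the dst strides are shown, the val strides follow the same pattern):

    # dst of shape (5, 7, 3), one indexed axis (k = 1) starting at axis_start = 1
    dst_shape = (5, 7, 3)
    dst_strides = (21, 3, 1)          # C-contiguous, in elements
    axis_start, k = 1, 1

    orthog = [i for i in range(len(dst_shape))
              if not (axis_start <= i < axis_start + k)]   # axes 0 and 2
    packed = ([dst_shape[i] for i in orthog]        # shared orthogonal shape
              + [dst_strides[i] for i in orthog])   # dst strides over those axes
    assert packed == [5, 3, 21, 1]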
all_deps.insert(std::end(all_deps), std::begin(copy_shapes_strides_deps), + std::end(copy_shapes_strides_deps)); + all_deps.insert(std::end(all_deps), std::begin(ind_pack_depends), + std::end(ind_pack_depends)); + all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); + + auto fn = put_dispatch_table[mode][dst_type_id][ind_type_id]; + + if (fn == nullptr) { + throw std::runtime_error("Indices must be integer type, got " + + std::to_string(ind_type_id)); + } + + int orthog_nd = ((dst_nd - k) > 0) ? dst_nd - k : 1; + + sycl::event put_generic_ev = + fn(exec_q, orthog_nelems, ind_nelems, orthog_nd, ind_nd, k, + packed_shapes_strides, packed_axes_shapes_strides, + packed_ind_shapes_strides, dst_data, val_data, packed_ind_ptrs, + dst_offset, val_offset, packed_ind_offsets, all_deps); + + // free packed_shapes_strides temporary + + auto ctx = exec_q.get_context(); + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(put_generic_ev); + cgh.host_task([packed_shapes_strides, ctx]() { + sycl::free(packed_shapes_strides, ctx); + }); + }); + + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(put_generic_ev); + cgh.host_task([packed_axes_shapes_strides, ctx]() { + sycl::free(packed_axes_shapes_strides, ctx); + }); + }); + + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(put_generic_ev); + cgh.host_task([packed_ind_shapes_strides, ctx]() { + sycl::free(packed_ind_shapes_strides, ctx); + }); + }); + + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(put_generic_ev); + cgh.host_task( + [packed_ind_ptrs, ctx]() { sycl::free(packed_ind_ptrs, ctx); }); + }); + + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(put_generic_ev); + cgh.host_task([packed_ind_offsets, ctx]() { + sycl::free(packed_ind_offsets, ctx); + }); + }); + + return std::make_pair(keep_args_alive(exec_q, {dst, val}, {put_generic_ev}), + put_generic_ev); +} + +void init_advanced_indexing_dispatch_tables(void) +{ + using namespace dpctl::tensor::detail; + + using dpctl::tensor::kernels::indexing::TakeClipFactory; + DispatchTableBuilder + dtb_takeclip; + dtb_takeclip.populate_dispatch_table(take_dispatch_table[CLIP_MODE]); + + using dpctl::tensor::kernels::indexing::TakeWrapFactory; + DispatchTableBuilder + dtb_takewrap; + dtb_takewrap.populate_dispatch_table(take_dispatch_table[WRAP_MODE]); + + using dpctl::tensor::kernels::indexing::PutClipFactory; + DispatchTableBuilder dtb_putclip; + dtb_putclip.populate_dispatch_table(put_dispatch_table[CLIP_MODE]); + + using dpctl::tensor::kernels::indexing::PutWrapFactory; + DispatchTableBuilder dtb_putwrap; + dtb_putwrap.populate_dispatch_table(put_dispatch_table[WRAP_MODE]); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/advanced_indexing.hpp b/dpctl/tensor/libtensor/source/advanced_indexing.hpp new file mode 100644 index 0000000000..d99d4f1828 --- /dev/null +++ b/dpctl/tensor/libtensor/source/advanced_indexing.hpp @@ -0,0 +1,62 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair +usm_ndarray_take(dpctl::tensor::usm_ndarray src, + std::vector ind, + dpctl::tensor::usm_ndarray dst, + int axis_start, + uint8_t mode, + sycl::queue exec_q, + const std::vector &depends = {}); + +extern std::pair +usm_ndarray_put(dpctl::tensor::usm_ndarray dst, + std::vector ind, + dpctl::tensor::usm_ndarray val, + int axis_start, + uint8_t mode, + sycl::queue exec_q, + const std::vector &depends = {}); + +extern void init_advanced_indexing_dispatch_tables(); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp index aa8634ecf4..94458bccf9 100644 --- a/dpctl/tensor/libtensor/source/tensor_py.cpp +++ b/dpctl/tensor/libtensor/source/tensor_py.cpp @@ -33,6 +33,7 @@ #include "dpctl4pybind11.hpp" +#include "advanced_indexing.hpp" #include "copy_and_cast_usm_to_usm.hpp" #include "copy_for_reshape.hpp" #include "copy_numpy_ndarray_into_usm_ndarray.hpp" @@ -70,6 +71,10 @@ using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step; using dpctl::tensor::py_internal::usm_ndarray_full; +/* ============== Advanced Indexing ============= */ +using dpctl::tensor::py_internal::usm_ndarray_put; +using dpctl::tensor::py_internal::usm_ndarray_take; + /* ================ Eye ================== */ using dpctl::tensor::py_internal::usm_ndarray_eye; @@ -85,6 +90,7 @@ void init_dispatch_tables(void) init_copy_and_cast_usm_to_usm_dispatch_tables(); init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(); + init_advanced_indexing_dispatch_tables(); return; } @@ -179,6 +185,24 @@ PYBIND11_MODULE(_tensor_impl, m) py::arg("fill_value"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_take", &usm_ndarray_take, + "Takes elements at usm_ndarray indices `ind` and axes starting " + "at axis `axis_start` from array `src` and copies them " + "into usm_ndarray `dst` synchronously." + "Returns a tuple of events: (hev, ev)", + py::arg("src"), py::arg("ind"), py::arg("dst"), py::arg("axis_start"), + py::arg("mode"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + m.def("_put", &usm_ndarray_put, + "Puts elements at usm_ndarray indices `ind` and axes starting " + "at axis `axis_start` into array `dst` from " + "usm_ndarray `val` synchronously." 
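For orientation, here is a minimal Python sketch (not part of the patch) of how these bindings are meant to be driven. Keyword names follow the py::arg declarations; the shapes, dtypes and the mode value are illustrative assumptions, and the dpt.take/dpt.put wrappers exercised by the tests later in this series are the intended user-facing entry points:

    import dpctl
    import dpctl.tensor as dpt
    import dpctl.tensor._tensor_impl as ti

    q = dpctl.SyclQueue()
    x = dpt.reshape(dpt.arange(12, dtype="i4", sycl_queue=q), (3, 4))
    ind = dpt.asarray([0, 2], dtype="i8", sycl_queue=q)
    # dst must be preallocated: the indexed axis 0 is replaced by ind.shape
    y = dpt.empty((2, 4), dtype="i4", sycl_queue=q)

    # mode is 0 or 1 and selects clip/wrap handling of out-of-range indices;
    # the mapping to CLIP_MODE/WRAP_MODE is defined elsewhere in the module
    hev, ev = ti._take(src=x, ind=[ind], dst=y, axis_start=0, mode=0, sycl_queue=q)
    hev.wait()   # host-task event keeping the arguments alive until the kernel is done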
+ "Returns a tuple of events: (hev, ev)", + py::arg("dst"), py::arg("ind"), py::arg("val"), py::arg("axis_start"), + py::arg("mode"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_eye", &usm_ndarray_eye, "Fills input 2D contiguous usm_ndarray `dst` with " "zeros outside of the diagonal " From 6239eb7d631713051211c4870883e9ff219d965d Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Fri, 24 Feb 2023 09:52:36 -0800 Subject: [PATCH 15/57] Changes to advanced indexing - Clipping now clips indices to -n <= i < n for n = axis size - Fixed a segfault caused by a typo when copying strides --- dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp | 4 +++- dpctl/tensor/libtensor/source/advanced_indexing.cpp | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp index 77234296ff..8ccad8db28 100644 --- a/dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp @@ -54,7 +54,9 @@ template class ClipIndex { max_item = (max_item > 0) ? max_item : 1; py::ssize_t clip_ind = static_cast(ind); - ind = (ind < 0) ? 0 : (clip_ind >= max_item) ? (max_item - 1) : ind; + ind = (ind < 0) ? (clip_ind <= -max_item) ? (0) : (clip_ind + max_item) + : (clip_ind >= max_item) ? (max_item - 1) + : ind; return; } }; diff --git a/dpctl/tensor/libtensor/source/advanced_indexing.cpp b/dpctl/tensor/libtensor/source/advanced_indexing.cpp index 5f043db7bc..3dc6f47904 100644 --- a/dpctl/tensor/libtensor/source/advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/advanced_indexing.cpp @@ -197,7 +197,7 @@ std::vector _populate_packed_shapes_strides_for_indexing( std::copy(arr_strides, arr_strides + axis_start, packed_host_shapes_strides_shp->begin() + 2 * orthog_sh_elems); - std::copy(arr_strides + axis_start + ind_nd, arr_strides + inp_nd, + std::copy(arr_strides + axis_start + ind_nd, arr_strides + arr_nd, packed_host_shapes_strides_shp->begin() + 2 * orthog_sh_elems + axis_start); std::copy(arr_strides + axis_start, From a0895be431e0827263955ea3e24b724eec4145c0 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Fri, 24 Feb 2023 12:33:57 -0800 Subject: [PATCH 16/57] Changes to advanced_indexing.cpp - Moved indices validation to avoid memory leaks - Refactored for loop over orthogonal elements of shapes - Direct initialization of sycl::event vectors --- .../libtensor/source/advanced_indexing.cpp | 434 ++++++++---------- 1 file changed, 183 insertions(+), 251 deletions(-) diff --git a/dpctl/tensor/libtensor/source/advanced_indexing.cpp b/dpctl/tensor/libtensor/source/advanced_indexing.cpp index 3dc6f47904..fed5d543ed 100644 --- a/dpctl/tensor/libtensor/source/advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/advanced_indexing.cpp @@ -85,7 +85,7 @@ std::vector _populate_packed_shapes_strides_for_indexing( int arr_nd) { - int orthog_sh_elems = (inp_nd > 1) ? inp_nd - k : 1; + int orthog_sh_elems = ((inp_nd - k) > 1) ? (inp_nd - k) : 1; int along_sh_elems = (ind_nd > 1) ? 
ind_nd : 1; using usm_host_allocatorT = @@ -291,14 +291,17 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, int k = ind.size(); if (k == 0) { - // no indices to take from - return std::make_pair(sycl::event{}, sycl::event{}); + throw py::value_error("List of indices is empty."); } if (axis_start < 0) { throw py::value_error("Axis cannot be negative."); } + if (mode != 0 && mode != 1) { + throw py::value_error("Mode must be 0 or 1."); + } + const dpctl::tensor::usm_ndarray ind_rep = ind[0]; int src_nd = src.get_ndim(); @@ -327,20 +330,17 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, const py::ssize_t *src_shape = src.get_shape_raw(); const py::ssize_t *dst_shape = dst.get_shape_raw(); + int orthog_nd = ((src_nd - k) > 0) ? src_nd - k : 1; + bool orthog_shapes_equal(true); size_t orthog_nelems(1); - for (int i = 0; i < axis_start; ++i) { - orthog_nelems *= static_cast(src_shape[i]); - orthog_shapes_equal = - orthog_shapes_equal && (src_shape[i] == dst_shape[i]); - } + for (int i = 0; i < (src_nd - k); ++i) { + auto idx1 = (i < axis_start) ? i : i + k; + auto idx2 = (i < axis_start) ? i : i + ind_nd; - for (int i = (axis_start + k), j = (axis_start + ind_nd); - (i < src_nd && j < dst_nd); ++i, ++j) - { - orthog_nelems *= static_cast(src_shape[i]); + orthog_nelems *= static_cast(src_shape[idx1]); orthog_shapes_equal = - orthog_shapes_equal && (src_shape[i] == dst_shape[j]); + orthog_shapes_equal && (src_shape[idx1] == dst_shape[idx2]); } if (!orthog_shapes_equal) { @@ -355,29 +355,26 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, char *src_data = src.get_data(); char *dst_data = dst.get_data(); + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + auto src_offsets = src.get_minmax_offsets(); auto dst_offsets = dst.get_minmax_offsets(); int src_elem_size = src.get_elemsize(); int dst_elem_size = dst.get_elemsize(); - if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { - throw py::value_error( - "Execution queue is not compatible with allocation queues"); - } py::ssize_t src_offset = py::ssize_t(0); py::ssize_t dst_offset = py::ssize_t(0); - if (!dst.is_writable()) { - throw py::value_error("Output array is read-only."); - } - bool memory_overlap = ((dst_data - src_data > src_offsets.second * src_elem_size - dst_offsets.first * dst_elem_size) && (src_data - dst_data > dst_offsets.second * dst_elem_size - src_offsets.first * src_elem_size)); if (memory_overlap) { - throw py::value_error("Arrays index overlapping segments of memory"); + throw py::value_error("Array memory overlap."); } int src_typenum = src.get_typenum(); @@ -408,67 +405,16 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, auto ind_sh_elems = (ind_nd > 0) ? 
ind_nd : 1; - char **packed_ind_ptrs = sycl::malloc_device(k, exec_q); - - if (packed_ind_ptrs == nullptr) { - throw std::runtime_error( - "Unable to allocate packed_ind_ptrs device memory"); - } - - // packed_ind_shapes_strides = [ind_shape, - // ind[0] strides, - // ..., - // ind[k] strides] - py::ssize_t *packed_ind_shapes_strides = - sycl::malloc_device((k + 1) * ind_sh_elems, exec_q); - - if (packed_ind_shapes_strides == nullptr) { - throw std::runtime_error( - "Unable to allocate packed_ind_shapes_strides device memory"); - } - - py::ssize_t *packed_ind_offsets = - sycl::malloc_device(k, exec_q); - - if (packed_ind_offsets == nullptr) { - throw std::runtime_error( - "Unable to allocate packed_ind_offsets device memory"); - } - - using usm_host_allocator_T = - sycl::usm_allocator; - using ptrT = std::vector; - - usm_host_allocator_T ptr_allocator(exec_q); - std::shared_ptr host_ind_ptrs_shp = - std::make_shared(k, ptr_allocator); - - using usm_host_allocatorT = - sycl::usm_allocator; - using shT = std::vector; - - usm_host_allocatorT ind_allocator(exec_q); - std::shared_ptr host_ind_shapes_strides_shp = - std::make_shared(ind_sh_elems * (k + 1), ind_allocator); - - // shape can be copied now (must be the same for every array) - if (ind_nd > 0) { - std::copy(ind_shape, ind_shape + ind_nd, - host_ind_shapes_strides_shp->begin()); - } - else { - // all strides are 0 for 0D array - host_ind_shapes_strides_shp->insert(host_ind_shapes_strides_shp->end(), - (k + 1), 0); - } - - std::shared_ptr host_ind_offsets_shp = - std::make_shared(k, ind_allocator); - std::vector ind_ptrs; ind_ptrs.reserve(k); + std::vector ind_offsets; ind_offsets.reserve(k); + + std::vector ind_sh_sts((k + 1) * ind_sh_elems, py::ssize_t(0)); + if (ind_nd > 0) { + std::copy(ind_shape, ind_shape + ind_sh_elems, ind_sh_sts.begin()); + } for (int i = 0; i < k; ++i) { dpctl::tensor::usm_ndarray ind_ = ind[i]; @@ -521,16 +467,14 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, c_contiguous_strides(ind_nd, ind_shape); std::copy(ind_contig_strides_.begin(), ind_contig_strides_.end(), - host_ind_shapes_strides_shp->begin() + - (i + 1) * ind_nd); + ind_sh_sts.begin() + (i + 1) * ind_nd); } else if (ind_.is_f_contiguous()) { const auto &ind_contig_strides_ = f_contiguous_strides(ind_nd, ind_shape); std::copy(ind_contig_strides_.begin(), ind_contig_strides_.end(), - host_ind_shapes_strides_shp->begin() + - (i + 1) * ind_nd); + ind_sh_sts.begin() + (i + 1) * ind_nd); } else { throw std::runtime_error( @@ -539,8 +483,7 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, } else { std::copy(ind_strides, ind_strides + ind_nd, - host_ind_shapes_strides_shp->begin() + - (i + 1) * ind_nd); + ind_sh_sts.begin() + (i + 1) * ind_nd); } } @@ -548,36 +491,85 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, ind_offsets.push_back(py::ssize_t(0)); } + char **packed_ind_ptrs = sycl::malloc_device(k, exec_q); + + if (packed_ind_ptrs == nullptr) { + throw std::runtime_error( + "Unable to allocate packed_ind_ptrs device memory"); + } + + // rearrange to past where indices shapes are checked + // packed_ind_shapes_strides = [ind_shape, + // ind[0] strides, + // ..., + // ind[k] strides] + py::ssize_t *packed_ind_shapes_strides = + sycl::malloc_device((k + 1) * ind_sh_elems, exec_q); + + if (packed_ind_shapes_strides == nullptr) { + throw std::runtime_error( + "Unable to allocate packed_ind_shapes_strides device memory"); + } + + py::ssize_t *packed_ind_offsets = + sycl::malloc_device(k, exec_q); + + if (packed_ind_offsets == nullptr) { + throw 
std::runtime_error( + "Unable to allocate packed_ind_offsets device memory"); + } + + using usm_host_allocator_T = + sycl::usm_allocator; + using ptrT = std::vector; + + usm_host_allocator_T ptr_allocator(exec_q); + std::shared_ptr host_ind_ptrs_shp = + std::make_shared(k, ptr_allocator); + + using usm_host_allocatorT = + sycl::usm_allocator; + using shT = std::vector; + + usm_host_allocatorT ind_allocator(exec_q); + std::shared_ptr host_ind_shapes_strides_shp = + std::make_shared(ind_sh_elems * (k + 1), ind_allocator); + + std::shared_ptr host_ind_offsets_shp = + std::make_shared(k, ind_allocator); + + std::copy(ind_sh_sts.begin(), ind_sh_sts.end(), + host_ind_shapes_strides_shp->begin()); std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin()); std::copy(ind_offsets.begin(), ind_offsets.end(), host_ind_offsets_shp->begin()); - sycl::event device_ind_ptrs_copy_ev = exec_q.copy( + sycl::event packed_ind_ptrs_copy_ev = exec_q.copy( host_ind_ptrs_shp->data(), packed_ind_ptrs, host_ind_ptrs_shp->size()); exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(device_ind_ptrs_copy_ev); + cgh.depends_on(packed_ind_ptrs_copy_ev); cgh.host_task([host_ind_ptrs_shp]() {}); }); - sycl::event device_ind_shapes_strides_copy_ev = exec_q.copy( + sycl::event packed_ind_shapes_strides_copy_ev = exec_q.copy( host_ind_shapes_strides_shp->data(), packed_ind_shapes_strides, host_ind_shapes_strides_shp->size()); exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(device_ind_shapes_strides_copy_ev); + cgh.depends_on(packed_ind_shapes_strides_copy_ev); cgh.host_task([host_ind_shapes_strides_shp]() {}); }); - sycl::event device_ind_offsets_copy_ev = exec_q.copy( + sycl::event packed_ind_offsets_copy_ev = exec_q.copy( host_ind_offsets_shp->data(), packed_ind_offsets, host_ind_offsets_shp->size()); exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(device_ind_offsets_copy_ev); + cgh.depends_on(packed_ind_offsets_copy_ev); cgh.host_task([host_ind_offsets_shp]() {}); }); - std::vector ind_pack_depends = { - device_ind_ptrs_copy_ev, device_ind_shapes_strides_copy_ev, - device_ind_offsets_copy_ev}; + std::vector ind_pack_depends{packed_ind_ptrs_copy_ev, + packed_ind_shapes_strides_copy_ev, + packed_ind_offsets_copy_ev}; bool is_src_c_contig = src.is_c_contiguous(); bool is_src_f_contig = src.is_f_contiguous(); @@ -588,20 +580,20 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, const py::ssize_t *src_strides = src.get_strides_raw(); const py::ssize_t *dst_strides = dst.get_strides_raw(); - // destination must be ample enough to accomodate all elements + // destination must be ample enough to accommodate all elements { size_t range = static_cast(dst_offsets.second - dst_offsets.first); if ((range + 1) < (orthog_nelems * ind_nelems)) { throw py::value_error( - "Destination array can not accomodate all the " + "Destination array can not accommodate all the " "elements of source array."); } } - // packed_shapes_strides = [src_shape[:axis] + src_shape[:axis+1], - // src_strides[:axis] + src_strides[:axis+1], - // dst_strides[:axis] + dst_strides[:axis+1]] + // packed_shapes_strides = [src_shape[:axis] + src_shape[axis+k:], + // src_strides[:axis] + src_strides[axis+k:], + // dst_strides[:axis] + dst_strides[axis+k:]] py::ssize_t *packed_shapes_strides = sycl::malloc_device(3 * sh_elems, exec_q); @@ -610,8 +602,8 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, "Unable to allocate packed_shapes_strides device memory"); } - // packed_axes_shapes_strides = [src_shape[axis:k], - // 
src_strides[axis:k, + // packed_axes_shapes_strides = [src_shape[axis:axis+k], + // src_strides[axis:axis+k, // dst_strides[axis:ind.ndim]] py::ssize_t *packed_axes_shapes_strides = sycl::malloc_device((2 * k) + ind_sh_elems, exec_q); @@ -643,47 +635,23 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, std::to_string(ind_type_id)); } - int orthog_nd = ((src_nd - k) > 0) ? src_nd - k : 1; - sycl::event take_generic_ev = fn(exec_q, orthog_nelems, ind_nelems, orthog_nd, ind_nd, k, packed_shapes_strides, packed_axes_shapes_strides, packed_ind_shapes_strides, src_data, dst_data, packed_ind_ptrs, src_offset, dst_offset, packed_ind_offsets, all_deps); - // free packed_shapes_strides temporary - + // free packed temporaries auto ctx = exec_q.get_context(); exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(take_generic_ev); - cgh.host_task([packed_shapes_strides, ctx]() { + cgh.host_task([packed_shapes_strides, packed_axes_shapes_strides, + packed_ind_shapes_strides, packed_ind_ptrs, + packed_ind_offsets, ctx]() { sycl::free(packed_shapes_strides, ctx); - }); - }); - - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(take_generic_ev); - cgh.host_task([packed_axes_shapes_strides, ctx]() { sycl::free(packed_axes_shapes_strides, ctx); - }); - }); - - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(take_generic_ev); - cgh.host_task([packed_ind_shapes_strides, ctx]() { sycl::free(packed_ind_shapes_strides, ctx); - }); - }); - - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(take_generic_ev); - cgh.host_task( - [packed_ind_ptrs, ctx]() { sycl::free(packed_ind_ptrs, ctx); }); - }); - - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(take_generic_ev); - cgh.host_task([packed_ind_offsets, ctx]() { + sycl::free(packed_ind_ptrs, ctx); sycl::free(packed_ind_offsets, ctx); }); }); @@ -702,18 +670,25 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, sycl::queue exec_q, const std::vector &depends = {}) { - // check compatibility of execution queue and allocation queue int k = ind.size(); if (k == 0) { // no indices to write to - return std::make_pair(sycl::event{}, sycl::event{}); + throw py::value_error("List of indices is empty."); } if (axis_start < 0) { throw py::value_error("Axis cannot be negative."); } + if (mode != 0 && mode != 1) { + throw py::value_error("Mode must be 0 or 1."); + } + + if (!dst.is_writable()) { + throw py::value_error("Output array is read-only."); + } + const dpctl::tensor::usm_ndarray ind_rep = ind[0]; int dst_nd = dst.get_ndim(); @@ -744,20 +719,17 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, const py::ssize_t *dst_shape = dst.get_shape_raw(); const py::ssize_t *val_shape = val.get_shape_raw(); + int orthog_nd = ((dst_nd - k) > 0) ? dst_nd - k : 1; + bool orthog_shapes_equal(true); size_t orthog_nelems(1); - for (int i = 0; i < axis_start; ++i) { - orthog_nelems *= static_cast(dst_shape[i]); - orthog_shapes_equal = - orthog_shapes_equal && (dst_shape[i] == val_shape[i]); - } + for (int i = 0; i < (dst_nd - k); ++i) { + auto idx1 = (i < axis_start) ? i : i + k; + auto idx2 = (i < axis_start) ? 
i : i + ind_nd; - for (int i = (axis_start + k), j = (axis_start + ind_nd); - (i < dst_nd && j < val_nd); ++i, ++j) - { - orthog_nelems *= static_cast(dst_shape[i]); + orthog_nelems *= static_cast(dst_shape[idx1]); orthog_shapes_equal = - orthog_shapes_equal && (dst_shape[i] == val_shape[j]); + orthog_shapes_equal && (dst_shape[idx1] == val_shape[idx2]); } if (!orthog_shapes_equal) { @@ -784,10 +756,6 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, py::ssize_t dst_offset = py::ssize_t(0); py::ssize_t val_offset = py::ssize_t(0); - if (!dst.is_writable()) { - throw py::value_error("Output array is read-only."); - } - bool memory_overlap = ((val_data - dst_data > dst_offsets.second * dst_elem_size - val_offsets.first * val_elem_size) && @@ -825,67 +793,14 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, auto ind_sh_elems = (ind_nd > 0) ? ind_nd : 1; - char **packed_ind_ptrs = sycl::malloc_device(k, exec_q); - - if (packed_ind_ptrs == nullptr) { - throw std::runtime_error( - "Unable to allocate packed_ind_ptrs device memory"); - } - - // packed_ind_shapes_strides = [ind_shape, - // ind[0] strides, - // ..., - // ind[k] strides] - py::ssize_t *packed_ind_shapes_strides = - sycl::malloc_device((k + 1) * ind_sh_elems, exec_q); - - if (packed_ind_shapes_strides == nullptr) { - throw std::runtime_error( - "Unable to allocate packed_ind_shapes_strides device memory"); - } - - py::ssize_t *packed_ind_offsets = - sycl::malloc_device(k, exec_q); - - if (packed_ind_offsets == nullptr) { - throw std::runtime_error( - "Unable to allocate packed_ind_offsets device memory"); - } - - using usm_host_allocator_T = - sycl::usm_allocator; - using ptrT = std::vector; - - usm_host_allocator_T ptr_allocator(exec_q); - std::shared_ptr host_ind_ptrs_shp = - std::make_shared(k, ptr_allocator); - - using usm_host_allocatorT = - sycl::usm_allocator; - using shT = std::vector; - - usm_host_allocatorT ind_allocator(exec_q); - std::shared_ptr host_ind_shapes_strides_shp = - std::make_shared(ind_sh_elems * (k + 1), ind_allocator); - - // shape can be copied now (must be the same for every array) - if (ind_nd > 0) { - std::copy(ind_shape, ind_shape + ind_nd, - host_ind_shapes_strides_shp->begin()); - } - else { - // all strides are 0 for 0D array - host_ind_shapes_strides_shp->insert(host_ind_shapes_strides_shp->end(), - (k + 1), 0); - } - - std::shared_ptr host_ind_offsets_shp = - std::make_shared(k, ind_allocator); - std::vector ind_ptrs; ind_ptrs.reserve(k); std::vector ind_offsets; ind_offsets.reserve(k); + std::vector ind_sh_sts((k + 1) * ind_sh_elems, py::ssize_t(0)); + if (ind_nd > 0) { + std::copy(ind_shape, ind_shape + ind_sh_elems, ind_sh_sts.begin()); + } for (int i = 0; i < k; ++i) { dpctl::tensor::usm_ndarray ind_ = ind[i]; @@ -938,16 +853,14 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, c_contiguous_strides(ind_nd, ind_shape); std::copy(ind_contig_strides_.begin(), ind_contig_strides_.end(), - host_ind_shapes_strides_shp->begin() + - (i + 1) * ind_nd); + ind_sh_sts.begin() + (i + 1) * ind_nd); } else if (ind_.is_f_contiguous()) { const auto &ind_contig_strides_ = f_contiguous_strides(ind_nd, ind_shape); std::copy(ind_contig_strides_.begin(), ind_contig_strides_.end(), - host_ind_shapes_strides_shp->begin() + - (i + 1) * ind_nd); + ind_sh_sts.begin() + (i + 1) * ind_nd); } else { throw std::runtime_error( @@ -956,8 +869,7 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, } else { std::copy(ind_strides, ind_strides + ind_nd, - host_ind_shapes_strides_shp->begin() + - (i + 1) * ind_nd); + 
ind_sh_sts.begin() + (i + 1) * ind_nd); } } @@ -965,6 +877,54 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, ind_offsets.push_back(py::ssize_t(0)); } + char **packed_ind_ptrs = sycl::malloc_device(k, exec_q); + + if (packed_ind_ptrs == nullptr) { + throw std::runtime_error( + "Unable to allocate packed_ind_ptrs device memory"); + } + + // packed_ind_shapes_strides = [ind_shape, + // ind[0] strides, + // ..., + // ind[k] strides] + py::ssize_t *packed_ind_shapes_strides = + sycl::malloc_device((k + 1) * ind_sh_elems, exec_q); + + if (packed_ind_shapes_strides == nullptr) { + throw std::runtime_error( + "Unable to allocate packed_ind_shapes_strides device memory"); + } + + py::ssize_t *packed_ind_offsets = + sycl::malloc_device(k, exec_q); + + if (packed_ind_offsets == nullptr) { + throw std::runtime_error( + "Unable to allocate packed_ind_offsets device memory"); + } + + using usm_host_allocator_T = + sycl::usm_allocator; + using ptrT = std::vector; + + usm_host_allocator_T ptr_allocator(exec_q); + std::shared_ptr host_ind_ptrs_shp = + std::make_shared(k, ptr_allocator); + + using usm_host_allocatorT = + sycl::usm_allocator; + using shT = std::vector; + + usm_host_allocatorT ind_allocator(exec_q); + std::shared_ptr host_ind_shapes_strides_shp = + std::make_shared(ind_sh_elems * (k + 1), ind_allocator); + + std::shared_ptr host_ind_offsets_shp = + std::make_shared(k, ind_allocator); + + std::copy(ind_sh_sts.begin(), ind_sh_sts.end(), + host_ind_shapes_strides_shp->begin()); std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin()); std::copy(ind_offsets.begin(), ind_offsets.end(), host_ind_offsets_shp->begin()); @@ -973,11 +933,7 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, host_ind_ptrs_shp->data(), packed_ind_ptrs, host_ind_ptrs_shp->size()); exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(device_ind_ptrs_copy_ev); - cgh.host_task([host_ind_ptrs_shp]() { - // Capturing shared pointer ensures that the underlying vector is - // not destroyed until after its data are copied into packed USM - // vector - }); + cgh.host_task([host_ind_ptrs_shp]() {}); }); sycl::event device_ind_shapes_strides_copy_ev = exec_q.copy( @@ -996,9 +952,9 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, cgh.host_task([host_ind_offsets_shp]() {}); }); - std::vector ind_pack_depends = { - device_ind_ptrs_copy_ev, device_ind_shapes_strides_copy_ev, - device_ind_offsets_copy_ev}; + std::vector ind_pack_depends{device_ind_ptrs_copy_ev, + device_ind_shapes_strides_copy_ev, + device_ind_offsets_copy_ev}; bool is_dst_c_contig = dst.is_c_contiguous(); bool is_dst_f_contig = dst.is_f_contiguous(); @@ -1009,20 +965,20 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, const py::ssize_t *dst_strides = dst.get_strides_raw(); const py::ssize_t *val_strides = val.get_strides_raw(); - // destination must be ample enough to accomodate all possible elements + // destination must be ample enough to accommodate all possible elements { size_t range = static_cast(dst_offsets.second - dst_offsets.first); if ((range + 1) < dst_nelems) { throw py::value_error( - "Destination array can not accomodate all the " + "Destination array can not accommodate all the " "elements of source array."); } } - // packed_shapes_strides = [dst_shape[:axis] + dst_shape[:axis+1], - // dst_strides[:axis] + dst_strides[:axis+1], - // val_strides[:axis] + val_strides[:axis+1]] + // packed_shapes_strides = [dst_shape[:axis] + dst_shape[axis+k:], + // dst_strides[:axis] + dst_strides[axis+k:], + // val_strides[:axis] + 
val_strides[axis+k:]] py::ssize_t *packed_shapes_strides = sycl::malloc_device(3 * sh_elems, exec_q); @@ -1065,47 +1021,23 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, std::to_string(ind_type_id)); } - int orthog_nd = ((dst_nd - k) > 0) ? dst_nd - k : 1; - sycl::event put_generic_ev = fn(exec_q, orthog_nelems, ind_nelems, orthog_nd, ind_nd, k, packed_shapes_strides, packed_axes_shapes_strides, packed_ind_shapes_strides, dst_data, val_data, packed_ind_ptrs, dst_offset, val_offset, packed_ind_offsets, all_deps); - // free packed_shapes_strides temporary - + // free packed temporaries auto ctx = exec_q.get_context(); exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(put_generic_ev); - cgh.host_task([packed_shapes_strides, ctx]() { + cgh.host_task([packed_shapes_strides, packed_axes_shapes_strides, + packed_ind_shapes_strides, packed_ind_ptrs, + packed_ind_offsets, ctx]() { sycl::free(packed_shapes_strides, ctx); - }); - }); - - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(put_generic_ev); - cgh.host_task([packed_axes_shapes_strides, ctx]() { sycl::free(packed_axes_shapes_strides, ctx); - }); - }); - - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(put_generic_ev); - cgh.host_task([packed_ind_shapes_strides, ctx]() { sycl::free(packed_ind_shapes_strides, ctx); - }); - }); - - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(put_generic_ev); - cgh.host_task( - [packed_ind_ptrs, ctx]() { sycl::free(packed_ind_ptrs, ctx); }); - }); - - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(put_generic_ev); - cgh.host_task([packed_ind_offsets, ctx]() { + sycl::free(packed_ind_ptrs, ctx); sycl::free(packed_ind_offsets, ctx); }); }); From 728b8e69f29d9fbd0ed339a7f31d59c250dd1f94 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Sat, 25 Feb 2023 13:24:30 -0800 Subject: [PATCH 17/57] Fixed missing cast for indices clip/wrap --- .../include/kernels/advanced_indexing.hpp | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp index 8ccad8db28..1e205c658c 100644 --- a/dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp @@ -45,34 +45,32 @@ namespace py = pybind11; template class take_kernel; template class put_kernel; -template class ClipIndex +class ClipIndex { public: ClipIndex() = default; - void operator()(py::ssize_t max_item, indT &ind) const + void operator()(py::ssize_t max_item, py::ssize_t &ind) const { max_item = (max_item > 0) ? max_item : 1; - py::ssize_t clip_ind = static_cast(ind); - ind = (ind < 0) ? (clip_ind <= -max_item) ? (0) : (clip_ind + max_item) - : (clip_ind >= max_item) ? (max_item - 1) - : ind; + ind = (ind < 0) ? (ind <= -max_item) ? (0) : (ind + max_item) + : (ind >= max_item) ? (max_item - 1) + : ind; return; } }; -template class WrapIndex +class WrapIndex { public: WrapIndex() = default; - void operator()(py::ssize_t max_item, indT &ind) const + void operator()(py::ssize_t max_item, py::ssize_t &ind) const { max_item = (max_item > 0) ? max_item : 1; - py::ssize_t wrap_ind = static_cast(ind); - ind = (ind < 0) ? max_item - (-wrap_ind % max_item) - : (wrap_ind >= max_item) ? wrap_ind % max_item - : ind; + ind = (ind < 0) ? max_item - (-ind % max_item) + : (ind >= max_item) ? 
ind % max_item + : ind; return; } }; @@ -146,7 +144,8 @@ template class TakeFunctor ind_shape_and_strides_ + ((axis_idx + 1) * ind_nd_), ind_arr_idx); indT *ind_data = reinterpret_cast(ind_[axis_idx]); - indT i = ind_data[ind_arr_idx + ind_offsets_[axis_idx]]; + py::ssize_t i = static_cast( + ind_data[ind_arr_idx + ind_offsets_[axis_idx]]); proj(axes_shape_and_strides_[axis_idx], i); src_orthog_idx += i * axes_shape_and_strides_[k_ + axis_idx]; } @@ -282,7 +281,8 @@ template class PutFunctor ind_shape_and_strides_ + ((axis_idx + 1) * ind_nd_), ind_arr_idx); indT *ind_data = reinterpret_cast(ind_[axis_idx]); - indT i = ind_data[ind_arr_idx + ind_offsets_[axis_idx]]; + py::ssize_t i = static_cast( + ind_data[ind_arr_idx + ind_offsets_[axis_idx]]); proj(axes_shape_and_strides_[axis_idx], i); dst_orthog_idx += i * axes_shape_and_strides_[k_ + axis_idx]; } @@ -355,7 +355,7 @@ template struct TakeWrapFactory { if constexpr (std::is_integral::value && !std::is_same::value) { - fnT fn = take_impl, T, indT>; + fnT fn = take_impl; return fn; } else { @@ -371,7 +371,7 @@ template struct TakeClipFactory { if constexpr (std::is_integral::value && !std::is_same::value) { - fnT fn = take_impl, T, indT>; + fnT fn = take_impl; return fn; } else { @@ -387,7 +387,7 @@ template struct PutWrapFactory { if constexpr (std::is_integral::value && !std::is_same::value) { - fnT fn = put_impl, T, indT>; + fnT fn = put_impl; return fn; } else { @@ -403,7 +403,7 @@ template struct PutClipFactory { if constexpr (std::is_integral::value && !std::is_same::value) { - fnT fn = put_impl, T, indT>; + fnT fn = put_impl; return fn; } else { From 333f9e64b0bc1f6b70f328a5db4740e45722b49e Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sat, 25 Feb 2023 21:31:44 -0600 Subject: [PATCH 18/57] Fixed error from dpt.flip(dpt.arange(5))[dpt.arange(2)] --- dpctl/tensor/_slicing.pxi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpctl/tensor/_slicing.pxi b/dpctl/tensor/_slicing.pxi index 6689502955..10b5c58395 100644 --- a/dpctl/tensor/_slicing.pxi +++ b/dpctl/tensor/_slicing.pxi @@ -94,7 +94,7 @@ def _basic_slice_meta(ind, shape : tuple, strides : tuple, offset : int): "Index {0} is out of range for axes 0 with " "size {1}".format(ind, shape[0])) elif isinstance(ind, usm_ndarray): - return (shape, strides, 0, (ind,), 0) + return (shape, strides, offset, (ind,), 0) elif isinstance(ind, tuple): axes_referenced = 0 ellipses_count = 0 From ab79d843eab9baa166655a63beb39a32204a7384 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sat, 25 Feb 2023 21:33:05 -0600 Subject: [PATCH 19/57] More tests for advanced indexing --- dpctl/tests/test_usm_ndarray_indexing.py | 74 +++++++++++++++++++++++- 1 file changed, 73 insertions(+), 1 deletion(-) diff --git a/dpctl/tests/test_usm_ndarray_indexing.py b/dpctl/tests/test_usm_ndarray_indexing.py index e6c7271ab1..4281938577 100644 --- a/dpctl/tests/test_usm_ndarray_indexing.py +++ b/dpctl/tests/test_usm_ndarray_indexing.py @@ -15,7 +15,7 @@ # limitations under the License. 
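To make the two out-of-range policies concrete, here is a pure-Python sketch (illustrative, not part of the patch) of what the wrap and clip modes are meant to compute for an axis of size n. It follows the earlier commit message ("clips indices to -n <= i < n") and Python's own modular arithmetic rather than restating the exact C++ expressions:

    def wrap_index(i, n):
        # "wrap" mode: modular indexing, so -1 refers to the last element
        return i % n

    def clip_index(i, n):
        # "clip" mode: one round of negative indexing, then clamp into [0, n - 1]
        if i < 0:
            i += n
        return min(max(i, 0), n - 1)

    n = 5
    assert wrap_index(-1, n) == 4 and wrap_index(7, n) == 2
    assert clip_index(-2, n) == 3 and clip_index(-7, n) == 0 and clip_index(7, n) == 4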
-# import numpy as np +import numpy as np import pytest from helper import get_queue_or_skip @@ -174,6 +174,22 @@ def test_advanced_slice1(): ) +def test_advanced_slice1_negative_strides(): + q = get_queue_or_skip() + ii = dpt.asarray([0, 1], sycl_queue=q) + x = dpt.flip(dpt.arange(5, dtype="i4", sycl_queue=q)) + y = x[ii] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == ii.shape + assert y.strides == (1,) + # FIXME, once usm_ndarray.__equal__ is implemented, + # use of asnumpy should be removed + assert _all_equal( + (x[ii[k]] for k in range(ii.shape[0])), + (y[k] for k in range(ii.shape[0])), + ) + + def test_advanced_slice2(): q = get_queue_or_skip() ii = dpt.asarray([1, 2], sycl_queue=q) @@ -363,3 +379,59 @@ def test_advanced_slice13(): assert isinstance(y, dpt.usm_ndarray) assert y.shape == expected.shape assert (dpt.asnumpy(y) == dpt.asnumpy(expected)).all() + + +def test_integer_indexing_1d(): + get_queue_or_skip() + x = dpt.arange(10, dtype="i4") + ind_1d = dpt.asarray([7, 3, 1], dtype="u2") + ind_2d = dpt.asarray([[2, 3, 4], [3, 4, 5], [5, 6, 7]], dtype="i4") + + y1 = x[ind_1d] + assert y1.shape == ind_1d.shape + y2 = x[ind_2d] + assert y2.shape == ind_2d.shape + assert (dpt.asnumpy(y1) == np.array([7, 3, 1], dtype="i4")).all() + assert ( + dpt.asnumpy(y2) + == np.array([[2, 3, 4], [3, 4, 5], [5, 6, 7]], dtype="i4") + ).all() + + +def test_integer_indexing_2d(): + get_queue_or_skip() + n0, n1 = 5, 7 + x = dpt.reshape( + dpt.arange(n0 * n1, dtype="i4"), + ( + n0, + n1, + ), + ) + ind0 = dpt.arange(n0) + ind1 = dpt.arange(n1) + + y = x[ind0[:2, dpt.newaxis], ind1[dpt.newaxis, -2:]] + assert y.dtype == x.dtype + assert (dpt.asnumpy(y) == np.array([[5, 6], [12, 13]])).all() + + +def test_integer_strided_indexing(): + get_queue_or_skip() + n0, n1 = 5, 7 + x = dpt.reshape( + dpt.arange(2 * n0 * n1, dtype="i4"), + ( + 2 * n0, + n1, + ), + ) + ind0 = dpt.arange(n0) + ind1 = dpt.arange(n1) + + z = x[::-2, :] + y = z[ind0[:2, dpt.newaxis], ind1[dpt.newaxis, -2:]] + assert y.dtype == x.dtype + zc = dpt.copy(z, order="C") + yc = zc[ind0[:2, dpt.newaxis], ind1[dpt.newaxis, -2:]] + assert (dpt.asnumpy(y) == dpt.asnumpy(yc)).all() From a966830bc720c37715fc2bbd1d19cb383116c765 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sat, 25 Feb 2023 21:31:44 -0600 Subject: [PATCH 20/57] Fixed error from dpt.flip(dpt.arange(5))[dpt.arange(2)] --- dpctl/tensor/_slicing.pxi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpctl/tensor/_slicing.pxi b/dpctl/tensor/_slicing.pxi index 6689502955..10b5c58395 100644 --- a/dpctl/tensor/_slicing.pxi +++ b/dpctl/tensor/_slicing.pxi @@ -94,7 +94,7 @@ def _basic_slice_meta(ind, shape : tuple, strides : tuple, offset : int): "Index {0} is out of range for axes 0 with " "size {1}".format(ind, shape[0])) elif isinstance(ind, usm_ndarray): - return (shape, strides, 0, (ind,), 0) + return (shape, strides, offset, (ind,), 0) elif isinstance(ind, tuple): axes_referenced = 0 ellipses_count = 0 From 8523d8e3e0be5229045632bc739c32b838b6edaf Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sat, 25 Feb 2023 21:33:05 -0600 Subject: [PATCH 21/57] More tests for advanced indexing --- dpctl/tests/test_usm_ndarray_indexing.py | 74 +++++++++++++++++++++++- 1 file changed, 73 insertions(+), 1 deletion(-) diff --git a/dpctl/tests/test_usm_ndarray_indexing.py b/dpctl/tests/test_usm_ndarray_indexing.py index e6c7271ab1..4281938577 100644 --- a/dpctl/tests/test_usm_ndarray_indexing.py +++ b/dpctl/tests/test_usm_ndarray_indexing.py @@ -15,7 +15,7 @@ # 
limitations under the License. -# import numpy as np +import numpy as np import pytest from helper import get_queue_or_skip @@ -174,6 +174,22 @@ def test_advanced_slice1(): ) +def test_advanced_slice1_negative_strides(): + q = get_queue_or_skip() + ii = dpt.asarray([0, 1], sycl_queue=q) + x = dpt.flip(dpt.arange(5, dtype="i4", sycl_queue=q)) + y = x[ii] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == ii.shape + assert y.strides == (1,) + # FIXME, once usm_ndarray.__equal__ is implemented, + # use of asnumpy should be removed + assert _all_equal( + (x[ii[k]] for k in range(ii.shape[0])), + (y[k] for k in range(ii.shape[0])), + ) + + def test_advanced_slice2(): q = get_queue_or_skip() ii = dpt.asarray([1, 2], sycl_queue=q) @@ -363,3 +379,59 @@ def test_advanced_slice13(): assert isinstance(y, dpt.usm_ndarray) assert y.shape == expected.shape assert (dpt.asnumpy(y) == dpt.asnumpy(expected)).all() + + +def test_integer_indexing_1d(): + get_queue_or_skip() + x = dpt.arange(10, dtype="i4") + ind_1d = dpt.asarray([7, 3, 1], dtype="u2") + ind_2d = dpt.asarray([[2, 3, 4], [3, 4, 5], [5, 6, 7]], dtype="i4") + + y1 = x[ind_1d] + assert y1.shape == ind_1d.shape + y2 = x[ind_2d] + assert y2.shape == ind_2d.shape + assert (dpt.asnumpy(y1) == np.array([7, 3, 1], dtype="i4")).all() + assert ( + dpt.asnumpy(y2) + == np.array([[2, 3, 4], [3, 4, 5], [5, 6, 7]], dtype="i4") + ).all() + + +def test_integer_indexing_2d(): + get_queue_or_skip() + n0, n1 = 5, 7 + x = dpt.reshape( + dpt.arange(n0 * n1, dtype="i4"), + ( + n0, + n1, + ), + ) + ind0 = dpt.arange(n0) + ind1 = dpt.arange(n1) + + y = x[ind0[:2, dpt.newaxis], ind1[dpt.newaxis, -2:]] + assert y.dtype == x.dtype + assert (dpt.asnumpy(y) == np.array([[5, 6], [12, 13]])).all() + + +def test_integer_strided_indexing(): + get_queue_or_skip() + n0, n1 = 5, 7 + x = dpt.reshape( + dpt.arange(2 * n0 * n1, dtype="i4"), + ( + 2 * n0, + n1, + ), + ) + ind0 = dpt.arange(n0) + ind1 = dpt.arange(n1) + + z = x[::-2, :] + y = z[ind0[:2, dpt.newaxis], ind1[dpt.newaxis, -2:]] + assert y.dtype == x.dtype + zc = dpt.copy(z, order="C") + yc = zc[ind0[:2, dpt.newaxis], ind1[dpt.newaxis, -2:]] + assert (dpt.asnumpy(y) == dpt.asnumpy(yc)).all() From 81ba473a38accbcd3841d85b378144228b9eb5d8 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sun, 26 Feb 2023 10:15:43 -0600 Subject: [PATCH 22/57] Adding basic take, and basic put tests --- dpctl/tests/test_usm_ndarray_indexing.py | 83 +++++++++++++++++++++++- 1 file changed, 80 insertions(+), 3 deletions(-) diff --git a/dpctl/tests/test_usm_ndarray_indexing.py b/dpctl/tests/test_usm_ndarray_indexing.py index 4281938577..e088d4d00b 100644 --- a/dpctl/tests/test_usm_ndarray_indexing.py +++ b/dpctl/tests/test_usm_ndarray_indexing.py @@ -17,13 +17,11 @@ import numpy as np import pytest -from helper import get_queue_or_skip +from helper import get_queue_or_skip, skip_if_dtype_not_supported # import dpctl import dpctl.tensor as dpt -# from helper import skip_if_dtype_not_supported - def test_basic_slice1(): q = get_queue_or_skip() @@ -435,3 +433,82 @@ def test_integer_strided_indexing(): zc = dpt.copy(z, order="C") yc = zc[ind0[:2, dpt.newaxis], ind1[dpt.newaxis, -2:]] assert (dpt.asnumpy(y) == dpt.asnumpy(yc)).all() + + +@pytest.mark.parametrize( + "data_dt", + ["u1", "i1", "u2", "i2", "u4", "i4", "u8", "i8", "e", "f", "d", "F", "D"], +) +@pytest.mark.parametrize( + "ind_dt", ["u1", "i1", "u2", "i2", "u4", "i4", "u8", "i8"] +) +def test_take_basic(data_dt, ind_dt): + q = get_queue_or_skip() + 
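The multi-axis take these tests build up to can be summarized with a small NumPy sketch (illustrative only): all index arrays passed for the k consecutive axes starting at axis_start must share one shape, and every other axis of the source is carried over unchanged:

    import numpy as np

    src = np.arange(2 * 3 * 4).reshape(2, 3, 4)
    ind0 = np.array([0, 2])   # indexes axis 1
    ind1 = np.array([1, 3])   # indexes axis 2

    # take over axes 1..2 (k = 2, axis_start = 1); axis 0 stays "orthogonal"
    expected = src[:, ind0, ind1]             # shape (2, 2)

    out = np.empty((2, 2), dtype=src.dtype)
    for i in range(2):        # orthogonal index over the untouched axis
        for j in range(2):    # position within the common index shape
            out[i, j] = src[i, ind0[j], ind1[j]]
    assert (out == expected).all()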
skip_if_dtype_not_supported(data_dt, q) + + x = dpt.arange(10, dtype=data_dt) + ind = dpt.arange(2, 5, dtype=ind_dt) + y = dpt.take(x, ind) + assert y.dtype == x.dtype + assert (dpt.asnumpy(y) == np.arange(2, 5, dtype=data_dt)).all() + + +@pytest.mark.parametrize( + "data_dt", + ["u1", "i1", "u2", "i2", "u4", "i4", "u8", "i8", "e", "f", "d", "F", "D"], +) +@pytest.mark.parametrize( + "ind_dt", ["u1", "i1", "u2", "i2", "u4", "i4", "u8", "i8"] +) +def test_put_basic(data_dt, ind_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(data_dt, q) + + x = dpt.arange(10, dtype=data_dt) + ind = dpt.arange(2, 5, dtype=ind_dt) + val = dpt.ones(3, dtype=data_dt) + dpt.put(x, ind, val) + assert ( + dpt.asnumpy(x) + == np.array([0, 1, 1, 1, 1, 5, 6, 7, 8, 9], dtype=data_dt) + ).all() + + +def test_take_basic_axis(): + get_queue_or_skip() + + n0, n1 = 5, 7 + x = dpt.reshape( + dpt.arange(n0 * n1, dtype="i4"), + ( + n0, + n1, + ), + ) + ind = dpt.arange(2, 4) + y0 = dpt.take(x, ind, axis=0) + y1 = dpt.take(x, ind, axis=1) + assert y0.shape == (2, n1) + assert y1.shape == (n0, 2) + + +def test_put_basic_axis(): + get_queue_or_skip() + + n0, n1 = 5, 7 + x = dpt.reshape( + dpt.arange(n0 * n1, dtype="i4"), + ( + n0, + n1, + ), + ) + ind = dpt.arange(2, 4) + v0 = dpt.zeros((2, n1), dtype=x.dtype) + v1 = dpt.zeros((n0, 2), dtype=x.dtype) + dpt.put(x, ind, v0, axis=0) + dpt.put(x, ind, v1, axis=1) + expected = np.arange(n0 * n1, dtype="i4").reshape((n0, n1)) + expected[[2, 3], :] = 0 + expected[:, [2, 3]] = 0 + assert (expected == dpt.asnumpy(x)).all() From 7c0c6f025fd67b3285f370ec2db9859a0ca3ef8e Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sun, 26 Feb 2023 16:12:21 -0600 Subject: [PATCH 23/57] Turn debugging on for test_windows test run --- .github/workflows/conda-package.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml index 4cb3f3ad99..7bbc5fffd1 100644 --- a/.github/workflows/conda-package.yml +++ b/.github/workflows/conda-package.yml @@ -170,7 +170,7 @@ jobs: # echo "libintelocl.so" | tee /etc/OpenCL/vendors/intel-cpu.icd export OCL_ICD_FILENAMES=libintelocl.so # clinfo -l - python -m pytest -p no:faulthandler --pyargs $MODULE_NAME + python -m pytest --pyargs $MODULE_NAME test_windows: needs: build_windows @@ -296,8 +296,10 @@ jobs: conda activate dpctl_test && python -m dpctl -f - name: Run tests shell: cmd /C CALL {0} + env: + DPCTL_VERBOSITY: error run: >- - conda activate dpctl_test && python -m pytest -p no:faulthandler --pyargs ${{ env.MODULE_NAME }} + conda activate dpctl_test && python -m pytest -v -s --pyargs ${{ env.MODULE_NAME }} upload_linux: needs: test_linux From 156f7f0a74f8ffeea3c3f2f1d59dad46072c0327 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Sun, 26 Feb 2023 20:11:28 -0800 Subject: [PATCH 24/57] Added several array indexing tests - Tests include - strided data for take and put - strided indices for take and put - indexing compute follows data - indexing argument validation --- dpctl/tests/test_usm_ndarray_indexing.py | 440 ++++++++++++++++++++++- 1 file changed, 435 insertions(+), 5 deletions(-) diff --git a/dpctl/tests/test_usm_ndarray_indexing.py b/dpctl/tests/test_usm_ndarray_indexing.py index e088d4d00b..840dfb931f 100644 --- a/dpctl/tests/test_usm_ndarray_indexing.py +++ b/dpctl/tests/test_usm_ndarray_indexing.py @@ -18,9 +18,28 @@ import numpy as np import pytest from helper import get_queue_or_skip, skip_if_dtype_not_supported +from numpy.testing import 
assert_array_equal -# import dpctl import dpctl.tensor as dpt +from dpctl.utils import ExecutionPlacementError + +_all_dtypes = [ + "u1", + "i1", + "u2", + "i2", + "u4", + "i4", + "u8", + "i8", + "e", + "f", + "d", + "F", + "D", +] + +_all_int_dtypes = ["u1", "i1", "u2", "i2", "u4", "i4", "u8", "i8"] def test_basic_slice1(): @@ -437,10 +456,11 @@ def test_integer_strided_indexing(): @pytest.mark.parametrize( "data_dt", - ["u1", "i1", "u2", "i2", "u4", "i4", "u8", "i8", "e", "f", "d", "F", "D"], + _all_dtypes, ) @pytest.mark.parametrize( - "ind_dt", ["u1", "i1", "u2", "i2", "u4", "i4", "u8", "i8"] + "ind_dt", + _all_int_dtypes, ) def test_take_basic(data_dt, ind_dt): q = get_queue_or_skip() @@ -455,10 +475,11 @@ def test_take_basic(data_dt, ind_dt): @pytest.mark.parametrize( "data_dt", - ["u1", "i1", "u2", "i2", "u4", "i4", "u8", "i8", "e", "f", "d", "F", "D"], + _all_dtypes, ) @pytest.mark.parametrize( - "ind_dt", ["u1", "i1", "u2", "i2", "u4", "i4", "u8", "i8"] + "ind_dt", + _all_int_dtypes, ) def test_put_basic(data_dt, ind_dt): q = get_queue_or_skip() @@ -512,3 +533,412 @@ def test_put_basic_axis(): expected[[2, 3], :] = 0 expected[:, [2, 3]] = 0 assert (expected == dpt.asnumpy(x)).all() + + +@pytest.mark.parametrize( + "data_dt", + _all_dtypes, +) +def test_take_0d_data(data_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(data_dt, q) + + x = dpt.asarray(0, dtype=data_dt) + ind = dpt.arange(5) + + y = dpt.take(x, ind) + assert ( + dpt.asnumpy(y) + == np.broadcast_to(np.asarray(0, dtype=data_dt), ind.shape) + ).all() + + +@pytest.mark.parametrize( + "data_dt", + _all_dtypes, +) +def test_put_0d_data(data_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(data_dt, q) + + x = dpt.asarray(0, dtype=data_dt) + ind = dpt.arange(5) + val = dpt.asarray(2, dtype=data_dt) + + dpt.put(x, ind, val) + assert ( + dpt.asnumpy(x) + == np.broadcast_to(np.asarray(2, dtype=data_dt), ind.shape) + ).all() + + +@pytest.mark.parametrize( + "ind_dt", + _all_int_dtypes, +) +def test_take_0d_ind(ind_dt): + get_queue_or_skip() + + x = dpt.arange(5, dtype=ind_dt) + ind = dpt.asarray(3) + + y = dpt.take(x, ind) + assert dpt.asnumpy(x[3]) == dpt.asnumpy(y) + + +@pytest.mark.parametrize( + "ind_dt", + _all_int_dtypes, +) +def test_put_0d_ind(ind_dt): + get_queue_or_skip() + + x = dpt.arange(5, dtype=ind_dt) + ind = dpt.asarray(3) + val = dpt.asarray(5, dtype=ind_dt) + + dpt.put(x, ind, val) + assert dpt.asnumpy(x[3]) == dpt.asnumpy(val) + + +@pytest.mark.parametrize( + "data_dt", + _all_dtypes, +) +def test_take_strided_1d_source(data_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(data_dt, q) + + x = dpt.arange(27, dtype=data_dt, sycl_queue=q) + ind = dpt.arange(4, 9, dtype=np.intp, sycl_queue=q) + + x_np = dpt.asnumpy(x) + ind_np = dpt.asnumpy(ind) + + for s in ( + slice(None, None, 2), + slice(None, None, -2), + ): + assert_array_equal( + np.take(x_np[s], ind_np, axis=0), + dpt.asnumpy(dpt.take(x[s], ind, axis=0)), + ) + + # 0-strided + x = dpt.usm_ndarray( + (27,), + dtype=data_dt, + strides=(0,), + buffer_ctor_kwargs={"queue": q}, + ) + x[0] = x_np[0] + assert_array_equal( + np.broadcast_to(x_np[0], ind.shape), + dpt.asnumpy(dpt.take(x, ind, axis=0)), + ) + + +@pytest.mark.parametrize( + "data_dt", + _all_dtypes, +) +@pytest.mark.parametrize("order", ["C", "F"]) +def test_take_strided(data_dt, order): + q = get_queue_or_skip() + skip_if_dtype_not_supported(data_dt, q) + + x = dpt.reshape(_make_3d(data_dt, q), (9, 3), order=order) + ind = dpt.arange(2, dtype=np.intp, 
sycl_queue=q) + + x_np = dpt.asnumpy(x) + ind_np = dpt.asnumpy(ind) + + for s in ( + slice(None, None, 2), + slice(None, None, -2), + ): + for sgn in (-1, 1): + xs = x[s, ::sgn] + xs_np = x_np[s, ::sgn] + assert_array_equal( + np.take(xs_np, ind_np, axis=0), + dpt.asnumpy(dpt.take(xs, ind, axis=0)), + ) + assert_array_equal( + np.take(xs_np, ind_np, axis=1), + dpt.asnumpy(dpt.take(xs, ind, axis=1)), + ) + assert_array_equal( + xs_np[ind_np, ind_np], + dpt.asnumpy(dpt.take(xs, [ind, ind], axis=0)), + ) + + +@pytest.mark.parametrize( + "ind_dt", + _all_int_dtypes, +) +def test_take_strided_1d_indices(ind_dt): + q = get_queue_or_skip() + + x = dpt.arange(27, dtype="i4", sycl_queue=q) + ind = dpt.arange(12, 24, dtype=ind_dt, sycl_queue=q) + + x_np = dpt.asnumpy(x) + ind_np = dpt.asnumpy(ind).astype(np.intp) + + for s in ( + slice(None, None, 2), + slice(None, None, -2), + ): + assert_array_equal( + np.take(x_np, ind_np[s], axis=0), + dpt.asnumpy(dpt.take(x, ind[s], axis=0)), + ) + + # 0-strided + ind = dpt.usm_ndarray( + (12,), + dtype=ind_dt, + strides=(0,), + buffer_ctor_kwargs={"queue": q}, + ) + ind[0] = ind_np[0] + assert_array_equal( + np.broadcast_to(x_np[ind_np[0]], ind.shape), + dpt.asnumpy(dpt.take(x, ind, axis=0)), + ) + + +@pytest.mark.parametrize( + "ind_dt", + _all_int_dtypes, +) +@pytest.mark.parametrize("order", ["C", "F"]) +def test_take_strided_indices(ind_dt, order): + q = get_queue_or_skip() + + x = dpt.arange(27, dtype="i4", sycl_queue=q) + ind = dpt.reshape( + dpt.arange(12, 24, dtype=ind_dt, sycl_queue=q), (4, 3), order=order + ) + + x_np = dpt.asnumpy(x) + ind_np = dpt.asnumpy(ind).astype(np.intp) + + for s in ( + slice(None, None, 2), + slice(None, None, -2), + ): + for sgn in [-1, 1]: + inds = ind[s, ::sgn] + inds_np = ind_np[s, ::sgn] + assert_array_equal( + np.take(x_np, inds_np, axis=0), + dpt.asnumpy(dpt.take(x, inds, axis=0)), + ) + + +@pytest.mark.parametrize( + "data_dt", + _all_dtypes, +) +@pytest.mark.parametrize("order", ["C", "F"]) +def test_put_strided_1d_destination(data_dt, order): + q = get_queue_or_skip() + skip_if_dtype_not_supported(data_dt, q) + + x = dpt.arange(27, dtype=data_dt, sycl_queue=q) + ind = dpt.arange(4, 9, dtype=np.intp, sycl_queue=q) + val = dpt.asarray(9, dtype=data_dt, sycl_queue=q) + + x_np = dpt.asnumpy(x) + ind_np = dpt.asnumpy(ind) + val_np = dpt.asnumpy(val) + + for s in ( + slice(None, None, 2), + slice(None, None, -2), + ): + x_np1 = x_np.copy() + x_np1[s][ind_np] = val_np + + x1 = dpt.copy(x) + dpt.put(x1[s], ind, val, axis=0) + + assert_array_equal(x_np1, dpt.asnumpy(x1)) + + +@pytest.mark.parametrize( + "data_dt", + _all_dtypes, +) +@pytest.mark.parametrize("order", ["C", "F"]) +def test_put_strided_destination(data_dt, order): + q = get_queue_or_skip() + skip_if_dtype_not_supported(data_dt, q) + + x = dpt.reshape(_make_3d(data_dt, q), (9, 3), order=order) + ind = dpt.arange(2, dtype=np.intp, sycl_queue=q) + val = dpt.asarray(9, dtype=data_dt, sycl_queue=q) + + x_np = dpt.asnumpy(x) + ind_np = dpt.asnumpy(ind) + val_np = dpt.asnumpy(val) + + for s in ( + slice(None, None, 2), + slice(None, None, -2), + ): + for sgn in [-1, 1]: + xs = x[s, ::sgn] + xs_np = x_np[s, ::sgn] + + x_np1 = xs_np.copy() + x_np1[ind_np] = val_np + + x1 = dpt.copy(xs) + dpt.put(x1, ind, val, axis=0) + assert_array_equal(x_np1, dpt.asnumpy(x1)) + + x_np1 = xs_np.copy() + x_np1[:, ind_np] = val_np + + x1 = dpt.copy(xs) + dpt.put(x1, ind, val, axis=1) + assert_array_equal(x_np1, dpt.asnumpy(x1)) + + x_np1 = xs_np.copy() + x_np1[ind_np, ind_np] = val_np + 
+ x1 = dpt.copy(xs) + dpt.put(x1, [ind, ind], val, axis=0) + assert_array_equal(x_np1, dpt.asnumpy(x1)) + + +@pytest.mark.parametrize( + "ind_dt", + _all_int_dtypes, +) +def test_put_strided_1d_indices(ind_dt): + q = get_queue_or_skip() + + x = dpt.arange(27, dtype="i4", sycl_queue=q) + ind = dpt.arange(12, 24, dtype=ind_dt, sycl_queue=q) + val = dpt.asarray(-1, dtype="i4", sycl_queue=q) + + x_np = dpt.asnumpy(x) + ind_np = dpt.asnumpy(ind).astype(np.intp) + val_np = dpt.asnumpy(val) + + for s in ( + slice(None, None, 2), + slice(None, None, -2), + ): + x_copy = dpt.copy(x) + dpt.put(x_copy, ind[s], val, axis=0) + + x_np_copy = x_np.copy() + x_np_copy[ind_np[s]] = val_np + + assert_array_equal(x_np_copy, dpt.asnumpy(x_copy)) + + +@pytest.mark.parametrize( + "ind_dt", + _all_int_dtypes, +) +@pytest.mark.parametrize("order", ["C", "F"]) +def test_put_strided_indices(ind_dt, order): + q = get_queue_or_skip() + + x = dpt.arange(27, dtype="i4", sycl_queue=q) + ind = dpt.reshape( + dpt.arange(12, 24, dtype=ind_dt, sycl_queue=q), (4, 3), order=order + ) + val = dpt.asarray(-1, sycl_queue=q, dtype=x.dtype) + + x_np = dpt.asnumpy(x) + ind_np = dpt.asnumpy(ind).astype(np.intp) + val_np = dpt.asnumpy(val) + + for s in ( + slice(None, None, 2), + slice(None, None, -2), + ): + for sgn in [-1, 1]: + inds = ind[s, ::sgn] + inds_np = ind_np[s, ::sgn] + + x_copy = dpt.copy(x) + dpt.put(x_copy, inds, val, axis=0) + + x_np_copy = x_np.copy() + x_np_copy[inds_np] = val_np + + assert_array_equal(x_np_copy, dpt.asnumpy(x_copy)) + + +def test_take_arg_validation(): + get_queue_or_skip() + + x = dpt.arange(4) + ind0 = dpt.arange(2) + ind1 = dpt.arange(2.0) + + with pytest.raises(TypeError): + dpt.take(dict(), ind0, axis=0) + with pytest.raises(TypeError): + dpt.take(x, dict(), axis=0) + with pytest.raises(TypeError): + dpt.take(x, ind1, axis=0) + + with pytest.raises(ValueError): + dpt.take(x, ind0, mode=0) + with pytest.raises(ValueError): + dpt.take(dpt.reshape(x, (2, 2)), ind0, axis=None) + + +def test_put_arg_validation(): + get_queue_or_skip() + + x = dpt.arange(4) + ind0 = dpt.arange(2) + ind1 = dpt.arange(2.0) + val = dpt.asarray(2) + + with pytest.raises(TypeError): + dpt.put(dict(), ind0, val, axis=0) + with pytest.raises(TypeError): + dpt.put(x, dict(), val, axis=0) + with pytest.raises(TypeError): + dpt.put(x, ind1, val, axis=0) + with pytest.raises(TypeError): + dpt.put(x, ind0, dict(), axis=0) + + with pytest.raises(ValueError): + dpt.put(x, ind0, val, mode=0) + + +def test_advanced_indexing_compute_follows_data(): + q1 = get_queue_or_skip() + q2 = get_queue_or_skip() + + x = dpt.arange(4, sycl_queue=q1) + ind0 = dpt.asarray([0], sycl_queue=q1) + ind1 = dpt.asarray([0], sycl_queue=q2) + val0 = dpt.asarray(2, dtype=x.dtype, sycl_queue=q1) + val1 = dpt.asarray(2, dtype=x.dtype, sycl_queue=q2) + + with pytest.raises(ExecutionPlacementError): + dpt.take(x, ind1, axis=0) + with pytest.raises(ExecutionPlacementError): + x[ind1] + with pytest.raises(ExecutionPlacementError): + dpt.put(x, ind1, val0) + with pytest.raises(ExecutionPlacementError): + x[ind1] = val0 + with pytest.raises(ExecutionPlacementError): + dpt.put(x, ind0, val1) + with pytest.raises(ExecutionPlacementError): + x[ind0] = val1 From d42b019c5f9875d40cb58c507eb99ea42bd3433c Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Sun, 26 Feb 2023 20:48:21 -0800 Subject: [PATCH 25/57] Put calls in tests corrected, organized put logic --- dpctl/tensor/_indexing_functions.py | 16 ++++++++++------ dpctl/tests/test_usm_ndarray_indexing.py | 10 
++++++---- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/dpctl/tensor/_indexing_functions.py b/dpctl/tensor/_indexing_functions.py index c3562de8f8..864eb6924b 100644 --- a/dpctl/tensor/_indexing_functions.py +++ b/dpctl/tensor/_indexing_functions.py @@ -104,12 +104,16 @@ def put(x, indices, vals, /, *, axis=None, mode="clip"): raise TypeError( "Expected instance of `dpt.usm_ndarray`, got `{}`.".format(type(x)) ) - queues_ = [ - x.sycl_queue, - ] - usm_types_ = [ - x.usm_type, - ] + if isinstance(vals, dpt.usm_ndarray): + queues_ = [x.sycl_queue, vals.sycl_queue] + usm_types_ = [x.usm_type, vals.usm_type] + else: + queues_ = [ + x.sycl_queue, + ] + usm_types_ = [ + x.usm_type, + ] if not isinstance(indices, list) and not isinstance(indices, tuple): indices = (indices,) diff --git a/dpctl/tests/test_usm_ndarray_indexing.py b/dpctl/tests/test_usm_ndarray_indexing.py index 840dfb931f..523810c811 100644 --- a/dpctl/tests/test_usm_ndarray_indexing.py +++ b/dpctl/tests/test_usm_ndarray_indexing.py @@ -565,7 +565,7 @@ def test_put_0d_data(data_dt): ind = dpt.arange(5) val = dpt.asarray(2, dtype=data_dt) - dpt.put(x, ind, val) + dpt.put(x, ind, val, axis=0) assert ( dpt.asnumpy(x) == np.broadcast_to(np.asarray(2, dtype=data_dt), ind.shape) @@ -597,7 +597,7 @@ def test_put_0d_ind(ind_dt): ind = dpt.asarray(3) val = dpt.asarray(5, dtype=ind_dt) - dpt.put(x, ind, val) + dpt.put(x, ind, val, axis=0) assert dpt.asnumpy(x[3]) == dpt.asnumpy(val) @@ -886,6 +886,8 @@ def test_take_arg_validation(): ind0 = dpt.arange(2) ind1 = dpt.arange(2.0) + with pytest.raises(ValueError): + dpt.take(dpt.reshape(x, (2, 2)), ind0) with pytest.raises(TypeError): dpt.take(dict(), ind0, axis=0) with pytest.raises(TypeError): @@ -935,10 +937,10 @@ def test_advanced_indexing_compute_follows_data(): with pytest.raises(ExecutionPlacementError): x[ind1] with pytest.raises(ExecutionPlacementError): - dpt.put(x, ind1, val0) + dpt.put(x, ind1, val0, axis=0) with pytest.raises(ExecutionPlacementError): x[ind1] = val0 with pytest.raises(ExecutionPlacementError): - dpt.put(x, ind0, val1) + dpt.put(x, ind0, val1, axis=0) with pytest.raises(ExecutionPlacementError): x[ind0] = val1 From 877c3c75bfa7d4fd8ac02a436603127135254d4b Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Mon, 27 Feb 2023 00:12:01 -0800 Subject: [PATCH 26/57] Test fixes - Error for non-integer usm_ndarrays used as indices changed to IndexError --- dpctl/tensor/_indexing_functions.py | 4 +- dpctl/tests/test_usm_ndarray_indexing.py | 77 ++++++++++++++++-------- 2 files changed, 53 insertions(+), 28 deletions(-) diff --git a/dpctl/tensor/_indexing_functions.py b/dpctl/tensor/_indexing_functions.py index 864eb6924b..90718f4559 100644 --- a/dpctl/tensor/_indexing_functions.py +++ b/dpctl/tensor/_indexing_functions.py @@ -48,7 +48,7 @@ def take(x, indices, /, *, axis=None, mode="clip"): ) ) if not np.issubdtype(i.dtype, np.integer): - raise TypeError( + raise IndexError( "`indices` expected integer data type, got `{}`".format(i.dtype) ) queues_.append(i.sycl_queue) @@ -126,7 +126,7 @@ def put(x, indices, vals, /, *, axis=None, mode="clip"): ) ) if not np.issubdtype(i.dtype, np.integer): - raise TypeError( + raise IndexError( "`indices` expected integer data type, got `{}`".format(i.dtype) ) queues_.append(i.sycl_queue) diff --git a/dpctl/tests/test_usm_ndarray_indexing.py b/dpctl/tests/test_usm_ndarray_indexing.py index 523810c811..45501afbac 100644 --- a/dpctl/tests/test_usm_ndarray_indexing.py +++ b/dpctl/tests/test_usm_ndarray_indexing.py @@ -535,6 
+535,21 @@ def test_put_basic_axis(): assert (expected == dpt.asnumpy(x)).all() +@pytest.mark.parametrize("data_dt", _all_dtypes) +def test_put_0d_val(data_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(data_dt, q) + + x = dpt.arange(5, dtype=data_dt, sycl_queue=q) + ind = dpt.asarray([0], dtype=np.intp, sycl_queue=q) + x[ind] = 2 + assert_array_equal(np.asarray(2, dtype=data_dt), dpt.asnumpy(x[0])) + + x = dpt.asarray(5, dtype=data_dt, sycl_queue=q) + x[ind] = 2 + assert_array_equal(np.asarray(2, dtype=data_dt), dpt.asnumpy(x)) + + @pytest.mark.parametrize( "data_dt", _all_dtypes, @@ -543,8 +558,8 @@ def test_take_0d_data(data_dt): q = get_queue_or_skip() skip_if_dtype_not_supported(data_dt, q) - x = dpt.asarray(0, dtype=data_dt) - ind = dpt.arange(5) + x = dpt.asarray(0, dtype=data_dt, sycl_queue=q) + ind = dpt.arange(5, dtype=np.intp, sycl_queue=q) y = dpt.take(x, ind) assert ( @@ -561,9 +576,9 @@ def test_put_0d_data(data_dt): q = get_queue_or_skip() skip_if_dtype_not_supported(data_dt, q) - x = dpt.asarray(0, dtype=data_dt) - ind = dpt.arange(5) - val = dpt.asarray(2, dtype=data_dt) + x = dpt.asarray(0, dtype=data_dt, sycl_queue=q) + ind = dpt.arange(5, dtype=np.intp, sycl_queue=q) + val = dpt.asarray(2, dtype=data_dt, sycl_queue=q) dpt.put(x, ind, val, axis=0) assert ( @@ -577,10 +592,10 @@ def test_put_0d_data(data_dt): _all_int_dtypes, ) def test_take_0d_ind(ind_dt): - get_queue_or_skip() + q = get_queue_or_skip() - x = dpt.arange(5, dtype=ind_dt) - ind = dpt.asarray(3) + x = dpt.arange(5, dtype="i4", sycl_queue=q) + ind = dpt.asarray(3, dtype=ind_dt, sycl_queue=q) y = dpt.take(x, ind) assert dpt.asnumpy(x[3]) == dpt.asnumpy(y) @@ -591,11 +606,11 @@ def test_take_0d_ind(ind_dt): _all_int_dtypes, ) def test_put_0d_ind(ind_dt): - get_queue_or_skip() + q = get_queue_or_skip() - x = dpt.arange(5, dtype=ind_dt) - ind = dpt.asarray(3) - val = dpt.asarray(5, dtype=ind_dt) + x = dpt.arange(5, dtype="i4", sycl_queue=q) + ind = dpt.asarray(3, dtype=ind_dt, sycl_queue=q) + val = dpt.asarray(5, dtype=x.dtype, sycl_queue=q) dpt.put(x, ind, val, axis=0) assert dpt.asnumpy(x[3]) == dpt.asnumpy(val) @@ -750,7 +765,7 @@ def test_put_strided_1d_destination(data_dt, order): x = dpt.arange(27, dtype=data_dt, sycl_queue=q) ind = dpt.arange(4, 9, dtype=np.intp, sycl_queue=q) - val = dpt.asarray(9, dtype=data_dt, sycl_queue=q) + val = dpt.asarray(9, dtype=x.dtype, sycl_queue=q) x_np = dpt.asnumpy(x) ind_np = dpt.asnumpy(ind) @@ -780,7 +795,7 @@ def test_put_strided_destination(data_dt, order): x = dpt.reshape(_make_3d(data_dt, q), (9, 3), order=order) ind = dpt.arange(2, dtype=np.intp, sycl_queue=q) - val = dpt.asarray(9, dtype=data_dt, sycl_queue=q) + val = dpt.asarray(9, dtype=x.dtype, sycl_queue=q) x_np = dpt.asnumpy(x) ind_np = dpt.asnumpy(ind) @@ -825,7 +840,7 @@ def test_put_strided_1d_indices(ind_dt): x = dpt.arange(27, dtype="i4", sycl_queue=q) ind = dpt.arange(12, 24, dtype=ind_dt, sycl_queue=q) - val = dpt.asarray(-1, dtype="i4", sycl_queue=q) + val = dpt.asarray(-1, dtype=x.dtype, sycl_queue=q) x_np = dpt.asnumpy(x) ind_np = dpt.asnumpy(ind).astype(np.intp) @@ -880,21 +895,25 @@ def test_put_strided_indices(ind_dt, order): def test_take_arg_validation(): - get_queue_or_skip() + q = get_queue_or_skip() - x = dpt.arange(4) - ind0 = dpt.arange(2) - ind1 = dpt.arange(2.0) + x = dpt.arange(4, dtype="i4", sycl_queue=q) + ind0 = dpt.arange(2, dtype=np.intp, sycl_queue=q) + ind1 = dpt.arange(2.0, dtype="f", sycl_queue=q) - with pytest.raises(ValueError): - dpt.take(dpt.reshape(x, (2, 
2)), ind0) with pytest.raises(TypeError): dpt.take(dict(), ind0, axis=0) with pytest.raises(TypeError): dpt.take(x, dict(), axis=0) with pytest.raises(TypeError): + x[[]] + with pytest.raises(IndexError): dpt.take(x, ind1, axis=0) + with pytest.raises(IndexError): + x[ind1] + with pytest.raises(ValueError): + dpt.take(dpt.reshape(x, (2, 2)), ind0) with pytest.raises(ValueError): dpt.take(x, ind0, mode=0) with pytest.raises(ValueError): @@ -902,21 +921,27 @@ def test_take_arg_validation(): def test_put_arg_validation(): - get_queue_or_skip() + q = get_queue_or_skip() - x = dpt.arange(4) - ind0 = dpt.arange(2) - ind1 = dpt.arange(2.0) - val = dpt.asarray(2) + x = dpt.arange(4, dtype="i4", sycl_queue=q) + ind0 = dpt.arange(2, dtype=np.intp, sycl_queue=q) + ind1 = dpt.arange(2.0, dtype="f", sycl_queue=q) + val = dpt.asarray(2, x.dtype, sycl_queue=q) with pytest.raises(TypeError): dpt.put(dict(), ind0, val, axis=0) with pytest.raises(TypeError): dpt.put(x, dict(), val, axis=0) with pytest.raises(TypeError): + x[[]] = val + with pytest.raises(IndexError): dpt.put(x, ind1, val, axis=0) + with pytest.raises(IndexError): + x[ind1] = val with pytest.raises(TypeError): dpt.put(x, ind0, dict(), axis=0) + with pytest.raises(TypeError): + x[ind0] = dict() with pytest.raises(ValueError): dpt.put(x, ind0, val, mode=0) From e296d873229ef8e2cf1af0b3f7b1f5301cc1e4b4 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Mon, 27 Feb 2023 01:51:59 -0800 Subject: [PATCH 27/57] Moved advanced_indexing pointer range validation --- .../libtensor/source/advanced_indexing.cpp | 46 +++++++++---------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/dpctl/tensor/libtensor/source/advanced_indexing.cpp b/dpctl/tensor/libtensor/source/advanced_indexing.cpp index fed5d543ed..39f62a501a 100644 --- a/dpctl/tensor/libtensor/source/advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/advanced_indexing.cpp @@ -99,8 +99,6 @@ std::vector _populate_packed_shapes_strides_for_indexing( std::shared_ptr packed_host_axes_shapes_strides_shp = std::make_shared(2 * k + along_sh_elems, allocator); - // can be made more efficient by checking if inp_nd > 1, then performing - // same treatment of orthog_sh_elems as for 0D (orthog will not exist) if (inp_nd > 0) { std::copy(inp_shape, inp_shape + axis_start, packed_host_shapes_strides_shp->begin()); @@ -403,6 +401,17 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, } } + // destination must be ample enough to accommodate all elements + { + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if ((range + 1) < (orthog_nelems * ind_nelems)) { + throw py::value_error( + "Destination array can not accommodate all the " + "elements of source array."); + } + } + auto ind_sh_elems = (ind_nd > 0) ? 
ind_nd : 1; std::vector ind_ptrs; @@ -580,17 +589,6 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, const py::ssize_t *src_strides = src.get_strides_raw(); const py::ssize_t *dst_strides = dst.get_strides_raw(); - // destination must be ample enough to accommodate all elements - { - size_t range = - static_cast(dst_offsets.second - dst_offsets.first); - if ((range + 1) < (orthog_nelems * ind_nelems)) { - throw py::value_error( - "Destination array can not accommodate all the " - "elements of source array."); - } - } - // packed_shapes_strides = [src_shape[:axis] + src_shape[axis+k:], // src_strides[:axis] + src_strides[axis+k:], // dst_strides[:axis] + dst_strides[axis+k:]] @@ -765,6 +763,17 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, throw py::value_error("Arrays index overlapping segments of memory"); } + // destination must be ample enough to accommodate all possible elements + { + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if ((range + 1) < dst_nelems) { + throw py::value_error( + "Destination array can not accommodate all the " + "elements of source array."); + } + } + int dst_typenum = dst.get_typenum(); int val_typenum = val.get_typenum(); @@ -965,17 +974,6 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, const py::ssize_t *dst_strides = dst.get_strides_raw(); const py::ssize_t *val_strides = val.get_strides_raw(); - // destination must be ample enough to accommodate all possible elements - { - size_t range = - static_cast(dst_offsets.second - dst_offsets.first); - if ((range + 1) < dst_nelems) { - throw py::value_error( - "Destination array can not accommodate all the " - "elements of source array."); - } - } - // packed_shapes_strides = [dst_shape[:axis] + dst_shape[axis+k:], // dst_strides[:axis] + dst_strides[axis+k:], // val_strides[:axis] + val_strides[axis+k:]] From 0cf7ba4cabeca0bdfad95f6fa7455f70432061cd Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Mon, 27 Feb 2023 23:13:14 -0800 Subject: [PATCH 28/57] Fixed typo in advanced_indexing kernels --- dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp index 1e205c658c..093d88706f 100644 --- a/dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp @@ -150,7 +150,7 @@ template class TakeFunctor src_orthog_idx += i * axes_shape_and_strides_[k_ + axis_idx]; } py::ssize_t ind_dst_idx(0); - ind_indxr.get_displacement( + ind_indxr.get_displacement( static_cast(i_along), ind_shape_and_strides_, axes_shape_and_strides_ + (2 * k_), ind_dst_idx); @@ -287,7 +287,7 @@ template class PutFunctor dst_orthog_idx += i * axes_shape_and_strides_[k_ + axis_idx]; } py::ssize_t ind_val_idx(0); - ind_indxr.get_displacement( + ind_indxr.get_displacement( static_cast(i_along), ind_shape_and_strides_, axes_shape_and_strides_ + (2 * k_), ind_val_idx); From fc46303dd64b2700c4c3d26b2440e604878492b9 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 27 Feb 2023 09:28:59 -0600 Subject: [PATCH 29/57] Renamed advance_indexing.*pp into integer_advanced_indexing.*pp Streamlined call operator implementation for projection classes. Added missing includes. 
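
For reference, a minimal sketch of the clip/wrap projection semantics the
streamlined call operators implement (illustrative stand-alone helpers with
assumed names, not code taken from this patch):

    #include <algorithm>
    #include <cstdint>

    // "clip" clamps into [0, n-1], accepting negative (from-the-end)
    // indices in [-n, -1]; "wrap" is one way to realize modulo-style
    // wrap-around of the index onto the axis length.
    inline std::int64_t clip_index(std::int64_t ind, std::int64_t n)
    {
        n = std::max<std::int64_t>(n, 1);
        ind = std::clamp<std::int64_t>(ind, -n, n - 1);
        return (ind < 0) ? ind + n : ind;
    }

    inline std::int64_t wrap_index(std::int64_t ind, std::int64_t n)
    {
        n = std::max<std::int64_t>(n, 1);
        std::int64_t r = ind % n;      // may be negative in C++
        return (r < 0) ? r + n : r;    // shift into [0, n - 1]
    }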
--- dpctl/tensor/CMakeLists.txt | 2 +- ...xing.hpp => integer_advanced_indexing.hpp} | 16 +++++++-------- ...xing.cpp => integer_advanced_indexing.cpp} | 20 ++++++++++++------- ...xing.hpp => integer_advanced_indexing.hpp} | 0 dpctl/tensor/libtensor/source/tensor_py.cpp | 2 +- 5 files changed, 22 insertions(+), 18 deletions(-) rename dpctl/tensor/libtensor/include/kernels/{advanced_indexing.hpp => integer_advanced_indexing.hpp} (96%) rename dpctl/tensor/libtensor/source/{advanced_indexing.cpp => integer_advanced_indexing.cpp} (98%) rename dpctl/tensor/libtensor/source/{advanced_indexing.hpp => integer_advanced_indexing.hpp} (100%) diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index 622473a0d8..3f5780cd75 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -31,7 +31,7 @@ pybind11_add_module(${python_module_name} MODULE ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/advanced_indexing.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp diff --git a/dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp similarity index 96% rename from dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp rename to dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp index 093d88706f..a239691c80 100644 --- a/dpctl/tensor/libtensor/include/kernels/advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp @@ -26,6 +26,7 @@ #include "utils/strided_iters.hpp" #include "utils/type_utils.hpp" #include +#include #include #include #include @@ -52,10 +53,9 @@ class ClipIndex void operator()(py::ssize_t max_item, py::ssize_t &ind) const { - max_item = (max_item > 0) ? max_item : 1; - ind = (ind < 0) ? (ind <= -max_item) ? (0) : (ind + max_item) - : (ind >= max_item) ? (max_item - 1) - : ind; + max_item = std::max(max_item, 1); + ind = std::clamp(ind, -max_item, max_item - 1); + ind = (ind < 0) ? ind + max_item : ind; return; } }; @@ -67,10 +67,8 @@ class WrapIndex void operator()(py::ssize_t max_item, py::ssize_t &ind) const { - max_item = (max_item > 0) ? max_item : 1; - ind = (ind < 0) ? max_item - (-ind % max_item) - : (ind >= max_item) ? 
ind % max_item - : ind; + max_item = std::max(max_item, 1); + ind = ind % max_item; return; } }; @@ -136,9 +134,9 @@ template class TakeFunctor dst_orthog_idx); ProjectorT proj{}; - py::ssize_t ind_arr_idx(0); CIndexer_vector ind_indxr(ind_nd_); for (int axis_idx = 0; axis_idx < k_; ++axis_idx) { + py::ssize_t ind_arr_idx(0); ind_indxr.get_displacement( static_cast(i_along), ind_shape_and_strides_, ind_shape_and_strides_ + ((axis_idx + 1) * ind_nd_), diff --git a/dpctl/tensor/libtensor/source/advanced_indexing.cpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp similarity index 98% rename from dpctl/tensor/libtensor/source/advanced_indexing.cpp rename to dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp index 39f62a501a..9c9840e0de 100644 --- a/dpctl/tensor/libtensor/source/advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -27,16 +27,19 @@ #include #include #include +#include #include #include #include #include #include "dpctl4pybind11.hpp" -#include "kernels/advanced_indexing.hpp" +#include "kernels/integer_advanced_indexing.hpp" #include "utils/type_dispatch.hpp" #include "utils/type_utils.hpp" +#include "integer_advanced_indexing.hpp" + #define INDEXING_MODES 2 #define CLIP_MODE 0 #define WRAP_MODE 1 @@ -85,8 +88,8 @@ std::vector _populate_packed_shapes_strides_for_indexing( int arr_nd) { - int orthog_sh_elems = ((inp_nd - k) > 1) ? (inp_nd - k) : 1; - int along_sh_elems = (ind_nd > 1) ? ind_nd : 1; + int orthog_sh_elems = std::max(inp_nd - k, 1); + int along_sh_elems = std::max(ind_nd, 1); using usm_host_allocatorT = sycl::usm_allocator; @@ -284,7 +287,7 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, int axis_start, uint8_t mode, sycl::queue exec_q, - const std::vector &depends = {}) + const std::vector &depends) { int k = ind.size(); @@ -328,7 +331,7 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, const py::ssize_t *src_shape = src.get_shape_raw(); const py::ssize_t *dst_shape = dst.get_shape_raw(); - int orthog_nd = ((src_nd - k) > 0) ? src_nd - k : 1; + int orthog_nd = std::max(src_nd - k, 1); bool orthog_shapes_equal(true); size_t orthog_nelems(1); @@ -412,7 +415,7 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, } } - auto ind_sh_elems = (ind_nd > 0) ? 
ind_nd : 1; + int ind_sh_elems = std::max(ind_nd, 1); std::vector ind_ptrs; ind_ptrs.reserve(k); @@ -633,12 +636,15 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, std::to_string(ind_type_id)); } + std::cout << "Submitting take" << std::endl; sycl::event take_generic_ev = fn(exec_q, orthog_nelems, ind_nelems, orthog_nd, ind_nd, k, packed_shapes_strides, packed_axes_shapes_strides, packed_ind_shapes_strides, src_data, dst_data, packed_ind_ptrs, src_offset, dst_offset, packed_ind_offsets, all_deps); + std::cout << "Submitting take clean-up host task" << std::endl; + // free packed temporaries auto ctx = exec_q.get_context(); exec_q.submit([&](sycl::handler &cgh) { @@ -666,7 +672,7 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, int axis_start, uint8_t mode, sycl::queue exec_q, - const std::vector &depends = {}) + const std::vector &depends) { int k = ind.size(); diff --git a/dpctl/tensor/libtensor/source/advanced_indexing.hpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.hpp similarity index 100% rename from dpctl/tensor/libtensor/source/advanced_indexing.hpp rename to dpctl/tensor/libtensor/source/integer_advanced_indexing.hpp diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp index 94458bccf9..e164be2421 100644 --- a/dpctl/tensor/libtensor/source/tensor_py.cpp +++ b/dpctl/tensor/libtensor/source/tensor_py.cpp @@ -33,13 +33,13 @@ #include "dpctl4pybind11.hpp" -#include "advanced_indexing.hpp" #include "copy_and_cast_usm_to_usm.hpp" #include "copy_for_reshape.hpp" #include "copy_numpy_ndarray_into_usm_ndarray.hpp" #include "device_support_queries.hpp" #include "eye_ctor.hpp" #include "full_ctor.hpp" +#include "integer_advanced_indexing.hpp" #include "linear_sequences.hpp" #include "triul_ctor.hpp" #include "utils/strided_iters.hpp" From 51e0fbbac997421572cdacf909e892239484950a Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 27 Feb 2023 11:48:04 -0600 Subject: [PATCH 30/57] Initialize packed shape+strides data with zeros --- dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp index 9c9840e0de..6b053d7f77 100644 --- a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -97,10 +97,10 @@ std::vector _populate_packed_shapes_strides_for_indexing( usm_host_allocatorT allocator(exec_q); std::shared_ptr packed_host_shapes_strides_shp = - std::make_shared(3 * orthog_sh_elems, allocator); + std::make_shared(3 * orthog_sh_elems, 0, allocator); std::shared_ptr packed_host_axes_shapes_strides_shp = - std::make_shared(2 * k + along_sh_elems, allocator); + std::make_shared(2 * k + along_sh_elems, 0, allocator); if (inp_nd > 0) { std::copy(inp_shape, inp_shape + axis_start, From 84ba81a47c16d7d40eecb6fa6a7e779432c3a09d Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 27 Feb 2023 15:41:56 -0600 Subject: [PATCH 31/57] Ensure that indices are also kept alive --- .../source/integer_advanced_indexing.cpp | 51 +++++++++++++++---- .../source/integer_advanced_indexing.hpp | 30 +++++------ 2 files changed, 57 insertions(+), 24 deletions(-) diff --git a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp index 6b053d7f77..983eaa9b6f 100644 --- 
a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include @@ -280,15 +279,50 @@ std::vector _populate_packed_shapes_strides_for_indexing( } } +/* Utility to parse python object py_ind into vector of `usm_ndarray`s */ +std::vector parse_py_ind(const sycl::queue &q, + py::object py_ind) +{ + size_t ind_count = py::len(py_ind); + std::vector res; + res.reserve(ind_count); + + bool acquired = false; + int nd = -1; + for (size_t i = 0; i < ind_count; ++i) { + auto el_i = py_ind[py::cast(i)]; + auto arr_i = py::cast(el_i); + if (!dpctl::utils::queues_are_compatible(q, {arr_i})) { + throw py::value_error("Index allocation queue is not compatible " + "with execution queue"); + } + if (acquired) { + if (nd != arr_i.get_ndim()) { + throw py::value_error( + "Indices must have the same number of dimensions."); + } + } + else { + acquired = true; + nd = arr_i.get_ndim(); + } + res.push_back(arr_i); + } + + return res; +} + std::pair usm_ndarray_take(dpctl::tensor::usm_ndarray src, - std::vector ind, + py::object py_ind, dpctl::tensor::usm_ndarray dst, int axis_start, uint8_t mode, sycl::queue exec_q, const std::vector &depends) { + std::vector ind = parse_py_ind(exec_q, py_ind); + int k = ind.size(); if (k == 0) { @@ -636,15 +670,12 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, std::to_string(ind_type_id)); } - std::cout << "Submitting take" << std::endl; sycl::event take_generic_ev = fn(exec_q, orthog_nelems, ind_nelems, orthog_nd, ind_nd, k, packed_shapes_strides, packed_axes_shapes_strides, packed_ind_shapes_strides, src_data, dst_data, packed_ind_ptrs, src_offset, dst_offset, packed_ind_offsets, all_deps); - std::cout << "Submitting take clean-up host task" << std::endl; - // free packed temporaries auto ctx = exec_q.get_context(); exec_q.submit([&](sycl::handler &cgh) { @@ -661,19 +692,20 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, }); return std::make_pair( - keep_args_alive(exec_q, {src, dst}, {take_generic_ev}), + keep_args_alive(exec_q, {src, py_ind, dst}, {take_generic_ev}), take_generic_ev); } std::pair usm_ndarray_put(dpctl::tensor::usm_ndarray dst, - std::vector ind, + py::object py_ind, dpctl::tensor::usm_ndarray val, int axis_start, uint8_t mode, sycl::queue exec_q, const std::vector &depends) { + std::vector ind = parse_py_ind(exec_q, py_ind); int k = ind.size(); if (k == 0) { @@ -1046,8 +1078,9 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, }); }); - return std::make_pair(keep_args_alive(exec_q, {dst, val}, {put_generic_ev}), - put_generic_ev); + return std::make_pair( + keep_args_alive(exec_q, {dst, py_ind, val}, {put_generic_ev}), + put_generic_ev); } void init_advanced_indexing_dispatch_tables(void) diff --git a/dpctl/tensor/libtensor/source/integer_advanced_indexing.hpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.hpp index d99d4f1828..c6d5ed74b8 100644 --- a/dpctl/tensor/libtensor/source/integer_advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/source/integer_advanced_indexing.hpp @@ -38,24 +38,24 @@ namespace py_internal { extern std::pair -usm_ndarray_take(dpctl::tensor::usm_ndarray src, - std::vector ind, - dpctl::tensor::usm_ndarray dst, - int axis_start, - uint8_t mode, - sycl::queue exec_q, - const std::vector &depends = {}); +usm_ndarray_take(dpctl::tensor::usm_ndarray, + py::object, + dpctl::tensor::usm_ndarray, + int, + uint8_t, + sycl::queue, + const std::vector & = {}); extern std::pair 
-usm_ndarray_put(dpctl::tensor::usm_ndarray dst, - std::vector ind, - dpctl::tensor::usm_ndarray val, - int axis_start, - uint8_t mode, - sycl::queue exec_q, - const std::vector &depends = {}); +usm_ndarray_put(dpctl::tensor::usm_ndarray, + py::object, + dpctl::tensor::usm_ndarray, + int, + uint8_t, + sycl::queue, + const std::vector & = {}); -extern void init_advanced_indexing_dispatch_tables(); +extern void init_advanced_indexing_dispatch_tables(void); } // namespace py_internal } // namespace tensor From 56bb65fc6c7a7ba026397d5d42ddccfa2e42a5a8 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 28 Feb 2023 12:32:50 -0600 Subject: [PATCH 32/57] Moved ctx creation into host-task-dispatching handler function. --- .../libtensor/source/integer_advanced_indexing.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp index 983eaa9b6f..510ee2c554 100644 --- a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -677,9 +677,9 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, src_offset, dst_offset, packed_ind_offsets, all_deps); // free packed temporaries - auto ctx = exec_q.get_context(); exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(take_generic_ev); + auto ctx = exec_q.get_context(); cgh.host_task([packed_shapes_strides, packed_axes_shapes_strides, packed_ind_shapes_strides, packed_ind_ptrs, packed_ind_offsets, ctx]() { @@ -691,9 +691,10 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, }); }); - return std::make_pair( - keep_args_alive(exec_q, {src, py_ind, dst}, {take_generic_ev}), - take_generic_ev); + sycl::event host_task_ev = + keep_args_alive(exec_q, {src, py_ind, dst}, {take_generic_ev}); + + return std::make_pair(host_task_ev, take_generic_ev); } std::pair From 24d7839ce74e8893b1dc6f3566658a919f4ceef5 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Wed, 1 Mar 2023 09:46:55 -0800 Subject: [PATCH 33/57] Prevent dangling host tasks in indexing functions - Host tasks are now collected and kept alive --- .../source/integer_advanced_indexing.cpp | 73 ++++++++++++------- 1 file changed, 45 insertions(+), 28 deletions(-) diff --git a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp index 510ee2c554..7649fcfba2 100644 --- a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -70,6 +70,7 @@ using dpctl::utils::keep_args_alive; std::vector _populate_packed_shapes_strides_for_indexing( sycl::queue exec_q, + std::vector &host_task_events, py::ssize_t *device_orthog_shapes_strides, py::ssize_t *device_axes_shapes_strides, const py::ssize_t *inp_shape, @@ -210,20 +211,21 @@ std::vector _populate_packed_shapes_strides_for_indexing( exec_q.copy(packed_host_shapes_strides_shp->data(), device_orthog_shapes_strides, packed_host_shapes_strides_shp->size()); - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(device_orthog_shapes_strides_copy_ev); - cgh.host_task([packed_host_shapes_strides_shp] {}); - }); sycl::event device_axes_shapes_strides_copy_ev = exec_q.copy( packed_host_axes_shapes_strides_shp->data(), device_axes_shapes_strides, packed_host_axes_shapes_strides_shp->size()); - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(device_axes_shapes_strides_copy_ev); - 
cgh.host_task([packed_host_axes_shapes_strides_shp]() {}); - }); + + sycl::event clean_up_host_task_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(device_axes_shapes_strides_copy_ev); + cgh.depends_on(device_orthog_shapes_strides_copy_ev); + cgh.host_task([packed_host_axes_shapes_strides_shp, + packed_host_shapes_strides_shp]() {}); + }); + host_task_events.push_back(clean_up_host_task_ev); std::vector v = {device_orthog_shapes_strides_copy_ev, device_axes_shapes_strides_copy_ev}; @@ -268,10 +270,13 @@ std::vector _populate_packed_shapes_strides_for_indexing( packed_host_axes_shapes_strides_shp->data(), device_axes_shapes_strides, packed_host_axes_shapes_strides_shp->size()); - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(device_axes_shapes_strides_copy_ev); - cgh.host_task([packed_host_axes_shapes_strides_shp]() {}); - }); + + sycl::event clean_up_host_task_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(device_axes_shapes_strides_copy_ev); + cgh.host_task([packed_host_axes_shapes_strides_shp]() {}); + }); + host_task_events.push_back(clean_up_host_task_ev); std::vector v = {device_orthog_shapes_strides_fill_ev, device_axes_shapes_strides_copy_ev}; @@ -590,28 +595,33 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, std::copy(ind_offsets.begin(), ind_offsets.end(), host_ind_offsets_shp->begin()); + std::vector host_task_events(5); + sycl::event packed_ind_ptrs_copy_ev = exec_q.copy( host_ind_ptrs_shp->data(), packed_ind_ptrs, host_ind_ptrs_shp->size()); - exec_q.submit([&](sycl::handler &cgh) { + sycl::event ind_ptrs_host_task = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(packed_ind_ptrs_copy_ev); cgh.host_task([host_ind_ptrs_shp]() {}); }); + host_task_events.push_back(ind_ptrs_host_task); sycl::event packed_ind_shapes_strides_copy_ev = exec_q.copy( host_ind_shapes_strides_shp->data(), packed_ind_shapes_strides, host_ind_shapes_strides_shp->size()); - exec_q.submit([&](sycl::handler &cgh) { + sycl::event ind_sh_st_host_task = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(packed_ind_shapes_strides_copy_ev); cgh.host_task([host_ind_shapes_strides_shp]() {}); }); + host_task_events.push_back(ind_sh_st_host_task); sycl::event packed_ind_offsets_copy_ev = exec_q.copy( host_ind_offsets_shp->data(), packed_ind_offsets, host_ind_offsets_shp->size()); - exec_q.submit([&](sycl::handler &cgh) { + sycl::event ind_offsets_host_task = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(packed_ind_offsets_copy_ev); cgh.host_task([host_ind_offsets_shp]() {}); }); + host_task_events.push_back(ind_offsets_host_task); std::vector ind_pack_depends{packed_ind_ptrs_copy_ev, packed_ind_shapes_strides_copy_ev, @@ -650,10 +660,10 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, std::vector src_dst_pack_deps = _populate_packed_shapes_strides_for_indexing( - exec_q, packed_shapes_strides, packed_axes_shapes_strides, - src_shape, src_strides, is_src_c_contig, is_src_f_contig, dst_shape, - dst_strides, is_dst_c_contig, is_dst_f_contig, axis_start, k, - ind_nd, src_nd, dst_nd); + exec_q, host_task_events, packed_shapes_strides, + packed_axes_shapes_strides, src_shape, src_strides, is_src_c_contig, + is_src_f_contig, dst_shape, dst_strides, is_dst_c_contig, + is_dst_f_contig, axis_start, k, ind_nd, src_nd, dst_nd); std::vector all_deps(depends.size() + ind_pack_depends.size() + src_dst_pack_deps.size()); @@ -690,9 +700,10 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, sycl::free(packed_ind_offsets, ctx); }); }); + 
host_task_events.push_back(take_generic_ev); sycl::event host_task_ev = - keep_args_alive(exec_q, {src, py_ind, dst}, {take_generic_ev}); + keep_args_alive(exec_q, {src, py_ind, dst}, host_task_events); return std::make_pair(host_task_ev, take_generic_ev); } @@ -977,28 +988,33 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, std::copy(ind_offsets.begin(), ind_offsets.end(), host_ind_offsets_shp->begin()); + std::vector host_task_events(5); + sycl::event device_ind_ptrs_copy_ev = exec_q.copy( host_ind_ptrs_shp->data(), packed_ind_ptrs, host_ind_ptrs_shp->size()); - exec_q.submit([&](sycl::handler &cgh) { + sycl::event ind_ptrs_host_task = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(device_ind_ptrs_copy_ev); cgh.host_task([host_ind_ptrs_shp]() {}); }); + host_task_events.push_back(ind_ptrs_host_task); sycl::event device_ind_shapes_strides_copy_ev = exec_q.copy( host_ind_shapes_strides_shp->data(), packed_ind_shapes_strides, host_ind_shapes_strides_shp->size()); - exec_q.submit([&](sycl::handler &cgh) { + sycl::event ind_sh_st_host_task = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(device_ind_shapes_strides_copy_ev); cgh.host_task([host_ind_shapes_strides_shp]() {}); }); + host_task_events.push_back(ind_sh_st_host_task); sycl::event device_ind_offsets_copy_ev = exec_q.copy( host_ind_offsets_shp->data(), packed_ind_offsets, host_ind_offsets_shp->size()); - exec_q.submit([&](sycl::handler &cgh) { + sycl::event ind_offsets_host_task = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(device_ind_offsets_copy_ev); cgh.host_task([host_ind_offsets_shp]() {}); }); + host_task_events.push_back(ind_offsets_host_task); std::vector ind_pack_depends{device_ind_ptrs_copy_ev, device_ind_shapes_strides_copy_ev, @@ -1037,10 +1053,10 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, std::vector copy_shapes_strides_deps = _populate_packed_shapes_strides_for_indexing( - exec_q, packed_shapes_strides, packed_axes_shapes_strides, - dst_shape, dst_strides, is_dst_c_contig, is_dst_f_contig, val_shape, - val_strides, is_val_c_contig, is_val_f_contig, axis_start, k, - ind_nd, dst_nd, val_nd); + exec_q, host_task_events, packed_shapes_strides, + packed_axes_shapes_strides, dst_shape, dst_strides, is_dst_c_contig, + is_dst_f_contig, val_shape, val_strides, is_val_c_contig, + is_val_f_contig, axis_start, k, ind_nd, dst_nd, val_nd); std::vector all_deps(depends.size() + copy_shapes_strides_deps.size() + @@ -1078,9 +1094,10 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, sycl::free(packed_ind_offsets, ctx); }); }); + host_task_events.push_back(put_generic_ev); return std::make_pair( - keep_args_alive(exec_q, {dst, py_ind, val}, {put_generic_ev}), + keep_args_alive(exec_q, {dst, py_ind, val}, host_task_events), put_generic_ev); } From f84239fce8636c0330f750d9750be960630adec5 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 1 Mar 2023 16:11:29 -0600 Subject: [PATCH 34/57] Use py::gil_scoped_acquire instead of PyGILState_Ensure. 
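
The motivation: a host_task that touches CPython reference counts must
hold the GIL, and pybind11's RAII guard handles acquisition and release
without manual PyGILState bookkeeping.  A minimal sketch of the pattern
(simplified, with a hypothetical helper name, not the actual
keep_args_alive signature):

    #include <memory>
    #include <CL/sycl.hpp>
    #include <pybind11/pybind11.h>

    namespace py = pybind11;

    // Keep `obj` alive until `dep` completes, then drop the reference
    // from a host task while holding the GIL.
    sycl::event release_when_done(sycl::queue q, py::object obj, sycl::event dep)
    {
        auto h = std::make_shared<py::handle>(obj.release());
        return q.submit([&](sycl::handler &cgh) {
            cgh.depends_on(dep);
            cgh.host_task([h]() {
                py::gil_scoped_acquire acquire;
                h->dec_ref();
            });
        });
    }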
--- dpctl/apis/include/dpctl4pybind11.hpp | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/dpctl/apis/include/dpctl4pybind11.hpp b/dpctl/apis/include/dpctl4pybind11.hpp index 921f231aa1..04b9f5d919 100644 --- a/dpctl/apis/include/dpctl4pybind11.hpp +++ b/dpctl/apis/include/dpctl4pybind11.hpp @@ -1000,14 +1000,10 @@ sycl::event keep_args_alive(sycl::queue q, shp_arr[i]->inc_ref(); } cgh.host_task([=]() { - bool guard = (Py_IsInitialized() && !_Py_IsFinalizing()); - if (guard) { - PyGILState_STATE gstate; - gstate = PyGILState_Ensure(); - for (std::size_t i = 0; i < num; ++i) { - shp_arr[i]->dec_ref(); - } - PyGILState_Release(gstate); + py::gil_scoped_acquire acquire; + + for (std::size_t i = 0; i < num; ++i) { + shp_arr[i]->dec_ref(); } }); }); From b69a415d4ccec75e61d0b1e957f4031020c5bb58 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 1 Mar 2023 16:14:18 -0600 Subject: [PATCH 35/57] Make both _take and _put effectively synchronous They still return a pair of events, but those are always in a compelte state. --- .../source/integer_advanced_indexing.cpp | 123 +++++++++++++----- 1 file changed, 88 insertions(+), 35 deletions(-) diff --git a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp index 7649fcfba2..77c34f0a0b 100644 --- a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -143,6 +143,8 @@ std::vector _populate_packed_shapes_strides_for_indexing( packed_host_axes_shapes_strides_shp->begin() + k); } else { + // FIXME: this pointer was not allocated in this function + // the caller should be freeing it sycl::free(device_orthog_shapes_strides, exec_q); throw std::runtime_error("Invalid array encountered"); } @@ -190,6 +192,8 @@ std::vector _populate_packed_shapes_strides_for_indexing( packed_host_axes_shapes_strides_shp->begin() + 2 * k); } else { + // FIXME: this pointer was not allocated in this function + // the caller should be freeing it sycl::free(device_orthog_shapes_strides, exec_q); throw std::runtime_error("Invalid array encountered"); } @@ -255,6 +259,8 @@ std::vector _populate_packed_shapes_strides_for_indexing( packed_host_axes_shapes_strides_shp->begin() + 2); } else { + // FIXME: memory was not allocated in this function + // it should be freed by the caller sycl::free(device_orthog_shapes_strides, exec_q); throw std::runtime_error("Invalid array encountered"); } @@ -292,23 +298,24 @@ std::vector parse_py_ind(const sycl::queue &q, std::vector res; res.reserve(ind_count); - bool acquired = false; + bool nd_is_known = false; int nd = -1; for (size_t i = 0; i < ind_count; ++i) { - auto el_i = py_ind[py::cast(i)]; - auto arr_i = py::cast(el_i); + py::object el_i = py_ind[py::cast(i)]; + dpctl::tensor::usm_ndarray arr_i = + py::cast(el_i); if (!dpctl::utils::queues_are_compatible(q, {arr_i})) { throw py::value_error("Index allocation queue is not compatible " "with execution queue"); } - if (acquired) { + if (nd_is_known) { if (nd != arr_i.get_ndim()) { throw py::value_error( "Indices must have the same number of dimensions."); } } else { - acquired = true; + nd_is_known = true; nd = arr_i.get_ndim(); } res.push_back(arr_i); @@ -558,6 +565,7 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, sycl::malloc_device((k + 1) * ind_sh_elems, exec_q); if (packed_ind_shapes_strides == nullptr) { + sycl::free(packed_ind_ptrs, exec_q); throw std::runtime_error( "Unable to allocate 
packed_ind_shapes_strides device memory"); } @@ -566,6 +574,8 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, sycl::malloc_device(k, exec_q); if (packed_ind_offsets == nullptr) { + sycl::free(packed_ind_ptrs, exec_q); + sycl::free(packed_ind_shapes_strides, exec_q); throw std::runtime_error( "Unable to allocate packed_ind_offsets device memory"); } @@ -595,33 +605,29 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, std::copy(ind_offsets.begin(), ind_offsets.end(), host_ind_offsets_shp->begin()); - std::vector host_task_events(5); + std::vector host_task_events; + host_task_events.reserve(5); sycl::event packed_ind_ptrs_copy_ev = exec_q.copy( host_ind_ptrs_shp->data(), packed_ind_ptrs, host_ind_ptrs_shp->size()); - sycl::event ind_ptrs_host_task = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(packed_ind_ptrs_copy_ev); - cgh.host_task([host_ind_ptrs_shp]() {}); - }); - host_task_events.push_back(ind_ptrs_host_task); sycl::event packed_ind_shapes_strides_copy_ev = exec_q.copy( host_ind_shapes_strides_shp->data(), packed_ind_shapes_strides, host_ind_shapes_strides_shp->size()); - sycl::event ind_sh_st_host_task = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(packed_ind_shapes_strides_copy_ev); - cgh.host_task([host_ind_shapes_strides_shp]() {}); - }); - host_task_events.push_back(ind_sh_st_host_task); sycl::event packed_ind_offsets_copy_ev = exec_q.copy( host_ind_offsets_shp->data(), packed_ind_offsets, host_ind_offsets_shp->size()); - sycl::event ind_offsets_host_task = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(packed_ind_offsets_copy_ev); - cgh.host_task([host_ind_offsets_shp]() {}); - }); - host_task_events.push_back(ind_offsets_host_task); + + sycl::event shared_ptr_cleanup_host_task = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on({packed_ind_offsets_copy_ev, + packed_ind_shapes_strides_copy_ev, + packed_ind_ptrs_copy_ev}); + cgh.host_task([host_ind_offsets_shp, host_ind_shapes_strides_shp, + host_ind_ptrs_shp]() {}); + }); + host_task_events.push_back(shared_ptr_cleanup_host_task); std::vector ind_pack_depends{packed_ind_ptrs_copy_ev, packed_ind_shapes_strides_copy_ev, @@ -643,6 +649,10 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, sycl::malloc_device(3 * sh_elems, exec_q); if (packed_shapes_strides == nullptr) { + sycl::event::wait(host_task_events); + sycl::free(packed_ind_ptrs, exec_q); + sycl::free(packed_ind_shapes_strides, exec_q); + sycl::free(packed_ind_offsets, exec_q); throw std::runtime_error( "Unable to allocate packed_shapes_strides device memory"); } @@ -654,6 +664,11 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, sycl::malloc_device((2 * k) + ind_sh_elems, exec_q); if (packed_axes_shapes_strides == nullptr) { + sycl::event::wait(host_task_events); + sycl::free(packed_ind_ptrs, exec_q); + sycl::free(packed_ind_shapes_strides, exec_q); + sycl::free(packed_ind_offsets, exec_q); + sycl::free(packed_shapes_strides, exec_q); throw std::runtime_error( "Unable to allocate packed_axes_shapes_strides device memory"); } @@ -665,8 +680,9 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, is_src_f_contig, dst_shape, dst_strides, is_dst_c_contig, is_dst_f_contig, axis_start, k, ind_nd, src_nd, dst_nd); - std::vector all_deps(depends.size() + ind_pack_depends.size() + - src_dst_pack_deps.size()); + std::vector all_deps; + all_deps.reserve(depends.size() + ind_pack_depends.size() + + src_dst_pack_deps.size()); all_deps.insert(std::end(all_deps), std::begin(ind_pack_depends), std::end(ind_pack_depends)); 
all_deps.insert(std::end(all_deps), std::begin(src_dst_pack_deps), @@ -676,6 +692,12 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, auto fn = take_dispatch_table[mode][src_type_id][ind_type_id]; if (fn == nullptr) { + sycl::event::wait(host_task_events); + sycl::free(packed_ind_ptrs, exec_q); + sycl::free(packed_ind_shapes_strides, exec_q); + sycl::free(packed_ind_offsets, exec_q); + sycl::free(packed_shapes_strides, exec_q); + sycl::free(packed_axes_shapes_strides, exec_q); throw std::runtime_error("Indices must be integer type, got " + std::to_string(ind_type_id)); } @@ -687,7 +709,7 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, src_offset, dst_offset, packed_ind_offsets, all_deps); // free packed temporaries - exec_q.submit([&](sycl::handler &cgh) { + sycl::event temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(take_generic_ev); auto ctx = exec_q.get_context(); cgh.host_task([packed_shapes_strides, packed_axes_shapes_strides, @@ -700,12 +722,16 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, sycl::free(packed_ind_offsets, ctx); }); }); - host_task_events.push_back(take_generic_ev); - sycl::event host_task_ev = - keep_args_alive(exec_q, {src, py_ind, dst}, host_task_events); + sycl::event::wait(host_task_events); + sycl::event::wait({take_generic_ev, temporaries_cleanup_ev}); + + /* + sycl::event host_task_ev = keep_args_alive(exec_q, {src, py_ind, dst}, + {temporaries_cleanup_ev}); + */ - return std::make_pair(host_task_ev, take_generic_ev); + return std::make_pair(sycl::event(), temporaries_cleanup_ev); } std::pair @@ -951,6 +977,7 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, sycl::malloc_device((k + 1) * ind_sh_elems, exec_q); if (packed_ind_shapes_strides == nullptr) { + sycl::free(packed_ind_ptrs, exec_q); throw std::runtime_error( "Unable to allocate packed_ind_shapes_strides device memory"); } @@ -959,6 +986,8 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, sycl::malloc_device(k, exec_q); if (packed_ind_offsets == nullptr) { + sycl::free(packed_ind_ptrs, exec_q); + sycl::free(packed_ind_shapes_strides, exec_q); throw std::runtime_error( "Unable to allocate packed_ind_offsets device memory"); } @@ -988,7 +1017,8 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, std::copy(ind_offsets.begin(), ind_offsets.end(), host_ind_offsets_shp->begin()); - std::vector host_task_events(5); + std::vector host_task_events; + host_task_events.reserve(7); sycl::event device_ind_ptrs_copy_ev = exec_q.copy( host_ind_ptrs_shp->data(), packed_ind_ptrs, host_ind_ptrs_shp->size()); @@ -1036,6 +1066,10 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, sycl::malloc_device(3 * sh_elems, exec_q); if (packed_shapes_strides == nullptr) { + sycl::event::wait(host_task_events); + sycl::free(packed_ind_ptrs, exec_q); + sycl::free(packed_ind_shapes_strides, exec_q); + sycl::free(packed_ind_offsets, exec_q); throw std::runtime_error( "Unable to allocate packed_shapes_strides device memory"); } @@ -1047,6 +1081,11 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, sycl::malloc_device((2 * k) + ind_sh_elems, exec_q); if (packed_axes_shapes_strides == nullptr) { + sycl::event::wait(host_task_events); + sycl::free(packed_shapes_strides, exec_q); + sycl::free(packed_ind_ptrs, exec_q); + sycl::free(packed_ind_shapes_strides, exec_q); + sycl::free(packed_ind_offsets, exec_q); throw std::runtime_error( "Unable to allocate packed_axes_shapes_strides device memory"); } @@ -1070,6 +1109,13 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, auto fn = 
put_dispatch_table[mode][dst_type_id][ind_type_id]; if (fn == nullptr) { + sycl::event::wait(host_task_events); + sycl::free(packed_shapes_strides, exec_q); + sycl::free(packed_axes_shapes_strides, exec_q); + sycl::free(packed_ind_shapes_strides, exec_q); + sycl::free(packed_ind_ptrs, exec_q); + sycl::free(packed_ind_offsets, exec_q); + throw std::runtime_error("Indices must be integer type, got " + std::to_string(ind_type_id)); } @@ -1081,9 +1127,10 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, dst_offset, val_offset, packed_ind_offsets, all_deps); // free packed temporaries - auto ctx = exec_q.get_context(); - exec_q.submit([&](sycl::handler &cgh) { + + sycl::event temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(put_generic_ev); + auto ctx = exec_q.get_context(); cgh.host_task([packed_shapes_strides, packed_axes_shapes_strides, packed_ind_shapes_strides, packed_ind_ptrs, packed_ind_offsets, ctx]() { @@ -1094,11 +1141,17 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, sycl::free(packed_ind_offsets, ctx); }); }); - host_task_events.push_back(put_generic_ev); - return std::make_pair( - keep_args_alive(exec_q, {dst, py_ind, val}, host_task_events), - put_generic_ev); + sycl::event::wait(host_task_events); + sycl::event::wait({put_generic_ev, temporaries_cleanup_ev}); + + /* + sycl::event py_obj_cleanup_ev = + keep_args_alive(exec_q, {dst, py_ind, val}, + {put_generic_ev, temporaries_cleanup_ev}); + */ + + return std::make_pair(sycl::event(), temporaries_cleanup_ev); } void init_advanced_indexing_dispatch_tables(void) From f35734be1bdb6f670fc4377f9e60418b2350983a Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Wed, 1 Mar 2023 15:35:17 -0800 Subject: [PATCH 36/57] Simplified host_tasks in _put --- .../source/integer_advanced_indexing.cpp | 49 ++++++++----------- 1 file changed, 21 insertions(+), 28 deletions(-) diff --git a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp index 77c34f0a0b..b6ce79f4d0 100644 --- a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -599,22 +599,21 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, std::shared_ptr host_ind_offsets_shp = std::make_shared(k, ind_allocator); - std::copy(ind_sh_sts.begin(), ind_sh_sts.end(), - host_ind_shapes_strides_shp->begin()); - std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin()); - std::copy(ind_offsets.begin(), ind_offsets.end(), - host_ind_offsets_shp->begin()); - std::vector host_task_events; host_task_events.reserve(5); + std::copy(ind_sh_sts.begin(), ind_sh_sts.end(), + host_ind_shapes_strides_shp->begin()); sycl::event packed_ind_ptrs_copy_ev = exec_q.copy( host_ind_ptrs_shp->data(), packed_ind_ptrs, host_ind_ptrs_shp->size()); + std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin()); sycl::event packed_ind_shapes_strides_copy_ev = exec_q.copy( host_ind_shapes_strides_shp->data(), packed_ind_shapes_strides, host_ind_shapes_strides_shp->size()); + std::copy(ind_offsets.begin(), ind_offsets.end(), + host_ind_offsets_shp->begin()); sycl::event packed_ind_offsets_copy_ev = exec_q.copy( host_ind_offsets_shp->data(), packed_ind_offsets, host_ind_offsets_shp->size()); @@ -1011,40 +1010,34 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, std::shared_ptr host_ind_offsets_shp = std::make_shared(k, ind_allocator); - std::copy(ind_sh_sts.begin(), ind_sh_sts.end(), - 
host_ind_shapes_strides_shp->begin()); - std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin()); - std::copy(ind_offsets.begin(), ind_offsets.end(), - host_ind_offsets_shp->begin()); - std::vector host_task_events; - host_task_events.reserve(7); + host_task_events.reserve(5); + std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin()); sycl::event device_ind_ptrs_copy_ev = exec_q.copy( host_ind_ptrs_shp->data(), packed_ind_ptrs, host_ind_ptrs_shp->size()); - sycl::event ind_ptrs_host_task = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(device_ind_ptrs_copy_ev); - cgh.host_task([host_ind_ptrs_shp]() {}); - }); - host_task_events.push_back(ind_ptrs_host_task); + std::copy(ind_sh_sts.begin(), ind_sh_sts.end(), + host_ind_shapes_strides_shp->begin()); sycl::event device_ind_shapes_strides_copy_ev = exec_q.copy( host_ind_shapes_strides_shp->data(), packed_ind_shapes_strides, host_ind_shapes_strides_shp->size()); - sycl::event ind_sh_st_host_task = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(device_ind_shapes_strides_copy_ev); - cgh.host_task([host_ind_shapes_strides_shp]() {}); - }); - host_task_events.push_back(ind_sh_st_host_task); + std::copy(ind_offsets.begin(), ind_offsets.end(), + host_ind_offsets_shp->begin()); sycl::event device_ind_offsets_copy_ev = exec_q.copy( host_ind_offsets_shp->data(), packed_ind_offsets, host_ind_offsets_shp->size()); - sycl::event ind_offsets_host_task = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(device_ind_offsets_copy_ev); - cgh.host_task([host_ind_offsets_shp]() {}); - }); - host_task_events.push_back(ind_offsets_host_task); + + sycl::event shared_ptr_cleanup_host_task = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(device_ind_ptrs_copy_ev); + cgh.depends_on(device_ind_shapes_strides_copy_ev); + cgh.depends_on(device_ind_offsets_copy_ev); + cgh.host_task([host_ind_ptrs_shp, host_ind_shapes_strides_shp, + host_ind_offsets_shp]() {}); + }); + host_task_events.push_back(shared_ptr_cleanup_host_task); std::vector ind_pack_depends{device_ind_ptrs_copy_ev, device_ind_shapes_strides_copy_ev, From 7fee9e4867a22cf6770f9dc50740a32749bb7208 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Wed, 1 Mar 2023 17:05:27 -0800 Subject: [PATCH 37/57] Reordered copies in _take and _put - Segmentation fault occurred with other ordering --- .../source/integer_advanced_indexing.cpp | 44 ++++++++++--------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp index b6ce79f4d0..4a1b4e8ef0 100644 --- a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -599,21 +599,22 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, std::shared_ptr host_ind_offsets_shp = std::make_shared(k, ind_allocator); + std::copy(ind_sh_sts.begin(), ind_sh_sts.end(), + host_ind_shapes_strides_shp->begin()); + std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin()); + std::copy(ind_offsets.begin(), ind_offsets.end(), + host_ind_offsets_shp->begin()); + std::vector host_task_events; host_task_events.reserve(5); - std::copy(ind_sh_sts.begin(), ind_sh_sts.end(), - host_ind_shapes_strides_shp->begin()); sycl::event packed_ind_ptrs_copy_ev = exec_q.copy( host_ind_ptrs_shp->data(), packed_ind_ptrs, host_ind_ptrs_shp->size()); - std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin()); 
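
An aside on the hunk above, not part of the patch itself: the staging vectors such as host_ind_ptrs_shp are USM-host buffers that exec_q.copy reads asynchronously, so they must be fully populated before the copy is submitted; interleaving the std::copy calls after the submissions is the likely cause of the segmentation fault this commit message mentions. A minimal sketch of the intended pattern is below; it assumes the SYCL 2020 queue::copy and host_task APIs, and the names staged_copy, src and device_buf are illustrative only.

#include <sycl/sycl.hpp> // may be <CL/sycl.hpp> on older DPC++ toolchains
#include <algorithm>
#include <cstdint>
#include <memory>
#include <vector>

// Sketch of the copy-staging pattern used in _take/_put: fill the USM-host
// staging vector first, then submit the asynchronous host->device copy,
// and keep the vector alive with a host_task that depends on the copy.
void staged_copy(sycl::queue exec_q,
                 const std::vector<std::int64_t> &src,
                 std::int64_t *device_buf)
{
    using alloc_t = sycl::usm_allocator<std::int64_t, sycl::usm::alloc::host>;
    using vec_t = std::vector<std::int64_t, alloc_t>;

    auto host_buf = std::make_shared<vec_t>(src.size(), alloc_t(exec_q));
    std::copy(src.begin(), src.end(), host_buf->begin()); // 1. populate first
    sycl::event copy_ev = exec_q.copy<std::int64_t>(
        host_buf->data(), device_buf, host_buf->size());  // 2. then submit copy
    exec_q.submit([&](sycl::handler &cgh) {                // 3. lifetime guard
        cgh.depends_on(copy_ev);
        cgh.host_task([host_buf]() {}); // holds shared_ptr until copy completes
    });
}
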
sycl::event packed_ind_shapes_strides_copy_ev = exec_q.copy( host_ind_shapes_strides_shp->data(), packed_ind_shapes_strides, host_ind_shapes_strides_shp->size()); - std::copy(ind_offsets.begin(), ind_offsets.end(), - host_ind_offsets_shp->begin()); sycl::event packed_ind_offsets_copy_ev = exec_q.copy( host_ind_offsets_shp->data(), packed_ind_offsets, host_ind_offsets_shp->size()); @@ -1010,38 +1011,39 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, std::shared_ptr host_ind_offsets_shp = std::make_shared(k, ind_allocator); + std::copy(ind_sh_sts.begin(), ind_sh_sts.end(), + host_ind_shapes_strides_shp->begin()); + std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin()); + std::copy(ind_offsets.begin(), ind_offsets.end(), + host_ind_offsets_shp->begin()); + std::vector host_task_events; host_task_events.reserve(5); - std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin()); - sycl::event device_ind_ptrs_copy_ev = exec_q.copy( + sycl::event packed_ind_ptrs_copy_ev = exec_q.copy( host_ind_ptrs_shp->data(), packed_ind_ptrs, host_ind_ptrs_shp->size()); - std::copy(ind_sh_sts.begin(), ind_sh_sts.end(), - host_ind_shapes_strides_shp->begin()); - sycl::event device_ind_shapes_strides_copy_ev = exec_q.copy( + sycl::event packed_ind_shapes_strides_copy_ev = exec_q.copy( host_ind_shapes_strides_shp->data(), packed_ind_shapes_strides, host_ind_shapes_strides_shp->size()); - std::copy(ind_offsets.begin(), ind_offsets.end(), - host_ind_offsets_shp->begin()); - sycl::event device_ind_offsets_copy_ev = exec_q.copy( + sycl::event packed_ind_offsets_copy_ev = exec_q.copy( host_ind_offsets_shp->data(), packed_ind_offsets, host_ind_offsets_shp->size()); sycl::event shared_ptr_cleanup_host_task = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(device_ind_ptrs_copy_ev); - cgh.depends_on(device_ind_shapes_strides_copy_ev); - cgh.depends_on(device_ind_offsets_copy_ev); - cgh.host_task([host_ind_ptrs_shp, host_ind_shapes_strides_shp, - host_ind_offsets_shp]() {}); + cgh.depends_on({packed_ind_offsets_copy_ev, + packed_ind_shapes_strides_copy_ev, + packed_ind_ptrs_copy_ev}); + cgh.host_task([host_ind_offsets_shp, host_ind_shapes_strides_shp, + host_ind_ptrs_shp]() {}); }); host_task_events.push_back(shared_ptr_cleanup_host_task); - std::vector ind_pack_depends{device_ind_ptrs_copy_ev, - device_ind_shapes_strides_copy_ev, - device_ind_offsets_copy_ev}; + std::vector ind_pack_depends{packed_ind_ptrs_copy_ev, + packed_ind_shapes_strides_copy_ev, + packed_ind_offsets_copy_ev}; bool is_dst_c_contig = dst.is_c_contiguous(); bool is_dst_f_contig = dst.is_f_contiguous(); From d5a49c286754e23b2cac4d146643bac8cf9b717a Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 1 Mar 2023 20:58:44 -0600 Subject: [PATCH 38/57] Reordered waits --- dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp index 4a1b4e8ef0..440dd5a332 100644 --- a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -723,8 +723,8 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, }); }); - sycl::event::wait(host_task_events); sycl::event::wait({take_generic_ev, temporaries_cleanup_ev}); + sycl::event::wait(host_task_events); /* sycl::event host_task_ev = keep_args_alive(exec_q, {src, py_ind, dst}, @@ -1137,8 +1137,8 @@ 
usm_ndarray_put(dpctl::tensor::usm_ndarray dst, }); }); - sycl::event::wait(host_task_events); sycl::event::wait({put_generic_ev, temporaries_cleanup_ev}); + sycl::event::wait(host_task_events); /* sycl::event py_obj_cleanup_ev = From f06dde5bfacf2c6fea4ce819f27c86ac17ba2d91 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 1 Mar 2023 22:32:19 -0600 Subject: [PATCH 39/57] Add wait for every host task submitted. --- dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp index 440dd5a332..53450d2a8a 100644 --- a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -229,6 +229,7 @@ std::vector _populate_packed_shapes_strides_for_indexing( cgh.host_task([packed_host_axes_shapes_strides_shp, packed_host_shapes_strides_shp]() {}); }); + clean_up_host_task_ev.wait(); host_task_events.push_back(clean_up_host_task_ev); std::vector v = {device_orthog_shapes_strides_copy_ev, @@ -282,6 +283,7 @@ std::vector _populate_packed_shapes_strides_for_indexing( cgh.depends_on(device_axes_shapes_strides_copy_ev); cgh.host_task([packed_host_axes_shapes_strides_shp]() {}); }); + clean_up_host_task_ev.wait(); host_task_events.push_back(clean_up_host_task_ev); std::vector v = {device_orthog_shapes_strides_fill_ev, @@ -627,6 +629,7 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, cgh.host_task([host_ind_offsets_shp, host_ind_shapes_strides_shp, host_ind_ptrs_shp]() {}); }); + shared_ptr_cleanup_host_task.wait(); host_task_events.push_back(shared_ptr_cleanup_host_task); std::vector ind_pack_depends{packed_ind_ptrs_copy_ev, @@ -1039,6 +1042,7 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, cgh.host_task([host_ind_offsets_shp, host_ind_shapes_strides_shp, host_ind_ptrs_shp]() {}); }); + shared_ptr_cleanup_host_task.wait(); host_task_events.push_back(shared_ptr_cleanup_host_task); std::vector ind_pack_depends{packed_ind_ptrs_copy_ev, From 1387634077e37062b3f7b572dea8d84c12b02ffd Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Tue, 28 Feb 2023 15:14:20 -0800 Subject: [PATCH 40/57] Advanced indices don't broadcast if 1 array passed - _mock removed from indexing methods --- dpctl/tensor/_copy_utils.py | 15 ++++++++++----- dpctl/tensor/_indexing_functions.py | 14 +++++++++----- dpctl/tensor/_usmarray.pyx | 12 ++++++------ 3 files changed, 25 insertions(+), 16 deletions(-) diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py index 382d92bb79..079f02fe52 100644 --- a/dpctl/tensor/_copy_utils.py +++ b/dpctl/tensor/_copy_utils.py @@ -430,7 +430,7 @@ def _mock_nonzero(ary): return tuple(dpt.asarray(i, usm_type=usm_type, sycl_queue=q) for i in nz) -def _mock_take_multi_index(ary, inds, p): +def _take_multi_index(ary, inds, p): if not isinstance(ary, dpt.usm_ndarray): raise TypeError queues_ = [ @@ -439,6 +439,8 @@ def _mock_take_multi_index(ary, inds, p): usm_types_ = [ ary.usm_type, ] + if not isinstance(inds, list) and not isinstance(inds, tuple): + inds = (inds,) all_integers = True for ind in inds: queues_.append(ind.sycl_queue) @@ -452,7 +454,8 @@ def _mock_take_multi_index(ary, inds, p): raise IndexError( "arrays used as indices must be of integer (or boolean) type" ) - inds = dpt.broadcast_arrays(*inds) + if (len(inds) > 1): + inds = dpt.broadcast_arrays(*inds) ary_ndim = ary.ndim if ary_ndim > 0: p = operator.index(p) @@ -505,7 
+508,7 @@ def _mock_place(ary, ary_mask, p, vals): return -def _mock_put_multi_index(ary, inds, p, vals): +def _put_multi_index(ary, inds, p, vals): if isinstance(vals, dpt.usm_ndarray): queues_ = [ary.sycl_queue, vals.sycl_queue] usm_types_ = [ary.usm_type, vals.usm_type] @@ -516,6 +519,8 @@ def _mock_put_multi_index(ary, inds, p, vals): usm_types_ = [ ary.usm_type, ] + if not isinstance(inds, list) and not isinstance(inds, tuple): + inds = (inds,) all_integers = True for ind in inds: if not isinstance(ind, dpt.usm_ndarray): @@ -536,8 +541,8 @@ def _mock_put_multi_index(ary, inds, p, vals): raise IndexError( "arrays used as indices must be of integer (or boolean) type" ) - - inds = dpt.broadcast_arrays(*inds) + if (len(inds) > 1): + inds = dpt.broadcast_arrays(*inds) ary_ndim = ary.ndim if ary_ndim > 0: p = operator.index(p) diff --git a/dpctl/tensor/_indexing_functions.py b/dpctl/tensor/_indexing_functions.py index 90718f4559..23d2c4d637 100644 --- a/dpctl/tensor/_indexing_functions.py +++ b/dpctl/tensor/_indexing_functions.py @@ -79,7 +79,8 @@ def take(x, indices, /, *, axis=None, mode="clip"): ) axis = 0 - indices = dpt.broadcast_arrays(*indices) + if len(indices) > 1: + indices = dpt.broadcast_arrays(*indices) if x_ndim > 0: axis = operator.index(axis) axis = normalize_axis_index(axis, x_ndim) @@ -149,10 +150,13 @@ def put(x, indices, vals, /, *, axis=None, mode="clip"): # when axis is none, array is treated as 1D if axis is None: - x = dpt.reshape(x, (x.size,), copy=False) - axis = 0 - - indices = dpt.broadcast_arrays(*indices) + try: + x = dpt.reshape(x, (x.size,), copy=False) + axis = 0 + except ValueError: + raise ValueError("Cannot create 1D view of array") + if len(indices) > 1: + indices = dpt.broadcast_arrays(*indices) x_ndim = x.ndim if x_ndim > 0: axis = operator.index(axis) diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx index 3c42c96dd5..5d83a86c62 100644 --- a/dpctl/tensor/_usmarray.pyx +++ b/dpctl/tensor/_usmarray.pyx @@ -673,7 +673,7 @@ cdef class usm_ndarray: from ._copy_utils import ( _mock_extract, _mock_nonzero, - _mock_take_multi_index, + _take_multi_index, ) if len(adv_ind) == 1 and adv_ind[0].dtype == dpt_bool: return _mock_extract(res, adv_ind[0], adv_ind_start_p) @@ -685,9 +685,9 @@ cdef class usm_ndarray: adv_ind_int.extend(_mock_nonzero(ind)) else: adv_ind_int.append(ind) - return _mock_take_multi_index(res, tuple(adv_ind_int), adv_ind_start_p) + return _take_multi_index(res, tuple(adv_ind_int), adv_ind_start_p) - return _mock_take_multi_index(res, adv_ind, adv_ind_start_p) + return _take_multi_index(res, adv_ind, adv_ind_start_p) def to_device(self, target): @@ -1021,7 +1021,7 @@ cdef class usm_ndarray: _copy_from_usm_ndarray_to_usm_ndarray, _mock_nonzero, _mock_place, - _mock_put_multi_index, + _put_multi_index, ) adv_ind = _meta[3] @@ -1064,10 +1064,10 @@ cdef class usm_ndarray: adv_ind_int.extend(_mock_nonzero(ind)) else: adv_ind_int.append(ind) - _mock_put_multi_index(Xv, tuple(adv_ind_int), adv_ind_start_p, rhs) + _put_multi_index(Xv, tuple(adv_ind_int), adv_ind_start_p, rhs) return - _mock_put_multi_index(Xv, adv_ind, adv_ind_start_p, rhs) + _put_multi_index(Xv, adv_ind, adv_ind_start_p, rhs) return From d0eb7cff395e6c88e337eae40630e56e44073ba5 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 2 Mar 2023 09:17:10 -0800 Subject: [PATCH 41/57] Take and put tweaks - take_multi_index and put_multi_index logic for 0D arrays removed, adjusted a test accordingly - take, put, take_multi_index, and put_multi_index axis type check 
and normalization only reassigns axis once --- dpctl/tensor/_copy_utils.py | 21 +++++++-------------- dpctl/tensor/_indexing_functions.py | 8 +++----- dpctl/tests/test_usm_ndarray_indexing.py | 3 ++- 3 files changed, 12 insertions(+), 20 deletions(-) diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py index 079f02fe52..597db87c49 100644 --- a/dpctl/tensor/_copy_utils.py +++ b/dpctl/tensor/_copy_utils.py @@ -454,16 +454,12 @@ def _take_multi_index(ary, inds, p): raise IndexError( "arrays used as indices must be of integer (or boolean) type" ) - if (len(inds) > 1): + if len(inds) > 1: inds = dpt.broadcast_arrays(*inds) ary_ndim = ary.ndim - if ary_ndim > 0: - p = operator.index(p) - p = normalize_axis_index(p, ary_ndim) + p = normalize_axis_index(operator.index(p), ary_ndim) - res_shape = ary.shape[:p] + inds[0].shape + ary.shape[p + len(inds) :] - else: - res_shape = inds[0].shape + res_shape = ary.shape[:p] + inds[0].shape + ary.shape[p + len(inds) :] res_usm_type = dpctl.utils.get_coerced_usm_type(usm_types_) res = dpt.empty( res_shape, dtype=ary.dtype, usm_type=res_usm_type, sycl_queue=exec_q @@ -541,15 +537,12 @@ def _put_multi_index(ary, inds, p, vals): raise IndexError( "arrays used as indices must be of integer (or boolean) type" ) - if (len(inds) > 1): + if len(inds) > 1: inds = dpt.broadcast_arrays(*inds) ary_ndim = ary.ndim - if ary_ndim > 0: - p = operator.index(p) - p = normalize_axis_index(p, ary_ndim) - vals_shape = ary.shape[:p] + inds[0].shape + ary.shape[p + len(inds) :] - else: - vals_shape = inds[0].shape + + p = normalize_axis_index(operator.index(p), ary_ndim) + vals_shape = ary.shape[:p] + inds[0].shape + ary.shape[p + len(inds) :] vals_usm_type = dpctl.utils.get_coerced_usm_type(usm_types_) if not isinstance(vals, dpt.usm_ndarray): diff --git a/dpctl/tensor/_indexing_functions.py b/dpctl/tensor/_indexing_functions.py index 23d2c4d637..12f7b2d72e 100644 --- a/dpctl/tensor/_indexing_functions.py +++ b/dpctl/tensor/_indexing_functions.py @@ -82,8 +82,7 @@ def take(x, indices, /, *, axis=None, mode="clip"): if len(indices) > 1: indices = dpt.broadcast_arrays(*indices) if x_ndim > 0: - axis = operator.index(axis) - axis = normalize_axis_index(axis, x_ndim) + axis = normalize_axis_index(operator.index(axis), x_ndim) res_shape = ( x.shape[:axis] + indices[0].shape + x.shape[axis + len(indices) :] ) @@ -154,13 +153,12 @@ def put(x, indices, vals, /, *, axis=None, mode="clip"): x = dpt.reshape(x, (x.size,), copy=False) axis = 0 except ValueError: - raise ValueError("Cannot create 1D view of array") + raise ValueError("Cannot create 1D view of input array") if len(indices) > 1: indices = dpt.broadcast_arrays(*indices) x_ndim = x.ndim if x_ndim > 0: - axis = operator.index(axis) - axis = normalize_axis_index(axis, x_ndim) + axis = normalize_axis_index(operator.index(axis), x_ndim) val_shape = ( x.shape[:axis] + indices[0].shape + x.shape[axis + len(indices) :] diff --git a/dpctl/tests/test_usm_ndarray_indexing.py b/dpctl/tests/test_usm_ndarray_indexing.py index 45501afbac..98bb674b21 100644 --- a/dpctl/tests/test_usm_ndarray_indexing.py +++ b/dpctl/tests/test_usm_ndarray_indexing.py @@ -546,7 +546,8 @@ def test_put_0d_val(data_dt): assert_array_equal(np.asarray(2, dtype=data_dt), dpt.asnumpy(x[0])) x = dpt.asarray(5, dtype=data_dt, sycl_queue=q) - x[ind] = 2 + val = 2 + dpt.put(x, ind, val) assert_array_equal(np.asarray(2, dtype=data_dt), dpt.asnumpy(x)) From 1e6794308b760e2282856076351b49ebbb349115 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 2 
Mar 2023 09:21:47 -0800 Subject: [PATCH 42/57] Fixed WrapIndex class returning negative indices --- .../libtensor/include/kernels/integer_advanced_indexing.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp index a239691c80..5258b3e4f5 100644 --- a/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp @@ -68,7 +68,7 @@ class WrapIndex void operator()(py::ssize_t max_item, py::ssize_t &ind) const { max_item = std::max(max_item, 1); - ind = ind % max_item; + ind = (ind < 0) ? ind % max_item + max_item : ind % max_item; return; } }; From ac9072f63061f0a79c6e8cc77d62080c17e3d8bb Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 2 Mar 2023 09:35:33 -0800 Subject: [PATCH 43/57] Import formatting corrected in usm_ndarray getitem --- dpctl/tensor/_usmarray.pyx | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx index 5d83a86c62..70c8eadeda 100644 --- a/dpctl/tensor/_usmarray.pyx +++ b/dpctl/tensor/_usmarray.pyx @@ -670,11 +670,7 @@ cdef class usm_ndarray: if adv_ind_start_p < 0: return res - from ._copy_utils import ( - _mock_extract, - _mock_nonzero, - _take_multi_index, - ) ++ from ._copy_utils import _mock_extract, _mock_nonzero, _take_multi_index if len(adv_ind) == 1 and adv_ind[0].dtype == dpt_bool: return _mock_extract(res, adv_ind[0], adv_ind_start_p) From d47fbf03107a44b05fd531c92b4f98bf4d7df8e4 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 2 Mar 2023 09:45:14 -0800 Subject: [PATCH 44/57] Whitespace in usm_ndarray getitem imports --- dpctl/tensor/_usmarray.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx index 70c8eadeda..64a492065f 100644 --- a/dpctl/tensor/_usmarray.pyx +++ b/dpctl/tensor/_usmarray.pyx @@ -670,7 +670,7 @@ cdef class usm_ndarray: if adv_ind_start_p < 0: return res -+ from ._copy_utils import _mock_extract, _mock_nonzero, _take_multi_index + from ._copy_utils import _mock_extract, _mock_nonzero, _take_multi_index if len(adv_ind) == 1 and adv_ind[0].dtype == dpt_bool: return _mock_extract(res, adv_ind[0], adv_ind_start_p) From db84c42b133effcb5eafd04dee3f72ecdce81c1a Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 2 Mar 2023 12:00:36 -0800 Subject: [PATCH 45/57] Refactored advanced_indexing to 1 host_task --- .../source/integer_advanced_indexing.cpp | 619 +++++------------- 1 file changed, 172 insertions(+), 447 deletions(-) diff --git a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp index 53450d2a8a..dfc74c12f0 100644 --- a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -68,228 +68,135 @@ using dpctl::tensor::f_contiguous_strides; using dpctl::utils::keep_args_alive; -std::vector _populate_packed_shapes_strides_for_indexing( - sycl::queue exec_q, - std::vector &host_task_events, - py::ssize_t *device_orthog_shapes_strides, - py::ssize_t *device_axes_shapes_strides, - const py::ssize_t *inp_shape, - const py::ssize_t *inp_strides, - bool is_inp_c_contig, - bool is_inp_f_contig, - const py::ssize_t *arr_shape, - const py::ssize_t *arr_strides, - bool is_arr_c_contig, - bool is_arr_f_contig, - 
int axis_start, - int k, - int ind_nd, - int inp_nd, - int arr_nd) +std::vector +_populate_kernel_params(sycl::queue exec_q, + std::vector &host_task_events, + char **device_ind_ptrs, + py::ssize_t *device_ind_sh_st, + py::ssize_t *device_ind_offsets, + py::ssize_t *device_orthog_sh_st, + py::ssize_t *device_along_sh_st, + const py::ssize_t *inp_shape, + std::vector &inp_strides, + std::vector &arr_strides, + std::vector &ind_sh_sts, + std::vector &ind_ptrs, + std::vector &ind_offsets, + int axis_start, + int k, + int ind_nd, + int inp_nd, + int orthog_sh_elems, + int ind_sh_elems) { - int orthog_sh_elems = std::max(inp_nd - k, 1); - int along_sh_elems = std::max(ind_nd, 1); + using usm_host_allocator_T = + sycl::usm_allocator; + using ptrT = std::vector; + + usm_host_allocator_T ptr_allocator(exec_q); + std::shared_ptr host_ind_ptrs_shp = + std::make_shared(k, ptr_allocator); using usm_host_allocatorT = sycl::usm_allocator; using shT = std::vector; - usm_host_allocatorT allocator(exec_q); - std::shared_ptr packed_host_shapes_strides_shp = - std::make_shared(3 * orthog_sh_elems, 0, allocator); + usm_host_allocatorT sz_allocator(exec_q); + std::shared_ptr host_ind_sh_st_shp = + std::make_shared(ind_sh_elems * (k + 1), sz_allocator); - std::shared_ptr packed_host_axes_shapes_strides_shp = - std::make_shared(2 * k + along_sh_elems, 0, allocator); + std::shared_ptr host_ind_offsets_shp = + std::make_shared(k, sz_allocator); - if (inp_nd > 0) { - std::copy(inp_shape, inp_shape + axis_start, - packed_host_shapes_strides_shp->begin()); - std::copy(inp_shape + axis_start + k, inp_shape + inp_nd, - packed_host_shapes_strides_shp->begin() + axis_start); - std::copy(inp_shape + axis_start, inp_shape + axis_start + k, - packed_host_axes_shapes_strides_shp->begin()); - - // contract axes by using two copies - if (inp_strides == nullptr) { - if (is_inp_c_contig) { - const auto &inp_contig_strides = - c_contiguous_strides(inp_nd, inp_shape); - std::copy(inp_contig_strides.begin(), - inp_contig_strides.begin() + axis_start, - packed_host_shapes_strides_shp->begin() + - orthog_sh_elems); - std::copy(inp_contig_strides.begin() + axis_start + k, - inp_contig_strides.end(), - packed_host_shapes_strides_shp->begin() + - orthog_sh_elems + axis_start); - std::copy(inp_contig_strides.begin() + axis_start, - inp_contig_strides.begin() + axis_start + k, - packed_host_axes_shapes_strides_shp->begin() + k); - } - else if (is_inp_f_contig) { - const auto &inp_contig_strides = - f_contiguous_strides(inp_nd, inp_shape); - std::copy(inp_contig_strides.begin(), - inp_contig_strides.begin() + axis_start, - packed_host_shapes_strides_shp->begin() + - orthog_sh_elems); - std::copy(inp_contig_strides.begin() + axis_start + k, - inp_contig_strides.end(), - packed_host_shapes_strides_shp->begin() + - orthog_sh_elems + axis_start); - std::copy(inp_contig_strides.begin() + axis_start, - inp_contig_strides.begin() + axis_start + k, - packed_host_axes_shapes_strides_shp->begin() + k); - } - else { - // FIXME: this pointer was not allocated in this function - // the caller should be freeing it - sycl::free(device_orthog_shapes_strides, exec_q); - throw std::runtime_error("Invalid array encountered"); - } - } - else { - std::copy(inp_strides, inp_strides + axis_start, - packed_host_shapes_strides_shp->begin() + - orthog_sh_elems); - std::copy(inp_strides + axis_start + k, inp_strides + inp_nd, - packed_host_shapes_strides_shp->begin() + - orthog_sh_elems + axis_start); - std::copy(inp_strides + axis_start, inp_strides + axis_start + 
k, - packed_host_axes_shapes_strides_shp->begin() + k); - } + std::shared_ptr host_orthog_sh_st_shp = + std::make_shared(3 * orthog_sh_elems, sz_allocator); - if (arr_strides == nullptr) { - if (is_arr_c_contig) { - const auto &arr_contig_strides = - c_contiguous_strides(arr_nd, arr_shape); - std::copy(arr_contig_strides.begin(), - arr_contig_strides.begin() + axis_start, - packed_host_shapes_strides_shp->begin() + - 2 * orthog_sh_elems); - std::copy(arr_contig_strides.begin() + axis_start + ind_nd, - arr_contig_strides.end(), - packed_host_shapes_strides_shp->begin() + - 2 * orthog_sh_elems + axis_start); - std::copy(arr_contig_strides.begin() + axis_start, - arr_contig_strides.begin() + axis_start + ind_nd, - packed_host_axes_shapes_strides_shp->begin() + 2 * k); - } - else if (is_arr_f_contig) { - const auto &arr_contig_strides = - f_contiguous_strides(arr_nd, arr_shape); - std::copy(arr_contig_strides.begin(), - arr_contig_strides.begin() + axis_start, - packed_host_shapes_strides_shp->begin() + - 2 * orthog_sh_elems); - std::copy(arr_contig_strides.begin() + axis_start + ind_nd, - arr_contig_strides.end(), - packed_host_shapes_strides_shp->begin() + - 2 * orthog_sh_elems + axis_start); - std::copy(arr_contig_strides.begin() + axis_start, - arr_contig_strides.begin() + axis_start + ind_nd, - packed_host_axes_shapes_strides_shp->begin() + 2 * k); - } - else { - // FIXME: this pointer was not allocated in this function - // the caller should be freeing it - sycl::free(device_orthog_shapes_strides, exec_q); - throw std::runtime_error("Invalid array encountered"); - } + std::shared_ptr host_along_sh_st_shp = + std::make_shared(2 * k + ind_sh_elems, sz_allocator); + + std::copy(ind_sh_sts.begin(), ind_sh_sts.end(), + host_ind_sh_st_shp->begin()); + std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin()); + std::copy(ind_offsets.begin(), ind_offsets.end(), + host_ind_offsets_shp->begin()); + + sycl::event device_ind_ptrs_copy_ev = exec_q.copy( + host_ind_ptrs_shp->data(), device_ind_ptrs, host_ind_ptrs_shp->size()); + + sycl::event device_ind_sh_st_copy_ev = + exec_q.copy(host_ind_sh_st_shp->data(), device_ind_sh_st, + host_ind_sh_st_shp->size()); + + sycl::event device_ind_offsets_copy_ev = exec_q.copy( + host_ind_offsets_shp->data(), device_ind_offsets, + host_ind_offsets_shp->size()); + + int orthog_nd = inp_nd - k; + + if (orthog_nd > 0) { + if (axis_start > 0) { + std::copy(inp_shape, inp_shape + axis_start, + host_orthog_sh_st_shp->begin()); + std::copy(inp_strides.begin(), inp_strides.begin() + axis_start, + host_orthog_sh_st_shp->begin() + orthog_sh_elems); + std::copy(arr_strides.begin(), arr_strides.begin() + axis_start, + host_orthog_sh_st_shp->begin() + 2 * orthog_sh_elems); } - else { - std::copy(arr_strides, arr_strides + axis_start, - packed_host_shapes_strides_shp->begin() + - 2 * orthog_sh_elems); - std::copy(arr_strides + axis_start + ind_nd, arr_strides + arr_nd, - packed_host_shapes_strides_shp->begin() + - 2 * orthog_sh_elems + axis_start); - std::copy(arr_strides + axis_start, - arr_strides + axis_start + ind_nd, - packed_host_axes_shapes_strides_shp->begin() + 2 * k); + if (inp_nd > (axis_start + k)) { + std::copy(inp_shape + axis_start + k, inp_shape + inp_nd, + host_orthog_sh_st_shp->begin() + axis_start); + std::copy(inp_strides.begin() + axis_start + k, inp_strides.end(), + host_orthog_sh_st_shp->begin() + orthog_sh_elems + + axis_start); + + std::copy(arr_strides.begin() + axis_start + ind_nd, + arr_strides.end(), + host_orthog_sh_st_shp->begin() + 
2 * orthog_sh_elems + + axis_start); } + } + + if (inp_nd > 0) { + std::copy(inp_shape + axis_start, inp_shape + axis_start + k, + host_along_sh_st_shp->begin()); - // copy packed shapes and strides from host to devices - sycl::event device_orthog_shapes_strides_copy_ev = - exec_q.copy(packed_host_shapes_strides_shp->data(), - device_orthog_shapes_strides, - packed_host_shapes_strides_shp->size()); - - sycl::event device_axes_shapes_strides_copy_ev = - exec_q.copy( - packed_host_axes_shapes_strides_shp->data(), - device_axes_shapes_strides, - packed_host_axes_shapes_strides_shp->size()); - - sycl::event clean_up_host_task_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(device_axes_shapes_strides_copy_ev); - cgh.depends_on(device_orthog_shapes_strides_copy_ev); - cgh.host_task([packed_host_axes_shapes_strides_shp, - packed_host_shapes_strides_shp]() {}); - }); - clean_up_host_task_ev.wait(); - host_task_events.push_back(clean_up_host_task_ev); - - std::vector v = {device_orthog_shapes_strides_copy_ev, - device_axes_shapes_strides_copy_ev}; - return v; + std::copy(inp_strides.begin() + axis_start, + inp_strides.begin() + axis_start + k, + host_along_sh_st_shp->begin() + k); } - else { - // no orthogonal dimensions - sycl::event device_orthog_shapes_strides_fill_ev = - exec_q.fill(device_orthog_shapes_strides, - py::ssize_t(0), 3); - - packed_host_axes_shapes_strides_shp->insert( - packed_host_axes_shapes_strides_shp->end(), py::ssize_t(0), 2); - if (arr_strides == nullptr) { - if (is_arr_c_contig) { - const auto &arr_contig_strides = - c_contiguous_strides(arr_nd, arr_shape); - std::copy(arr_contig_strides.begin() + axis_start, - arr_contig_strides.begin() + axis_start + ind_nd, - packed_host_axes_shapes_strides_shp->begin() + 2); - } - else if (is_arr_f_contig) { - const auto &arr_contig_strides = - f_contiguous_strides(arr_nd, arr_shape); - std::copy(arr_contig_strides.begin() + axis_start, - arr_contig_strides.begin() + axis_start + ind_nd, - packed_host_axes_shapes_strides_shp->begin() + 2); - } - else { - // FIXME: memory was not allocated in this function - // it should be freed by the caller - sycl::free(device_orthog_shapes_strides, exec_q); - throw std::runtime_error("Invalid array encountered"); - } - } - else { - std::copy(arr_strides + axis_start, - arr_strides + axis_start + ind_nd, - packed_host_axes_shapes_strides_shp->begin() + 2); - } - sycl::event device_axes_shapes_strides_copy_ev = - exec_q.copy( - packed_host_axes_shapes_strides_shp->data(), - device_axes_shapes_strides, - packed_host_axes_shapes_strides_shp->size()); - - sycl::event clean_up_host_task_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(device_axes_shapes_strides_copy_ev); - cgh.host_task([packed_host_axes_shapes_strides_shp]() {}); - }); - clean_up_host_task_ev.wait(); - host_task_events.push_back(clean_up_host_task_ev); - - std::vector v = {device_orthog_shapes_strides_fill_ev, - device_axes_shapes_strides_copy_ev}; - return v; + if (ind_nd > 0) { + std::copy(arr_strides.begin() + axis_start, + arr_strides.begin() + axis_start + ind_nd, + host_along_sh_st_shp->begin() + 2 * k); } + + sycl::event device_orthog_sh_st_copy_ev = exec_q.copy( + host_orthog_sh_st_shp->data(), device_orthog_sh_st, + host_orthog_sh_st_shp->size()); + + sycl::event device_along_sh_st_copy_ev = exec_q.copy( + host_along_sh_st_shp->data(), device_along_sh_st, + host_along_sh_st_shp->size()); + + sycl::event shared_ptr_cleanup_host_task = + exec_q.submit([&](sycl::handler &cgh) { + 
cgh.depends_on({device_along_sh_st_copy_ev, + device_orthog_sh_st_copy_ev, + device_ind_offsets_copy_ev, + device_ind_sh_st_copy_ev, device_ind_ptrs_copy_ev}); + cgh.host_task([host_ind_offsets_shp, host_ind_sh_st_shp, + host_ind_ptrs_shp, host_orthog_sh_st_shp, + host_along_sh_st_shp]() {}); + }); + host_task_events.push_back(shared_ptr_cleanup_host_task); + + std::vector sh_st_pack_deps{ + device_ind_ptrs_copy_ev, device_ind_sh_st_copy_ev, + device_ind_offsets_copy_ev, device_orthog_sh_st_copy_ev, + device_along_sh_st_copy_ev}; + return sh_st_pack_deps; } /* Utility to parse python object py_ind into vector of `usm_ndarray`s */ @@ -357,7 +264,7 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, int dst_nd = dst.get_ndim(); int ind_nd = ind_rep.get_ndim(); - auto sh_elems = (src_nd > 0) ? src_nd : 1; + auto sh_elems = std::max(src_nd, 1); if (axis_start + k > sh_elems) { throw py::value_error("Axes are out of range for array of dimension " + @@ -379,8 +286,6 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, const py::ssize_t *src_shape = src.get_shape_raw(); const py::ssize_t *dst_shape = dst.get_shape_raw(); - int orthog_nd = std::max(src_nd - k, 1); - bool orthog_shapes_equal(true); size_t orthog_nelems(1); for (int i = 0; i < (src_nd - k); ++i) { @@ -471,9 +376,9 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, std::vector ind_offsets; ind_offsets.reserve(k); - std::vector ind_sh_sts((k + 1) * ind_sh_elems, py::ssize_t(0)); + std::vector ind_sh_sts((k + 1) * ind_sh_elems, 0); if (ind_nd > 0) { - std::copy(ind_shape, ind_shape + ind_sh_elems, ind_sh_sts.begin()); + std::copy(ind_shape, ind_shape + ind_nd, ind_sh_sts.begin()); } for (int i = 0; i < k; ++i) { dpctl::tensor::usm_ndarray ind_ = ind[i]; @@ -520,31 +425,9 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, // strides are initialized to 0 for 0D indices, so skip here if (ind_nd > 0) { - const py::ssize_t *ind_strides = ind_.get_strides_raw(); - if (ind_strides == nullptr) { - if (ind_.is_c_contiguous()) { - const auto &ind_contig_strides_ = - c_contiguous_strides(ind_nd, ind_shape); - std::copy(ind_contig_strides_.begin(), - ind_contig_strides_.end(), - ind_sh_sts.begin() + (i + 1) * ind_nd); - } - else if (ind_.is_f_contiguous()) { - const auto &ind_contig_strides_ = - f_contiguous_strides(ind_nd, ind_shape); - std::copy(ind_contig_strides_.begin(), - ind_contig_strides_.end(), - ind_sh_sts.begin() + (i + 1) * ind_nd); - } - else { - throw std::runtime_error( - "Invalid ind array encountered in: take function"); - } - } - else { - std::copy(ind_strides, ind_strides + ind_nd, - ind_sh_sts.begin() + (i + 1) * ind_nd); - } + auto ind_strides = ind_.get_strides_vector(); + std::copy(ind_strides.begin(), ind_strides.end(), + ind_sh_sts.begin() + (i + 1) * ind_nd); } ind_ptrs.push_back(ind_data); @@ -582,77 +465,15 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, "Unable to allocate packed_ind_offsets device memory"); } - using usm_host_allocator_T = - sycl::usm_allocator; - using ptrT = std::vector; - - usm_host_allocator_T ptr_allocator(exec_q); - std::shared_ptr host_ind_ptrs_shp = - std::make_shared(k, ptr_allocator); - - using usm_host_allocatorT = - sycl::usm_allocator; - using shT = std::vector; - - usm_host_allocatorT ind_allocator(exec_q); - std::shared_ptr host_ind_shapes_strides_shp = - std::make_shared(ind_sh_elems * (k + 1), ind_allocator); - - std::shared_ptr host_ind_offsets_shp = - std::make_shared(k, ind_allocator); - - std::copy(ind_sh_sts.begin(), ind_sh_sts.end(), - 
host_ind_shapes_strides_shp->begin()); - std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin()); - std::copy(ind_offsets.begin(), ind_offsets.end(), - host_ind_offsets_shp->begin()); - - std::vector host_task_events; - host_task_events.reserve(5); - - sycl::event packed_ind_ptrs_copy_ev = exec_q.copy( - host_ind_ptrs_shp->data(), packed_ind_ptrs, host_ind_ptrs_shp->size()); - - sycl::event packed_ind_shapes_strides_copy_ev = exec_q.copy( - host_ind_shapes_strides_shp->data(), packed_ind_shapes_strides, - host_ind_shapes_strides_shp->size()); - - sycl::event packed_ind_offsets_copy_ev = exec_q.copy( - host_ind_offsets_shp->data(), packed_ind_offsets, - host_ind_offsets_shp->size()); - - sycl::event shared_ptr_cleanup_host_task = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on({packed_ind_offsets_copy_ev, - packed_ind_shapes_strides_copy_ev, - packed_ind_ptrs_copy_ev}); - cgh.host_task([host_ind_offsets_shp, host_ind_shapes_strides_shp, - host_ind_ptrs_shp]() {}); - }); - shared_ptr_cleanup_host_task.wait(); - host_task_events.push_back(shared_ptr_cleanup_host_task); - - std::vector ind_pack_depends{packed_ind_ptrs_copy_ev, - packed_ind_shapes_strides_copy_ev, - packed_ind_offsets_copy_ev}; - - bool is_src_c_contig = src.is_c_contiguous(); - bool is_src_f_contig = src.is_f_contiguous(); - - bool is_dst_c_contig = dst.is_c_contiguous(); - bool is_dst_f_contig = dst.is_f_contiguous(); - - const py::ssize_t *src_strides = src.get_strides_raw(); - const py::ssize_t *dst_strides = dst.get_strides_raw(); + int orthog_sh_elems = std::max(src_nd - k, 1); // packed_shapes_strides = [src_shape[:axis] + src_shape[axis+k:], // src_strides[:axis] + src_strides[axis+k:], // dst_strides[:axis] + dst_strides[axis+k:]] py::ssize_t *packed_shapes_strides = - sycl::malloc_device(3 * sh_elems, exec_q); + sycl::malloc_device(3 * orthog_sh_elems, exec_q); if (packed_shapes_strides == nullptr) { - sycl::event::wait(host_task_events); sycl::free(packed_ind_ptrs, exec_q); sycl::free(packed_ind_shapes_strides, exec_q); sycl::free(packed_ind_offsets, exec_q); @@ -667,7 +488,6 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, sycl::malloc_device((2 * k) + ind_sh_elems, exec_q); if (packed_axes_shapes_strides == nullptr) { - sycl::event::wait(host_task_events); sycl::free(packed_ind_ptrs, exec_q); sycl::free(packed_ind_shapes_strides, exec_q); sycl::free(packed_ind_offsets, exec_q); @@ -676,20 +496,22 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, "Unable to allocate packed_axes_shapes_strides device memory"); } - std::vector src_dst_pack_deps = - _populate_packed_shapes_strides_for_indexing( - exec_q, host_task_events, packed_shapes_strides, - packed_axes_shapes_strides, src_shape, src_strides, is_src_c_contig, - is_src_f_contig, dst_shape, dst_strides, is_dst_c_contig, - is_dst_f_contig, axis_start, k, ind_nd, src_nd, dst_nd); + auto src_strides = src.get_strides_vector(); + auto dst_strides = dst.get_strides_vector(); + + std::vector host_task_events; + host_task_events.reserve(2); + + std::vector pack_deps = _populate_kernel_params( + exec_q, host_task_events, packed_ind_ptrs, packed_ind_shapes_strides, + packed_ind_offsets, packed_shapes_strides, packed_axes_shapes_strides, + src_shape, src_strides, dst_strides, ind_sh_sts, ind_ptrs, ind_offsets, + axis_start, k, ind_nd, src_nd, orthog_sh_elems, ind_sh_elems); std::vector all_deps; - all_deps.reserve(depends.size() + ind_pack_depends.size() + - src_dst_pack_deps.size()); - all_deps.insert(std::end(all_deps), 
std::begin(ind_pack_depends), - std::end(ind_pack_depends)); - all_deps.insert(std::end(all_deps), std::begin(src_dst_pack_deps), - std::end(src_dst_pack_deps)); + all_deps.reserve(depends.size() + pack_deps.size()); + all_deps.insert(std::end(all_deps), std::begin(pack_deps), + std::end(pack_deps)); all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); auto fn = take_dispatch_table[mode][src_type_id][ind_type_id]; @@ -706,7 +528,7 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, } sycl::event take_generic_ev = - fn(exec_q, orthog_nelems, ind_nelems, orthog_nd, ind_nd, k, + fn(exec_q, orthog_nelems, ind_nelems, orthog_sh_elems, ind_sh_elems, k, packed_shapes_strides, packed_axes_shapes_strides, packed_ind_shapes_strides, src_data, dst_data, packed_ind_ptrs, src_offset, dst_offset, packed_ind_offsets, all_deps); @@ -726,15 +548,10 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, }); }); - sycl::event::wait({take_generic_ev, temporaries_cleanup_ev}); - sycl::event::wait(host_task_events); + sycl::event host_task_ev = keep_args_alive( + exec_q, {src, py_ind, dst}, {take_generic_ev, temporaries_cleanup_ev}); - /* - sycl::event host_task_ev = keep_args_alive(exec_q, {src, py_ind, dst}, - {temporaries_cleanup_ev}); - */ - - return std::make_pair(sycl::event(), temporaries_cleanup_ev); + return std::make_pair(host_task_ev, take_generic_ev); } std::pair @@ -772,7 +589,7 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, int val_nd = val.get_ndim(); int ind_nd = ind_rep.get_ndim(); - auto sh_elems = (dst_nd > 0) ? dst_nd : 1; + auto sh_elems = std::max(dst_nd, 1); if (axis_start + k > sh_elems) { throw py::value_error("Axes are out of range for array of dimension " + @@ -796,8 +613,6 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, const py::ssize_t *dst_shape = dst.get_shape_raw(); const py::ssize_t *val_shape = val.get_shape_raw(); - int orthog_nd = ((dst_nd - k) > 0) ? dst_nd - k : 1; - bool orthog_shapes_equal(true); size_t orthog_nelems(1); for (int i = 0; i < (dst_nd - k); ++i) { @@ -879,7 +694,7 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, } } - auto ind_sh_elems = (ind_nd > 0) ? 
ind_nd : 1; + auto ind_sh_elems = std::max(ind_nd, 1); std::vector ind_ptrs; ind_ptrs.reserve(k); @@ -934,31 +749,9 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, // strides are initialized to 0 for 0D indices, so skip here if (ind_nd > 0) { - const py::ssize_t *ind_strides = ind_.get_strides_raw(); - if (ind_strides == nullptr) { - if (ind_.is_c_contiguous()) { - const auto &ind_contig_strides_ = - c_contiguous_strides(ind_nd, ind_shape); - std::copy(ind_contig_strides_.begin(), - ind_contig_strides_.end(), - ind_sh_sts.begin() + (i + 1) * ind_nd); - } - else if (ind_.is_f_contiguous()) { - const auto &ind_contig_strides_ = - f_contiguous_strides(ind_nd, ind_shape); - std::copy(ind_contig_strides_.begin(), - ind_contig_strides_.end(), - ind_sh_sts.begin() + (i + 1) * ind_nd); - } - else { - throw std::runtime_error( - "Invalid ind array encountered in: take function"); - } - } - else { - std::copy(ind_strides, ind_strides + ind_nd, - ind_sh_sts.begin() + (i + 1) * ind_nd); - } + auto ind_strides = ind_.get_strides_vector(); + std::copy(ind_strides.begin(), ind_strides.end(), + ind_sh_sts.begin() + (i + 1) * ind_nd); } ind_ptrs.push_back(ind_data); @@ -995,77 +788,15 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, "Unable to allocate packed_ind_offsets device memory"); } - using usm_host_allocator_T = - sycl::usm_allocator; - using ptrT = std::vector; - - usm_host_allocator_T ptr_allocator(exec_q); - std::shared_ptr host_ind_ptrs_shp = - std::make_shared(k, ptr_allocator); - - using usm_host_allocatorT = - sycl::usm_allocator; - using shT = std::vector; - - usm_host_allocatorT ind_allocator(exec_q); - std::shared_ptr host_ind_shapes_strides_shp = - std::make_shared(ind_sh_elems * (k + 1), ind_allocator); - - std::shared_ptr host_ind_offsets_shp = - std::make_shared(k, ind_allocator); - - std::copy(ind_sh_sts.begin(), ind_sh_sts.end(), - host_ind_shapes_strides_shp->begin()); - std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin()); - std::copy(ind_offsets.begin(), ind_offsets.end(), - host_ind_offsets_shp->begin()); - - std::vector host_task_events; - host_task_events.reserve(5); - - sycl::event packed_ind_ptrs_copy_ev = exec_q.copy( - host_ind_ptrs_shp->data(), packed_ind_ptrs, host_ind_ptrs_shp->size()); - - sycl::event packed_ind_shapes_strides_copy_ev = exec_q.copy( - host_ind_shapes_strides_shp->data(), packed_ind_shapes_strides, - host_ind_shapes_strides_shp->size()); - - sycl::event packed_ind_offsets_copy_ev = exec_q.copy( - host_ind_offsets_shp->data(), packed_ind_offsets, - host_ind_offsets_shp->size()); - - sycl::event shared_ptr_cleanup_host_task = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on({packed_ind_offsets_copy_ev, - packed_ind_shapes_strides_copy_ev, - packed_ind_ptrs_copy_ev}); - cgh.host_task([host_ind_offsets_shp, host_ind_shapes_strides_shp, - host_ind_ptrs_shp]() {}); - }); - shared_ptr_cleanup_host_task.wait(); - host_task_events.push_back(shared_ptr_cleanup_host_task); - - std::vector ind_pack_depends{packed_ind_ptrs_copy_ev, - packed_ind_shapes_strides_copy_ev, - packed_ind_offsets_copy_ev}; - - bool is_dst_c_contig = dst.is_c_contiguous(); - bool is_dst_f_contig = dst.is_f_contiguous(); - - bool is_val_c_contig = val.is_c_contiguous(); - bool is_val_f_contig = val.is_f_contiguous(); - - const py::ssize_t *dst_strides = dst.get_strides_raw(); - const py::ssize_t *val_strides = val.get_strides_raw(); + int orthog_sh_elems = std::max(dst_nd - k, 1); // packed_shapes_strides = [dst_shape[:axis] + dst_shape[axis+k:], // 
dst_strides[:axis] + dst_strides[axis+k:], // val_strides[:axis] + val_strides[axis+k:]] py::ssize_t *packed_shapes_strides = - sycl::malloc_device(3 * sh_elems, exec_q); + sycl::malloc_device(3 * orthog_sh_elems, exec_q); if (packed_shapes_strides == nullptr) { - sycl::event::wait(host_task_events); sycl::free(packed_ind_ptrs, exec_q); sycl::free(packed_ind_shapes_strides, exec_q); sycl::free(packed_ind_offsets, exec_q); @@ -1073,54 +804,54 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, "Unable to allocate packed_shapes_strides device memory"); } - // packed_axes_shapes_strides = [dst_shape[axis:k], - // dst_strides[axis:k, + // packed_axes_shapes_strides = [dst_shape[axis:axis+k], + // dst_strides[axis:axis+k, // val_strides[axis:ind.ndim]] py::ssize_t *packed_axes_shapes_strides = sycl::malloc_device((2 * k) + ind_sh_elems, exec_q); if (packed_axes_shapes_strides == nullptr) { - sycl::event::wait(host_task_events); - sycl::free(packed_shapes_strides, exec_q); sycl::free(packed_ind_ptrs, exec_q); sycl::free(packed_ind_shapes_strides, exec_q); sycl::free(packed_ind_offsets, exec_q); + sycl::free(packed_shapes_strides, exec_q); throw std::runtime_error( "Unable to allocate packed_axes_shapes_strides device memory"); } - std::vector copy_shapes_strides_deps = - _populate_packed_shapes_strides_for_indexing( - exec_q, host_task_events, packed_shapes_strides, - packed_axes_shapes_strides, dst_shape, dst_strides, is_dst_c_contig, - is_dst_f_contig, val_shape, val_strides, is_val_c_contig, - is_val_f_contig, axis_start, k, ind_nd, dst_nd, val_nd); - - std::vector all_deps(depends.size() + - copy_shapes_strides_deps.size() + - ind_pack_depends.size()); - all_deps.insert(std::end(all_deps), std::begin(copy_shapes_strides_deps), - std::end(copy_shapes_strides_deps)); - all_deps.insert(std::end(all_deps), std::begin(ind_pack_depends), - std::end(ind_pack_depends)); + auto dst_strides = dst.get_strides_vector(); + auto val_strides = val.get_strides_vector(); + + std::vector host_task_events; + host_task_events.reserve(2); + + std::vector pack_deps = _populate_kernel_params( + exec_q, host_task_events, packed_ind_ptrs, packed_ind_shapes_strides, + packed_ind_offsets, packed_shapes_strides, packed_axes_shapes_strides, + dst_shape, dst_strides, val_strides, ind_sh_sts, ind_ptrs, ind_offsets, + axis_start, k, ind_nd, dst_nd, orthog_sh_elems, ind_sh_elems); + + std::vector all_deps; + all_deps.reserve(depends.size() + pack_deps.size()); + all_deps.insert(std::end(all_deps), std::begin(pack_deps), + std::end(pack_deps)); all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); auto fn = put_dispatch_table[mode][dst_type_id][ind_type_id]; if (fn == nullptr) { sycl::event::wait(host_task_events); - sycl::free(packed_shapes_strides, exec_q); - sycl::free(packed_axes_shapes_strides, exec_q); - sycl::free(packed_ind_shapes_strides, exec_q); sycl::free(packed_ind_ptrs, exec_q); + sycl::free(packed_ind_shapes_strides, exec_q); sycl::free(packed_ind_offsets, exec_q); - + sycl::free(packed_shapes_strides, exec_q); + sycl::free(packed_axes_shapes_strides, exec_q); throw std::runtime_error("Indices must be integer type, got " + std::to_string(ind_type_id)); } sycl::event put_generic_ev = - fn(exec_q, orthog_nelems, ind_nelems, orthog_nd, ind_nd, k, + fn(exec_q, orthog_nelems, ind_nelems, orthog_sh_elems, ind_sh_elems, k, packed_shapes_strides, packed_axes_shapes_strides, packed_ind_shapes_strides, dst_data, val_data, packed_ind_ptrs, dst_offset, val_offset, packed_ind_offsets, all_deps); @@ 
-1141,16 +872,10 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, }); }); - sycl::event::wait({put_generic_ev, temporaries_cleanup_ev}); - sycl::event::wait(host_task_events); - - /* - sycl::event py_obj_cleanup_ev = - keep_args_alive(exec_q, {dst, py_ind, val}, - {put_generic_ev, temporaries_cleanup_ev}); - */ + sycl::event py_obj_cleanup_ev = keep_args_alive( + exec_q, {dst, py_ind, val}, {put_generic_ev, temporaries_cleanup_ev}); - return std::make_pair(sycl::event(), temporaries_cleanup_ev); + return std::make_pair(temporaries_cleanup_ev, put_generic_ev); } void init_advanced_indexing_dispatch_tables(void) From 2446b00354533e69723e581d140da6ed542ea61d Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 2 Mar 2023 19:35:18 -0600 Subject: [PATCH 46/57] Implements place, extract, nonzero kernels, and Python API for them Implemented mask_positions, _extract, _place, _nonzero and _array_overlap APIs. --- dpctl/tensor/CMakeLists.txt | 1 + .../kernels/boolean_advanced_indexing.hpp | 948 ++++++++++++++ .../source/boolean_advanced_indexing.cpp | 1085 +++++++++++++++++ .../source/boolean_advanced_indexing.hpp | 84 ++ .../source/simplify_iteration_space.cpp | 269 ++++ .../source/simplify_iteration_space.hpp | 36 + dpctl/tensor/libtensor/source/tensor_py.cpp | 31 + 7 files changed, 2454 insertions(+) create mode 100644 dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp create mode 100644 dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp create mode 100644 dpctl/tensor/libtensor/source/boolean_advanced_indexing.hpp diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index 3f5780cd75..300baa98c9 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -32,6 +32,7 @@ pybind11_add_module(${python_module_name} MODULE ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp new file mode 100644 index 0000000000..b42b7869d2 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp @@ -0,0 +1,948 @@ +//=== boolean_advance_indexing.hpp - ---*-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for advanced tensor index operations. 
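
A brief orientation note, not part of the added header: the mask_positions kernels defined further below compute an inclusive scan of the boolean mask, and the last element of that scan is the number of selected elements; the masked extract/place functors then treat position i as set when cumsum[i] == cumsum[i-1] + 1 (or cumsum[0] == 1 for the first element). A host-side reference sketch of that computation follows; the function name is illustrative and not taken from the patch.

#include <cstddef>
#include <vector>

// Reference (host-side) version of what mask_positions computes:
// cumsum[j] = number of set mask elements among mask[0..j];
// the returned value (last entry) is the total number of selected items.
std::size_t mask_positions_reference(const std::vector<bool> &mask,
                                     std::vector<std::size_t> &cumsum)
{
    cumsum.resize(mask.size());
    std::size_t running = 0;
    for (std::size_t i = 0; i < mask.size(); ++i) {
        running += mask[i] ? 1 : 0;
        cumsum[i] = running;
    }
    return mask.empty() ? 0 : cumsum.back();
}

// Example: mask = {0, 1, 1, 0, 1}  ->  cumsum = {0, 1, 2, 2, 3}, returns 3;
// positions 1, 2 and 4 satisfy cumsum[i] == cumsum[i-1] + 1, i.e. mask is set.
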
+//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include "utils/strided_iters.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace indexing +{ + +namespace py = pybind11; + +template T ceiling_quotient(T n, T m) +{ + return (n + m - 1) / m; +} +template T1 ceiling_quotient(T1 n, T2 m) +{ + return ceiling_quotient(n, static_cast(m)); +} + +template +class inclusive_scan_rec_local_scan_krn; + +template +class inclusive_scan_rec_chunk_update_krn; + +struct NoOpIndexer +{ + size_t operator()(size_t gid) const + { + return gid; + } +}; + +struct StridedIndexer +{ + StridedIndexer(int _nd, + py::ssize_t _offset, + py::ssize_t const *_packed_shape_strides) + : nd(_nd), starting_offset(_offset), + shape_strides(_packed_shape_strides) + { + } + + size_t operator()(size_t gid) const + { + CIndexer_vector _ind(nd); + py::ssize_t relative_offset(0); + _ind.get_displacement( + static_cast(gid), + shape_strides, // shape ptr + shape_strides + nd, // strides ptr + relative_offset); + return starting_offset + relative_offset; + } + +private: + int nd; + py::ssize_t starting_offset; + py::ssize_t const *shape_strides; +}; + +struct Strided1DIndexer +{ + Strided1DIndexer(py::ssize_t _offset, py::ssize_t _size, py::ssize_t _step) + : offset(_offset), size(static_cast(_size)), step(_step) + { + } + + size_t operator()(size_t gid) const + { + return static_cast(offset + std::min(gid, size) * step); + } + +private: + py::ssize_t offset = 0; + size_t size = 1; + py::ssize_t step = 1; +}; + +template struct ZeroChecker +{ + + ZeroChecker(_IndexerFn _indexer) : indexer_fn(_indexer) {} + + template + bool operator()(dataT const *data, size_t gid) const + { + constexpr dataT _zero(0); + + return data[indexer_fn(gid)] == _zero; + } + +private: + _IndexerFn indexer_fn; +}; + +/* + * for integer type maskT, + * output[j] = sum( input[s0 + i * s1], 0 <= i <= j) + * for 0 <= j < n_elems + */ +template +sycl::event inclusive_scan_rec(sycl::queue exec_q, + size_t n_elems, + size_t wg_size, + const inputT *input, + outputT *output, + size_t s0, + size_t s1, + IndexerT indexer, + std::vector const &depends = {}) +{ + size_t n_groups = ceiling_quotient(n_elems, n_wi * wg_size); + + sycl::event inc_scan_phase1_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using slmT = sycl::local_accessor; + + auto lws = sycl::range<1>(wg_size); + auto gws = sycl::range<1>(n_groups * wg_size); + + slmT slm_iscan_tmp(lws, cgh); + + ZeroChecker is_zero_fn(indexer); + + cgh.parallel_for, n_wi>>( + sycl::nd_range<1>(gws, lws), + [=](sycl::nd_item<1> it) + { + auto chunk_gid = it.get_global_id(0); + auto lid = it.get_local_id(0); + + std::array local_isum; + + size_t i = chunk_gid * n_wi; + for (size_t m_wi = 0; m_wi < n_wi; ++m_wi) { + constexpr outputT out_zero(0); + constexpr outputT out_one(1); + local_isum[m_wi] = + (i + m_wi < n_elems) + ? (is_zero_fn(input, s0 + s1 * (i + m_wi)) ? 
out_zero + : out_one) + : out_zero; + } + +// local_isum is now result of +// inclusive scan of locally stored mask indicators +#pragma unroll + for (size_t m_wi = 1; m_wi < n_wi; ++m_wi) { + local_isum[m_wi] += local_isum[m_wi - 1]; + } + + size_t wg_iscan_val = + sycl::inclusive_scan_over_group(it.get_group(), + local_isum.back(), + sycl::plus(), + size_t(0)); + + slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val; + it.barrier(sycl::access::fence_space::local_space); + size_t addand = (lid == 0) ? 0 : slm_iscan_tmp[lid]; + it.barrier(sycl::access::fence_space::local_space); + +#pragma unroll + for (size_t m_wi = 0; m_wi < n_wi; ++m_wi) { + local_isum[m_wi] += addand; + } + + for (size_t m_wi = 0; m_wi < n_wi && i + m_wi < n_elems; ++m_wi) { + output[i + m_wi] = local_isum[m_wi]; + } + } + ); + }); + + sycl::event out_event = inc_scan_phase1_ev; + if (n_groups > 1) { + outputT *temp = sycl::malloc_device(n_groups - 1, exec_q); + + auto chunk_size = wg_size * n_wi; + + NoOpIndexer _no_op_indexer{}; + auto e2 = inclusive_scan_rec( + exec_q, n_groups - 1, wg_size, output, temp, chunk_size - 1, + chunk_size, _no_op_indexer, {inc_scan_phase1_ev}); + + // output[ chunk_size * (i + 1) + j] += temp[i] + auto e3 = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(e2); + cgh.parallel_for>( + {n_elems}, + [=](auto wiid) + { + auto gid = wiid[0]; + auto i = (gid / chunk_size); + output[gid] += (i > 0) ? temp[i - 1] : 0; + } + ); + }); + + // dangling task to free the temporary + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(e3); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, temp]() { sycl::free(temp, ctx); }); + }); + + out_event = e3; + } + + return out_event; +} + +template struct TwoOffsets +{ + TwoOffsets() : first_offset(0), second_offset(0) {} + TwoOffsets(displacementT first_offset_, displacementT second_offset_) + : first_offset(first_offset_), second_offset(second_offset_) + { + } + + displacementT get_first_offset() const + { + return first_offset; + } + displacementT get_second_offset() const + { + return second_offset; + } + +private: + displacementT first_offset = 0; + displacementT second_offset = 0; +}; + +struct TwoOffsets_StridedIndexer +{ + TwoOffsets_StridedIndexer(int common_nd, + py::ssize_t first_offset_, + py::ssize_t second_offset_, + py::ssize_t const *_packed_shape_strides) + : nd(common_nd), starting_first_offset(first_offset_), + starting_second_offset(second_offset_), + shape_strides(_packed_shape_strides) + { + } + + TwoOffsets operator()(py::ssize_t gid) const + { + CIndexer_vector _ind(nd); + py::ssize_t relative_first_offset(0); + py::ssize_t relative_second_offset(0); + _ind.get_displacement( + gid, + shape_strides, // shape ptr + shape_strides + nd, // src strides ptr + shape_strides + 2 * nd, // src strides ptr + relative_first_offset, relative_second_offset); + return TwoOffsets( + starting_first_offset + relative_first_offset, + starting_second_offset + relative_second_offset); + } + +private: + int nd; + py::ssize_t starting_first_offset; + py::ssize_t starting_second_offset; + py::ssize_t const *shape_strides; +}; + +struct TwoZeroOffsets_Indexer +{ + TwoZeroOffsets_Indexer() {} + + TwoOffsets operator()(py::ssize_t) const + { + return TwoOffsets(); + } +}; + +template +struct MaskedExtractStridedFunctor +{ + MaskedExtractStridedFunctor(const char *src_data_p, + const char *cumsum_data_p, + char *dst_data_p, + size_t orthog_iter_size, + size_t masked_iter_size, + OrthogIndexerT orthog_src_dst_indexer_, + MaskedSrcIndexerT 
masked_src_indexer_, + MaskedDstIndexerT masked_dst_indexer_) + : src_cp(src_data_p), cumsum_cp(cumsum_data_p), dst_cp(dst_data_p), + orthog_nelems(orthog_iter_size), masked_nelems(masked_iter_size), + orthog_src_dst_indexer(orthog_src_dst_indexer_), + masked_src_indexer(masked_src_indexer_), + masked_dst_indexer(masked_dst_indexer_) + { + } + + void operator()(sycl::id<1> idx) const + { + const dataT *src_data = reinterpret_cast(src_cp); + dataT *dst_data = reinterpret_cast(dst_cp); + const indT *cumsum_data = reinterpret_cast(cumsum_cp); + + size_t global_i = idx[0]; + size_t orthog_i = global_i / masked_nelems; + size_t masked_i = global_i - masked_nelems * orthog_i; + + indT current_running_count = cumsum_data[masked_i]; + bool mask_set = + (masked_i == 0) + ? (current_running_count == 1) + : (current_running_count == cumsum_data[masked_i - 1] + 1); + + // dst[cumsum[i], j] - 1 = src[i, j] if cumsum[i] == ((i > 0) ? + // cumsum[i-1] + // + 1 : 1) + if (mask_set) { + auto orthog_offsets = + orthog_src_dst_indexer(static_cast(orthog_i)); + + size_t total_src_offset = masked_src_indexer(masked_i) + + orthog_offsets.get_first_offset(); + size_t total_dst_offset = + masked_dst_indexer(current_running_count - 1) + + orthog_offsets.get_second_offset(); + + dst_data[total_dst_offset] = src_data[total_src_offset]; + } + } + +private: + const char *src_cp = nullptr; + const char *cumsum_cp = nullptr; + char *dst_cp = nullptr; + size_t orthog_nelems = 0; + size_t masked_nelems = 0; + OrthogIndexerT + orthog_src_dst_indexer; // has nd, shape, src_strides, dst_strides for + // dimensions that ARE NOT masked + MaskedSrcIndexerT masked_src_indexer; // has nd, shape, src_strides for + // dimensions that ARE masked + MaskedDstIndexerT + masked_dst_indexer; // has 1, dst_strides for dimensions that ARE masked +}; + +template +struct MaskedPlaceStridedFunctor +{ + MaskedPlaceStridedFunctor(char *dst_data_p, + const char *cumsum_data_p, + const char *rhs_data_p, + size_t orthog_iter_size, + size_t masked_iter_size, + OrthogIndexerT orthog_dst_rhs_indexer_, + MaskedDstIndexerT masked_dst_indexer_, + MaskedRhsIndexerT masked_rhs_indexer_) + : dst_cp(dst_data_p), cumsum_cp(cumsum_data_p), rhs_cp(rhs_data_p), + orthog_nelems(orthog_iter_size), masked_nelems(masked_iter_size), + orthog_dst_rhs_indexer(orthog_dst_rhs_indexer_), + masked_dst_indexer(masked_dst_indexer_), + masked_rhs_indexer(masked_rhs_indexer_) + { + } + + void operator()(sycl::id<1> idx) const + { + dataT *dst_data = reinterpret_cast(dst_cp); + const indT *cumsum_data = reinterpret_cast(cumsum_cp); + const dataT *rhs_data = reinterpret_cast(rhs_cp); + + size_t global_i = idx[0]; + size_t orthog_i = global_i / masked_nelems; + size_t masked_i = global_i - masked_nelems * orthog_i; + + indT current_running_count = cumsum_data[masked_i]; + bool mask_set = + (masked_i == 0) + ? (current_running_count == 1) + : (current_running_count == cumsum_data[masked_i - 1] + 1); + + // src[i, j] = rhs[cumsum[i] - 1, j] if cumsum[i] == ((i > 0) ? 
+ // cumsum[i-1] + // + 1 : 1) + if (mask_set) { + auto orthog_offsets = + orthog_dst_rhs_indexer(static_cast(orthog_i)); + + size_t total_dst_offset = masked_dst_indexer(masked_i) + + orthog_offsets.get_first_offset(); + size_t total_rhs_offset = + masked_rhs_indexer(current_running_count - 1) + + orthog_offsets.get_second_offset(); + + dst_data[total_dst_offset] = rhs_data[total_rhs_offset]; + } + } + +private: + char *dst_cp = nullptr; + const char *cumsum_cp = nullptr; + const char *rhs_cp = nullptr; + size_t orthog_nelems = 0; + size_t masked_nelems = 0; + OrthogIndexerT + orthog_dst_rhs_indexer; // has nd, shape, dst_strides, rhs_strides for + // dimensions that ARE NOT masked + MaskedDstIndexerT masked_dst_indexer; // has nd, shape, dst_strides for + // dimensions that ARE masked + MaskedRhsIndexerT + masked_rhs_indexer; // has 1, rhs_strides for dimensions that ARE masked +}; + +// mask positions + +typedef size_t (*mask_positions_contig_impl_fn_ptr_t)( + sycl::queue, + size_t, + const char *, + char *, + std::vector const &); + +template +size_t mask_positions_contig_impl(sycl::queue q, + size_t n_elems, + const char *mask, + char *cumsum, + std::vector const &depends = {}) +{ + constexpr int n_wi = 8; + const maskT *mask_data_ptr = reinterpret_cast(mask); + cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); + size_t wg_size = 128; + + NoOpIndexer flat_indexer{}; + + sycl::event comp_ev = inclusive_scan_rec( + q, n_elems, wg_size, mask_data_ptr, cumsum_data_ptr, 0, 1, flat_indexer, + depends); + + cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); + + cumsumT *last_elem_host_usm = sycl::malloc_host(1, q); + + if (last_elem_host_usm == nullptr) { + throw std::bad_alloc(); + } + sycl::event copy_e = + q.copy(last_elem, last_elem_host_usm, 1, {comp_ev}); + copy_e.wait(); + size_t return_val = static_cast(*last_elem_host_usm); + sycl::free(last_elem_host_usm, q); + + return return_val; +} + +template struct MaskPositionsContigFactory +{ + fnT get() + { + fnT fn = mask_positions_contig_impl; + return fn; + } +}; + +typedef size_t (*mask_positions_strided_impl_fn_ptr_t)( + sycl::queue, + size_t, + const char *, + int, + py::ssize_t, + const py::ssize_t *, + char *, + std::vector const &); + +template +size_t mask_positions_strided_impl(sycl::queue q, + size_t n_elems, + const char *mask, + int nd, + py::ssize_t input_offset, + const py::ssize_t *shape_strides, + char *cumsum, + std::vector const &depends = {}) +{ + constexpr int n_wi = 8; + const maskT *mask_data_ptr = reinterpret_cast(mask); + cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); + size_t wg_size = 128; + + StridedIndexer strided_indexer{nd, input_offset, shape_strides}; + + sycl::event comp_ev = + inclusive_scan_rec( + q, n_elems, wg_size, mask_data_ptr, cumsum_data_ptr, 0, 1, + strided_indexer, depends); + + cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); + + cumsumT *last_elem_host_usm = sycl::malloc_host(1, q); + + if (last_elem_host_usm == nullptr) { + throw std::bad_alloc(); + } + sycl::event copy_e = + q.copy(last_elem, last_elem_host_usm, 1, {comp_ev}); + copy_e.wait(); + size_t return_val = static_cast(*last_elem_host_usm); + sycl::free(last_elem_host_usm, q); + + return return_val; +} + +template struct MaskPositionsStridedFactory +{ + fnT get() + { + fnT fn = mask_positions_strided_impl; + return fn; + } +}; + +// ======= Masked extraction ================================ + +template +class masked_extract_all_slices_strided_impl_krn; + +typedef sycl::event 
(*masked_extract_all_slices_strided_impl_fn_ptr_t)( + sycl::queue, + py::ssize_t, + const char *, + const char *, + char *, + int, + py::ssize_t const *, + py::ssize_t, + py::ssize_t, + const std::vector &); + +template +sycl::event masked_extract_all_slices_strided_impl( + sycl::queue exec_q, + py::ssize_t iteration_size, + const char *src_p, + const char *cumsum_p, + char *dst_p, + int nd, + const py::ssize_t + *packed_src_shape_strides, // [src_shape, src_strides], length 2*nd + py::ssize_t dst_size, // dst is 1D + py::ssize_t dst_stride, + const std::vector &depends = {}) +{ + // using MaskedExtractStridedFunctor; + // using Strided1DIndexer; + // using StridedIndexer; + // using TwoZeroOffsets_Indexer; + + TwoZeroOffsets_Indexer orthog_src_dst_indexer{}; + + /* StridedIndexer(int _nd, py::ssize_t _offset, py::ssize_t const + * *_packed_shape_strides) */ + StridedIndexer masked_src_indexer(nd, 0, packed_src_shape_strides); + Strided1DIndexer masked_dst_indexer(0, dst_size, dst_stride); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.parallel_for>( + sycl::range<1>(static_cast(iteration_size)), + MaskedExtractStridedFunctor( + src_p, cumsum_p, dst_p, 1, iteration_size, + orthog_src_dst_indexer, masked_src_indexer, + masked_dst_indexer)); + }); + + return comp_ev; +} + +typedef sycl::event (*masked_extract_some_slices_strided_impl_fn_ptr_t)( + sycl::queue, + py::ssize_t, + py::ssize_t, + const char *, + const char *, + char *, + int, + py::ssize_t const *, + py::ssize_t, + py::ssize_t, + int, + py::ssize_t const *, + py::ssize_t, + py::ssize_t, + const std::vector &); + +template +class masked_extract_some_slices_strided_impl_krn; + +template +sycl::event masked_extract_some_slices_strided_impl( + sycl::queue exec_q, + py::ssize_t orthog_nelems, + py::ssize_t masked_nelems, + const char *src_p, + const char *cumsum_p, + char *dst_p, + int orthog_nd, + const py::ssize_t + *packed_ortho_src_dst_shape_strides, // [ortho_shape, ortho_src_strides, + // ortho_dst_strides], length + // 3*ortho_nd + py::ssize_t ortho_src_offset, + py::ssize_t ortho_dst_offset, + int masked_nd, + const py::ssize_t *packed_masked_src_shape_strides, // [masked_src_shape, + // masked_src_strides], + // length 2*masked_nd + py::ssize_t masked_dst_size, // mask_dst is 1D + py::ssize_t masked_dst_stride, + const std::vector &depends = {}) +{ + // using MaskedExtractStridedFunctor; + // using Strided1DIndexer; + // using StridedIndexer; + // using TwoOffsets_StridedIndexer; + + TwoOffsets_StridedIndexer orthog_src_dst_indexer{ + orthog_nd, ortho_src_offset, ortho_dst_offset, + packed_ortho_src_dst_shape_strides}; + + StridedIndexer masked_src_indexer{masked_nd, 0, + packed_masked_src_shape_strides}; + Strided1DIndexer masked_dst_indexer{0, masked_dst_size, masked_dst_stride}; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.parallel_for>( + sycl::range<1>(static_cast(orthog_nelems * masked_nelems)), + MaskedExtractStridedFunctor( + src_p, cumsum_p, dst_p, orthog_nelems, masked_nelems, + orthog_src_dst_indexer, masked_src_indexer, + masked_dst_indexer)); + }); + + return comp_ev; +} + +template struct MaskExtractAllSlicesStridedFactory +{ + fnT get() + { + fnT fn = masked_extract_all_slices_strided_impl; + return fn; + } +}; + +template struct MaskExtractSomeSlicesStridedFactory +{ + fnT get() + { + fnT fn = masked_extract_some_slices_strided_impl; + return fn; + } +}; + +// Masked placement + +template +class 
masked_place_all_slices_strided_impl_krn; + +typedef sycl::event (*masked_place_all_slices_strided_impl_fn_ptr_t)( + sycl::queue, + py::ssize_t, + char *, + const char *, + const char *, + int, + py::ssize_t const *, + py::ssize_t, + py::ssize_t, + const std::vector &); + +template +sycl::event masked_place_all_slices_strided_impl( + sycl::queue exec_q, + py::ssize_t iteration_size, + char *dst_p, + const char *cumsum_p, + const char *rhs_p, + int nd, + const py::ssize_t + *packed_dst_shape_strides, // [dst_shape, dst_strides], length 2*nd + py::ssize_t rhs_size, // rhs is 1D + py::ssize_t rhs_stride, + const std::vector &depends = {}) +{ + // using MaskedPlaceStridedFunctor; + // using Strided1DIndexer; + // using StridedIndexer; + // using TwoZeroOffsets_Indexer; + + TwoZeroOffsets_Indexer orthog_dst_rhs_indexer{}; + + /* StridedIndexer(int _nd, py::ssize_t _offset, py::ssize_t const + * *_packed_shape_strides) */ + StridedIndexer masked_dst_indexer(nd, 0, packed_dst_shape_strides); + Strided1DIndexer masked_rhs_indexer(0, rhs_size, rhs_stride); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.parallel_for>( + sycl::range<1>(static_cast(iteration_size)), + MaskedPlaceStridedFunctor( + dst_p, cumsum_p, rhs_p, 1, iteration_size, + orthog_dst_rhs_indexer, masked_dst_indexer, + masked_rhs_indexer)); + }); + + return comp_ev; +} + +typedef sycl::event (*masked_place_some_slices_strided_impl_fn_ptr_t)( + sycl::queue, + py::ssize_t, + py::ssize_t, + char *, + const char *, + const char *, + int, + py::ssize_t const *, + py::ssize_t, + py::ssize_t, + int, + py::ssize_t const *, + py::ssize_t, + py::ssize_t, + const std::vector &); + +template +class masked_place_some_slices_strided_impl_krn; + +template +sycl::event masked_place_some_slices_strided_impl( + sycl::queue exec_q, + py::ssize_t orthog_nelems, + py::ssize_t masked_nelems, + char *dst_p, + const char *cumsum_p, + const char *rhs_p, + int orthog_nd, + const py::ssize_t + *packed_ortho_dst_rhs_shape_strides, // [ortho_shape, ortho_dst_strides, + // ortho_rhs_strides], length + // 3*ortho_nd + py::ssize_t ortho_dst_offset, + py::ssize_t ortho_rhs_offset, + int masked_nd, + const py::ssize_t *packed_masked_dst_shape_strides, // [masked_dst_shape, + // masked_dst_strides], + // length 2*masked_nd + py::ssize_t masked_rhs_size, // mask_dst is 1D + py::ssize_t masked_rhs_stride, + const std::vector &depends = {}) +{ + // using MaskedPlaceStridedFunctor; + // using Strided1DIndexer; + // using StridedIndexer; + // using TwoOffsets_StridedIndexer; + + TwoOffsets_StridedIndexer orthog_dst_rhs_indexer{ + orthog_nd, ortho_dst_offset, ortho_rhs_offset, + packed_ortho_dst_rhs_shape_strides}; + + /* StridedIndexer(int _nd, py::ssize_t _offset, py::ssize_t const + * *_packed_shape_strides) */ + StridedIndexer masked_dst_indexer{masked_nd, 0, + packed_masked_dst_shape_strides}; + Strided1DIndexer masked_rhs_indexer{0, masked_rhs_size, masked_rhs_stride}; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.parallel_for>( + sycl::range<1>(static_cast(orthog_nelems * masked_nelems)), + MaskedPlaceStridedFunctor( + dst_p, cumsum_p, rhs_p, orthog_nelems, masked_nelems, + orthog_dst_rhs_indexer, masked_dst_indexer, + masked_rhs_indexer)); + }); + + return comp_ev; +} + +static masked_place_all_slices_strided_impl_fn_ptr_t + masked_place_all_slices_strided_impl_dispatch_vector + [dpctl::tensor::detail::num_types]; + +template struct 
MaskPlaceAllSlicesStridedFactory +{ + fnT get() + { + fnT fn = masked_place_all_slices_strided_impl; + return fn; + } +}; + +static masked_place_some_slices_strided_impl_fn_ptr_t + masked_place_some_slices_strided_impl_dispatch_vector + [dpctl::tensor::detail::num_types]; + +template struct MaskPlaceSomeSlicesStridedFactory +{ + fnT get() + { + fnT fn = masked_place_some_slices_strided_impl; + return fn; + } +}; + +// Non-zero + +class non_zero_indexes_krn; + +template +sycl::event non_zero_indexes_impl(sycl::queue exec_q, + py::ssize_t iter_size, + py::ssize_t nz_elems, + int nd, + const char *cumsum_cp, + char *indexes_cp, + const py::ssize_t *mask_shape, + std::vector const &depends) +{ + const indT1 *cumsum_data = reinterpret_cast(cumsum_cp); + indT2 *indexes_data = reinterpret_cast(indexes_cp); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.parallel_for( + sycl::range<1>(iter_size), [=](sycl::id<1> idx) { + auto i = idx[0]; + + auto cs_curr_val = cumsum_data[i] - 1; + auto cs_prev_val = (i > 0) ? cumsum_data[i - 1] : indT1(0); + bool cond = (cs_curr_val == cs_prev_val); + + py::ssize_t i_ = static_cast(i); + for (int dim = nd; --dim > 0;) { + auto sd = mask_shape[dim]; + py::ssize_t q = i_ / sd; + py::ssize_t r = (i_ - q * sd); + if (cond) { + indexes_data[cs_curr_val + dim * nz_elems] = + static_cast(r); + } + i_ = q; + } + if (cond) { + indexes_data[cs_curr_val] = static_cast(i_); + } + }); + }); + + return comp_ev; +} + +} // namespace indexing +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp new file mode 100644 index 0000000000..1534b38391 --- /dev/null +++ b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp @@ -0,0 +1,1085 @@ +//===-- boolean_advanced_indexing.cpp - --*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines implementation functions of dpctl.tensor.place and +/// dpctl.tensor.extract, dpctl.tensor.nonzero +//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include +#include +#include + +#include "boolean_advanced_indexing.hpp" +#include "kernels/boolean_advanced_indexing.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +struct sink_t +{ + sink_t(){}; + template sink_t(T &&){}; +}; + +template std::size_t accumulate_size(std::size_t &s, V &&v) +{ + return s += v.size(); +} + +template sink_t inserter(V &lhs, U &&rhs) +{ + lhs.insert(lhs.end(), rhs.begin(), rhs.end()); + return {}; +} + +template +std::vector concat(std::vector lhs, Vs &&... vs) +{ + std::size_t s = lhs.size(); + { + // limited scope ensures array is freed + [[maybe_unused]] sink_t tmp[] = {accumulate_size(s, vs)..., 0}; + } + lhs.reserve(s); + { + // array of no-data objects ensures ordering of calls to inserter + [[maybe_unused]] sink_t tmp[] = {inserter(lhs, std::forward(vs))..., + 0}; + } + + return std::move(lhs); // prevent return-value optimization +} + +template +std::tuple +device_allocate_and_pack(sycl::queue q, + std::vector &host_task_events, + Vs &&... vs) +{ + + // memory transfer optimization, use USM-host for temporary speeds up + // tranfer to device, especially on dGPUs + using usm_host_allocatorT = + sycl::usm_allocator; + using shT = std::vector; + + usm_host_allocatorT usm_host_allocator(q); + shT empty{0, usm_host_allocator}; + shT packed_shape_strides = concat(empty, vs...); + + auto packed_shape_strides_owner = + std::make_shared(std::move(packed_shape_strides)); + + auto sz = packed_shape_strides_owner->size(); + indT *shape_strides = sycl::malloc_device(sz, q); + + sycl::event copy_ev = + q.copy(packed_shape_strides_owner->data(), shape_strides, sz); + + sycl::event cleanup_host_task_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(copy_ev); + cgh.host_task([packed_shape_strides_owner] { + // increment shared pointer ref-count to keep it alive + // till copy operation completes; + }); + }); + host_task_events.push_back(cleanup_host_task_ev); + + return std::make_tuple(shape_strides, sz, copy_ev); +} + +/* @brief check for overlap of memory regions behind arrays. + +Presenty assume that array occupies all bytes between smallest and largest +displaced elements. + +TODO: Write proper Frobenius solver to account for holes, e.g. + overlap( x_contig[::2], x_contig[1::2]) should give False, + while this implementation gives True. 
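+
+For example (element counts made up for illustration): if ar1 and ar2 share
+the same base pointer, both have 8-byte elements, ar1 spans element offsets
+[0, 9] and ar2 spans [10, 19], then byte_distance == 0,
+x1_minus_y0 == 8 + 9*8 - 10*8 == 0 and y1_minus_x0 == 8 + 19*8 - 0 == 160,
+so (x1_minus_y0 > 0) fails and no overlap is reported.  Shifting ar2 to span
+[9, 18] gives x1_minus_y0 == 8 > 0 and y1_minus_x0 == 152 > 0, i.e. both
+regions address the bytes of element 9 and overlap is reported.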
+*/ +bool overlap(dpctl::tensor::usm_ndarray ar1, dpctl::tensor::usm_ndarray ar2) +{ + const char *ar1_data = ar1.get_data(); + + const auto &ar1_offsets = ar1.get_minmax_offsets(); + py::ssize_t ar1_elem_size = static_cast(ar1.get_elemsize()); + + const char *ar2_data = ar2.get_data(); + const auto &ar2_offsets = ar2.get_minmax_offsets(); + py::ssize_t ar2_elem_size = static_cast(ar2.get_elemsize()); + + /* Memory of array1 extends from */ + /* [ar1_data + ar1_offsets.first * ar1_elem_size, ar1_data + + * ar1_offsets.second * ar1_elem_size + ar1_elem_size] */ + /* Memory of array2 extends from */ + /* [ar2_data + ar2_offsets.first * ar2_elem_size, ar2_data + + * ar2_offsets.second * ar2_elem_size + ar2_elem_size] */ + + /* Intervals [x0, x1] and [y0, y1] do not overlap if (x0 <= x1) && (y0 <= + * y1) + * && (x1 <=y0 || y1 <= x0 ) */ + /* Given that x0 <= x1 and y0 <= y1 are true by construction, the condition + * for overlap us (x1 > y0) && (y1 > x0) */ + + /* Applying: + (ar1_data + ar1_offsets.second * ar1_elem_size + ar1_elem_size > + ar2_data + + ar2_offsets.first * ar2_elem_size) && (ar2_data + ar2_offsets.second * + ar2_elem_size + ar2_elem_size > ar1_data + ar1_offsets.first * + ar1_elem_size) + */ + + auto byte_distance = static_cast(ar2_data - ar1_data); + + py::ssize_t x1_minus_y0 = + (-byte_distance + + (ar1_elem_size + (ar1_offsets.second * ar1_elem_size) - + (ar2_offsets.first * ar2_elem_size))); + + py::ssize_t y1_minus_x0 = + (byte_distance + (ar2_elem_size + (ar2_offsets.second * ar2_elem_size) - + (ar1_offsets.first * ar1_elem_size))); + + bool memory_overlap = (x1_minus_y0 > 0) && (y1_minus_x0 > 0); + + return memory_overlap; +} + +/* @brief Split shape/strides into dir1 (complementary to axis_start <= i < + * axis_end) and dir2 (along given set of axes) + */ +template +void _split_iteration_space(const shT &shape_vec, + const shT &strides_vec, + int axis_start, + int axis_end, + shT &dir1_shape_vec, + shT &dir2_shape_vec, + shT &dir1_strides_vec, + shT &dir2_strides_vec) +{ + int nd = static_cast(shape_vec.size()); + int dir2_sz = axis_end - axis_start; + int dir1_sz = nd - dir2_sz; + + assert(dir1_sz > 0); + assert(dir2_sz > 0); + + dir1_shape_vec.resize(dir1_sz); + dir2_shape_vec.resize(dir2_sz); + + std::copy(shape_vec.begin(), shape_vec.begin() + axis_start, + dir1_shape_vec.begin()); + std::copy(shape_vec.begin() + axis_end, shape_vec.end(), + dir1_shape_vec.begin() + axis_start); + + std::copy(shape_vec.begin() + axis_start, shape_vec.begin() + axis_end, + dir2_shape_vec.begin()); + + dir1_strides_vec.resize(dir1_sz); + dir2_strides_vec.resize(dir2_sz); + + std::copy(strides_vec.begin(), strides_vec.begin() + axis_start, + dir1_strides_vec.begin()); + std::copy(strides_vec.begin() + axis_end, strides_vec.end(), + dir1_strides_vec.begin() + axis_start); + + std::copy(strides_vec.begin() + axis_start, strides_vec.begin() + axis_end, + dir2_strides_vec.begin()); + + return; +} + +// Computation of positions of masked elements + +using dpctl::tensor::kernels::indexing::mask_positions_contig_impl_fn_ptr_t; +static mask_positions_contig_impl_fn_ptr_t + mask_positions_contig_dispatch_vector[dpctl::tensor::detail::num_types]; + +using dpctl::tensor::kernels::indexing::mask_positions_strided_impl_fn_ptr_t; +static mask_positions_strided_impl_fn_ptr_t + mask_positions_strided_dispatch_vector[dpctl::tensor::detail::num_types]; + +void populate_mask_positions_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::indexing::MaskPositionsContigFactory; + 
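+    // DispatchVectorBuilder instantiates the implementation template once per
+    // supported element type and records the resulting function pointers in a
+    // vector indexed by type id.  At run time py_mask_positions selects the
+    // kernel with a single lookup, e.g.
+    //     auto fn = mask_positions_contig_dispatch_vector[mask_typeid];
+    //     return fn(exec_q, mask_size, mask_data, cumsum_data, depends);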
dpctl::tensor::detail::DispatchVectorBuilder< + mask_positions_contig_impl_fn_ptr_t, MaskPositionsContigFactory, + dpctl::tensor::detail::num_types> + dvb1; + dvb1.populate_dispatch_vector(mask_positions_contig_dispatch_vector); + + using dpctl::tensor::kernels::indexing::MaskPositionsStridedFactory; + dpctl::tensor::detail::DispatchVectorBuilder< + mask_positions_strided_impl_fn_ptr_t, MaskPositionsStridedFactory, + dpctl::tensor::detail::num_types> + dvb2; + dvb2.populate_dispatch_vector(mask_positions_strided_dispatch_vector); + + return; +} + +size_t py_mask_positions(dpctl::tensor::usm_ndarray mask, + dpctl::tensor::usm_ndarray cumsum, + sycl::queue exec_q, + std::vector const &depends) +{ + // cumsum is 1D + if (cumsum.get_ndim() != 1) { + throw py::value_error("Result array must be one-dimensional."); + } + + if (!cumsum.is_c_contiguous()) { + throw py::value_error("Expecting `cumsum` array must be C-contiguous."); + } + + // cumsum.shape == (mask.size,) + auto mask_size = mask.get_size(); + auto cumsum_size = cumsum.get_shape(0); + if (cumsum_size != mask_size) { + throw py::value_error("Inconsistent dimensions"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {mask, cumsum})) { + // FIXME: use ExecutionPlacementError + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if (mask_size == 0) { + return 0; + } + + int mask_typenum = mask.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + // mask can be any type + const char *mask_data = mask.get_data(); + char *cumsum_data = cumsum.get_data(); + + auto const &array_types = dpctl::tensor::detail::usm_ndarray_types(); + + int mask_typeid = array_types.typenum_to_lookup_id(mask_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + // cumsum must be int64_t only + constexpr int int64_typeid = + static_cast(dpctl::tensor::detail::typenum_t::INT64); + if (cumsum_typeid != int64_typeid) { + throw py::value_error( + "Cumulative sum array must have int64 data-type."); + } + + if (mask.is_c_contiguous()) { + auto fn = mask_positions_contig_dispatch_vector[mask_typeid]; + + return fn(exec_q, mask_size, mask_data, cumsum_data, depends); + } + + const py::ssize_t *shape = mask.get_shape_raw(); + const py::ssize_t *strides = mask.get_strides_raw(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_strides; + py::ssize_t offset(0); + + int mask_nd = mask.get_ndim(); + int nd = mask_nd; + + constexpr py::ssize_t itemsize = 1; // in elements + bool is_c_contig = mask.is_c_contiguous(); + bool is_f_contig = mask.is_f_contiguous(); + + dpctl::tensor::py_internal::simplify_iteration_space_1( + nd, shape, strides, itemsize, is_c_contig, is_f_contig, + simplified_shape, simplified_strides, offset); + + if (nd == 1 && simplified_strides[0] == 1) { + auto fn = mask_positions_contig_dispatch_vector[mask_typeid]; + + return fn(exec_q, mask_size, mask_data, cumsum_data, depends); + } + + // Strided implementation + auto strided_fn = mask_positions_strided_dispatch_vector[mask_typeid]; + std::vector host_task_events; + + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_strides); + py::ssize_t *shape_strides = std::get<0>(ptr_size_event_tuple); + sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + + if (2 * static_cast(nd) != std::get<1>(ptr_size_event_tuple)) { + copy_shape_ev.wait(); + sycl::event::wait(host_task_events); + sycl::free(shape_strides, exec_q); + throw 
std::runtime_error("Unexacted error"); + } + + std::vector dependent_events; + dependent_events.reserve(depends.size() + 1); + dependent_events.insert(dependent_events.end(), copy_shape_ev); + dependent_events.insert(dependent_events.end(), depends.begin(), + depends.end()); + + size_t total_set = strided_fn(exec_q, mask_size, mask_data, nd, offset, + shape_strides, cumsum_data, dependent_events); + + sycl::event::wait(host_task_events); + sycl::free(shape_strides, exec_q); + + return total_set; +} + +// Masked extraction + +using dpctl::tensor::kernels::indexing:: + masked_extract_all_slices_strided_impl_fn_ptr_t; + +static masked_extract_all_slices_strided_impl_fn_ptr_t + masked_extract_all_slices_strided_impl_dispatch_vector + [dpctl::tensor::detail::num_types]; + +using dpctl::tensor::kernels::indexing:: + masked_extract_some_slices_strided_impl_fn_ptr_t; + +static masked_extract_some_slices_strided_impl_fn_ptr_t + masked_extract_some_slices_strided_impl_dispatch_vector + [dpctl::tensor::detail::num_types]; + +void populate_masked_extract_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::indexing::MaskExtractAllSlicesStridedFactory; + dpctl::tensor::detail::DispatchVectorBuilder< + masked_extract_all_slices_strided_impl_fn_ptr_t, + MaskExtractAllSlicesStridedFactory, dpctl::tensor::detail::num_types> + dvb1; + dvb1.populate_dispatch_vector( + masked_extract_all_slices_strided_impl_dispatch_vector); + + using dpctl::tensor::kernels::indexing::MaskExtractSomeSlicesStridedFactory; + dpctl::tensor::detail::DispatchVectorBuilder< + masked_extract_some_slices_strided_impl_fn_ptr_t, + MaskExtractSomeSlicesStridedFactory, dpctl::tensor::detail::num_types> + dvb2; + dvb2.populate_dispatch_vector( + masked_extract_some_slices_strided_impl_dispatch_vector); +} + +std::pair +py_extract(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray cumsum, + int axis_start, // axis_start <= mask_i < axis_end + int axis_end, + dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + std::vector const &depends) +{ + int src_nd = src.get_ndim(); + if ((axis_start < 0 || axis_end > src_nd || axis_start >= axis_end)) { + throw py::value_error("Specified axes_start and axes_end are invalid."); + } + int mask_span_sz = axis_end - axis_start; + + int dst_nd = dst.get_ndim(); + if (src_nd != dst_nd + (mask_span_sz - 1)) { + throw py::value_error("Number of dimensions of source and destination " + "arrays is not consistent"); + } + + if (!cumsum.is_c_contiguous() || cumsum.get_ndim() != 1) { + throw py::value_error("cumsum array must be a C-contiguous vector"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, cumsum, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + py::ssize_t cumsum_sz = cumsum.get_size(); + + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + bool same_ortho_dims(true); + size_t ortho_nelems(1); // number of orthogonal iterations + + for (auto i = 0; i < axis_start; ++i) { + auto src_sh_i = src_shape[i]; + ortho_nelems *= src_sh_i; + same_ortho_dims = same_ortho_dims && (src_sh_i == dst_shape[i]); + } + for (auto i = axis_end; i < src_nd; ++i) { + auto src_sh_i = src_shape[i]; + ortho_nelems *= src_sh_i; + same_ortho_dims = + same_ortho_dims && (src_sh_i == dst_shape[i - (mask_span_sz - 1)]); + } + + size_t masked_src_nelems(1); + size_t masked_dst_nelems(dst_shape[axis_start]); + for (auto i = axis_start; i < axis_end; ++i) { + masked_src_nelems *= 
src_shape[i]; + } + + // masked_dst_nelems is number of set elements in the mask, or last element + // in cumsum + if (!same_ortho_dims || + (masked_src_nelems != static_cast(cumsum_sz))) { + throw py::value_error("Inconsistent array dimensions"); + } + + // ensure that dst is sufficiently ample + auto dst_offsets = dst.get_minmax_offsets(); + // destination must be ample enough to accomodate all elements + { + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < static_cast(ortho_nelems * masked_dst_nelems)) { + throw py::value_error( + "Memory addressed by the destination array can not " + "accomodate all the " + "array elements."); + } + } + + // check that dst does not intersect with src, not with cumsum. + if (overlap(dst, cumsum) || overlap(dst, src)) { + throw py::value_error("Destination array overlaps with inputs"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + auto const &array_types = dpctl::tensor::detail::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + constexpr int int64_typeid = + static_cast(dpctl::tensor::detail::typenum_t::INT64); + if (cumsum_typeid != int64_typeid) { + throw py::value_error( + "Unexact data type of cumsum array, expecting 'int64'"); + } + + if (src_typeid != dst_typeid) { + throw py::value_error( + "Destination array must have the same elemental data types"); + } + + char *src_data_p = src.get_data(); + char *dst_data_p = dst.get_data(); + char *cumsum_data_p = cumsum.get_data(); + + auto src_shape_vec = src.get_shape_vector(); + auto src_strides_vec = src.get_strides_vector(); + + auto dst_shape_vec = dst.get_shape_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + sycl::event extract_ev; + std::vector host_task_events{}; + if (axis_start == 0 && axis_end == src_nd) { + // empty orthogonal directions + auto fn = + masked_extract_all_slices_strided_impl_dispatch_vector[src_typeid]; + + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + py::ssize_t *packed_src_shape_strides = + std::get<0>(ptr_size_event_tuple1); + sycl::event copy_src_shape_strides_ev = + std::get<2>(ptr_size_event_tuple1); + + assert(dst_shape_vec.size() == 1); + assert(dst_strides_vec.size() == 1); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_src_shape_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + extract_ev = fn(exec_q, cumsum_sz, src_data_p, cumsum_data_p, + dst_data_p, src_nd, packed_src_shape_strides, + dst_shape_vec[0], dst_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(extract_ev); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, packed_src_shape_strides] { + sycl::free(packed_src_shape_strides, ctx); + }); + }); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + else { + // non-empty othogonal directions + auto fn = + masked_extract_some_slices_strided_impl_dispatch_vector[src_typeid]; + + int masked_src_nd = mask_span_sz; + int ortho_nd = src_nd - masked_src_nd; + + using shT = std::vector; + + shT ortho_src_shape; + shT masked_src_shape; + shT ortho_src_strides; 
+ shT masked_src_strides; + _split_iteration_space(src_shape_vec, src_strides_vec, axis_start, + axis_end, ortho_src_shape, + masked_src_shape, // 4 vectors modified + ortho_src_strides, masked_src_strides); + + shT ortho_dst_shape; + shT masked_dst_shape; + shT ortho_dst_strides; + shT masked_dst_strides; + _split_iteration_space(dst_shape_vec, dst_strides_vec, axis_start, + axis_start + 1, ortho_dst_shape, + masked_dst_shape, // 4 vectors modified + ortho_dst_strides, masked_dst_strides); + + assert(ortho_src_shape.size() == static_cast(ortho_nd)); + assert(ortho_dst_shape.size() == static_cast(ortho_nd)); + assert(std::equal(ortho_src_shape.begin(), ortho_src_shape.end(), + ortho_dst_shape.begin())); + + std::vector simplified_ortho_shape; + std::vector simplified_ortho_src_strides; + std::vector simplified_ortho_dst_strides; + + const py::ssize_t *_shape = ortho_src_shape.data(); + const py::ssize_t *_src_strides = ortho_src_strides.data(); + const py::ssize_t *_dst_strides = ortho_dst_strides.data(); + constexpr py::ssize_t _itemsize = 1; // in elements + + constexpr bool is_c_contig = false; + constexpr bool is_f_contig = false; + + py::ssize_t ortho_src_offset(0); + py::ssize_t ortho_dst_offset(0); + + dpctl::tensor::py_internal::simplify_iteration_space( + ortho_nd, _shape, _src_strides, _itemsize, is_c_contig, is_f_contig, + _dst_strides, _itemsize, is_c_contig, is_f_contig, + simplified_ortho_shape, simplified_ortho_src_strides, + simplified_ortho_dst_strides, ortho_src_offset, ortho_dst_offset); + + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, simplified_ortho_shape, + simplified_ortho_src_strides, simplified_ortho_dst_strides); + py::ssize_t *packed_ortho_src_dst_shape_strides = + std::get<0>(ptr_size_event_tuple1); + sycl::event copy_shape_strides_ev1 = std::get<2>(ptr_size_event_tuple1); + + auto ptr_size_event_tuple2 = device_allocate_and_pack( + exec_q, host_task_events, masked_src_shape, masked_src_strides); + py::ssize_t *packed_masked_src_shape_strides = + std::get<0>(ptr_size_event_tuple2); + sycl::event copy_shape_strides_ev2 = std::get<2>(ptr_size_event_tuple2); + + assert(masked_dst_shape.size() == 1); + assert(masked_dst_strides.size() == 1); + + std::vector all_deps; + all_deps.reserve(depends.size() + 2); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shape_strides_ev1); + all_deps.push_back(copy_shape_strides_ev2); + + assert(all_deps.size() == depends.size() + 2); + + // OrthogIndexerT orthog_src_dst_indexer_, MaskedIndexerT + // masked_src_indexer_, MaskedIndexerT masked_dst_indexer_ + extract_ev = fn(exec_q, ortho_nelems, masked_src_nelems, src_data_p, + cumsum_data_p, dst_data_p, + // data to build orthog_src_dst_indexer + ortho_nd, packed_ortho_src_dst_shape_strides, + ortho_src_offset, ortho_dst_offset, + // data to build masked_src_indexer + masked_src_nd, packed_masked_src_shape_strides, + // data to build masked_dst_indexer, + masked_dst_shape[0], masked_dst_strides[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(extract_ev); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, packed_ortho_src_dst_shape_strides, + packed_masked_src_shape_strides] { + sycl::free(packed_ortho_src_dst_shape_strides, ctx); + sycl::free(packed_masked_src_shape_strides, ctx); + }); + }); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + + host_task_events.push_back(extract_ev); + + sycl::event 
py_obj_management_host_task_ev = dpctl::utils::keep_args_alive( + exec_q, {src, cumsum, dst}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, extract_ev); +} + +// Masked placement + +using dpctl::tensor::kernels::indexing:: + masked_place_all_slices_strided_impl_fn_ptr_t; + +static masked_place_all_slices_strided_impl_fn_ptr_t + masked_place_all_slices_strided_impl_dispatch_vector + [dpctl::tensor::detail::num_types]; + +using dpctl::tensor::kernels::indexing:: + masked_place_some_slices_strided_impl_fn_ptr_t; + +static masked_place_some_slices_strided_impl_fn_ptr_t + masked_place_some_slices_strided_impl_dispatch_vector + [dpctl::tensor::detail::num_types]; + +void populate_masked_place_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::indexing::MaskPlaceAllSlicesStridedFactory; + dpctl::tensor::detail::DispatchVectorBuilder< + masked_place_all_slices_strided_impl_fn_ptr_t, + MaskPlaceAllSlicesStridedFactory, dpctl::tensor::detail::num_types> + dvb1; + dvb1.populate_dispatch_vector( + masked_place_all_slices_strided_impl_dispatch_vector); + + using dpctl::tensor::kernels::indexing::MaskPlaceSomeSlicesStridedFactory; + dpctl::tensor::detail::DispatchVectorBuilder< + masked_place_some_slices_strided_impl_fn_ptr_t, + MaskPlaceSomeSlicesStridedFactory, dpctl::tensor::detail::num_types> + dvb2; + dvb2.populate_dispatch_vector( + masked_place_some_slices_strided_impl_dispatch_vector); +} + +/* + * @brief Copy dst[i, ortho_id] = rhs[cumsum[i] - 1, ortho_id] if cumsum[i] == + * ((i > 0) ? cumsum[i-1] + 1 : 1) + */ +std::pair +py_place(dpctl::tensor::usm_ndarray dst, + dpctl::tensor::usm_ndarray cumsum, + int axis_start, // axis_start <= mask_i < axis_end + int axis_end, + dpctl::tensor::usm_ndarray rhs, + sycl::queue exec_q, + std::vector const &depends) +{ + int dst_nd = dst.get_ndim(); + if ((axis_start < 0 || axis_end > dst_nd || axis_start >= axis_end)) { + throw py::value_error("Specified axes_start and axes_end are invalid."); + } + int mask_span_sz = axis_end - axis_start; + + int rhs_nd = rhs.get_ndim(); + if (dst_nd != rhs_nd + (mask_span_sz - 1)) { + throw py::value_error("Number of dimensions of source and destination " + "arrays is not consistent"); + } + + if (!cumsum.is_c_contiguous() || cumsum.get_ndim() != 1) { + throw py::value_error("cumsum array must be a C-contiguous vector"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst, cumsum, rhs})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + py::ssize_t cumsum_sz = cumsum.get_size(); + + const py::ssize_t *dst_shape = dst.get_shape_raw(); + const py::ssize_t *rhs_shape = rhs.get_shape_raw(); + bool same_ortho_dims(true); + size_t ortho_nelems(1); // number of orthogonal iterations + + for (auto i = 0; i < axis_start; ++i) { + auto dst_sh_i = dst_shape[i]; + ortho_nelems *= dst_sh_i; + same_ortho_dims = same_ortho_dims && (dst_sh_i == rhs_shape[i]); + } + for (auto i = axis_end; i < dst_nd; ++i) { + auto dst_sh_i = dst_shape[i]; + ortho_nelems *= dst_sh_i; + same_ortho_dims = + same_ortho_dims && (dst_sh_i == rhs_shape[i - (mask_span_sz - 1)]); + } + + size_t masked_dst_nelems(1); + for (auto i = axis_start; i < axis_end; ++i) { + masked_dst_nelems *= dst_shape[i]; + } + + if (!same_ortho_dims || + (masked_dst_nelems != static_cast(cumsum_sz))) { + throw py::value_error("Inconsistent array dimensions"); + } + + // ensure that dst is sufficiently ample + auto dst_offsets = dst.get_minmax_offsets(); + // destination must be ample 
enough to accomodate all elements + { + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < static_cast(ortho_nelems * masked_dst_nelems)) { + throw py::value_error( + "Memory addressed by the destination array can not " + "accomodate all the " + "array elements."); + } + } + + // check that dst does not intersect with src, not with cumsum. + if (overlap(dst, rhs) || overlap(dst, cumsum)) { + throw py::value_error("Destination array overlaps with inputs"); + } + + int dst_typenum = dst.get_typenum(); + int rhs_typenum = rhs.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + auto const &array_types = dpctl::tensor::detail::usm_ndarray_types(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + int rhs_typeid = array_types.typenum_to_lookup_id(rhs_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + constexpr int int64_typeid = + static_cast(dpctl::tensor::detail::typenum_t::INT64); + if (cumsum_typeid != int64_typeid) { + throw py::value_error( + "Unexact data type of cumsum array, expecting 'int64'"); + } + + // FIXME: should types be the same? + if (dst_typeid != rhs_typeid) { + throw py::value_error( + "Destination array must have the same elemental data types"); + } + + char *dst_data_p = dst.get_data(); + char *rhs_data_p = rhs.get_data(); + char *cumsum_data_p = cumsum.get_data(); + + auto dst_shape_vec = dst.get_shape_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + auto rhs_shape_vec = rhs.get_shape_vector(); + auto rhs_strides_vec = rhs.get_strides_vector(); + + sycl::event extract_ev; + std::vector host_task_events{}; + if (axis_start == 0 && axis_end == dst_nd) { + // empty orthogonal directions + auto fn = + masked_place_all_slices_strided_impl_dispatch_vector[dst_typeid]; + + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, dst_shape_vec, dst_strides_vec); + py::ssize_t *packed_dst_shape_strides = + std::get<0>(ptr_size_event_tuple1); + sycl::event copy_dst_shape_strides_ev = + std::get<2>(ptr_size_event_tuple1); + + assert(rhs_shape_vec.size() == 1); + assert(rhs_strides_vec.size() == 1); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_dst_shape_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + extract_ev = fn(exec_q, cumsum_sz, dst_data_p, cumsum_data_p, + rhs_data_p, dst_nd, packed_dst_shape_strides, + rhs_shape_vec[0], rhs_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(extract_ev); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, packed_dst_shape_strides] { + sycl::free(packed_dst_shape_strides, ctx); + }); + }); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + else { + // non-empty othogonal directions + auto fn = + masked_place_some_slices_strided_impl_dispatch_vector[dst_typeid]; + + int masked_dst_nd = mask_span_sz; + int ortho_nd = dst_nd - masked_dst_nd; + + using shT = std::vector; + + shT ortho_dst_shape; + shT masked_dst_shape; + shT ortho_dst_strides; + shT masked_dst_strides; + _split_iteration_space(dst_shape_vec, dst_strides_vec, axis_start, + axis_end, ortho_dst_shape, + masked_dst_shape, // 4 vectors modified + ortho_dst_strides, masked_dst_strides); + + shT ortho_rhs_shape; + shT masked_rhs_shape; + shT ortho_rhs_strides; + shT masked_rhs_strides; + 
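+        // rhs has a single dimension in place of the masked span of dst
+        // (rhs_nd == dst_nd - (mask_span_sz - 1)), so its iteration space is
+        // split at [axis_start, axis_start + 1); everything else is treated
+        // as orthogonal directions.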
_split_iteration_space(rhs_shape_vec, rhs_strides_vec, axis_start, + axis_start + 1, ortho_rhs_shape, + masked_rhs_shape, // 4 vectors modified + ortho_rhs_strides, masked_rhs_strides); + + assert(ortho_dst_shape.size() == static_cast(ortho_nd)); + assert(ortho_rhs_shape.size() == static_cast(ortho_nd)); + assert(std::equal(ortho_dst_shape.begin(), ortho_dst_shape.end(), + ortho_rhs_shape.begin())); + + std::vector simplified_ortho_shape; + std::vector simplified_ortho_dst_strides; + std::vector simplified_ortho_rhs_strides; + + const py::ssize_t *_shape = ortho_dst_shape.data(); + const py::ssize_t *_dst_strides = ortho_dst_strides.data(); + const py::ssize_t *_rhs_strides = ortho_rhs_strides.data(); + constexpr py::ssize_t _itemsize = 1; // in elements + + constexpr bool is_c_contig = false; + constexpr bool is_f_contig = false; + + py::ssize_t ortho_dst_offset(0); + py::ssize_t ortho_rhs_offset(0); + + dpctl::tensor::py_internal::simplify_iteration_space( + ortho_nd, _shape, _dst_strides, _itemsize, is_c_contig, is_f_contig, + _rhs_strides, _itemsize, is_c_contig, is_f_contig, + simplified_ortho_shape, simplified_ortho_dst_strides, + simplified_ortho_rhs_strides, ortho_dst_offset, ortho_rhs_offset); + + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, simplified_ortho_shape, + simplified_ortho_dst_strides, simplified_ortho_rhs_strides); + py::ssize_t *packed_ortho_dst_rhs_shape_strides = + std::get<0>(ptr_size_event_tuple1); + sycl::event copy_shape_strides_ev1 = std::get<2>(ptr_size_event_tuple1); + + auto ptr_size_event_tuple2 = device_allocate_and_pack( + exec_q, host_task_events, masked_dst_shape, masked_dst_strides); + py::ssize_t *packed_masked_dst_shape_strides = + std::get<0>(ptr_size_event_tuple2); + sycl::event copy_shape_strides_ev2 = std::get<2>(ptr_size_event_tuple2); + + assert(masked_rhs_shape.size() == 1); + assert(masked_rhs_strides.size() == 1); + + std::vector all_deps; + all_deps.reserve(depends.size() + 2); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shape_strides_ev1); + all_deps.push_back(copy_shape_strides_ev2); + + assert(all_deps.size() == depends.size() + 2); + + extract_ev = fn(exec_q, ortho_nelems, masked_dst_nelems, dst_data_p, + cumsum_data_p, rhs_data_p, + // data to build orthog_dst_rhs_indexer + ortho_nd, packed_ortho_dst_rhs_shape_strides, + ortho_dst_offset, ortho_rhs_offset, + // data to build masked_dst_indexer + masked_dst_nd, packed_masked_dst_shape_strides, + // data to build masked_dst_indexer, + masked_rhs_shape[0], masked_rhs_strides[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(extract_ev); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, packed_ortho_dst_rhs_shape_strides, + packed_masked_dst_shape_strides] { + sycl::free(packed_ortho_dst_rhs_shape_strides, ctx); + sycl::free(packed_masked_dst_shape_strides, ctx); + }); + }); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + + host_task_events.push_back(extract_ev); + + sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive( + exec_q, {dst, cumsum, rhs}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, extract_ev); +} + +// Non-zero + +std::pair py_nonzero( + dpctl::tensor::usm_ndarray cumsum, // int64 input array, 1D, C-contiguous + dpctl::tensor::usm_ndarray indexes, // int64 2D output array, C-contiguous + std::vector + mask_shape, // shape of array from which cumsum 
was computed + sycl::queue exec_q, + std::vector const &depends) +{ + if (!dpctl::utils::queues_are_compatible(exec_q, {cumsum, indexes})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + int cumsum_nd = cumsum.get_ndim(); + if (cumsum_nd != 1 || !cumsum.is_c_contiguous()) { + throw py::value_error("Cumsum array must be a C-contiguous vector"); + } + + int indexes_nd = indexes.get_ndim(); + if (indexes_nd != 2 || !indexes.is_c_contiguous()) { + throw py::value_error("Index array must be a C-contiguous matrix"); + } + + size_t _ndim = mask_shape.size(); + if (_ndim > std::numeric_limits::max()) { + throw py::value_error("Shape is too large"); + } + int ndim = static_cast(_ndim); + + const py::ssize_t *indexes_shape = indexes.get_shape_raw(); + + if (ndim != indexes_shape[0]) { + throw py::value_error( + "Length of shape must equal width of index matrix"); + } + + auto cumsum_sz = cumsum.get_size(); + py::ssize_t shape_nelems = + std::accumulate(mask_shape.begin(), mask_shape.end(), py::ssize_t(1), + std::multiplies()); + + if (cumsum_sz != shape_nelems) { + throw py::value_error("Shape and cumsum size are not constent"); + } + + py::ssize_t nz_elems = indexes_shape[1]; + + int indexes_typenum = indexes.get_typenum(); + auto const &array_types = dpctl::tensor::detail::usm_ndarray_types(); + int indexes_typeid = array_types.typenum_to_lookup_id(indexes_typenum); + + int cumsum_typenum = cumsum.get_typenum(); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + // cumsum must be int64_t only + constexpr int int64_typeid = + static_cast(dpctl::tensor::detail::typenum_t::INT64); + if (cumsum_typeid != int64_typeid || indexes_typeid != int64_typeid) { + throw py::value_error( + "Cumulative sum array and index array must have int64 data-type"); + } + + if (cumsum_sz == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + if (overlap(cumsum, indexes)) { + throw py::value_error("Arrays are expected to ave no memory overlap"); + } + + // ensure that dst is sufficiently ample + auto indexes_offsets = indexes.get_minmax_offsets(); + // destination must be ample enough to accomodate all elements + { + size_t range = + static_cast(indexes_offsets.second - indexes_offsets.first); + if (range + 1 < static_cast(nz_elems * _ndim)) { + throw py::value_error( + "Memory addressed by the destination array can not " + "accomodate all the array elements."); + } + } + + std::vector host_task_events; + host_task_events.reserve(2); + + auto mask_shape_copying_tuple = device_allocate_and_pack( + exec_q, host_task_events, mask_shape); + py::ssize_t *src_shape_device_ptr = std::get<0>(mask_shape_copying_tuple); + sycl::event copy_ev = std::get<2>(mask_shape_copying_tuple); + + if (src_shape_device_ptr == nullptr) { + sycl::event::wait(host_task_events); + throw std::runtime_error("Device allocation failed"); + } + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_ev); + + using dpctl::tensor::kernels::indexing::non_zero_indexes_impl; + + sycl::event non_zero_indexes_ev = + non_zero_indexes_impl( + exec_q, cumsum_sz, nz_elems, ndim, cumsum.get_data(), + indexes.get_data(), src_shape_device_ptr, all_deps); + + sycl::event temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(non_zero_indexes_ev); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, src_shape_device_ptr] { + 
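+            // runs on the host only after non_zero_indexes_ev has completed,
+            // so the packed shape buffer is released once the kernel that
+            // reads it is done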
sycl::free(src_shape_device_ptr, ctx); + }); + }); + host_task_events.push_back(temporaries_cleanup_ev); + + sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive( + exec_q, {cumsum, indexes}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, + temporaries_cleanup_ev); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.hpp new file mode 100644 index 0000000000..f165fe5118 --- /dev/null +++ b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.hpp @@ -0,0 +1,84 @@ +//===-- boolean_advanced_indexing.hpp - --*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file declares Python API for implementation functions of +/// dpctl.tensor.place, dpctl.tensor.extract, and dpctl.tensor.nonzero +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void populate_mask_positions_dispatch_vectors(void); + +extern size_t py_mask_positions(dpctl::tensor::usm_ndarray mask, + dpctl::tensor::usm_ndarray cumsum, + sycl::queue exec_q, + std::vector const &depends = {}); + +extern std::pair +py_extract(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray cumsum, + int axis_start, // axis_start <= mask_i < axis_end + int axis_end, + dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + std::vector const &depends = {}); + +extern void populate_masked_extract_dispatch_vectors(void); + +extern std::pair +py_place(dpctl::tensor::usm_ndarray dst, + dpctl::tensor::usm_ndarray cumsum, + int axis_start, // axis_start <= mask_i < axis_end + int axis_end, + dpctl::tensor::usm_ndarray rhs, + sycl::queue exec_q, + std::vector const &depends = {}); + +extern void populate_masked_place_dispatch_vectors(void); + +extern std::pair py_nonzero( + dpctl::tensor::usm_ndarray cumsum, // int64 input array, 1D, C-contiguous + dpctl::tensor::usm_ndarray indexes, // int64 2D output array, C-contiguous + std::vector + mask_shape, // shape of array from which cumsum was computed + sycl::queue exec_q, + std::vector const &depends = {}); + +/* @brief Check if memory regions underlying two arrays have an overlap */ +extern bool overlap(dpctl::tensor::usm_ndarray ar1, + dpctl::tensor::usm_ndarray ar2); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp b/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp index be4a35fb90..7eb7c8f8d6 100644 --- a/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp +++ 
b/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp @@ -39,6 +39,86 @@ namespace py = pybind11; using dpctl::tensor::c_contiguous_strides; using dpctl::tensor::f_contiguous_strides; +void simplify_iteration_space_1(int &nd, + const py::ssize_t *&shape, + const py::ssize_t *&strides, + py::ssize_t itemsize, + bool is_c_contig, + bool is_f_contig, + std::vector &simplified_shape, + std::vector &simplified_strides, + py::ssize_t &offset) +{ + if (nd > 1) { + // Simplify iteration space to reduce dimensionality + // and improve access pattern + simplified_shape.reserve(nd); + for (int i = 0; i < nd; ++i) { + simplified_shape.push_back(shape[i]); + } + + simplified_strides.reserve(nd); + if (strides == nullptr) { + if (is_c_contig) { + simplified_strides = c_contiguous_strides(nd, shape, itemsize); + } + else if (is_f_contig) { + simplified_strides = f_contiguous_strides(nd, shape, itemsize); + } + else { + throw std::runtime_error( + "Array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + for (int i = 0; i < nd; ++i) { + simplified_strides.push_back(strides[i]); + } + } + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_strides.size() == static_cast(nd)); + int contracted_nd = simplify_iteration_stride( + nd, simplified_shape.data(), simplified_strides.data(), + offset // modified by reference + ); + simplified_shape.resize(contracted_nd); + simplified_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + // Populate vectors + simplified_shape.reserve(nd); + simplified_shape.push_back(shape[0]); + + simplified_strides.reserve(nd); + + if (strides == nullptr) { + if (is_c_contig) { + simplified_strides.push_back(itemsize); + } + else if (is_f_contig) { + simplified_strides.push_back(itemsize); + } + else { + throw std::runtime_error( + "Array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + simplified_strides.push_back(strides[0]); + } + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_strides.size() == static_cast(nd)); + } + shape = const_cast(simplified_shape.data()); + strides = const_cast(simplified_strides.data()); +} + void simplify_iteration_space(int &nd, const py::ssize_t *&shape, const py::ssize_t *&src_strides, @@ -173,6 +253,195 @@ void simplify_iteration_space(int &nd, const_cast(simplified_dst_strides.data()); } +void simplify_iteration_space_3( + int &nd, + const py::ssize_t *&shape, + // src1 + const py::ssize_t *&src1_strides, + py::ssize_t src1_itemsize, + bool is_src1_c_contig, + bool is_src1_f_contig, + // src2 + const py::ssize_t *&src2_strides, + py::ssize_t src2_itemsize, + bool is_src2_c_contig, + bool is_src2_f_contig, + // dst + const py::ssize_t *&dst_strides, + py::ssize_t dst_itemsize, + bool is_dst_c_contig, + bool is_dst_f_contig, + // output + std::vector &simplified_shape, + std::vector &simplified_src1_strides, + std::vector &simplified_src2_strides, + std::vector &simplified_dst_strides, + py::ssize_t &src1_offset, + py::ssize_t &src2_offset, + py::ssize_t &dst_offset) +{ + if (nd > 1) { + // Simplify iteration space to reduce dimensionality + // and improve access pattern + simplified_shape.reserve(nd); + for (int i = 0; i < nd; ++i) { + simplified_shape.push_back(shape[i]); + } + + simplified_src1_strides.reserve(nd); + simplified_src2_strides.reserve(nd); + simplified_dst_strides.reserve(nd); + if (src1_strides == nullptr) { + if (is_src1_c_contig) { + simplified_src1_strides = + 
c_contiguous_strides(nd, shape, src1_itemsize); + } + else if (is_src1_f_contig) { + simplified_src1_strides = + f_contiguous_strides(nd, shape, src1_itemsize); + } + else { + throw std::runtime_error( + "Source array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + for (int i = 0; i < nd; ++i) { + simplified_src1_strides.push_back(src1_strides[i]); + } + } + if (src2_strides == nullptr) { + if (is_src2_c_contig) { + simplified_src2_strides = + c_contiguous_strides(nd, shape, src2_itemsize); + } + else if (is_src2_f_contig) { + simplified_src2_strides = + f_contiguous_strides(nd, shape, src2_itemsize); + } + else { + throw std::runtime_error( + "Source array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + for (int i = 0; i < nd; ++i) { + simplified_src2_strides.push_back(src2_strides[i]); + } + } + if (dst_strides == nullptr) { + if (is_dst_c_contig) { + simplified_dst_strides = + c_contiguous_strides(nd, shape, dst_itemsize); + } + else if (is_dst_f_contig) { + simplified_dst_strides = + f_contiguous_strides(nd, shape, dst_itemsize); + } + else { + throw std::runtime_error( + "Destination array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + for (int i = 0; i < nd; ++i) { + simplified_dst_strides.push_back(dst_strides[i]); + } + } + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_src1_strides.size() == static_cast(nd)); + assert(simplified_src2_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + int contracted_nd = simplify_iteration_three_strides( + nd, simplified_shape.data(), simplified_src1_strides.data(), + simplified_src2_strides.data(), simplified_dst_strides.data(), + src1_offset, // modified by reference + src2_offset, // modified by reference + dst_offset // modified by reference + ); + simplified_shape.resize(contracted_nd); + simplified_src1_strides.resize(contracted_nd); + simplified_src2_strides.resize(contracted_nd); + simplified_dst_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + // Populate vectors + simplified_shape.reserve(nd); + simplified_shape.push_back(shape[0]); + + simplified_src1_strides.reserve(nd); + simplified_src2_strides.reserve(nd); + simplified_dst_strides.reserve(nd); + + if (src1_strides == nullptr) { + if (is_src1_c_contig) { + simplified_src1_strides.push_back(src1_itemsize); + } + else if (is_src1_f_contig) { + simplified_src1_strides.push_back(src1_itemsize); + } + else { + throw std::runtime_error( + "Source array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + simplified_src1_strides.push_back(src1_strides[0]); + } + if (src2_strides == nullptr) { + if (is_src2_c_contig) { + simplified_src2_strides.push_back(src2_itemsize); + } + else if (is_src2_f_contig) { + simplified_src2_strides.push_back(src2_itemsize); + } + else { + throw std::runtime_error( + "Source array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + simplified_src2_strides.push_back(src2_strides[0]); + } + if (dst_strides == nullptr) { + if (is_dst_c_contig) { + simplified_dst_strides.push_back(dst_itemsize); + } + else if (is_dst_f_contig) { + simplified_dst_strides.push_back(dst_itemsize); + } + else { + throw std::runtime_error( + "Destination array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + 
simplified_dst_strides.push_back(dst_strides[0]); + } + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_src1_strides.size() == static_cast(nd)); + assert(simplified_src2_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + } + shape = const_cast(simplified_shape.data()); + src1_strides = + const_cast(simplified_src1_strides.data()); + src2_strides = + const_cast(simplified_src2_strides.data()); + dst_strides = + const_cast(simplified_dst_strides.data()); +} + } // namespace py_internal } // namespace tensor } // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp b/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp index 515e795d20..ec0cc286d4 100644 --- a/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp +++ b/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp @@ -36,6 +36,16 @@ namespace py_internal namespace py = pybind11; +void simplify_iteration_space_1(int &, + const py::ssize_t *&, + const py::ssize_t *&, + py::ssize_t, + bool, + bool, + std::vector &, + std::vector &, + py::ssize_t &); + void simplify_iteration_space(int &, const py::ssize_t *&, const py::ssize_t *&, @@ -52,6 +62,32 @@ void simplify_iteration_space(int &, py::ssize_t &, py::ssize_t &); +void simplify_iteration_space_3(int &, + const py::ssize_t *&, + // src1 + const py::ssize_t *&, + py::ssize_t, + bool, + bool, + // src2 + const py::ssize_t *&, + py::ssize_t, + bool, + bool, + // dst + const py::ssize_t *&, + py::ssize_t, + bool, + bool, + // output + std::vector &, + std::vector &, + std::vector &, + std::vector &, + py::ssize_t &, + py::ssize_t &, + py::ssize_t &); + } // namespace py_internal } // namespace tensor } // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp index e164be2421..2e9e981a37 100644 --- a/dpctl/tensor/libtensor/source/tensor_py.cpp +++ b/dpctl/tensor/libtensor/source/tensor_py.cpp @@ -33,6 +33,7 @@ #include "dpctl4pybind11.hpp" +#include "boolean_advanced_indexing.hpp" #include "copy_and_cast_usm_to_usm.hpp" #include "copy_for_reshape.hpp" #include "copy_numpy_ndarray_into_usm_ndarray.hpp" @@ -75,6 +76,12 @@ using dpctl::tensor::py_internal::usm_ndarray_full; using dpctl::tensor::py_internal::usm_ndarray_put; using dpctl::tensor::py_internal::usm_ndarray_take; +using dpctl::tensor::py_internal::overlap; +using dpctl::tensor::py_internal::py_extract; +using dpctl::tensor::py_internal::py_mask_positions; +using dpctl::tensor::py_internal::py_nonzero; +using dpctl::tensor::py_internal::py_place; + /* ================ Eye ================== */ using dpctl::tensor::py_internal::usm_ndarray_eye; @@ -105,6 +112,10 @@ void init_dispatch_vectors(void) init_eye_ctor_dispatch_vectors(); init_triul_ctor_dispatch_vectors(); + populate_mask_positions_dispatch_vectors(); + populate_masked_extract_dispatch_vectors(); + populate_masked_place_dispatch_vectors(); + return; } @@ -252,4 +263,24 @@ PYBIND11_MODULE(_tensor_impl, m) m.def("_triu", triu_fn, "Triu helper function.", py::arg("src"), py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), py::arg("depends") = py::list()); + + m.def("mask_positions", &py_mask_positions, "", py::arg("mask"), + py::arg("cumsum"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + m.def("_extract", &py_extract, "", py::arg("src"), py::arg("cumsum"), + py::arg("axis_start"), py::arg("axis_end"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = 
py::list()); + + m.def("_array_overlap", &overlap, + "Determines if the memory regions indexed by each array overlap", + py::arg("array1"), py::arg("array2")); + + m.def("_place", &py_place, "", py::arg("dst"), py::arg("cumsum"), + py::arg("axis_start"), py::arg("axis_end"), py::arg("rhs"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + m.def("_nonzero", &py_nonzero, "", py::arg("cumsum"), py::arg("indexes"), + py::arg("mask_shape"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); } From c1f00812b35becd29c8af54df2e049dd7033f136 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Mar 2023 10:31:52 -0600 Subject: [PATCH 47/57] Added missing include --- .../libtensor/include/kernels/boolean_advanced_indexing.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp index b42b7869d2..71313e9a27 100644 --- a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp @@ -32,6 +32,7 @@ #include #include "utils/strided_iters.hpp" +#include "utils/type_dispatch.hpp" namespace dpctl { From 849a3ea2025277c7128b4e72145b4308be68153b Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Mar 2023 11:18:44 -0600 Subject: [PATCH 48/57] Hooked up boolean indexing, first attempt --- dpctl/tensor/__init__.py | 5 +- dpctl/tensor/_copy_utils.py | 147 +++++++++++++++++++--------- dpctl/tensor/_indexing_functions.py | 118 ++++++++++++++++++++++ dpctl/tensor/_usmarray.pyx | 14 +-- 4 files changed, 229 insertions(+), 55 deletions(-) diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index d21958b4fa..2a2afd60a4 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -58,7 +58,7 @@ ) from dpctl.tensor._device import Device from dpctl.tensor._dlpack import from_dlpack -from dpctl.tensor._indexing_functions import put, take +from dpctl.tensor._indexing_functions import extract, nonzero, place, put, take from dpctl.tensor._manipulation_functions import ( broadcast_arrays, broadcast_to, @@ -115,6 +115,9 @@ "squeeze", "take", "put", + "extract", + "place", + "nonzero", "from_numpy", "to_numpy", "asnumpy", diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py index 597db87c49..72b9e0a021 100644 --- a/dpctl/tensor/_copy_utils.py +++ b/dpctl/tensor/_copy_utils.py @@ -389,45 +389,75 @@ def astype(usm_ary, newdtype, order="K", casting="unsafe", copy=True): return R -def _mock_extract(ary, ary_mask, p): - exec_q = dpctl.utils.get_execution_queue( - ( - ary.sycl_queue, - ary_mask.sycl_queue, +def _extract_impl(ary, ary_mask, axis=0): + """Extract elements of ary by applying mask starting from slot + dimension axis""" + if not isinstance(ary, dpt.usm_ndarray): + raise TypeError( + f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}" + ) + if not isinstance(ary_mask, dpt.usm_ndarray): + raise TypeError( + f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary_mask)}" ) + exec_q = dpctl.utils.get_execution_queue( + (ary.sycl_queue, ary_mask.sycl_queue) ) if exec_q is None: raise dpctl.utils.ExecutionPlacementError( - "Can not automatically determine where to allocate the " - "result or performance execution. " - "Use `usm_ndarray.to_device` method to migrate data to " - "be associated with the same queue." + "arrays have different associated queues. " + "Use `Y.to_device(X.device)` to migrate." 
) - - res_usm_type = dpctl.utils.get_coerced_usm_type( - ( - ary.usm_type, - ary_mask.usm_type, + ary_nd = ary.ndim + pp = normalize_axis_index(operator.index(axis), ary_nd) + mask_nd = ary_mask.ndim + if pp < 0 or pp + mask_nd > ary_nd: + raise ValueError( + "Parameter p is inconsistent with input array dimensions" ) + mask_nelems = ary_mask.size + cumsum = dpt.empty(mask_nelems, dtype=dpt.int64, device=ary_mask.device) + exec_q = cumsum.sycl_queue + mask_count = ti.mask_positions(ary_mask, cumsum, sycl_queue=exec_q) + dst_shape = ary.shape[:pp] + (mask_count,) + ary.shape[pp + mask_nd :] + dst = dpt.empty( + dst_shape, dtype=ary.dtype, usm_type=ary.usm_type, device=ary.device ) - ary_np = dpt.asnumpy(ary) - mask_np = dpt.asnumpy(ary_mask) - res_np = ary_np[(slice(None),) * p + (mask_np,)] - res = dpt.empty( - res_np.shape, dtype=ary.dtype, usm_type=res_usm_type, sycl_queue=exec_q + hev, _ = ti._extract( + src=ary, + cumsum=cumsum, + axis_start=pp, + axis_end=pp + mask_nd, + dst=dst, + sycl_queue=exec_q, ) - res[...] = res_np - return res + hev.wait() + return dst -def _mock_nonzero(ary): +def _nonzero_impl(ary): if not isinstance(ary, dpt.usm_ndarray): - raise TypeError - q = ary.sycl_queue + raise TypeError( + f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}" + ) + exec_q = ary.sycl_queue usm_type = ary.usm_type - ary_np = dpt.asnumpy(ary) - nz = ary_np.nonzero() - return tuple(dpt.asarray(i, usm_type=usm_type, sycl_queue=q) for i in nz) + mask_nelems = ary.size + cumsum = dpt.empty( + mask_nelems, dtype=dpt.int64, sycl_queue=exec_q, order="C" + ) + mask_count = ti.mask_positions(ary, cumsum, sycl_queue=exec_q) + indexes = dpt.empty( + (ary.ndim, mask_count), + dtype=cumsum.dtype, + usm_type=usm_type, + sycl_queue=exec_q, + order="C", + ) + hev, _ = ti._nonzero(cumsum, indexes, ary.shape, exec_q) + res = tuple(indexes[i, :] for i in range(ary.ndim)) + hev.wait() + return res def _take_multi_index(ary, inds, p): @@ -473,34 +503,57 @@ def _take_multi_index(ary, inds, p): return res -def _mock_place(ary, ary_mask, p, vals): +def _place_impl(ary, ary_mask, vals, axis=0): + """Extract elements of ary by applying mask starting from slot + dimension axis""" if not isinstance(ary, dpt.usm_ndarray): - raise TypeError + raise TypeError( + f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}" + ) if not isinstance(ary_mask, dpt.usm_ndarray): - raise TypeError + raise TypeError( + f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary_mask)}" + ) + if not isinstance(vals, dpt.usm_ndarray): + raise TypeError( + f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary_mask)}" + ) exec_q = dpctl.utils.get_execution_queue( - (ary.sycl_queue, ary_mask.sycl_queue) + (ary.sycl_queue, ary_mask.sycl_queue, vals.sycl_queue) ) - if exec_q is not None and isinstance(vals, dpt.usm_ndarray): - exec_q = dpctl.utils.get_execution_queue((exec_q, vals.sycl_queue)) if exec_q is None: raise dpctl.utils.ExecutionPlacementError( - "Can not automatically determine where to allocate the " - "result or performance execution. " - "Use `usm_ndarray.to_device` method to migrate data to " - "be associated with the same queue." + "arrays have different associated queues. " + "Use `Y.to_device(X.device)` to migrate." 
) - - ary_np = dpt.asnumpy(ary) - mask_np = dpt.asnumpy(ary_mask) - if isinstance(vals, dpt.usm_ndarray) or hasattr( - vals, "__sycl_usm_array_interface__" - ): - vals_np = dpt.asnumpy(vals) + ary_nd = ary.ndim + pp = normalize_axis_index(operator.index(axis), ary_nd) + mask_nd = ary_mask.ndim + if pp < 0 or pp + mask_nd > ary_nd: + raise ValueError( + "Parameter p is inconsistent with input array dimensions" + ) + mask_nelems = ary_mask.size + cumsum = dpt.empty(mask_nelems, dtype=dpt.int64, device=ary_mask.device) + exec_q = cumsum.sycl_queue + mask_count = ti.mask_positions(ary_mask, cumsum, sycl_queue=exec_q) + expected_vals_shape = ( + ary.shape[:pp] + (mask_count,) + ary.shape[pp + mask_nd :] + ) + if vals.dtype == ary.dtype: + rhs = vals else: - vals_np = vals - ary_np[(slice(None),) * p + (mask_np,)] = vals_np - ary[...] = ary_np + rhs = dpt.astype(vals, ary.dtype) + rhs = dpt.broadcast_to(rhs, expected_vals_shape) + hev, _ = ti._place( + dst=ary, + cumsum=cumsum, + axis_start=pp, + axis_end=pp + mask_nd, + rhs=rhs, + sycl_queue=exec_q, + ) + hev.wait() return diff --git a/dpctl/tensor/_indexing_functions.py b/dpctl/tensor/_indexing_functions.py index 12f7b2d72e..01f1a2370a 100644 --- a/dpctl/tensor/_indexing_functions.py +++ b/dpctl/tensor/_indexing_functions.py @@ -23,6 +23,8 @@ import dpctl.tensor as dpt from dpctl.tensor._tensor_impl import _put, _take +from ._copy_utils import _extract_impl, _nonzero_impl, _place_impl + def take(x, indices, /, *, axis=None, mode="clip"): if not isinstance(x, dpt.usm_ndarray): @@ -175,3 +177,119 @@ def put(x, indices, vals, /, *, axis=None, mode="clip"): hev, _ = _put(x, indices, vals, axis, mode, sycl_queue=exec_q) hev.wait() + + +def extract(condition, arr): + """extract(condition, arr) + + Returns the elements of an array that satisfies the condition. + + If `condition` is boolean :func:``dpctl.tensor.extract`` is + equivalent to ``arr[condition]``. + + Note that :func:``dpctl.tensor.place`` does the opposite of + :func:``dpctl.tensor.extract``. + + Args: + conditions: usm_ndarray + An array whose non-zero or True entries indicate the element + of `arr` to extract. + arr: usm_ndarray + Input array of the same size as `condition`. + + Returns: + extract: usm_ndarray + Rank 1 array of values from `arr` where `condition` is True. + """ + if not isinstance(condition, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(condition)}" + ) + if not isinstance(arr, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(arr)}" + ) + exec_q = dpctl.utils.get_execution_queue( + ( + condition.sycl_queue, + arr.sycl_queue, + ) + ) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError + if condition.shape != arr.shape: + raise ValueError("Arrays are not of the same size") + return _extract_impl(arr, condition) + + +def place(arr, mask, vals): + """place(arr, mask, vals) + + Change elements of an array based on conditional and input values. + + If `mask` is boolean :func:``dpctl.tensor.place`` is + equivalent to ``arr[condition] = vals``. + + Args: + arr: usm_ndarray + Array to put data into. + mask: usm_ndarray + Boolean mask array. Must have the same size as `arr`. + vals: usm_ndarray + Values to put into `arr`. Only the first N elements are + used, where N is the number of True values in `mask`. If + `vals` is smaller than N, it will be repeated, and if + elements of `arr` are to be masked, this sequence must be + non-empty. Array `vals` must be one dimensional. 
+ """ + if not isinstance(arr, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(arr)}" + ) + if not isinstance(mask, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(mask)}" + ) + if not isinstance(vals, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(vals)}" + ) + exec_q = dpctl.utils.get_execution_queue( + ( + arr.sycl_queue, + mask.sycl_queue, + vals.sycl_queue, + ) + ) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError + if arr.shape != mask.shape or vals.ndim != 1: + raise ValueError("Array sizes are not as required") + # FIXME + _place_impl(arr, mask, vals, axis=0) + + +def nonzero(arr): + """nonzero(arr) + + Return the indices of non-zero elements. + + Returns the tuple of usm_narrays, one for each dimension + of `arr`, containing the indices of the non-zero elements + in that dimension. The values of `arr` are always tested in + row-major, C-style order. + + Args: + arr: usm_ndarray + Input array, which has non-zero array rank. + Returns: + tuple_of_usm_ndarrays: tuple + Indices of non-zero array elements. + """ + if not isinstance(arr, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(arr)}" + ) + if arr.ndim == 0: + raise ValueError("Array of positive rank is exepcted") + return _nonzero_impl(arr) diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx index 64a492065f..1abc1e88ac 100644 --- a/dpctl/tensor/_usmarray.pyx +++ b/dpctl/tensor/_usmarray.pyx @@ -670,15 +670,15 @@ cdef class usm_ndarray: if adv_ind_start_p < 0: return res - from ._copy_utils import _mock_extract, _mock_nonzero, _take_multi_index + from ._copy_utils import _extract_impl, _nonzero_impl, _take_multi_index if len(adv_ind) == 1 and adv_ind[0].dtype == dpt_bool: - return _mock_extract(res, adv_ind[0], adv_ind_start_p) + return _extract_impl(res, adv_ind[0], axis=adv_ind_start_p) if any(ind.dtype == dpt_bool for ind in adv_ind): adv_ind_int = list() for ind in adv_ind: if ind.dtype == dpt_bool: - adv_ind_int.extend(_mock_nonzero(ind)) + adv_ind_int.extend(_nonzero_impl(ind)) else: adv_ind_int.append(ind) return _take_multi_index(res, tuple(adv_ind_int), adv_ind_start_p) @@ -1015,8 +1015,8 @@ cdef class usm_ndarray: from ._copy_utils import ( _copy_from_numpy_into, _copy_from_usm_ndarray_to_usm_ndarray, - _mock_nonzero, - _mock_place, + _nonzero_impl, + _place_impl, _put_multi_index, ) @@ -1050,14 +1050,14 @@ cdef class usm_ndarray: return if len(adv_ind) == 1 and adv_ind[0].dtype == dpt_bool: - _mock_place(Xv, adv_ind[0], adv_ind_start_p, rhs) + _place_impl(Xv, adv_ind[0], rhs, axis=adv_ind_start_p) return if any(ind.dtype == dpt_bool for ind in adv_ind): adv_ind_int = list() for ind in adv_ind: if ind.dtype == dpt_bool: - adv_ind_int.extend(_mock_nonzero(ind)) + adv_ind_int.extend(_nonzero_impl(ind)) else: adv_ind_int.append(ind) _put_multi_index(Xv, tuple(adv_ind_int), adv_ind_start_p, rhs) From ed279d63b6f6b43ef5f3b4791696216536639ca1 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Mar 2023 11:43:28 -0600 Subject: [PATCH 49/57] Changes per clang-format 11 --- dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp index 1534b38391..9689612b8a 100644 --- 
a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp @@ -62,7 +62,7 @@ template sink_t inserter(V &lhs, U &&rhs) } template -std::vector concat(std::vector lhs, Vs &&... vs) +std::vector concat(std::vector lhs, Vs &&...vs) { std::size_t s = lhs.size(); { @@ -83,7 +83,7 @@ template std::tuple device_allocate_and_pack(sycl::queue q, std::vector &host_task_events, - Vs &&... vs) + Vs &&...vs) { // memory transfer optimization, use USM-host for temporary speeds up From 3ced89a095650fbc40688294b8e343d44857f495 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Mar 2023 12:42:07 -0600 Subject: [PATCH 50/57] Used Strided1DCyclingIndexer in place implementations This allows to implement behavior of place which cycles over values of val array if that is shorter than the number of non-zero elements in the mask. --- .../kernels/boolean_advanced_indexing.hpp | 53 +++++++++++-------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp index 71313e9a27..aa0a90ce70 100644 --- a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp @@ -1,4 +1,4 @@ -//=== boolean_advance_indexing.hpp - ---*-C++-*--/===// +//=== boolean_advance_indexing.hpp - ------*-C++-*--/===// // // Data Parallel Control (dpctl) // @@ -16,11 +16,11 @@ // See the License for the specific language governing permissions and // limitations under the License. // -//===----------------------------------------------------------------------===// +//===---------------------------------------------------------------------===// /// /// \file /// This file defines kernels for advanced tensor index operations. 
-//===----------------------------------------------------------------------===// +//===---------------------------------------------------------------------===// #pragma once #include @@ -114,6 +114,26 @@ struct Strided1DIndexer py::ssize_t step = 1; }; +struct Strided1DCyclicIndexer +{ + Strided1DCyclicIndexer(py::ssize_t _offset, + py::ssize_t _size, + py::ssize_t _step) + : offset(_offset), size(static_cast(_size)), step(_step) + { + } + + size_t operator()(size_t gid) const + { + return static_cast(offset + (gid % size) * step); + } + +private: + py::ssize_t offset = 0; + size_t size = 1; + py::ssize_t step = 1; +}; + template struct ZeroChecker { @@ -762,27 +782,22 @@ sycl::event masked_place_all_slices_strided_impl( py::ssize_t rhs_stride, const std::vector &depends = {}) { - // using MaskedPlaceStridedFunctor; - // using Strided1DIndexer; - // using StridedIndexer; - // using TwoZeroOffsets_Indexer; - TwoZeroOffsets_Indexer orthog_dst_rhs_indexer{}; /* StridedIndexer(int _nd, py::ssize_t _offset, py::ssize_t const * *_packed_shape_strides) */ StridedIndexer masked_dst_indexer(nd, 0, packed_dst_shape_strides); - Strided1DIndexer masked_rhs_indexer(0, rhs_size, rhs_stride); + Strided1DCyclicIndexer masked_rhs_indexer(0, rhs_size, rhs_stride); sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); cgh.parallel_for>( + TwoZeroOffsets_Indexer, StridedIndexer, Strided1DCyclicIndexer, + dataT, indT>>( sycl::range<1>(static_cast(iteration_size)), MaskedPlaceStridedFunctor( + Strided1DCyclicIndexer, dataT, indT>( dst_p, cumsum_p, rhs_p, 1, iteration_size, orthog_dst_rhs_indexer, masked_dst_indexer, masked_rhs_indexer)); @@ -838,11 +853,6 @@ sycl::event masked_place_some_slices_strided_impl( py::ssize_t masked_rhs_stride, const std::vector &depends = {}) { - // using MaskedPlaceStridedFunctor; - // using Strided1DIndexer; - // using StridedIndexer; - // using TwoOffsets_StridedIndexer; - TwoOffsets_StridedIndexer orthog_dst_rhs_indexer{ orthog_nd, ortho_dst_offset, ortho_rhs_offset, packed_ortho_dst_rhs_shape_strides}; @@ -851,17 +861,18 @@ sycl::event masked_place_some_slices_strided_impl( * *_packed_shape_strides) */ StridedIndexer masked_dst_indexer{masked_nd, 0, packed_masked_dst_shape_strides}; - Strided1DIndexer masked_rhs_indexer{0, masked_rhs_size, masked_rhs_stride}; + Strided1DCyclicIndexer masked_rhs_indexer{0, masked_rhs_size, + masked_rhs_stride}; sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); cgh.parallel_for>( + TwoOffsets_StridedIndexer, StridedIndexer, Strided1DCyclicIndexer, + dataT, indT>>( sycl::range<1>(static_cast(orthog_nelems * masked_nelems)), MaskedPlaceStridedFunctor( + Strided1DCyclicIndexer, dataT, indT>( dst_p, cumsum_p, rhs_p, orthog_nelems, masked_nelems, orthog_dst_rhs_indexer, masked_dst_indexer, masked_rhs_indexer)); From 19691ca89ff4ee3324c3402a19a4be669ee8e138 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Mar 2023 12:44:49 -0600 Subject: [PATCH 51/57] Implemented dpctl.tensor.place as per documented behavior. 
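A minimal sketch of the cycling semantics this patch implements, assuming a
default SYCL queue is available (the same behavior is exercised by the
test_place_cycling case added later in this series):

    import dpctl.tensor as dpt

    x = dpt.zeros(10, dtype="f4")
    sel = dpt.ones(10, dtype="?")           # every element is selected
    vals = dpt.asarray([2, 3], dtype="f4")  # shorter than the selection

    # vals is reused cyclically over the selected elements
    dpt.place(x, sel, vals)
    # x is now [2, 3, 2, 3, 2, 3, 2, 3, 2, 3]
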
--- dpctl/tensor/_indexing_functions.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/dpctl/tensor/_indexing_functions.py b/dpctl/tensor/_indexing_functions.py index 01f1a2370a..20c4a22786 100644 --- a/dpctl/tensor/_indexing_functions.py +++ b/dpctl/tensor/_indexing_functions.py @@ -21,9 +21,9 @@ import dpctl import dpctl.tensor as dpt -from dpctl.tensor._tensor_impl import _put, _take +import dpctl.tensor._tensor_impl as ti -from ._copy_utils import _extract_impl, _nonzero_impl, _place_impl +from ._copy_utils import _extract_impl, _nonzero_impl def take(x, indices, /, *, axis=None, mode="clip"): @@ -95,7 +95,7 @@ def take(x, indices, /, *, axis=None, mode="clip"): res_shape, dtype=x.dtype, usm_type=res_usm_type, sycl_queue=exec_q ) - hev, _ = _take(x, indices, res, axis, mode, sycl_queue=exec_q) + hev, _ = ti._take(x, indices, res, axis, mode, sycl_queue=exec_q) hev.wait() return res @@ -175,7 +175,7 @@ def put(x, indices, vals, /, *, axis=None, mode="clip"): vals = dpt.broadcast_to(vals, val_shape) - hev, _ = _put(x, indices, vals, axis, mode, sycl_queue=exec_q) + hev, _ = ti._put(x, indices, vals, axis, mode, sycl_queue=exec_q) hev.wait() @@ -265,8 +265,23 @@ def place(arr, mask, vals): raise dpctl.utils.ExecutionPlacementError if arr.shape != mask.shape or vals.ndim != 1: raise ValueError("Array sizes are not as required") - # FIXME - _place_impl(arr, mask, vals, axis=0) + cumsum = dpt.empty(mask.size, dtype="i8", sycl_queue=exec_q) + nz_count = ti.mask_positions(mask, cumsum, sycl_queue=exec_q) + if nz_count == 0: + return + if vals.dtype == arr.dtype: + rhs = vals + else: + rhs = dpt.astype(vals, arr.dtype) + hev, _ = ti._place( + dst=arr, + cumsum=cumsum, + axis_start=0, + axis_end=mask.ndim, + rhs=rhs, + sycl_queue=exec_q, + ) + hev.wait() def nonzero(arr): From 03c48222e07201fa37567a4f1337f699f87cba1e Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 2 Mar 2023 17:24:22 -0800 Subject: [PATCH 52/57] _take and _put returned event changes - Host_tasks now collected and used as dependencies for dec_ref of py arguments - Return temporaries deallocation event to further prevent dangling host_tasks --- .../source/integer_advanced_indexing.cpp | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp index dfc74c12f0..ed0f749add 100644 --- a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -548,10 +548,12 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src, }); }); - sycl::event host_task_ev = keep_args_alive( - exec_q, {src, py_ind, dst}, {take_generic_ev, temporaries_cleanup_ev}); + host_task_events.push_back(temporaries_cleanup_ev); - return std::make_pair(host_task_ev, take_generic_ev); + sycl::event arg_cleanup_ev = + keep_args_alive(exec_q, {src, py_ind, dst}, host_task_events); + + return std::make_pair(arg_cleanup_ev, temporaries_cleanup_ev); } std::pair @@ -857,7 +859,6 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, dst_offset, val_offset, packed_ind_offsets, all_deps); // free packed temporaries - sycl::event temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(put_generic_ev); auto ctx = exec_q.get_context(); @@ -872,10 +873,12 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst, }); }); - sycl::event py_obj_cleanup_ev = keep_args_alive( - exec_q, {dst, py_ind, val}, 
{put_generic_ev, temporaries_cleanup_ev}); + host_task_events.push_back(temporaries_cleanup_ev); + + sycl::event arg_cleanup_ev = + keep_args_alive(exec_q, {dst, py_ind, val}, host_task_events); - return std::make_pair(temporaries_cleanup_ev, put_generic_ev); + return std::make_pair(arg_cleanup_ev, temporaries_cleanup_ev); } void init_advanced_indexing_dispatch_tables(void) From f75723b143c67f109bcf27bdb8837cd30a7d81cb Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Mar 2023 14:47:59 -0600 Subject: [PATCH 53/57] Added tests to test_usm_ndarray_indexing --- dpctl/tests/test_usm_ndarray_indexing.py | 220 +++++++++++++++++++++++ 1 file changed, 220 insertions(+) diff --git a/dpctl/tests/test_usm_ndarray_indexing.py b/dpctl/tests/test_usm_ndarray_indexing.py index 98bb674b21..aec71def7d 100644 --- a/dpctl/tests/test_usm_ndarray_indexing.py +++ b/dpctl/tests/test_usm_ndarray_indexing.py @@ -970,3 +970,223 @@ def test_advanced_indexing_compute_follows_data(): dpt.put(x, ind0, val1, axis=0) with pytest.raises(ExecutionPlacementError): x[ind0] = val1 + + +####### + + +def test_extract_all_1d(): + x = dpt.arange(30, dtype="i4") + sel = dpt.ones(30, dtype="?") + sel[::2] = False + + res = x[sel] + expected_res = dpt.asnumpy(x)[dpt.asnumpy(sel)] + assert (dpt.asnumpy(res) == expected_res).all() + + res2 = dpt.extract(sel, x) + assert (dpt.asnumpy(res2) == expected_res).all() + + +def test_extract_all_2d(): + x = dpt.reshape(dpt.arange(30, dtype="i4"), (5, 6)) + sel = dpt.ones(30, dtype="?") + sel[::2] = False + sel = dpt.reshape(sel, x.shape) + + res = x[sel] + expected_res = dpt.asnumpy(x)[dpt.asnumpy(sel)] + assert (dpt.asnumpy(res) == expected_res).all() + + res2 = dpt.extract(sel, x) + assert (dpt.asnumpy(res2) == expected_res).all() + + +def test_extract_2D_axis0(): + x = dpt.reshape(dpt.arange(30, dtype="i4"), (5, 6)) + sel = dpt.ones(x.shape[0], dtype="?") + sel[::2] = False + + res = x[sel] + expected_res = dpt.asnumpy(x)[dpt.asnumpy(sel)] + assert (dpt.asnumpy(res) == expected_res).all() + + +def test_extract_2D_axis1(): + x = dpt.reshape(dpt.arange(30, dtype="i4"), (5, 6)) + sel = dpt.ones(x.shape[1], dtype="?") + sel[::2] = False + + res = x[:, sel] + expected = dpt.asnumpy(x)[:, dpt.asnumpy(sel)] + assert (dpt.asnumpy(res) == expected).all() + + +def test_extract_begin(): + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((3, 3), dtype="?") + sel[0, 0] = True + sel[1, 1] = True + z = y[sel] + expected = dpt.asnumpy(y)[[0, 1], [0, 1]] + assert (dpt.asnumpy(z) == expected).all() + + +def test_extract_end(): + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((4, 4), dtype="?") + sel[0, 0] = True + z = y[..., sel] + expected = dpt.asnumpy(y)[..., [0], [0]] + assert (dpt.asnumpy(z) == expected).all() + + +def test_extract_middle(): + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((3, 4), dtype="?") + sel[0, 0] = True + z = y[:, sel] + expected = dpt.asnumpy(y)[:, [0], [0], :] + assert (dpt.asnumpy(z) == expected).all() + + +def test_extract_empty_result(): + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((3, 4), dtype="?") + z = y[:, sel] + assert z.shape == ( + y.shape[0], + 0, + y.shape[3], + ) + + +def test_place_all_1d(): + x = dpt.arange(10, dtype="i2") + sel = 
dpt.zeros(10, dtype="?") + sel[0::2] = True + val = dpt.zeros(5, dtype=x.dtype) + x[sel] = val + assert (dpt.asnumpy(x) == np.array([0, 1, 0, 3, 0, 5, 0, 7, 0, 9])).all() + dpt.place(x, sel, dpt.asarray(2)) + assert (dpt.asnumpy(x) == np.array([2, 1, 2, 3, 2, 5, 2, 7, 2, 9])).all() + + +def test_place_2d_axis0(): + x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) + sel = dpt.asarray([True, False, True]) + val = dpt.zeros((2, 4), dtype=x.dtype) + x[sel] = val + expected_x = np.stack( + ( + np.zeros(4, dtype="i2"), + np.arange(4, 8, dtype="i2"), + np.zeros(4, dtype="i2"), + ) + ) + assert (dpt.asnumpy(x) == expected_x).all() + + +def test_place_2d_axis1(): + x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) + sel = dpt.asarray([True, False, True, False]) + val = dpt.zeros((3, 2), dtype=x.dtype) + x[:, sel] = val + expected_x = np.array( + [[0, 1, 0, 3], [0, 5, 0, 7], [0, 9, 0, 11]], dtype="i2" + ) + assert (dpt.asnumpy(x) == expected_x).all() + + +def test_place_2d_axis1_scalar(): + x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) + sel = dpt.asarray([True, False, True, False]) + val = dpt.zeros(tuple(), dtype=x.dtype) + x[:, sel] = val + expected_x = np.array( + [[0, 1, 0, 3], [0, 5, 0, 7], [0, 9, 0, 11]], dtype="i2" + ) + assert (dpt.asnumpy(x) == expected_x).all() + + +def test_place_all_slices(): + x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) + sel = dpt.asarray( + [ + [False, True, True, False], + [True, True, False, False], + [False, False, True, True], + ], + dtype="?", + ) + y = dpt.ones_like(x) + y[sel] = x[sel] + + +def test_place_some_slices_begin(): + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((3, 3), dtype="?") + sel[0, 0] = True + sel[1, 1] = True + z = y[sel] + w = dpt.zeros_like(y) + w[sel] = z + + +def test_place_some_slices_mid(): + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((3, 4), dtype="?") + sel[0, 0] = True + sel[1, 1] = True + z = y[:, sel] + w = dpt.zeros_like(y) + w[:, sel] = z + + +def test_place_some_slices_end(): + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((4, 4), dtype="?") + sel[0, 0] = True + sel[1, 1] = True + z = y[:, :, sel] + w = dpt.zeros_like(y) + w[:, :, sel] = z + + +def test_place_cycling(): + x = dpt.zeros(10, dtype="f4") + y = dpt.asarray([2, 3]) + sel = dpt.ones(x.size, dtype="?") + dpt.place(x, sel, y) + expected = np.array( + [ + 2, + 3, + ] + * 5, + dtype=x.dtype, + ) + assert (dpt.asnumpy(x) == expected).all() + + +def test_place_subset(): + x = dpt.zeros(10, dtype="f4") + y = dpt.ones_like(x) + sel = dpt.ones(x.size, dtype="?") + sel[::2] = False + dpt.place(x, sel, y) + expected = np.array([1, 3, 5, 7, 9], dtype=x.dtype) + assert (dpt.asnumpy(x) == expected).all() + + +def test_nonzero(): + x = dpt.concat((dpt.zeros(3), dpt.ones(4), dpt.zeros(3))) + (i,) = dpt.nonzero(x) + assert dpt.asnumpy(i) == np.array([3, 4, 5, 6]).all() From cab00351ba9ba9018528dd9cf3563c0aeb31f861 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Mar 2023 15:43:17 -0600 Subject: [PATCH 54/57] Fixed tests for boolean indexing --- dpctl/tests/test_usm_ndarray_indexing.py | 28 +++++++++++++++++++----- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/dpctl/tests/test_usm_ndarray_indexing.py b/dpctl/tests/test_usm_ndarray_indexing.py index aec71def7d..bcc1fdbb60 100644 --- 
a/dpctl/tests/test_usm_ndarray_indexing.py +++ b/dpctl/tests/test_usm_ndarray_indexing.py @@ -972,10 +972,8 @@ def test_advanced_indexing_compute_follows_data(): x[ind0] = val1 -####### - - def test_extract_all_1d(): + get_queue_or_skip() x = dpt.arange(30, dtype="i4") sel = dpt.ones(30, dtype="?") sel[::2] = False @@ -989,6 +987,7 @@ def test_extract_all_1d(): def test_extract_all_2d(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(30, dtype="i4"), (5, 6)) sel = dpt.ones(30, dtype="?") sel[::2] = False @@ -1003,6 +1002,7 @@ def test_extract_all_2d(): def test_extract_2D_axis0(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(30, dtype="i4"), (5, 6)) sel = dpt.ones(x.shape[0], dtype="?") sel[::2] = False @@ -1013,6 +1013,7 @@ def test_extract_2D_axis0(): def test_extract_2D_axis1(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(30, dtype="i4"), (5, 6)) sel = dpt.ones(x.shape[1], dtype="?") sel[::2] = False @@ -1023,6 +1024,7 @@ def test_extract_2D_axis1(): def test_extract_begin(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) y = dpt.permute_dims(x, (2, 0, 3, 1)) sel = dpt.zeros((3, 3), dtype="?") @@ -1034,6 +1036,7 @@ def test_extract_begin(): def test_extract_end(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) y = dpt.permute_dims(x, (2, 0, 3, 1)) sel = dpt.zeros((4, 4), dtype="?") @@ -1044,6 +1047,7 @@ def test_extract_end(): def test_extract_middle(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) y = dpt.permute_dims(x, (2, 0, 3, 1)) sel = dpt.zeros((3, 4), dtype="?") @@ -1054,6 +1058,7 @@ def test_extract_middle(): def test_extract_empty_result(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) y = dpt.permute_dims(x, (2, 0, 3, 1)) sel = dpt.zeros((3, 4), dtype="?") @@ -1066,17 +1071,19 @@ def test_extract_empty_result(): def test_place_all_1d(): + get_queue_or_skip() x = dpt.arange(10, dtype="i2") sel = dpt.zeros(10, dtype="?") sel[0::2] = True val = dpt.zeros(5, dtype=x.dtype) x[sel] = val assert (dpt.asnumpy(x) == np.array([0, 1, 0, 3, 0, 5, 0, 7, 0, 9])).all() - dpt.place(x, sel, dpt.asarray(2)) + dpt.place(x, sel, dpt.asarray([2])) assert (dpt.asnumpy(x) == np.array([2, 1, 2, 3, 2, 5, 2, 7, 2, 9])).all() def test_place_2d_axis0(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) sel = dpt.asarray([True, False, True]) val = dpt.zeros((2, 4), dtype=x.dtype) @@ -1092,6 +1099,7 @@ def test_place_2d_axis0(): def test_place_2d_axis1(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) sel = dpt.asarray([True, False, True, False]) val = dpt.zeros((3, 2), dtype=x.dtype) @@ -1103,6 +1111,7 @@ def test_place_2d_axis1(): def test_place_2d_axis1_scalar(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) sel = dpt.asarray([True, False, True, False]) val = dpt.zeros(tuple(), dtype=x.dtype) @@ -1114,6 +1123,7 @@ def test_place_2d_axis1_scalar(): def test_place_all_slices(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) sel = dpt.asarray( [ @@ -1128,6 +1138,7 @@ def test_place_all_slices(): def test_place_some_slices_begin(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) y = dpt.permute_dims(x, (2, 0, 3, 1)) sel = dpt.zeros((3, 3), dtype="?") @@ -1139,6 +1150,7 @@ def test_place_some_slices_begin(): def test_place_some_slices_mid(): + get_queue_or_skip() x = 
dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) y = dpt.permute_dims(x, (2, 0, 3, 1)) sel = dpt.zeros((3, 4), dtype="?") @@ -1150,6 +1162,7 @@ def test_place_some_slices_mid(): def test_place_some_slices_end(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) y = dpt.permute_dims(x, (2, 0, 3, 1)) sel = dpt.zeros((4, 4), dtype="?") @@ -1161,6 +1174,7 @@ def test_place_some_slices_end(): def test_place_cycling(): + get_queue_or_skip() x = dpt.zeros(10, dtype="f4") y = dpt.asarray([2, 3]) sel = dpt.ones(x.size, dtype="?") @@ -1177,16 +1191,18 @@ def test_place_cycling(): def test_place_subset(): + get_queue_or_skip() x = dpt.zeros(10, dtype="f4") y = dpt.ones_like(x) sel = dpt.ones(x.size, dtype="?") sel[::2] = False dpt.place(x, sel, y) - expected = np.array([1, 3, 5, 7, 9], dtype=x.dtype) + expected = np.array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1], dtype=x.dtype) assert (dpt.asnumpy(x) == expected).all() def test_nonzero(): + get_queue_or_skip() x = dpt.concat((dpt.zeros(3), dpt.ones(4), dpt.zeros(3))) (i,) = dpt.nonzero(x) - assert dpt.asnumpy(i) == np.array([3, 4, 5, 6]).all() + assert (dpt.asnumpy(i) == np.array([3, 4, 5, 6])).all() From cb32c6fd2f5096040f6001e5c71ff6002013668c Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Mar 2023 18:05:54 -0600 Subject: [PATCH 55/57] Tweaks to docstrings of extract, place, nonzero --- dpctl/tensor/_indexing_functions.py | 34 ++++++++++++++--------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/dpctl/tensor/_indexing_functions.py b/dpctl/tensor/_indexing_functions.py index 20c4a22786..6f19dc3bd4 100644 --- a/dpctl/tensor/_indexing_functions.py +++ b/dpctl/tensor/_indexing_functions.py @@ -192,14 +192,14 @@ def extract(condition, arr): Args: conditions: usm_ndarray - An array whose non-zero or True entries indicate the element - of `arr` to extract. + An array whose non-zero or True entries indicate the element + of `arr` to extract. arr: usm_ndarray - Input array of the same size as `condition`. + Input array of the same size as `condition`. Returns: - extract: usm_ndarray - Rank 1 array of values from `arr` where `condition` is True. + usm_ndarray + Rank 1 array of values from `arr` where `condition` is True. """ if not isinstance(condition, dpt.usm_ndarray): raise TypeError( @@ -231,16 +231,16 @@ def place(arr, mask, vals): equivalent to ``arr[condition] = vals``. Args: - arr: usm_ndarray - Array to put data into. + arr: usm_ndarray + Array to put data into. mask: usm_ndarray - Boolean mask array. Must have the same size as `arr`. + Boolean mask array. Must have the same size as `arr`. vals: usm_ndarray - Values to put into `arr`. Only the first N elements are - used, where N is the number of True values in `mask`. If - `vals` is smaller than N, it will be repeated, and if - elements of `arr` are to be masked, this sequence must be - non-empty. Array `vals` must be one dimensional. + Values to put into `arr`. Only the first N elements are + used, where N is the number of True values in `mask`. If + `vals` is smaller than N, it will be repeated, and if + elements of `arr` are to be masked, this sequence must be + non-empty. Array `vals` must be one dimensional. """ if not isinstance(arr, dpt.usm_ndarray): raise TypeError( @@ -295,11 +295,11 @@ def nonzero(arr): row-major, C-style order. Args: - arr: usm_ndarray - Input array, which has non-zero array rank. + arr: usm_ndarray + Input array, which has non-zero array rank. 
Returns: - tuple_of_usm_ndarrays: tuple - Indices of non-zero array elements. + Tuple[usm_ndarray] + Indices of non-zero array elements. """ if not isinstance(arr, dpt.usm_ndarray): raise TypeError( From 0a7ea0c2674362b6be70b0094d1ef4a395d2cea9 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Fri, 3 Mar 2023 17:21:59 -0800 Subject: [PATCH 56/57] dpt.take and dpt.put changes - Improved conformity to array API standard - Added docstrings --- dpctl/tensor/_indexing_functions.py | 178 +++++++++++++---------- dpctl/tests/test_usm_ndarray_indexing.py | 28 ++-- 2 files changed, 117 insertions(+), 89 deletions(-) diff --git a/dpctl/tensor/_indexing_functions.py b/dpctl/tensor/_indexing_functions.py index 6f19dc3bd4..c312d9e2b9 100644 --- a/dpctl/tensor/_indexing_functions.py +++ b/dpctl/tensor/_indexing_functions.py @@ -27,43 +27,56 @@ def take(x, indices, /, *, axis=None, mode="clip"): + """take(x, indices, axis=None, mode="clip") + + Takes elements from array along a given axis. + + Args: + x: usm_ndarray + The array that elements will be taken from. + indices: usm_ndarray + One-dimensional array of indices. + axis: + The axis over which the values will be selected. + If x is one-dimensional, this argument is optional. + mode: + How out-of-bounds indices will be handled. + "Clip" - clamps indices to (-n <= i < n), then wraps + negative indices. + "Wrap" - wraps both negative and positive indices. + + Returns: + out: usm_ndarray + Array with shape x.shape[:axis] + indices.shape + x.shape[axis + 1:] + filled with elements . + """ if not isinstance(x, dpt.usm_ndarray): raise TypeError( "Expected instance of `dpt.usm_ndarray`, got `{}`.".format(type(x)) ) - if not isinstance(indices, list) and not isinstance(indices, tuple): - indices = (indices,) - - queues_ = [ - x.sycl_queue, - ] - usm_types_ = [ - x.usm_type, - ] - - for i in indices: - if not isinstance(i, dpt.usm_ndarray): - raise TypeError( - "`indices` expected `dpt.usm_ndarray`, got `{}`.".format( - type(i) - ) + if not isinstance(indices, dpt.usm_ndarray): + raise TypeError( + "`indices` expected `dpt.usm_ndarray`, got `{}`.".format( + type(indices) ) - if not np.issubdtype(i.dtype, np.integer): - raise IndexError( - "`indices` expected integer data type, got `{}`".format(i.dtype) + ) + if not np.issubdtype(indices.dtype, np.integer): + raise IndexError( + "`indices` expected integer data type, got `{}`".format( + indices.dtype ) - queues_.append(i.sycl_queue) - usm_types_.append(i.usm_type) - exec_q = dpctl.utils.get_execution_queue(queues_) - if exec_q is None: - raise dpctl.utils.ExecutionPlacementError( - "Can not automatically determine where to allocate the " - "result or performance execution. " - "Use `usm_ndarray.to_device` method to migrate data to " - "be associated with the same queue." 
) - res_usm_type = dpctl.utils.get_coerced_usm_type(usm_types_) + if indices.ndim != 1: + raise ValueError( + "`indices` expected a 1D array, got `{}`".format(indices.ndim) + ) + exec_q = dpctl.utils.get_execution_queue([x.sycl_queue, indices.sycl_queue]) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError + res_usm_type = dpctl.utils.get_coerced_usm_type( + [x.usm_type, indices.usm_type] + ) modes = {"clip": 0, "wrap": 1} try: @@ -81,27 +94,47 @@ def take(x, indices, /, *, axis=None, mode="clip"): ) axis = 0 - if len(indices) > 1: - indices = dpt.broadcast_arrays(*indices) if x_ndim > 0: axis = normalize_axis_index(operator.index(axis), x_ndim) - res_shape = ( - x.shape[:axis] + indices[0].shape + x.shape[axis + len(indices) :] - ) + res_shape = x.shape[:axis] + indices.shape + x.shape[axis + 1 :] else: - res_shape = indices[0].shape + if axis != 0: + raise ValueError("`axis` must be 0 for an array of dimension 0.") + res_shape = indices.shape res = dpt.empty( res_shape, dtype=x.dtype, usm_type=res_usm_type, sycl_queue=exec_q ) - hev, _ = ti._take(x, indices, res, axis, mode, sycl_queue=exec_q) + hev, _ = ti._take(x, (indices,), res, axis, mode, sycl_queue=exec_q) hev.wait() return res def put(x, indices, vals, /, *, axis=None, mode="clip"): + """put(x, indices, vals, axis=None, mode="clip") + + Puts values of an array into another array + along a given axis. + + Args: + x: usm_ndarray + The array the values will be put into. + indices: usm_ndarray + One-dimensional array of indices. + vals: + Array of values to be put into `x`. + Must be broadcastable to the shape of `indices`. + axis: + The axis over which the values will be placed. + If x is one-dimensional, this argument is optional. + mode: + How out-of-bounds indices will be handled. + "Clip" - clamps indices to (-axis_size <= i < axis_size), + then wraps negative indices. + "Wrap" - wraps both negative and positive indices. + """ if not isinstance(x, dpt.usm_ndarray): raise TypeError( "Expected instance of `dpt.usm_ndarray`, got `{}`.".format(type(x)) @@ -116,66 +149,61 @@ def put(x, indices, vals, /, *, axis=None, mode="clip"): usm_types_ = [ x.usm_type, ] - - if not isinstance(indices, list) and not isinstance(indices, tuple): - indices = (indices,) - - for i in indices: - if not isinstance(i, dpt.usm_ndarray): - raise TypeError( - "`indices` expected `dpt.usm_ndarray`, got `{}`.".format( - type(i) - ) + if not isinstance(indices, dpt.usm_ndarray): + raise TypeError( + "`indices` expected `dpt.usm_ndarray`, got `{}`.".format( + type(indices) ) - if not np.issubdtype(i.dtype, np.integer): - raise IndexError( - "`indices` expected integer data type, got `{}`".format(i.dtype) + ) + if indices.ndim != 1: + raise ValueError( + "`indices` expected a 1D array, got `{}`".format(indices.ndim) + ) + if not np.issubdtype(indices.dtype, np.integer): + raise IndexError( + "`indices` expected integer data type, got `{}`".format( + indices.dtype ) - queues_.append(i.sycl_queue) - usm_types_.append(i.usm_type) + ) + queues_.append(indices.sycl_queue) + usm_types_.append(indices.usm_type) exec_q = dpctl.utils.get_execution_queue(queues_) if exec_q is None: - raise dpctl.utils.ExecutionPlacementError( - "Can not automatically determine where to allocate the " - "result or performance execution. " - "Use `usm_ndarray.to_device` method to migrate data to " - "be associated with the same queue." 
- ) - val_usm_type = dpctl.utils.get_coerced_usm_type(usm_types_) - + raise dpctl.utils.ExecutionPlacementError + vals_usm_type = dpctl.utils.get_coerced_usm_type(usm_types_) modes = {"clip": 0, "wrap": 1} try: mode = modes[mode] except KeyError: - raise ValueError("`mode` must be `wrap`, or `clip`.") + raise ValueError("`mode` must be `clip` or `wrap`.") - # when axis is none, array is treated as 1D - if axis is None: - try: - x = dpt.reshape(x, (x.size,), copy=False) - axis = 0 - except ValueError: - raise ValueError("Cannot create 1D view of input array") - if len(indices) > 1: - indices = dpt.broadcast_arrays(*indices) x_ndim = x.ndim + if axis is None: + if x_ndim > 1: + raise ValueError( + "`axis` cannot be `None` for array of dimension `{}`".format( + x_ndim + ) + ) + axis = 0 + if x_ndim > 0: axis = normalize_axis_index(operator.index(axis), x_ndim) - val_shape = ( - x.shape[:axis] + indices[0].shape + x.shape[axis + len(indices) :] - ) + val_shape = x.shape[:axis] + indices.shape + x.shape[axis + 1 :] else: - val_shape = indices[0].shape + if axis != 0: + raise ValueError("`axis` must be 0 for an array of dimension 0.") + val_shape = indices.shape if not isinstance(vals, dpt.usm_ndarray): vals = dpt.asarray( - vals, dtype=x.dtype, usm_type=val_usm_type, sycl_queue=exec_q + vals, dtype=x.dtype, usm_type=vals_usm_type, sycl_queue=exec_q ) vals = dpt.broadcast_to(vals, val_shape) - hev, _ = ti._put(x, indices, vals, axis, mode, sycl_queue=exec_q) + hev, _ = ti._put(x, (indices,), vals, axis, mode, sycl_queue=exec_q) hev.wait() diff --git a/dpctl/tests/test_usm_ndarray_indexing.py b/dpctl/tests/test_usm_ndarray_indexing.py index bcc1fdbb60..7201357c7d 100644 --- a/dpctl/tests/test_usm_ndarray_indexing.py +++ b/dpctl/tests/test_usm_ndarray_indexing.py @@ -542,11 +542,11 @@ def test_put_0d_val(data_dt): x = dpt.arange(5, dtype=data_dt, sycl_queue=q) ind = dpt.asarray([0], dtype=np.intp, sycl_queue=q) - x[ind] = 2 + val = dpt.asarray(2, dtype=x.dtype, sycl_queue=q) + x[ind] = val assert_array_equal(np.asarray(2, dtype=data_dt), dpt.asnumpy(x[0])) x = dpt.asarray(5, dtype=data_dt, sycl_queue=q) - val = 2 dpt.put(x, ind, val) assert_array_equal(np.asarray(2, dtype=data_dt), dpt.asnumpy(x)) @@ -592,13 +592,13 @@ def test_put_0d_data(data_dt): "ind_dt", _all_int_dtypes, ) -def test_take_0d_ind(ind_dt): +def test_indexing_0d_ind(ind_dt): q = get_queue_or_skip() x = dpt.arange(5, dtype="i4", sycl_queue=q) ind = dpt.asarray(3, dtype=ind_dt, sycl_queue=q) - y = dpt.take(x, ind) + y = x[ind] assert dpt.asnumpy(x[3]) == dpt.asnumpy(y) @@ -613,7 +613,7 @@ def test_put_0d_ind(ind_dt): ind = dpt.asarray(3, dtype=ind_dt, sycl_queue=q) val = dpt.asarray(5, dtype=x.dtype, sycl_queue=q) - dpt.put(x, ind, val, axis=0) + x[ind] = val assert dpt.asnumpy(x[3]) == dpt.asnumpy(val) @@ -684,10 +684,6 @@ def test_take_strided(data_dt, order): np.take(xs_np, ind_np, axis=1), dpt.asnumpy(dpt.take(xs, ind, axis=1)), ) - assert_array_equal( - xs_np[ind_np, ind_np], - dpt.asnumpy(dpt.take(xs, [ind, ind], axis=0)), - ) @pytest.mark.parametrize( @@ -751,7 +747,7 @@ def test_take_strided_indices(ind_dt, order): inds_np = ind_np[s, ::sgn] assert_array_equal( np.take(x_np, inds_np, axis=0), - dpt.asnumpy(dpt.take(x, inds, axis=0)), + dpt.asnumpy(x[inds]), ) @@ -828,7 +824,7 @@ def test_put_strided_destination(data_dt, order): x_np1[ind_np, ind_np] = val_np x1 = dpt.copy(xs) - dpt.put(x1, [ind, ind], val, axis=0) + x1[ind, ind] = val assert_array_equal(x_np1, dpt.asnumpy(x1)) @@ -887,7 +883,7 @@ def 
test_put_strided_indices(ind_dt, order): inds_np = ind_np[s, ::sgn] x_copy = dpt.copy(x) - dpt.put(x_copy, inds, val, axis=0) + x_copy[inds] = val x_np_copy = x_np.copy() x_np_copy[inds_np] = val_np @@ -899,7 +895,7 @@ def test_take_arg_validation(): q = get_queue_or_skip() x = dpt.arange(4, dtype="i4", sycl_queue=q) - ind0 = dpt.arange(2, dtype=np.intp, sycl_queue=q) + ind0 = dpt.arange(4, dtype=np.intp, sycl_queue=q) ind1 = dpt.arange(2.0, dtype="f", sycl_queue=q) with pytest.raises(TypeError): @@ -919,13 +915,15 @@ def test_take_arg_validation(): dpt.take(x, ind0, mode=0) with pytest.raises(ValueError): dpt.take(dpt.reshape(x, (2, 2)), ind0, axis=None) + with pytest.raises(ValueError): + dpt.take(x, dpt.reshape(ind0, (2, 2))) def test_put_arg_validation(): q = get_queue_or_skip() x = dpt.arange(4, dtype="i4", sycl_queue=q) - ind0 = dpt.arange(2, dtype=np.intp, sycl_queue=q) + ind0 = dpt.arange(4, dtype=np.intp, sycl_queue=q) ind1 = dpt.arange(2.0, dtype="f", sycl_queue=q) val = dpt.asarray(2, x.dtype, sycl_queue=q) @@ -946,6 +944,8 @@ def test_put_arg_validation(): with pytest.raises(ValueError): dpt.put(x, ind0, val, mode=0) + with pytest.raises(ValueError): + dpt.put(x, dpt.reshape(ind0, (2, 2)), val) def test_advanced_indexing_compute_follows_data(): From 13c5db754e42fd446e9de46f908e15dc0a9c8c2d Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Mar 2023 23:22:11 -0600 Subject: [PATCH 57/57] Fixed rst in docstrings of extract/place --- dpctl/tensor/_indexing_functions.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dpctl/tensor/_indexing_functions.py b/dpctl/tensor/_indexing_functions.py index c312d9e2b9..e585d6bf69 100644 --- a/dpctl/tensor/_indexing_functions.py +++ b/dpctl/tensor/_indexing_functions.py @@ -212,11 +212,11 @@ def extract(condition, arr): Returns the elements of an array that satisfies the condition. - If `condition` is boolean :func:``dpctl.tensor.extract`` is + If `condition` is boolean ``dpctl.tensor.extract`` is equivalent to ``arr[condition]``. - Note that :func:``dpctl.tensor.place`` does the opposite of - :func:``dpctl.tensor.extract``. + Note that ``dpctl.tensor.place`` does the opposite of + ``dpctl.tensor.extract``. Args: conditions: usm_ndarray @@ -255,7 +255,7 @@ def place(arr, mask, vals): Change elements of an array based on conditional and input values. - If `mask` is boolean :func:``dpctl.tensor.place`` is + If `mask` is boolean ``dpctl.tensor.place`` is equivalent to ``arr[condition] = vals``. Args: