From 2446b00354533e69723e581d140da6ed542ea61d Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 2 Mar 2023 19:35:18 -0600 Subject: [PATCH 1/8] Implements place, extract, nonzero kernels, and Python API for them Implemented mask_positions, _extract, _place, _nonzero and _array_overlap APIs. --- dpctl/tensor/CMakeLists.txt | 1 + .../kernels/boolean_advanced_indexing.hpp | 948 ++++++++++++++ .../source/boolean_advanced_indexing.cpp | 1085 +++++++++++++++++ .../source/boolean_advanced_indexing.hpp | 84 ++ .../source/simplify_iteration_space.cpp | 269 ++++ .../source/simplify_iteration_space.hpp | 36 + dpctl/tensor/libtensor/source/tensor_py.cpp | 31 + 7 files changed, 2454 insertions(+) create mode 100644 dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp create mode 100644 dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp create mode 100644 dpctl/tensor/libtensor/source/boolean_advanced_indexing.hpp diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index 3f5780cd75..300baa98c9 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -32,6 +32,7 @@ pybind11_add_module(${python_module_name} MODULE ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp new file mode 100644 index 0000000000..b42b7869d2 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp @@ -0,0 +1,948 @@ +//=== boolean_advance_indexing.hpp - ---*-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for advanced tensor index operations. 
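+///
+/// A brief sketch of the approach, as implemented by the kernels below: the
+/// boolean mask is first reduced to its inclusive cumulative sum, e.g. a mask
+/// [0, 1, 0, 1, 1] scans to [0, 1, 1, 2, 3]. Position i is then treated as
+/// set when cumsum[i] == ((i > 0) ? cumsum[i - 1] + 1 : 1), and cumsum[i] - 1
+/// serves as the packed (masked) index: the destination index for extract,
+/// and the right-hand-side index for place.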
+//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include "utils/strided_iters.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace indexing +{ + +namespace py = pybind11; + +template T ceiling_quotient(T n, T m) +{ + return (n + m - 1) / m; +} +template T1 ceiling_quotient(T1 n, T2 m) +{ + return ceiling_quotient(n, static_cast(m)); +} + +template +class inclusive_scan_rec_local_scan_krn; + +template +class inclusive_scan_rec_chunk_update_krn; + +struct NoOpIndexer +{ + size_t operator()(size_t gid) const + { + return gid; + } +}; + +struct StridedIndexer +{ + StridedIndexer(int _nd, + py::ssize_t _offset, + py::ssize_t const *_packed_shape_strides) + : nd(_nd), starting_offset(_offset), + shape_strides(_packed_shape_strides) + { + } + + size_t operator()(size_t gid) const + { + CIndexer_vector _ind(nd); + py::ssize_t relative_offset(0); + _ind.get_displacement( + static_cast(gid), + shape_strides, // shape ptr + shape_strides + nd, // strides ptr + relative_offset); + return starting_offset + relative_offset; + } + +private: + int nd; + py::ssize_t starting_offset; + py::ssize_t const *shape_strides; +}; + +struct Strided1DIndexer +{ + Strided1DIndexer(py::ssize_t _offset, py::ssize_t _size, py::ssize_t _step) + : offset(_offset), size(static_cast(_size)), step(_step) + { + } + + size_t operator()(size_t gid) const + { + return static_cast(offset + std::min(gid, size) * step); + } + +private: + py::ssize_t offset = 0; + size_t size = 1; + py::ssize_t step = 1; +}; + +template struct ZeroChecker +{ + + ZeroChecker(_IndexerFn _indexer) : indexer_fn(_indexer) {} + + template + bool operator()(dataT const *data, size_t gid) const + { + constexpr dataT _zero(0); + + return data[indexer_fn(gid)] == _zero; + } + +private: + _IndexerFn indexer_fn; +}; + +/* + * for integer type maskT, + * output[j] = sum( input[s0 + i * s1], 0 <= i <= j) + * for 0 <= j < n_elems + */ +template +sycl::event inclusive_scan_rec(sycl::queue exec_q, + size_t n_elems, + size_t wg_size, + const inputT *input, + outputT *output, + size_t s0, + size_t s1, + IndexerT indexer, + std::vector const &depends = {}) +{ + size_t n_groups = ceiling_quotient(n_elems, n_wi * wg_size); + + sycl::event inc_scan_phase1_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using slmT = sycl::local_accessor; + + auto lws = sycl::range<1>(wg_size); + auto gws = sycl::range<1>(n_groups * wg_size); + + slmT slm_iscan_tmp(lws, cgh); + + ZeroChecker is_zero_fn(indexer); + + cgh.parallel_for, n_wi>>( + sycl::nd_range<1>(gws, lws), + [=](sycl::nd_item<1> it) + { + auto chunk_gid = it.get_global_id(0); + auto lid = it.get_local_id(0); + + std::array local_isum; + + size_t i = chunk_gid * n_wi; + for (size_t m_wi = 0; m_wi < n_wi; ++m_wi) { + constexpr outputT out_zero(0); + constexpr outputT out_one(1); + local_isum[m_wi] = + (i + m_wi < n_elems) + ? (is_zero_fn(input, s0 + s1 * (i + m_wi)) ? 
out_zero + : out_one) + : out_zero; + } + +// local_isum is now result of +// inclusive scan of locally stored mask indicators +#pragma unroll + for (size_t m_wi = 1; m_wi < n_wi; ++m_wi) { + local_isum[m_wi] += local_isum[m_wi - 1]; + } + + size_t wg_iscan_val = + sycl::inclusive_scan_over_group(it.get_group(), + local_isum.back(), + sycl::plus(), + size_t(0)); + + slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val; + it.barrier(sycl::access::fence_space::local_space); + size_t addand = (lid == 0) ? 0 : slm_iscan_tmp[lid]; + it.barrier(sycl::access::fence_space::local_space); + +#pragma unroll + for (size_t m_wi = 0; m_wi < n_wi; ++m_wi) { + local_isum[m_wi] += addand; + } + + for (size_t m_wi = 0; m_wi < n_wi && i + m_wi < n_elems; ++m_wi) { + output[i + m_wi] = local_isum[m_wi]; + } + } + ); + }); + + sycl::event out_event = inc_scan_phase1_ev; + if (n_groups > 1) { + outputT *temp = sycl::malloc_device(n_groups - 1, exec_q); + + auto chunk_size = wg_size * n_wi; + + NoOpIndexer _no_op_indexer{}; + auto e2 = inclusive_scan_rec( + exec_q, n_groups - 1, wg_size, output, temp, chunk_size - 1, + chunk_size, _no_op_indexer, {inc_scan_phase1_ev}); + + // output[ chunk_size * (i + 1) + j] += temp[i] + auto e3 = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(e2); + cgh.parallel_for>( + {n_elems}, + [=](auto wiid) + { + auto gid = wiid[0]; + auto i = (gid / chunk_size); + output[gid] += (i > 0) ? temp[i - 1] : 0; + } + ); + }); + + // dangling task to free the temporary + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(e3); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, temp]() { sycl::free(temp, ctx); }); + }); + + out_event = e3; + } + + return out_event; +} + +template struct TwoOffsets +{ + TwoOffsets() : first_offset(0), second_offset(0) {} + TwoOffsets(displacementT first_offset_, displacementT second_offset_) + : first_offset(first_offset_), second_offset(second_offset_) + { + } + + displacementT get_first_offset() const + { + return first_offset; + } + displacementT get_second_offset() const + { + return second_offset; + } + +private: + displacementT first_offset = 0; + displacementT second_offset = 0; +}; + +struct TwoOffsets_StridedIndexer +{ + TwoOffsets_StridedIndexer(int common_nd, + py::ssize_t first_offset_, + py::ssize_t second_offset_, + py::ssize_t const *_packed_shape_strides) + : nd(common_nd), starting_first_offset(first_offset_), + starting_second_offset(second_offset_), + shape_strides(_packed_shape_strides) + { + } + + TwoOffsets operator()(py::ssize_t gid) const + { + CIndexer_vector _ind(nd); + py::ssize_t relative_first_offset(0); + py::ssize_t relative_second_offset(0); + _ind.get_displacement( + gid, + shape_strides, // shape ptr + shape_strides + nd, // src strides ptr + shape_strides + 2 * nd, // src strides ptr + relative_first_offset, relative_second_offset); + return TwoOffsets( + starting_first_offset + relative_first_offset, + starting_second_offset + relative_second_offset); + } + +private: + int nd; + py::ssize_t starting_first_offset; + py::ssize_t starting_second_offset; + py::ssize_t const *shape_strides; +}; + +struct TwoZeroOffsets_Indexer +{ + TwoZeroOffsets_Indexer() {} + + TwoOffsets operator()(py::ssize_t) const + { + return TwoOffsets(); + } +}; + +template +struct MaskedExtractStridedFunctor +{ + MaskedExtractStridedFunctor(const char *src_data_p, + const char *cumsum_data_p, + char *dst_data_p, + size_t orthog_iter_size, + size_t masked_iter_size, + OrthogIndexerT orthog_src_dst_indexer_, + MaskedSrcIndexerT 
masked_src_indexer_, + MaskedDstIndexerT masked_dst_indexer_) + : src_cp(src_data_p), cumsum_cp(cumsum_data_p), dst_cp(dst_data_p), + orthog_nelems(orthog_iter_size), masked_nelems(masked_iter_size), + orthog_src_dst_indexer(orthog_src_dst_indexer_), + masked_src_indexer(masked_src_indexer_), + masked_dst_indexer(masked_dst_indexer_) + { + } + + void operator()(sycl::id<1> idx) const + { + const dataT *src_data = reinterpret_cast(src_cp); + dataT *dst_data = reinterpret_cast(dst_cp); + const indT *cumsum_data = reinterpret_cast(cumsum_cp); + + size_t global_i = idx[0]; + size_t orthog_i = global_i / masked_nelems; + size_t masked_i = global_i - masked_nelems * orthog_i; + + indT current_running_count = cumsum_data[masked_i]; + bool mask_set = + (masked_i == 0) + ? (current_running_count == 1) + : (current_running_count == cumsum_data[masked_i - 1] + 1); + + // dst[cumsum[i], j] - 1 = src[i, j] if cumsum[i] == ((i > 0) ? + // cumsum[i-1] + // + 1 : 1) + if (mask_set) { + auto orthog_offsets = + orthog_src_dst_indexer(static_cast(orthog_i)); + + size_t total_src_offset = masked_src_indexer(masked_i) + + orthog_offsets.get_first_offset(); + size_t total_dst_offset = + masked_dst_indexer(current_running_count - 1) + + orthog_offsets.get_second_offset(); + + dst_data[total_dst_offset] = src_data[total_src_offset]; + } + } + +private: + const char *src_cp = nullptr; + const char *cumsum_cp = nullptr; + char *dst_cp = nullptr; + size_t orthog_nelems = 0; + size_t masked_nelems = 0; + OrthogIndexerT + orthog_src_dst_indexer; // has nd, shape, src_strides, dst_strides for + // dimensions that ARE NOT masked + MaskedSrcIndexerT masked_src_indexer; // has nd, shape, src_strides for + // dimensions that ARE masked + MaskedDstIndexerT + masked_dst_indexer; // has 1, dst_strides for dimensions that ARE masked +}; + +template +struct MaskedPlaceStridedFunctor +{ + MaskedPlaceStridedFunctor(char *dst_data_p, + const char *cumsum_data_p, + const char *rhs_data_p, + size_t orthog_iter_size, + size_t masked_iter_size, + OrthogIndexerT orthog_dst_rhs_indexer_, + MaskedDstIndexerT masked_dst_indexer_, + MaskedRhsIndexerT masked_rhs_indexer_) + : dst_cp(dst_data_p), cumsum_cp(cumsum_data_p), rhs_cp(rhs_data_p), + orthog_nelems(orthog_iter_size), masked_nelems(masked_iter_size), + orthog_dst_rhs_indexer(orthog_dst_rhs_indexer_), + masked_dst_indexer(masked_dst_indexer_), + masked_rhs_indexer(masked_rhs_indexer_) + { + } + + void operator()(sycl::id<1> idx) const + { + dataT *dst_data = reinterpret_cast(dst_cp); + const indT *cumsum_data = reinterpret_cast(cumsum_cp); + const dataT *rhs_data = reinterpret_cast(rhs_cp); + + size_t global_i = idx[0]; + size_t orthog_i = global_i / masked_nelems; + size_t masked_i = global_i - masked_nelems * orthog_i; + + indT current_running_count = cumsum_data[masked_i]; + bool mask_set = + (masked_i == 0) + ? (current_running_count == 1) + : (current_running_count == cumsum_data[masked_i - 1] + 1); + + // src[i, j] = rhs[cumsum[i] - 1, j] if cumsum[i] == ((i > 0) ? 
+ // cumsum[i-1] + // + 1 : 1) + if (mask_set) { + auto orthog_offsets = + orthog_dst_rhs_indexer(static_cast(orthog_i)); + + size_t total_dst_offset = masked_dst_indexer(masked_i) + + orthog_offsets.get_first_offset(); + size_t total_rhs_offset = + masked_rhs_indexer(current_running_count - 1) + + orthog_offsets.get_second_offset(); + + dst_data[total_dst_offset] = rhs_data[total_rhs_offset]; + } + } + +private: + char *dst_cp = nullptr; + const char *cumsum_cp = nullptr; + const char *rhs_cp = nullptr; + size_t orthog_nelems = 0; + size_t masked_nelems = 0; + OrthogIndexerT + orthog_dst_rhs_indexer; // has nd, shape, dst_strides, rhs_strides for + // dimensions that ARE NOT masked + MaskedDstIndexerT masked_dst_indexer; // has nd, shape, dst_strides for + // dimensions that ARE masked + MaskedRhsIndexerT + masked_rhs_indexer; // has 1, rhs_strides for dimensions that ARE masked +}; + +// mask positions + +typedef size_t (*mask_positions_contig_impl_fn_ptr_t)( + sycl::queue, + size_t, + const char *, + char *, + std::vector const &); + +template +size_t mask_positions_contig_impl(sycl::queue q, + size_t n_elems, + const char *mask, + char *cumsum, + std::vector const &depends = {}) +{ + constexpr int n_wi = 8; + const maskT *mask_data_ptr = reinterpret_cast(mask); + cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); + size_t wg_size = 128; + + NoOpIndexer flat_indexer{}; + + sycl::event comp_ev = inclusive_scan_rec( + q, n_elems, wg_size, mask_data_ptr, cumsum_data_ptr, 0, 1, flat_indexer, + depends); + + cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); + + cumsumT *last_elem_host_usm = sycl::malloc_host(1, q); + + if (last_elem_host_usm == nullptr) { + throw std::bad_alloc(); + } + sycl::event copy_e = + q.copy(last_elem, last_elem_host_usm, 1, {comp_ev}); + copy_e.wait(); + size_t return_val = static_cast(*last_elem_host_usm); + sycl::free(last_elem_host_usm, q); + + return return_val; +} + +template struct MaskPositionsContigFactory +{ + fnT get() + { + fnT fn = mask_positions_contig_impl; + return fn; + } +}; + +typedef size_t (*mask_positions_strided_impl_fn_ptr_t)( + sycl::queue, + size_t, + const char *, + int, + py::ssize_t, + const py::ssize_t *, + char *, + std::vector const &); + +template +size_t mask_positions_strided_impl(sycl::queue q, + size_t n_elems, + const char *mask, + int nd, + py::ssize_t input_offset, + const py::ssize_t *shape_strides, + char *cumsum, + std::vector const &depends = {}) +{ + constexpr int n_wi = 8; + const maskT *mask_data_ptr = reinterpret_cast(mask); + cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); + size_t wg_size = 128; + + StridedIndexer strided_indexer{nd, input_offset, shape_strides}; + + sycl::event comp_ev = + inclusive_scan_rec( + q, n_elems, wg_size, mask_data_ptr, cumsum_data_ptr, 0, 1, + strided_indexer, depends); + + cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); + + cumsumT *last_elem_host_usm = sycl::malloc_host(1, q); + + if (last_elem_host_usm == nullptr) { + throw std::bad_alloc(); + } + sycl::event copy_e = + q.copy(last_elem, last_elem_host_usm, 1, {comp_ev}); + copy_e.wait(); + size_t return_val = static_cast(*last_elem_host_usm); + sycl::free(last_elem_host_usm, q); + + return return_val; +} + +template struct MaskPositionsStridedFactory +{ + fnT get() + { + fnT fn = mask_positions_strided_impl; + return fn; + } +}; + +// ======= Masked extraction ================================ + +template +class masked_extract_all_slices_strided_impl_krn; + +typedef sycl::event 
(*masked_extract_all_slices_strided_impl_fn_ptr_t)( + sycl::queue, + py::ssize_t, + const char *, + const char *, + char *, + int, + py::ssize_t const *, + py::ssize_t, + py::ssize_t, + const std::vector &); + +template +sycl::event masked_extract_all_slices_strided_impl( + sycl::queue exec_q, + py::ssize_t iteration_size, + const char *src_p, + const char *cumsum_p, + char *dst_p, + int nd, + const py::ssize_t + *packed_src_shape_strides, // [src_shape, src_strides], length 2*nd + py::ssize_t dst_size, // dst is 1D + py::ssize_t dst_stride, + const std::vector &depends = {}) +{ + // using MaskedExtractStridedFunctor; + // using Strided1DIndexer; + // using StridedIndexer; + // using TwoZeroOffsets_Indexer; + + TwoZeroOffsets_Indexer orthog_src_dst_indexer{}; + + /* StridedIndexer(int _nd, py::ssize_t _offset, py::ssize_t const + * *_packed_shape_strides) */ + StridedIndexer masked_src_indexer(nd, 0, packed_src_shape_strides); + Strided1DIndexer masked_dst_indexer(0, dst_size, dst_stride); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.parallel_for>( + sycl::range<1>(static_cast(iteration_size)), + MaskedExtractStridedFunctor( + src_p, cumsum_p, dst_p, 1, iteration_size, + orthog_src_dst_indexer, masked_src_indexer, + masked_dst_indexer)); + }); + + return comp_ev; +} + +typedef sycl::event (*masked_extract_some_slices_strided_impl_fn_ptr_t)( + sycl::queue, + py::ssize_t, + py::ssize_t, + const char *, + const char *, + char *, + int, + py::ssize_t const *, + py::ssize_t, + py::ssize_t, + int, + py::ssize_t const *, + py::ssize_t, + py::ssize_t, + const std::vector &); + +template +class masked_extract_some_slices_strided_impl_krn; + +template +sycl::event masked_extract_some_slices_strided_impl( + sycl::queue exec_q, + py::ssize_t orthog_nelems, + py::ssize_t masked_nelems, + const char *src_p, + const char *cumsum_p, + char *dst_p, + int orthog_nd, + const py::ssize_t + *packed_ortho_src_dst_shape_strides, // [ortho_shape, ortho_src_strides, + // ortho_dst_strides], length + // 3*ortho_nd + py::ssize_t ortho_src_offset, + py::ssize_t ortho_dst_offset, + int masked_nd, + const py::ssize_t *packed_masked_src_shape_strides, // [masked_src_shape, + // masked_src_strides], + // length 2*masked_nd + py::ssize_t masked_dst_size, // mask_dst is 1D + py::ssize_t masked_dst_stride, + const std::vector &depends = {}) +{ + // using MaskedExtractStridedFunctor; + // using Strided1DIndexer; + // using StridedIndexer; + // using TwoOffsets_StridedIndexer; + + TwoOffsets_StridedIndexer orthog_src_dst_indexer{ + orthog_nd, ortho_src_offset, ortho_dst_offset, + packed_ortho_src_dst_shape_strides}; + + StridedIndexer masked_src_indexer{masked_nd, 0, + packed_masked_src_shape_strides}; + Strided1DIndexer masked_dst_indexer{0, masked_dst_size, masked_dst_stride}; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.parallel_for>( + sycl::range<1>(static_cast(orthog_nelems * masked_nelems)), + MaskedExtractStridedFunctor( + src_p, cumsum_p, dst_p, orthog_nelems, masked_nelems, + orthog_src_dst_indexer, masked_src_indexer, + masked_dst_indexer)); + }); + + return comp_ev; +} + +template struct MaskExtractAllSlicesStridedFactory +{ + fnT get() + { + fnT fn = masked_extract_all_slices_strided_impl; + return fn; + } +}; + +template struct MaskExtractSomeSlicesStridedFactory +{ + fnT get() + { + fnT fn = masked_extract_some_slices_strided_impl; + return fn; + } +}; + +// Masked placement + +template +class 
masked_place_all_slices_strided_impl_krn; + +typedef sycl::event (*masked_place_all_slices_strided_impl_fn_ptr_t)( + sycl::queue, + py::ssize_t, + char *, + const char *, + const char *, + int, + py::ssize_t const *, + py::ssize_t, + py::ssize_t, + const std::vector &); + +template +sycl::event masked_place_all_slices_strided_impl( + sycl::queue exec_q, + py::ssize_t iteration_size, + char *dst_p, + const char *cumsum_p, + const char *rhs_p, + int nd, + const py::ssize_t + *packed_dst_shape_strides, // [dst_shape, dst_strides], length 2*nd + py::ssize_t rhs_size, // rhs is 1D + py::ssize_t rhs_stride, + const std::vector &depends = {}) +{ + // using MaskedPlaceStridedFunctor; + // using Strided1DIndexer; + // using StridedIndexer; + // using TwoZeroOffsets_Indexer; + + TwoZeroOffsets_Indexer orthog_dst_rhs_indexer{}; + + /* StridedIndexer(int _nd, py::ssize_t _offset, py::ssize_t const + * *_packed_shape_strides) */ + StridedIndexer masked_dst_indexer(nd, 0, packed_dst_shape_strides); + Strided1DIndexer masked_rhs_indexer(0, rhs_size, rhs_stride); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.parallel_for>( + sycl::range<1>(static_cast(iteration_size)), + MaskedPlaceStridedFunctor( + dst_p, cumsum_p, rhs_p, 1, iteration_size, + orthog_dst_rhs_indexer, masked_dst_indexer, + masked_rhs_indexer)); + }); + + return comp_ev; +} + +typedef sycl::event (*masked_place_some_slices_strided_impl_fn_ptr_t)( + sycl::queue, + py::ssize_t, + py::ssize_t, + char *, + const char *, + const char *, + int, + py::ssize_t const *, + py::ssize_t, + py::ssize_t, + int, + py::ssize_t const *, + py::ssize_t, + py::ssize_t, + const std::vector &); + +template +class masked_place_some_slices_strided_impl_krn; + +template +sycl::event masked_place_some_slices_strided_impl( + sycl::queue exec_q, + py::ssize_t orthog_nelems, + py::ssize_t masked_nelems, + char *dst_p, + const char *cumsum_p, + const char *rhs_p, + int orthog_nd, + const py::ssize_t + *packed_ortho_dst_rhs_shape_strides, // [ortho_shape, ortho_dst_strides, + // ortho_rhs_strides], length + // 3*ortho_nd + py::ssize_t ortho_dst_offset, + py::ssize_t ortho_rhs_offset, + int masked_nd, + const py::ssize_t *packed_masked_dst_shape_strides, // [masked_dst_shape, + // masked_dst_strides], + // length 2*masked_nd + py::ssize_t masked_rhs_size, // mask_dst is 1D + py::ssize_t masked_rhs_stride, + const std::vector &depends = {}) +{ + // using MaskedPlaceStridedFunctor; + // using Strided1DIndexer; + // using StridedIndexer; + // using TwoOffsets_StridedIndexer; + + TwoOffsets_StridedIndexer orthog_dst_rhs_indexer{ + orthog_nd, ortho_dst_offset, ortho_rhs_offset, + packed_ortho_dst_rhs_shape_strides}; + + /* StridedIndexer(int _nd, py::ssize_t _offset, py::ssize_t const + * *_packed_shape_strides) */ + StridedIndexer masked_dst_indexer{masked_nd, 0, + packed_masked_dst_shape_strides}; + Strided1DIndexer masked_rhs_indexer{0, masked_rhs_size, masked_rhs_stride}; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.parallel_for>( + sycl::range<1>(static_cast(orthog_nelems * masked_nelems)), + MaskedPlaceStridedFunctor( + dst_p, cumsum_p, rhs_p, orthog_nelems, masked_nelems, + orthog_dst_rhs_indexer, masked_dst_indexer, + masked_rhs_indexer)); + }); + + return comp_ev; +} + +static masked_place_all_slices_strided_impl_fn_ptr_t + masked_place_all_slices_strided_impl_dispatch_vector + [dpctl::tensor::detail::num_types]; + +template struct 
MaskPlaceAllSlicesStridedFactory +{ + fnT get() + { + fnT fn = masked_place_all_slices_strided_impl; + return fn; + } +}; + +static masked_place_some_slices_strided_impl_fn_ptr_t + masked_place_some_slices_strided_impl_dispatch_vector + [dpctl::tensor::detail::num_types]; + +template struct MaskPlaceSomeSlicesStridedFactory +{ + fnT get() + { + fnT fn = masked_place_some_slices_strided_impl; + return fn; + } +}; + +// Non-zero + +class non_zero_indexes_krn; + +template +sycl::event non_zero_indexes_impl(sycl::queue exec_q, + py::ssize_t iter_size, + py::ssize_t nz_elems, + int nd, + const char *cumsum_cp, + char *indexes_cp, + const py::ssize_t *mask_shape, + std::vector const &depends) +{ + const indT1 *cumsum_data = reinterpret_cast(cumsum_cp); + indT2 *indexes_data = reinterpret_cast(indexes_cp); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.parallel_for( + sycl::range<1>(iter_size), [=](sycl::id<1> idx) { + auto i = idx[0]; + + auto cs_curr_val = cumsum_data[i] - 1; + auto cs_prev_val = (i > 0) ? cumsum_data[i - 1] : indT1(0); + bool cond = (cs_curr_val == cs_prev_val); + + py::ssize_t i_ = static_cast(i); + for (int dim = nd; --dim > 0;) { + auto sd = mask_shape[dim]; + py::ssize_t q = i_ / sd; + py::ssize_t r = (i_ - q * sd); + if (cond) { + indexes_data[cs_curr_val + dim * nz_elems] = + static_cast(r); + } + i_ = q; + } + if (cond) { + indexes_data[cs_curr_val] = static_cast(i_); + } + }); + }); + + return comp_ev; +} + +} // namespace indexing +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp new file mode 100644 index 0000000000..1534b38391 --- /dev/null +++ b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp @@ -0,0 +1,1085 @@ +//===-- boolean_advanced_indexing.cpp - --*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines implementation functions of dpctl.tensor.place and +/// dpctl.tensor.extract, dpctl.tensor.nonzero +//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include +#include +#include + +#include "boolean_advanced_indexing.hpp" +#include "kernels/boolean_advanced_indexing.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +struct sink_t +{ + sink_t(){}; + template sink_t(T &&){}; +}; + +template std::size_t accumulate_size(std::size_t &s, V &&v) +{ + return s += v.size(); +} + +template sink_t inserter(V &lhs, U &&rhs) +{ + lhs.insert(lhs.end(), rhs.begin(), rhs.end()); + return {}; +} + +template +std::vector concat(std::vector lhs, Vs &&... vs) +{ + std::size_t s = lhs.size(); + { + // limited scope ensures array is freed + [[maybe_unused]] sink_t tmp[] = {accumulate_size(s, vs)..., 0}; + } + lhs.reserve(s); + { + // array of no-data objects ensures ordering of calls to inserter + [[maybe_unused]] sink_t tmp[] = {inserter(lhs, std::forward(vs))..., + 0}; + } + + return std::move(lhs); // prevent return-value optimization +} + +template +std::tuple +device_allocate_and_pack(sycl::queue q, + std::vector &host_task_events, + Vs &&... vs) +{ + + // memory transfer optimization, use USM-host for temporary speeds up + // tranfer to device, especially on dGPUs + using usm_host_allocatorT = + sycl::usm_allocator; + using shT = std::vector; + + usm_host_allocatorT usm_host_allocator(q); + shT empty{0, usm_host_allocator}; + shT packed_shape_strides = concat(empty, vs...); + + auto packed_shape_strides_owner = + std::make_shared(std::move(packed_shape_strides)); + + auto sz = packed_shape_strides_owner->size(); + indT *shape_strides = sycl::malloc_device(sz, q); + + sycl::event copy_ev = + q.copy(packed_shape_strides_owner->data(), shape_strides, sz); + + sycl::event cleanup_host_task_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(copy_ev); + cgh.host_task([packed_shape_strides_owner] { + // increment shared pointer ref-count to keep it alive + // till copy operation completes; + }); + }); + host_task_events.push_back(cleanup_host_task_ev); + + return std::make_tuple(shape_strides, sz, copy_ev); +} + +/* @brief check for overlap of memory regions behind arrays. + +Presenty assume that array occupies all bytes between smallest and largest +displaced elements. + +TODO: Write proper Frobenius solver to account for holes, e.g. + overlap( x_contig[::2], x_contig[1::2]) should give False, + while this implementation gives True. 
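+
+In other words, the check currently reduces to an interval-intersection test
+on the byte ranges spanned by the two arrays (from their smallest to their
+largest displaced element); it may report false positives, but it never
+misses a true overlap.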
+*/ +bool overlap(dpctl::tensor::usm_ndarray ar1, dpctl::tensor::usm_ndarray ar2) +{ + const char *ar1_data = ar1.get_data(); + + const auto &ar1_offsets = ar1.get_minmax_offsets(); + py::ssize_t ar1_elem_size = static_cast(ar1.get_elemsize()); + + const char *ar2_data = ar2.get_data(); + const auto &ar2_offsets = ar2.get_minmax_offsets(); + py::ssize_t ar2_elem_size = static_cast(ar2.get_elemsize()); + + /* Memory of array1 extends from */ + /* [ar1_data + ar1_offsets.first * ar1_elem_size, ar1_data + + * ar1_offsets.second * ar1_elem_size + ar1_elem_size] */ + /* Memory of array2 extends from */ + /* [ar2_data + ar2_offsets.first * ar2_elem_size, ar2_data + + * ar2_offsets.second * ar2_elem_size + ar2_elem_size] */ + + /* Intervals [x0, x1] and [y0, y1] do not overlap if (x0 <= x1) && (y0 <= + * y1) + * && (x1 <=y0 || y1 <= x0 ) */ + /* Given that x0 <= x1 and y0 <= y1 are true by construction, the condition + * for overlap us (x1 > y0) && (y1 > x0) */ + + /* Applying: + (ar1_data + ar1_offsets.second * ar1_elem_size + ar1_elem_size > + ar2_data + + ar2_offsets.first * ar2_elem_size) && (ar2_data + ar2_offsets.second * + ar2_elem_size + ar2_elem_size > ar1_data + ar1_offsets.first * + ar1_elem_size) + */ + + auto byte_distance = static_cast(ar2_data - ar1_data); + + py::ssize_t x1_minus_y0 = + (-byte_distance + + (ar1_elem_size + (ar1_offsets.second * ar1_elem_size) - + (ar2_offsets.first * ar2_elem_size))); + + py::ssize_t y1_minus_x0 = + (byte_distance + (ar2_elem_size + (ar2_offsets.second * ar2_elem_size) - + (ar1_offsets.first * ar1_elem_size))); + + bool memory_overlap = (x1_minus_y0 > 0) && (y1_minus_x0 > 0); + + return memory_overlap; +} + +/* @brief Split shape/strides into dir1 (complementary to axis_start <= i < + * axis_end) and dir2 (along given set of axes) + */ +template +void _split_iteration_space(const shT &shape_vec, + const shT &strides_vec, + int axis_start, + int axis_end, + shT &dir1_shape_vec, + shT &dir2_shape_vec, + shT &dir1_strides_vec, + shT &dir2_strides_vec) +{ + int nd = static_cast(shape_vec.size()); + int dir2_sz = axis_end - axis_start; + int dir1_sz = nd - dir2_sz; + + assert(dir1_sz > 0); + assert(dir2_sz > 0); + + dir1_shape_vec.resize(dir1_sz); + dir2_shape_vec.resize(dir2_sz); + + std::copy(shape_vec.begin(), shape_vec.begin() + axis_start, + dir1_shape_vec.begin()); + std::copy(shape_vec.begin() + axis_end, shape_vec.end(), + dir1_shape_vec.begin() + axis_start); + + std::copy(shape_vec.begin() + axis_start, shape_vec.begin() + axis_end, + dir2_shape_vec.begin()); + + dir1_strides_vec.resize(dir1_sz); + dir2_strides_vec.resize(dir2_sz); + + std::copy(strides_vec.begin(), strides_vec.begin() + axis_start, + dir1_strides_vec.begin()); + std::copy(strides_vec.begin() + axis_end, strides_vec.end(), + dir1_strides_vec.begin() + axis_start); + + std::copy(strides_vec.begin() + axis_start, strides_vec.begin() + axis_end, + dir2_strides_vec.begin()); + + return; +} + +// Computation of positions of masked elements + +using dpctl::tensor::kernels::indexing::mask_positions_contig_impl_fn_ptr_t; +static mask_positions_contig_impl_fn_ptr_t + mask_positions_contig_dispatch_vector[dpctl::tensor::detail::num_types]; + +using dpctl::tensor::kernels::indexing::mask_positions_strided_impl_fn_ptr_t; +static mask_positions_strided_impl_fn_ptr_t + mask_positions_strided_dispatch_vector[dpctl::tensor::detail::num_types]; + +void populate_mask_positions_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::indexing::MaskPositionsContigFactory; + 
dpctl::tensor::detail::DispatchVectorBuilder< + mask_positions_contig_impl_fn_ptr_t, MaskPositionsContigFactory, + dpctl::tensor::detail::num_types> + dvb1; + dvb1.populate_dispatch_vector(mask_positions_contig_dispatch_vector); + + using dpctl::tensor::kernels::indexing::MaskPositionsStridedFactory; + dpctl::tensor::detail::DispatchVectorBuilder< + mask_positions_strided_impl_fn_ptr_t, MaskPositionsStridedFactory, + dpctl::tensor::detail::num_types> + dvb2; + dvb2.populate_dispatch_vector(mask_positions_strided_dispatch_vector); + + return; +} + +size_t py_mask_positions(dpctl::tensor::usm_ndarray mask, + dpctl::tensor::usm_ndarray cumsum, + sycl::queue exec_q, + std::vector const &depends) +{ + // cumsum is 1D + if (cumsum.get_ndim() != 1) { + throw py::value_error("Result array must be one-dimensional."); + } + + if (!cumsum.is_c_contiguous()) { + throw py::value_error("Expecting `cumsum` array must be C-contiguous."); + } + + // cumsum.shape == (mask.size,) + auto mask_size = mask.get_size(); + auto cumsum_size = cumsum.get_shape(0); + if (cumsum_size != mask_size) { + throw py::value_error("Inconsistent dimensions"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {mask, cumsum})) { + // FIXME: use ExecutionPlacementError + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if (mask_size == 0) { + return 0; + } + + int mask_typenum = mask.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + // mask can be any type + const char *mask_data = mask.get_data(); + char *cumsum_data = cumsum.get_data(); + + auto const &array_types = dpctl::tensor::detail::usm_ndarray_types(); + + int mask_typeid = array_types.typenum_to_lookup_id(mask_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + // cumsum must be int64_t only + constexpr int int64_typeid = + static_cast(dpctl::tensor::detail::typenum_t::INT64); + if (cumsum_typeid != int64_typeid) { + throw py::value_error( + "Cumulative sum array must have int64 data-type."); + } + + if (mask.is_c_contiguous()) { + auto fn = mask_positions_contig_dispatch_vector[mask_typeid]; + + return fn(exec_q, mask_size, mask_data, cumsum_data, depends); + } + + const py::ssize_t *shape = mask.get_shape_raw(); + const py::ssize_t *strides = mask.get_strides_raw(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_strides; + py::ssize_t offset(0); + + int mask_nd = mask.get_ndim(); + int nd = mask_nd; + + constexpr py::ssize_t itemsize = 1; // in elements + bool is_c_contig = mask.is_c_contiguous(); + bool is_f_contig = mask.is_f_contiguous(); + + dpctl::tensor::py_internal::simplify_iteration_space_1( + nd, shape, strides, itemsize, is_c_contig, is_f_contig, + simplified_shape, simplified_strides, offset); + + if (nd == 1 && simplified_strides[0] == 1) { + auto fn = mask_positions_contig_dispatch_vector[mask_typeid]; + + return fn(exec_q, mask_size, mask_data, cumsum_data, depends); + } + + // Strided implementation + auto strided_fn = mask_positions_strided_dispatch_vector[mask_typeid]; + std::vector host_task_events; + + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_strides); + py::ssize_t *shape_strides = std::get<0>(ptr_size_event_tuple); + sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + + if (2 * static_cast(nd) != std::get<1>(ptr_size_event_tuple)) { + copy_shape_ev.wait(); + sycl::event::wait(host_task_events); + sycl::free(shape_strides, exec_q); + throw 
std::runtime_error("Unexacted error"); + } + + std::vector dependent_events; + dependent_events.reserve(depends.size() + 1); + dependent_events.insert(dependent_events.end(), copy_shape_ev); + dependent_events.insert(dependent_events.end(), depends.begin(), + depends.end()); + + size_t total_set = strided_fn(exec_q, mask_size, mask_data, nd, offset, + shape_strides, cumsum_data, dependent_events); + + sycl::event::wait(host_task_events); + sycl::free(shape_strides, exec_q); + + return total_set; +} + +// Masked extraction + +using dpctl::tensor::kernels::indexing:: + masked_extract_all_slices_strided_impl_fn_ptr_t; + +static masked_extract_all_slices_strided_impl_fn_ptr_t + masked_extract_all_slices_strided_impl_dispatch_vector + [dpctl::tensor::detail::num_types]; + +using dpctl::tensor::kernels::indexing:: + masked_extract_some_slices_strided_impl_fn_ptr_t; + +static masked_extract_some_slices_strided_impl_fn_ptr_t + masked_extract_some_slices_strided_impl_dispatch_vector + [dpctl::tensor::detail::num_types]; + +void populate_masked_extract_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::indexing::MaskExtractAllSlicesStridedFactory; + dpctl::tensor::detail::DispatchVectorBuilder< + masked_extract_all_slices_strided_impl_fn_ptr_t, + MaskExtractAllSlicesStridedFactory, dpctl::tensor::detail::num_types> + dvb1; + dvb1.populate_dispatch_vector( + masked_extract_all_slices_strided_impl_dispatch_vector); + + using dpctl::tensor::kernels::indexing::MaskExtractSomeSlicesStridedFactory; + dpctl::tensor::detail::DispatchVectorBuilder< + masked_extract_some_slices_strided_impl_fn_ptr_t, + MaskExtractSomeSlicesStridedFactory, dpctl::tensor::detail::num_types> + dvb2; + dvb2.populate_dispatch_vector( + masked_extract_some_slices_strided_impl_dispatch_vector); +} + +std::pair +py_extract(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray cumsum, + int axis_start, // axis_start <= mask_i < axis_end + int axis_end, + dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + std::vector const &depends) +{ + int src_nd = src.get_ndim(); + if ((axis_start < 0 || axis_end > src_nd || axis_start >= axis_end)) { + throw py::value_error("Specified axes_start and axes_end are invalid."); + } + int mask_span_sz = axis_end - axis_start; + + int dst_nd = dst.get_ndim(); + if (src_nd != dst_nd + (mask_span_sz - 1)) { + throw py::value_error("Number of dimensions of source and destination " + "arrays is not consistent"); + } + + if (!cumsum.is_c_contiguous() || cumsum.get_ndim() != 1) { + throw py::value_error("cumsum array must be a C-contiguous vector"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, cumsum, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + py::ssize_t cumsum_sz = cumsum.get_size(); + + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + bool same_ortho_dims(true); + size_t ortho_nelems(1); // number of orthogonal iterations + + for (auto i = 0; i < axis_start; ++i) { + auto src_sh_i = src_shape[i]; + ortho_nelems *= src_sh_i; + same_ortho_dims = same_ortho_dims && (src_sh_i == dst_shape[i]); + } + for (auto i = axis_end; i < src_nd; ++i) { + auto src_sh_i = src_shape[i]; + ortho_nelems *= src_sh_i; + same_ortho_dims = + same_ortho_dims && (src_sh_i == dst_shape[i - (mask_span_sz - 1)]); + } + + size_t masked_src_nelems(1); + size_t masked_dst_nelems(dst_shape[axis_start]); + for (auto i = axis_start; i < axis_end; ++i) { + masked_src_nelems *= 
src_shape[i]; + } + + // masked_dst_nelems is number of set elements in the mask, or last element + // in cumsum + if (!same_ortho_dims || + (masked_src_nelems != static_cast(cumsum_sz))) { + throw py::value_error("Inconsistent array dimensions"); + } + + // ensure that dst is sufficiently ample + auto dst_offsets = dst.get_minmax_offsets(); + // destination must be ample enough to accomodate all elements + { + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < static_cast(ortho_nelems * masked_dst_nelems)) { + throw py::value_error( + "Memory addressed by the destination array can not " + "accomodate all the " + "array elements."); + } + } + + // check that dst does not intersect with src, not with cumsum. + if (overlap(dst, cumsum) || overlap(dst, src)) { + throw py::value_error("Destination array overlaps with inputs"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + auto const &array_types = dpctl::tensor::detail::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + constexpr int int64_typeid = + static_cast(dpctl::tensor::detail::typenum_t::INT64); + if (cumsum_typeid != int64_typeid) { + throw py::value_error( + "Unexact data type of cumsum array, expecting 'int64'"); + } + + if (src_typeid != dst_typeid) { + throw py::value_error( + "Destination array must have the same elemental data types"); + } + + char *src_data_p = src.get_data(); + char *dst_data_p = dst.get_data(); + char *cumsum_data_p = cumsum.get_data(); + + auto src_shape_vec = src.get_shape_vector(); + auto src_strides_vec = src.get_strides_vector(); + + auto dst_shape_vec = dst.get_shape_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + sycl::event extract_ev; + std::vector host_task_events{}; + if (axis_start == 0 && axis_end == src_nd) { + // empty orthogonal directions + auto fn = + masked_extract_all_slices_strided_impl_dispatch_vector[src_typeid]; + + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + py::ssize_t *packed_src_shape_strides = + std::get<0>(ptr_size_event_tuple1); + sycl::event copy_src_shape_strides_ev = + std::get<2>(ptr_size_event_tuple1); + + assert(dst_shape_vec.size() == 1); + assert(dst_strides_vec.size() == 1); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_src_shape_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + extract_ev = fn(exec_q, cumsum_sz, src_data_p, cumsum_data_p, + dst_data_p, src_nd, packed_src_shape_strides, + dst_shape_vec[0], dst_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(extract_ev); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, packed_src_shape_strides] { + sycl::free(packed_src_shape_strides, ctx); + }); + }); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + else { + // non-empty othogonal directions + auto fn = + masked_extract_some_slices_strided_impl_dispatch_vector[src_typeid]; + + int masked_src_nd = mask_span_sz; + int ortho_nd = src_nd - masked_src_nd; + + using shT = std::vector; + + shT ortho_src_shape; + shT masked_src_shape; + shT ortho_src_strides; 
+ shT masked_src_strides; + _split_iteration_space(src_shape_vec, src_strides_vec, axis_start, + axis_end, ortho_src_shape, + masked_src_shape, // 4 vectors modified + ortho_src_strides, masked_src_strides); + + shT ortho_dst_shape; + shT masked_dst_shape; + shT ortho_dst_strides; + shT masked_dst_strides; + _split_iteration_space(dst_shape_vec, dst_strides_vec, axis_start, + axis_start + 1, ortho_dst_shape, + masked_dst_shape, // 4 vectors modified + ortho_dst_strides, masked_dst_strides); + + assert(ortho_src_shape.size() == static_cast(ortho_nd)); + assert(ortho_dst_shape.size() == static_cast(ortho_nd)); + assert(std::equal(ortho_src_shape.begin(), ortho_src_shape.end(), + ortho_dst_shape.begin())); + + std::vector simplified_ortho_shape; + std::vector simplified_ortho_src_strides; + std::vector simplified_ortho_dst_strides; + + const py::ssize_t *_shape = ortho_src_shape.data(); + const py::ssize_t *_src_strides = ortho_src_strides.data(); + const py::ssize_t *_dst_strides = ortho_dst_strides.data(); + constexpr py::ssize_t _itemsize = 1; // in elements + + constexpr bool is_c_contig = false; + constexpr bool is_f_contig = false; + + py::ssize_t ortho_src_offset(0); + py::ssize_t ortho_dst_offset(0); + + dpctl::tensor::py_internal::simplify_iteration_space( + ortho_nd, _shape, _src_strides, _itemsize, is_c_contig, is_f_contig, + _dst_strides, _itemsize, is_c_contig, is_f_contig, + simplified_ortho_shape, simplified_ortho_src_strides, + simplified_ortho_dst_strides, ortho_src_offset, ortho_dst_offset); + + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, simplified_ortho_shape, + simplified_ortho_src_strides, simplified_ortho_dst_strides); + py::ssize_t *packed_ortho_src_dst_shape_strides = + std::get<0>(ptr_size_event_tuple1); + sycl::event copy_shape_strides_ev1 = std::get<2>(ptr_size_event_tuple1); + + auto ptr_size_event_tuple2 = device_allocate_and_pack( + exec_q, host_task_events, masked_src_shape, masked_src_strides); + py::ssize_t *packed_masked_src_shape_strides = + std::get<0>(ptr_size_event_tuple2); + sycl::event copy_shape_strides_ev2 = std::get<2>(ptr_size_event_tuple2); + + assert(masked_dst_shape.size() == 1); + assert(masked_dst_strides.size() == 1); + + std::vector all_deps; + all_deps.reserve(depends.size() + 2); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shape_strides_ev1); + all_deps.push_back(copy_shape_strides_ev2); + + assert(all_deps.size() == depends.size() + 2); + + // OrthogIndexerT orthog_src_dst_indexer_, MaskedIndexerT + // masked_src_indexer_, MaskedIndexerT masked_dst_indexer_ + extract_ev = fn(exec_q, ortho_nelems, masked_src_nelems, src_data_p, + cumsum_data_p, dst_data_p, + // data to build orthog_src_dst_indexer + ortho_nd, packed_ortho_src_dst_shape_strides, + ortho_src_offset, ortho_dst_offset, + // data to build masked_src_indexer + masked_src_nd, packed_masked_src_shape_strides, + // data to build masked_dst_indexer, + masked_dst_shape[0], masked_dst_strides[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(extract_ev); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, packed_ortho_src_dst_shape_strides, + packed_masked_src_shape_strides] { + sycl::free(packed_ortho_src_dst_shape_strides, ctx); + sycl::free(packed_masked_src_shape_strides, ctx); + }); + }); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + + host_task_events.push_back(extract_ev); + + sycl::event 
py_obj_management_host_task_ev = dpctl::utils::keep_args_alive( + exec_q, {src, cumsum, dst}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, extract_ev); +} + +// Masked placement + +using dpctl::tensor::kernels::indexing:: + masked_place_all_slices_strided_impl_fn_ptr_t; + +static masked_place_all_slices_strided_impl_fn_ptr_t + masked_place_all_slices_strided_impl_dispatch_vector + [dpctl::tensor::detail::num_types]; + +using dpctl::tensor::kernels::indexing:: + masked_place_some_slices_strided_impl_fn_ptr_t; + +static masked_place_some_slices_strided_impl_fn_ptr_t + masked_place_some_slices_strided_impl_dispatch_vector + [dpctl::tensor::detail::num_types]; + +void populate_masked_place_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::indexing::MaskPlaceAllSlicesStridedFactory; + dpctl::tensor::detail::DispatchVectorBuilder< + masked_place_all_slices_strided_impl_fn_ptr_t, + MaskPlaceAllSlicesStridedFactory, dpctl::tensor::detail::num_types> + dvb1; + dvb1.populate_dispatch_vector( + masked_place_all_slices_strided_impl_dispatch_vector); + + using dpctl::tensor::kernels::indexing::MaskPlaceSomeSlicesStridedFactory; + dpctl::tensor::detail::DispatchVectorBuilder< + masked_place_some_slices_strided_impl_fn_ptr_t, + MaskPlaceSomeSlicesStridedFactory, dpctl::tensor::detail::num_types> + dvb2; + dvb2.populate_dispatch_vector( + masked_place_some_slices_strided_impl_dispatch_vector); +} + +/* + * @brief Copy dst[i, ortho_id] = rhs[cumsum[i] - 1, ortho_id] if cumsum[i] == + * ((i > 0) ? cumsum[i-1] + 1 : 1) + */ +std::pair +py_place(dpctl::tensor::usm_ndarray dst, + dpctl::tensor::usm_ndarray cumsum, + int axis_start, // axis_start <= mask_i < axis_end + int axis_end, + dpctl::tensor::usm_ndarray rhs, + sycl::queue exec_q, + std::vector const &depends) +{ + int dst_nd = dst.get_ndim(); + if ((axis_start < 0 || axis_end > dst_nd || axis_start >= axis_end)) { + throw py::value_error("Specified axes_start and axes_end are invalid."); + } + int mask_span_sz = axis_end - axis_start; + + int rhs_nd = rhs.get_ndim(); + if (dst_nd != rhs_nd + (mask_span_sz - 1)) { + throw py::value_error("Number of dimensions of source and destination " + "arrays is not consistent"); + } + + if (!cumsum.is_c_contiguous() || cumsum.get_ndim() != 1) { + throw py::value_error("cumsum array must be a C-contiguous vector"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst, cumsum, rhs})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + py::ssize_t cumsum_sz = cumsum.get_size(); + + const py::ssize_t *dst_shape = dst.get_shape_raw(); + const py::ssize_t *rhs_shape = rhs.get_shape_raw(); + bool same_ortho_dims(true); + size_t ortho_nelems(1); // number of orthogonal iterations + + for (auto i = 0; i < axis_start; ++i) { + auto dst_sh_i = dst_shape[i]; + ortho_nelems *= dst_sh_i; + same_ortho_dims = same_ortho_dims && (dst_sh_i == rhs_shape[i]); + } + for (auto i = axis_end; i < dst_nd; ++i) { + auto dst_sh_i = dst_shape[i]; + ortho_nelems *= dst_sh_i; + same_ortho_dims = + same_ortho_dims && (dst_sh_i == rhs_shape[i - (mask_span_sz - 1)]); + } + + size_t masked_dst_nelems(1); + for (auto i = axis_start; i < axis_end; ++i) { + masked_dst_nelems *= dst_shape[i]; + } + + if (!same_ortho_dims || + (masked_dst_nelems != static_cast(cumsum_sz))) { + throw py::value_error("Inconsistent array dimensions"); + } + + // ensure that dst is sufficiently ample + auto dst_offsets = dst.get_minmax_offsets(); + // destination must be ample 
enough to accomodate all elements + { + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < static_cast(ortho_nelems * masked_dst_nelems)) { + throw py::value_error( + "Memory addressed by the destination array can not " + "accomodate all the " + "array elements."); + } + } + + // check that dst does not intersect with src, not with cumsum. + if (overlap(dst, rhs) || overlap(dst, cumsum)) { + throw py::value_error("Destination array overlaps with inputs"); + } + + int dst_typenum = dst.get_typenum(); + int rhs_typenum = rhs.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + auto const &array_types = dpctl::tensor::detail::usm_ndarray_types(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + int rhs_typeid = array_types.typenum_to_lookup_id(rhs_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + constexpr int int64_typeid = + static_cast(dpctl::tensor::detail::typenum_t::INT64); + if (cumsum_typeid != int64_typeid) { + throw py::value_error( + "Unexact data type of cumsum array, expecting 'int64'"); + } + + // FIXME: should types be the same? + if (dst_typeid != rhs_typeid) { + throw py::value_error( + "Destination array must have the same elemental data types"); + } + + char *dst_data_p = dst.get_data(); + char *rhs_data_p = rhs.get_data(); + char *cumsum_data_p = cumsum.get_data(); + + auto dst_shape_vec = dst.get_shape_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + auto rhs_shape_vec = rhs.get_shape_vector(); + auto rhs_strides_vec = rhs.get_strides_vector(); + + sycl::event extract_ev; + std::vector host_task_events{}; + if (axis_start == 0 && axis_end == dst_nd) { + // empty orthogonal directions + auto fn = + masked_place_all_slices_strided_impl_dispatch_vector[dst_typeid]; + + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, dst_shape_vec, dst_strides_vec); + py::ssize_t *packed_dst_shape_strides = + std::get<0>(ptr_size_event_tuple1); + sycl::event copy_dst_shape_strides_ev = + std::get<2>(ptr_size_event_tuple1); + + assert(rhs_shape_vec.size() == 1); + assert(rhs_strides_vec.size() == 1); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_dst_shape_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + extract_ev = fn(exec_q, cumsum_sz, dst_data_p, cumsum_data_p, + rhs_data_p, dst_nd, packed_dst_shape_strides, + rhs_shape_vec[0], rhs_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(extract_ev); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, packed_dst_shape_strides] { + sycl::free(packed_dst_shape_strides, ctx); + }); + }); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + else { + // non-empty othogonal directions + auto fn = + masked_place_some_slices_strided_impl_dispatch_vector[dst_typeid]; + + int masked_dst_nd = mask_span_sz; + int ortho_nd = dst_nd - masked_dst_nd; + + using shT = std::vector; + + shT ortho_dst_shape; + shT masked_dst_shape; + shT ortho_dst_strides; + shT masked_dst_strides; + _split_iteration_space(dst_shape_vec, dst_strides_vec, axis_start, + axis_end, ortho_dst_shape, + masked_dst_shape, // 4 vectors modified + ortho_dst_strides, masked_dst_strides); + + shT ortho_rhs_shape; + shT masked_rhs_shape; + shT ortho_rhs_strides; + shT masked_rhs_strides; + 
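+        // rhs contributes a single masked dimension (axis_start); split it
+        // the same way into orthogonal and masked parts.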
_split_iteration_space(rhs_shape_vec, rhs_strides_vec, axis_start, + axis_start + 1, ortho_rhs_shape, + masked_rhs_shape, // 4 vectors modified + ortho_rhs_strides, masked_rhs_strides); + + assert(ortho_dst_shape.size() == static_cast(ortho_nd)); + assert(ortho_rhs_shape.size() == static_cast(ortho_nd)); + assert(std::equal(ortho_dst_shape.begin(), ortho_dst_shape.end(), + ortho_rhs_shape.begin())); + + std::vector simplified_ortho_shape; + std::vector simplified_ortho_dst_strides; + std::vector simplified_ortho_rhs_strides; + + const py::ssize_t *_shape = ortho_dst_shape.data(); + const py::ssize_t *_dst_strides = ortho_dst_strides.data(); + const py::ssize_t *_rhs_strides = ortho_rhs_strides.data(); + constexpr py::ssize_t _itemsize = 1; // in elements + + constexpr bool is_c_contig = false; + constexpr bool is_f_contig = false; + + py::ssize_t ortho_dst_offset(0); + py::ssize_t ortho_rhs_offset(0); + + dpctl::tensor::py_internal::simplify_iteration_space( + ortho_nd, _shape, _dst_strides, _itemsize, is_c_contig, is_f_contig, + _rhs_strides, _itemsize, is_c_contig, is_f_contig, + simplified_ortho_shape, simplified_ortho_dst_strides, + simplified_ortho_rhs_strides, ortho_dst_offset, ortho_rhs_offset); + + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, simplified_ortho_shape, + simplified_ortho_dst_strides, simplified_ortho_rhs_strides); + py::ssize_t *packed_ortho_dst_rhs_shape_strides = + std::get<0>(ptr_size_event_tuple1); + sycl::event copy_shape_strides_ev1 = std::get<2>(ptr_size_event_tuple1); + + auto ptr_size_event_tuple2 = device_allocate_and_pack( + exec_q, host_task_events, masked_dst_shape, masked_dst_strides); + py::ssize_t *packed_masked_dst_shape_strides = + std::get<0>(ptr_size_event_tuple2); + sycl::event copy_shape_strides_ev2 = std::get<2>(ptr_size_event_tuple2); + + assert(masked_rhs_shape.size() == 1); + assert(masked_rhs_strides.size() == 1); + + std::vector all_deps; + all_deps.reserve(depends.size() + 2); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shape_strides_ev1); + all_deps.push_back(copy_shape_strides_ev2); + + assert(all_deps.size() == depends.size() + 2); + + extract_ev = fn(exec_q, ortho_nelems, masked_dst_nelems, dst_data_p, + cumsum_data_p, rhs_data_p, + // data to build orthog_dst_rhs_indexer + ortho_nd, packed_ortho_dst_rhs_shape_strides, + ortho_dst_offset, ortho_rhs_offset, + // data to build masked_dst_indexer + masked_dst_nd, packed_masked_dst_shape_strides, + // data to build masked_dst_indexer, + masked_rhs_shape[0], masked_rhs_strides[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(extract_ev); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, packed_ortho_dst_rhs_shape_strides, + packed_masked_dst_shape_strides] { + sycl::free(packed_ortho_dst_rhs_shape_strides, ctx); + sycl::free(packed_masked_dst_shape_strides, ctx); + }); + }); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + + host_task_events.push_back(extract_ev); + + sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive( + exec_q, {dst, cumsum, rhs}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, extract_ev); +} + +// Non-zero + +std::pair py_nonzero( + dpctl::tensor::usm_ndarray cumsum, // int64 input array, 1D, C-contiguous + dpctl::tensor::usm_ndarray indexes, // int64 2D output array, C-contiguous + std::vector + mask_shape, // shape of array from which cumsum 
was computed
+    sycl::queue exec_q,
+    std::vector<sycl::event> const &depends)
+{
+    if (!dpctl::utils::queues_are_compatible(exec_q, {cumsum, indexes})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    int cumsum_nd = cumsum.get_ndim();
+    if (cumsum_nd != 1 || !cumsum.is_c_contiguous()) {
+        throw py::value_error("Cumsum array must be a C-contiguous vector");
+    }
+
+    int indexes_nd = indexes.get_ndim();
+    if (indexes_nd != 2 || !indexes.is_c_contiguous()) {
+        throw py::value_error("Index array must be a C-contiguous matrix");
+    }
+
+    size_t _ndim = mask_shape.size();
+    if (_ndim > std::numeric_limits<int>::max()) {
+        throw py::value_error("Shape is too large");
+    }
+    int ndim = static_cast<int>(_ndim);
+
+    const py::ssize_t *indexes_shape = indexes.get_shape_raw();
+
+    if (ndim != indexes_shape[0]) {
+        throw py::value_error(
+            "Length of shape must equal width of index matrix");
+    }
+
+    auto cumsum_sz = cumsum.get_size();
+    py::ssize_t shape_nelems =
+        std::accumulate(mask_shape.begin(), mask_shape.end(), py::ssize_t(1),
+                        std::multiplies<py::ssize_t>());
+
+    if (cumsum_sz != shape_nelems) {
+        throw py::value_error("Shape and cumsum size are not consistent");
+    }
+
+    py::ssize_t nz_elems = indexes_shape[1];
+
+    int indexes_typenum = indexes.get_typenum();
+    auto const &array_types = dpctl::tensor::detail::usm_ndarray_types();
+    int indexes_typeid = array_types.typenum_to_lookup_id(indexes_typenum);
+
+    int cumsum_typenum = cumsum.get_typenum();
+    int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum);
+
+    // cumsum must be int64_t only
+    constexpr int int64_typeid =
+        static_cast<int>(dpctl::tensor::detail::typenum_t::INT64);
+    if (cumsum_typeid != int64_typeid || indexes_typeid != int64_typeid) {
+        throw py::value_error(
+            "Cumulative sum array and index array must have int64 data-type");
+    }
+
+    if (cumsum_sz == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    if (overlap(cumsum, indexes)) {
+        throw py::value_error("Arrays are expected to have no memory overlap");
+    }
+
+    // ensure that dst is sufficiently ample
+    auto indexes_offsets = indexes.get_minmax_offsets();
+    // destination must be ample enough to accommodate all elements
+    {
+        size_t range =
+            static_cast<size_t>(indexes_offsets.second - indexes_offsets.first);
+        if (range + 1 < static_cast<size_t>(nz_elems * _ndim)) {
+            throw py::value_error(
+                "Memory addressed by the destination array can not "
+                "accommodate all the array elements.");
+        }
+    }
+
+    std::vector<sycl::event> host_task_events;
+    host_task_events.reserve(2);
+
+    auto mask_shape_copying_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events, mask_shape);
+    py::ssize_t *src_shape_device_ptr = std::get<0>(mask_shape_copying_tuple);
+    sycl::event copy_ev = std::get<2>(mask_shape_copying_tuple);
+
+    if (src_shape_device_ptr == nullptr) {
+        sycl::event::wait(host_task_events);
+        throw std::runtime_error("Device allocation failed");
+    }
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+
+    all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+    all_deps.push_back(copy_ev);
+
+    using dpctl::tensor::kernels::indexing::non_zero_indexes_impl;
+
+    sycl::event non_zero_indexes_ev =
+        non_zero_indexes_impl<std::int64_t, std::int64_t>(
+            exec_q, cumsum_sz, nz_elems, ndim, cumsum.get_data(),
+            indexes.get_data(), src_shape_device_ptr, all_deps);
+
+    sycl::event temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(non_zero_indexes_ev);
+        auto ctx = exec_q.get_context();
+        cgh.host_task([ctx, src_shape_device_ptr] {
sycl::free(src_shape_device_ptr, ctx); + }); + }); + host_task_events.push_back(temporaries_cleanup_ev); + + sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive( + exec_q, {cumsum, indexes}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, + temporaries_cleanup_ev); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.hpp new file mode 100644 index 0000000000..f165fe5118 --- /dev/null +++ b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.hpp @@ -0,0 +1,84 @@ +//===-- boolean_advanced_indexing.hpp - --*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file declares Python API for implementation functions of +/// dpctl.tensor.place, dpctl.tensor.extract, and dpctl.tensor.nonzero +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void populate_mask_positions_dispatch_vectors(void); + +extern size_t py_mask_positions(dpctl::tensor::usm_ndarray mask, + dpctl::tensor::usm_ndarray cumsum, + sycl::queue exec_q, + std::vector const &depends = {}); + +extern std::pair +py_extract(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray cumsum, + int axis_start, // axis_start <= mask_i < axis_end + int axis_end, + dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + std::vector const &depends = {}); + +extern void populate_masked_extract_dispatch_vectors(void); + +extern std::pair +py_place(dpctl::tensor::usm_ndarray dst, + dpctl::tensor::usm_ndarray cumsum, + int axis_start, // axis_start <= mask_i < axis_end + int axis_end, + dpctl::tensor::usm_ndarray rhs, + sycl::queue exec_q, + std::vector const &depends = {}); + +extern void populate_masked_place_dispatch_vectors(void); + +extern std::pair py_nonzero( + dpctl::tensor::usm_ndarray cumsum, // int64 input array, 1D, C-contiguous + dpctl::tensor::usm_ndarray indexes, // int64 2D output array, C-contiguous + std::vector + mask_shape, // shape of array from which cumsum was computed + sycl::queue exec_q, + std::vector const &depends = {}); + +/* @brief Check if memory regions underlying two arrays have an overlap */ +extern bool overlap(dpctl::tensor::usm_ndarray ar1, + dpctl::tensor::usm_ndarray ar2); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp b/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp index be4a35fb90..7eb7c8f8d6 100644 --- a/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp +++ 
b/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp @@ -39,6 +39,86 @@ namespace py = pybind11; using dpctl::tensor::c_contiguous_strides; using dpctl::tensor::f_contiguous_strides; +void simplify_iteration_space_1(int &nd, + const py::ssize_t *&shape, + const py::ssize_t *&strides, + py::ssize_t itemsize, + bool is_c_contig, + bool is_f_contig, + std::vector &simplified_shape, + std::vector &simplified_strides, + py::ssize_t &offset) +{ + if (nd > 1) { + // Simplify iteration space to reduce dimensionality + // and improve access pattern + simplified_shape.reserve(nd); + for (int i = 0; i < nd; ++i) { + simplified_shape.push_back(shape[i]); + } + + simplified_strides.reserve(nd); + if (strides == nullptr) { + if (is_c_contig) { + simplified_strides = c_contiguous_strides(nd, shape, itemsize); + } + else if (is_f_contig) { + simplified_strides = f_contiguous_strides(nd, shape, itemsize); + } + else { + throw std::runtime_error( + "Array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + for (int i = 0; i < nd; ++i) { + simplified_strides.push_back(strides[i]); + } + } + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_strides.size() == static_cast(nd)); + int contracted_nd = simplify_iteration_stride( + nd, simplified_shape.data(), simplified_strides.data(), + offset // modified by reference + ); + simplified_shape.resize(contracted_nd); + simplified_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + // Populate vectors + simplified_shape.reserve(nd); + simplified_shape.push_back(shape[0]); + + simplified_strides.reserve(nd); + + if (strides == nullptr) { + if (is_c_contig) { + simplified_strides.push_back(itemsize); + } + else if (is_f_contig) { + simplified_strides.push_back(itemsize); + } + else { + throw std::runtime_error( + "Array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + simplified_strides.push_back(strides[0]); + } + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_strides.size() == static_cast(nd)); + } + shape = const_cast(simplified_shape.data()); + strides = const_cast(simplified_strides.data()); +} + void simplify_iteration_space(int &nd, const py::ssize_t *&shape, const py::ssize_t *&src_strides, @@ -173,6 +253,195 @@ void simplify_iteration_space(int &nd, const_cast(simplified_dst_strides.data()); } +void simplify_iteration_space_3( + int &nd, + const py::ssize_t *&shape, + // src1 + const py::ssize_t *&src1_strides, + py::ssize_t src1_itemsize, + bool is_src1_c_contig, + bool is_src1_f_contig, + // src2 + const py::ssize_t *&src2_strides, + py::ssize_t src2_itemsize, + bool is_src2_c_contig, + bool is_src2_f_contig, + // dst + const py::ssize_t *&dst_strides, + py::ssize_t dst_itemsize, + bool is_dst_c_contig, + bool is_dst_f_contig, + // output + std::vector &simplified_shape, + std::vector &simplified_src1_strides, + std::vector &simplified_src2_strides, + std::vector &simplified_dst_strides, + py::ssize_t &src1_offset, + py::ssize_t &src2_offset, + py::ssize_t &dst_offset) +{ + if (nd > 1) { + // Simplify iteration space to reduce dimensionality + // and improve access pattern + simplified_shape.reserve(nd); + for (int i = 0; i < nd; ++i) { + simplified_shape.push_back(shape[i]); + } + + simplified_src1_strides.reserve(nd); + simplified_src2_strides.reserve(nd); + simplified_dst_strides.reserve(nd); + if (src1_strides == nullptr) { + if (is_src1_c_contig) { + simplified_src1_strides = + 
c_contiguous_strides(nd, shape, src1_itemsize); + } + else if (is_src1_f_contig) { + simplified_src1_strides = + f_contiguous_strides(nd, shape, src1_itemsize); + } + else { + throw std::runtime_error( + "Source array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + for (int i = 0; i < nd; ++i) { + simplified_src1_strides.push_back(src1_strides[i]); + } + } + if (src2_strides == nullptr) { + if (is_src2_c_contig) { + simplified_src2_strides = + c_contiguous_strides(nd, shape, src2_itemsize); + } + else if (is_src2_f_contig) { + simplified_src2_strides = + f_contiguous_strides(nd, shape, src2_itemsize); + } + else { + throw std::runtime_error( + "Source array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + for (int i = 0; i < nd; ++i) { + simplified_src2_strides.push_back(src2_strides[i]); + } + } + if (dst_strides == nullptr) { + if (is_dst_c_contig) { + simplified_dst_strides = + c_contiguous_strides(nd, shape, dst_itemsize); + } + else if (is_dst_f_contig) { + simplified_dst_strides = + f_contiguous_strides(nd, shape, dst_itemsize); + } + else { + throw std::runtime_error( + "Destination array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + for (int i = 0; i < nd; ++i) { + simplified_dst_strides.push_back(dst_strides[i]); + } + } + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_src1_strides.size() == static_cast(nd)); + assert(simplified_src2_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + int contracted_nd = simplify_iteration_three_strides( + nd, simplified_shape.data(), simplified_src1_strides.data(), + simplified_src2_strides.data(), simplified_dst_strides.data(), + src1_offset, // modified by reference + src2_offset, // modified by reference + dst_offset // modified by reference + ); + simplified_shape.resize(contracted_nd); + simplified_src1_strides.resize(contracted_nd); + simplified_src2_strides.resize(contracted_nd); + simplified_dst_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + // Populate vectors + simplified_shape.reserve(nd); + simplified_shape.push_back(shape[0]); + + simplified_src1_strides.reserve(nd); + simplified_src2_strides.reserve(nd); + simplified_dst_strides.reserve(nd); + + if (src1_strides == nullptr) { + if (is_src1_c_contig) { + simplified_src1_strides.push_back(src1_itemsize); + } + else if (is_src1_f_contig) { + simplified_src1_strides.push_back(src1_itemsize); + } + else { + throw std::runtime_error( + "Source array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + simplified_src1_strides.push_back(src1_strides[0]); + } + if (src2_strides == nullptr) { + if (is_src2_c_contig) { + simplified_src2_strides.push_back(src2_itemsize); + } + else if (is_src2_f_contig) { + simplified_src2_strides.push_back(src2_itemsize); + } + else { + throw std::runtime_error( + "Source array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + simplified_src2_strides.push_back(src2_strides[0]); + } + if (dst_strides == nullptr) { + if (is_dst_c_contig) { + simplified_dst_strides.push_back(dst_itemsize); + } + else if (is_dst_f_contig) { + simplified_dst_strides.push_back(dst_itemsize); + } + else { + throw std::runtime_error( + "Destination array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + 
simplified_dst_strides.push_back(dst_strides[0]); + } + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_src1_strides.size() == static_cast(nd)); + assert(simplified_src2_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + } + shape = const_cast(simplified_shape.data()); + src1_strides = + const_cast(simplified_src1_strides.data()); + src2_strides = + const_cast(simplified_src2_strides.data()); + dst_strides = + const_cast(simplified_dst_strides.data()); +} + } // namespace py_internal } // namespace tensor } // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp b/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp index 515e795d20..ec0cc286d4 100644 --- a/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp +++ b/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp @@ -36,6 +36,16 @@ namespace py_internal namespace py = pybind11; +void simplify_iteration_space_1(int &, + const py::ssize_t *&, + const py::ssize_t *&, + py::ssize_t, + bool, + bool, + std::vector &, + std::vector &, + py::ssize_t &); + void simplify_iteration_space(int &, const py::ssize_t *&, const py::ssize_t *&, @@ -52,6 +62,32 @@ void simplify_iteration_space(int &, py::ssize_t &, py::ssize_t &); +void simplify_iteration_space_3(int &, + const py::ssize_t *&, + // src1 + const py::ssize_t *&, + py::ssize_t, + bool, + bool, + // src2 + const py::ssize_t *&, + py::ssize_t, + bool, + bool, + // dst + const py::ssize_t *&, + py::ssize_t, + bool, + bool, + // output + std::vector &, + std::vector &, + std::vector &, + std::vector &, + py::ssize_t &, + py::ssize_t &, + py::ssize_t &); + } // namespace py_internal } // namespace tensor } // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp index e164be2421..2e9e981a37 100644 --- a/dpctl/tensor/libtensor/source/tensor_py.cpp +++ b/dpctl/tensor/libtensor/source/tensor_py.cpp @@ -33,6 +33,7 @@ #include "dpctl4pybind11.hpp" +#include "boolean_advanced_indexing.hpp" #include "copy_and_cast_usm_to_usm.hpp" #include "copy_for_reshape.hpp" #include "copy_numpy_ndarray_into_usm_ndarray.hpp" @@ -75,6 +76,12 @@ using dpctl::tensor::py_internal::usm_ndarray_full; using dpctl::tensor::py_internal::usm_ndarray_put; using dpctl::tensor::py_internal::usm_ndarray_take; +using dpctl::tensor::py_internal::overlap; +using dpctl::tensor::py_internal::py_extract; +using dpctl::tensor::py_internal::py_mask_positions; +using dpctl::tensor::py_internal::py_nonzero; +using dpctl::tensor::py_internal::py_place; + /* ================ Eye ================== */ using dpctl::tensor::py_internal::usm_ndarray_eye; @@ -105,6 +112,10 @@ void init_dispatch_vectors(void) init_eye_ctor_dispatch_vectors(); init_triul_ctor_dispatch_vectors(); + populate_mask_positions_dispatch_vectors(); + populate_masked_extract_dispatch_vectors(); + populate_masked_place_dispatch_vectors(); + return; } @@ -252,4 +263,24 @@ PYBIND11_MODULE(_tensor_impl, m) m.def("_triu", triu_fn, "Triu helper function.", py::arg("src"), py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), py::arg("depends") = py::list()); + + m.def("mask_positions", &py_mask_positions, "", py::arg("mask"), + py::arg("cumsum"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + m.def("_extract", &py_extract, "", py::arg("src"), py::arg("cumsum"), + py::arg("axis_start"), py::arg("axis_end"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = 
py::list()); + + m.def("_array_overlap", &overlap, + "Determines if the memory regions indexed by each array overlap", + py::arg("array1"), py::arg("array2")); + + m.def("_place", &py_place, "", py::arg("dst"), py::arg("cumsum"), + py::arg("axis_start"), py::arg("axis_end"), py::arg("rhs"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + m.def("_nonzero", &py_nonzero, "", py::arg("cumsum"), py::arg("indexes"), + py::arg("mask_shape"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); } From c1f00812b35becd29c8af54df2e049dd7033f136 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Mar 2023 10:31:52 -0600 Subject: [PATCH 2/8] Added missing include --- .../libtensor/include/kernels/boolean_advanced_indexing.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp index b42b7869d2..71313e9a27 100644 --- a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp @@ -32,6 +32,7 @@ #include #include "utils/strided_iters.hpp" +#include "utils/type_dispatch.hpp" namespace dpctl { From 849a3ea2025277c7128b4e72145b4308be68153b Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Mar 2023 11:18:44 -0600 Subject: [PATCH 3/8] Hooked up boolean indexing, first attempt --- dpctl/tensor/__init__.py | 5 +- dpctl/tensor/_copy_utils.py | 147 +++++++++++++++++++--------- dpctl/tensor/_indexing_functions.py | 118 ++++++++++++++++++++++ dpctl/tensor/_usmarray.pyx | 14 +-- 4 files changed, 229 insertions(+), 55 deletions(-) diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index d21958b4fa..2a2afd60a4 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -58,7 +58,7 @@ ) from dpctl.tensor._device import Device from dpctl.tensor._dlpack import from_dlpack -from dpctl.tensor._indexing_functions import put, take +from dpctl.tensor._indexing_functions import extract, nonzero, place, put, take from dpctl.tensor._manipulation_functions import ( broadcast_arrays, broadcast_to, @@ -115,6 +115,9 @@ "squeeze", "take", "put", + "extract", + "place", + "nonzero", "from_numpy", "to_numpy", "asnumpy", diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py index 597db87c49..72b9e0a021 100644 --- a/dpctl/tensor/_copy_utils.py +++ b/dpctl/tensor/_copy_utils.py @@ -389,45 +389,75 @@ def astype(usm_ary, newdtype, order="K", casting="unsafe", copy=True): return R -def _mock_extract(ary, ary_mask, p): - exec_q = dpctl.utils.get_execution_queue( - ( - ary.sycl_queue, - ary_mask.sycl_queue, +def _extract_impl(ary, ary_mask, axis=0): + """Extract elements of ary by applying mask starting from slot + dimension axis""" + if not isinstance(ary, dpt.usm_ndarray): + raise TypeError( + f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}" + ) + if not isinstance(ary_mask, dpt.usm_ndarray): + raise TypeError( + f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary_mask)}" ) + exec_q = dpctl.utils.get_execution_queue( + (ary.sycl_queue, ary_mask.sycl_queue) ) if exec_q is None: raise dpctl.utils.ExecutionPlacementError( - "Can not automatically determine where to allocate the " - "result or performance execution. " - "Use `usm_ndarray.to_device` method to migrate data to " - "be associated with the same queue." + "arrays have different associated queues. " + "Use `Y.to_device(X.device)` to migrate." 
)
-
-    res_usm_type = dpctl.utils.get_coerced_usm_type(
-        (
-            ary.usm_type,
-            ary_mask.usm_type,
        )
+    ary_nd = ary.ndim
+    pp = normalize_axis_index(operator.index(axis), ary_nd)
+    mask_nd = ary_mask.ndim
+    if pp < 0 or pp + mask_nd > ary_nd:
+        raise ValueError(
+            "Parameter axis is inconsistent with input array dimensions"
        )
+    mask_nelems = ary_mask.size
+    cumsum = dpt.empty(mask_nelems, dtype=dpt.int64, device=ary_mask.device)
+    exec_q = cumsum.sycl_queue
+    mask_count = ti.mask_positions(ary_mask, cumsum, sycl_queue=exec_q)
+    dst_shape = ary.shape[:pp] + (mask_count,) + ary.shape[pp + mask_nd :]
+    dst = dpt.empty(
+        dst_shape, dtype=ary.dtype, usm_type=ary.usm_type, device=ary.device
    )
-    ary_np = dpt.asnumpy(ary)
-    mask_np = dpt.asnumpy(ary_mask)
-    res_np = ary_np[(slice(None),) * p + (mask_np,)]
-    res = dpt.empty(
-        res_np.shape, dtype=ary.dtype, usm_type=res_usm_type, sycl_queue=exec_q
+    hev, _ = ti._extract(
+        src=ary,
+        cumsum=cumsum,
+        axis_start=pp,
+        axis_end=pp + mask_nd,
+        dst=dst,
+        sycl_queue=exec_q,
    )
-    res[...] = res_np
-    return res
+    hev.wait()
+    return dst


-def _mock_nonzero(ary):
+def _nonzero_impl(ary):
     if not isinstance(ary, dpt.usm_ndarray):
-        raise TypeError
-    q = ary.sycl_queue
+        raise TypeError(
+            f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}"
+        )
+    exec_q = ary.sycl_queue
     usm_type = ary.usm_type
-    ary_np = dpt.asnumpy(ary)
-    nz = ary_np.nonzero()
-    return tuple(dpt.asarray(i, usm_type=usm_type, sycl_queue=q) for i in nz)
+    mask_nelems = ary.size
+    cumsum = dpt.empty(
+        mask_nelems, dtype=dpt.int64, sycl_queue=exec_q, order="C"
+    )
+    mask_count = ti.mask_positions(ary, cumsum, sycl_queue=exec_q)
+    indexes = dpt.empty(
+        (ary.ndim, mask_count),
+        dtype=cumsum.dtype,
+        usm_type=usm_type,
+        sycl_queue=exec_q,
+        order="C",
+    )
+    hev, _ = ti._nonzero(cumsum, indexes, ary.shape, exec_q)
+    res = tuple(indexes[i, :] for i in range(ary.ndim))
+    hev.wait()
+    return res


 def _take_multi_index(ary, inds, p):
@@ -473,34 +503,57 @@ def _take_multi_index(ary, inds, p):
     return res


-def _mock_place(ary, ary_mask, p, vals):
+def _place_impl(ary, ary_mask, vals, axis=0):
+    """Place values from vals into ary at positions where mask is True,
+    with the mask applied starting from dimension axis"""
     if not isinstance(ary, dpt.usm_ndarray):
-        raise TypeError
+        raise TypeError(
+            f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}"
+        )
     if not isinstance(ary_mask, dpt.usm_ndarray):
-        raise TypeError
+        raise TypeError(
+            f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary_mask)}"
+        )
+    if not isinstance(vals, dpt.usm_ndarray):
+        raise TypeError(
+            f"Expecting type dpctl.tensor.usm_ndarray, got {type(vals)}"
+        )
     exec_q = dpctl.utils.get_execution_queue(
-        (ary.sycl_queue, ary_mask.sycl_queue)
+        (ary.sycl_queue, ary_mask.sycl_queue, vals.sycl_queue)
    )
-    if exec_q is not None and isinstance(vals, dpt.usm_ndarray):
-        exec_q = dpctl.utils.get_execution_queue((exec_q, vals.sycl_queue))
     if exec_q is None:
         raise dpctl.utils.ExecutionPlacementError(
-            "Can not automatically determine where to allocate the "
-            "result or performance execution. "
-            "Use `usm_ndarray.to_device` method to migrate data to "
-            "be associated with the same queue."
+            "arrays have different associated queues. "
+            "Use `Y.to_device(X.device)` to migrate." 
) - - ary_np = dpt.asnumpy(ary) - mask_np = dpt.asnumpy(ary_mask) - if isinstance(vals, dpt.usm_ndarray) or hasattr( - vals, "__sycl_usm_array_interface__" - ): - vals_np = dpt.asnumpy(vals) + ary_nd = ary.ndim + pp = normalize_axis_index(operator.index(axis), ary_nd) + mask_nd = ary_mask.ndim + if pp < 0 or pp + mask_nd > ary_nd: + raise ValueError( + "Parameter p is inconsistent with input array dimensions" + ) + mask_nelems = ary_mask.size + cumsum = dpt.empty(mask_nelems, dtype=dpt.int64, device=ary_mask.device) + exec_q = cumsum.sycl_queue + mask_count = ti.mask_positions(ary_mask, cumsum, sycl_queue=exec_q) + expected_vals_shape = ( + ary.shape[:pp] + (mask_count,) + ary.shape[pp + mask_nd :] + ) + if vals.dtype == ary.dtype: + rhs = vals else: - vals_np = vals - ary_np[(slice(None),) * p + (mask_np,)] = vals_np - ary[...] = ary_np + rhs = dpt.astype(vals, ary.dtype) + rhs = dpt.broadcast_to(rhs, expected_vals_shape) + hev, _ = ti._place( + dst=ary, + cumsum=cumsum, + axis_start=pp, + axis_end=pp + mask_nd, + rhs=rhs, + sycl_queue=exec_q, + ) + hev.wait() return diff --git a/dpctl/tensor/_indexing_functions.py b/dpctl/tensor/_indexing_functions.py index 12f7b2d72e..01f1a2370a 100644 --- a/dpctl/tensor/_indexing_functions.py +++ b/dpctl/tensor/_indexing_functions.py @@ -23,6 +23,8 @@ import dpctl.tensor as dpt from dpctl.tensor._tensor_impl import _put, _take +from ._copy_utils import _extract_impl, _nonzero_impl, _place_impl + def take(x, indices, /, *, axis=None, mode="clip"): if not isinstance(x, dpt.usm_ndarray): @@ -175,3 +177,119 @@ def put(x, indices, vals, /, *, axis=None, mode="clip"): hev, _ = _put(x, indices, vals, axis, mode, sycl_queue=exec_q) hev.wait() + + +def extract(condition, arr): + """extract(condition, arr) + + Returns the elements of an array that satisfies the condition. + + If `condition` is boolean :func:``dpctl.tensor.extract`` is + equivalent to ``arr[condition]``. + + Note that :func:``dpctl.tensor.place`` does the opposite of + :func:``dpctl.tensor.extract``. + + Args: + conditions: usm_ndarray + An array whose non-zero or True entries indicate the element + of `arr` to extract. + arr: usm_ndarray + Input array of the same size as `condition`. + + Returns: + extract: usm_ndarray + Rank 1 array of values from `arr` where `condition` is True. + """ + if not isinstance(condition, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(condition)}" + ) + if not isinstance(arr, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(arr)}" + ) + exec_q = dpctl.utils.get_execution_queue( + ( + condition.sycl_queue, + arr.sycl_queue, + ) + ) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError + if condition.shape != arr.shape: + raise ValueError("Arrays are not of the same size") + return _extract_impl(arr, condition) + + +def place(arr, mask, vals): + """place(arr, mask, vals) + + Change elements of an array based on conditional and input values. + + If `mask` is boolean :func:``dpctl.tensor.place`` is + equivalent to ``arr[condition] = vals``. + + Args: + arr: usm_ndarray + Array to put data into. + mask: usm_ndarray + Boolean mask array. Must have the same size as `arr`. + vals: usm_ndarray + Values to put into `arr`. Only the first N elements are + used, where N is the number of True values in `mask`. If + `vals` is smaller than N, it will be repeated, and if + elements of `arr` are to be masked, this sequence must be + non-empty. Array `vals` must be one dimensional. 
+ """ + if not isinstance(arr, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(arr)}" + ) + if not isinstance(mask, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(mask)}" + ) + if not isinstance(vals, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(vals)}" + ) + exec_q = dpctl.utils.get_execution_queue( + ( + arr.sycl_queue, + mask.sycl_queue, + vals.sycl_queue, + ) + ) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError + if arr.shape != mask.shape or vals.ndim != 1: + raise ValueError("Array sizes are not as required") + # FIXME + _place_impl(arr, mask, vals, axis=0) + + +def nonzero(arr): + """nonzero(arr) + + Return the indices of non-zero elements. + + Returns the tuple of usm_narrays, one for each dimension + of `arr`, containing the indices of the non-zero elements + in that dimension. The values of `arr` are always tested in + row-major, C-style order. + + Args: + arr: usm_ndarray + Input array, which has non-zero array rank. + Returns: + tuple_of_usm_ndarrays: tuple + Indices of non-zero array elements. + """ + if not isinstance(arr, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(arr)}" + ) + if arr.ndim == 0: + raise ValueError("Array of positive rank is exepcted") + return _nonzero_impl(arr) diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx index 64a492065f..1abc1e88ac 100644 --- a/dpctl/tensor/_usmarray.pyx +++ b/dpctl/tensor/_usmarray.pyx @@ -670,15 +670,15 @@ cdef class usm_ndarray: if adv_ind_start_p < 0: return res - from ._copy_utils import _mock_extract, _mock_nonzero, _take_multi_index + from ._copy_utils import _extract_impl, _nonzero_impl, _take_multi_index if len(adv_ind) == 1 and adv_ind[0].dtype == dpt_bool: - return _mock_extract(res, adv_ind[0], adv_ind_start_p) + return _extract_impl(res, adv_ind[0], axis=adv_ind_start_p) if any(ind.dtype == dpt_bool for ind in adv_ind): adv_ind_int = list() for ind in adv_ind: if ind.dtype == dpt_bool: - adv_ind_int.extend(_mock_nonzero(ind)) + adv_ind_int.extend(_nonzero_impl(ind)) else: adv_ind_int.append(ind) return _take_multi_index(res, tuple(adv_ind_int), adv_ind_start_p) @@ -1015,8 +1015,8 @@ cdef class usm_ndarray: from ._copy_utils import ( _copy_from_numpy_into, _copy_from_usm_ndarray_to_usm_ndarray, - _mock_nonzero, - _mock_place, + _nonzero_impl, + _place_impl, _put_multi_index, ) @@ -1050,14 +1050,14 @@ cdef class usm_ndarray: return if len(adv_ind) == 1 and adv_ind[0].dtype == dpt_bool: - _mock_place(Xv, adv_ind[0], adv_ind_start_p, rhs) + _place_impl(Xv, adv_ind[0], rhs, axis=adv_ind_start_p) return if any(ind.dtype == dpt_bool for ind in adv_ind): adv_ind_int = list() for ind in adv_ind: if ind.dtype == dpt_bool: - adv_ind_int.extend(_mock_nonzero(ind)) + adv_ind_int.extend(_nonzero_impl(ind)) else: adv_ind_int.append(ind) _put_multi_index(Xv, tuple(adv_ind_int), adv_ind_start_p, rhs) From ed279d63b6f6b43ef5f3b4791696216536639ca1 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Mar 2023 11:43:28 -0600 Subject: [PATCH 4/8] Changes per clang-format 11 --- dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp index 1534b38391..9689612b8a 100644 --- 
a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp @@ -62,7 +62,7 @@ template sink_t inserter(V &lhs, U &&rhs) } template -std::vector concat(std::vector lhs, Vs &&... vs) +std::vector concat(std::vector lhs, Vs &&...vs) { std::size_t s = lhs.size(); { @@ -83,7 +83,7 @@ template std::tuple device_allocate_and_pack(sycl::queue q, std::vector &host_task_events, - Vs &&... vs) + Vs &&...vs) { // memory transfer optimization, use USM-host for temporary speeds up From 3ced89a095650fbc40688294b8e343d44857f495 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Mar 2023 12:42:07 -0600 Subject: [PATCH 5/8] Used Strided1DCyclingIndexer in place implementations This allows to implement behavior of place which cycles over values of val array if that is shorter than the number of non-zero elements in the mask. --- .../kernels/boolean_advanced_indexing.hpp | 53 +++++++++++-------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp index 71313e9a27..aa0a90ce70 100644 --- a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp @@ -1,4 +1,4 @@ -//=== boolean_advance_indexing.hpp - ---*-C++-*--/===// +//=== boolean_advance_indexing.hpp - ------*-C++-*--/===// // // Data Parallel Control (dpctl) // @@ -16,11 +16,11 @@ // See the License for the specific language governing permissions and // limitations under the License. // -//===----------------------------------------------------------------------===// +//===---------------------------------------------------------------------===// /// /// \file /// This file defines kernels for advanced tensor index operations. 
-//===----------------------------------------------------------------------===// +//===---------------------------------------------------------------------===// #pragma once #include @@ -114,6 +114,26 @@ struct Strided1DIndexer py::ssize_t step = 1; }; +struct Strided1DCyclicIndexer +{ + Strided1DCyclicIndexer(py::ssize_t _offset, + py::ssize_t _size, + py::ssize_t _step) + : offset(_offset), size(static_cast(_size)), step(_step) + { + } + + size_t operator()(size_t gid) const + { + return static_cast(offset + (gid % size) * step); + } + +private: + py::ssize_t offset = 0; + size_t size = 1; + py::ssize_t step = 1; +}; + template struct ZeroChecker { @@ -762,27 +782,22 @@ sycl::event masked_place_all_slices_strided_impl( py::ssize_t rhs_stride, const std::vector &depends = {}) { - // using MaskedPlaceStridedFunctor; - // using Strided1DIndexer; - // using StridedIndexer; - // using TwoZeroOffsets_Indexer; - TwoZeroOffsets_Indexer orthog_dst_rhs_indexer{}; /* StridedIndexer(int _nd, py::ssize_t _offset, py::ssize_t const * *_packed_shape_strides) */ StridedIndexer masked_dst_indexer(nd, 0, packed_dst_shape_strides); - Strided1DIndexer masked_rhs_indexer(0, rhs_size, rhs_stride); + Strided1DCyclicIndexer masked_rhs_indexer(0, rhs_size, rhs_stride); sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); cgh.parallel_for>( + TwoZeroOffsets_Indexer, StridedIndexer, Strided1DCyclicIndexer, + dataT, indT>>( sycl::range<1>(static_cast(iteration_size)), MaskedPlaceStridedFunctor( + Strided1DCyclicIndexer, dataT, indT>( dst_p, cumsum_p, rhs_p, 1, iteration_size, orthog_dst_rhs_indexer, masked_dst_indexer, masked_rhs_indexer)); @@ -838,11 +853,6 @@ sycl::event masked_place_some_slices_strided_impl( py::ssize_t masked_rhs_stride, const std::vector &depends = {}) { - // using MaskedPlaceStridedFunctor; - // using Strided1DIndexer; - // using StridedIndexer; - // using TwoOffsets_StridedIndexer; - TwoOffsets_StridedIndexer orthog_dst_rhs_indexer{ orthog_nd, ortho_dst_offset, ortho_rhs_offset, packed_ortho_dst_rhs_shape_strides}; @@ -851,17 +861,18 @@ sycl::event masked_place_some_slices_strided_impl( * *_packed_shape_strides) */ StridedIndexer masked_dst_indexer{masked_nd, 0, packed_masked_dst_shape_strides}; - Strided1DIndexer masked_rhs_indexer{0, masked_rhs_size, masked_rhs_stride}; + Strided1DCyclicIndexer masked_rhs_indexer{0, masked_rhs_size, + masked_rhs_stride}; sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); cgh.parallel_for>( + TwoOffsets_StridedIndexer, StridedIndexer, Strided1DCyclicIndexer, + dataT, indT>>( sycl::range<1>(static_cast(orthog_nelems * masked_nelems)), MaskedPlaceStridedFunctor( + Strided1DCyclicIndexer, dataT, indT>( dst_p, cumsum_p, rhs_p, orthog_nelems, masked_nelems, orthog_dst_rhs_indexer, masked_dst_indexer, masked_rhs_indexer)); From 19691ca89ff4ee3324c3402a19a4be669ee8e138 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Mar 2023 12:44:49 -0600 Subject: [PATCH 6/8] Implemented dpctl.tensor.place as per documented behavior. 
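
A minimal usage sketch of the documented behavior this commit implements
(illustrative only; it mirrors the new test_place_cycling test and assumes
dpctl.tensor is imported as dpt):

    import dpctl.tensor as dpt

    x = dpt.zeros(10, dtype="f4")
    sel = dpt.ones(x.size, dtype="?")             # every element selected
    dpt.place(x, sel, dpt.asarray([2, 3], dtype="f4"))
    # vals are cycled over the selected slots: x == [2, 3, 2, 3, ...]
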
--- dpctl/tensor/_indexing_functions.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/dpctl/tensor/_indexing_functions.py b/dpctl/tensor/_indexing_functions.py index 01f1a2370a..20c4a22786 100644 --- a/dpctl/tensor/_indexing_functions.py +++ b/dpctl/tensor/_indexing_functions.py @@ -21,9 +21,9 @@ import dpctl import dpctl.tensor as dpt -from dpctl.tensor._tensor_impl import _put, _take +import dpctl.tensor._tensor_impl as ti -from ._copy_utils import _extract_impl, _nonzero_impl, _place_impl +from ._copy_utils import _extract_impl, _nonzero_impl def take(x, indices, /, *, axis=None, mode="clip"): @@ -95,7 +95,7 @@ def take(x, indices, /, *, axis=None, mode="clip"): res_shape, dtype=x.dtype, usm_type=res_usm_type, sycl_queue=exec_q ) - hev, _ = _take(x, indices, res, axis, mode, sycl_queue=exec_q) + hev, _ = ti._take(x, indices, res, axis, mode, sycl_queue=exec_q) hev.wait() return res @@ -175,7 +175,7 @@ def put(x, indices, vals, /, *, axis=None, mode="clip"): vals = dpt.broadcast_to(vals, val_shape) - hev, _ = _put(x, indices, vals, axis, mode, sycl_queue=exec_q) + hev, _ = ti._put(x, indices, vals, axis, mode, sycl_queue=exec_q) hev.wait() @@ -265,8 +265,23 @@ def place(arr, mask, vals): raise dpctl.utils.ExecutionPlacementError if arr.shape != mask.shape or vals.ndim != 1: raise ValueError("Array sizes are not as required") - # FIXME - _place_impl(arr, mask, vals, axis=0) + cumsum = dpt.empty(mask.size, dtype="i8", sycl_queue=exec_q) + nz_count = ti.mask_positions(mask, cumsum, sycl_queue=exec_q) + if nz_count == 0: + return + if vals.dtype == arr.dtype: + rhs = vals + else: + rhs = dpt.astype(vals, arr.dtype) + hev, _ = ti._place( + dst=arr, + cumsum=cumsum, + axis_start=0, + axis_end=mask.ndim, + rhs=rhs, + sycl_queue=exec_q, + ) + hev.wait() def nonzero(arr): From f75723b143c67f109bcf27bdb8837cd30a7d81cb Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Mar 2023 14:47:59 -0600 Subject: [PATCH 7/8] Added tests to test_usm_ndarray_indexing --- dpctl/tests/test_usm_ndarray_indexing.py | 220 +++++++++++++++++++++++ 1 file changed, 220 insertions(+) diff --git a/dpctl/tests/test_usm_ndarray_indexing.py b/dpctl/tests/test_usm_ndarray_indexing.py index 98bb674b21..aec71def7d 100644 --- a/dpctl/tests/test_usm_ndarray_indexing.py +++ b/dpctl/tests/test_usm_ndarray_indexing.py @@ -970,3 +970,223 @@ def test_advanced_indexing_compute_follows_data(): dpt.put(x, ind0, val1, axis=0) with pytest.raises(ExecutionPlacementError): x[ind0] = val1 + + +####### + + +def test_extract_all_1d(): + x = dpt.arange(30, dtype="i4") + sel = dpt.ones(30, dtype="?") + sel[::2] = False + + res = x[sel] + expected_res = dpt.asnumpy(x)[dpt.asnumpy(sel)] + assert (dpt.asnumpy(res) == expected_res).all() + + res2 = dpt.extract(sel, x) + assert (dpt.asnumpy(res2) == expected_res).all() + + +def test_extract_all_2d(): + x = dpt.reshape(dpt.arange(30, dtype="i4"), (5, 6)) + sel = dpt.ones(30, dtype="?") + sel[::2] = False + sel = dpt.reshape(sel, x.shape) + + res = x[sel] + expected_res = dpt.asnumpy(x)[dpt.asnumpy(sel)] + assert (dpt.asnumpy(res) == expected_res).all() + + res2 = dpt.extract(sel, x) + assert (dpt.asnumpy(res2) == expected_res).all() + + +def test_extract_2D_axis0(): + x = dpt.reshape(dpt.arange(30, dtype="i4"), (5, 6)) + sel = dpt.ones(x.shape[0], dtype="?") + sel[::2] = False + + res = x[sel] + expected_res = dpt.asnumpy(x)[dpt.asnumpy(sel)] + assert (dpt.asnumpy(res) == expected_res).all() + + +def test_extract_2D_axis1(): + x = 
dpt.reshape(dpt.arange(30, dtype="i4"), (5, 6)) + sel = dpt.ones(x.shape[1], dtype="?") + sel[::2] = False + + res = x[:, sel] + expected = dpt.asnumpy(x)[:, dpt.asnumpy(sel)] + assert (dpt.asnumpy(res) == expected).all() + + +def test_extract_begin(): + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((3, 3), dtype="?") + sel[0, 0] = True + sel[1, 1] = True + z = y[sel] + expected = dpt.asnumpy(y)[[0, 1], [0, 1]] + assert (dpt.asnumpy(z) == expected).all() + + +def test_extract_end(): + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((4, 4), dtype="?") + sel[0, 0] = True + z = y[..., sel] + expected = dpt.asnumpy(y)[..., [0], [0]] + assert (dpt.asnumpy(z) == expected).all() + + +def test_extract_middle(): + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((3, 4), dtype="?") + sel[0, 0] = True + z = y[:, sel] + expected = dpt.asnumpy(y)[:, [0], [0], :] + assert (dpt.asnumpy(z) == expected).all() + + +def test_extract_empty_result(): + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((3, 4), dtype="?") + z = y[:, sel] + assert z.shape == ( + y.shape[0], + 0, + y.shape[3], + ) + + +def test_place_all_1d(): + x = dpt.arange(10, dtype="i2") + sel = dpt.zeros(10, dtype="?") + sel[0::2] = True + val = dpt.zeros(5, dtype=x.dtype) + x[sel] = val + assert (dpt.asnumpy(x) == np.array([0, 1, 0, 3, 0, 5, 0, 7, 0, 9])).all() + dpt.place(x, sel, dpt.asarray(2)) + assert (dpt.asnumpy(x) == np.array([2, 1, 2, 3, 2, 5, 2, 7, 2, 9])).all() + + +def test_place_2d_axis0(): + x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) + sel = dpt.asarray([True, False, True]) + val = dpt.zeros((2, 4), dtype=x.dtype) + x[sel] = val + expected_x = np.stack( + ( + np.zeros(4, dtype="i2"), + np.arange(4, 8, dtype="i2"), + np.zeros(4, dtype="i2"), + ) + ) + assert (dpt.asnumpy(x) == expected_x).all() + + +def test_place_2d_axis1(): + x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) + sel = dpt.asarray([True, False, True, False]) + val = dpt.zeros((3, 2), dtype=x.dtype) + x[:, sel] = val + expected_x = np.array( + [[0, 1, 0, 3], [0, 5, 0, 7], [0, 9, 0, 11]], dtype="i2" + ) + assert (dpt.asnumpy(x) == expected_x).all() + + +def test_place_2d_axis1_scalar(): + x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) + sel = dpt.asarray([True, False, True, False]) + val = dpt.zeros(tuple(), dtype=x.dtype) + x[:, sel] = val + expected_x = np.array( + [[0, 1, 0, 3], [0, 5, 0, 7], [0, 9, 0, 11]], dtype="i2" + ) + assert (dpt.asnumpy(x) == expected_x).all() + + +def test_place_all_slices(): + x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) + sel = dpt.asarray( + [ + [False, True, True, False], + [True, True, False, False], + [False, False, True, True], + ], + dtype="?", + ) + y = dpt.ones_like(x) + y[sel] = x[sel] + + +def test_place_some_slices_begin(): + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((3, 3), dtype="?") + sel[0, 0] = True + sel[1, 1] = True + z = y[sel] + w = dpt.zeros_like(y) + w[sel] = z + + +def test_place_some_slices_mid(): + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((3, 4), dtype="?") + sel[0, 0] = True + sel[1, 1] = True + z = y[:, sel] 
+ w = dpt.zeros_like(y) + w[:, sel] = z + + +def test_place_some_slices_end(): + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((4, 4), dtype="?") + sel[0, 0] = True + sel[1, 1] = True + z = y[:, :, sel] + w = dpt.zeros_like(y) + w[:, :, sel] = z + + +def test_place_cycling(): + x = dpt.zeros(10, dtype="f4") + y = dpt.asarray([2, 3]) + sel = dpt.ones(x.size, dtype="?") + dpt.place(x, sel, y) + expected = np.array( + [ + 2, + 3, + ] + * 5, + dtype=x.dtype, + ) + assert (dpt.asnumpy(x) == expected).all() + + +def test_place_subset(): + x = dpt.zeros(10, dtype="f4") + y = dpt.ones_like(x) + sel = dpt.ones(x.size, dtype="?") + sel[::2] = False + dpt.place(x, sel, y) + expected = np.array([1, 3, 5, 7, 9], dtype=x.dtype) + assert (dpt.asnumpy(x) == expected).all() + + +def test_nonzero(): + x = dpt.concat((dpt.zeros(3), dpt.ones(4), dpt.zeros(3))) + (i,) = dpt.nonzero(x) + assert dpt.asnumpy(i) == np.array([3, 4, 5, 6]).all() From cab00351ba9ba9018528dd9cf3563c0aeb31f861 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Mar 2023 15:43:17 -0600 Subject: [PATCH 8/8] Fixed tests for boolean indexing --- dpctl/tests/test_usm_ndarray_indexing.py | 28 +++++++++++++++++++----- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/dpctl/tests/test_usm_ndarray_indexing.py b/dpctl/tests/test_usm_ndarray_indexing.py index aec71def7d..bcc1fdbb60 100644 --- a/dpctl/tests/test_usm_ndarray_indexing.py +++ b/dpctl/tests/test_usm_ndarray_indexing.py @@ -972,10 +972,8 @@ def test_advanced_indexing_compute_follows_data(): x[ind0] = val1 -####### - - def test_extract_all_1d(): + get_queue_or_skip() x = dpt.arange(30, dtype="i4") sel = dpt.ones(30, dtype="?") sel[::2] = False @@ -989,6 +987,7 @@ def test_extract_all_1d(): def test_extract_all_2d(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(30, dtype="i4"), (5, 6)) sel = dpt.ones(30, dtype="?") sel[::2] = False @@ -1003,6 +1002,7 @@ def test_extract_all_2d(): def test_extract_2D_axis0(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(30, dtype="i4"), (5, 6)) sel = dpt.ones(x.shape[0], dtype="?") sel[::2] = False @@ -1013,6 +1013,7 @@ def test_extract_2D_axis0(): def test_extract_2D_axis1(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(30, dtype="i4"), (5, 6)) sel = dpt.ones(x.shape[1], dtype="?") sel[::2] = False @@ -1023,6 +1024,7 @@ def test_extract_2D_axis1(): def test_extract_begin(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) y = dpt.permute_dims(x, (2, 0, 3, 1)) sel = dpt.zeros((3, 3), dtype="?") @@ -1034,6 +1036,7 @@ def test_extract_begin(): def test_extract_end(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) y = dpt.permute_dims(x, (2, 0, 3, 1)) sel = dpt.zeros((4, 4), dtype="?") @@ -1044,6 +1047,7 @@ def test_extract_end(): def test_extract_middle(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) y = dpt.permute_dims(x, (2, 0, 3, 1)) sel = dpt.zeros((3, 4), dtype="?") @@ -1054,6 +1058,7 @@ def test_extract_middle(): def test_extract_empty_result(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) y = dpt.permute_dims(x, (2, 0, 3, 1)) sel = dpt.zeros((3, 4), dtype="?") @@ -1066,17 +1071,19 @@ def test_extract_empty_result(): def test_place_all_1d(): + get_queue_or_skip() x = dpt.arange(10, dtype="i2") sel = dpt.zeros(10, dtype="?") sel[0::2] = True val = dpt.zeros(5, 
dtype=x.dtype) x[sel] = val assert (dpt.asnumpy(x) == np.array([0, 1, 0, 3, 0, 5, 0, 7, 0, 9])).all() - dpt.place(x, sel, dpt.asarray(2)) + dpt.place(x, sel, dpt.asarray([2])) assert (dpt.asnumpy(x) == np.array([2, 1, 2, 3, 2, 5, 2, 7, 2, 9])).all() def test_place_2d_axis0(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) sel = dpt.asarray([True, False, True]) val = dpt.zeros((2, 4), dtype=x.dtype) @@ -1092,6 +1099,7 @@ def test_place_2d_axis0(): def test_place_2d_axis1(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) sel = dpt.asarray([True, False, True, False]) val = dpt.zeros((3, 2), dtype=x.dtype) @@ -1103,6 +1111,7 @@ def test_place_2d_axis1(): def test_place_2d_axis1_scalar(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) sel = dpt.asarray([True, False, True, False]) val = dpt.zeros(tuple(), dtype=x.dtype) @@ -1114,6 +1123,7 @@ def test_place_2d_axis1_scalar(): def test_place_all_slices(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) sel = dpt.asarray( [ @@ -1128,6 +1138,7 @@ def test_place_all_slices(): def test_place_some_slices_begin(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) y = dpt.permute_dims(x, (2, 0, 3, 1)) sel = dpt.zeros((3, 3), dtype="?") @@ -1139,6 +1150,7 @@ def test_place_some_slices_begin(): def test_place_some_slices_mid(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) y = dpt.permute_dims(x, (2, 0, 3, 1)) sel = dpt.zeros((3, 4), dtype="?") @@ -1150,6 +1162,7 @@ def test_place_some_slices_mid(): def test_place_some_slices_end(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) y = dpt.permute_dims(x, (2, 0, 3, 1)) sel = dpt.zeros((4, 4), dtype="?") @@ -1161,6 +1174,7 @@ def test_place_some_slices_end(): def test_place_cycling(): + get_queue_or_skip() x = dpt.zeros(10, dtype="f4") y = dpt.asarray([2, 3]) sel = dpt.ones(x.size, dtype="?") @@ -1177,16 +1191,18 @@ def test_place_cycling(): def test_place_subset(): + get_queue_or_skip() x = dpt.zeros(10, dtype="f4") y = dpt.ones_like(x) sel = dpt.ones(x.size, dtype="?") sel[::2] = False dpt.place(x, sel, y) - expected = np.array([1, 3, 5, 7, 9], dtype=x.dtype) + expected = np.array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1], dtype=x.dtype) assert (dpt.asnumpy(x) == expected).all() def test_nonzero(): + get_queue_or_skip() x = dpt.concat((dpt.zeros(3), dpt.ones(4), dpt.zeros(3))) (i,) = dpt.nonzero(x) - assert dpt.asnumpy(i) == np.array([3, 4, 5, 6]).all() + assert (dpt.asnumpy(i) == np.array([3, 4, 5, 6])).all()
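
For reference, a short end-to-end sketch of the Python API added by this
series (illustrative only; it follows the new tests above, assumes a working
SYCL device, and imports dpctl.tensor as dpt):

    import dpctl.tensor as dpt

    x = dpt.reshape(dpt.arange(30, dtype="i4"), (5, 6))
    sel = dpt.ones(30, dtype="?")
    sel[::2] = False                     # boolean mask
    sel = dpt.reshape(sel, x.shape)

    vals = dpt.extract(sel, x)           # 1D array of selected elements, same as x[sel]
    idx = dpt.nonzero(sel)               # tuple of index arrays, one per dimension
    dpt.place(x, sel, dpt.zeros(1, dtype="i4"))   # overwrite selected elements (vals cycled)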