From 2446b00354533e69723e581d140da6ed542ea61d Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 2 Mar 2023 19:35:18 -0600 Subject: [PATCH 1/8] Implements place, extract, nonzero kernels, and Python API for them Implemented mask_positions, _extract, _place, _nonzero and _array_overlap APIs. --- dpctl/tensor/CMakeLists.txt | 1 + .../kernels/boolean_advanced_indexing.hpp | 948 ++++++++++++++ .../source/boolean_advanced_indexing.cpp | 1085 +++++++++++++++++ .../source/boolean_advanced_indexing.hpp | 84 ++ .../source/simplify_iteration_space.cpp | 269 ++++ .../source/simplify_iteration_space.hpp | 36 + dpctl/tensor/libtensor/source/tensor_py.cpp | 31 + 7 files changed, 2454 insertions(+) create mode 100644 dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp create mode 100644 dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp create mode 100644 dpctl/tensor/libtensor/source/boolean_advanced_indexing.hpp diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index 3f5780cd75..300baa98c9 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -32,6 +32,7 @@ pybind11_add_module(${python_module_name} MODULE ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp new file mode 100644 index 0000000000..b42b7869d2 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp @@ -0,0 +1,948 @@ +//=== boolean_advance_indexing.hpp - ---*-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for advanced tensor index operations. 
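+///
+/// A brief sketch of the approach, as implemented by the kernels below: the
+/// boolean mask is first reduced to its inclusive cumulative sum, e.g. a mask
+/// [0, 1, 0, 1, 1] scans to [0, 1, 1, 2, 3]. Position i is then treated as
+/// set when cumsum[i] == ((i > 0) ? cumsum[i - 1] + 1 : 1), and cumsum[i] - 1
+/// serves as the packed (masked) index: the destination index for extract,
+/// and the right-hand-side index for place.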
+//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include "utils/strided_iters.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace indexing +{ + +namespace py = pybind11; + +template T ceiling_quotient(T n, T m) +{ + return (n + m - 1) / m; +} +template T1 ceiling_quotient(T1 n, T2 m) +{ + return ceiling_quotient(n, static_cast(m)); +} + +template +class inclusive_scan_rec_local_scan_krn; + +template +class inclusive_scan_rec_chunk_update_krn; + +struct NoOpIndexer +{ + size_t operator()(size_t gid) const + { + return gid; + } +}; + +struct StridedIndexer +{ + StridedIndexer(int _nd, + py::ssize_t _offset, + py::ssize_t const *_packed_shape_strides) + : nd(_nd), starting_offset(_offset), + shape_strides(_packed_shape_strides) + { + } + + size_t operator()(size_t gid) const + { + CIndexer_vector _ind(nd); + py::ssize_t relative_offset(0); + _ind.get_displacement( + static_cast(gid), + shape_strides, // shape ptr + shape_strides + nd, // strides ptr + relative_offset); + return starting_offset + relative_offset; + } + +private: + int nd; + py::ssize_t starting_offset; + py::ssize_t const *shape_strides; +}; + +struct Strided1DIndexer +{ + Strided1DIndexer(py::ssize_t _offset, py::ssize_t _size, py::ssize_t _step) + : offset(_offset), size(static_cast(_size)), step(_step) + { + } + + size_t operator()(size_t gid) const + { + return static_cast(offset + std::min(gid, size) * step); + } + +private: + py::ssize_t offset = 0; + size_t size = 1; + py::ssize_t step = 1; +}; + +template struct ZeroChecker +{ + + ZeroChecker(_IndexerFn _indexer) : indexer_fn(_indexer) {} + + template + bool operator()(dataT const *data, size_t gid) const + { + constexpr dataT _zero(0); + + return data[indexer_fn(gid)] == _zero; + } + +private: + _IndexerFn indexer_fn; +}; + +/* + * for integer type maskT, + * output[j] = sum( input[s0 + i * s1], 0 <= i <= j) + * for 0 <= j < n_elems + */ +template +sycl::event inclusive_scan_rec(sycl::queue exec_q, + size_t n_elems, + size_t wg_size, + const inputT *input, + outputT *output, + size_t s0, + size_t s1, + IndexerT indexer, + std::vector const &depends = {}) +{ + size_t n_groups = ceiling_quotient(n_elems, n_wi * wg_size); + + sycl::event inc_scan_phase1_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using slmT = sycl::local_accessor; + + auto lws = sycl::range<1>(wg_size); + auto gws = sycl::range<1>(n_groups * wg_size); + + slmT slm_iscan_tmp(lws, cgh); + + ZeroChecker is_zero_fn(indexer); + + cgh.parallel_for, n_wi>>( + sycl::nd_range<1>(gws, lws), + [=](sycl::nd_item<1> it) + { + auto chunk_gid = it.get_global_id(0); + auto lid = it.get_local_id(0); + + std::array local_isum; + + size_t i = chunk_gid * n_wi; + for (size_t m_wi = 0; m_wi < n_wi; ++m_wi) { + constexpr outputT out_zero(0); + constexpr outputT out_one(1); + local_isum[m_wi] = + (i + m_wi < n_elems) + ? (is_zero_fn(input, s0 + s1 * (i + m_wi)) ? 
out_zero + : out_one) + : out_zero; + } + +// local_isum is now result of +// inclusive scan of locally stored mask indicators +#pragma unroll + for (size_t m_wi = 1; m_wi < n_wi; ++m_wi) { + local_isum[m_wi] += local_isum[m_wi - 1]; + } + + size_t wg_iscan_val = + sycl::inclusive_scan_over_group(it.get_group(), + local_isum.back(), + sycl::plus(), + size_t(0)); + + slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val; + it.barrier(sycl::access::fence_space::local_space); + size_t addand = (lid == 0) ? 0 : slm_iscan_tmp[lid]; + it.barrier(sycl::access::fence_space::local_space); + +#pragma unroll + for (size_t m_wi = 0; m_wi < n_wi; ++m_wi) { + local_isum[m_wi] += addand; + } + + for (size_t m_wi = 0; m_wi < n_wi && i + m_wi < n_elems; ++m_wi) { + output[i + m_wi] = local_isum[m_wi]; + } + } + ); + }); + + sycl::event out_event = inc_scan_phase1_ev; + if (n_groups > 1) { + outputT *temp = sycl::malloc_device(n_groups - 1, exec_q); + + auto chunk_size = wg_size * n_wi; + + NoOpIndexer _no_op_indexer{}; + auto e2 = inclusive_scan_rec( + exec_q, n_groups - 1, wg_size, output, temp, chunk_size - 1, + chunk_size, _no_op_indexer, {inc_scan_phase1_ev}); + + // output[ chunk_size * (i + 1) + j] += temp[i] + auto e3 = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(e2); + cgh.parallel_for>( + {n_elems}, + [=](auto wiid) + { + auto gid = wiid[0]; + auto i = (gid / chunk_size); + output[gid] += (i > 0) ? temp[i - 1] : 0; + } + ); + }); + + // dangling task to free the temporary + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(e3); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, temp]() { sycl::free(temp, ctx); }); + }); + + out_event = e3; + } + + return out_event; +} + +template struct TwoOffsets +{ + TwoOffsets() : first_offset(0), second_offset(0) {} + TwoOffsets(displacementT first_offset_, displacementT second_offset_) + : first_offset(first_offset_), second_offset(second_offset_) + { + } + + displacementT get_first_offset() const + { + return first_offset; + } + displacementT get_second_offset() const + { + return second_offset; + } + +private: + displacementT first_offset = 0; + displacementT second_offset = 0; +}; + +struct TwoOffsets_StridedIndexer +{ + TwoOffsets_StridedIndexer(int common_nd, + py::ssize_t first_offset_, + py::ssize_t second_offset_, + py::ssize_t const *_packed_shape_strides) + : nd(common_nd), starting_first_offset(first_offset_), + starting_second_offset(second_offset_), + shape_strides(_packed_shape_strides) + { + } + + TwoOffsets operator()(py::ssize_t gid) const + { + CIndexer_vector _ind(nd); + py::ssize_t relative_first_offset(0); + py::ssize_t relative_second_offset(0); + _ind.get_displacement( + gid, + shape_strides, // shape ptr + shape_strides + nd, // src strides ptr + shape_strides + 2 * nd, // src strides ptr + relative_first_offset, relative_second_offset); + return TwoOffsets( + starting_first_offset + relative_first_offset, + starting_second_offset + relative_second_offset); + } + +private: + int nd; + py::ssize_t starting_first_offset; + py::ssize_t starting_second_offset; + py::ssize_t const *shape_strides; +}; + +struct TwoZeroOffsets_Indexer +{ + TwoZeroOffsets_Indexer() {} + + TwoOffsets operator()(py::ssize_t) const + { + return TwoOffsets(); + } +}; + +template +struct MaskedExtractStridedFunctor +{ + MaskedExtractStridedFunctor(const char *src_data_p, + const char *cumsum_data_p, + char *dst_data_p, + size_t orthog_iter_size, + size_t masked_iter_size, + OrthogIndexerT orthog_src_dst_indexer_, + MaskedSrcIndexerT 
masked_src_indexer_, + MaskedDstIndexerT masked_dst_indexer_) + : src_cp(src_data_p), cumsum_cp(cumsum_data_p), dst_cp(dst_data_p), + orthog_nelems(orthog_iter_size), masked_nelems(masked_iter_size), + orthog_src_dst_indexer(orthog_src_dst_indexer_), + masked_src_indexer(masked_src_indexer_), + masked_dst_indexer(masked_dst_indexer_) + { + } + + void operator()(sycl::id<1> idx) const + { + const dataT *src_data = reinterpret_cast(src_cp); + dataT *dst_data = reinterpret_cast(dst_cp); + const indT *cumsum_data = reinterpret_cast(cumsum_cp); + + size_t global_i = idx[0]; + size_t orthog_i = global_i / masked_nelems; + size_t masked_i = global_i - masked_nelems * orthog_i; + + indT current_running_count = cumsum_data[masked_i]; + bool mask_set = + (masked_i == 0) + ? (current_running_count == 1) + : (current_running_count == cumsum_data[masked_i - 1] + 1); + + // dst[cumsum[i], j] - 1 = src[i, j] if cumsum[i] == ((i > 0) ? + // cumsum[i-1] + // + 1 : 1) + if (mask_set) { + auto orthog_offsets = + orthog_src_dst_indexer(static_cast(orthog_i)); + + size_t total_src_offset = masked_src_indexer(masked_i) + + orthog_offsets.get_first_offset(); + size_t total_dst_offset = + masked_dst_indexer(current_running_count - 1) + + orthog_offsets.get_second_offset(); + + dst_data[total_dst_offset] = src_data[total_src_offset]; + } + } + +private: + const char *src_cp = nullptr; + const char *cumsum_cp = nullptr; + char *dst_cp = nullptr; + size_t orthog_nelems = 0; + size_t masked_nelems = 0; + OrthogIndexerT + orthog_src_dst_indexer; // has nd, shape, src_strides, dst_strides for + // dimensions that ARE NOT masked + MaskedSrcIndexerT masked_src_indexer; // has nd, shape, src_strides for + // dimensions that ARE masked + MaskedDstIndexerT + masked_dst_indexer; // has 1, dst_strides for dimensions that ARE masked +}; + +template +struct MaskedPlaceStridedFunctor +{ + MaskedPlaceStridedFunctor(char *dst_data_p, + const char *cumsum_data_p, + const char *rhs_data_p, + size_t orthog_iter_size, + size_t masked_iter_size, + OrthogIndexerT orthog_dst_rhs_indexer_, + MaskedDstIndexerT masked_dst_indexer_, + MaskedRhsIndexerT masked_rhs_indexer_) + : dst_cp(dst_data_p), cumsum_cp(cumsum_data_p), rhs_cp(rhs_data_p), + orthog_nelems(orthog_iter_size), masked_nelems(masked_iter_size), + orthog_dst_rhs_indexer(orthog_dst_rhs_indexer_), + masked_dst_indexer(masked_dst_indexer_), + masked_rhs_indexer(masked_rhs_indexer_) + { + } + + void operator()(sycl::id<1> idx) const + { + dataT *dst_data = reinterpret_cast(dst_cp); + const indT *cumsum_data = reinterpret_cast(cumsum_cp); + const dataT *rhs_data = reinterpret_cast(rhs_cp); + + size_t global_i = idx[0]; + size_t orthog_i = global_i / masked_nelems; + size_t masked_i = global_i - masked_nelems * orthog_i; + + indT current_running_count = cumsum_data[masked_i]; + bool mask_set = + (masked_i == 0) + ? (current_running_count == 1) + : (current_running_count == cumsum_data[masked_i - 1] + 1); + + // src[i, j] = rhs[cumsum[i] - 1, j] if cumsum[i] == ((i > 0) ? 
+ // cumsum[i-1] + // + 1 : 1) + if (mask_set) { + auto orthog_offsets = + orthog_dst_rhs_indexer(static_cast(orthog_i)); + + size_t total_dst_offset = masked_dst_indexer(masked_i) + + orthog_offsets.get_first_offset(); + size_t total_rhs_offset = + masked_rhs_indexer(current_running_count - 1) + + orthog_offsets.get_second_offset(); + + dst_data[total_dst_offset] = rhs_data[total_rhs_offset]; + } + } + +private: + char *dst_cp = nullptr; + const char *cumsum_cp = nullptr; + const char *rhs_cp = nullptr; + size_t orthog_nelems = 0; + size_t masked_nelems = 0; + OrthogIndexerT + orthog_dst_rhs_indexer; // has nd, shape, dst_strides, rhs_strides for + // dimensions that ARE NOT masked + MaskedDstIndexerT masked_dst_indexer; // has nd, shape, dst_strides for + // dimensions that ARE masked + MaskedRhsIndexerT + masked_rhs_indexer; // has 1, rhs_strides for dimensions that ARE masked +}; + +// mask positions + +typedef size_t (*mask_positions_contig_impl_fn_ptr_t)( + sycl::queue, + size_t, + const char *, + char *, + std::vector const &); + +template +size_t mask_positions_contig_impl(sycl::queue q, + size_t n_elems, + const char *mask, + char *cumsum, + std::vector const &depends = {}) +{ + constexpr int n_wi = 8; + const maskT *mask_data_ptr = reinterpret_cast(mask); + cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); + size_t wg_size = 128; + + NoOpIndexer flat_indexer{}; + + sycl::event comp_ev = inclusive_scan_rec( + q, n_elems, wg_size, mask_data_ptr, cumsum_data_ptr, 0, 1, flat_indexer, + depends); + + cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); + + cumsumT *last_elem_host_usm = sycl::malloc_host(1, q); + + if (last_elem_host_usm == nullptr) { + throw std::bad_alloc(); + } + sycl::event copy_e = + q.copy(last_elem, last_elem_host_usm, 1, {comp_ev}); + copy_e.wait(); + size_t return_val = static_cast(*last_elem_host_usm); + sycl::free(last_elem_host_usm, q); + + return return_val; +} + +template struct MaskPositionsContigFactory +{ + fnT get() + { + fnT fn = mask_positions_contig_impl; + return fn; + } +}; + +typedef size_t (*mask_positions_strided_impl_fn_ptr_t)( + sycl::queue, + size_t, + const char *, + int, + py::ssize_t, + const py::ssize_t *, + char *, + std::vector const &); + +template +size_t mask_positions_strided_impl(sycl::queue q, + size_t n_elems, + const char *mask, + int nd, + py::ssize_t input_offset, + const py::ssize_t *shape_strides, + char *cumsum, + std::vector const &depends = {}) +{ + constexpr int n_wi = 8; + const maskT *mask_data_ptr = reinterpret_cast(mask); + cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); + size_t wg_size = 128; + + StridedIndexer strided_indexer{nd, input_offset, shape_strides}; + + sycl::event comp_ev = + inclusive_scan_rec( + q, n_elems, wg_size, mask_data_ptr, cumsum_data_ptr, 0, 1, + strided_indexer, depends); + + cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); + + cumsumT *last_elem_host_usm = sycl::malloc_host(1, q); + + if (last_elem_host_usm == nullptr) { + throw std::bad_alloc(); + } + sycl::event copy_e = + q.copy(last_elem, last_elem_host_usm, 1, {comp_ev}); + copy_e.wait(); + size_t return_val = static_cast(*last_elem_host_usm); + sycl::free(last_elem_host_usm, q); + + return return_val; +} + +template struct MaskPositionsStridedFactory +{ + fnT get() + { + fnT fn = mask_positions_strided_impl; + return fn; + } +}; + +// ======= Masked extraction ================================ + +template +class masked_extract_all_slices_strided_impl_krn; + +typedef sycl::event 
(*masked_extract_all_slices_strided_impl_fn_ptr_t)( + sycl::queue, + py::ssize_t, + const char *, + const char *, + char *, + int, + py::ssize_t const *, + py::ssize_t, + py::ssize_t, + const std::vector &); + +template +sycl::event masked_extract_all_slices_strided_impl( + sycl::queue exec_q, + py::ssize_t iteration_size, + const char *src_p, + const char *cumsum_p, + char *dst_p, + int nd, + const py::ssize_t + *packed_src_shape_strides, // [src_shape, src_strides], length 2*nd + py::ssize_t dst_size, // dst is 1D + py::ssize_t dst_stride, + const std::vector &depends = {}) +{ + // using MaskedExtractStridedFunctor; + // using Strided1DIndexer; + // using StridedIndexer; + // using TwoZeroOffsets_Indexer; + + TwoZeroOffsets_Indexer orthog_src_dst_indexer{}; + + /* StridedIndexer(int _nd, py::ssize_t _offset, py::ssize_t const + * *_packed_shape_strides) */ + StridedIndexer masked_src_indexer(nd, 0, packed_src_shape_strides); + Strided1DIndexer masked_dst_indexer(0, dst_size, dst_stride); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.parallel_for>( + sycl::range<1>(static_cast(iteration_size)), + MaskedExtractStridedFunctor( + src_p, cumsum_p, dst_p, 1, iteration_size, + orthog_src_dst_indexer, masked_src_indexer, + masked_dst_indexer)); + }); + + return comp_ev; +} + +typedef sycl::event (*masked_extract_some_slices_strided_impl_fn_ptr_t)( + sycl::queue, + py::ssize_t, + py::ssize_t, + const char *, + const char *, + char *, + int, + py::ssize_t const *, + py::ssize_t, + py::ssize_t, + int, + py::ssize_t const *, + py::ssize_t, + py::ssize_t, + const std::vector &); + +template +class masked_extract_some_slices_strided_impl_krn; + +template +sycl::event masked_extract_some_slices_strided_impl( + sycl::queue exec_q, + py::ssize_t orthog_nelems, + py::ssize_t masked_nelems, + const char *src_p, + const char *cumsum_p, + char *dst_p, + int orthog_nd, + const py::ssize_t + *packed_ortho_src_dst_shape_strides, // [ortho_shape, ortho_src_strides, + // ortho_dst_strides], length + // 3*ortho_nd + py::ssize_t ortho_src_offset, + py::ssize_t ortho_dst_offset, + int masked_nd, + const py::ssize_t *packed_masked_src_shape_strides, // [masked_src_shape, + // masked_src_strides], + // length 2*masked_nd + py::ssize_t masked_dst_size, // mask_dst is 1D + py::ssize_t masked_dst_stride, + const std::vector &depends = {}) +{ + // using MaskedExtractStridedFunctor; + // using Strided1DIndexer; + // using StridedIndexer; + // using TwoOffsets_StridedIndexer; + + TwoOffsets_StridedIndexer orthog_src_dst_indexer{ + orthog_nd, ortho_src_offset, ortho_dst_offset, + packed_ortho_src_dst_shape_strides}; + + StridedIndexer masked_src_indexer{masked_nd, 0, + packed_masked_src_shape_strides}; + Strided1DIndexer masked_dst_indexer{0, masked_dst_size, masked_dst_stride}; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.parallel_for>( + sycl::range<1>(static_cast(orthog_nelems * masked_nelems)), + MaskedExtractStridedFunctor( + src_p, cumsum_p, dst_p, orthog_nelems, masked_nelems, + orthog_src_dst_indexer, masked_src_indexer, + masked_dst_indexer)); + }); + + return comp_ev; +} + +template struct MaskExtractAllSlicesStridedFactory +{ + fnT get() + { + fnT fn = masked_extract_all_slices_strided_impl; + return fn; + } +}; + +template struct MaskExtractSomeSlicesStridedFactory +{ + fnT get() + { + fnT fn = masked_extract_some_slices_strided_impl; + return fn; + } +}; + +// Masked placement + +template +class 
masked_place_all_slices_strided_impl_krn; + +typedef sycl::event (*masked_place_all_slices_strided_impl_fn_ptr_t)( + sycl::queue, + py::ssize_t, + char *, + const char *, + const char *, + int, + py::ssize_t const *, + py::ssize_t, + py::ssize_t, + const std::vector &); + +template +sycl::event masked_place_all_slices_strided_impl( + sycl::queue exec_q, + py::ssize_t iteration_size, + char *dst_p, + const char *cumsum_p, + const char *rhs_p, + int nd, + const py::ssize_t + *packed_dst_shape_strides, // [dst_shape, dst_strides], length 2*nd + py::ssize_t rhs_size, // rhs is 1D + py::ssize_t rhs_stride, + const std::vector &depends = {}) +{ + // using MaskedPlaceStridedFunctor; + // using Strided1DIndexer; + // using StridedIndexer; + // using TwoZeroOffsets_Indexer; + + TwoZeroOffsets_Indexer orthog_dst_rhs_indexer{}; + + /* StridedIndexer(int _nd, py::ssize_t _offset, py::ssize_t const + * *_packed_shape_strides) */ + StridedIndexer masked_dst_indexer(nd, 0, packed_dst_shape_strides); + Strided1DIndexer masked_rhs_indexer(0, rhs_size, rhs_stride); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.parallel_for>( + sycl::range<1>(static_cast(iteration_size)), + MaskedPlaceStridedFunctor( + dst_p, cumsum_p, rhs_p, 1, iteration_size, + orthog_dst_rhs_indexer, masked_dst_indexer, + masked_rhs_indexer)); + }); + + return comp_ev; +} + +typedef sycl::event (*masked_place_some_slices_strided_impl_fn_ptr_t)( + sycl::queue, + py::ssize_t, + py::ssize_t, + char *, + const char *, + const char *, + int, + py::ssize_t const *, + py::ssize_t, + py::ssize_t, + int, + py::ssize_t const *, + py::ssize_t, + py::ssize_t, + const std::vector &); + +template +class masked_place_some_slices_strided_impl_krn; + +template +sycl::event masked_place_some_slices_strided_impl( + sycl::queue exec_q, + py::ssize_t orthog_nelems, + py::ssize_t masked_nelems, + char *dst_p, + const char *cumsum_p, + const char *rhs_p, + int orthog_nd, + const py::ssize_t + *packed_ortho_dst_rhs_shape_strides, // [ortho_shape, ortho_dst_strides, + // ortho_rhs_strides], length + // 3*ortho_nd + py::ssize_t ortho_dst_offset, + py::ssize_t ortho_rhs_offset, + int masked_nd, + const py::ssize_t *packed_masked_dst_shape_strides, // [masked_dst_shape, + // masked_dst_strides], + // length 2*masked_nd + py::ssize_t masked_rhs_size, // mask_dst is 1D + py::ssize_t masked_rhs_stride, + const std::vector &depends = {}) +{ + // using MaskedPlaceStridedFunctor; + // using Strided1DIndexer; + // using StridedIndexer; + // using TwoOffsets_StridedIndexer; + + TwoOffsets_StridedIndexer orthog_dst_rhs_indexer{ + orthog_nd, ortho_dst_offset, ortho_rhs_offset, + packed_ortho_dst_rhs_shape_strides}; + + /* StridedIndexer(int _nd, py::ssize_t _offset, py::ssize_t const + * *_packed_shape_strides) */ + StridedIndexer masked_dst_indexer{masked_nd, 0, + packed_masked_dst_shape_strides}; + Strided1DIndexer masked_rhs_indexer{0, masked_rhs_size, masked_rhs_stride}; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.parallel_for>( + sycl::range<1>(static_cast(orthog_nelems * masked_nelems)), + MaskedPlaceStridedFunctor( + dst_p, cumsum_p, rhs_p, orthog_nelems, masked_nelems, + orthog_dst_rhs_indexer, masked_dst_indexer, + masked_rhs_indexer)); + }); + + return comp_ev; +} + +static masked_place_all_slices_strided_impl_fn_ptr_t + masked_place_all_slices_strided_impl_dispatch_vector + [dpctl::tensor::detail::num_types]; + +template struct 
MaskPlaceAllSlicesStridedFactory +{ + fnT get() + { + fnT fn = masked_place_all_slices_strided_impl; + return fn; + } +}; + +static masked_place_some_slices_strided_impl_fn_ptr_t + masked_place_some_slices_strided_impl_dispatch_vector + [dpctl::tensor::detail::num_types]; + +template struct MaskPlaceSomeSlicesStridedFactory +{ + fnT get() + { + fnT fn = masked_place_some_slices_strided_impl; + return fn; + } +}; + +// Non-zero + +class non_zero_indexes_krn; + +template +sycl::event non_zero_indexes_impl(sycl::queue exec_q, + py::ssize_t iter_size, + py::ssize_t nz_elems, + int nd, + const char *cumsum_cp, + char *indexes_cp, + const py::ssize_t *mask_shape, + std::vector const &depends) +{ + const indT1 *cumsum_data = reinterpret_cast(cumsum_cp); + indT2 *indexes_data = reinterpret_cast(indexes_cp); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.parallel_for( + sycl::range<1>(iter_size), [=](sycl::id<1> idx) { + auto i = idx[0]; + + auto cs_curr_val = cumsum_data[i] - 1; + auto cs_prev_val = (i > 0) ? cumsum_data[i - 1] : indT1(0); + bool cond = (cs_curr_val == cs_prev_val); + + py::ssize_t i_ = static_cast(i); + for (int dim = nd; --dim > 0;) { + auto sd = mask_shape[dim]; + py::ssize_t q = i_ / sd; + py::ssize_t r = (i_ - q * sd); + if (cond) { + indexes_data[cs_curr_val + dim * nz_elems] = + static_cast(r); + } + i_ = q; + } + if (cond) { + indexes_data[cs_curr_val] = static_cast(i_); + } + }); + }); + + return comp_ev; +} + +} // namespace indexing +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp new file mode 100644 index 0000000000..1534b38391 --- /dev/null +++ b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp @@ -0,0 +1,1085 @@ +//===-- boolean_advanced_indexing.cpp - --*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines implementation functions of dpctl.tensor.place and +/// dpctl.tensor.extract, dpctl.tensor.nonzero +//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include +#include +#include + +#include "boolean_advanced_indexing.hpp" +#include "kernels/boolean_advanced_indexing.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +struct sink_t +{ + sink_t(){}; + template sink_t(T &&){}; +}; + +template std::size_t accumulate_size(std::size_t &s, V &&v) +{ + return s += v.size(); +} + +template sink_t inserter(V &lhs, U &&rhs) +{ + lhs.insert(lhs.end(), rhs.begin(), rhs.end()); + return {}; +} + +template +std::vector concat(std::vector lhs, Vs &&... vs) +{ + std::size_t s = lhs.size(); + { + // limited scope ensures array is freed + [[maybe_unused]] sink_t tmp[] = {accumulate_size(s, vs)..., 0}; + } + lhs.reserve(s); + { + // array of no-data objects ensures ordering of calls to inserter + [[maybe_unused]] sink_t tmp[] = {inserter(lhs, std::forward(vs))..., + 0}; + } + + return std::move(lhs); // prevent return-value optimization +} + +template +std::tuple +device_allocate_and_pack(sycl::queue q, + std::vector &host_task_events, + Vs &&... vs) +{ + + // memory transfer optimization, use USM-host for temporary speeds up + // tranfer to device, especially on dGPUs + using usm_host_allocatorT = + sycl::usm_allocator; + using shT = std::vector; + + usm_host_allocatorT usm_host_allocator(q); + shT empty{0, usm_host_allocator}; + shT packed_shape_strides = concat(empty, vs...); + + auto packed_shape_strides_owner = + std::make_shared(std::move(packed_shape_strides)); + + auto sz = packed_shape_strides_owner->size(); + indT *shape_strides = sycl::malloc_device(sz, q); + + sycl::event copy_ev = + q.copy(packed_shape_strides_owner->data(), shape_strides, sz); + + sycl::event cleanup_host_task_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(copy_ev); + cgh.host_task([packed_shape_strides_owner] { + // increment shared pointer ref-count to keep it alive + // till copy operation completes; + }); + }); + host_task_events.push_back(cleanup_host_task_ev); + + return std::make_tuple(shape_strides, sz, copy_ev); +} + +/* @brief check for overlap of memory regions behind arrays. + +Presenty assume that array occupies all bytes between smallest and largest +displaced elements. + +TODO: Write proper Frobenius solver to account for holes, e.g. + overlap( x_contig[::2], x_contig[1::2]) should give False, + while this implementation gives True. 
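+
+In other words, the check currently reduces to an interval-intersection test
+on the byte ranges spanned by the two arrays (from their smallest to their
+largest displaced element); it may report false positives, but it never
+misses a true overlap.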
+*/ +bool overlap(dpctl::tensor::usm_ndarray ar1, dpctl::tensor::usm_ndarray ar2) +{ + const char *ar1_data = ar1.get_data(); + + const auto &ar1_offsets = ar1.get_minmax_offsets(); + py::ssize_t ar1_elem_size = static_cast(ar1.get_elemsize()); + + const char *ar2_data = ar2.get_data(); + const auto &ar2_offsets = ar2.get_minmax_offsets(); + py::ssize_t ar2_elem_size = static_cast(ar2.get_elemsize()); + + /* Memory of array1 extends from */ + /* [ar1_data + ar1_offsets.first * ar1_elem_size, ar1_data + + * ar1_offsets.second * ar1_elem_size + ar1_elem_size] */ + /* Memory of array2 extends from */ + /* [ar2_data + ar2_offsets.first * ar2_elem_size, ar2_data + + * ar2_offsets.second * ar2_elem_size + ar2_elem_size] */ + + /* Intervals [x0, x1] and [y0, y1] do not overlap if (x0 <= x1) && (y0 <= + * y1) + * && (x1 <=y0 || y1 <= x0 ) */ + /* Given that x0 <= x1 and y0 <= y1 are true by construction, the condition + * for overlap us (x1 > y0) && (y1 > x0) */ + + /* Applying: + (ar1_data + ar1_offsets.second * ar1_elem_size + ar1_elem_size > + ar2_data + + ar2_offsets.first * ar2_elem_size) && (ar2_data + ar2_offsets.second * + ar2_elem_size + ar2_elem_size > ar1_data + ar1_offsets.first * + ar1_elem_size) + */ + + auto byte_distance = static_cast(ar2_data - ar1_data); + + py::ssize_t x1_minus_y0 = + (-byte_distance + + (ar1_elem_size + (ar1_offsets.second * ar1_elem_size) - + (ar2_offsets.first * ar2_elem_size))); + + py::ssize_t y1_minus_x0 = + (byte_distance + (ar2_elem_size + (ar2_offsets.second * ar2_elem_size) - + (ar1_offsets.first * ar1_elem_size))); + + bool memory_overlap = (x1_minus_y0 > 0) && (y1_minus_x0 > 0); + + return memory_overlap; +} + +/* @brief Split shape/strides into dir1 (complementary to axis_start <= i < + * axis_end) and dir2 (along given set of axes) + */ +template +void _split_iteration_space(const shT &shape_vec, + const shT &strides_vec, + int axis_start, + int axis_end, + shT &dir1_shape_vec, + shT &dir2_shape_vec, + shT &dir1_strides_vec, + shT &dir2_strides_vec) +{ + int nd = static_cast(shape_vec.size()); + int dir2_sz = axis_end - axis_start; + int dir1_sz = nd - dir2_sz; + + assert(dir1_sz > 0); + assert(dir2_sz > 0); + + dir1_shape_vec.resize(dir1_sz); + dir2_shape_vec.resize(dir2_sz); + + std::copy(shape_vec.begin(), shape_vec.begin() + axis_start, + dir1_shape_vec.begin()); + std::copy(shape_vec.begin() + axis_end, shape_vec.end(), + dir1_shape_vec.begin() + axis_start); + + std::copy(shape_vec.begin() + axis_start, shape_vec.begin() + axis_end, + dir2_shape_vec.begin()); + + dir1_strides_vec.resize(dir1_sz); + dir2_strides_vec.resize(dir2_sz); + + std::copy(strides_vec.begin(), strides_vec.begin() + axis_start, + dir1_strides_vec.begin()); + std::copy(strides_vec.begin() + axis_end, strides_vec.end(), + dir1_strides_vec.begin() + axis_start); + + std::copy(strides_vec.begin() + axis_start, strides_vec.begin() + axis_end, + dir2_strides_vec.begin()); + + return; +} + +// Computation of positions of masked elements + +using dpctl::tensor::kernels::indexing::mask_positions_contig_impl_fn_ptr_t; +static mask_positions_contig_impl_fn_ptr_t + mask_positions_contig_dispatch_vector[dpctl::tensor::detail::num_types]; + +using dpctl::tensor::kernels::indexing::mask_positions_strided_impl_fn_ptr_t; +static mask_positions_strided_impl_fn_ptr_t + mask_positions_strided_dispatch_vector[dpctl::tensor::detail::num_types]; + +void populate_mask_positions_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::indexing::MaskPositionsContigFactory; + 
dpctl::tensor::detail::DispatchVectorBuilder< + mask_positions_contig_impl_fn_ptr_t, MaskPositionsContigFactory, + dpctl::tensor::detail::num_types> + dvb1; + dvb1.populate_dispatch_vector(mask_positions_contig_dispatch_vector); + + using dpctl::tensor::kernels::indexing::MaskPositionsStridedFactory; + dpctl::tensor::detail::DispatchVectorBuilder< + mask_positions_strided_impl_fn_ptr_t, MaskPositionsStridedFactory, + dpctl::tensor::detail::num_types> + dvb2; + dvb2.populate_dispatch_vector(mask_positions_strided_dispatch_vector); + + return; +} + +size_t py_mask_positions(dpctl::tensor::usm_ndarray mask, + dpctl::tensor::usm_ndarray cumsum, + sycl::queue exec_q, + std::vector const &depends) +{ + // cumsum is 1D + if (cumsum.get_ndim() != 1) { + throw py::value_error("Result array must be one-dimensional."); + } + + if (!cumsum.is_c_contiguous()) { + throw py::value_error("Expecting `cumsum` array must be C-contiguous."); + } + + // cumsum.shape == (mask.size,) + auto mask_size = mask.get_size(); + auto cumsum_size = cumsum.get_shape(0); + if (cumsum_size != mask_size) { + throw py::value_error("Inconsistent dimensions"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {mask, cumsum})) { + // FIXME: use ExecutionPlacementError + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if (mask_size == 0) { + return 0; + } + + int mask_typenum = mask.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + // mask can be any type + const char *mask_data = mask.get_data(); + char *cumsum_data = cumsum.get_data(); + + auto const &array_types = dpctl::tensor::detail::usm_ndarray_types(); + + int mask_typeid = array_types.typenum_to_lookup_id(mask_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + // cumsum must be int64_t only + constexpr int int64_typeid = + static_cast(dpctl::tensor::detail::typenum_t::INT64); + if (cumsum_typeid != int64_typeid) { + throw py::value_error( + "Cumulative sum array must have int64 data-type."); + } + + if (mask.is_c_contiguous()) { + auto fn = mask_positions_contig_dispatch_vector[mask_typeid]; + + return fn(exec_q, mask_size, mask_data, cumsum_data, depends); + } + + const py::ssize_t *shape = mask.get_shape_raw(); + const py::ssize_t *strides = mask.get_strides_raw(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_strides; + py::ssize_t offset(0); + + int mask_nd = mask.get_ndim(); + int nd = mask_nd; + + constexpr py::ssize_t itemsize = 1; // in elements + bool is_c_contig = mask.is_c_contiguous(); + bool is_f_contig = mask.is_f_contiguous(); + + dpctl::tensor::py_internal::simplify_iteration_space_1( + nd, shape, strides, itemsize, is_c_contig, is_f_contig, + simplified_shape, simplified_strides, offset); + + if (nd == 1 && simplified_strides[0] == 1) { + auto fn = mask_positions_contig_dispatch_vector[mask_typeid]; + + return fn(exec_q, mask_size, mask_data, cumsum_data, depends); + } + + // Strided implementation + auto strided_fn = mask_positions_strided_dispatch_vector[mask_typeid]; + std::vector host_task_events; + + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_strides); + py::ssize_t *shape_strides = std::get<0>(ptr_size_event_tuple); + sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + + if (2 * static_cast(nd) != std::get<1>(ptr_size_event_tuple)) { + copy_shape_ev.wait(); + sycl::event::wait(host_task_events); + sycl::free(shape_strides, exec_q); + throw 
std::runtime_error("Unexacted error"); + } + + std::vector dependent_events; + dependent_events.reserve(depends.size() + 1); + dependent_events.insert(dependent_events.end(), copy_shape_ev); + dependent_events.insert(dependent_events.end(), depends.begin(), + depends.end()); + + size_t total_set = strided_fn(exec_q, mask_size, mask_data, nd, offset, + shape_strides, cumsum_data, dependent_events); + + sycl::event::wait(host_task_events); + sycl::free(shape_strides, exec_q); + + return total_set; +} + +// Masked extraction + +using dpctl::tensor::kernels::indexing:: + masked_extract_all_slices_strided_impl_fn_ptr_t; + +static masked_extract_all_slices_strided_impl_fn_ptr_t + masked_extract_all_slices_strided_impl_dispatch_vector + [dpctl::tensor::detail::num_types]; + +using dpctl::tensor::kernels::indexing:: + masked_extract_some_slices_strided_impl_fn_ptr_t; + +static masked_extract_some_slices_strided_impl_fn_ptr_t + masked_extract_some_slices_strided_impl_dispatch_vector + [dpctl::tensor::detail::num_types]; + +void populate_masked_extract_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::indexing::MaskExtractAllSlicesStridedFactory; + dpctl::tensor::detail::DispatchVectorBuilder< + masked_extract_all_slices_strided_impl_fn_ptr_t, + MaskExtractAllSlicesStridedFactory, dpctl::tensor::detail::num_types> + dvb1; + dvb1.populate_dispatch_vector( + masked_extract_all_slices_strided_impl_dispatch_vector); + + using dpctl::tensor::kernels::indexing::MaskExtractSomeSlicesStridedFactory; + dpctl::tensor::detail::DispatchVectorBuilder< + masked_extract_some_slices_strided_impl_fn_ptr_t, + MaskExtractSomeSlicesStridedFactory, dpctl::tensor::detail::num_types> + dvb2; + dvb2.populate_dispatch_vector( + masked_extract_some_slices_strided_impl_dispatch_vector); +} + +std::pair +py_extract(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray cumsum, + int axis_start, // axis_start <= mask_i < axis_end + int axis_end, + dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + std::vector const &depends) +{ + int src_nd = src.get_ndim(); + if ((axis_start < 0 || axis_end > src_nd || axis_start >= axis_end)) { + throw py::value_error("Specified axes_start and axes_end are invalid."); + } + int mask_span_sz = axis_end - axis_start; + + int dst_nd = dst.get_ndim(); + if (src_nd != dst_nd + (mask_span_sz - 1)) { + throw py::value_error("Number of dimensions of source and destination " + "arrays is not consistent"); + } + + if (!cumsum.is_c_contiguous() || cumsum.get_ndim() != 1) { + throw py::value_error("cumsum array must be a C-contiguous vector"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, cumsum, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + py::ssize_t cumsum_sz = cumsum.get_size(); + + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + bool same_ortho_dims(true); + size_t ortho_nelems(1); // number of orthogonal iterations + + for (auto i = 0; i < axis_start; ++i) { + auto src_sh_i = src_shape[i]; + ortho_nelems *= src_sh_i; + same_ortho_dims = same_ortho_dims && (src_sh_i == dst_shape[i]); + } + for (auto i = axis_end; i < src_nd; ++i) { + auto src_sh_i = src_shape[i]; + ortho_nelems *= src_sh_i; + same_ortho_dims = + same_ortho_dims && (src_sh_i == dst_shape[i - (mask_span_sz - 1)]); + } + + size_t masked_src_nelems(1); + size_t masked_dst_nelems(dst_shape[axis_start]); + for (auto i = axis_start; i < axis_end; ++i) { + masked_src_nelems *= 
src_shape[i]; + } + + // masked_dst_nelems is number of set elements in the mask, or last element + // in cumsum + if (!same_ortho_dims || + (masked_src_nelems != static_cast(cumsum_sz))) { + throw py::value_error("Inconsistent array dimensions"); + } + + // ensure that dst is sufficiently ample + auto dst_offsets = dst.get_minmax_offsets(); + // destination must be ample enough to accomodate all elements + { + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < static_cast(ortho_nelems * masked_dst_nelems)) { + throw py::value_error( + "Memory addressed by the destination array can not " + "accomodate all the " + "array elements."); + } + } + + // check that dst does not intersect with src, not with cumsum. + if (overlap(dst, cumsum) || overlap(dst, src)) { + throw py::value_error("Destination array overlaps with inputs"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + auto const &array_types = dpctl::tensor::detail::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + constexpr int int64_typeid = + static_cast(dpctl::tensor::detail::typenum_t::INT64); + if (cumsum_typeid != int64_typeid) { + throw py::value_error( + "Unexact data type of cumsum array, expecting 'int64'"); + } + + if (src_typeid != dst_typeid) { + throw py::value_error( + "Destination array must have the same elemental data types"); + } + + char *src_data_p = src.get_data(); + char *dst_data_p = dst.get_data(); + char *cumsum_data_p = cumsum.get_data(); + + auto src_shape_vec = src.get_shape_vector(); + auto src_strides_vec = src.get_strides_vector(); + + auto dst_shape_vec = dst.get_shape_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + sycl::event extract_ev; + std::vector host_task_events{}; + if (axis_start == 0 && axis_end == src_nd) { + // empty orthogonal directions + auto fn = + masked_extract_all_slices_strided_impl_dispatch_vector[src_typeid]; + + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + py::ssize_t *packed_src_shape_strides = + std::get<0>(ptr_size_event_tuple1); + sycl::event copy_src_shape_strides_ev = + std::get<2>(ptr_size_event_tuple1); + + assert(dst_shape_vec.size() == 1); + assert(dst_strides_vec.size() == 1); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_src_shape_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + extract_ev = fn(exec_q, cumsum_sz, src_data_p, cumsum_data_p, + dst_data_p, src_nd, packed_src_shape_strides, + dst_shape_vec[0], dst_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(extract_ev); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, packed_src_shape_strides] { + sycl::free(packed_src_shape_strides, ctx); + }); + }); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + else { + // non-empty othogonal directions + auto fn = + masked_extract_some_slices_strided_impl_dispatch_vector[src_typeid]; + + int masked_src_nd = mask_span_sz; + int ortho_nd = src_nd - masked_src_nd; + + using shT = std::vector; + + shT ortho_src_shape; + shT masked_src_shape; + shT ortho_src_strides; 
+ shT masked_src_strides; + _split_iteration_space(src_shape_vec, src_strides_vec, axis_start, + axis_end, ortho_src_shape, + masked_src_shape, // 4 vectors modified + ortho_src_strides, masked_src_strides); + + shT ortho_dst_shape; + shT masked_dst_shape; + shT ortho_dst_strides; + shT masked_dst_strides; + _split_iteration_space(dst_shape_vec, dst_strides_vec, axis_start, + axis_start + 1, ortho_dst_shape, + masked_dst_shape, // 4 vectors modified + ortho_dst_strides, masked_dst_strides); + + assert(ortho_src_shape.size() == static_cast(ortho_nd)); + assert(ortho_dst_shape.size() == static_cast(ortho_nd)); + assert(std::equal(ortho_src_shape.begin(), ortho_src_shape.end(), + ortho_dst_shape.begin())); + + std::vector simplified_ortho_shape; + std::vector simplified_ortho_src_strides; + std::vector simplified_ortho_dst_strides; + + const py::ssize_t *_shape = ortho_src_shape.data(); + const py::ssize_t *_src_strides = ortho_src_strides.data(); + const py::ssize_t *_dst_strides = ortho_dst_strides.data(); + constexpr py::ssize_t _itemsize = 1; // in elements + + constexpr bool is_c_contig = false; + constexpr bool is_f_contig = false; + + py::ssize_t ortho_src_offset(0); + py::ssize_t ortho_dst_offset(0); + + dpctl::tensor::py_internal::simplify_iteration_space( + ortho_nd, _shape, _src_strides, _itemsize, is_c_contig, is_f_contig, + _dst_strides, _itemsize, is_c_contig, is_f_contig, + simplified_ortho_shape, simplified_ortho_src_strides, + simplified_ortho_dst_strides, ortho_src_offset, ortho_dst_offset); + + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, simplified_ortho_shape, + simplified_ortho_src_strides, simplified_ortho_dst_strides); + py::ssize_t *packed_ortho_src_dst_shape_strides = + std::get<0>(ptr_size_event_tuple1); + sycl::event copy_shape_strides_ev1 = std::get<2>(ptr_size_event_tuple1); + + auto ptr_size_event_tuple2 = device_allocate_and_pack( + exec_q, host_task_events, masked_src_shape, masked_src_strides); + py::ssize_t *packed_masked_src_shape_strides = + std::get<0>(ptr_size_event_tuple2); + sycl::event copy_shape_strides_ev2 = std::get<2>(ptr_size_event_tuple2); + + assert(masked_dst_shape.size() == 1); + assert(masked_dst_strides.size() == 1); + + std::vector all_deps; + all_deps.reserve(depends.size() + 2); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shape_strides_ev1); + all_deps.push_back(copy_shape_strides_ev2); + + assert(all_deps.size() == depends.size() + 2); + + // OrthogIndexerT orthog_src_dst_indexer_, MaskedIndexerT + // masked_src_indexer_, MaskedIndexerT masked_dst_indexer_ + extract_ev = fn(exec_q, ortho_nelems, masked_src_nelems, src_data_p, + cumsum_data_p, dst_data_p, + // data to build orthog_src_dst_indexer + ortho_nd, packed_ortho_src_dst_shape_strides, + ortho_src_offset, ortho_dst_offset, + // data to build masked_src_indexer + masked_src_nd, packed_masked_src_shape_strides, + // data to build masked_dst_indexer, + masked_dst_shape[0], masked_dst_strides[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(extract_ev); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, packed_ortho_src_dst_shape_strides, + packed_masked_src_shape_strides] { + sycl::free(packed_ortho_src_dst_shape_strides, ctx); + sycl::free(packed_masked_src_shape_strides, ctx); + }); + }); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + + host_task_events.push_back(extract_ev); + + sycl::event 
py_obj_management_host_task_ev = dpctl::utils::keep_args_alive( + exec_q, {src, cumsum, dst}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, extract_ev); +} + +// Masked placement + +using dpctl::tensor::kernels::indexing:: + masked_place_all_slices_strided_impl_fn_ptr_t; + +static masked_place_all_slices_strided_impl_fn_ptr_t + masked_place_all_slices_strided_impl_dispatch_vector + [dpctl::tensor::detail::num_types]; + +using dpctl::tensor::kernels::indexing:: + masked_place_some_slices_strided_impl_fn_ptr_t; + +static masked_place_some_slices_strided_impl_fn_ptr_t + masked_place_some_slices_strided_impl_dispatch_vector + [dpctl::tensor::detail::num_types]; + +void populate_masked_place_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::indexing::MaskPlaceAllSlicesStridedFactory; + dpctl::tensor::detail::DispatchVectorBuilder< + masked_place_all_slices_strided_impl_fn_ptr_t, + MaskPlaceAllSlicesStridedFactory, dpctl::tensor::detail::num_types> + dvb1; + dvb1.populate_dispatch_vector( + masked_place_all_slices_strided_impl_dispatch_vector); + + using dpctl::tensor::kernels::indexing::MaskPlaceSomeSlicesStridedFactory; + dpctl::tensor::detail::DispatchVectorBuilder< + masked_place_some_slices_strided_impl_fn_ptr_t, + MaskPlaceSomeSlicesStridedFactory, dpctl::tensor::detail::num_types> + dvb2; + dvb2.populate_dispatch_vector( + masked_place_some_slices_strided_impl_dispatch_vector); +} + +/* + * @brief Copy dst[i, ortho_id] = rhs[cumsum[i] - 1, ortho_id] if cumsum[i] == + * ((i > 0) ? cumsum[i-1] + 1 : 1) + */ +std::pair +py_place(dpctl::tensor::usm_ndarray dst, + dpctl::tensor::usm_ndarray cumsum, + int axis_start, // axis_start <= mask_i < axis_end + int axis_end, + dpctl::tensor::usm_ndarray rhs, + sycl::queue exec_q, + std::vector const &depends) +{ + int dst_nd = dst.get_ndim(); + if ((axis_start < 0 || axis_end > dst_nd || axis_start >= axis_end)) { + throw py::value_error("Specified axes_start and axes_end are invalid."); + } + int mask_span_sz = axis_end - axis_start; + + int rhs_nd = rhs.get_ndim(); + if (dst_nd != rhs_nd + (mask_span_sz - 1)) { + throw py::value_error("Number of dimensions of source and destination " + "arrays is not consistent"); + } + + if (!cumsum.is_c_contiguous() || cumsum.get_ndim() != 1) { + throw py::value_error("cumsum array must be a C-contiguous vector"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst, cumsum, rhs})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + py::ssize_t cumsum_sz = cumsum.get_size(); + + const py::ssize_t *dst_shape = dst.get_shape_raw(); + const py::ssize_t *rhs_shape = rhs.get_shape_raw(); + bool same_ortho_dims(true); + size_t ortho_nelems(1); // number of orthogonal iterations + + for (auto i = 0; i < axis_start; ++i) { + auto dst_sh_i = dst_shape[i]; + ortho_nelems *= dst_sh_i; + same_ortho_dims = same_ortho_dims && (dst_sh_i == rhs_shape[i]); + } + for (auto i = axis_end; i < dst_nd; ++i) { + auto dst_sh_i = dst_shape[i]; + ortho_nelems *= dst_sh_i; + same_ortho_dims = + same_ortho_dims && (dst_sh_i == rhs_shape[i - (mask_span_sz - 1)]); + } + + size_t masked_dst_nelems(1); + for (auto i = axis_start; i < axis_end; ++i) { + masked_dst_nelems *= dst_shape[i]; + } + + if (!same_ortho_dims || + (masked_dst_nelems != static_cast(cumsum_sz))) { + throw py::value_error("Inconsistent array dimensions"); + } + + // ensure that dst is sufficiently ample + auto dst_offsets = dst.get_minmax_offsets(); + // destination must be ample 
enough to accomodate all elements + { + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < static_cast(ortho_nelems * masked_dst_nelems)) { + throw py::value_error( + "Memory addressed by the destination array can not " + "accomodate all the " + "array elements."); + } + } + + // check that dst does not intersect with src, not with cumsum. + if (overlap(dst, rhs) || overlap(dst, cumsum)) { + throw py::value_error("Destination array overlaps with inputs"); + } + + int dst_typenum = dst.get_typenum(); + int rhs_typenum = rhs.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + auto const &array_types = dpctl::tensor::detail::usm_ndarray_types(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + int rhs_typeid = array_types.typenum_to_lookup_id(rhs_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + constexpr int int64_typeid = + static_cast(dpctl::tensor::detail::typenum_t::INT64); + if (cumsum_typeid != int64_typeid) { + throw py::value_error( + "Unexact data type of cumsum array, expecting 'int64'"); + } + + // FIXME: should types be the same? + if (dst_typeid != rhs_typeid) { + throw py::value_error( + "Destination array must have the same elemental data types"); + } + + char *dst_data_p = dst.get_data(); + char *rhs_data_p = rhs.get_data(); + char *cumsum_data_p = cumsum.get_data(); + + auto dst_shape_vec = dst.get_shape_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + auto rhs_shape_vec = rhs.get_shape_vector(); + auto rhs_strides_vec = rhs.get_strides_vector(); + + sycl::event extract_ev; + std::vector host_task_events{}; + if (axis_start == 0 && axis_end == dst_nd) { + // empty orthogonal directions + auto fn = + masked_place_all_slices_strided_impl_dispatch_vector[dst_typeid]; + + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, dst_shape_vec, dst_strides_vec); + py::ssize_t *packed_dst_shape_strides = + std::get<0>(ptr_size_event_tuple1); + sycl::event copy_dst_shape_strides_ev = + std::get<2>(ptr_size_event_tuple1); + + assert(rhs_shape_vec.size() == 1); + assert(rhs_strides_vec.size() == 1); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_dst_shape_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + extract_ev = fn(exec_q, cumsum_sz, dst_data_p, cumsum_data_p, + rhs_data_p, dst_nd, packed_dst_shape_strides, + rhs_shape_vec[0], rhs_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(extract_ev); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, packed_dst_shape_strides] { + sycl::free(packed_dst_shape_strides, ctx); + }); + }); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + else { + // non-empty othogonal directions + auto fn = + masked_place_some_slices_strided_impl_dispatch_vector[dst_typeid]; + + int masked_dst_nd = mask_span_sz; + int ortho_nd = dst_nd - masked_dst_nd; + + using shT = std::vector; + + shT ortho_dst_shape; + shT masked_dst_shape; + shT ortho_dst_strides; + shT masked_dst_strides; + _split_iteration_space(dst_shape_vec, dst_strides_vec, axis_start, + axis_end, ortho_dst_shape, + masked_dst_shape, // 4 vectors modified + ortho_dst_strides, masked_dst_strides); + + shT ortho_rhs_shape; + shT masked_rhs_shape; + shT ortho_rhs_strides; + shT masked_rhs_strides; + 
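+        // rhs contributes a single masked dimension (axis_start); split it
+        // the same way into orthogonal and masked parts.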
_split_iteration_space(rhs_shape_vec, rhs_strides_vec, axis_start, + axis_start + 1, ortho_rhs_shape, + masked_rhs_shape, // 4 vectors modified + ortho_rhs_strides, masked_rhs_strides); + + assert(ortho_dst_shape.size() == static_cast(ortho_nd)); + assert(ortho_rhs_shape.size() == static_cast(ortho_nd)); + assert(std::equal(ortho_dst_shape.begin(), ortho_dst_shape.end(), + ortho_rhs_shape.begin())); + + std::vector simplified_ortho_shape; + std::vector simplified_ortho_dst_strides; + std::vector simplified_ortho_rhs_strides; + + const py::ssize_t *_shape = ortho_dst_shape.data(); + const py::ssize_t *_dst_strides = ortho_dst_strides.data(); + const py::ssize_t *_rhs_strides = ortho_rhs_strides.data(); + constexpr py::ssize_t _itemsize = 1; // in elements + + constexpr bool is_c_contig = false; + constexpr bool is_f_contig = false; + + py::ssize_t ortho_dst_offset(0); + py::ssize_t ortho_rhs_offset(0); + + dpctl::tensor::py_internal::simplify_iteration_space( + ortho_nd, _shape, _dst_strides, _itemsize, is_c_contig, is_f_contig, + _rhs_strides, _itemsize, is_c_contig, is_f_contig, + simplified_ortho_shape, simplified_ortho_dst_strides, + simplified_ortho_rhs_strides, ortho_dst_offset, ortho_rhs_offset); + + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, simplified_ortho_shape, + simplified_ortho_dst_strides, simplified_ortho_rhs_strides); + py::ssize_t *packed_ortho_dst_rhs_shape_strides = + std::get<0>(ptr_size_event_tuple1); + sycl::event copy_shape_strides_ev1 = std::get<2>(ptr_size_event_tuple1); + + auto ptr_size_event_tuple2 = device_allocate_and_pack( + exec_q, host_task_events, masked_dst_shape, masked_dst_strides); + py::ssize_t *packed_masked_dst_shape_strides = + std::get<0>(ptr_size_event_tuple2); + sycl::event copy_shape_strides_ev2 = std::get<2>(ptr_size_event_tuple2); + + assert(masked_rhs_shape.size() == 1); + assert(masked_rhs_strides.size() == 1); + + std::vector all_deps; + all_deps.reserve(depends.size() + 2); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shape_strides_ev1); + all_deps.push_back(copy_shape_strides_ev2); + + assert(all_deps.size() == depends.size() + 2); + + extract_ev = fn(exec_q, ortho_nelems, masked_dst_nelems, dst_data_p, + cumsum_data_p, rhs_data_p, + // data to build orthog_dst_rhs_indexer + ortho_nd, packed_ortho_dst_rhs_shape_strides, + ortho_dst_offset, ortho_rhs_offset, + // data to build masked_dst_indexer + masked_dst_nd, packed_masked_dst_shape_strides, + // data to build masked_dst_indexer, + masked_rhs_shape[0], masked_rhs_strides[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(extract_ev); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, packed_ortho_dst_rhs_shape_strides, + packed_masked_dst_shape_strides] { + sycl::free(packed_ortho_dst_rhs_shape_strides, ctx); + sycl::free(packed_masked_dst_shape_strides, ctx); + }); + }); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + + host_task_events.push_back(extract_ev); + + sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive( + exec_q, {dst, cumsum, rhs}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, extract_ev); +} + +// Non-zero + +std::pair py_nonzero( + dpctl::tensor::usm_ndarray cumsum, // int64 input array, 1D, C-contiguous + dpctl::tensor::usm_ndarray indexes, // int64 2D output array, C-contiguous + std::vector + mask_shape, // shape of array from which cumsum 
was computed
+    sycl::queue exec_q,
+    std::vector<sycl::event> const &depends)
+{
+    if (!dpctl::utils::queues_are_compatible(exec_q, {cumsum, indexes})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    int cumsum_nd = cumsum.get_ndim();
+    if (cumsum_nd != 1 || !cumsum.is_c_contiguous()) {
+        throw py::value_error("Cumsum array must be a C-contiguous vector");
+    }
+
+    int indexes_nd = indexes.get_ndim();
+    if (indexes_nd != 2 || !indexes.is_c_contiguous()) {
+        throw py::value_error("Index array must be a C-contiguous matrix");
+    }
+
+    size_t _ndim = mask_shape.size();
+    if (_ndim > std::numeric_limits<int>::max()) {
+        throw py::value_error("Shape is too large");
+    }
+    int ndim = static_cast<int>(_ndim);
+
+    const py::ssize_t *indexes_shape = indexes.get_shape_raw();
+
+    if (ndim != indexes_shape[0]) {
+        throw py::value_error(
+            "Length of shape must equal width of index matrix");
+    }
+
+    auto cumsum_sz = cumsum.get_size();
+    py::ssize_t shape_nelems =
+        std::accumulate(mask_shape.begin(), mask_shape.end(), py::ssize_t(1),
+                        std::multiplies<py::ssize_t>());
+
+    if (cumsum_sz != shape_nelems) {
+        throw py::value_error("Shape and cumsum size are not consistent");
+    }
+
+    py::ssize_t nz_elems = indexes_shape[1];
+
+    int indexes_typenum = indexes.get_typenum();
+    auto const &array_types = dpctl::tensor::detail::usm_ndarray_types();
+    int indexes_typeid = array_types.typenum_to_lookup_id(indexes_typenum);
+
+    int cumsum_typenum = cumsum.get_typenum();
+    int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum);
+
+    // cumsum must be int64_t only
+    constexpr int int64_typeid =
+        static_cast<int>(dpctl::tensor::detail::typenum_t::INT64);
+    if (cumsum_typeid != int64_typeid || indexes_typeid != int64_typeid) {
+        throw py::value_error(
+            "Cumulative sum array and index array must have int64 data-type");
+    }
+
+    if (cumsum_sz == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    if (overlap(cumsum, indexes)) {
+        throw py::value_error("Arrays are expected to have no memory overlap");
+    }
+
+    // ensure that dst is sufficiently ample
+    auto indexes_offsets = indexes.get_minmax_offsets();
+    // destination must be ample enough to accommodate all elements
+    {
+        size_t range =
+            static_cast<size_t>(indexes_offsets.second - indexes_offsets.first);
+        if (range + 1 < static_cast<size_t>(nz_elems * _ndim)) {
+            throw py::value_error(
+                "Memory addressed by the destination array can not "
+                "accommodate all the array elements.");
+        }
+    }
+
+    std::vector<sycl::event> host_task_events;
+    host_task_events.reserve(2);
+
+    auto mask_shape_copying_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events, mask_shape);
+    py::ssize_t *src_shape_device_ptr = std::get<0>(mask_shape_copying_tuple);
+    sycl::event copy_ev = std::get<2>(mask_shape_copying_tuple);
+
+    if (src_shape_device_ptr == nullptr) {
+        sycl::event::wait(host_task_events);
+        throw std::runtime_error("Device allocation failed");
+    }
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+
+    all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+    all_deps.push_back(copy_ev);
+
+    using dpctl::tensor::kernels::indexing::non_zero_indexes_impl;
+
+    sycl::event non_zero_indexes_ev =
+        non_zero_indexes_impl<std::int64_t, std::int64_t>(
+            exec_q, cumsum_sz, nz_elems, ndim, cumsum.get_data(),
+            indexes.get_data(), src_shape_device_ptr, all_deps);
+
+    sycl::event temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(non_zero_indexes_ev);
+        auto ctx = exec_q.get_context();
+        cgh.host_task([ctx, src_shape_device_ptr] {
sycl::free(src_shape_device_ptr, ctx); + }); + }); + host_task_events.push_back(temporaries_cleanup_ev); + + sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive( + exec_q, {cumsum, indexes}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, + temporaries_cleanup_ev); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.hpp new file mode 100644 index 0000000000..f165fe5118 --- /dev/null +++ b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.hpp @@ -0,0 +1,84 @@ +//===-- boolean_advanced_indexing.hpp - --*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file declares Python API for implementation functions of +/// dpctl.tensor.place, dpctl.tensor.extract, and dpctl.tensor.nonzero +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void populate_mask_positions_dispatch_vectors(void); + +extern size_t py_mask_positions(dpctl::tensor::usm_ndarray mask, + dpctl::tensor::usm_ndarray cumsum, + sycl::queue exec_q, + std::vector const &depends = {}); + +extern std::pair +py_extract(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray cumsum, + int axis_start, // axis_start <= mask_i < axis_end + int axis_end, + dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + std::vector const &depends = {}); + +extern void populate_masked_extract_dispatch_vectors(void); + +extern std::pair +py_place(dpctl::tensor::usm_ndarray dst, + dpctl::tensor::usm_ndarray cumsum, + int axis_start, // axis_start <= mask_i < axis_end + int axis_end, + dpctl::tensor::usm_ndarray rhs, + sycl::queue exec_q, + std::vector const &depends = {}); + +extern void populate_masked_place_dispatch_vectors(void); + +extern std::pair py_nonzero( + dpctl::tensor::usm_ndarray cumsum, // int64 input array, 1D, C-contiguous + dpctl::tensor::usm_ndarray indexes, // int64 2D output array, C-contiguous + std::vector + mask_shape, // shape of array from which cumsum was computed + sycl::queue exec_q, + std::vector const &depends = {}); + +/* @brief Check if memory regions underlying two arrays have an overlap */ +extern bool overlap(dpctl::tensor::usm_ndarray ar1, + dpctl::tensor::usm_ndarray ar2); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp b/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp index be4a35fb90..7eb7c8f8d6 100644 --- a/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp +++ 
b/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp @@ -39,6 +39,86 @@ namespace py = pybind11; using dpctl::tensor::c_contiguous_strides; using dpctl::tensor::f_contiguous_strides; +void simplify_iteration_space_1(int &nd, + const py::ssize_t *&shape, + const py::ssize_t *&strides, + py::ssize_t itemsize, + bool is_c_contig, + bool is_f_contig, + std::vector &simplified_shape, + std::vector &simplified_strides, + py::ssize_t &offset) +{ + if (nd > 1) { + // Simplify iteration space to reduce dimensionality + // and improve access pattern + simplified_shape.reserve(nd); + for (int i = 0; i < nd; ++i) { + simplified_shape.push_back(shape[i]); + } + + simplified_strides.reserve(nd); + if (strides == nullptr) { + if (is_c_contig) { + simplified_strides = c_contiguous_strides(nd, shape, itemsize); + } + else if (is_f_contig) { + simplified_strides = f_contiguous_strides(nd, shape, itemsize); + } + else { + throw std::runtime_error( + "Array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + for (int i = 0; i < nd; ++i) { + simplified_strides.push_back(strides[i]); + } + } + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_strides.size() == static_cast(nd)); + int contracted_nd = simplify_iteration_stride( + nd, simplified_shape.data(), simplified_strides.data(), + offset // modified by reference + ); + simplified_shape.resize(contracted_nd); + simplified_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + // Populate vectors + simplified_shape.reserve(nd); + simplified_shape.push_back(shape[0]); + + simplified_strides.reserve(nd); + + if (strides == nullptr) { + if (is_c_contig) { + simplified_strides.push_back(itemsize); + } + else if (is_f_contig) { + simplified_strides.push_back(itemsize); + } + else { + throw std::runtime_error( + "Array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + simplified_strides.push_back(strides[0]); + } + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_strides.size() == static_cast(nd)); + } + shape = const_cast(simplified_shape.data()); + strides = const_cast(simplified_strides.data()); +} + void simplify_iteration_space(int &nd, const py::ssize_t *&shape, const py::ssize_t *&src_strides, @@ -173,6 +253,195 @@ void simplify_iteration_space(int &nd, const_cast(simplified_dst_strides.data()); } +void simplify_iteration_space_3( + int &nd, + const py::ssize_t *&shape, + // src1 + const py::ssize_t *&src1_strides, + py::ssize_t src1_itemsize, + bool is_src1_c_contig, + bool is_src1_f_contig, + // src2 + const py::ssize_t *&src2_strides, + py::ssize_t src2_itemsize, + bool is_src2_c_contig, + bool is_src2_f_contig, + // dst + const py::ssize_t *&dst_strides, + py::ssize_t dst_itemsize, + bool is_dst_c_contig, + bool is_dst_f_contig, + // output + std::vector &simplified_shape, + std::vector &simplified_src1_strides, + std::vector &simplified_src2_strides, + std::vector &simplified_dst_strides, + py::ssize_t &src1_offset, + py::ssize_t &src2_offset, + py::ssize_t &dst_offset) +{ + if (nd > 1) { + // Simplify iteration space to reduce dimensionality + // and improve access pattern + simplified_shape.reserve(nd); + for (int i = 0; i < nd; ++i) { + simplified_shape.push_back(shape[i]); + } + + simplified_src1_strides.reserve(nd); + simplified_src2_strides.reserve(nd); + simplified_dst_strides.reserve(nd); + if (src1_strides == nullptr) { + if (is_src1_c_contig) { + simplified_src1_strides = + 
c_contiguous_strides(nd, shape, src1_itemsize); + } + else if (is_src1_f_contig) { + simplified_src1_strides = + f_contiguous_strides(nd, shape, src1_itemsize); + } + else { + throw std::runtime_error( + "Source array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + for (int i = 0; i < nd; ++i) { + simplified_src1_strides.push_back(src1_strides[i]); + } + } + if (src2_strides == nullptr) { + if (is_src2_c_contig) { + simplified_src2_strides = + c_contiguous_strides(nd, shape, src2_itemsize); + } + else if (is_src2_f_contig) { + simplified_src2_strides = + f_contiguous_strides(nd, shape, src2_itemsize); + } + else { + throw std::runtime_error( + "Source array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + for (int i = 0; i < nd; ++i) { + simplified_src2_strides.push_back(src2_strides[i]); + } + } + if (dst_strides == nullptr) { + if (is_dst_c_contig) { + simplified_dst_strides = + c_contiguous_strides(nd, shape, dst_itemsize); + } + else if (is_dst_f_contig) { + simplified_dst_strides = + f_contiguous_strides(nd, shape, dst_itemsize); + } + else { + throw std::runtime_error( + "Destination array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + for (int i = 0; i < nd; ++i) { + simplified_dst_strides.push_back(dst_strides[i]); + } + } + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_src1_strides.size() == static_cast(nd)); + assert(simplified_src2_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + int contracted_nd = simplify_iteration_three_strides( + nd, simplified_shape.data(), simplified_src1_strides.data(), + simplified_src2_strides.data(), simplified_dst_strides.data(), + src1_offset, // modified by reference + src2_offset, // modified by reference + dst_offset // modified by reference + ); + simplified_shape.resize(contracted_nd); + simplified_src1_strides.resize(contracted_nd); + simplified_src2_strides.resize(contracted_nd); + simplified_dst_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + // Populate vectors + simplified_shape.reserve(nd); + simplified_shape.push_back(shape[0]); + + simplified_src1_strides.reserve(nd); + simplified_src2_strides.reserve(nd); + simplified_dst_strides.reserve(nd); + + if (src1_strides == nullptr) { + if (is_src1_c_contig) { + simplified_src1_strides.push_back(src1_itemsize); + } + else if (is_src1_f_contig) { + simplified_src1_strides.push_back(src1_itemsize); + } + else { + throw std::runtime_error( + "Source array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + simplified_src1_strides.push_back(src1_strides[0]); + } + if (src2_strides == nullptr) { + if (is_src2_c_contig) { + simplified_src2_strides.push_back(src2_itemsize); + } + else if (is_src2_f_contig) { + simplified_src2_strides.push_back(src2_itemsize); + } + else { + throw std::runtime_error( + "Source array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + simplified_src2_strides.push_back(src2_strides[0]); + } + if (dst_strides == nullptr) { + if (is_dst_c_contig) { + simplified_dst_strides.push_back(dst_itemsize); + } + else if (is_dst_f_contig) { + simplified_dst_strides.push_back(dst_itemsize); + } + else { + throw std::runtime_error( + "Destination array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + 
simplified_dst_strides.push_back(dst_strides[0]); + } + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_src1_strides.size() == static_cast(nd)); + assert(simplified_src2_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + } + shape = const_cast(simplified_shape.data()); + src1_strides = + const_cast(simplified_src1_strides.data()); + src2_strides = + const_cast(simplified_src2_strides.data()); + dst_strides = + const_cast(simplified_dst_strides.data()); +} + } // namespace py_internal } // namespace tensor } // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp b/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp index 515e795d20..ec0cc286d4 100644 --- a/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp +++ b/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp @@ -36,6 +36,16 @@ namespace py_internal namespace py = pybind11; +void simplify_iteration_space_1(int &, + const py::ssize_t *&, + const py::ssize_t *&, + py::ssize_t, + bool, + bool, + std::vector &, + std::vector &, + py::ssize_t &); + void simplify_iteration_space(int &, const py::ssize_t *&, const py::ssize_t *&, @@ -52,6 +62,32 @@ void simplify_iteration_space(int &, py::ssize_t &, py::ssize_t &); +void simplify_iteration_space_3(int &, + const py::ssize_t *&, + // src1 + const py::ssize_t *&, + py::ssize_t, + bool, + bool, + // src2 + const py::ssize_t *&, + py::ssize_t, + bool, + bool, + // dst + const py::ssize_t *&, + py::ssize_t, + bool, + bool, + // output + std::vector &, + std::vector &, + std::vector &, + std::vector &, + py::ssize_t &, + py::ssize_t &, + py::ssize_t &); + } // namespace py_internal } // namespace tensor } // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp index e164be2421..2e9e981a37 100644 --- a/dpctl/tensor/libtensor/source/tensor_py.cpp +++ b/dpctl/tensor/libtensor/source/tensor_py.cpp @@ -33,6 +33,7 @@ #include "dpctl4pybind11.hpp" +#include "boolean_advanced_indexing.hpp" #include "copy_and_cast_usm_to_usm.hpp" #include "copy_for_reshape.hpp" #include "copy_numpy_ndarray_into_usm_ndarray.hpp" @@ -75,6 +76,12 @@ using dpctl::tensor::py_internal::usm_ndarray_full; using dpctl::tensor::py_internal::usm_ndarray_put; using dpctl::tensor::py_internal::usm_ndarray_take; +using dpctl::tensor::py_internal::overlap; +using dpctl::tensor::py_internal::py_extract; +using dpctl::tensor::py_internal::py_mask_positions; +using dpctl::tensor::py_internal::py_nonzero; +using dpctl::tensor::py_internal::py_place; + /* ================ Eye ================== */ using dpctl::tensor::py_internal::usm_ndarray_eye; @@ -105,6 +112,10 @@ void init_dispatch_vectors(void) init_eye_ctor_dispatch_vectors(); init_triul_ctor_dispatch_vectors(); + populate_mask_positions_dispatch_vectors(); + populate_masked_extract_dispatch_vectors(); + populate_masked_place_dispatch_vectors(); + return; } @@ -252,4 +263,24 @@ PYBIND11_MODULE(_tensor_impl, m) m.def("_triu", triu_fn, "Triu helper function.", py::arg("src"), py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), py::arg("depends") = py::list()); + + m.def("mask_positions", &py_mask_positions, "", py::arg("mask"), + py::arg("cumsum"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + m.def("_extract", &py_extract, "", py::arg("src"), py::arg("cumsum"), + py::arg("axis_start"), py::arg("axis_end"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = 
py::list()); + + m.def("_array_overlap", &overlap, + "Determines if the memory regions indexed by each array overlap", + py::arg("array1"), py::arg("array2")); + + m.def("_place", &py_place, "", py::arg("dst"), py::arg("cumsum"), + py::arg("axis_start"), py::arg("axis_end"), py::arg("rhs"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + m.def("_nonzero", &py_nonzero, "", py::arg("cumsum"), py::arg("indexes"), + py::arg("mask_shape"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); } From c1f00812b35becd29c8af54df2e049dd7033f136 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Mar 2023 10:31:52 -0600 Subject: [PATCH 2/8] Added missing include --- .../libtensor/include/kernels/boolean_advanced_indexing.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp index b42b7869d2..71313e9a27 100644 --- a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp @@ -32,6 +32,7 @@ #include #include "utils/strided_iters.hpp" +#include "utils/type_dispatch.hpp" namespace dpctl { From 849a3ea2025277c7128b4e72145b4308be68153b Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Mar 2023 11:18:44 -0600 Subject: [PATCH 3/8] Hooked up boolean indexing, first attempt --- dpctl/tensor/__init__.py | 5 +- dpctl/tensor/_copy_utils.py | 147 +++++++++++++++++++--------- dpctl/tensor/_indexing_functions.py | 118 ++++++++++++++++++++++ dpctl/tensor/_usmarray.pyx | 14 +-- 4 files changed, 229 insertions(+), 55 deletions(-) diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index d21958b4fa..2a2afd60a4 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -58,7 +58,7 @@ ) from dpctl.tensor._device import Device from dpctl.tensor._dlpack import from_dlpack -from dpctl.tensor._indexing_functions import put, take +from dpctl.tensor._indexing_functions import extract, nonzero, place, put, take from dpctl.tensor._manipulation_functions import ( broadcast_arrays, broadcast_to, @@ -115,6 +115,9 @@ "squeeze", "take", "put", + "extract", + "place", + "nonzero", "from_numpy", "to_numpy", "asnumpy", diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py index 597db87c49..72b9e0a021 100644 --- a/dpctl/tensor/_copy_utils.py +++ b/dpctl/tensor/_copy_utils.py @@ -389,45 +389,75 @@ def astype(usm_ary, newdtype, order="K", casting="unsafe", copy=True): return R -def _mock_extract(ary, ary_mask, p): - exec_q = dpctl.utils.get_execution_queue( - ( - ary.sycl_queue, - ary_mask.sycl_queue, +def _extract_impl(ary, ary_mask, axis=0): + """Extract elements of ary by applying mask starting from slot + dimension axis""" + if not isinstance(ary, dpt.usm_ndarray): + raise TypeError( + f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}" + ) + if not isinstance(ary_mask, dpt.usm_ndarray): + raise TypeError( + f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary_mask)}" ) + exec_q = dpctl.utils.get_execution_queue( + (ary.sycl_queue, ary_mask.sycl_queue) ) if exec_q is None: raise dpctl.utils.ExecutionPlacementError( - "Can not automatically determine where to allocate the " - "result or performance execution. " - "Use `usm_ndarray.to_device` method to migrate data to " - "be associated with the same queue." + "arrays have different associated queues. " + "Use `Y.to_device(X.device)` to migrate." 
)
-
-    res_usm_type = dpctl.utils.get_coerced_usm_type(
-        (
-            ary.usm_type,
-            ary_mask.usm_type,
        )
+    ary_nd = ary.ndim
+    pp = normalize_axis_index(operator.index(axis), ary_nd)
+    mask_nd = ary_mask.ndim
+    if pp < 0 or pp + mask_nd > ary_nd:
+        raise ValueError(
+            "Parameter axis is inconsistent with input array dimensions"
        )
+    mask_nelems = ary_mask.size
+    cumsum = dpt.empty(mask_nelems, dtype=dpt.int64, device=ary_mask.device)
+    exec_q = cumsum.sycl_queue
+    mask_count = ti.mask_positions(ary_mask, cumsum, sycl_queue=exec_q)
+    dst_shape = ary.shape[:pp] + (mask_count,) + ary.shape[pp + mask_nd :]
+    dst = dpt.empty(
+        dst_shape, dtype=ary.dtype, usm_type=ary.usm_type, device=ary.device
    )
-    ary_np = dpt.asnumpy(ary)
-    mask_np = dpt.asnumpy(ary_mask)
-    res_np = ary_np[(slice(None),) * p + (mask_np,)]
-    res = dpt.empty(
-        res_np.shape, dtype=ary.dtype, usm_type=res_usm_type, sycl_queue=exec_q
+    hev, _ = ti._extract(
+        src=ary,
+        cumsum=cumsum,
+        axis_start=pp,
+        axis_end=pp + mask_nd,
+        dst=dst,
+        sycl_queue=exec_q,
    )
-    res[...] = res_np
-    return res
+    hev.wait()
+    return dst


-def _mock_nonzero(ary):
+def _nonzero_impl(ary):
     if not isinstance(ary, dpt.usm_ndarray):
-        raise TypeError
-    q = ary.sycl_queue
+        raise TypeError(
+            f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}"
+        )
+    exec_q = ary.sycl_queue
     usm_type = ary.usm_type
-    ary_np = dpt.asnumpy(ary)
-    nz = ary_np.nonzero()
-    return tuple(dpt.asarray(i, usm_type=usm_type, sycl_queue=q) for i in nz)
+    mask_nelems = ary.size
+    cumsum = dpt.empty(
+        mask_nelems, dtype=dpt.int64, sycl_queue=exec_q, order="C"
+    )
+    mask_count = ti.mask_positions(ary, cumsum, sycl_queue=exec_q)
+    indexes = dpt.empty(
+        (ary.ndim, mask_count),
+        dtype=cumsum.dtype,
+        usm_type=usm_type,
+        sycl_queue=exec_q,
+        order="C",
+    )
+    hev, _ = ti._nonzero(cumsum, indexes, ary.shape, exec_q)
+    res = tuple(indexes[i, :] for i in range(ary.ndim))
+    hev.wait()
+    return res


 def _take_multi_index(ary, inds, p):
@@ -473,34 +503,57 @@ def _take_multi_index(ary, inds, p):
     return res


-def _mock_place(ary, ary_mask, p, vals):
+def _place_impl(ary, ary_mask, vals, axis=0):
+    """Place values from vals into ary at positions where mask is True,
+    with the mask applied starting from dimension axis"""
     if not isinstance(ary, dpt.usm_ndarray):
-        raise TypeError
+        raise TypeError(
+            f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}"
+        )
     if not isinstance(ary_mask, dpt.usm_ndarray):
-        raise TypeError
+        raise TypeError(
+            f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary_mask)}"
+        )
+    if not isinstance(vals, dpt.usm_ndarray):
+        raise TypeError(
+            f"Expecting type dpctl.tensor.usm_ndarray, got {type(vals)}"
+        )
     exec_q = dpctl.utils.get_execution_queue(
-        (ary.sycl_queue, ary_mask.sycl_queue)
+        (ary.sycl_queue, ary_mask.sycl_queue, vals.sycl_queue)
    )
-    if exec_q is not None and isinstance(vals, dpt.usm_ndarray):
-        exec_q = dpctl.utils.get_execution_queue((exec_q, vals.sycl_queue))
     if exec_q is None:
         raise dpctl.utils.ExecutionPlacementError(
-            "Can not automatically determine where to allocate the "
-            "result or performance execution. "
-            "Use `usm_ndarray.to_device` method to migrate data to "
-            "be associated with the same queue."
+            "arrays have different associated queues. "
+            "Use `Y.to_device(X.device)` to migrate." 
) - - ary_np = dpt.asnumpy(ary) - mask_np = dpt.asnumpy(ary_mask) - if isinstance(vals, dpt.usm_ndarray) or hasattr( - vals, "__sycl_usm_array_interface__" - ): - vals_np = dpt.asnumpy(vals) + ary_nd = ary.ndim + pp = normalize_axis_index(operator.index(axis), ary_nd) + mask_nd = ary_mask.ndim + if pp < 0 or pp + mask_nd > ary_nd: + raise ValueError( + "Parameter p is inconsistent with input array dimensions" + ) + mask_nelems = ary_mask.size + cumsum = dpt.empty(mask_nelems, dtype=dpt.int64, device=ary_mask.device) + exec_q = cumsum.sycl_queue + mask_count = ti.mask_positions(ary_mask, cumsum, sycl_queue=exec_q) + expected_vals_shape = ( + ary.shape[:pp] + (mask_count,) + ary.shape[pp + mask_nd :] + ) + if vals.dtype == ary.dtype: + rhs = vals else: - vals_np = vals - ary_np[(slice(None),) * p + (mask_np,)] = vals_np - ary[...] = ary_np + rhs = dpt.astype(vals, ary.dtype) + rhs = dpt.broadcast_to(rhs, expected_vals_shape) + hev, _ = ti._place( + dst=ary, + cumsum=cumsum, + axis_start=pp, + axis_end=pp + mask_nd, + rhs=rhs, + sycl_queue=exec_q, + ) + hev.wait() return diff --git a/dpctl/tensor/_indexing_functions.py b/dpctl/tensor/_indexing_functions.py index 12f7b2d72e..01f1a2370a 100644 --- a/dpctl/tensor/_indexing_functions.py +++ b/dpctl/tensor/_indexing_functions.py @@ -23,6 +23,8 @@ import dpctl.tensor as dpt from dpctl.tensor._tensor_impl import _put, _take +from ._copy_utils import _extract_impl, _nonzero_impl, _place_impl + def take(x, indices, /, *, axis=None, mode="clip"): if not isinstance(x, dpt.usm_ndarray): @@ -175,3 +177,119 @@ def put(x, indices, vals, /, *, axis=None, mode="clip"): hev, _ = _put(x, indices, vals, axis, mode, sycl_queue=exec_q) hev.wait() + + +def extract(condition, arr): + """extract(condition, arr) + + Returns the elements of an array that satisfies the condition. + + If `condition` is boolean :func:``dpctl.tensor.extract`` is + equivalent to ``arr[condition]``. + + Note that :func:``dpctl.tensor.place`` does the opposite of + :func:``dpctl.tensor.extract``. + + Args: + conditions: usm_ndarray + An array whose non-zero or True entries indicate the element + of `arr` to extract. + arr: usm_ndarray + Input array of the same size as `condition`. + + Returns: + extract: usm_ndarray + Rank 1 array of values from `arr` where `condition` is True. + """ + if not isinstance(condition, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(condition)}" + ) + if not isinstance(arr, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(arr)}" + ) + exec_q = dpctl.utils.get_execution_queue( + ( + condition.sycl_queue, + arr.sycl_queue, + ) + ) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError + if condition.shape != arr.shape: + raise ValueError("Arrays are not of the same size") + return _extract_impl(arr, condition) + + +def place(arr, mask, vals): + """place(arr, mask, vals) + + Change elements of an array based on conditional and input values. + + If `mask` is boolean :func:``dpctl.tensor.place`` is + equivalent to ``arr[condition] = vals``. + + Args: + arr: usm_ndarray + Array to put data into. + mask: usm_ndarray + Boolean mask array. Must have the same size as `arr`. + vals: usm_ndarray + Values to put into `arr`. Only the first N elements are + used, where N is the number of True values in `mask`. If + `vals` is smaller than N, it will be repeated, and if + elements of `arr` are to be masked, this sequence must be + non-empty. Array `vals` must be one dimensional. 
+ """ + if not isinstance(arr, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(arr)}" + ) + if not isinstance(mask, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(mask)}" + ) + if not isinstance(vals, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(vals)}" + ) + exec_q = dpctl.utils.get_execution_queue( + ( + arr.sycl_queue, + mask.sycl_queue, + vals.sycl_queue, + ) + ) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError + if arr.shape != mask.shape or vals.ndim != 1: + raise ValueError("Array sizes are not as required") + # FIXME + _place_impl(arr, mask, vals, axis=0) + + +def nonzero(arr): + """nonzero(arr) + + Return the indices of non-zero elements. + + Returns the tuple of usm_narrays, one for each dimension + of `arr`, containing the indices of the non-zero elements + in that dimension. The values of `arr` are always tested in + row-major, C-style order. + + Args: + arr: usm_ndarray + Input array, which has non-zero array rank. + Returns: + tuple_of_usm_ndarrays: tuple + Indices of non-zero array elements. + """ + if not isinstance(arr, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(arr)}" + ) + if arr.ndim == 0: + raise ValueError("Array of positive rank is exepcted") + return _nonzero_impl(arr) diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx index 64a492065f..1abc1e88ac 100644 --- a/dpctl/tensor/_usmarray.pyx +++ b/dpctl/tensor/_usmarray.pyx @@ -670,15 +670,15 @@ cdef class usm_ndarray: if adv_ind_start_p < 0: return res - from ._copy_utils import _mock_extract, _mock_nonzero, _take_multi_index + from ._copy_utils import _extract_impl, _nonzero_impl, _take_multi_index if len(adv_ind) == 1 and adv_ind[0].dtype == dpt_bool: - return _mock_extract(res, adv_ind[0], adv_ind_start_p) + return _extract_impl(res, adv_ind[0], axis=adv_ind_start_p) if any(ind.dtype == dpt_bool for ind in adv_ind): adv_ind_int = list() for ind in adv_ind: if ind.dtype == dpt_bool: - adv_ind_int.extend(_mock_nonzero(ind)) + adv_ind_int.extend(_nonzero_impl(ind)) else: adv_ind_int.append(ind) return _take_multi_index(res, tuple(adv_ind_int), adv_ind_start_p) @@ -1015,8 +1015,8 @@ cdef class usm_ndarray: from ._copy_utils import ( _copy_from_numpy_into, _copy_from_usm_ndarray_to_usm_ndarray, - _mock_nonzero, - _mock_place, + _nonzero_impl, + _place_impl, _put_multi_index, ) @@ -1050,14 +1050,14 @@ cdef class usm_ndarray: return if len(adv_ind) == 1 and adv_ind[0].dtype == dpt_bool: - _mock_place(Xv, adv_ind[0], adv_ind_start_p, rhs) + _place_impl(Xv, adv_ind[0], rhs, axis=adv_ind_start_p) return if any(ind.dtype == dpt_bool for ind in adv_ind): adv_ind_int = list() for ind in adv_ind: if ind.dtype == dpt_bool: - adv_ind_int.extend(_mock_nonzero(ind)) + adv_ind_int.extend(_nonzero_impl(ind)) else: adv_ind_int.append(ind) _put_multi_index(Xv, tuple(adv_ind_int), adv_ind_start_p, rhs) From ed279d63b6f6b43ef5f3b4791696216536639ca1 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Mar 2023 11:43:28 -0600 Subject: [PATCH 4/8] Changes per clang-format 11 --- dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp index 1534b38391..9689612b8a 100644 --- 
a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp @@ -62,7 +62,7 @@ template sink_t inserter(V &lhs, U &&rhs) } template -std::vector concat(std::vector lhs, Vs &&... vs) +std::vector concat(std::vector lhs, Vs &&...vs) { std::size_t s = lhs.size(); { @@ -83,7 +83,7 @@ template std::tuple device_allocate_and_pack(sycl::queue q, std::vector &host_task_events, - Vs &&... vs) + Vs &&...vs) { // memory transfer optimization, use USM-host for temporary speeds up From 3ced89a095650fbc40688294b8e343d44857f495 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Mar 2023 12:42:07 -0600 Subject: [PATCH 5/8] Used Strided1DCyclingIndexer in place implementations This allows to implement behavior of place which cycles over values of val array if that is shorter than the number of non-zero elements in the mask. --- .../kernels/boolean_advanced_indexing.hpp | 53 +++++++++++-------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp index 71313e9a27..aa0a90ce70 100644 --- a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp @@ -1,4 +1,4 @@ -//=== boolean_advance_indexing.hpp - ---*-C++-*--/===// +//=== boolean_advance_indexing.hpp - ------*-C++-*--/===// // // Data Parallel Control (dpctl) // @@ -16,11 +16,11 @@ // See the License for the specific language governing permissions and // limitations under the License. // -//===----------------------------------------------------------------------===// +//===---------------------------------------------------------------------===// /// /// \file /// This file defines kernels for advanced tensor index operations. 
-//===----------------------------------------------------------------------===// +//===---------------------------------------------------------------------===// #pragma once #include @@ -114,6 +114,26 @@ struct Strided1DIndexer py::ssize_t step = 1; }; +struct Strided1DCyclicIndexer +{ + Strided1DCyclicIndexer(py::ssize_t _offset, + py::ssize_t _size, + py::ssize_t _step) + : offset(_offset), size(static_cast(_size)), step(_step) + { + } + + size_t operator()(size_t gid) const + { + return static_cast(offset + (gid % size) * step); + } + +private: + py::ssize_t offset = 0; + size_t size = 1; + py::ssize_t step = 1; +}; + template struct ZeroChecker { @@ -762,27 +782,22 @@ sycl::event masked_place_all_slices_strided_impl( py::ssize_t rhs_stride, const std::vector &depends = {}) { - // using MaskedPlaceStridedFunctor; - // using Strided1DIndexer; - // using StridedIndexer; - // using TwoZeroOffsets_Indexer; - TwoZeroOffsets_Indexer orthog_dst_rhs_indexer{}; /* StridedIndexer(int _nd, py::ssize_t _offset, py::ssize_t const * *_packed_shape_strides) */ StridedIndexer masked_dst_indexer(nd, 0, packed_dst_shape_strides); - Strided1DIndexer masked_rhs_indexer(0, rhs_size, rhs_stride); + Strided1DCyclicIndexer masked_rhs_indexer(0, rhs_size, rhs_stride); sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); cgh.parallel_for>( + TwoZeroOffsets_Indexer, StridedIndexer, Strided1DCyclicIndexer, + dataT, indT>>( sycl::range<1>(static_cast(iteration_size)), MaskedPlaceStridedFunctor( + Strided1DCyclicIndexer, dataT, indT>( dst_p, cumsum_p, rhs_p, 1, iteration_size, orthog_dst_rhs_indexer, masked_dst_indexer, masked_rhs_indexer)); @@ -838,11 +853,6 @@ sycl::event masked_place_some_slices_strided_impl( py::ssize_t masked_rhs_stride, const std::vector &depends = {}) { - // using MaskedPlaceStridedFunctor; - // using Strided1DIndexer; - // using StridedIndexer; - // using TwoOffsets_StridedIndexer; - TwoOffsets_StridedIndexer orthog_dst_rhs_indexer{ orthog_nd, ortho_dst_offset, ortho_rhs_offset, packed_ortho_dst_rhs_shape_strides}; @@ -851,17 +861,18 @@ sycl::event masked_place_some_slices_strided_impl( * *_packed_shape_strides) */ StridedIndexer masked_dst_indexer{masked_nd, 0, packed_masked_dst_shape_strides}; - Strided1DIndexer masked_rhs_indexer{0, masked_rhs_size, masked_rhs_stride}; + Strided1DCyclicIndexer masked_rhs_indexer{0, masked_rhs_size, + masked_rhs_stride}; sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); cgh.parallel_for>( + TwoOffsets_StridedIndexer, StridedIndexer, Strided1DCyclicIndexer, + dataT, indT>>( sycl::range<1>(static_cast(orthog_nelems * masked_nelems)), MaskedPlaceStridedFunctor( + Strided1DCyclicIndexer, dataT, indT>( dst_p, cumsum_p, rhs_p, orthog_nelems, masked_nelems, orthog_dst_rhs_indexer, masked_dst_indexer, masked_rhs_indexer)); From 19691ca89ff4ee3324c3402a19a4be669ee8e138 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Mar 2023 12:44:49 -0600 Subject: [PATCH 6/8] Implemented dpctl.tensor.place as per documented behavior. 
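
A minimal usage sketch of the documented behavior this commit implements
(illustrative only; it mirrors the new test_place_cycling test and assumes
dpctl.tensor is imported as dpt):

    import dpctl.tensor as dpt

    x = dpt.zeros(10, dtype="f4")
    sel = dpt.ones(x.size, dtype="?")             # every element selected
    dpt.place(x, sel, dpt.asarray([2, 3], dtype="f4"))
    # vals are cycled over the selected slots: x == [2, 3, 2, 3, ...]
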
--- dpctl/tensor/_indexing_functions.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/dpctl/tensor/_indexing_functions.py b/dpctl/tensor/_indexing_functions.py index 01f1a2370a..20c4a22786 100644 --- a/dpctl/tensor/_indexing_functions.py +++ b/dpctl/tensor/_indexing_functions.py @@ -21,9 +21,9 @@ import dpctl import dpctl.tensor as dpt -from dpctl.tensor._tensor_impl import _put, _take +import dpctl.tensor._tensor_impl as ti -from ._copy_utils import _extract_impl, _nonzero_impl, _place_impl +from ._copy_utils import _extract_impl, _nonzero_impl def take(x, indices, /, *, axis=None, mode="clip"): @@ -95,7 +95,7 @@ def take(x, indices, /, *, axis=None, mode="clip"): res_shape, dtype=x.dtype, usm_type=res_usm_type, sycl_queue=exec_q ) - hev, _ = _take(x, indices, res, axis, mode, sycl_queue=exec_q) + hev, _ = ti._take(x, indices, res, axis, mode, sycl_queue=exec_q) hev.wait() return res @@ -175,7 +175,7 @@ def put(x, indices, vals, /, *, axis=None, mode="clip"): vals = dpt.broadcast_to(vals, val_shape) - hev, _ = _put(x, indices, vals, axis, mode, sycl_queue=exec_q) + hev, _ = ti._put(x, indices, vals, axis, mode, sycl_queue=exec_q) hev.wait() @@ -265,8 +265,23 @@ def place(arr, mask, vals): raise dpctl.utils.ExecutionPlacementError if arr.shape != mask.shape or vals.ndim != 1: raise ValueError("Array sizes are not as required") - # FIXME - _place_impl(arr, mask, vals, axis=0) + cumsum = dpt.empty(mask.size, dtype="i8", sycl_queue=exec_q) + nz_count = ti.mask_positions(mask, cumsum, sycl_queue=exec_q) + if nz_count == 0: + return + if vals.dtype == arr.dtype: + rhs = vals + else: + rhs = dpt.astype(vals, arr.dtype) + hev, _ = ti._place( + dst=arr, + cumsum=cumsum, + axis_start=0, + axis_end=mask.ndim, + rhs=rhs, + sycl_queue=exec_q, + ) + hev.wait() def nonzero(arr): From f75723b143c67f109bcf27bdb8837cd30a7d81cb Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Mar 2023 14:47:59 -0600 Subject: [PATCH 7/8] Added tests to test_usm_ndarray_indexing --- dpctl/tests/test_usm_ndarray_indexing.py | 220 +++++++++++++++++++++++ 1 file changed, 220 insertions(+) diff --git a/dpctl/tests/test_usm_ndarray_indexing.py b/dpctl/tests/test_usm_ndarray_indexing.py index 98bb674b21..aec71def7d 100644 --- a/dpctl/tests/test_usm_ndarray_indexing.py +++ b/dpctl/tests/test_usm_ndarray_indexing.py @@ -970,3 +970,223 @@ def test_advanced_indexing_compute_follows_data(): dpt.put(x, ind0, val1, axis=0) with pytest.raises(ExecutionPlacementError): x[ind0] = val1 + + +####### + + +def test_extract_all_1d(): + x = dpt.arange(30, dtype="i4") + sel = dpt.ones(30, dtype="?") + sel[::2] = False + + res = x[sel] + expected_res = dpt.asnumpy(x)[dpt.asnumpy(sel)] + assert (dpt.asnumpy(res) == expected_res).all() + + res2 = dpt.extract(sel, x) + assert (dpt.asnumpy(res2) == expected_res).all() + + +def test_extract_all_2d(): + x = dpt.reshape(dpt.arange(30, dtype="i4"), (5, 6)) + sel = dpt.ones(30, dtype="?") + sel[::2] = False + sel = dpt.reshape(sel, x.shape) + + res = x[sel] + expected_res = dpt.asnumpy(x)[dpt.asnumpy(sel)] + assert (dpt.asnumpy(res) == expected_res).all() + + res2 = dpt.extract(sel, x) + assert (dpt.asnumpy(res2) == expected_res).all() + + +def test_extract_2D_axis0(): + x = dpt.reshape(dpt.arange(30, dtype="i4"), (5, 6)) + sel = dpt.ones(x.shape[0], dtype="?") + sel[::2] = False + + res = x[sel] + expected_res = dpt.asnumpy(x)[dpt.asnumpy(sel)] + assert (dpt.asnumpy(res) == expected_res).all() + + +def test_extract_2D_axis1(): + x = 
dpt.reshape(dpt.arange(30, dtype="i4"), (5, 6)) + sel = dpt.ones(x.shape[1], dtype="?") + sel[::2] = False + + res = x[:, sel] + expected = dpt.asnumpy(x)[:, dpt.asnumpy(sel)] + assert (dpt.asnumpy(res) == expected).all() + + +def test_extract_begin(): + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((3, 3), dtype="?") + sel[0, 0] = True + sel[1, 1] = True + z = y[sel] + expected = dpt.asnumpy(y)[[0, 1], [0, 1]] + assert (dpt.asnumpy(z) == expected).all() + + +def test_extract_end(): + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((4, 4), dtype="?") + sel[0, 0] = True + z = y[..., sel] + expected = dpt.asnumpy(y)[..., [0], [0]] + assert (dpt.asnumpy(z) == expected).all() + + +def test_extract_middle(): + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((3, 4), dtype="?") + sel[0, 0] = True + z = y[:, sel] + expected = dpt.asnumpy(y)[:, [0], [0], :] + assert (dpt.asnumpy(z) == expected).all() + + +def test_extract_empty_result(): + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((3, 4), dtype="?") + z = y[:, sel] + assert z.shape == ( + y.shape[0], + 0, + y.shape[3], + ) + + +def test_place_all_1d(): + x = dpt.arange(10, dtype="i2") + sel = dpt.zeros(10, dtype="?") + sel[0::2] = True + val = dpt.zeros(5, dtype=x.dtype) + x[sel] = val + assert (dpt.asnumpy(x) == np.array([0, 1, 0, 3, 0, 5, 0, 7, 0, 9])).all() + dpt.place(x, sel, dpt.asarray(2)) + assert (dpt.asnumpy(x) == np.array([2, 1, 2, 3, 2, 5, 2, 7, 2, 9])).all() + + +def test_place_2d_axis0(): + x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) + sel = dpt.asarray([True, False, True]) + val = dpt.zeros((2, 4), dtype=x.dtype) + x[sel] = val + expected_x = np.stack( + ( + np.zeros(4, dtype="i2"), + np.arange(4, 8, dtype="i2"), + np.zeros(4, dtype="i2"), + ) + ) + assert (dpt.asnumpy(x) == expected_x).all() + + +def test_place_2d_axis1(): + x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) + sel = dpt.asarray([True, False, True, False]) + val = dpt.zeros((3, 2), dtype=x.dtype) + x[:, sel] = val + expected_x = np.array( + [[0, 1, 0, 3], [0, 5, 0, 7], [0, 9, 0, 11]], dtype="i2" + ) + assert (dpt.asnumpy(x) == expected_x).all() + + +def test_place_2d_axis1_scalar(): + x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) + sel = dpt.asarray([True, False, True, False]) + val = dpt.zeros(tuple(), dtype=x.dtype) + x[:, sel] = val + expected_x = np.array( + [[0, 1, 0, 3], [0, 5, 0, 7], [0, 9, 0, 11]], dtype="i2" + ) + assert (dpt.asnumpy(x) == expected_x).all() + + +def test_place_all_slices(): + x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) + sel = dpt.asarray( + [ + [False, True, True, False], + [True, True, False, False], + [False, False, True, True], + ], + dtype="?", + ) + y = dpt.ones_like(x) + y[sel] = x[sel] + + +def test_place_some_slices_begin(): + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((3, 3), dtype="?") + sel[0, 0] = True + sel[1, 1] = True + z = y[sel] + w = dpt.zeros_like(y) + w[sel] = z + + +def test_place_some_slices_mid(): + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((3, 4), dtype="?") + sel[0, 0] = True + sel[1, 1] = True + z = y[:, sel] 
+ w = dpt.zeros_like(y) + w[:, sel] = z + + +def test_place_some_slices_end(): + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((4, 4), dtype="?") + sel[0, 0] = True + sel[1, 1] = True + z = y[:, :, sel] + w = dpt.zeros_like(y) + w[:, :, sel] = z + + +def test_place_cycling(): + x = dpt.zeros(10, dtype="f4") + y = dpt.asarray([2, 3]) + sel = dpt.ones(x.size, dtype="?") + dpt.place(x, sel, y) + expected = np.array( + [ + 2, + 3, + ] + * 5, + dtype=x.dtype, + ) + assert (dpt.asnumpy(x) == expected).all() + + +def test_place_subset(): + x = dpt.zeros(10, dtype="f4") + y = dpt.ones_like(x) + sel = dpt.ones(x.size, dtype="?") + sel[::2] = False + dpt.place(x, sel, y) + expected = np.array([1, 3, 5, 7, 9], dtype=x.dtype) + assert (dpt.asnumpy(x) == expected).all() + + +def test_nonzero(): + x = dpt.concat((dpt.zeros(3), dpt.ones(4), dpt.zeros(3))) + (i,) = dpt.nonzero(x) + assert dpt.asnumpy(i) == np.array([3, 4, 5, 6]).all() From cab00351ba9ba9018528dd9cf3563c0aeb31f861 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Mar 2023 15:43:17 -0600 Subject: [PATCH 8/8] Fixed tests for boolean indexing --- dpctl/tests/test_usm_ndarray_indexing.py | 28 +++++++++++++++++++----- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/dpctl/tests/test_usm_ndarray_indexing.py b/dpctl/tests/test_usm_ndarray_indexing.py index aec71def7d..bcc1fdbb60 100644 --- a/dpctl/tests/test_usm_ndarray_indexing.py +++ b/dpctl/tests/test_usm_ndarray_indexing.py @@ -972,10 +972,8 @@ def test_advanced_indexing_compute_follows_data(): x[ind0] = val1 -####### - - def test_extract_all_1d(): + get_queue_or_skip() x = dpt.arange(30, dtype="i4") sel = dpt.ones(30, dtype="?") sel[::2] = False @@ -989,6 +987,7 @@ def test_extract_all_1d(): def test_extract_all_2d(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(30, dtype="i4"), (5, 6)) sel = dpt.ones(30, dtype="?") sel[::2] = False @@ -1003,6 +1002,7 @@ def test_extract_all_2d(): def test_extract_2D_axis0(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(30, dtype="i4"), (5, 6)) sel = dpt.ones(x.shape[0], dtype="?") sel[::2] = False @@ -1013,6 +1013,7 @@ def test_extract_2D_axis0(): def test_extract_2D_axis1(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(30, dtype="i4"), (5, 6)) sel = dpt.ones(x.shape[1], dtype="?") sel[::2] = False @@ -1023,6 +1024,7 @@ def test_extract_2D_axis1(): def test_extract_begin(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) y = dpt.permute_dims(x, (2, 0, 3, 1)) sel = dpt.zeros((3, 3), dtype="?") @@ -1034,6 +1036,7 @@ def test_extract_begin(): def test_extract_end(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) y = dpt.permute_dims(x, (2, 0, 3, 1)) sel = dpt.zeros((4, 4), dtype="?") @@ -1044,6 +1047,7 @@ def test_extract_end(): def test_extract_middle(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) y = dpt.permute_dims(x, (2, 0, 3, 1)) sel = dpt.zeros((3, 4), dtype="?") @@ -1054,6 +1058,7 @@ def test_extract_middle(): def test_extract_empty_result(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) y = dpt.permute_dims(x, (2, 0, 3, 1)) sel = dpt.zeros((3, 4), dtype="?") @@ -1066,17 +1071,19 @@ def test_extract_empty_result(): def test_place_all_1d(): + get_queue_or_skip() x = dpt.arange(10, dtype="i2") sel = dpt.zeros(10, dtype="?") sel[0::2] = True val = dpt.zeros(5, 
dtype=x.dtype) x[sel] = val assert (dpt.asnumpy(x) == np.array([0, 1, 0, 3, 0, 5, 0, 7, 0, 9])).all() - dpt.place(x, sel, dpt.asarray(2)) + dpt.place(x, sel, dpt.asarray([2])) assert (dpt.asnumpy(x) == np.array([2, 1, 2, 3, 2, 5, 2, 7, 2, 9])).all() def test_place_2d_axis0(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) sel = dpt.asarray([True, False, True]) val = dpt.zeros((2, 4), dtype=x.dtype) @@ -1092,6 +1099,7 @@ def test_place_2d_axis0(): def test_place_2d_axis1(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) sel = dpt.asarray([True, False, True, False]) val = dpt.zeros((3, 2), dtype=x.dtype) @@ -1103,6 +1111,7 @@ def test_place_2d_axis1(): def test_place_2d_axis1_scalar(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) sel = dpt.asarray([True, False, True, False]) val = dpt.zeros(tuple(), dtype=x.dtype) @@ -1114,6 +1123,7 @@ def test_place_2d_axis1_scalar(): def test_place_all_slices(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) sel = dpt.asarray( [ @@ -1128,6 +1138,7 @@ def test_place_all_slices(): def test_place_some_slices_begin(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) y = dpt.permute_dims(x, (2, 0, 3, 1)) sel = dpt.zeros((3, 3), dtype="?") @@ -1139,6 +1150,7 @@ def test_place_some_slices_begin(): def test_place_some_slices_mid(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) y = dpt.permute_dims(x, (2, 0, 3, 1)) sel = dpt.zeros((3, 4), dtype="?") @@ -1150,6 +1162,7 @@ def test_place_some_slices_mid(): def test_place_some_slices_end(): + get_queue_or_skip() x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) y = dpt.permute_dims(x, (2, 0, 3, 1)) sel = dpt.zeros((4, 4), dtype="?") @@ -1161,6 +1174,7 @@ def test_place_some_slices_end(): def test_place_cycling(): + get_queue_or_skip() x = dpt.zeros(10, dtype="f4") y = dpt.asarray([2, 3]) sel = dpt.ones(x.size, dtype="?") @@ -1177,16 +1191,18 @@ def test_place_cycling(): def test_place_subset(): + get_queue_or_skip() x = dpt.zeros(10, dtype="f4") y = dpt.ones_like(x) sel = dpt.ones(x.size, dtype="?") sel[::2] = False dpt.place(x, sel, y) - expected = np.array([1, 3, 5, 7, 9], dtype=x.dtype) + expected = np.array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1], dtype=x.dtype) assert (dpt.asnumpy(x) == expected).all() def test_nonzero(): + get_queue_or_skip() x = dpt.concat((dpt.zeros(3), dpt.ones(4), dpt.zeros(3))) (i,) = dpt.nonzero(x) - assert dpt.asnumpy(i) == np.array([3, 4, 5, 6]).all() + assert (dpt.asnumpy(i) == np.array([3, 4, 5, 6])).all()
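
For reference, a short end-to-end sketch of the Python API added by this
series (illustrative only; it follows the new tests above, assumes a working
SYCL device, and imports dpctl.tensor as dpt):

    import dpctl.tensor as dpt

    x = dpt.reshape(dpt.arange(30, dtype="i4"), (5, 6))
    sel = dpt.ones(30, dtype="?")
    sel[::2] = False                     # boolean mask
    sel = dpt.reshape(sel, x.shape)

    vals = dpt.extract(sel, x)           # 1D array of selected elements, same as x[sel]
    idx = dpt.nonzero(sel)               # tuple of index arrays, one per dimension
    dpt.place(x, sel, dpt.zeros(1, dtype="i4"))   # overwrite selected elements (vals cycled)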