From b280e107e274ebe75ae94260a9d2651255c15cc4 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Tue, 20 Aug 2024 11:44:58 -0500
Subject: [PATCH] Clean up uses of Strided1DIndexer

Provide overloaded constructors to avoid uses of static_cast at
constructor call sites. Provide shortcut constructors (for zero offset).
Constructor call sites use comments to specify the meaning of
constructor parameters.
---
 .../include/kernels/accumulators.hpp          |  14 +-
 .../kernels/boolean_advanced_indexing.hpp     |   7 +-
 .../kernels/linalg_functions/dot_product.hpp  |  44 ++---
 .../include/kernels/linalg_functions/gemm.hpp | 167 +++++++++---------
 .../libtensor/include/kernels/reductions.hpp  | 131 ++++++--------
 .../libtensor/include/kernels/repeat.hpp      |  28 +--
 .../include/kernels/sorting/searchsorted.hpp  |   4 +-
 .../libtensor/include/utils/offset_utils.hpp  |  25 +++
 8 files changed, 213 insertions(+), 207 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp
index a45d4d7aca..c236688842 100644
--- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp
@@ -592,9 +592,8 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q,
         size_t src_size = acc_groups - 1;
         using LocalScanIndexerT =
             dpctl::tensor::offset_utils::Strided1DIndexer;
-        const LocalScanIndexerT scan_iter_indexer{
-            0, static_cast<ssize_t>(iter_nelems),
-            static_cast<ssize_t>(src_size)};
+        const LocalScanIndexerT scan_iter_indexer{/* size */ iter_nelems,
+                                                  /* step */ src_size};
 
         using IterIndexerT =
             dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
@@ -623,11 +622,10 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q,
             using LocalScanIndexerT =
                 dpctl::tensor::offset_utils::Strided1DIndexer;
             const LocalScanIndexerT scan1_iter_indexer{
-                0, static_cast<ssize_t>(iter_nelems),
-                static_cast<ssize_t>(size_to_update)};
-            const LocalScanIndexerT scan2_iter_indexer{
-                0, static_cast<ssize_t>(iter_nelems),
-                static_cast<ssize_t>(src_size)};
+                /* size */ iter_nelems,
+                /* step */ size_to_update};
+            const LocalScanIndexerT scan2_iter_indexer{/* size */ iter_nelems,
+                                                       /* step */ src_size};
 
             using IterIndexerT =
                 dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp
index de55854768..4e99b26f53 100644
--- a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp
@@ -233,7 +233,8 @@ sycl::event masked_extract_all_slices_strided_impl(
     /* StridedIndexer(int _nd, ssize_t _offset, ssize_t const
      * *_packed_shape_strides) */
     const StridedIndexer masked_src_indexer(nd, 0, packed_src_shape_strides);
-    const Strided1DIndexer masked_dst_indexer(0, dst_size, dst_stride);
+    const Strided1DIndexer masked_dst_indexer(/* size */ dst_size,
+                                              /* step */ dst_stride);
 
     sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
         cgh.depends_on(depends);
@@ -309,8 +310,8 @@ sycl::event masked_extract_some_slices_strided_impl(
 
     const StridedIndexer masked_src_indexer{masked_nd, 0,
                                             packed_masked_src_shape_strides};
-    const Strided1DIndexer masked_dst_indexer{0, masked_dst_size,
-                                              masked_dst_stride};
+    const Strided1DIndexer masked_dst_indexer{/* size */ masked_dst_size,
+                                              /* step */ masked_dst_stride};
 
     sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
         cgh.depends_on(depends);
diff --git
a/dpctl/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp b/dpctl/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp index 5f621b48c4..8d7e4a3f09 100644 --- a/dpctl/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp +++ b/dpctl/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp @@ -576,9 +576,8 @@ dot_product_contig_impl(sycl::queue &exec_q, dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< NoOpIndexerT, NoOpIndexerT>; - const InputBatchIndexerT inp_batch_indexer{ - 0, static_cast(batches), - static_cast(reduction_nelems)}; + const InputBatchIndexerT inp_batch_indexer{/* size */ batches, + /* step */ reduction_nelems}; const InputOutputBatchIndexerT inp_out_batch_indexer{ inp_batch_indexer, inp_batch_indexer, NoOpIndexerT{}}; constexpr ReductionIndexerT reduction_indexer{NoOpIndexerT{}, @@ -612,9 +611,8 @@ dot_product_contig_impl(sycl::queue &exec_q, dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< NoOpIndexerT, NoOpIndexerT>; - const InputBatchIndexerT inp_batch_indexer{ - 0, static_cast(batches), - static_cast(reduction_nelems)}; + const InputBatchIndexerT inp_batch_indexer{/* size */ batches, + /* step */ reduction_nelems}; const InputOutputBatchIndexerT inp_out_batch_indexer{ inp_batch_indexer, inp_batch_indexer, NoOpIndexerT{}}; constexpr ReductionIndexerT reduction_indexer{NoOpIndexerT{}, @@ -1089,9 +1087,8 @@ sycl::event dot_product_tree_impl(sycl::queue &exec_q, InputIndexerT, ResIndexerT>; using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - const InputIndexerT inp_indexer{ - 0, static_cast(batches), - static_cast(reduction_groups_)}; + const InputIndexerT inp_indexer{/* size */ batches, + /* step */ reduction_groups_}; constexpr ResIndexerT res_iter_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, @@ -1120,9 +1117,8 @@ sycl::event dot_product_tree_impl(sycl::queue &exec_q, InputIndexerT, ResIndexerT>; using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - const InputIndexerT inp_indexer{ - 0, static_cast(batches), - static_cast(remaining_reduction_nelems)}; + const InputIndexerT inp_indexer{/* size */ batches, + /* step */ remaining_reduction_nelems}; const ResIndexerT res_iter_indexer{ batch_nd, batch_res_offset, /* shape */ batch_shape_and_strides, @@ -1200,9 +1196,8 @@ dot_product_contig_tree_impl(sycl::queue &exec_q, dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< NoOpIndexerT, NoOpIndexerT>; - const InputBatchIndexerT inp_batch_indexer{ - 0, static_cast(batches), - static_cast(reduction_nelems)}; + const InputBatchIndexerT inp_batch_indexer{/* size */ batches, + /* step */ reduction_nelems}; const InputOutputBatchIndexerT inp_out_batch_indexer{ inp_batch_indexer, inp_batch_indexer, NoOpIndexerT{}}; constexpr ReductionIndexerT reduction_indexer{NoOpIndexerT{}, @@ -1238,9 +1233,8 @@ dot_product_contig_tree_impl(sycl::queue &exec_q, dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< NoOpIndexerT, NoOpIndexerT>; - const InputBatchIndexerT inp_batch_indexer{ - 0, static_cast(batches), - static_cast(reduction_nelems)}; + const InputBatchIndexerT inp_batch_indexer{/* size */ batches, + /* step */ reduction_nelems}; const InputOutputBatchIndexerT inp_out_batch_indexer{ inp_batch_indexer, inp_batch_indexer, NoOpIndexerT{}}; constexpr ReductionIndexerT reduction_indexer{NoOpIndexerT{}, @@ -1307,8 +1301,8 @@ dot_product_contig_tree_impl(sycl::queue &exec_q, NoOpIndexerT, NoOpIndexerT>; const InputBatchIndexerT inp_batch_indexer{ - 0, 
static_cast(batches), - static_cast(reduction_nelems)}; + /* size */ batches, + /* step */ reduction_nelems}; const InputOutputBatchIndexerT inp_out_batch_indexer{ inp_batch_indexer, inp_batch_indexer, NoOpIndexerT{}}; constexpr ReductionIndexerT reduction_indexer{NoOpIndexerT{}, @@ -1343,9 +1337,8 @@ dot_product_contig_tree_impl(sycl::queue &exec_q, InputIndexerT, ResIndexerT>; using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - const InputIndexerT inp_indexer{ - 0, static_cast(batches), - static_cast(reduction_groups_)}; + const InputIndexerT inp_indexer{/* size */ batches, + /* step */ reduction_groups_}; constexpr ResIndexerT res_iter_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, @@ -1374,9 +1367,8 @@ dot_product_contig_tree_impl(sycl::queue &exec_q, InputIndexerT, ResIndexerT>; using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - const InputIndexerT inp_indexer{ - 0, static_cast(batches), - static_cast(remaining_reduction_nelems)}; + const InputIndexerT inp_indexer{/* size */ batches, + /* step */ remaining_reduction_nelems}; constexpr ResIndexerT res_iter_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, diff --git a/dpctl/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp b/dpctl/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp index d818002754..d71cb3272a 100644 --- a/dpctl/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp +++ b/dpctl/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp @@ -132,9 +132,8 @@ sycl::event single_reduction_for_gemm(sycl::queue &exec_q, const ResIndexerT res_iter_indexer{res_nd, 0, res_shapes_strides}; const InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, res_iter_indexer}; - const ReductionIndexerT reduction_indexer{ - 0, static_cast(reduction_nelems), - static_cast(iter_nelems)}; + const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, + /* step */ iter_nelems}; red_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); @@ -162,9 +161,8 @@ sycl::event single_reduction_for_gemm(sycl::queue &exec_q, const ResIndexerT res_iter_indexer{res_nd, 0, res_shapes_strides}; const InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, res_iter_indexer}; - const ReductionIndexerT reduction_indexer{ - 0, static_cast(reduction_nelems), - static_cast(iter_nelems)}; + const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, + /* step */ iter_nelems}; if (iter_nelems == 1) { // increase GPU occupancy @@ -213,9 +211,10 @@ single_reduction_for_gemm_contig(sycl::queue &exec_q, constexpr InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, NoOpIndexerT{}}; - const ReductionIndexerT reduction_indexer{ - 0, static_cast(reduction_nelems), - static_cast(iter_nelems)}; + // tmp allocation is a C-contiguous matrix (reduction_nelems, + // iter_nelems) and we are reducing by axis 0 + const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, + /* step */ iter_nelems}; red_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); @@ -241,9 +240,10 @@ single_reduction_for_gemm_contig(sycl::queue &exec_q, constexpr InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, NoOpIndexerT{}}; - const ReductionIndexerT reduction_indexer{ - 0, static_cast(reduction_nelems), - static_cast(iter_nelems)}; + // tmp allocation is a C-contiguous matrix + // (reduction_nelems, iter_nelems). 
Reducing along axis 0 + const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, + /* step */ iter_nelems}; if (iter_nelems == 1) { // increase GPU occupancy @@ -295,9 +295,10 @@ sycl::event tree_reduction_for_gemm(sycl::queue &exec_q, constexpr InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, NoOpIndexerT{}}; - const ReductionIndexerT reduction_indexer{ - 0, /* size */ static_cast(reduction_nelems), - /* step */ static_cast(iter_nelems)}; + // partially_reduced_tmp is C-contig matrix with shape + // (reduction_nelems, iter_nelems). Reducing along axis 0. + const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, + /* step */ iter_nelems}; first_reduction_ev = dpctl::tensor::kernels::submit_no_atomic_reduction< T, T, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT, @@ -327,9 +328,8 @@ sycl::event tree_reduction_for_gemm(sycl::queue &exec_q, InputIndexerT, ResIndexerT>; using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - const InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(reduction_groups_)}; + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ reduction_groups_}; constexpr ResIndexerT res_iter_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, @@ -359,11 +359,12 @@ sycl::event tree_reduction_for_gemm(sycl::queue &exec_q, ResIndexerT>; using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - const InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(remaining_reduction_nelems)}; - const ResIndexerT res_iter_indexer{res_nd, static_cast(res_offset), - res_shape_strides}; + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ remaining_reduction_nelems}; + const ResIndexerT res_iter_indexer{ + /* ndim */ res_nd, + /* offset */ static_cast(res_offset), + /* packed shape_strides*/ res_shape_strides}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, res_iter_indexer}; @@ -416,9 +417,8 @@ tree_reduction_for_gemm_contig(sycl::queue &exec_q, constexpr InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, NoOpIndexerT{}}; - const ReductionIndexerT reduction_indexer{ - 0, /* size */ static_cast(reduction_nelems), - /* step */ static_cast(iter_nelems)}; + const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, + /* step */ iter_nelems}; const sycl::event &first_reduction_ev = dpctl::tensor::kernels::submit_no_atomic_reduction< @@ -451,9 +451,8 @@ tree_reduction_for_gemm_contig(sycl::queue &exec_q, // n * m = iter_nelems because essentially, this process // creates a stack of reduction_nelems 2D matrices and we reduce // along the stack axis - const InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(reduction_groups_)}; + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ reduction_groups_}; constexpr ResIndexerT res_iter_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, @@ -485,8 +484,8 @@ tree_reduction_for_gemm_contig(sycl::queue &exec_q, using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; const InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(remaining_reduction_nelems)}; + /* size */ iter_nelems, + /* step */ remaining_reduction_nelems}; constexpr ResIndexerT res_iter_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, @@ -1705,12 +1704,12 @@ sycl::event gemm_batch_contig_impl(sycl::queue &exec_q, Strided1DIndexer>; const BatchDimsIndexerT batch_indexer( - 
Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(n * k)}, - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(k * m)}, - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(n * m)}); + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * k}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ k * m}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * m}); const size_t min_nm = std::min(n, m); const size_t max_nm = std::max(n, m); @@ -2342,7 +2341,8 @@ gemm_batch_tree_k_impl(sycl::queue &exec_q, batch_nd, rhs_batch_offset, batch_shape_strides, batch_shape_strides + 2 * batch_nd); const Strided1DIndexer tmp_batch_indexer( - 0, static_cast(batch_nelems), n * m); + /* size */ batch_nelems, + /* step */ n * m); const BatchDimsIndexerT batch_indexer( lhs_batch_indexer, rhs_batch_indexer, tmp_batch_indexer); @@ -2406,7 +2406,8 @@ gemm_batch_tree_k_impl(sycl::queue &exec_q, const StridedIndexer rhs_batch_indexer( batch_nd, rhs_batch_offset, batch_shape_strides + 2 * batch_nd); const Strided1DIndexer tmp_batch_indexer( - 0, static_cast(batch_nelems), n * m); + /* size */ batch_nelems, + /* step */ n * m); const BatchDimsIndexerT batch_indexer( lhs_batch_indexer, rhs_batch_indexer, tmp_batch_indexer); @@ -2641,7 +2642,8 @@ gemm_batch_tree_nm_impl(sycl::queue &exec_q, batch_nd, rhs_batch_offset, batch_shape_strides, batch_shape_strides + 2 * batch_nd); const Strided1DIndexer tmp_batch_indexer( - 0, static_cast(batch_nelems), n * m); + /* size */ batch_nelems, + /* step */ n * m); const BatchDimsIndexerT batch_indexer( lhs_batch_indexer, rhs_batch_indexer, tmp_batch_indexer); @@ -2709,7 +2711,8 @@ gemm_batch_tree_nm_impl(sycl::queue &exec_q, batch_nd, rhs_batch_offset, batch_shape_strides, batch_shape_strides + 2 * batch_nd); const Strided1DIndexer tmp_batch_indexer( - 0, static_cast(batch_nelems), n * m); + /* size */ batch_nelems, + /* step */ n * m); const BatchDimsIndexerT batch_indexer( lhs_batch_indexer, rhs_batch_indexer, tmp_batch_indexer); @@ -2957,12 +2960,12 @@ gemm_batch_contig_tree_k_impl(sycl::queue &exec_q, using dpctl::tensor::offset_utils::Strided1DIndexer; const BatchDimsIndexerT batch_indexer( - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(n * k)}, - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(k * m)}, - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(n * m)}); + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * k}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ k * m}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * m}); return gemm_detail::_gemm_tree_k_step< lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, @@ -3018,12 +3021,12 @@ gemm_batch_contig_tree_k_impl(sycl::queue &exec_q, Strided1DIndexer>; const BatchDimsIndexerT batch_indexer( - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(n * k)}, - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(k * m)}, - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(n * m)}); + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * k}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ k * m}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * m}); sycl::event gemm_ev = gemm_detail::_gemm_tree_k_step< lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, @@ -3077,12 +3080,12 @@ gemm_batch_contig_tree_k_impl(sycl::queue &exec_q, Strided1DIndexer>; const BatchDimsIndexerT batch_indexer( - Strided1DIndexer{0, static_cast(batch_nelems), - 
static_cast(n * k)}, - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(k * m)}, - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(n * m)}); + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * k}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ k * m}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * m}); sycl::event gemm_ev = gemm_detail::_gemm_tree_k_step< lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, @@ -3159,12 +3162,12 @@ gemm_batch_contig_tree_nm_impl(sycl::queue &exec_q, Strided1DIndexer>; const BatchDimsIndexerT batch_indexer( - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(n * k)}, - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(k * m)}, - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(n * m)}); + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * k}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ k * m}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * m}); return gemm_detail::_gemm_tree_nm_step< lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, @@ -3220,12 +3223,12 @@ gemm_batch_contig_tree_nm_impl(sycl::queue &exec_q, Strided1DIndexer>; const BatchDimsIndexerT batch_indexer( - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(n * k)}, - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(k * m)}, - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(n * m)}); + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * k}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ k * m}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * m}); sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step< lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, @@ -3280,12 +3283,12 @@ gemm_batch_contig_tree_nm_impl(sycl::queue &exec_q, Strided1DIndexer>; const BatchDimsIndexerT batch_indexer( - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(n * k)}, - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(k * m)}, - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(n * m)}); + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * k}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ k * m}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * m}); sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step< lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, @@ -3398,11 +3401,13 @@ gemm_batch_nm_contig_impl(sycl::queue &exec_q, using dpctl::tensor::offset_utils::Strided1DIndexer; - const ssize_t ss_batch_nelems = static_cast(batch_nelems); const BatchDimsIndexerT batch_indexer( - Strided1DIndexer{0, ss_batch_nelems, static_cast(n * k)}, - Strided1DIndexer{0, ss_batch_nelems, static_cast(k * m)}, - Strided1DIndexer{0, ss_batch_nelems, static_cast(n * m)}); + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * k}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ k * m}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * m}); sycl::event gemm_ev = gemm_detail::_gemm_batch_nm_impl< lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index 1ac896059f..10dbd2fa40 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -886,8 +886,8 @@ sycl::event reduction_axis1_over_group_with_atomics_contig_impl( using 
ReductionIndexerT = NoOpIndexerT; const InputOutputIterIndexerT in_out_iter_indexer{ - InputIterIndexerT{0, static_cast(iter_nelems), - static_cast(reduction_nelems)}, + InputIterIndexerT{/* size */ iter_nelems, + /* step */ reduction_nelems}, NoOpIndexerT{}}; constexpr ReductionIndexerT reduction_indexer{}; @@ -912,8 +912,8 @@ sycl::event reduction_axis1_over_group_with_atomics_contig_impl( RowsIndexerT, NoOpIndexerT>; using ReductionIndexerT = NoOpIndexerT; - const RowsIndexerT rows_indexer{0, static_cast(iter_nelems), - static_cast(reduction_nelems)}; + const RowsIndexerT rows_indexer{/* size */ iter_nelems, + /* step */ reduction_nelems}; constexpr NoOpIndexerT result_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{rows_indexer, result_indexer}; @@ -975,9 +975,8 @@ sycl::event reduction_axis0_over_group_with_atomics_contig_impl( const InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, NoOpIndexerT{}}; - const ReductionIndexerT reduction_indexer{ - 0, static_cast(reduction_nelems), - static_cast(iter_nelems)}; + const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, + /* step */ iter_nelems}; sycl::event comp_ev = sequential_reduction(reduction_nelems), - /* step */ static_cast(iter_nelems)}; + const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, + /* step */ iter_nelems}; constexpr size_t preferred_reductions_per_wi = 8; size_t reductions_per_wi = @@ -1311,9 +1309,8 @@ sycl::event reduction_over_group_temps_strided_impl( using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - const InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(reduction_groups_)}; + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ reduction_groups_}; constexpr ResIndexerT res_iter_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{ @@ -1342,9 +1339,8 @@ sycl::event reduction_over_group_temps_strided_impl( InputIndexerT, ResIndexerT>; using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - const InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(remaining_reduction_nelems)}; + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ remaining_reduction_nelems}; const ResIndexerT res_iter_indexer{ iter_nd, iter_res_offset, /* shape */ iter_shape_and_strides, @@ -1428,8 +1424,8 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( using ReductionIndexerT = NoOpIndexerT; const InputOutputIterIndexerT in_out_iter_indexer{ - InputIterIndexerT{0, static_cast(iter_nelems), - static_cast(reduction_nelems)}, + InputIterIndexerT{/* size */ iter_nelems, + /* step */ reduction_nelems}, NoOpIndexerT{}}; constexpr ReductionIndexerT reduction_indexer{}; @@ -1461,8 +1457,8 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( using ReductionIndexerT = NoOpIndexerT; const InputOutputIterIndexerT in_out_iter_indexer{ - InputIterIndexerT{0, static_cast(iter_nelems), - static_cast(reduction_nelems)}, + InputIterIndexerT{/* size */ iter_nelems, + /* step */ reduction_nelems}, NoOpIndexerT{}}; constexpr ReductionIndexerT reduction_indexer{}; @@ -1520,9 +1516,8 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( RowsIndexerT, NoOpIndexerT>; using ReductionIndexerT = NoOpIndexerT; - const RowsIndexerT rows_indexer{ - 0, static_cast(iter_nelems), - static_cast(reduction_nelems)}; + const RowsIndexerT rows_indexer{/* size */ iter_nelems, + /* step */ reduction_nelems}; constexpr NoOpIndexerT noop_tmp_indexer{}; const InputOutputIterIndexerT 
in_out_iter_indexer{rows_indexer, noop_tmp_indexer}; @@ -1559,9 +1554,8 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( InputIndexerT, ResIndexerT>; using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - const InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(reduction_groups_)}; + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ reduction_groups_}; constexpr ResIndexerT res_iter_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, @@ -1589,9 +1583,8 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( InputIndexerT, ResIndexerT>; using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - const InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(remaining_reduction_nelems)}; + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ remaining_reduction_nelems}; constexpr ResIndexerT res_iter_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, @@ -1672,9 +1665,8 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( const InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, NoOpIndexerT{}}; - const ReductionIndexerT reduction_indexer{ - 0, static_cast(reduction_nelems), - static_cast(iter_nelems)}; + const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, + /* step */ iter_nelems}; sycl::event comp_ev = sequential_reduction(reduction_nelems), - /* step */ static_cast(iter_nelems)}; + const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, + /* step */ iter_nelems}; if (iter_nelems == 1) { // increase GPU occupancy @@ -1770,8 +1761,8 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, noop_tmp_indexer}; const ReductionIndexerT reduction_indexer{ - 0, /* size */ static_cast(reduction_nelems), - /* step */ static_cast(iter_nelems)}; + /* size */ reduction_nelems, + /* step */ iter_nelems}; first_reduction_ev = submit_no_atomic_reduction< argTy, resTy, ReductionOpT, InputOutputIterIndexerT, @@ -1804,9 +1795,8 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( InputIndexerT, ResIndexerT>; using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - const InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(reduction_groups_)}; + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ reduction_groups_}; constexpr ResIndexerT res_iter_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, @@ -1834,9 +1824,8 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( InputIndexerT, ResIndexerT>; using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - const InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(remaining_reduction_nelems)}; + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ remaining_reduction_nelems}; constexpr ResIndexerT res_iter_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, @@ -2732,9 +2721,8 @@ sycl::event search_over_group_temps_strided_impl( InputIndexerT, ResIndexerT>; using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - const InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(reduction_groups_)}; + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ reduction_groups_}; constexpr ResIndexerT res_iter_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, @@ -2765,9 +2753,8 @@ 
sycl::event search_over_group_temps_strided_impl( InputIndexerT, ResIndexerT>; using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - const InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(remaining_reduction_nelems)}; + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ remaining_reduction_nelems}; const ResIndexerT res_iter_indexer{ iter_nd, iter_res_offset, /* shape */ iter_shape_and_strides, @@ -2870,8 +2857,8 @@ sycl::event search_axis1_over_group_temps_contig_impl( using ReductionIndexerT = NoOpIndexerT; const InputOutputIterIndexerT in_out_iter_indexer{ - InputIterIndexerT{0, static_cast(iter_nelems), - static_cast(reduction_nelems)}, + InputIterIndexerT{/* size */ iter_nelems, + /* step */ reduction_nelems}, NoOpIndexerT{}}; constexpr ReductionIndexerT reduction_indexer{}; @@ -2909,8 +2896,8 @@ sycl::event search_axis1_over_group_temps_contig_impl( using ReductionIndexerT = NoOpIndexerT; const InputOutputIterIndexerT in_out_iter_indexer{ - InputIterIndexerT{0, static_cast(iter_nelems), - static_cast(reduction_nelems)}, + InputIterIndexerT{/* size */ iter_nelems, + /* step */ reduction_nelems}, NoOpIndexerT{}}; constexpr ReductionIndexerT reduction_indexer{}; @@ -2985,8 +2972,8 @@ sycl::event search_axis1_over_group_temps_contig_impl( using ReductionIndexerT = NoOpIndexerT; const InputOutputIterIndexerT in_out_iter_indexer{ - InputIterIndexerT{0, static_cast(iter_nelems), - static_cast(reduction_nelems)}, + InputIterIndexerT{/* size */ iter_nelems, + /* step */ reduction_nelems}, NoOpIndexerT{}}; constexpr ReductionIndexerT reduction_indexer{}; @@ -3027,9 +3014,8 @@ sycl::event search_axis1_over_group_temps_contig_impl( InputIndexerT, ResIndexerT>; using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - const InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(reduction_groups_)}; + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ reduction_groups_}; constexpr ResIndexerT res_iter_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, @@ -3060,9 +3046,8 @@ sycl::event search_axis1_over_group_temps_contig_impl( InputIndexerT, ResIndexerT>; using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - const InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(remaining_reduction_nelems)}; + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ remaining_reduction_nelems}; constexpr ResIndexerT res_iter_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, @@ -3151,9 +3136,8 @@ sycl::event search_axis0_over_group_temps_contig_impl( const InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, NoOpIndexerT{}}; - const ReductionIndexerT reduction_indexer{ - 0, static_cast(reduction_nelems), - static_cast(iter_nelems)}; + const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, + /* step */ iter_nelems}; sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); @@ -3197,9 +3181,8 @@ sycl::event search_axis0_over_group_temps_contig_impl( constexpr NoOpIndexerT result_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, result_indexer}; - const ReductionIndexerT reduction_indexer{ - 0, /* size */ static_cast(reduction_nelems), - /* step */ static_cast(iter_nelems)}; + const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, + /* step */ iter_nelems}; if (iter_nelems == 1) { // increase GPU occupancy @@ -3275,8 +3258,8 
@@ sycl::event search_axis0_over_group_temps_contig_impl( const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, result_indexer}; const ReductionIndexerT reduction_indexer{ - 0, /* size */ static_cast(reduction_nelems), - /* step */ static_cast(iter_nelems)}; + /* size */ reduction_nelems, + /* step */ iter_nelems}; first_reduction_ev = submit_search_reduction; using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - const InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(reduction_groups_)}; + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ reduction_groups_}; constexpr ResIndexerT res_iter_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, @@ -3348,9 +3330,8 @@ sycl::event search_axis0_over_group_temps_contig_impl( InputIndexerT, ResIndexerT>; using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - const InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(remaining_reduction_nelems)}; + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ remaining_reduction_nelems}; constexpr ResIndexerT res_iter_indexer{}; const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, diff --git a/dpctl/tensor/libtensor/include/kernels/repeat.hpp b/dpctl/tensor/libtensor/include/kernels/repeat.hpp index 9b12be2126..bfc019f55b 100644 --- a/dpctl/tensor/libtensor/include/kernels/repeat.hpp +++ b/dpctl/tensor/libtensor/include/kernels/repeat.hpp @@ -161,12 +161,13 @@ repeat_by_sequence_impl(sycl::queue &q, orthog_nd, src_offset, dst_offset, orthog_src_dst_shape_and_strides}; // indexers along repeated axis - const Strided1DIndexer src_axis_indexer{0, src_axis_shape, - src_axis_stride}; - const Strided1DIndexer dst_axis_indexer{0, dst_axis_shape, - dst_axis_stride}; + const Strided1DIndexer src_axis_indexer{/* size */ src_axis_shape, + /* step */ src_axis_stride}; + const Strided1DIndexer dst_axis_indexer{/* size */ dst_axis_shape, + /* step */ dst_axis_stride}; // indexer along reps array - const Strided1DIndexer reps_indexer{0, reps_shape, reps_stride}; + const Strided1DIndexer reps_indexer{/* size */ reps_shape, + /* step */ reps_stride}; const size_t gws = orthog_nelems * src_axis_nelems; @@ -235,9 +236,11 @@ sycl::event repeat_by_sequence_1d_impl(sycl::queue &q, constexpr TwoZeroOffsets_Indexer orthog_indexer{}; // indexers along repeated axis const StridedIndexer src_indexer{src_nd, 0, src_shape_strides}; - const Strided1DIndexer dst_indexer{0, dst_shape, dst_stride}; + const Strided1DIndexer dst_indexer{/* size */ dst_shape, + /* step */ dst_stride}; // indexer along reps array - const Strided1DIndexer reps_indexer{0, reps_shape, reps_stride}; + const Strided1DIndexer reps_indexer{/* size */ reps_shape, + /* step */ reps_stride}; const size_t gws = src_nelems; @@ -358,10 +361,10 @@ sycl::event repeat_by_scalar_impl(sycl::queue &q, const TwoOffsets_StridedIndexer orthog_indexer{ orthog_nd, src_offset, dst_offset, orthog_shape_and_strides}; // indexers along repeated axis - const Strided1DIndexer src_axis_indexer{0, src_axis_shape, - src_axis_stride}; - const Strided1DIndexer dst_axis_indexer{0, dst_axis_shape, - dst_axis_stride}; + const Strided1DIndexer src_axis_indexer{/* size */ src_axis_shape, + /* step */ src_axis_stride}; + const Strided1DIndexer dst_axis_indexer{/* size */ dst_axis_shape, + /* step */ dst_axis_stride}; const size_t gws = orthog_nelems * dst_axis_nelems; @@ -420,7 +423,8 @@ sycl::event repeat_by_scalar_1d_impl(sycl::queue &q, constexpr 
TwoZeroOffsets_Indexer orthog_indexer{};
     // indexers along repeated axis
     const StridedIndexer src_indexer(src_nd, 0, src_shape_strides);
-    const Strided1DIndexer dst_indexer{0, dst_shape, dst_stride};
+    const Strided1DIndexer dst_indexer{/* size */ dst_shape,
+                                       /* step */ dst_stride};
 
     const size_t gws = dst_nelems;
 
diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/searchsorted.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/searchsorted.hpp
index cf8f7e32f4..4c1f5c5c93 100644
--- a/dpctl/tensor/libtensor/include/kernels/sorting/searchsorted.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/sorting/searchsorted.hpp
@@ -218,8 +218,8 @@ sycl::event searchsorted_strided_impl(
     using HayIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
     const HayIndexerT hay_indexer(
         /* offset */ hay_offset,
-        /* size */ static_cast<ssize_t>(hay_nelems),
-        /* step */ static_cast<ssize_t>(hay_stride));
+        /* size */ hay_nelems,
+        /* step */ hay_stride);
 
     using NeedlesIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
     const ssize_t *needles_shape_strides = packed_shape_strides;
diff --git a/dpctl/tensor/libtensor/include/utils/offset_utils.hpp b/dpctl/tensor/libtensor/include/utils/offset_utils.hpp
index b7e0f53acc..1a6ca8dc47 100644
--- a/dpctl/tensor/libtensor/include/utils/offset_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/offset_utils.hpp
@@ -213,6 +213,31 @@ struct UnpackedStridedIndexer
 
 struct Strided1DIndexer
 {
+    Strided1DIndexer(size_t _size) : offset{}, size(_size), step(1) {}
+    Strided1DIndexer(ssize_t _size)
+        : offset{}, size(static_cast<size_t>(_size)), step(1)
+    {
+    }
+    Strided1DIndexer(size_t _size, ssize_t _step)
+        : offset{}, size(_size), step(_step)
+    {
+    }
+    Strided1DIndexer(size_t _size, size_t _step)
+        : offset{}, size(_size), step(static_cast<ssize_t>(_step))
+    {
+    }
+    Strided1DIndexer(ssize_t _size, ssize_t _step)
+        : offset{}, size(static_cast<size_t>(_size)), step(_step)
+    {
+    }
+    Strided1DIndexer(ssize_t _offset, size_t _size, ssize_t _step)
+        : offset(_offset), size(_size), step(_step)
+    {
+    }
+    Strided1DIndexer(ssize_t _offset, size_t _size, size_t _step)
+        : offset(_offset), size(_size), step(static_cast<ssize_t>(_step))
+    {
+    }
     Strided1DIndexer(ssize_t _offset, ssize_t _size, ssize_t _step)
         : offset(_offset), size(static_cast<size_t>(_size)), step(_step)
     {
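
Note (not part of the patch): a minimal standalone sketch of the call-site style the new
overloads enable. MiniStrided1DIndexer is a simplified, hypothetical stand-in for
dpctl::tensor::offset_utils::Strided1DIndexer that carries only the two zero-offset
overloads; its clamping operator() and the batch extents used in main are assumptions for
illustration, not the library's definition.

// Illustrative sketch only; simplified mirror of the new two-argument,
// zero-offset constructors so the call-site change can be compiled standalone.
#include <algorithm>
#include <cstddef>
#include <iostream>

struct MiniStrided1DIndexer
{
    // size/step can be passed as either signed or unsigned values without casts
    MiniStrided1DIndexer(std::size_t _size, std::ptrdiff_t _step)
        : offset{}, size(_size), step(_step)
    {
    }
    MiniStrided1DIndexer(std::size_t _size, std::size_t _step)
        : offset{}, size(_size), step(static_cast<std::ptrdiff_t>(_step))
    {
    }

    std::ptrdiff_t operator()(std::size_t gid) const
    {
        // clamp gid to the valid range before applying the stride (assumed behavior)
        return offset + static_cast<std::ptrdiff_t>(
                            std::min<std::size_t>(gid, size - 1)) *
                            step;
    }

    std::ptrdiff_t offset;
    std::size_t size;
    std::ptrdiff_t step;
};

int main()
{
    const std::size_t batch_nelems = 4; // hypothetical batch count
    const std::size_t n = 3, m = 5;     // hypothetical matrix extents

    // old style (pre-patch): {0, static_cast<ssize_t>(batch_nelems),
    //                            static_cast<ssize_t>(n * m)}
    // new style: zero-offset shortcut constructor, no explicit casts
    const MiniStrided1DIndexer res_batch_indexer{/* size */ batch_nelems,
                                                 /* step */ n * m};

    std::cout << res_batch_indexer(2) << "\n"; // 2 * (n * m) == 30
    return 0;
}

The /* size */ and /* step */ comments at the call site mirror the annotation convention
the patch adopts throughout the kernel sources above.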