diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index be9426a834..0bd7a30cd0 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -100,7 +100,9 @@ isfinite, isinf, isnan, + multiply, sqrt, + subtract, ) __all__ = [ @@ -186,5 +188,7 @@ "isfinite", "sqrt", "divide", + "multiply", + "subtract", "equal", ] diff --git a/dpctl/tensor/_elementwise_funcs.py b/dpctl/tensor/_elementwise_funcs.py index 27c549c034..29d1052c9c 100644 --- a/dpctl/tensor/_elementwise_funcs.py +++ b/dpctl/tensor/_elementwise_funcs.py @@ -34,7 +34,7 @@ # B01: ===== ADD (x1, x2) _add_docstring_ = """ -add(x1, x2, order='K') +add(x1, x2, out=None, order='K') Calculates the sum for each element `x1_i` of the input array `x1` with the respective element `x2_i` of the input array `x2`. @@ -94,7 +94,7 @@ # U11: ==== COS (x) _cos_docstring = """ -cos(x, order='K') +cos(x, out=None, order='K') Computes cosine for each element `x_i` for input array `x`. """ @@ -106,7 +106,7 @@ # B08: ==== DIVIDE (x1, x2) _divide_docstring_ = """ -divide(x1, x2, order='K') +divide(x1, x2, out=None, order='K') Calculates the ratio for each element `x1_i` of the input array `x1` with the respective element `x2_i` of the input array `x2`. @@ -128,7 +128,7 @@ # B09: ==== EQUAL (x1, x2) _equal_docstring_ = """ -equal(x1, x2, order='K') +equal(x1, x2, out=None, order='K') Calculates equality test results for each element `x1_i` of the input array `x1` with the respective element `x2_i` of the input array `x2`. @@ -172,6 +172,8 @@ # U17: ==== ISFINITE (x) _isfinite_docstring_ = """ +isfinite(x, out=None, order='K') + Computes if every element of input array is a finite number. """ @@ -181,6 +183,8 @@ # U18: ==== ISINF (x) _isinf_docstring_ = """ +isinf(x, out=None, order='K') + Computes if every element of input array is an infinity. """ @@ -190,6 +194,8 @@ # U19: ==== ISNAN (x) _isnan_docstring_ = """ +isnan(x, out=None, order='K') + Computes if every element of input array is a NaN. """ @@ -231,7 +237,25 @@ # FIXME: implement B18 # B19: ==== MULTIPLY (x1, x2) -# FIXME: implement B19 +_multiply_docstring_ = """ +multiply(x1, x2, out=None, order='K') + +Calculates the product for each element `x1_i` of the input array `x1` +with the respective element `x2_i` of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array, expected to have numeric data type. + x2 (usm_ndarray): + Second input array, also expected to have numeric data type. +Returns: + usm_ndarray: + an array containing the element-wise products. The data type of + the returned array is determined by the Type Promotion Rules. +""" +multiply = BinaryElementwiseFunc( + "multiply", ti._multiply_result_type, ti._multiply, _multiply_docstring_ +) # U25: ==== NEGATIVE (x) # FIXME: implement U25 @@ -268,6 +292,8 @@ # U33: ==== SQRT (x) _sqrt_docstring_ = """ +sqrt(x, out=None, order='K') + Computes sqrt for each element `x_i` for input array `x`. """ @@ -276,7 +302,26 @@ ) # B23: ==== SUBTRACT (x1, x2) -# FIXME: implement B23 +_subtract_docstring_ = """ +subtract(x1, x2, out=None, order='K') + +Calculates the difference between each element `x1_i` of the input +array `x1` and the respective element `x2_i` of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array, expected to have numeric data type. + x2 (usm_ndarray): + Second input array, also expected to have numeric data type. +Returns: + usm_ndarray: + an array containing the element-wise differences. The data type + of the returned array is determined by the Type Promotion Rules. 
+""" +subtract = BinaryElementwiseFunc( + "subtract", ti._subtract_result_type, ti._subtract, _subtract_docstring_ +) + # U34: ==== TAN (x) # FIXME: implement U34 diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp index 09f2995874..c734fd54ec 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp @@ -115,27 +115,9 @@ sycl::event abs_contig_impl(sycl::queue exec_q, char *res_p, const std::vector &depends = {}) { - sycl::event abs_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - - size_t lws = 64; - constexpr unsigned int vec_sz = 4; - constexpr unsigned int n_vecs = 2; - const size_t n_groups = - ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); - const auto gws_range = sycl::range<1>(n_groups * lws); - const auto lws_range = sycl::range<1>(lws); - - using resTy = typename AbsOutputType::value_type; - const argTy *arg_tp = reinterpret_cast(arg_p); - resTy *res_tp = reinterpret_cast(res_p); - - cgh.parallel_for>( - sycl::nd_range<1>(gws_range, lws_range), - AbsContigFunctor(arg_tp, res_tp, - nelems)); - }); - return abs_ev; + return elementwise_common::unary_contig_impl< + argTy, AbsOutputType, AbsContigFunctor, abs_contig_kernel>( + exec_q, nelems, arg_p, res_p, depends); } template struct AbsContigFactory @@ -182,24 +164,10 @@ sycl::event abs_strided_impl(sycl::queue exec_q, const std::vector &depends, const std::vector &additional_depends) { - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.depends_on(additional_depends); - - using resTy = typename AbsOutputType::value_type; - using IndexerT = - typename dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; - - IndexerT indexer{nd, arg_offset, res_offset, shape_and_strides}; - - const argTy *arg_tp = reinterpret_cast(arg_p); - resTy *res_tp = reinterpret_cast(res_p); - - cgh.parallel_for>( - {nelems}, - AbsStridedFunctor(arg_tp, res_tp, indexer)); - }); - return comp_ev; + return elementwise_common::unary_strided_impl< + argTy, AbsOutputType, AbsStridedFunctor, abs_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); } template struct AbsStridedFactory diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp index d0ab25d270..2f3dec3d14 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp @@ -184,32 +184,10 @@ sycl::event add_contig_impl(sycl::queue exec_q, py::ssize_t res_offset, const std::vector &depends = {}) { - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - - size_t lws = 64; - constexpr unsigned int vec_sz = 4; - constexpr unsigned int n_vecs = 2; - const size_t n_groups = - ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); - const auto gws_range = sycl::range<1>(n_groups * lws); - const auto lws_range = sycl::range<1>(lws); - - using resTy = typename AddOutputType::value_type; - - const argTy1 *arg1_tp = - reinterpret_cast(arg1_p) + arg1_offset; - const argTy2 *arg2_tp = - reinterpret_cast(arg2_p) + arg2_offset; - resTy *res_tp = reinterpret_cast(res_p) + res_offset; - - cgh.parallel_for< - add_contig_kernel>( - 
sycl::nd_range<1>(gws_range, lws_range), - AddContigFunctor( - arg1_tp, arg2_tp, res_tp, nelems)); - }); - return comp_ev; + return elementwise_common::binary_contig_impl< + argTy1, argTy2, AddOutputType, AddContigFunctor, add_contig_kernel>( + exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends); } template struct AddContigFactory @@ -256,28 +234,11 @@ sycl::event add_strided_impl(sycl::queue exec_q, const std::vector &depends, const std::vector &additional_depends) { - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.depends_on(additional_depends); - - using resTy = typename AddOutputType::value_type; - - using IndexerT = - typename dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer; - - IndexerT indexer{nd, arg1_offset, arg2_offset, res_offset, - shape_and_strides}; - - const argTy1 *arg1_tp = reinterpret_cast(arg1_p); - const argTy2 *arg2_tp = reinterpret_cast(arg2_p); - resTy *res_tp = reinterpret_cast(res_p); - - cgh.parallel_for< - add_strided_strided_kernel>( - {nelems}, AddStridedFunctor( - arg1_tp, arg2_tp, res_tp, indexer)); - }); - return comp_ev; + return elementwise_common::binary_strided_impl< + argTy1, argTy2, AddOutputType, AddStridedFunctor, + add_strided_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); } template struct AddStridedFactory @@ -322,62 +283,11 @@ sycl::event add_contig_matrix_contig_row_broadcast_impl( py::ssize_t res_offset, const std::vector &depends = {}) { - const argT1 *mat = reinterpret_cast(mat_p) + mat_offset; - const argT2 *vec = reinterpret_cast(vec_p) + vec_offset; - resT *res = reinterpret_cast(res_p) + res_offset; - - const auto &dev = exec_q.get_device(); - const auto &sg_sizes = dev.get_info(); - // Get device-specific kernel info max_sub_group_size - size_t max_sgSize = - *(std::max_element(std::begin(sg_sizes), std::end(sg_sizes))); - - size_t n1_padded = n1 + max_sgSize; - argT2 *padded_vec = sycl::malloc_device(n1_padded, exec_q); - - if (padded_vec == nullptr) { - throw std::runtime_error("Could not allocate memory on the device"); - } - sycl::event make_padded_vec_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); // ensure vec contains actual data - cgh.parallel_for({n1_padded}, [=](sycl::id<1> id) { - auto i = id[0]; - padded_vec[i] = vec[i % n1]; - }); - }); - - // sub-group spans work-items [I, I + sgSize) - // base = ndit.get_global_linear_id() - sg.get_local_id()[0] - // Generically, sg.load( &mat[base]) may load arrays from - // different rows of mat. The start corresponds to row (base / n0) - // We read sg.load(&padded_vec[(base / n0)]). 
The vector is padded to - // ensure that reads are accessible - - size_t lws = 64; - - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(make_padded_vec_ev); - - auto lwsRange = sycl::range<1>(lws); - size_t n_elems = n0 * n1; - size_t n_groups = (n_elems + lws - 1) / lws; - auto gwsRange = sycl::range<1>(n_groups * lws); - - cgh.parallel_for< - class add_matrix_row_broadcast_sg_krn>( - sycl::nd_range<1>(gwsRange, lwsRange), - AddContigMatrixContigRowBroadcastingFunctor( - mat, padded_vec, res, n_elems, n1)); - }); - - sycl::event tmp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(comp_ev); - sycl::context ctx = exec_q.get_context(); - cgh.host_task([ctx, padded_vec]() { sycl::free(padded_vec, ctx); }); - }); - host_tasks.push_back(tmp_cleanup_ev); - - return comp_ev; + return elementwise_common::binary_contig_matrix_contig_row_broadcast_impl< + argT1, argT2, resT, AddContigMatrixContigRowBroadcastingFunctor, + add_matrix_row_broadcast_sg_krn>(exec_q, host_tasks, n0, n1, mat_p, + mat_offset, vec_p, vec_offset, res_p, + res_offset, depends); } template diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp index a1108c541c..ed38f380df 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp @@ -249,6 +249,81 @@ struct UnaryStridedFunctor } }; +template + class UnaryOutputType, + template + class ContigFunctorT, + template + class kernel_name, + unsigned int vec_sz = 4, + unsigned int n_vecs = 2> +sycl::event unary_contig_impl(sycl::queue exec_q, + size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + size_t lws = 64; + const size_t n_groups = + ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); + const auto gws_range = sycl::range<1>(n_groups * lws); + const auto lws_range = sycl::range<1>(lws); + + using resTy = typename UnaryOutputType::value_type; + const argTy *arg_tp = reinterpret_cast(arg_p); + resTy *res_tp = reinterpret_cast(res_p); + + cgh.parallel_for>( + sycl::nd_range<1>(gws_range, lws_range), + ContigFunctorT(arg_tp, res_tp, + nelems)); + }); + return comp_ev; +} + +template + class UnaryOutputType, + template + class StridedFunctorT, + template + class kernel_name> +sycl::event +unary_strided_impl(sycl::queue exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + using resTy = typename UnaryOutputType::value_type; + using IndexerT = + typename dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + + IndexerT indexer{nd, arg_offset, res_offset, shape_and_strides}; + + const argTy *arg_tp = reinterpret_cast(arg_p); + resTy *res_tp = reinterpret_cast(res_p); + + cgh.parallel_for>( + {nelems}, + StridedFunctorT(arg_tp, res_tp, indexer)); + }); + return comp_ev; +} + template &); +template + class BinaryOutputType, + template + class BinaryContigFunctorT, + template + class kernel_name, + unsigned int vec_sz = 4, + unsigned int n_vecs = 2> +sycl::event 
binary_contig_impl(sycl::queue exec_q, + size_t nelems, + const char *arg1_p, + py::ssize_t arg1_offset, + const char *arg2_p, + py::ssize_t arg2_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + size_t lws = 64; + const size_t n_groups = + ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); + const auto gws_range = sycl::range<1>(n_groups * lws); + const auto lws_range = sycl::range<1>(lws); + + using resTy = typename BinaryOutputType::value_type; + + const argTy1 *arg1_tp = + reinterpret_cast(arg1_p) + arg1_offset; + const argTy2 *arg2_tp = + reinterpret_cast(arg2_p) + arg2_offset; + resTy *res_tp = reinterpret_cast(res_p) + res_offset; + + cgh.parallel_for>( + sycl::nd_range<1>(gws_range, lws_range), + BinaryContigFunctorT( + arg1_tp, arg2_tp, res_tp, nelems)); + }); + return comp_ev; +} + +template + class BinaryOutputType, + template + class BinaryStridedFunctorT, + template + class kernel_name> +sycl::event +binary_strided_impl(sycl::queue exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg1_p, + py::ssize_t arg1_offset, + const char *arg2_p, + py::ssize_t arg2_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + using resTy = typename BinaryOutputType::value_type; + + using IndexerT = + typename dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer; + + IndexerT indexer{nd, arg1_offset, arg2_offset, res_offset, + shape_and_strides}; + + const argTy1 *arg1_tp = reinterpret_cast(arg1_p); + const argTy2 *arg2_tp = reinterpret_cast(arg2_p); + resTy *res_tp = reinterpret_cast(res_p); + + cgh.parallel_for>( + {nelems}, BinaryStridedFunctorT( + arg1_tp, arg2_tp, res_tp, indexer)); + }); + return comp_ev; +} + +template + class BinaryContigMatrixContigRowBroadcastFunctorT, + template + class kernel_name> +sycl::event binary_contig_matrix_contig_row_broadcast_impl( + sycl::queue exec_q, + std::vector &host_tasks, + size_t n0, + size_t n1, + const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + py::ssize_t mat_offset, + const char *vec_p, // typeless pointer to (n1,) contiguous row + py::ssize_t vec_offset, + char *res_p, // typeless pointer to (n0, n1) result C-contig. 
matrix, + // res[i,j] = op(mat[i,j], vec[j]) + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + const argT1 *mat = reinterpret_cast(mat_p) + mat_offset; + const argT2 *vec = reinterpret_cast(vec_p) + vec_offset; + resT *res = reinterpret_cast(res_p) + res_offset; + + const auto &dev = exec_q.get_device(); + const auto &sg_sizes = dev.get_info(); + // Get device-specific kernel info max_sub_group_size + size_t max_sgSize = + *(std::max_element(std::begin(sg_sizes), std::end(sg_sizes))); + + size_t n1_padded = n1 + max_sgSize; + argT2 *padded_vec = sycl::malloc_device(n1_padded, exec_q); + + if (padded_vec == nullptr) { + throw std::runtime_error("Could not allocate memory on the device"); + } + sycl::event make_padded_vec_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); // ensure vec contains actual data + cgh.parallel_for({n1_padded}, [=](sycl::id<1> id) { + auto i = id[0]; + padded_vec[i] = vec[i % n1]; + }); + }); + + // sub-group spans work-items [I, I + sgSize) + // base = ndit.get_global_linear_id() - sg.get_local_id()[0] + // Generically, sg.load( &mat[base]) may load arrays from + // different rows of mat. The start corresponds to row (base / n1) + // We read sg.load(&padded_vec[(base % n1)]). The vector is padded to + // ensure that such reads stay in bounds + + size_t lws = 64; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(make_padded_vec_ev); + + auto lwsRange = sycl::range<1>(lws); + size_t n_elems = n0 * n1; + size_t n_groups = (n_elems + lws - 1) / lws; + auto gwsRange = sycl::range<1>(n_groups * lws); + + cgh.parallel_for>( + sycl::nd_range<1>(gwsRange, lwsRange), + BinaryContigMatrixContigRowBroadcastFunctorT( + mat, padded_vec, res, n_elems, n1)); + }); + + sycl::event tmp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(comp_ev); + sycl::context ctx = exec_q.get_context(); + cgh.host_task([ctx, padded_vec]() { sycl::free(padded_vec, ctx); }); + }); + host_tasks.push_back(tmp_cleanup_ev); + + return comp_ev; +} + +template + class BinaryContigRowContigMatrixBroadcastFunctorT, + template + class kernel_name> +sycl::event binary_contig_row_contig_matrix_broadcast_impl( + sycl::queue exec_q, + std::vector &host_tasks, + size_t n0, + size_t n1, + const char *vec_p, // typeless pointer to (n1,) contiguous row + py::ssize_t vec_offset, + const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + py::ssize_t mat_offset, + char *res_p, // typeless pointer to (n0, n1) result C-contig. 
matrix, + // res[i,j] = op(vec[j], mat[i,j]) + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + const argT1 *vec = reinterpret_cast(vec_p) + vec_offset; + const argT2 *mat = reinterpret_cast(mat_p) + mat_offset; + resT *res = reinterpret_cast(res_p) + res_offset; + + const auto &dev = exec_q.get_device(); + const auto &sg_sizes = dev.get_info(); + // Get device-specific kernel info max_sub_group_size + size_t max_sgSize = + *(std::max_element(std::begin(sg_sizes), std::end(sg_sizes))); + + size_t n1_padded = n1 + max_sgSize; + argT1 *padded_vec = sycl::malloc_device(n1_padded, exec_q); + + if (padded_vec == nullptr) { + throw std::runtime_error("Could not allocate memory on the device"); + } + + sycl::event make_padded_vec_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); // ensure vec contains actual data + cgh.parallel_for({n1_padded}, [=](sycl::id<1> id) { + auto i = id[0]; + padded_vec[i] = vec[i % n1]; + }); + }); + + // sub-group spans work-items [I, I + sgSize) + // base = ndit.get_global_linear_id() - sg.get_local_id()[0] + // Generically, sg.load( &mat[base]) may load arrays from + // different rows of mat. The start corresponds to row (base / n1) + // We read sg.load(&padded_vec[(base % n1)]). The vector is padded to + // ensure that such reads stay in bounds + + size_t lws = 64; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(make_padded_vec_ev); + + auto lwsRange = sycl::range<1>(lws); + size_t n_elems = n0 * n1; + size_t n_groups = (n_elems + lws - 1) / lws; + auto gwsRange = sycl::range<1>(n_groups * lws); + + cgh.parallel_for>( + sycl::nd_range<1>(gwsRange, lwsRange), + BinaryContigRowContigMatrixBroadcastFunctorT( + padded_vec, mat, res, n_elems, n1)); + }); + + sycl::event tmp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(comp_ev); + sycl::context ctx = exec_q.get_context(); + cgh.host_task([ctx, padded_vec]() { sycl::free(padded_vec, ctx); }); + }); + host_tasks.push_back(tmp_cleanup_ev); + + return comp_ev; +} + } // namespace elementwise_common } // namespace kernels } // namespace tensor diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp index b6859910e3..7639facb07 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp @@ -103,27 +103,9 @@ sycl::event cos_contig_impl(sycl::queue exec_q, char *res_p, const std::vector &depends = {}) { - sycl::event cos_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - constexpr size_t lws = 64; - constexpr unsigned int vec_sz = 4; - constexpr unsigned int n_vecs = 2; - static_assert(lws % vec_sz == 0); - auto gws_range = sycl::range<1>( - ((nelems + n_vecs * lws * vec_sz - 1) / (lws * n_vecs * vec_sz)) * - lws); - auto lws_range = sycl::range<1>(lws); - - using resTy = typename CosOutputType::value_type; - const argTy *arg_tp = reinterpret_cast(arg_p); - resTy *res_tp = reinterpret_cast(res_p); - - cgh.parallel_for>( - sycl::nd_range<1>(gws_range, lws_range), - CosContigFunctor(arg_tp, res_tp, - nelems)); - }); - return cos_ev; + return elementwise_common::unary_contig_impl< + argTy, CosOutputType, CosContigFunctor, cos_contig_kernel>( + exec_q, nelems, arg_p, res_p, depends); } template struct CosContigFactory @@ -166,26 +148,10 @@ sycl::event cos_strided_impl(sycl::queue exec_q, const std::vector &depends, const 
std::vector &additional_depends) { - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.depends_on(additional_depends); - - using resTy = typename CosOutputType::value_type; - using IndexerT = - typename dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; - - IndexerT arg_res_indexer(nd, arg_offset, res_offset, shape_and_strides); - - const argTy *arg_tp = reinterpret_cast(arg_p); - resTy *res_tp = reinterpret_cast(res_p); - - sycl::range<1> gRange{nelems}; - - cgh.parallel_for>( - gRange, CosStridedFunctor(arg_tp, res_tp, - arg_res_indexer)); - }); - return comp_ev; + return elementwise_common::unary_strided_impl< + argTy, CosOutputType, CosStridedFunctor, cos_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); } template struct CosStridedFactory diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp index edbbc393c5..a77ad54e85 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp @@ -167,32 +167,10 @@ sycl::event equal_contig_impl(sycl::queue exec_q, py::ssize_t res_offset, const std::vector &depends = {}) { - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - - size_t lws = 64; - constexpr unsigned int vec_sz = 4; - constexpr unsigned int n_vecs = 2; - const size_t n_groups = - ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); - const auto gws_range = sycl::range<1>(n_groups * lws); - const auto lws_range = sycl::range<1>(lws); - - using resTy = typename EqualOutputType::value_type; - - const argTy1 *arg1_tp = - reinterpret_cast(arg1_p) + arg1_offset; - const argTy2 *arg2_tp = - reinterpret_cast(arg2_p) + arg2_offset; - resTy *res_tp = reinterpret_cast(res_p) + res_offset; - - cgh.parallel_for< - equal_contig_kernel>( - sycl::nd_range<1>(gws_range, lws_range), - EqualContigFunctor( - arg1_tp, arg2_tp, res_tp, nelems)); - }); - return comp_ev; + return elementwise_common::binary_contig_impl< + argTy1, argTy2, EqualOutputType, EqualContigFunctor, + equal_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends); } template struct EqualContigFactory @@ -240,28 +218,11 @@ equal_strided_impl(sycl::queue exec_q, const std::vector &depends, const std::vector &additional_depends) { - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.depends_on(additional_depends); - - using resTy = typename EqualOutputType::value_type; - - using IndexerT = - typename dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer; - - IndexerT indexer{nd, arg1_offset, arg2_offset, res_offset, - shape_and_strides}; - - const argTy1 *arg1_tp = reinterpret_cast(arg1_p); - const argTy2 *arg2_tp = reinterpret_cast(arg2_p); - resTy *res_tp = reinterpret_cast(res_p); - - cgh.parallel_for< - equal_strided_strided_kernel>( - {nelems}, EqualStridedFunctor( - arg1_tp, arg2_tp, res_tp, indexer)); - }); - return comp_ev; + return elementwise_common::binary_strided_impl< + argTy1, argTy2, EqualOutputType, EqualStridedFunctor, + equal_strided_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); } template struct EqualStridedFactory diff --git 
a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp index 86340329da..cc91bdbb97 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp @@ -113,7 +113,7 @@ template struct IsFiniteOutputType using value_type = bool; }; -template +template class isfinite_contig_kernel; template @@ -123,29 +123,10 @@ sycl::event isfinite_contig_impl(sycl::queue exec_q, char *res_p, const std::vector &depends = {}) { - sycl::event isfinite_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - - constexpr size_t lws = 64; - constexpr std::uint8_t vec_sz = 4; - constexpr std::uint8_t n_vecs = 2; - static_assert(lws % vec_sz == 0); - size_t n_groups = - ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); - auto gws_range = sycl::range<1>(n_groups * lws); - auto lws_range = sycl::range<1>(lws); - - using resTy = typename IsFiniteOutputType::value_type; - const argTy *arg_tp = reinterpret_cast(arg_p); - resTy *res_tp = reinterpret_cast(res_p); - - cgh.parallel_for< - class isfinite_contig_kernel>( - sycl::nd_range<1>(gws_range, lws_range), - IsFiniteContigFunctor(arg_tp, res_tp, - nelems)); - }); - return isfinite_ev; + return elementwise_common::unary_contig_impl( + exec_q, nelems, arg_p, res_p, depends); } template struct IsFiniteContigFactory @@ -182,24 +163,11 @@ isfinite_strided_impl(sycl::queue exec_q, const std::vector &depends, const std::vector &additional_depends) { - sycl::event isfinite_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.depends_on(additional_depends); - - using resTy = typename IsFiniteOutputType::value_type; - using IndexerT = - typename dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; - - IndexerT arg_res_indexer{nd, arg_offset, res_offset, shape_and_strides}; - - const argTy *arg_tptr = reinterpret_cast(arg_p); - resTy *res_tptr = reinterpret_cast(res_p); - - cgh.parallel_for>( - {nelems}, IsFiniteStridedFunctor( - arg_tptr, res_tptr, arg_res_indexer)); - }); - return isfinite_ev; + return elementwise_common::unary_strided_impl( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); } template struct IsFiniteStridedFactory diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp index 8f7dcca6c7..8cb93ea456 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp @@ -111,7 +111,7 @@ template struct IsInfOutputType using value_type = bool; }; -template +template class isinf_contig_kernel; template @@ -121,28 +121,9 @@ sycl::event isinf_contig_impl(sycl::queue exec_q, char *res_p, const std::vector &depends = {}) { - sycl::event isinf_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - constexpr size_t lws = 64; - constexpr std::uint8_t vec_sz = 4; - constexpr std::uint8_t n_vecs = 2; - static_assert(lws % vec_sz == 0); - auto gws_range = sycl::range<1>( - ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)) * - lws); - auto lws_range = sycl::range<1>(lws); - - using resTy = typename IsInfOutputType::value_type; - const argTy *arg_tp = reinterpret_cast(arg_p); - resTy *res_tp = reinterpret_cast(res_p); - - 
cgh.parallel_for< - class isinf_contig_kernel>( - sycl::nd_range<1>(gws_range, lws_range), - IsInfContigFunctor(arg_tp, res_tp, - nelems)); - }); - return isinf_ev; + return elementwise_common::unary_contig_impl< + argTy, IsInfOutputType, IsInfContigFunctor, isinf_contig_kernel>( + exec_q, nelems, arg_p, res_p, depends); } template struct IsInfContigFactory @@ -179,26 +160,10 @@ isinf_strided_impl(sycl::queue exec_q, const std::vector &depends, const std::vector &additional_depends) { - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.depends_on(additional_depends); - - using resTy = typename IsInfOutputType::value_type; - using IndexerT = - typename dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; - - IndexerT arg_res_indexer{nd, arg_offset, res_offset, shape_and_strides}; - - const argTy *arg_tptr = reinterpret_cast(arg_p); - resTy *res_tptr = reinterpret_cast(res_p); - - sycl::range<1> gRange{nelems}; - - cgh.parallel_for>( - gRange, IsInfStridedFunctor( - arg_tptr, res_tptr, arg_res_indexer)); - }); - return comp_ev; + return elementwise_common::unary_strided_impl< + argTy, IsInfOutputType, IsInfStridedFunctor, isinf_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); } template struct IsInfStridedFactory diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp index 3e4f68ed57..5d628437ed 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp @@ -104,119 +104,12 @@ template using IsNanStridedFunctor = elementwise_common:: UnaryStridedFunctor>; -template -struct IsNanContigFunctorOld -{ -private: - const argT *in = nullptr; - resT *out = nullptr; - const size_t nelems_; - -public: - IsNanContigFunctorOld(const argT *inp, resT *res, const size_t nelems) - : in(inp), out(res), nelems_(nelems) - { - } - - void operator()(sycl::nd_item<1> ndit) const - { - using dpctl::tensor::type_utils::is_complex; - using dpctl::tensor::type_utils::vec_cast; - - if constexpr (is_complex::value) { - std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0]; - size_t base = ndit.get_global_linear_id(); - - base = (base / sgSize) * sgSize * n_vecs * vec_sz + (base % sgSize); - for (size_t offset = base; - offset < std::min(nelems_, base + sgSize * (n_vecs * vec_sz)); - offset += sgSize) - { - const bool real_isnan = sycl::isnan(std::real(in[offset])); - const bool imag_isnan = sycl::isnan(std::imag(in[offset])); - out[offset] = real_isnan || imag_isnan; - } - } - else if constexpr (std::is_same::value || - std::is_integral::value) - { - using out_ptrT = - sycl::multi_ptr; - - auto sg = ndit.get_sub_group(); - std::uint8_t sgSize = sg.get_local_range()[0]; - std::uint8_t max_sgSize = sg.get_max_local_range()[0]; - size_t base = n_vecs * vec_sz * - (ndit.get_group(0) * ndit.get_local_range(0) + - sg.get_group_id()[0] * sgSize); - if (base + n_vecs * vec_sz * max_sgSize < nelems_ && - max_sgSize == sgSize) { - sycl::vec res_vec(false); -#pragma unroll - for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - sg.store(out_ptrT(&out[base + it * sgSize]), - res_vec); - } - } - else { - for (size_t k = base + sg.get_local_id()[0]; k < nelems_; - k += sgSize) { - out[k] = false; - } - } - } - else { - using in_ptrT = - sycl::multi_ptr; - using out_ptrT = - 
sycl::multi_ptr; - static_assert(std::is_same::value); - - auto sg = ndit.get_sub_group(); - std::uint16_t sgSize = sg.get_local_range()[0]; - std::uint16_t max_sgSize = sg.get_max_local_range()[0]; - size_t base = n_vecs * vec_sz * - (ndit.get_group(0) * ndit.get_local_range(0) + - sg.get_group_id()[0] * max_sgSize); - if (base + n_vecs * vec_sz * max_sgSize < nelems_ && - sgSize == max_sgSize) { - sycl::vec x; - -#pragma unroll - for (std::uint16_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - x = sg.load(in_ptrT(&in[base + it * sgSize])); - // returns vec - auto res_vec = sycl::isnan(x); - // cast it to bool - sycl::vec res_bool = - vec_cast(res_vec); - sg.store(out_ptrT(&out[base + it * sgSize]), - res_bool); - } - } - else { - for (size_t k = base + sg.get_local_id()[0]; k < nelems_; - k += sgSize) { - out[k] = static_cast(sycl::isnan(in[k])); - } - } - } - } -}; - template struct IsNanOutputType { using value_type = bool; }; -template +template class isnan_contig_kernel; template @@ -226,28 +119,9 @@ sycl::event isnan_contig_impl(sycl::queue exec_q, char *res_p, const std::vector &depends = {}) { - sycl::event isnan_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - constexpr size_t lws = 64; - constexpr std::uint8_t vec_sz = 4; - constexpr std::uint8_t n_vecs = 2; - static_assert(lws % vec_sz == 0); - auto gws_range = sycl::range<1>( - ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)) * - lws); - auto lws_range = sycl::range<1>(lws); - - using resTy = typename IsNanOutputType::value_type; - const argTy *arg_tp = reinterpret_cast(arg_p); - resTy *res_tp = reinterpret_cast(res_p); - - cgh.parallel_for< - class isnan_contig_kernel>( - sycl::nd_range<1>(gws_range, lws_range), - IsNanContigFunctor(arg_tp, res_tp, - nelems)); - }); - return isnan_ev; + return elementwise_common::unary_contig_impl< + argTy, IsNanOutputType, IsNanContigFunctor, isnan_contig_kernel>( + exec_q, nelems, arg_p, res_p, depends); } template struct IsNanContigFactory @@ -269,48 +143,6 @@ template struct IsNanTypeMapFactory } }; -template -struct IsNanStridedFunctorOld -{ -private: - const argT *inp_ = nullptr; - resT *res_ = nullptr; - IndexerT inp_out_indexer_; - -public: - IsNanStridedFunctorOld(const argT *inp_p, - resT *res_p, - IndexerT inp_out_indexer) - : inp_(inp_p), res_(res_p), inp_out_indexer_(inp_out_indexer) - { - } - - void operator()(sycl::id<1> wid) const - { - const argT *const &in = inp_; - resT *const &out = res_; - - auto offsets_ = inp_out_indexer_(wid.get(0)); - const py::ssize_t &inp_offset = offsets_.get_first_offset(); - const py::ssize_t &out_offset = offsets_.get_second_offset(); - - using dpctl::tensor::type_utils::is_complex; - if constexpr (std::is_same_v || - (std::is_integral::value)) { - out[out_offset] = false; - } - else if constexpr (is_complex::value) { - const bool real_isnan = sycl::isnan(std::real(in[inp_offset])); - const bool imag_isnan = sycl::isnan(std::imag(in[inp_offset])); - - out[out_offset] = real_isnan || imag_isnan; - } - else { - out[out_offset] = sycl::isnan(in[inp_offset]); - } - } -}; - template class isnan_strided_kernel; template @@ -326,24 +158,10 @@ isnan_strided_impl(sycl::queue exec_q, const std::vector &depends, const std::vector &additional_depends) { - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.depends_on(additional_depends); - - using resTy = typename IsNanOutputType::value_type; - using IndexerT = - typename 
dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; - - IndexerT arg_res_indexer{nd, arg_offset, res_offset, shape_and_strides}; - - const argTy *arg_tptr = reinterpret_cast(arg_p); - resTy *res_tptr = reinterpret_cast(res_p); - - cgh.parallel_for>( - {nelems}, IsNanStridedFunctor( - arg_tptr, res_tptr, arg_res_indexer)); - }); - return comp_ev; + return elementwise_common::unary_strided_impl< + argTy, IsNanOutputType, IsNanStridedFunctor, isnan_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); } template struct IsNanStridedFactory diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp new file mode 100644 index 0000000000..71998e0dac --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp @@ -0,0 +1,377 @@ +//=== multiply.hpp - Binary function MUL ------ *-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of MUL(x1, x2) +/// function. 
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace multiply +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template struct MultiplyFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::negation< + std::disjunction, tu_ns::is_complex>>; + + resT operator()(const argT1 &in1, const argT2 &in2) + { + return in1 * in2; + } + + template + sycl::vec operator()(const sycl::vec &in1, + const sycl::vec &in2) + { + auto tmp = in1 * in2; + if constexpr (std::is_same_v) { + return tmp; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + tmp); + } + } +}; + +template +using MultiplyContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs>; + +template +using MultiplyStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + MultiplyFunctor>; + +template struct MultiplyOutputType +{ + using value_type = typename std::disjunction< // disjunction is C++17 + // feature, supported by DPC++ + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::DefaultResultEntry>::result_type; +}; + +template +class multiply_contig_kernel; + +template +sycl::event multiply_contig_impl(sycl::queue exec_q, + size_t nelems, + const char *arg1_p, + py::ssize_t arg1_offset, + const char *arg2_p, + py::ssize_t arg2_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_contig_impl< + argTy1, argTy2, MultiplyOutputType, MultiplyContigFunctor, + multiply_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends); +} + +template struct MultiplyContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename MultiplyOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = multiply_contig_impl; + return fn; + } + } +}; + +template struct MultiplyTypeMapFactory +{ + /*! 
@brief get typeid for output type of multiply(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + using rT = typename MultiplyOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class multiply_strided_strided_kernel; + +template +sycl::event +multiply_strided_impl(sycl::queue exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg1_p, + py::ssize_t arg1_offset, + const char *arg2_p, + py::ssize_t arg2_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, MultiplyOutputType, MultiplyStridedFunctor, + multiply_strided_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); +} + +template struct MultiplyStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename MultiplyOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = multiply_strided_impl; + return fn; + } + } +}; + +template +class multiply_matrix_row_broadcast_sg_krn; + +template +using MultiplyContigMatrixContigRowBroadcastingFunctor = + elementwise_common::BinaryContigMatrixContigRowBroadcastingFunctor< + argT1, + argT2, + resT, + MultiplyFunctor>; + +template +sycl::event multiply_contig_matrix_contig_row_broadcast_impl( + sycl::queue exec_q, + std::vector &host_tasks, + size_t n0, + size_t n1, + const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + py::ssize_t mat_offset, + const char *vec_p, // typeless pointer to (n1,) contiguous row + py::ssize_t vec_offset, + char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix, + // res[i,j] = mat[i,j] * vec[j] + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_contig_matrix_contig_row_broadcast_impl< + argT1, argT2, resT, MultiplyContigMatrixContigRowBroadcastingFunctor, + multiply_matrix_row_broadcast_sg_krn>(exec_q, host_tasks, n0, n1, mat_p, + mat_offset, vec_p, vec_offset, + res_p, res_offset, depends); +} + +template +struct MultiplyContigMatrixContigRowBroadcastFactory +{ + fnT get() + { + using resT = typename MultiplyOutputType::value_type; + if constexpr (std::is_same_v) { + fnT fn = nullptr; + return fn; + } + else { + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = + multiply_contig_matrix_contig_row_broadcast_impl; + return fn; + } + } + } +}; + +template +sycl::event multiply_contig_row_contig_matrix_broadcast_impl( + sycl::queue exec_q, + std::vector &host_tasks, + size_t n0, + size_t n1, + const char *vec_p, // typeless pointer to (n1,) contiguous row + py::ssize_t vec_offset, + const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + py::ssize_t mat_offset, + char *res_p, // typeless pointer to (n0, n1) result C-contig. 
matrix, + // res[i,j] = mat[i,j] * vec[j] + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return multiply_contig_matrix_contig_row_broadcast_impl( + exec_q, host_tasks, n0, n1, mat_p, mat_offset, vec_p, vec_offset, res_p, + res_offset, depends); +}; + +template +struct MultiplyContigRowContigMatrixBroadcastFactory +{ + fnT get() + { + using resT = typename MultiplyOutputType::value_type; + if constexpr (std::is_same_v) { + fnT fn = nullptr; + return fn; + } + else { + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = + multiply_contig_row_contig_matrix_broadcast_impl; + return fn; + } + } + } +}; + +} // namespace multiply +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp index 7e576c8746..7853428227 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp @@ -104,28 +104,9 @@ sycl::event sqrt_contig_impl(sycl::queue exec_q, char *res_p, const std::vector &depends = {}) { - sycl::event sqrt_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - constexpr size_t lws = 64; - constexpr unsigned int vec_sz = 4; - constexpr unsigned int n_vecs = 2; - static_assert(lws % vec_sz == 0); - auto gws_range = sycl::range<1>( - ((nelems + n_vecs * lws * vec_sz - 1) / (lws * n_vecs * vec_sz)) * - lws); - auto lws_range = sycl::range<1>(lws); - - using resTy = typename SqrtOutputType::value_type; - const argTy *arg_tp = reinterpret_cast(arg_p); - resTy *res_tp = reinterpret_cast(res_p); - - cgh.parallel_for< - class sqrt_contig_kernel>( - sycl::nd_range<1>(gws_range, lws_range), - SqrtContigFunctor(arg_tp, res_tp, - nelems)); - }); - return sqrt_ev; + return elementwise_common::unary_contig_impl< + argTy, SqrtOutputType, SqrtContigFunctor, sqrt_contig_kernel>( + exec_q, nelems, arg_p, res_p, depends); } template struct SqrtContigFactory @@ -169,26 +150,10 @@ sqrt_strided_impl(sycl::queue exec_q, const std::vector &depends, const std::vector &additional_depends) { - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.depends_on(additional_depends); - - using resTy = typename SqrtOutputType::value_type; - using IndexerT = - typename dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; - - IndexerT arg_res_indexer(nd, arg_offset, res_offset, shape_and_strides); - - const argTy *arg_tp = reinterpret_cast(arg_p); - resTy *res_tp = reinterpret_cast(res_p); - - sycl::range<1> gRange{nelems}; - - cgh.parallel_for>( - gRange, SqrtStridedFunctor( - arg_tp, res_tp, arg_res_indexer)); - }); - return comp_ev; + return elementwise_common::unary_strided_impl< + argTy, SqrtOutputType, SqrtStridedFunctor, sqrt_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); } template struct SqrtStridedFactory diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp new file mode 100644 index 0000000000..d6fb44d072 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp @@ -0,0 +1,391 @@ +//=== 
subtract.hpp - Binary function SUBTRACT ------ *-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of SUBTRACT(x1, x2) +/// function. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace subtract +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template struct SubtractFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::negation< + std::disjunction, tu_ns::is_complex>>; + + resT operator()(const argT1 &in1, const argT2 &in2) + { + return in1 - in2; + } + + template + sycl::vec operator()(const sycl::vec &in1, + const sycl::vec &in2) + { + auto tmp = in1 - in2; + if constexpr (std::is_same_v) { + return tmp; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + tmp); + } + } +}; + +template +using SubtractContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs>; + +template +using SubtractStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + SubtractFunctor>; + +template struct SubtractOutputType +{ + using value_type = typename std::disjunction< // disjunction is C++17 + // feature, supported by DPC++ + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::DefaultResultEntry>::result_type; +}; + +template +class subtract_contig_kernel; + +template +sycl::event subtract_contig_impl(sycl::queue exec_q, + size_t nelems, + const char *arg1_p, + py::ssize_t arg1_offset, + const char *arg2_p, + py::ssize_t arg2_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_contig_impl< + argTy1, argTy2, SubtractOutputType, SubtractContigFunctor, + subtract_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends); +} + +template struct SubtractContigFactory +{ + fnT get() + { + if constexpr 
(std::is_same_v< + typename SubtractOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = subtract_contig_impl; + return fn; + } + } +}; + +template struct SubtractTypeMapFactory +{ + /*! @brief get typeid for output type of subtract(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + using rT = typename SubtractOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class subtract_strided_strided_kernel; + +template +sycl::event +subtract_strided_impl(sycl::queue exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg1_p, + py::ssize_t arg1_offset, + const char *arg2_p, + py::ssize_t arg2_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, SubtractOutputType, SubtractStridedFunctor, + subtract_strided_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); +} + +template struct SubtractStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename SubtractOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = subtract_strided_impl; + return fn; + } + } +}; + +template +using SubtractContigMatrixContigRowBroadcastingFunctor = + elementwise_common::BinaryContigMatrixContigRowBroadcastingFunctor< + argT1, + argT2, + resT, + SubtractFunctor>; + +template +using SubtractContigRowContigMatrixBroadcastingFunctor = + elementwise_common::BinaryContigRowContigMatrixBroadcastingFunctor< + argT1, + argT2, + resT, + SubtractFunctor>; + +template +class subtract_matrix_row_broadcast_sg_krn; + +template +class subtract_row_matrix_broadcast_sg_krn; + +template +sycl::event subtract_contig_matrix_contig_row_broadcast_impl( + sycl::queue exec_q, + std::vector &host_tasks, + size_t n0, + size_t n1, + const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + py::ssize_t mat_offset, + const char *vec_p, // typeless pointer to (n1,) contiguous row + py::ssize_t vec_offset, + char *res_p, // typeless pointer to (n0, n1) result C-contig. 
matrix, + // res[i,j] = mat[i,j] - vec[j] + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_contig_matrix_contig_row_broadcast_impl< + argT1, argT2, resT, SubtractContigMatrixContigRowBroadcastingFunctor, + subtract_matrix_row_broadcast_sg_krn>(exec_q, host_tasks, n0, n1, mat_p, + mat_offset, vec_p, vec_offset, + res_p, res_offset, depends); +} + +template +struct SubtractContigMatrixContigRowBroadcastFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename SubtractOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + using resT = typename SubtractOutputType::value_type; + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = + subtract_contig_matrix_contig_row_broadcast_impl; + return fn; + } + } + } +}; + +template +sycl::event subtract_contig_row_contig_matrix_broadcast_impl( + sycl::queue exec_q, + std::vector &host_tasks, + size_t n0, + size_t n1, + const char *vec_p, // typeless pointer to (n1,) contiguous row + py::ssize_t vec_offset, + const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + py::ssize_t mat_offset, + char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix, + // res[i,j] = op(vec[j], mat[i,j]) + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_contig_row_contig_matrix_broadcast_impl< + argT1, argT2, resT, SubtractContigRowContigMatrixBroadcastingFunctor, + subtract_row_matrix_broadcast_sg_krn>(exec_q, host_tasks, n0, n1, vec_p, + vec_offset, mat_p, mat_offset, + res_p, res_offset, depends); +} + +template +struct SubtractContigRowContigMatrixBroadcastFactory +{ + fnT get() + { + using resT = typename SubtractOutputType::value_type; + if constexpr (std::is_same_v) { + fnT fn = nullptr; + return fn; + } + else { + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = + subtract_contig_row_contig_matrix_broadcast_impl; + return fn; + } + } + } +}; + +} // namespace subtract +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp index f34da1b415..96ca231584 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp @@ -165,32 +165,10 @@ true_divide_contig_impl(sycl::queue exec_q, py::ssize_t res_offset, const std::vector &depends = {}) { - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - - size_t lws = 64; - constexpr unsigned int vec_sz = 4; - constexpr unsigned int n_vecs = 2; - const size_t n_groups = - ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); - const auto gws_range = sycl::range<1>(n_groups * lws); - const auto lws_range = sycl::range<1>(lws); - - using resTy = typename TrueDivideOutputType::value_type; - - const argTy1 *arg1_tp = - reinterpret_cast(arg1_p) + arg1_offset; - const argTy2 *arg2_tp = - reinterpret_cast(arg2_p) + arg2_offset; - resTy *res_tp = reinterpret_cast(res_p) + res_offset; - - 
cgh.parallel_for< - true_divide_contig_kernel>( - sycl::nd_range<1>(gws_range, lws_range), - TrueDivideContigFunctor( - arg1_tp, arg2_tp, res_tp, nelems)); - }); - return comp_ev; + return elementwise_common::binary_contig_impl< + argTy1, argTy2, TrueDivideOutputType, TrueDivideContigFunctor, + true_divide_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends); } template struct TrueDivideContigFactory @@ -240,30 +218,11 @@ true_divide_strided_impl(sycl::queue exec_q, const std::vector &depends, const std::vector &additional_depends) { - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.depends_on(additional_depends); - - using resTy = typename TrueDivideOutputType::value_type; - - using IndexerT = - typename dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer; - - IndexerT indexer{nd, arg1_offset, arg2_offset, res_offset, - shape_and_strides}; - - const argTy1 *arg1_tp = reinterpret_cast(arg1_p); - const argTy2 *arg2_tp = reinterpret_cast(arg2_p); - resTy *res_tp = reinterpret_cast(res_p); - - sycl::range<1> gRange(nelems); - - cgh.parallel_for>( - gRange, TrueDivideStridedFunctor( - arg1_tp, arg2_tp, res_tp, indexer)); - }); - return comp_ev; + return elementwise_common::binary_strided_impl< + argTy1, argTy2, TrueDivideOutputType, TrueDivideStridedFunctor, + true_divide_strided_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); } template @@ -322,63 +281,11 @@ sycl::event true_divide_contig_matrix_contig_row_broadcast_impl( py::ssize_t res_offset, const std::vector &depends = {}) { - const argT1 *mat = reinterpret_cast(mat_p) + mat_offset; - const argT2 *vec = reinterpret_cast(vec_p) + vec_offset; - resT *res = reinterpret_cast(res_p) + res_offset; - - const auto &dev = exec_q.get_device(); - const auto &sg_sizes = dev.get_info(); - // Get device-specific kernel info max_sub_group_size - size_t max_sgSize = - *(std::max_element(std::begin(sg_sizes), std::end(sg_sizes))); - - size_t n1_padded = n1 + max_sgSize; - argT2 *padded_vec = sycl::malloc_device(n1_padded, exec_q); - - if (padded_vec == nullptr) { - throw std::runtime_error("Could not allocate memory on the device"); - } - sycl::event make_padded_vec_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); // ensure vec contains actual data - cgh.parallel_for({n1_padded}, [=](sycl::id<1> id) { - auto i = id[0]; - padded_vec[i] = vec[i % n1]; - }); - }); - - // sub-group spans work-items [I, I + sgSize) - // base = ndit.get_global_linear_id() - sg.get_local_id()[0] - // Generically, sg.load( &mat[base]) may load arrays from - // different rows of mat. The start corresponds to row (base / n0) - // We read sg.load(&padded_vec[(base / n0)]). 
The vector is padded to - // ensure that reads are accessible - - size_t lws = 64; - - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(make_padded_vec_ev); - - auto lwsRange = sycl::range<1>(lws); - size_t n_elems = n0 * n1; - size_t n_groups = (n_elems + lws - 1) / lws; - auto gwsRange = sycl::range<1>(n_groups * lws); - - cgh.parallel_for< - class true_divide_matrix_row_broadcast_sg_krn>( - sycl::nd_range<1>(gwsRange, lwsRange), - TrueDivideContigMatrixContigRowBroadcastingFunctor( - mat, padded_vec, res, n_elems, n1)); - }); - - sycl::event tmp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(comp_ev); - sycl::context ctx = exec_q.get_context(); - cgh.host_task([ctx, padded_vec]() { sycl::free(padded_vec, ctx); }); - }); - host_tasks.push_back(tmp_cleanup_ev); - - return comp_ev; + return elementwise_common::binary_contig_matrix_contig_row_broadcast_impl< + argT1, argT2, resT, TrueDivideContigMatrixContigRowBroadcastingFunctor, + true_divide_matrix_row_broadcast_sg_krn>( + exec_q, host_tasks, n0, n1, mat_p, mat_offset, vec_p, vec_offset, res_p, + res_offset, depends); } template @@ -427,63 +334,11 @@ sycl::event true_divide_contig_row_contig_matrix_broadcast_impl( py::ssize_t res_offset, const std::vector &depends = {}) { - const argT1 *vec = reinterpret_cast(vec_p) + vec_offset; - const argT2 *mat = reinterpret_cast(mat_p) + mat_offset; - resT *res = reinterpret_cast(res_p) + res_offset; - - const auto &dev = exec_q.get_device(); - const auto &sg_sizes = dev.get_info(); - // Get device-specific kernel info max_sub_group_size - size_t max_sgSize = - *(std::max_element(std::begin(sg_sizes), std::end(sg_sizes))); - - size_t n1_padded = n1 + max_sgSize; - argT2 *padded_vec = sycl::malloc_device(n1_padded, exec_q); - - if (padded_vec == nullptr) { - throw std::runtime_error("Could not allocate memory on the device"); - } - sycl::event make_padded_vec_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); // ensure vec contains actual data - cgh.parallel_for({n1_padded}, [=](sycl::id<1> id) { - auto i = id[0]; - padded_vec[i] = vec[i % n1]; - }); - }); - - // sub-group spans work-items [I, I + sgSize) - // base = ndit.get_global_linear_id() - sg.get_local_id()[0] - // Generically, sg.load( &mat[base]) may load arrays from - // different rows of mat. The start corresponds to row (base / n0) - // We read sg.load(&padded_vec[(base / n0)]). 
The vector is padded to - // ensure that reads are accessible - - size_t lws = 64; - - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(make_padded_vec_ev); - - auto lwsRange = sycl::range<1>(lws); - size_t n_elems = n0 * n1; - size_t n_groups = (n_elems + lws - 1) / lws; - auto gwsRange = sycl::range<1>(n_groups * lws); - - cgh.parallel_for< - class true_divide_row_matrix_broadcast_sg_krn>( - sycl::nd_range<1>(gwsRange, lwsRange), - TrueDivideContigRowContigMatrixBroadcastingFunctor( - padded_vec, mat, res, n_elems, n1)); - }); - - sycl::event tmp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(comp_ev); - sycl::context ctx = exec_q.get_context(); - cgh.host_task([ctx, padded_vec]() { sycl::free(padded_vec, ctx); }); - }); - host_tasks.push_back(tmp_cleanup_ev); - - return comp_ev; + return elementwise_common::binary_contig_row_contig_matrix_broadcast_impl< + argT1, argT2, resT, TrueDivideContigRowContigMatrixBroadcastingFunctor, + true_divide_row_matrix_broadcast_sg_krn>( + exec_q, host_tasks, n0, n1, vec_p, vec_offset, mat_p, mat_offset, res_p, + res_offset, depends); }; template diff --git a/dpctl/tensor/libtensor/source/elementwise_functions.cpp b/dpctl/tensor/libtensor/source/elementwise_functions.cpp index de209fdb1c..c943f28c97 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions.cpp @@ -39,7 +39,9 @@ #include "kernels/elementwise_functions/isfinite.hpp" #include "kernels/elementwise_functions/isinf.hpp" #include "kernels/elementwise_functions/isnan.hpp" +#include "kernels/elementwise_functions/multiply.hpp" #include "kernels/elementwise_functions/sqrt.hpp" +#include "kernels/elementwise_functions/subtract.hpp" #include "kernels/elementwise_functions/true_divide.hpp" namespace dpctl @@ -663,7 +665,71 @@ namespace impl // B19: ==== MULTIPLY (x1, x2) namespace impl { -// FIXME: add code for B19 + +namespace multiply_fn_ns = dpctl::tensor::kernels::multiply; + +static binary_contig_impl_fn_ptr_t + multiply_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int multiply_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + multiply_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +// mul(matrix, row) +static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t + multiply_contig_matrix_contig_row_broadcast_dispatch_table + [td_ns::num_types][td_ns::num_types]; + +// mul(row, matrix) +static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t + multiply_contig_row_contig_matrix_broadcast_dispatch_table + [td_ns::num_types][td_ns::num_types]; + +void populate_multiply_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = multiply_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::MultiplyTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(multiply_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::MultiplyStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(multiply_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::MultiplyContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(multiply_contig_dispatch_table); + + // function pointers for operation on contiguous matrix, contiguous row + // with contiguous matrix output + using 
fn_ns::MultiplyContigMatrixContigRowBroadcastFactory; + DispatchTableBuilder< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t, + MultiplyContigMatrixContigRowBroadcastFactory, num_types> + dtb4; + dtb4.populate_dispatch_table( + multiply_contig_matrix_contig_row_broadcast_dispatch_table); + + // function pointers for operation on contiguous row, contiguous matrix + // with contiguous matrix output + using fn_ns::MultiplyContigRowContigMatrixBroadcastFactory; + DispatchTableBuilder< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t, + MultiplyContigRowContigMatrixBroadcastFactory, num_types> + dtb5; + dtb5.populate_dispatch_table( + multiply_contig_row_contig_matrix_broadcast_dispatch_table); +}; + } // namespace impl // U25: ==== NEGATIVE (x) @@ -770,7 +836,70 @@ void populate_sqrt_dispatch_vectors(void) // B23: ==== SUBTRACT (x1, x2) namespace impl { -// FIXME: add code for B23 +namespace subtract_fn_ns = dpctl::tensor::kernels::subtract; + +static binary_contig_impl_fn_ptr_t + subtract_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int subtract_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + subtract_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +// sub(matrix, row) +static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t + subtract_contig_matrix_contig_row_broadcast_dispatch_table + [td_ns::num_types][td_ns::num_types]; + +// sub(row, matrix) +static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t + subtract_contig_row_contig_matrix_broadcast_dispatch_table + [td_ns::num_types][td_ns::num_types]; + +void populate_subtract_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = subtract_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::SubtractTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(subtract_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::SubtractStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(subtract_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::SubtractContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(subtract_contig_dispatch_table); + + // function pointers for operation on contiguous matrix, contiguous row + // with contiguous matrix output + using fn_ns::SubtractContigMatrixContigRowBroadcastFactory; + DispatchTableBuilder< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t, + SubtractContigMatrixContigRowBroadcastFactory, num_types> + dtb4; + dtb4.populate_dispatch_table( + subtract_contig_matrix_contig_row_broadcast_dispatch_table); + + // function pointers for operation on contiguous row, contiguous matrix + // with contiguous matrix output + using fn_ns::SubtractContigRowContigMatrixBroadcastFactory; + DispatchTableBuilder< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t, + SubtractContigRowContigMatrixBroadcastFactory, num_types> + dtb5; + dtb5.populate_dispatch_table( + subtract_contig_row_contig_matrix_broadcast_dispatch_table); +}; + } // namespace impl // U34: ==== TAN (x) @@ -1140,7 +1269,44 @@ void init_elementwise_functions(py::module_ m) // FIXME: // B19: ==== MULTIPLY (x1, x2) - // FIXME: + { + impl::populate_multiply_dispatch_tables(); + using impl::multiply_contig_dispatch_table; + using impl::multiply_contig_matrix_contig_row_broadcast_dispatch_table; + using 
impl::multiply_contig_row_contig_matrix_broadcast_dispatch_table; + using impl::multiply_output_id_table; + using impl::multiply_strided_dispatch_table; + + auto multiply_pyapi = + [&](dpctl::tensor::usm_ndarray src1, + dpctl::tensor::usm_ndarray src2, dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + const std::vector &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, multiply_output_id_table, + // function pointers to handle operation on contiguous + // arrays (pointers may be nullptr) + multiply_contig_dispatch_table, + // function pointers to handle operation on strided arrays + // (most general case) + multiply_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix + // and c-contig row with broadcasting (may be nullptr) + multiply_contig_matrix_contig_row_broadcast_dispatch_table, + // function pointers to handle operation of c-contig row + // and c-contig matrix with broadcasting (may be nullptr) + multiply_contig_row_contig_matrix_broadcast_dispatch_table); + }; + auto multiply_result_type_pyapi = [&](py::dtype dtype1, + py::dtype dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + multiply_output_id_table); + }; + m.def("_multiply", multiply_pyapi, "", py::arg("src1"), py::arg("src2"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_multiply_result_type", multiply_result_type_pyapi, ""); + } // U25: ==== NEGATIVE (x) // FIXME: @@ -1198,7 +1364,44 @@ void init_elementwise_functions(py::module_ m) } // B23: ==== SUBTRACT (x1, x2) - // FIXME: + { + impl::populate_subtract_dispatch_tables(); + using impl::subtract_contig_dispatch_table; + using impl::subtract_contig_matrix_contig_row_broadcast_dispatch_table; + using impl::subtract_contig_row_contig_matrix_broadcast_dispatch_table; + using impl::subtract_output_id_table; + using impl::subtract_strided_dispatch_table; + + auto subtract_pyapi = + [&](dpctl::tensor::usm_ndarray src1, + dpctl::tensor::usm_ndarray src2, dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + const std::vector &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, subtract_output_id_table, + // function pointers to handle operation on contiguous + // arrays (pointers may be nullptr) + subtract_contig_dispatch_table, + // function pointers to handle operation on strided arrays + // (most general case) + subtract_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix + // and c-contig row with broadcasting (may be nullptr) + subtract_contig_matrix_contig_row_broadcast_dispatch_table, + // function pointers to handle operation of c-contig row + // and c-contig matrix with broadcasting (may be nullptr) + subtract_contig_row_contig_matrix_broadcast_dispatch_table); + }; + auto subtract_result_type_pyapi = [&](py::dtype dtype1, + py::dtype dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + subtract_output_id_table); + }; + m.def("_subtract", subtract_pyapi, "", py::arg("src1"), py::arg("src2"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_subtract_result_type", subtract_result_type_pyapi, ""); + } // U34: ==== TAN (x) // FIXME: diff --git a/dpctl/tests/elementwise/test_multiply.py b/dpctl/tests/elementwise/test_multiply.py new file mode 100644 index 0000000000..cd506cd182 --- /dev/null +++ b/dpctl/tests/elementwise/test_multiply.py @@ -0,0 +1,154 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed 
under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import ctypes + +import numpy as np +import pytest + +import dpctl +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +from .utils import _all_dtypes, _compare_dtypes, _usm_types + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes) +@pytest.mark.parametrize("op2_dtype", _all_dtypes) +def test_multiply_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.multiply(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.multiply( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.ones(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.multiply(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.multiply( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_multiply_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type) + + r = dpt.multiply(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpctl.utils.get_coerced_usm_type( + (op1_usm_type, op2_usm_type) + ) + assert r.usm_type == expected_usm_type + + +def test_multiply_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + ar2 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.multiply(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.multiply(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.multiply(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.multiply(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones((20, 20), dtype="i4", order="F") + ar2 = dpt.ones((20, 20), dtype="i4", order="F") + r1 = dpt.multiply(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.multiply(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.multiply(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.multiply(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.multiply(ar1, ar2, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + ar2 = 
dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.multiply(ar1, ar2, order="K") + assert r4.strides == (-1, 20) + + +def test_multiply_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(1, 6, dtype="i4") + + r = dpt.multiply(m, v) + + expected = np.multiply( + np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4") + ) + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + r2 = dpt.multiply(v, m) + expected2 = np.multiply( + np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4") + ) + assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all() + + +@pytest.mark.parametrize("arr_dt", _all_dtypes) +def test_multiply_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q) + py_ones = ( + bool(1), + int(1), + float(1), + complex(1), + np.float32(1), + ctypes.c_int(1), + ) + for sc in py_ones: + R = dpt.multiply(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.multiply(sc, X) + assert isinstance(R, dpt.usm_ndarray) diff --git a/dpctl/tests/elementwise/test_subtract.py b/dpctl/tests/elementwise/test_subtract.py new file mode 100644 index 0000000000..f524151e97 --- /dev/null +++ b/dpctl/tests/elementwise/test_subtract.py @@ -0,0 +1,171 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
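+#
+# A minimal usage sketch of the function under test (assuming a default
+# SYCL device is available; the `out=` keyword writes into a preallocated
+# array, as exercised in the tests below):
+#
+#     import dpctl.tensor as dpt
+#     x = dpt.ones(4, dtype="i4")
+#     y = dpt.ones(4, dtype="i4")
+#     r = dpt.subtract(x, y)        # zeros of dtype i4
+#     buf = dpt.empty_like(x)
+#     dpt.subtract(x, y, out=buf)   # reuses buf, no extra allocation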
+ +import ctypes + +import numpy as np +import pytest + +import dpctl +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +from .utils import _all_dtypes, _compare_dtypes, _usm_types + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes[1:]) +@pytest.mark.parametrize("op2_dtype", _all_dtypes[1:]) +def test_subtract_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.subtract(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_dtype = np.subtract( + np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype) + ).dtype + assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == np.full(r.shape, 0, dtype=r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + r2 = dpt.empty_like(ar1, dtype=r.dtype) + dpt.subtract(ar1, ar2, out=r2) + assert (dpt.asnumpy(r2) == np.full(r2.shape, 0, dtype=r2.dtype)).all() + + ar3 = dpt.ones(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.subtract(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected_dtype = np.subtract( + np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype) + ).dtype + assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == np.full(r.shape, 0, dtype=r.dtype)).all() + + r2 = dpt.empty_like(ar1, dtype=r.dtype) + dpt.subtract(ar3[::-1], ar4[::2], out=r2) + assert (dpt.asnumpy(r2) == np.full(r2.shape, 0, dtype=r2.dtype)).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_subtract_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type) + + r = dpt.subtract(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpctl.utils.get_coerced_usm_type( + (op1_usm_type, op2_usm_type) + ) + assert r.usm_type == expected_usm_type + + +def test_subtract_order(): + get_queue_or_skip() + + test_shape = ( + 20, + 20, + ) + test_shape2 = tuple(2 * dim for dim in test_shape) + n = test_shape[-1] + + for dt1, dt2 in zip(["i4", "i4", "f4"], ["i4", "f4", "i4"]): + ar1 = dpt.ones(test_shape, dtype=dt1, order="C") + ar2 = dpt.ones(test_shape, dtype=dt2, order="C") + r1 = dpt.subtract(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.subtract(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.subtract(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.subtract(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones(test_shape, dtype=dt1, order="F") + ar2 = dpt.ones(test_shape, dtype=dt2, order="F") + r1 = dpt.subtract(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.subtract(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.subtract(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.subtract(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2] + ar2 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2] + r4 = dpt.subtract(ar1, ar2, order="K") + assert r4.strides == (n, -1) + r5 = dpt.subtract(ar1, ar2, order="C") + assert r5.strides == (n, 1) + 
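+ # The transposed views below (via .mT) reverse the stride order of the
+ # reverse-sliced C-order inputs: order="K" is expected to preserve that
+ # layout (strides (-1, n)), while order="C" must still force a
+ # C-contiguous result (strides (n, 1)).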
+ ar1 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2].mT + ar2 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2].mT + r4 = dpt.subtract(ar1, ar2, order="K") + assert r4.strides == (-1, n) + r5 = dpt.subtract(ar1, ar2, order="C") + assert r5.strides == (n, 1) + + +def test_subtract_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(5, dtype="i4") + + r = dpt.subtract(m, v) + assert ( + dpt.asnumpy(r) == np.arange(1, -4, step=-1, dtype="i4")[np.newaxis, :] + ).all() + + r2 = dpt.subtract(v, m) + assert ( + dpt.asnumpy(r2) == np.arange(-1, 4, dtype="i4")[np.newaxis, :] + ).all() + + +@pytest.mark.parametrize("arr_dt", _all_dtypes[1:]) +def test_subtract_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.zeros((10, 10), dtype=arr_dt, sycl_queue=q) + py_zeros = ( + bool(0), + int(0), + float(0), + complex(0), + np.float32(0), + ctypes.c_int(0), + ) + for sc in py_zeros: + R = dpt.subtract(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.subtract(sc, X) + assert isinstance(R, dpt.usm_ndarray)
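Taken together, these changes route `dpt.multiply` and `dpt.subtract` through the same binary-ufunc machinery as `add` and `divide`: the type-map table determines the result type, and the contiguous, strided, and matrix/row broadcast kernels are selected from the dispatch tables at call time. A minimal end-to-end sketch (assuming a default SYCL device is available; all names are public `dpctl.tensor` API used elsewhere in this diff):

    import dpctl.tensor as dpt

    x = dpt.arange(6, dtype="i4")
    y = dpt.ones(6, dtype="i4")

    p = dpt.multiply(x, y)            # contiguous fast path
    d = dpt.subtract(x[::2], y[::2])  # strided kernel (non-unit strides)

    m = dpt.ones((4, 6), dtype="i4")
    b = dpt.multiply(m, x)            # matrix-row broadcast path, shape (4, 6)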