From 9bb5fa726dbb9a8ea473e3ac6da4e40ae7219215 Mon Sep 17 00:00:00 2001
From: Natalia Polina
Date: Wed, 31 May 2023 11:57:53 -0500
Subject: [PATCH 1/2] Implemented floor_divide() function.

---
 dpctl/tensor/__init__.py                      |   2 +
 dpctl/tensor/_elementwise_funcs.py            |  26 +-
 .../elementwise_functions/floor_divide.hpp    | 284 ++++++++++++++++++
 .../libtensor/include/utils/offset_utils.hpp  |   4 +-
 .../source/elementwise_functions.cpp          |  76 ++++-
 dpctl/tests/elementwise/test_floor_divide.py  | 188 ++++++++++++
 dpctl/tests/elementwise/utils.py              |   5 +-
 7 files changed, 579 insertions(+), 6 deletions(-)
 create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
 create mode 100644 dpctl/tests/elementwise/test_floor_divide.py

diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py
index 0bd7a30cd0..20e72a4300 100644
--- a/dpctl/tensor/__init__.py
+++ b/dpctl/tensor/__init__.py
@@ -97,6 +97,7 @@
     cos,
     divide,
     equal,
+    floor_divide,
     isfinite,
     isinf,
     isnan,
@@ -191,4 +192,5 @@
     "multiply",
     "subtract",
     "equal",
+    "floor_divide",
 ]
diff --git a/dpctl/tensor/_elementwise_funcs.py b/dpctl/tensor/_elementwise_funcs.py
index 29d1052c9c..f738de8033 100644
--- a/dpctl/tensor/_elementwise_funcs.py
+++ b/dpctl/tensor/_elementwise_funcs.py
@@ -159,7 +159,31 @@
 # FIXME: implement U15
 
 # B10: ==== FLOOR_DIVIDE (x1, x2)
-# FIXME: implement B10
+_floor_divide_docstring_ = """
+floor_divide(x1, x2, out=None, order='K')
+
+Calculates the ratio for each element `x1_i` of the input array `x1` with
+the respective element `x2_i` of the input array `x2`, rounded down to the
+greatest integer-valued number that is not greater than the division result.
+
+Args:
+    x1 (usm_ndarray):
+        First input array, expected to have a numeric data type.
+    x2 (usm_ndarray):
+        Second input array, also expected to have a numeric data type.
+Returns:
+    usm_ndarray:
+        An array containing the result of element-wise floor division.
+        The data type of the returned array is determined by the Type
+        Promotion Rules.
+"""
+
+floor_divide = BinaryElementwiseFunc(
+    "floor_divide",
+    ti._floor_divide_result_type,
+    ti._floor_divide,
+    _floor_divide_docstring_,
+)
 
 # B11: ==== GREATER (x1, x2)
 # FIXME: implement B11
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
new file mode 100644
index 0000000000..f79c706028
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
@@ -0,0 +1,284 @@
+//=== floor_divide.hpp - Binary function FLOOR_DIVIDE ------ *-C++-*--/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of FLOOR_DIVIDE(x1, x2)
+/// function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <CL/sycl.hpp>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_utils.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include <pybind11/pybind11.h>
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace kernels
+{
+namespace floor_divide
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT>
+struct FloorDivideFunctor
+{
+
+    using supports_sg_loadstore =
+        std::negation<std::disjunction<tu_ns::is_complex<argT1>,
+                                       tu_ns::is_complex<argT2>>>; // TRUE
+    using supports_vec = std::negation<
+        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
+
+    resT operator()(const argT1 &in1, const argT2 &in2)
+    {
+        auto tmp = in1 / in2;
+        if constexpr (std::is_integral_v<decltype(tmp)>) {
+            return tmp;
+        }
+        else {
+            return sycl::floor(tmp);
+        }
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz> operator()(const sycl::vec<argT1, vec_sz> &in1,
+                                       const sycl::vec<argT2, vec_sz> &in2)
+    {
+        auto tmp = in1 / in2;
+        if constexpr (std::is_same_v<resT,
+                                     typename decltype(tmp)::element_type> &&
+                      std::is_integral_v<resT>)
+        {
+            return tmp;
+        }
+        if constexpr (std::is_integral_v<typename decltype(tmp)::element_type>)
+        {
+            using dpctl::tensor::type_utils::vec_cast;
+            return vec_cast<resT, typename decltype(tmp)::element_type,
+                            vec_sz>(tmp);
+        }
+        else {
+            sycl::vec<resT, vec_sz> res;
+            for (int i = 0; i < vec_sz; i++) {
+                auto tmp2 = sycl::floor(tmp[i]);
+                if constexpr (std::is_same_v<resT, decltype(tmp2)>) {
+                    res[i] = tmp2;
+                }
+                else {
+                    res[i] = static_cast<resT>(tmp2);
+                }
+            }
+            return res;
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          unsigned int vec_sz = 4,
+          unsigned int n_vecs = 2>
+using FloorDivideContigFunctor = elementwise_common::BinaryContigFunctor<
+    argT1,
+    argT2,
+    resT,
+    FloorDivideFunctor<argT1, argT2, resT>,
+    vec_sz,
+    n_vecs>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using FloorDivideStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    FloorDivideFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2> struct FloorDivideOutputType
+{
+    using value_type = typename std::disjunction< // disjunction is C++17
+                                                  // feature, supported by DPC++
+        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, std::int8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1, std::uint8_t, T2, std::uint8_t, std::uint8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1, std::int8_t, T2, std::int8_t, std::int8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1, std::uint16_t, T2, std::uint16_t, std::uint16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1, std::int16_t, T2, std::int16_t, std::int16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1, std::uint32_t, T2, std::uint32_t, std::uint32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1, std::int32_t, T2, std::int32_t, std::int32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1, std::uint64_t, T2, std::uint64_t, std::uint64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1, std::int64_t, T2, std::int64_t, std::int64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1, sycl::half, T2, sycl::half, sycl::half>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          unsigned int vec_sz,
+          unsigned int n_vecs>
+class floor_divide_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+floor_divide_contig_impl(sycl::queue exec_q,
+                         size_t nelems,
+                         const char *arg1_p,
+                         py::ssize_t arg1_offset,
+                         const char *arg2_p,
+                         py::ssize_t arg2_offset,
+                         char *res_p,
+                         py::ssize_t res_offset,
+                         const std::vector<sycl::event> &depends = {})
+{
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, FloorDivideOutputType, FloorDivideContigFunctor,
+        floor_divide_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
+                                    arg2_offset, res_p, res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct FloorDivideContigFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<
+                          typename FloorDivideOutputType<T1, T2>::value_type,
+                          void>)
+        {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = floor_divide_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct FloorDivideTypeMapFactory
+{
+    /*! @brief get typeid for output type of floor_divide(T1 x, T2 y) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename FloorDivideOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+class floor_divide_strided_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+floor_divide_strided_impl(sycl::queue exec_q,
+                          size_t nelems,
+                          int nd,
+                          const py::ssize_t *shape_and_strides,
+                          const char *arg1_p,
+                          py::ssize_t arg1_offset,
+                          const char *arg2_p,
+                          py::ssize_t arg2_offset,
+                          char *res_p,
+                          py::ssize_t res_offset,
+                          const std::vector<sycl::event> &depends,
+                          const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, FloorDivideOutputType, FloorDivideStridedFunctor,
+        floor_divide_strided_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
+        arg2_offset, res_p, res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct FloorDivideStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<
+                          typename FloorDivideOutputType<T1, T2>::value_type,
+                          void>)
+        {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = floor_divide_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace floor_divide
+} // namespace kernels
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/utils/offset_utils.hpp b/dpctl/tensor/libtensor/include/utils/offset_utils.hpp
index 814854ef01..aaeb64a204 100644
--- a/dpctl/tensor/libtensor/include/utils/offset_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/offset_utils.hpp
@@ -64,7 +64,7 @@ template <typename V, typename U> sink_t __appender(V &lhs, U &&rhs)
 }
 
 template <typename T, typename A, typename... Vs>
-std::vector<T, A> concat(std::vector<T, A> lhs, Vs &&...vs)
+std::vector<T, A> concat(std::vector<T, A> lhs, Vs &&... vs)
 {
     std::size_t s = lhs.size();
     {
@@ -87,7 +87,7 @@ template <typename indT, typename... Vs>
 std::tuple<indT *, size_t, sycl::event>
 device_allocate_and_pack(sycl::queue q,
                          std::vector<sycl::event> &host_task_events,
-                         Vs &&...vs)
+                         Vs &&... vs)
 {
     // memory transfer optimization, use USM-host for temporary speeds up
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions.cpp b/dpctl/tensor/libtensor/source/elementwise_functions.cpp
index c943f28c97..ff052a2fe8 100644
--- a/dpctl/tensor/libtensor/source/elementwise_functions.cpp
+++ b/dpctl/tensor/libtensor/source/elementwise_functions.cpp
@@ -36,6 +36,7 @@
 #include "kernels/elementwise_functions/add.hpp"
 #include "kernels/elementwise_functions/cos.hpp"
 #include "kernels/elementwise_functions/equal.hpp"
+#include "kernels/elementwise_functions/floor_divide.hpp"
 #include "kernels/elementwise_functions/isfinite.hpp"
 #include "kernels/elementwise_functions/isinf.hpp"
 #include "kernels/elementwise_functions/isnan.hpp"
@@ -470,7 +471,40 @@ namespace impl
 // B10: ==== FLOOR_DIVIDE (x1, x2)
 namespace impl
 {
-// FIXME: add code for B10
+namespace floor_divide_fn_ns = dpctl::tensor::kernels::floor_divide;
+
+static binary_contig_impl_fn_ptr_t
+    floor_divide_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int floor_divide_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    floor_divide_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_floor_divide_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = floor_divide_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::FloorDivideTypeMapFactory;
+    DispatchTableBuilder<int, FloorDivideTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(floor_divide_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::FloorDivideStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
+                         FloorDivideStridedFactory, num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(floor_divide_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::FloorDivideContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, FloorDivideContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(floor_divide_contig_dispatch_table);
+};
+
 } // namespace impl
 
 // B11: ==== GREATER (x1, x2)
@@ -1151,7 +1185,45 @@ void init_elementwise_functions(py::module_ m)
 
     // FIXME:
 
     // B10: ==== FLOOR_DIVIDE (x1, x2)
-    // FIXME:
+    {
+        impl::populate_floor_divide_dispatch_tables();
+        using impl::floor_divide_contig_dispatch_table;
+        using impl::floor_divide_output_id_table;
+        using impl::floor_divide_strided_dispatch_table;
+
+        auto floor_divide_pyapi = [&](dpctl::tensor::usm_ndarray src1,
+                                      dpctl::tensor::usm_ndarray src2,
+                                      dpctl::tensor::usm_ndarray dst,
+                                      sycl::queue exec_q,
+                                      const std::vector<sycl::event> &depends =
+                                          {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, floor_divide_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                floor_divide_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays
+                // (most general case)
+                floor_divide_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto floor_divide_result_type_pyapi = [&](py::dtype dtype1,
+                                                  py::dtype dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               floor_divide_output_id_table);
+        };
+        m.def("_floor_divide", floor_divide_pyapi, "", py::arg("src1"),
py::arg("src1"), + py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_floor_divide_result_type", floor_divide_result_type_pyapi, ""); + } // B11: ==== GREATER (x1, x2) // FIXME: diff --git a/dpctl/tests/elementwise/test_floor_divide.py b/dpctl/tests/elementwise/test_floor_divide.py new file mode 100644 index 0000000000..def39a6570 --- /dev/null +++ b/dpctl/tests/elementwise/test_floor_divide.py @@ -0,0 +1,188 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import ctypes + +import numpy as np +import pytest + +import dpctl +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +from .utils import _compare_dtypes, _no_complex_dtypes, _usm_types + + +@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes) +@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes) +def test_floor_divide_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.floor_divide(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.floor_divide( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.ones(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.floor_divide(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.floor_divide( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_floor_divide_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type) + + r = dpt.floor_divide(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpctl.utils.get_coerced_usm_type( + (op1_usm_type, op2_usm_type) + ) + assert r.usm_type == expected_usm_type + + +def test_floor_divide_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + ar2 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.floor_divide(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.floor_divide(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.floor_divide(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.floor_divide(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones((20, 20), 
dtype="i4", order="F") + ar2 = dpt.ones((20, 20), dtype="i4", order="F") + r1 = dpt.floor_divide(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.floor_divide(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.floor_divide(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.floor_divide(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.floor_divide(ar1, ar2, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.floor_divide(ar1, ar2, order="K") + assert r4.strides == (-1, 20) + + +def test_floor_divide_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(1, 6, dtype="i4") + + r = dpt.floor_divide(m, v) + + expected = np.floor_divide( + np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4") + ) + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + r2 = dpt.floor_divide(v, m) + expected2 = np.floor_divide( + np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4") + ) + assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all() + + +@pytest.mark.parametrize("arr_dt", _no_complex_dtypes) +def test_floor_divide_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q) + py_ones = ( + bool(1), + int(1), + float(1), + np.float32(1), + ctypes.c_int(1), + ) + for sc in py_ones: + R = dpt.floor_divide(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.floor_divide(sc, X) + assert isinstance(R, dpt.usm_ndarray) + + +class MockArray: + def __init__(self, arr): + self.data_ = arr + + @property + def __sycl_usm_array_interface__(self): + return self.data_.__sycl_usm_array_interface__ + + +def test_floor_divide_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + b = dpt.ones(10) + c = MockArray(b) + r = dpt.floor_divide(a, c) + assert isinstance(r, dpt.usm_ndarray) + + +def test_floor_divide_canary_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + + class Canary: + def __init__(self): + pass + + @property + def __sycl_usm_array_interface__(self): + return None + + c = Canary() + with pytest.raises(ValueError): + dpt.floor_divide(a, c) diff --git a/dpctl/tests/elementwise/utils.py b/dpctl/tests/elementwise/utils.py index 0d9396dcb4..38778df4be 100644 --- a/dpctl/tests/elementwise/utils.py +++ b/dpctl/tests/elementwise/utils.py @@ -17,7 +17,7 @@ import dpctl import dpctl.tensor._type_utils as tu -_all_dtypes = [ +_no_complex_dtypes = [ "b1", "i1", "u1", @@ -30,6 +30,8 @@ "f2", "f4", "f8", +] +_all_dtypes = _no_complex_dtypes + [ "c8", "c16", ] @@ -48,6 +50,7 @@ def _compare_dtypes(dt, ref_dt, sycl_queue=None): __all__ = [ + "_no_complex_dtypes", "_all_dtypes", "_usm_types", "_map_to_device_dtype", From 9a0699964f3eded49c0f4425f09edf4a062d2c60 Mon Sep 17 00:00:00 2001 From: Natalia Polina Date: Tue, 6 Jun 2023 10:21:59 -0500 Subject: [PATCH 2/2] Vector implementation for floor_divide() function has been changed. 
---
 .../elementwise_functions/floor_divide.hpp    | 24 +++++++++----------
 .../libtensor/include/utils/offset_utils.hpp  |  4 ++--
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
index f79c706028..f9e84f6324 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
@@ -81,24 +81,24 @@ struct FloorDivideFunctor
         {
             return tmp;
         }
-        if constexpr (std::is_integral_v<typename decltype(tmp)::element_type>)
-        {
+        else if constexpr (
+            std::is_integral_v<typename decltype(tmp)::element_type>) {
             using dpctl::tensor::type_utils::vec_cast;
             return vec_cast<resT, typename decltype(tmp)::element_type,
                             vec_sz>(tmp);
         }
         else {
-            sycl::vec<resT, vec_sz> res;
-            for (int i = 0; i < vec_sz; i++) {
-                auto tmp2 = sycl::floor(tmp[i]);
-                if constexpr (std::is_same_v<resT, decltype(tmp2)>) {
-                    res[i] = tmp2;
-                }
-                else {
-                    res[i] = static_cast<resT>(tmp2);
-                }
-            }
-            return res;
+            sycl::vec<typename decltype(tmp)::element_type, vec_sz> res =
+                sycl::floor(tmp);
+            if constexpr (std::is_same_v<
+                              resT, typename decltype(res)::element_type>) {
+                return res;
+            }
+            else {
+                using dpctl::tensor::type_utils::vec_cast;
+                return vec_cast<resT, typename decltype(res)::element_type,
+                                vec_sz>(res);
+            }
         }
     }
 };
diff --git a/dpctl/tensor/libtensor/include/utils/offset_utils.hpp b/dpctl/tensor/libtensor/include/utils/offset_utils.hpp
index aaeb64a204..814854ef01 100644
--- a/dpctl/tensor/libtensor/include/utils/offset_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/offset_utils.hpp
@@ -64,7 +64,7 @@ template <typename V, typename U> sink_t __appender(V &lhs, U &&rhs)
 }
 
 template <typename T, typename A, typename... Vs>
-std::vector<T, A> concat(std::vector<T, A> lhs, Vs &&... vs)
+std::vector<T, A> concat(std::vector<T, A> lhs, Vs &&...vs)
 {
     std::size_t s = lhs.size();
     {
@@ -87,7 +87,7 @@ template <typename indT, typename... Vs>
 std::tuple<indT *, size_t, sycl::event>
 device_allocate_and_pack(sycl::queue q,
                          std::vector<sycl::event> &host_task_events,
-                         Vs &&... vs)
+                         Vs &&...vs)
 {
     // memory transfer optimization, use USM-host for temporary speeds up
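
Usage sketch (illustrative only, not part of either patch): a minimal look at
the new Python-level API added by PATCH 1/2. It assumes a SYCL device is
reachable through dpctl's default queue; the values shown simply follow the
floor(x1_i / x2_i) semantics described in the docstring above.

    import dpctl.tensor as dpt

    # two small single-precision arrays on the default-selected device
    x = dpt.asarray([5.0, 7.5, -7.5], dtype="f4")
    y = dpt.asarray([2.0, 2.0, 2.0], dtype="f4")

    # elementwise quotient rounded toward negative infinity:
    # floor(2.5) = 2, floor(3.75) = 3, floor(-3.75) = -4
    r = dpt.floor_divide(x, y)
    print(dpt.asnumpy(r))  # [ 2.  3. -4.]

    # the result dtype follows the type-promotion table populated by
    # populate_floor_divide_dispatch_tables() on the C++ side
    print(r.dtype)  # float32

Note the kernel-level design: for integral inputs the functor returns the
plain `/` quotient without calling sycl::floor, while floating-point inputs
take the sycl::floor path; PATCH 2/2 only changes the vectorized branch to
apply sycl::floor to the whole sycl::vec at once instead of looping over
lanes.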