From 9bb5fa726dbb9a8ea473e3ac6da4e40ae7219215 Mon Sep 17 00:00:00 2001
From: Natalia Polina
Date: Wed, 31 May 2023 11:57:53 -0500
Subject: [PATCH 1/2] Implemented floor_divide() function.

---
 dpctl/tensor/__init__.py                      |   2 +
 dpctl/tensor/_elementwise_funcs.py            |  26 +-
 .../elementwise_functions/floor_divide.hpp    | 284 ++++++++++++++++++
 .../libtensor/include/utils/offset_utils.hpp  |   4 +-
 .../source/elementwise_functions.cpp          |  76 ++++-
 dpctl/tests/elementwise/test_floor_divide.py  | 188 ++++++++++++
 dpctl/tests/elementwise/utils.py              |   5 +-
 7 files changed, 579 insertions(+), 6 deletions(-)
 create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
 create mode 100644 dpctl/tests/elementwise/test_floor_divide.py

diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py
index 0bd7a30cd0..20e72a4300 100644
--- a/dpctl/tensor/__init__.py
+++ b/dpctl/tensor/__init__.py
@@ -97,6 +97,7 @@
     cos,
     divide,
     equal,
+    floor_divide,
     isfinite,
     isinf,
     isnan,
@@ -191,4 +192,5 @@
     "multiply",
     "subtract",
     "equal",
+    "floor_divide",
 ]
diff --git a/dpctl/tensor/_elementwise_funcs.py b/dpctl/tensor/_elementwise_funcs.py
index 29d1052c9c..f738de8033 100644
--- a/dpctl/tensor/_elementwise_funcs.py
+++ b/dpctl/tensor/_elementwise_funcs.py
@@ -159,7 +159,31 @@
 # FIXME: implement U15
 
 # B10: ==== FLOOR_DIVIDE (x1, x2)
-# FIXME: implement B10
+_floor_divide_docstring_ = """
+floor_divide(x1, x2, out=None, order='K')
+
+Calculates the ratio for each element `x1_i` of the input array `x1` with
+the respective element `x2_i` of the input array `x2`, rounded down to the
+greatest integer-valued number that is not greater than the division result.
+
+Args:
+    x1 (usm_ndarray):
+        First input array, expected to have a numeric data type.
+    x2 (usm_ndarray):
+        Second input array, also expected to have a numeric data type.
+Returns:
+    usm_ndarray:
+        An array containing the result of element-wise floor division.
+        The data type of the returned array is determined by the Type
+        Promotion Rules.
+"""
+
+floor_divide = BinaryElementwiseFunc(
+    "floor_divide",
+    ti._floor_divide_result_type,
+    ti._floor_divide,
+    _floor_divide_docstring_,
+)
 
 # B11: ==== GREATER (x1, x2)
 # FIXME: implement B11
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
new file mode 100644
index 0000000000..f79c706028
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
@@ -0,0 +1,284 @@
+//=== floor_divide.hpp - Binary function FLOOR_DIVIDE ------ *-C++-*--/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of FLOOR_DIVIDE(x1, x2)
+/// function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <CL/sycl.hpp>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_utils.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include <pybind11/pybind11.h>
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace kernels
+{
+namespace floor_divide
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT>
+struct FloorDivideFunctor
+{
+
+    using supports_sg_loadstore =
+        std::negation<std::disjunction<tu_ns::is_complex<argT1>,
+                                       tu_ns::is_complex<argT2>>>; // TRUE
+    using supports_vec = std::negation<
+        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
+
+    resT operator()(const argT1 &in1, const argT2 &in2)
+    {
+        auto tmp = in1 / in2;
+        if constexpr (std::is_integral_v<decltype(tmp)>) {
+            return tmp;
+        }
+        else {
+            return sycl::floor(tmp);
+        }
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz> operator()(const sycl::vec<argT1, vec_sz> &in1,
+                                       const sycl::vec<argT2, vec_sz> &in2)
+    {
+        auto tmp = in1 / in2;
+        if constexpr (std::is_same_v<resT,
+                                     typename decltype(tmp)::element_type> &&
+                      std::is_integral_v<resT>)
+        {
+            return tmp;
+        }
+        if constexpr (std::is_integral_v<typename decltype(tmp)::element_type>)
+        {
+            using dpctl::tensor::type_utils::vec_cast;
+            return vec_cast<resT, typename decltype(tmp)::element_type,
+                            vec_sz>(tmp);
+        }
+        else {
+            sycl::vec<resT, vec_sz> res;
+            for (int i = 0; i < vec_sz; i++) {
+                auto tmp2 = sycl::floor(tmp[i]);
+                if constexpr (std::is_same_v<resT, decltype(tmp2)>) {
+                    res[i] = tmp2;
+                }
+                else {
+                    res[i] = static_cast<resT>(tmp2);
+                }
+            }
+            return res;
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          unsigned int vec_sz = 4,
+          unsigned int n_vecs = 2>
+using FloorDivideContigFunctor = elementwise_common::BinaryContigFunctor<
+    argT1,
+    argT2,
+    resT,
+    FloorDivideFunctor<argT1, argT2, resT>,
+    vec_sz,
+    n_vecs>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using FloorDivideStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    FloorDivideFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2> struct FloorDivideOutputType
+{
+    using value_type = typename std::disjunction< // disjunction is C++17
+                                                  // feature, supported by DPC++
+        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, std::int8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1, std::uint8_t, T2, std::uint8_t, std::uint8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1, std::int8_t, T2, std::int8_t, std::int8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1, std::uint16_t, T2, std::uint16_t, std::uint16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1, std::int16_t, T2, std::int16_t, std::int16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1, std::uint32_t, T2, std::uint32_t, std::uint32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1, std::int32_t, T2, std::int32_t, std::int32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1, std::uint64_t, T2, std::uint64_t, std::uint64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1, std::int64_t, T2, std::int64_t, std::int64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1, sycl::half, T2, sycl::half, sycl::half>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          unsigned int vec_sz,
+          unsigned int n_vecs>
+class floor_divide_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+floor_divide_contig_impl(sycl::queue exec_q,
+                         size_t nelems,
+                         const char *arg1_p,
+                         py::ssize_t arg1_offset,
+                         const char *arg2_p,
+                         py::ssize_t arg2_offset,
+                         char *res_p,
+                         py::ssize_t res_offset,
+                         const std::vector<sycl::event> &depends = {})
+{
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, FloorDivideOutputType, FloorDivideContigFunctor,
+        floor_divide_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
+                                    arg2_offset, res_p, res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct FloorDivideContigFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<
+                          typename FloorDivideOutputType<T1, T2>::value_type,
+                          void>)
+        {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = floor_divide_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct FloorDivideTypeMapFactory
+{
+    /*! @brief get typeid for output type of floor_divide(T1 x, T2 y) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename FloorDivideOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+class floor_divide_strided_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+floor_divide_strided_impl(sycl::queue exec_q,
+                          size_t nelems,
+                          int nd,
+                          const py::ssize_t *shape_and_strides,
+                          const char *arg1_p,
+                          py::ssize_t arg1_offset,
+                          const char *arg2_p,
+                          py::ssize_t arg2_offset,
+                          char *res_p,
+                          py::ssize_t res_offset,
+                          const std::vector<sycl::event> &depends,
+                          const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, FloorDivideOutputType, FloorDivideStridedFunctor,
+        floor_divide_strided_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
+        arg2_offset, res_p, res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct FloorDivideStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<
+                          typename FloorDivideOutputType<T1, T2>::value_type,
+                          void>)
+        {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = floor_divide_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace floor_divide
+} // namespace kernels
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/utils/offset_utils.hpp b/dpctl/tensor/libtensor/include/utils/offset_utils.hpp
index 814854ef01..aaeb64a204 100644
--- a/dpctl/tensor/libtensor/include/utils/offset_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/offset_utils.hpp
@@ -64,7 +64,7 @@ template <typename V, typename U> sink_t __appender(V &lhs, U &&rhs)
 }
 
 template <typename T, typename A, typename... Vs>
-std::vector<T, A> concat(std::vector<T, A> lhs, Vs &&...vs)
+std::vector<T, A> concat(std::vector<T, A> lhs, Vs &&... vs)
 {
     std::size_t s = lhs.size();
     {
@@ -87,7 +87,7 @@ template <typename indT, typename... Vs>
 std::tuple<indT *, size_t, sycl::event>
 device_allocate_and_pack(sycl::queue q,
                          std::vector<sycl::event> &host_task_events,
-                         Vs &&...vs)
+                         Vs &&... vs)
 {
     // memory transfer optimization, use USM-host for temporary speeds up
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions.cpp b/dpctl/tensor/libtensor/source/elementwise_functions.cpp
index c943f28c97..ff052a2fe8 100644
--- a/dpctl/tensor/libtensor/source/elementwise_functions.cpp
+++ b/dpctl/tensor/libtensor/source/elementwise_functions.cpp
@@ -36,6 +36,7 @@
 #include "kernels/elementwise_functions/add.hpp"
 #include "kernels/elementwise_functions/cos.hpp"
 #include "kernels/elementwise_functions/equal.hpp"
+#include "kernels/elementwise_functions/floor_divide.hpp"
 #include "kernels/elementwise_functions/isfinite.hpp"
 #include "kernels/elementwise_functions/isinf.hpp"
 #include "kernels/elementwise_functions/isnan.hpp"
@@ -470,7 +471,40 @@ namespace impl
 // B10: ==== FLOOR_DIVIDE (x1, x2)
 namespace impl
 {
-// FIXME: add code for B10
+namespace floor_divide_fn_ns = dpctl::tensor::kernels::floor_divide;
+
+static binary_contig_impl_fn_ptr_t
+    floor_divide_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int floor_divide_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    floor_divide_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_floor_divide_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = floor_divide_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::FloorDivideTypeMapFactory;
+    DispatchTableBuilder<int, FloorDivideTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(floor_divide_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::FloorDivideStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
+                         FloorDivideStridedFactory, num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(floor_divide_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::FloorDivideContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, FloorDivideContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(floor_divide_contig_dispatch_table);
+};
+
 } // namespace impl
 
 // B11: ==== GREATER (x1, x2)
@@ -1151,7 +1185,45 @@ void init_elementwise_functions(py::module_ m)
 
     // FIXME:
 
     // B10: ==== FLOOR_DIVIDE (x1, x2)
-    // FIXME:
+    {
+        impl::populate_floor_divide_dispatch_tables();
+        using impl::floor_divide_contig_dispatch_table;
+        using impl::floor_divide_output_id_table;
+        using impl::floor_divide_strided_dispatch_table;
+
+        auto floor_divide_pyapi = [&](dpctl::tensor::usm_ndarray src1,
+                                      dpctl::tensor::usm_ndarray src2,
+                                      dpctl::tensor::usm_ndarray dst,
+                                      sycl::queue exec_q,
+                                      const std::vector<sycl::event> &depends =
+                                          {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, floor_divide_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                floor_divide_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays
+                // (most general case)
+                floor_divide_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto floor_divide_result_type_pyapi = [&](py::dtype dtype1,
+                                                  py::dtype dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               floor_divide_output_id_table);
+        };
+        m.def("_floor_divide", floor_divide_pyapi, "", py::arg("src1"),
py::arg("src1"), + py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_floor_divide_result_type", floor_divide_result_type_pyapi, ""); + } // B11: ==== GREATER (x1, x2) // FIXME: diff --git a/dpctl/tests/elementwise/test_floor_divide.py b/dpctl/tests/elementwise/test_floor_divide.py new file mode 100644 index 0000000000..def39a6570 --- /dev/null +++ b/dpctl/tests/elementwise/test_floor_divide.py @@ -0,0 +1,188 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import ctypes + +import numpy as np +import pytest + +import dpctl +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +from .utils import _compare_dtypes, _no_complex_dtypes, _usm_types + + +@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes) +@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes) +def test_floor_divide_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.floor_divide(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.floor_divide( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.ones(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.floor_divide(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.floor_divide( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_floor_divide_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type) + + r = dpt.floor_divide(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpctl.utils.get_coerced_usm_type( + (op1_usm_type, op2_usm_type) + ) + assert r.usm_type == expected_usm_type + + +def test_floor_divide_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + ar2 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.floor_divide(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.floor_divide(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.floor_divide(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.floor_divide(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones((20, 20), 
dtype="i4", order="F") + ar2 = dpt.ones((20, 20), dtype="i4", order="F") + r1 = dpt.floor_divide(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.floor_divide(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.floor_divide(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.floor_divide(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.floor_divide(ar1, ar2, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.floor_divide(ar1, ar2, order="K") + assert r4.strides == (-1, 20) + + +def test_floor_divide_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(1, 6, dtype="i4") + + r = dpt.floor_divide(m, v) + + expected = np.floor_divide( + np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4") + ) + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + r2 = dpt.floor_divide(v, m) + expected2 = np.floor_divide( + np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4") + ) + assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all() + + +@pytest.mark.parametrize("arr_dt", _no_complex_dtypes) +def test_floor_divide_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q) + py_ones = ( + bool(1), + int(1), + float(1), + np.float32(1), + ctypes.c_int(1), + ) + for sc in py_ones: + R = dpt.floor_divide(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.floor_divide(sc, X) + assert isinstance(R, dpt.usm_ndarray) + + +class MockArray: + def __init__(self, arr): + self.data_ = arr + + @property + def __sycl_usm_array_interface__(self): + return self.data_.__sycl_usm_array_interface__ + + +def test_floor_divide_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + b = dpt.ones(10) + c = MockArray(b) + r = dpt.floor_divide(a, c) + assert isinstance(r, dpt.usm_ndarray) + + +def test_floor_divide_canary_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + + class Canary: + def __init__(self): + pass + + @property + def __sycl_usm_array_interface__(self): + return None + + c = Canary() + with pytest.raises(ValueError): + dpt.floor_divide(a, c) diff --git a/dpctl/tests/elementwise/utils.py b/dpctl/tests/elementwise/utils.py index 0d9396dcb4..38778df4be 100644 --- a/dpctl/tests/elementwise/utils.py +++ b/dpctl/tests/elementwise/utils.py @@ -17,7 +17,7 @@ import dpctl import dpctl.tensor._type_utils as tu -_all_dtypes = [ +_no_complex_dtypes = [ "b1", "i1", "u1", @@ -30,6 +30,8 @@ "f2", "f4", "f8", +] +_all_dtypes = _no_complex_dtypes + [ "c8", "c16", ] @@ -48,6 +50,7 @@ def _compare_dtypes(dt, ref_dt, sycl_queue=None): __all__ = [ + "_no_complex_dtypes", "_all_dtypes", "_usm_types", "_map_to_device_dtype", From 9a0699964f3eded49c0f4425f09edf4a062d2c60 Mon Sep 17 00:00:00 2001 From: Natalia Polina Date: Tue, 6 Jun 2023 10:21:59 -0500 Subject: [PATCH 2/2] Vector implementation for floor_divide() function has been changed. 
---
 .../elementwise_functions/floor_divide.hpp    | 24 +++++++++----------
 .../libtensor/include/utils/offset_utils.hpp  |  4 ++--
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
index f79c706028..f9e84f6324 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
@@ -81,24 +81,24 @@ struct FloorDivideFunctor
         {
             return tmp;
         }
-        if constexpr (std::is_integral_v<typename decltype(tmp)::element_type>)
-        {
+        else if constexpr (
+            std::is_integral_v<typename decltype(tmp)::element_type>) {
             using dpctl::tensor::type_utils::vec_cast;
             return vec_cast<resT, typename decltype(tmp)::element_type,
                             vec_sz>(tmp);
         }
         else {
-            sycl::vec<resT, vec_sz> res;
-            for (int i = 0; i < vec_sz; i++) {
-                auto tmp2 = sycl::floor(tmp[i]);
-                if constexpr (std::is_same_v<resT, decltype(tmp2)>) {
-                    res[i] = tmp2;
-                }
-                else {
-                    res[i] = static_cast<resT>(tmp2);
-                }
-            }
-            return res;
+            sycl::vec<typename decltype(tmp)::element_type, vec_sz> res =
+                sycl::floor(tmp);
+            if constexpr (std::is_same_v<
+                              resT, typename decltype(res)::element_type>) {
+                return res;
+            }
+            else {
+                using dpctl::tensor::type_utils::vec_cast;
+                return vec_cast<resT, typename decltype(res)::element_type,
+                                vec_sz>(res);
+            }
         }
     }
 };
diff --git a/dpctl/tensor/libtensor/include/utils/offset_utils.hpp b/dpctl/tensor/libtensor/include/utils/offset_utils.hpp
index aaeb64a204..814854ef01 100644
--- a/dpctl/tensor/libtensor/include/utils/offset_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/offset_utils.hpp
@@ -64,7 +64,7 @@ template <typename V, typename U> sink_t __appender(V &lhs, U &&rhs)
 }
 
 template <typename T, typename A, typename... Vs>
-std::vector<T, A> concat(std::vector<T, A> lhs, Vs &&... vs)
+std::vector<T, A> concat(std::vector<T, A> lhs, Vs &&...vs)
 {
     std::size_t s = lhs.size();
     {
@@ -87,7 +87,7 @@ template <typename indT, typename... Vs>
 std::tuple<indT *, size_t, sycl::event>
 device_allocate_and_pack(sycl::queue q,
                          std::vector<sycl::event> &host_task_events,
-                         Vs &&... vs)
+                         Vs &&...vs)
 {
     // memory transfer optimization, use USM-host for temporary speeds up
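
Usage sketch (illustrative only, not part of either patch): a minimal look at
the new Python-level API added by PATCH 1/2. It assumes a SYCL device is
reachable through dpctl's default queue; the values shown simply follow the
floor(x1_i / x2_i) semantics described in the docstring above.

    import dpctl.tensor as dpt

    # two small single-precision arrays on the default-selected device
    x = dpt.asarray([5.0, 7.5, -7.5], dtype="f4")
    y = dpt.asarray([2.0, 2.0, 2.0], dtype="f4")

    # elementwise quotient rounded toward negative infinity:
    # floor(2.5) = 2, floor(3.75) = 3, floor(-3.75) = -4
    r = dpt.floor_divide(x, y)
    print(dpt.asnumpy(r))  # [ 2.  3. -4.]

    # the result dtype follows the type-promotion table populated by
    # populate_floor_divide_dispatch_tables() on the C++ side
    print(r.dtype)  # float32

Note the kernel-level design: for integral inputs the functor returns the
plain `/` quotient without calling sycl::floor, while floating-point inputs
take the sycl::floor path; PATCH 2/2 only changes the vectorized branch to
apply sycl::floor to the whole sycl::vec at once instead of looping over
lanes.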