@@ -2,7 +2,7 @@
//
// Data Parallel Control (dpctl)
//
-// Copyright 2020-2022 Intel Corporation
+// Copyright 2020-2023 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -57,31 +57,24 @@ class WhereContigFunctor
{
private:
    size_t nelems = 0;
-    const char *x1_cp = nullptr;
-    const char *x2_cp = nullptr;
-    char *dst_cp = nullptr;
-    const char *cond_cp = nullptr;
+    const condT *cond_p = nullptr;
+    const T *x1_p = nullptr;
+    const T *x2_p = nullptr;
+    T *dst_p = nullptr;

public:
    WhereContigFunctor(size_t nelems_,
-                       const char *cond_data_p,
-                       const char *x1_data_p,
-                       const char *x2_data_p,
-                       char *dst_data_p)
-        : nelems(nelems_), x1_cp(x1_data_p), x2_cp(x2_data_p),
-          dst_cp(dst_data_p), cond_cp(cond_data_p)
+                       const condT *cond_p_,
+                       const T *x1_p_,
+                       const T *x2_p_,
+                       T *dst_p_)
+        : nelems(nelems_), cond_p(cond_p_), x1_p(x1_p_), x2_p(x2_p_),
+          dst_p(dst_p_)
    {
    }

    void operator()(sycl::nd_item<1> ndit) const
    {
-        const T *x1_data = reinterpret_cast<const T *>(x1_cp);
-        const T *x2_data = reinterpret_cast<const T *>(x2_cp);
-        T *dst_data = reinterpret_cast<T *>(dst_cp);
-        const condT *cond_data = reinterpret_cast<const condT *>(cond_cp);
-
-        using dpctl::tensor::type_utils::convert_impl;
-
        using dpctl::tensor::type_utils::is_complex;
        if constexpr (is_complex<condT>::value || is_complex<T>::value) {
            std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0];
@@ -92,8 +85,9 @@ class WhereContigFunctor
                 offset < std::min(nelems, base + sgSize * (n_vecs * vec_sz));
                 offset += sgSize)
            {
-                bool check = convert_impl<bool, condT>(cond_data[offset]);
-                dst_data[offset] = check ? x1_data[offset] : x2_data[offset];
+                using dpctl::tensor::type_utils::convert_impl;
+                bool check = convert_impl<bool, condT>(cond_p[offset]);
+                dst_p[offset] = check ? x1_p[offset] : x2_p[offset];
            }
        }
        else {
@@ -115,7 +109,6 @@ class WhereContigFunctor
            using cond_ptrT =
                sycl::multi_ptr<const condT,
                                sycl::access::address_space::global_space>;
-
            sycl::vec<T, vec_sz> dst_vec;
            sycl::vec<T, vec_sz> x1_vec;
            sycl::vec<T, vec_sz> x2_vec;
@@ -124,23 +117,20 @@ class WhereContigFunctor
#pragma unroll
                for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
                    auto idx = base + it * sgSize;
-                    x1_vec = sg.load<vec_sz>(x_ptrT(&x1_data[idx]));
-                    x2_vec = sg.load<vec_sz>(x_ptrT(&x2_data[idx]));
-                    cond_vec = sg.load<vec_sz>(cond_ptrT(&cond_data[idx]));
-
+                    x1_vec = sg.load<vec_sz>(x_ptrT(&x1_p[idx]));
+                    x2_vec = sg.load<vec_sz>(x_ptrT(&x2_p[idx]));
+                    cond_vec = sg.load<vec_sz>(cond_ptrT(&cond_p[idx]));
#pragma unroll
                    for (std::uint8_t k = 0; k < vec_sz; ++k) {
-                        bool check = convert_impl<bool, condT>(cond_vec[k]);
-                        dst_vec[k] = check ? x1_vec[k] : x2_vec[k];
+                        dst_vec[k] = cond_vec[k] ? x1_vec[k] : x2_vec[k];
                    }
-                    sg.store<vec_sz>(dst_ptrT(&dst_data[idx]), dst_vec);
+                    sg.store<vec_sz>(dst_ptrT(&dst_p[idx]), dst_vec);
                }
            }
            else {
                for (size_t k = base + sg.get_local_id()[0]; k < nelems;
                     k += sgSize) {
-                    bool check = convert_impl<bool, condT>(cond_data[k]);
-                    dst_data[k] = check ? x1_data[k] : x2_data[k];
+                    dst_p[k] = cond_p[k] ? x1_p[k] : x2_p[k];
                }
            }
        }
@@ -159,12 +149,17 @@ typedef sycl::event (*where_contig_impl_fn_ptr_t)(
template <typename T, typename condT>
sycl::event where_contig_impl(sycl::queue q,
                              size_t nelems,
-                              const char *cond_p,
-                              const char *x1_p,
-                              const char *x2_p,
-                              char *dst_p,
+                              const char *cond_cp,
+                              const char *x1_cp,
+                              const char *x2_cp,
+                              char *dst_cp,
                              const std::vector<sycl::event> &depends)
{
+    const condT *cond_tp = reinterpret_cast<const condT *>(cond_cp);
+    const T *x1_tp = reinterpret_cast<const T *>(x1_cp);
+    const T *x2_tp = reinterpret_cast<const T *>(x2_cp);
+    T *dst_tp = reinterpret_cast<T *>(dst_cp);
+
    sycl::event where_ev = q.submit([&](sycl::handler &cgh) {
        cgh.depends_on(depends);

@@ -178,8 +173,8 @@ sycl::event where_contig_impl(sycl::queue q,

        cgh.parallel_for<where_contig_kernel<T, condT, vec_sz, n_vecs>>(
            sycl::nd_range<1>(gws_range, lws_range),
-            WhereContigFunctor<T, condT, vec_sz, n_vecs>(nelems, cond_p, x1_p,
-                                                         x2_p, dst_p));
+            WhereContigFunctor<T, condT, vec_sz, n_vecs>(nelems, cond_tp, x1_tp,
+                                                         x2_tp, dst_tp));
    });

    return where_ev;
@@ -189,39 +184,34 @@ template <typename T, typename condT, typename IndexerT>
class WhereStridedFunctor
{
private:
-    const char *x1_cp = nullptr;
-    const char *x2_cp = nullptr;
-    char *dst_cp = nullptr;
-    const char *cond_cp = nullptr;
+    const T *x1_p = nullptr;
+    const T *x2_p = nullptr;
+    T *dst_p = nullptr;
+    const condT *cond_p = nullptr;
    IndexerT indexer;

public:
-    WhereStridedFunctor(const char *cond_data_p,
-                        const char *x1_data_p,
-                        const char *x2_data_p,
-                        char *dst_data_p,
+    WhereStridedFunctor(const condT *cond_p_,
+                        const T *x1_p_,
+                        const T *x2_p_,
+                        T *dst_p_,
                        IndexerT indexer_)
-        : x1_cp(x1_data_p), x2_cp(x2_data_p), dst_cp(dst_data_p),
-          cond_cp(cond_data_p), indexer(indexer_)
+        : x1_p(x1_p_), x2_p(x2_p_), dst_p(dst_p_), cond_p(cond_p_),
+          indexer(indexer_)
    {
    }

    void operator()(sycl::id<1> id) const
    {
-        const T *x1_data = reinterpret_cast<const T *>(x1_cp);
-        const T *x2_data = reinterpret_cast<const T *>(x2_cp);
-        T *dst_data = reinterpret_cast<T *>(dst_cp);
-        const condT *cond_data = reinterpret_cast<const condT *>(cond_cp);
-
        size_t gid = id[0];
        auto offsets = indexer(static_cast<py::ssize_t>(gid));

        using dpctl::tensor::type_utils::convert_impl;
        bool check =
-            convert_impl<bool, condT>(cond_data[offsets.get_first_offset()]);
+            convert_impl<bool, condT>(cond_p[offsets.get_first_offset()]);

-        dst_data[gid] = check ? x1_data[offsets.get_second_offset()]
-                              : x2_data[offsets.get_third_offset()];
+        dst_p[gid] = check ? x1_p[offsets.get_second_offset()]
+                           : x2_p[offsets.get_third_offset()];
    }
};

@@ -243,16 +233,21 @@ template <typename T, typename condT>
sycl::event where_strided_impl(sycl::queue q,
                               size_t nelems,
                               int nd,
-                               const char *cond_p,
-                               const char *x1_p,
-                               const char *x2_p,
-                               char *dst_p,
+                               const char *cond_cp,
+                               const char *x1_cp,
+                               const char *x2_cp,
+                               char *dst_cp,
                               const py::ssize_t *shape_strides,
                               py::ssize_t x1_offset,
                               py::ssize_t x2_offset,
                               py::ssize_t cond_offset,
                               const std::vector<sycl::event> &depends)
{
+    const condT *cond_tp = reinterpret_cast<const condT *>(cond_cp);
+    const T *x1_tp = reinterpret_cast<const T *>(x1_cp);
+    const T *x2_tp = reinterpret_cast<const T *>(x2_cp);
+    T *dst_tp = reinterpret_cast<T *>(dst_cp);
+
    sycl::event where_ev = q.submit([&](sycl::handler &cgh) {
        cgh.depends_on(depends);

@@ -263,7 +258,7 @@ sycl::event where_strided_impl(sycl::queue q,
            where_strided_kernel<T, condT, ThreeOffsets_StridedIndexer>>(
            sycl::range<1>(nelems),
            WhereStridedFunctor<T, condT, ThreeOffsets_StridedIndexer>(
-                cond_p, x1_p, x2_p, dst_p, indexer));
+                cond_tp, x1_tp, x2_tp, dst_tp, indexer));
    });

    return where_ev;
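
For orientation, here is a minimal stand-alone sketch (plain C++, no SYCL) of the pattern this diff applies: the char* to typed-pointer reinterpret_casts move out of the device functor's operator() and into the host-side impl function, so the functor only ever holds typed pointers and the cast happens once per kernel launch instead of once per work item's call path. The names WhereFunctor and where_impl below are illustrative stand-ins, not dpctl APIs, and the serial loop stands in for q.submit / parallel_for.

#include <cstddef>
#include <iostream>

// Functor holds typed pointers; no casting inside the "kernel" body.
template <typename T, typename condT> class WhereFunctor
{
private:
    std::size_t nelems = 0;
    const condT *cond_p = nullptr;
    const T *x1_p = nullptr;
    const T *x2_p = nullptr;
    T *dst_p = nullptr;

public:
    WhereFunctor(std::size_t nelems_,
                 const condT *cond_p_,
                 const T *x1_p_,
                 const T *x2_p_,
                 T *dst_p_)
        : nelems(nelems_), cond_p(cond_p_), x1_p(x1_p_), x2_p(x2_p_),
          dst_p(dst_p_)
    {
    }

    void operator()(std::size_t i) const
    {
        dst_p[i] = cond_p[i] ? x1_p[i] : x2_p[i];
    }
};

// Type-erased entry point, as a dispatch table would store it:
// cast the char* arguments once, then hand typed pointers to the functor.
template <typename T, typename condT>
void where_impl(std::size_t nelems,
                const char *cond_cp,
                const char *x1_cp,
                const char *x2_cp,
                char *dst_cp)
{
    const condT *cond_tp = reinterpret_cast<const condT *>(cond_cp);
    const T *x1_tp = reinterpret_cast<const T *>(x1_cp);
    const T *x2_tp = reinterpret_cast<const T *>(x2_cp);
    T *dst_tp = reinterpret_cast<T *>(dst_cp);

    WhereFunctor<T, condT> fn(nelems, cond_tp, x1_tp, x2_tp, dst_tp);
    for (std::size_t i = 0; i < nelems; ++i) {
        fn(i); // stands in for submitting the functor to a SYCL queue
    }
}

int main()
{
    float x1[] = {1.0f, 2.0f, 3.0f};
    float x2[] = {10.0f, 20.0f, 30.0f};
    bool cond[] = {true, false, true};
    float dst[3];

    where_impl<float, bool>(3, reinterpret_cast<const char *>(cond),
                            reinterpret_cast<const char *>(x1),
                            reinterpret_cast<const char *>(x2),
                            reinterpret_cast<char *>(dst));

    for (float v : dst) {
        std::cout << v << ' '; // prints: 1 20 3
    }
    std::cout << '\n';
}

One design note the sketch makes visible: because the casts live behind the type-erased where_impl signature, the functor's members can carry their real element types, which lets the compiler check the ternary select and lets the vectorized SYCL path in the actual diff load and store sycl::vec values without per-element reinterpretation.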