Merge pull request #896 from ndgrigorian/dpctl-eye

oleksandr-pavlyk · web-flow · commit 4344c83c7ef4 · 2022-09-13T11:43:22.000-05:00
Implemented dpctl.tensor.eye constructor and tests
diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py
@@ -27,6 +27,7 @@
     asarray,
     empty,
     empty_like,
+    eye,
     full,
     full_like,
     linspace,
@@ -62,6 +63,7 @@
     "zeros",
     "ones",
     "full",
+    "eye",
     "linspace",
     "empty_like",
     "zeros_like",
diff --git a/dpctl/tensor/_ctors.py b/dpctl/tensor/_ctors.py
@@ -1035,3 +1035,84 @@ def linspace(
     )
     hev.wait()
     return res
+
+
+def eye(
+    n_rows,
+    n_cols=None,
+    /,
+    *,
+    k=0,
+    dtype=None,
+    order="C",
+    device=None,
+    usm_type="device",
+    sycl_queue=None,
+):
+    """
+    eye(n_rows, n_cols = None, /, *, k = 0, dtype = None, \
+    device = None, usm_type="device", sycl_queue=None) -> usm_ndarray
+
+    Creates `usm_ndarray` with ones on the `k`th diagonal.
+
+    Args:
+        n_rows: number of rows in the output array.
+        n_cols (optional): number of columns in the output array. If None,
+            n_cols = n_rows. Default: `None`.
+        k: index of the diagonal, with 0 as the main diagonal.
+            A positive value of k is a superdiagonal, a negative value
+            is a subdiagonal.
+            Raises `TypeError` if k is not an integer.
+            Default: `0`.
+        dtype (optional): data type of the array. Can be typestring,
+            a `numpy.dtype` object, `numpy` char string, or a numpy
+            scalar type. Default: None
+        order ("C" or F"): memory layout for the array. Default: "C"
+        device (optional): array API concept of device where the output array
+            is created. `device` can be `None`, a oneAPI filter selector string,
+            an instance of :class:`dpctl.SyclDevice` corresponding to a
+            non-partitioned SYCL device, an instance of
+            :class:`dpctl.SyclQueue`, or a `Device` object returnedby
+            `dpctl.tensor.usm_array.device`. Default: `None`.
+        usm_type ("device"|"shared"|"host", optional): The type of SYCL USM
+            allocation for the output array. Default: `"device"`.
+        sycl_queue (:class:`dpctl.SyclQueue`, optional): The SYCL queue to use
+            for output array allocation and copying. `sycl_queue` and `device`
+            are exclusive keywords, i.e. use one or another. If both are
+            specified, a `TypeError` is raised unless both imply the same
+            underlying SYCL queue to be used. If both are `None`, the
+            `dpctl.SyclQueue()` is used for allocation and copying.
+            Default: `None`.
+    """
+    if not isinstance(order, str) or len(order) == 0 or order[0] not in "CcFf":
+        raise ValueError(
+            "Unrecognized order keyword value, expecting 'F' or 'C'."
+        )
+    else:
+        order = order[0].upper()
+    n_rows = operator.index(n_rows)
+    n_cols = n_rows if n_cols is None else operator.index(n_cols)
+    k = operator.index(k)
+    if k >= n_cols or -k >= n_rows:
+        return dpt.zeros(
+            (n_rows, n_cols),
+            dtype=dtype,
+            order=order,
+            device=device,
+            usm_type=usm_type,
+            sycl_queue=sycl_queue,
+        )
+    dpctl.utils.validate_usm_type(usm_type, allow_none=False)
+    sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
+    dtype = _get_dtype(dtype, sycl_queue)
+    res = dpt.usm_ndarray(
+        (n_rows, n_cols),
+        dtype=dtype,
+        buffer=usm_type,
+        order=order,
+        buffer_ctor_kwargs={"queue": sycl_queue},
+    )
+    if n_rows != 0 and n_cols != 0:
+        hev, _ = ti._eye(k, dst=res, sycl_queue=sycl_queue)
+        hev.wait()
+    return res
diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp
@@ -45,6 +45,7 @@ template <typename srcT, typename dstT, int nd> class copy_cast_spec_kernel;
 template <typename Ty> class copy_for_reshape_generic_kernel;
 template <typename Ty> class linear_sequence_step_kernel;
 template <typename Ty, typename wTy> class linear_sequence_affine_kernel;
+template <typename Ty> class eye_kernel;
 
 static dpctl::tensor::detail::usm_ndarray_types array_types;
 
@@ -1740,6 +1741,144 @@ usm_ndarray_full(py::object py_value,
     }
 }
 
+/* ================ Eye ================== */
+
+typedef sycl::event (*eye_fn_ptr_t)(sycl::queue,
+                                    size_t nelems, // num_elements
+                                    py::ssize_t start,
+                                    py::ssize_t end,
+                                    py::ssize_t step,
+                                    char *, // dst_data_ptr
+                                    const std::vector<sycl::event> &);
+
+static eye_fn_ptr_t eye_dispatch_vector[_ns::num_types];
+
+template <typename Ty> class EyeFunctor
+{
+private:
+    Ty *p = nullptr;
+    py::ssize_t start_v;
+    py::ssize_t end_v;
+    py::ssize_t step_v;
+
+public:
+    EyeFunctor(char *dst_p,
+               const py::ssize_t v0,
+               const py::ssize_t v1,
+               const py::ssize_t dv)
+        : p(reinterpret_cast<Ty *>(dst_p)), start_v(v0), end_v(v1), step_v(dv)
+    {
+    }
+
+    void operator()(sycl::id<1> wiid) const
+    {
+        Ty set_v = 0;
+        py::ssize_t i = static_cast<py::ssize_t>(wiid.get(0));
+        if (i >= start_v and i <= end_v) {
+            if ((i - start_v) % step_v == 0) {
+                set_v = 1;
+            }
+        }
+        p[i] = set_v;
+    }
+};
+
+template <typename Ty>
+sycl::event eye_impl(sycl::queue exec_q,
+                     size_t nelems,
+                     const py::ssize_t start,
+                     const py::ssize_t end,
+                     const py::ssize_t step,
+                     char *array_data,
+                     const std::vector<sycl::event> &depends)
+{
+    sycl::event eye_event = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+        cgh.parallel_for<eye_kernel<Ty>>(
+            sycl::range<1>{nelems},
+            EyeFunctor<Ty>(array_data, start, end, step));
+    });
+
+    return eye_event;
+}
+
+template <typename fnT, typename Ty> struct EyeFactory
+{
+    fnT get()
+    {
+        fnT f = eye_impl<Ty>;
+        return f;
+    }
+};
+
+std::pair<sycl::event, sycl::event>
+eye(py::ssize_t k,
+    dpctl::tensor::usm_ndarray dst,
+    sycl::queue exec_q,
+    const std::vector<sycl::event> &depends = {})
+{
+    // dst must be 2D
+
+    if (dst.get_ndim() != 2) {
+        throw py::value_error(
+            "usm_ndarray_eye: Expecting 2D array to populate");
+    }
+
+    sycl::queue dst_q = dst.get_queue();
+    if (!dpctl::utils::queues_are_compatible(exec_q, {dst_q})) {
+        throw py::value_error("Execution queue is not compatible with the "
+                              "allocation queue");
+    }
+
+    int dst_typenum = dst.get_typenum();
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    const py::ssize_t nelem = dst.get_size();
+    const py::ssize_t rows = dst.get_shape(0);
+    const py::ssize_t cols = dst.get_shape(1);
+    if (rows == 0 || cols == 0) {
+        // nothing to do
+        return std::make_pair(sycl::event{}, sycl::event{});
+    }
+
+    bool is_dst_c_contig = ((dst.get_flags() & USM_ARRAY_C_CONTIGUOUS) != 0);
+    bool is_dst_f_contig = ((dst.get_flags() & USM_ARRAY_F_CONTIGUOUS) != 0);
+    if (!is_dst_c_contig && !is_dst_f_contig) {
+        throw py::value_error("USM array is not contiguous");
+    }
+
+    py::ssize_t start;
+    if (is_dst_c_contig) {
+        start = (k < 0) ? -k * cols : k;
+    }
+    else {
+        start = (k < 0) ? -k : k * rows;
+    }
+
+    const py::ssize_t *strides = dst.get_strides_raw();
+    py::ssize_t step;
+    if (strides == nullptr) {
+        step = (is_dst_c_contig) ? cols + 1 : rows + 1;
+    }
+    else {
+        step = strides[0] + strides[1];
+    }
+
+    const py::ssize_t length = std::min({rows, cols, rows + k, cols - k});
+    const py::ssize_t end = start + step * (length - 1);
+
+    char *dst_data = dst.get_data();
+    sycl::event eye_event;
+
+    auto fn = eye_dispatch_vector[dst_typeid];
+
+    eye_event = fn(exec_q, static_cast<size_t>(nelem), start, end, step,
+                   dst_data, depends);
+
+    return std::make_pair(keep_args_alive(exec_q, {dst}, {eye_event}),
+                          eye_event);
+}
+
 // populate dispatch tables
 void init_copy_and_cast_dispatch_tables(void)
 {
@@ -1794,6 +1933,9 @@ void init_copy_for_reshape_dispatch_vector(void)
         dvb3;
     dvb3.populate_dispatch_vector(full_contig_dispatch_vector);
 
+    DispatchVectorBuilder<eye_fn_ptr_t, EyeFactory, num_types> dvb4;
+    dvb4.populate_dispatch_vector(eye_dispatch_vector);
+
     return;
 }
 
@@ -1899,6 +2041,16 @@ PYBIND11_MODULE(_tensor_impl, m)
           py::arg("fill_value"), py::arg("dst"), py::arg("sycl_queue"),
           py::arg("depends") = py::list());
 
+    m.def("_eye", &eye,
+          "Fills input 2D contiguous usm_ndarray `dst` with "
+          "zeros outside of the diagonal "
+          "specified by "
+          "the diagonal index `k` "
+          "which is filled with ones."
+          "Returns a tuple of events: (ht_event, comp_event)",
+          py::arg("k"), py::arg("dst"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
     m.def("default_device_fp_type", [](sycl::queue q) -> std::string {
         return get_default_device_fp_type(q.get_device());
     });
diff --git a/dpctl/tests/test_usm_ndarray_ctor.py b/dpctl/tests/test_usm_ndarray_ctor.py
@@ -1256,6 +1256,24 @@ def test_full_like(dt, usm_kind):
     assert np.array_equal(dpt.asnumpy(Y), np.ones(X.shape, dtype=X.dtype))
 
 
+@pytest.mark.parametrize("dtype", _all_dtypes)
+@pytest.mark.parametrize("usm_kind", ["shared", "device", "host"])
+def test_eye(dtype, usm_kind):
+    try:
+        q = dpctl.SyclQueue()
+    except dpctl.SyclQueueCreationError:
+        pytest.skip("Queue could not be created")
+
+    if dtype in ["f8", "c16"] and q.sycl_device.has_aspect_fp64 is False:
+        pytest.skip(
+            "Device does not support double precision floating point type"
+        )
+    X = dpt.eye(4, 5, k=1, dtype=dtype, usm_type=usm_kind, sycl_queue=q)
+    Xnp = np.eye(4, 5, k=1, dtype=dtype)
+    assert X.dtype == Xnp.dtype
+    assert np.array_equal(Xnp, dpt.asnumpy(X))
+
+
 def test_common_arg_validation():
     order = "I"
     # invalid order must raise ValueError
@@ -1267,6 +1285,8 @@ def test_common_arg_validation():
         dpt.ones(10, order=order)
     with pytest.raises(ValueError):
         dpt.full(10, 1, order=order)
+    with pytest.raises(ValueError):
+        dpt.eye(10, order=order)
     X = dpt.empty(10)
     with pytest.raises(ValueError):
         dpt.empty_like(X, order=order)