From 883fd26f5e5bc354ff96c33080b7645b083a50c6 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Tue, 4 Jul 2023 01:12:02 -0700 Subject: [PATCH 1/4] Implements negative, positive, pow, and square --- dpctl/tensor/__init__.py | 8 + dpctl/tensor/_elementwise_funcs.py | 74 ++++- .../elementwise_functions/negative.hpp | 236 +++++++++++++++ .../elementwise_functions/positive.hpp | 251 ++++++++++++++++ .../kernels/elementwise_functions/pow.hpp | 269 ++++++++++++++++++ .../kernels/elementwise_functions/square.hpp | 206 ++++++++++++++ .../source/elementwise_functions.cpp | 242 +++++++++++++++- 7 files changed, 1274 insertions(+), 12 deletions(-) create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index ec488cb3d4..af3ae9b1c2 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -116,11 +116,15 @@ logical_or, logical_xor, multiply, + negative, not_equal, + positive, + pow, proj, real, sin, sqrt, + square, subtract, ) from ._reduction import sum @@ -220,12 +224,16 @@ "logical_or", "logical_xor", "log1p", + "negative", + "positive", "proj", "real", "sin", "sqrt", + "square", "divide", "multiply", + "pow", "subtract", "equal", "not_equal", diff --git a/dpctl/tensor/_elementwise_funcs.py b/dpctl/tensor/_elementwise_funcs.py index 2c07ab8e6a..49e7309998 100644 --- a/dpctl/tensor/_elementwise_funcs.py +++ b/dpctl/tensor/_elementwise_funcs.py @@ -715,7 +715,27 @@ ) # U25: ==== NEGATIVE (x) -# FIXME: implement U25 +_negative_docstring_ = """ +negative(x, out=None, order='K') + +Computes the numerical negative elementwise. +Args: + x (usm_ndarray): + Input array, expected to have numeric data type. + out (usm_ndarray): + Output array to populate. Array must have the correct + shape and the expected data type. + order ("C","F","A","K", optional): memory layout of the new + output array, if parameter `out` is `None`. + Default: "K". +Return: + usm_ndarray: + An array containing the element-wise negative values. +""" + +negative = UnaryElementwiseFunc( + "negative", ti._negative_result_type, ti._negative, _negative_docstring_ +) # B20: ==== NOT_EQUAL (x1, x2) _not_equal_docstring_ = """ @@ -747,10 +767,48 @@ ) # U26: ==== POSITIVE (x) -# FIXME: implement U26 +_positive_docstring_ = """ +positive(x, out=None, order='K') + +Computes the numerical positive element-wise. +Args: + x (usm_ndarray): + Input array, expected to have numeric data type. + out (usm_ndarray): + Output array to populate. Array must have the correct + shape and the expected data type. + order ("C","F","A","K", optional): memory layout of the new + output array, if parameter `out` is `None`. + Default: "K". +Return: + usm_ndarray: + An array containing the element-wise positive values. +""" + +positive = UnaryElementwiseFunc( + "positive", ti._positive_result_type, ti._positive, _positive_docstring_ +) # B21: ==== POW (x1, x2) -# FIXME: implement B21 +_pow_docstring_ = """ +pow(x1, x2, out=None, order='K') + +Calculates `x1_i` raised to `x2_i` for each element `x1_i` of the input array +`x1` with the respective element `x2_i` of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array, expected to have a numeric data type. 
+ x2 (usm_ndarray): + Second input array, also expected to have a numeric data type. +Returns: + usm_narray: + an array containing the element-wise result. The data type of + the returned array is determined by the Type Promotion Rules. +""" +pow = BinaryElementwiseFunc( + "pow", ti._pow_result_type, ti._pow, _pow_docstring_ +) # U??: ==== PROJ (x) _proj_docstring = """ @@ -838,7 +896,15 @@ # FIXME: implement U31 # U32: ==== SQUARE (x) -# FIXME: implement U32 +_square_docstring_ = """ +square(x, out=None, order='K') + +Computes `x_i**2` for each element `x_i` for input array `x`. +""" + +square = UnaryElementwiseFunc( + "square", ti._square_result_type, ti._square, _square_docstring_ +) # U33: ==== SQRT (x) _sqrt_docstring_ = """ diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp new file mode 100644 index 0000000000..f69a35669e --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp @@ -0,0 +1,236 @@ +//=== negative.hpp - Unary function POSITIVE ------ *-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of POSITIVE(x) +/// function that returns x. 
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" +#include + +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace negative +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; +using dpctl::tensor::type_utils::vec_cast; + +template struct NegativeFunctor +{ + + using is_constant = typename std::false_type; + // constexpr resT constant_value = resT{}; + using supports_vec = typename std::false_type; + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &x) + { + return -x; + } +}; + +template +using NegativeContigFunctor = elementwise_common:: + UnaryContigFunctor, vec_sz, n_vecs>; + +template struct NegativeOutputType +{ + using value_type = typename std::disjunction< // disjunction is C++17 + // feature, supported by DPC++ + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; +}; + +template +class negative_contig_kernel; + +typedef sycl::event (*negative_contig_impl_fn_ptr_t)( + sycl::queue, + size_t, + const char *, + char *, + const std::vector &); + +template +sycl::event negative_contig_impl(sycl::queue exec_q, + size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + sycl::event negative_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + size_t lws = 64; + constexpr unsigned int vec_sz = 4; + constexpr unsigned int n_vecs = 2; + const size_t n_groups = + ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); + const auto gws_range = sycl::range<1>(n_groups * lws); + const auto lws_range = sycl::range<1>(lws); + + using resTy = typename NegativeOutputType::value_type; + const argTy *arg_tp = reinterpret_cast(arg_p); + resTy *res_tp = reinterpret_cast(res_p); + + cgh.parallel_for>( + sycl::nd_range<1>(gws_range, lws_range), + NegativeContigFunctor(arg_tp, res_tp, + nelems)); + }); + return negative_ev; +} + +template struct NegativeContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = negative_contig_impl; + return fn; + } + } +}; + +template struct NegativeTypeMapFactory +{ + /*! 
@brief get typeid for output type of std::negative(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename NegativeOutputType::value_type; + ; + return td_ns::GetTypeid{}.get(); + } +}; + +template +using NegativeStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template class negative_strided_kernel; + +typedef sycl::event (*negative_strided_impl_fn_ptr_t)( + sycl::queue, + size_t, + int, + const py::ssize_t *, + const char *, + py::ssize_t, + char *, + py::ssize_t, + const std::vector &, + const std::vector &); + +template +sycl::event +negative_strided_impl(sycl::queue exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + sycl::event negative_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + using resTy = typename NegativeOutputType::value_type; + using IndexerT = + typename dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + + IndexerT indexer{nd, arg_offset, res_offset, shape_and_strides}; + + const argTy *arg_tp = reinterpret_cast(arg_p); + resTy *res_tp = reinterpret_cast(res_p); + + cgh.parallel_for>( + {nelems}, NegativeStridedFunctor( + arg_tp, res_tp, indexer)); + }); + return negative_ev; +} + +template struct NegativeStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = negative_strided_impl; + return fn; + } + } +}; + +} // namespace negative +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp new file mode 100644 index 0000000000..725bdf83df --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp @@ -0,0 +1,251 @@ +//=== positive.hpp - Unary function POSITIVE ------ *-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of POSITIVE(x) +/// function that returns x. 
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" +#include + +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace positive +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; +using dpctl::tensor::type_utils::vec_cast; + +template struct PositiveFunctor +{ + + using is_constant = typename std::false_type; + // constexpr resT constant_value = resT{}; + using supports_vec = typename std::negation< + std::disjunction, is_complex>>; + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &x) + { + return x; + } + + template + sycl::vec operator()(const sycl::vec &in) + { + auto const &res_vec = in; + using deducedT = typename std::remove_cv_t< + std::remove_reference_t>::element_type; + if constexpr (std::is_same_v) { + return res_vec; + } + else { + return vec_cast(res_vec); + } + } +}; + +template +using PositiveContigFunctor = elementwise_common:: + UnaryContigFunctor, vec_sz, n_vecs>; + +template struct PositiveOutputType +{ + using value_type = typename std::disjunction< // disjunction is C++17 + // feature, supported by DPC++ + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; +}; + +template +class positive_contig_kernel; + +typedef sycl::event (*positive_contig_impl_fn_ptr_t)( + sycl::queue, + size_t, + const char *, + char *, + const std::vector &); + +template +sycl::event positive_contig_impl(sycl::queue exec_q, + size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + sycl::event positive_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + size_t lws = 64; + constexpr unsigned int vec_sz = 4; + constexpr unsigned int n_vecs = 2; + const size_t n_groups = + ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); + const auto gws_range = sycl::range<1>(n_groups * lws); + const auto lws_range = sycl::range<1>(lws); + + using resTy = typename PositiveOutputType::value_type; + const argTy *arg_tp = reinterpret_cast(arg_p); + resTy *res_tp = reinterpret_cast(res_p); + + cgh.parallel_for>( + sycl::nd_range<1>(gws_range, lws_range), + PositiveContigFunctor(arg_tp, res_tp, + nelems)); + }); + return positive_ev; +} + +template struct PositiveContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = positive_contig_impl; + return fn; + } + } +}; + +template struct PositiveTypeMapFactory +{ + /*! 
@brief get typeid for output type of std::positive(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename PositiveOutputType::value_type; + ; + return td_ns::GetTypeid{}.get(); + } +}; + +template +using PositiveStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template class positive_strided_kernel; + +typedef sycl::event (*positive_strided_impl_fn_ptr_t)( + sycl::queue, + size_t, + int, + const py::ssize_t *, + const char *, + py::ssize_t, + char *, + py::ssize_t, + const std::vector &, + const std::vector &); + +template +sycl::event +positive_strided_impl(sycl::queue exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + sycl::event positive_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + using resTy = typename PositiveOutputType::value_type; + using IndexerT = + typename dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + + IndexerT indexer{nd, arg_offset, res_offset, shape_and_strides}; + + const argTy *arg_tp = reinterpret_cast(arg_p); + resTy *res_tp = reinterpret_cast(res_p); + + cgh.parallel_for>( + {nelems}, PositiveStridedFunctor( + arg_tp, res_tp, indexer)); + }); + return positive_ev; +} + +template struct PositiveStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = positive_strided_impl; + return fn; + } + } +}; + +} // namespace positive +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp new file mode 100644 index 0000000000..9c5727c6d4 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp @@ -0,0 +1,269 @@ +//=== POW.hpp - Binary function POW ------ *-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of POW(x1, x2) +/// function. 
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace pow +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template struct PowFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = + std::negation, + tu_ns::is_complex, + std::is_integral, + std::is_integral>>; + + resT operator()(const argT1 &in1, const argT2 &in2) + { + return std::pow(in1, in2); + } + + template + sycl::vec operator()(const sycl::vec &in1, + const sycl::vec &in2) + { + auto res = sycl::pow(in1, in2); + if constexpr (std::is_same_v) { + return res; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + res); + } + } +}; + +template +using PowContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs>; + +template +using PowStridedFunctor = + elementwise_common::BinaryStridedFunctor>; + +// TODO: when type promotion logic is better defined, +// consider implementing overloads of std::pow that take +// integers for the exponents. Seem to give better accuracy in +// some cases (complex data especially) +template struct PowOutputType +{ + using value_type = typename std::disjunction< // disjunction is C++17 + // feature, supported by DPC++ + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::DefaultResultEntry>::result_type; +}; + +template +class pow_contig_kernel; + +template +sycl::event pow_contig_impl(sycl::queue exec_q, + size_t nelems, + const char *arg1_p, + py::ssize_t arg1_offset, + const char *arg2_p, + py::ssize_t arg2_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_contig_impl< + argTy1, argTy2, PowOutputType, PowContigFunctor, pow_contig_kernel>( + exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends); +} + +template struct PowContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = pow_contig_impl; + return fn; + } + } +}; + +template struct PowTypeMapFactory +{ + /*! 
@brief get typeid for output type of std::pow(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + using rT = typename PowOutputType::value_type; + ; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class pow_strided_strided_kernel; + +template +sycl::event pow_strided_impl(sycl::queue exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg1_p, + py::ssize_t arg1_offset, + const char *arg2_p, + py::ssize_t arg2_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, PowOutputType, PowStridedFunctor, + pow_strided_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); +} + +template struct PowStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = pow_strided_impl; + return fn; + } + } +}; + +} // namespace pow +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp new file mode 100644 index 0000000000..29d096ae88 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp @@ -0,0 +1,206 @@ +//=== square.hpp - Unary function SQUARE ------ *-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of SQUARE(x) +/// +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace square +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; +using dpctl::tensor::type_utils::vec_cast; + +template struct SquareFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::negation< + std::disjunction, is_complex>>; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) + { + return in * in; + } + + template + sycl::vec operator()(const sycl::vec &in) + { + auto const &res_vec = in * in; + using deducedT = typename std::remove_cv_t< + std::remove_reference_t>::element_type; + if constexpr (std::is_same_v) { + return res_vec; + } + else { + return vec_cast(res_vec); + } + } +}; + +template +using SquareContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs>; + +template +using SquareStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template struct SquareOutputType +{ + using value_type = typename std::disjunction< // disjunction is C++17 + // feature, supported by DPC++ + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; +}; + +template +class square_contig_kernel; + +template +sycl::event square_contig_impl(sycl::queue exec_q, + size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + return elementwise_common::unary_contig_impl< + argTy, SquareOutputType, SquareContigFunctor, square_contig_kernel>( + exec_q, nelems, arg_p, res_p, depends); +} + +template struct SquareContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = square_contig_impl; + return fn; + } + } +}; + +template struct SquareTypeMapFactory +{ + /*! 
@brief get typeid for output type of x * x */ + std::enable_if_t::value, int> get() + { + using rT = typename SquareOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template class square_strided_kernel; + +template +sycl::event +square_strided_impl(sycl::queue exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, SquareOutputType, SquareStridedFunctor, square_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template struct SquareStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = square_strided_impl; + return fn; + } + } +}; + +} // namespace square +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions.cpp b/dpctl/tensor/libtensor/source/elementwise_functions.cpp index 5898f0ca7d..dbd06d9250 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions.cpp @@ -55,11 +55,15 @@ #include "kernels/elementwise_functions/logical_or.hpp" #include "kernels/elementwise_functions/logical_xor.hpp" #include "kernels/elementwise_functions/multiply.hpp" +#include "kernels/elementwise_functions/negative.hpp" #include "kernels/elementwise_functions/not_equal.hpp" +#include "kernels/elementwise_functions/positive.hpp" +#include "kernels/elementwise_functions/pow.hpp" #include "kernels/elementwise_functions/proj.hpp" #include "kernels/elementwise_functions/real.hpp" #include "kernels/elementwise_functions/sin.hpp" #include "kernels/elementwise_functions/sqrt.hpp" +#include "kernels/elementwise_functions/square.hpp" #include "kernels/elementwise_functions/subtract.hpp" #include "kernels/elementwise_functions/true_divide.hpp" @@ -1276,7 +1280,37 @@ void populate_multiply_dispatch_tables(void) // U25: ==== NEGATIVE (x) namespace impl { -// FIXME: add code for U25 + +namespace negative_fn_ns = dpctl::tensor::kernels::negative; + +static unary_contig_impl_fn_ptr_t + negative_contig_dispatch_vector[td_ns::num_types]; +static int negative_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + negative_strided_dispatch_vector[td_ns::num_types]; + +void populate_negative_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = negative_fn_ns; + + using fn_ns::NegativeContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(negative_contig_dispatch_vector); + + using fn_ns::NegativeStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(negative_strided_dispatch_vector); + + using fn_ns::NegativeTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(negative_output_typeid_vector); +} + } // namespace impl // B20: ==== NOT_EQUAL (x1, x2) @@ -1320,13 +1354,77 @@ void populate_not_equal_dispatch_tables(void) // U26: ==== POSITIVE (x) namespace impl { -// FIXME: add code for U26 + +namespace positive_fn_ns = dpctl::tensor::kernels::positive; + +static unary_contig_impl_fn_ptr_t + positive_contig_dispatch_vector[td_ns::num_types]; +static int positive_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + 
positive_strided_dispatch_vector[td_ns::num_types]; + +void populate_positive_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = positive_fn_ns; + + using fn_ns::PositiveContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(positive_contig_dispatch_vector); + + using fn_ns::PositiveStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(positive_strided_dispatch_vector); + + using fn_ns::PositiveTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(positive_output_typeid_vector); +} + } // namespace impl // B21: ==== POW (x1, x2) namespace impl { -// FIXME: add code for B21 + +namespace pow_fn_ns = dpctl::tensor::kernels::pow; + +static binary_contig_impl_fn_ptr_t pow_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static int pow_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + pow_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +void populate_pow_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = pow_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::PowTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(pow_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::PowStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(pow_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::PowContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(pow_contig_dispatch_table); +}; + } // namespace impl // U??: ==== PROJ (x) @@ -1459,7 +1557,37 @@ namespace impl // U32: ==== SQUARE (x) namespace impl { -// FIXME: add code for U32 + +namespace square_fn_ns = dpctl::tensor::kernels::square; + +static unary_contig_impl_fn_ptr_t + square_contig_dispatch_vector[td_ns::num_types]; +static int square_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + square_strided_dispatch_vector[td_ns::num_types]; + +void populate_square_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = square_fn_ns; + + using fn_ns::SquareContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(square_contig_dispatch_vector); + + using fn_ns::SquareStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(square_strided_dispatch_vector); + + using fn_ns::SquareTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(square_output_typeid_vector); +} + } // namespace impl // U33: ==== SQRT (x) @@ -2493,7 +2621,28 @@ void init_elementwise_functions(py::module_ m) } // U25: ==== NEGATIVE (x) - // FIXME: + { + impl::populate_negative_dispatch_vectors(); + using impl::negative_contig_dispatch_vector; + using impl::negative_output_typeid_vector; + using impl::negative_strided_dispatch_vector; + + auto negative_pyapi = [&](arrayT src, arrayT dst, sycl::queue exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc(src, dst, exec_q, depends, + negative_output_typeid_vector, + negative_contig_dispatch_vector, + negative_strided_dispatch_vector); + }; + m.def("_negative", negative_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto negative_result_type_pyapi = [&](py::dtype dtype) { + return py_unary_ufunc_result_type(dtype, + negative_output_typeid_vector); + }; + 
m.def("_negative_result_type", negative_result_type_pyapi); + } // B20: ==== NOT_EQUAL (x1, x2) { @@ -2537,10 +2686,67 @@ void init_elementwise_functions(py::module_ m) } // U26: ==== POSITIVE (x) - // FIXME: + { + impl::populate_positive_dispatch_vectors(); + using impl::positive_contig_dispatch_vector; + using impl::positive_output_typeid_vector; + using impl::positive_strided_dispatch_vector; + + auto positive_pyapi = [&](arrayT src, arrayT dst, sycl::queue exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc(src, dst, exec_q, depends, + positive_output_typeid_vector, + positive_contig_dispatch_vector, + positive_strided_dispatch_vector); + }; + m.def("_positive", positive_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto positive_result_type_pyapi = [&](py::dtype dtype) { + return py_unary_ufunc_result_type(dtype, + positive_output_typeid_vector); + }; + m.def("_positive_result_type", positive_result_type_pyapi); + } // B21: ==== POW (x1, x2) - // FIXME: + { + + impl::populate_pow_dispatch_tables(); + using impl::pow_contig_dispatch_table; + using impl::pow_output_id_table; + using impl::pow_strided_dispatch_table; + + auto pow_pyapi = [&](dpctl::tensor::usm_ndarray src1, + dpctl::tensor::usm_ndarray src2, + dpctl::tensor::usm_ndarray dst, sycl::queue exec_q, + const std::vector &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, pow_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + pow_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + pow_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto pow_result_type_pyapi = [&](py::dtype dtype1, py::dtype dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + pow_output_id_table); + }; + m.def("_pow", pow_pyapi, "", py::arg("src1"), py::arg("src2"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_pow_result_type", pow_result_type_pyapi, ""); + } // U??: ==== PROJ (x) { @@ -2620,7 +2826,27 @@ void init_elementwise_functions(py::module_ m) // FIXME: // U32: ==== SQUARE (x) - // FIXME: + { + impl::populate_square_dispatch_vectors(); + using impl::square_contig_dispatch_vector; + using impl::square_output_typeid_vector; + using impl::square_strided_dispatch_vector; + + auto square_pyapi = [&](arrayT src, arrayT dst, sycl::queue exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, square_output_typeid_vector, + square_contig_dispatch_vector, square_strided_dispatch_vector); + }; + m.def("_square", square_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto square_result_type_pyapi = [&](py::dtype dtype) { + return py_unary_ufunc_result_type(dtype, + square_output_typeid_vector); + }; + m.def("_square_result_type", square_result_type_pyapi); + } // U33: ==== SQRT (x) { From 55caa045499696c414e9ad579a06d32e2dddea6d Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Tue, 4 Jul 2023 02:16:44 
-0700 Subject: [PATCH 2/4] Tests for negative, positive, pow, and square --- dpctl/tests/elementwise/test_negative.py | 79 ++++++++++++ dpctl/tests/elementwise/test_positive.py | 79 ++++++++++++ dpctl/tests/elementwise/test_pow.py | 154 +++++++++++++++++++++++ dpctl/tests/elementwise/test_square.py | 99 +++++++++++++++ 4 files changed, 411 insertions(+) create mode 100644 dpctl/tests/elementwise/test_negative.py create mode 100644 dpctl/tests/elementwise/test_positive.py create mode 100644 dpctl/tests/elementwise/test_pow.py create mode 100644 dpctl/tests/elementwise/test_square.py diff --git a/dpctl/tests/elementwise/test_negative.py b/dpctl/tests/elementwise/test_negative.py new file mode 100644 index 0000000000..3af6d7fcf5 --- /dev/null +++ b/dpctl/tests/elementwise/test_negative.py @@ -0,0 +1,79 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np +import pytest + +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +from .utils import _all_dtypes, _usm_types + + +@pytest.mark.parametrize("dtype", _all_dtypes[1:]) +def test_negative_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + X = dpt.asarray(0, dtype=arg_dt, sycl_queue=q) + assert dpt.negative(X).dtype == arg_dt + + r = dpt.empty_like(X, dtype=arg_dt) + dpt.negative(X, out=r) + assert np.allclose(dpt.asnumpy(r), dpt.asnumpy(dpt.negative(X))) + + +@pytest.mark.parametrize("usm_type", _usm_types) +def test_negative_usm_type(usm_type): + q = get_queue_or_skip() + + arg_dt = np.dtype("i4") + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q) + X[..., 0::2] = 1 + X[..., 1::2] = 0 + + Y = dpt.negative(X) + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == X.sycl_queue + assert Y.flags.c_contiguous + + expected_Y = np.negative(dpt.asnumpy(X)) + assert np.allclose(dpt.asnumpy(Y), expected_Y) + + +@pytest.mark.parametrize("dtype", _all_dtypes[1:]) +def test_negative_order(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q) + X[..., 0::2] = 1 + X[..., 1::2] = 0 + + for ord in ["C", "F", "A", "K"]: + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms) + Y = dpt.negative(U, order=ord) + expected_Y = np.negative(np.ones(Y.shape, dtype=Y.dtype)) + expected_Y[..., 1::2] = 0 + expected_Y = np.transpose(expected_Y, perms) + assert np.allclose(dpt.asnumpy(Y), expected_Y) diff --git a/dpctl/tests/elementwise/test_positive.py b/dpctl/tests/elementwise/test_positive.py new file mode 100644 index 0000000000..657c26d8cf --- /dev/null +++ b/dpctl/tests/elementwise/test_positive.py @@ -0,0 +1,79 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# 
Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np +import pytest + +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +from .utils import _all_dtypes, _usm_types + + +@pytest.mark.parametrize("dtype", _all_dtypes[1:]) +def test_positive_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + X = dpt.asarray(0, dtype=arg_dt, sycl_queue=q) + assert dpt.positive(X).dtype == arg_dt + + r = dpt.empty_like(X, dtype=arg_dt) + dpt.positive(X, out=r) + assert np.allclose(dpt.asnumpy(r), dpt.asnumpy(dpt.positive(X))) + + +@pytest.mark.parametrize("usm_type", _usm_types) +def test_positive_usm_type(usm_type): + q = get_queue_or_skip() + + arg_dt = np.dtype("i4") + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q) + X[..., 0::2] = 1 + X[..., 1::2] = 0 + + Y = dpt.positive(X) + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == X.sycl_queue + assert Y.flags.c_contiguous + + expected_Y = dpt.asnumpy(X) + assert np.allclose(dpt.asnumpy(Y), expected_Y) + + +@pytest.mark.parametrize("dtype", _all_dtypes[1:]) +def test_positive_order(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q) + X[..., 0::2] = 1 + X[..., 1::2] = 0 + + for ord in ["C", "F", "A", "K"]: + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms) + Y = dpt.positive(U, order=ord) + expected_Y = np.ones(Y.shape, dtype=Y.dtype) + expected_Y[..., 1::2] = 0 + expected_Y = np.transpose(expected_Y, perms) + assert np.allclose(dpt.asnumpy(Y), expected_Y) diff --git a/dpctl/tests/elementwise/test_pow.py b/dpctl/tests/elementwise/test_pow.py new file mode 100644 index 0000000000..1f13e2b533 --- /dev/null +++ b/dpctl/tests/elementwise/test_pow.py @@ -0,0 +1,154 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import ctypes + +import numpy as np +import pytest + +import dpctl +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +from .utils import _all_dtypes, _compare_dtypes, _usm_types + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes[1:]) +@pytest.mark.parametrize("op2_dtype", _all_dtypes[1:]) +def test_power_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.pow(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.power( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.ones(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.pow(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.power( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_power_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type) + + r = dpt.pow(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpctl.utils.get_coerced_usm_type( + (op1_usm_type, op2_usm_type) + ) + assert r.usm_type == expected_usm_type + + +def test_pow_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + ar2 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.pow(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.pow(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.pow(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.pow(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones((20, 20), dtype="i4", order="F") + ar2 = dpt.ones((20, 20), dtype="i4", order="F") + r1 = dpt.pow(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.pow(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.pow(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.pow(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.pow(ar1, ar2, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.pow(ar1, ar2, order="K") + assert r4.strides == (-1, 20) + + +def test_pow_broadcasting(): + get_queue_or_skip() + + v = dpt.arange(1, 6, dtype="i4") + m = dpt.full((100, 5), 2, dtype="i4") + + r = dpt.pow(m, v) + + expected = np.power( + np.full((100, 5), 2, dtype="i4"), np.arange(1, 6, dtype="i4") + ) + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + r2 = dpt.pow(v, m) + expected2 = np.power( + np.arange(1, 6, dtype="i4"), np.full((100, 5), 2, dtype="i4") + ) + assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all() + + 
+@pytest.mark.parametrize("arr_dt", _all_dtypes) +def test_pow_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q) + py_ones = ( + bool(1), + int(1), + float(1), + complex(1), + np.float32(1), + ctypes.c_int(1), + ) + for sc in py_ones: + R = dpt.pow(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.pow(sc, X) + assert isinstance(R, dpt.usm_ndarray) diff --git a/dpctl/tests/elementwise/test_square.py b/dpctl/tests/elementwise/test_square.py new file mode 100644 index 0000000000..95ec163e2f --- /dev/null +++ b/dpctl/tests/elementwise/test_square.py @@ -0,0 +1,99 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np +import pytest + +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +from .utils import _all_dtypes, _usm_types + + +@pytest.mark.parametrize("dtype", _all_dtypes[1:]) +def test_square_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + X = dpt.arange(5, dtype=arg_dt, sycl_queue=q) + assert dpt.square(X).dtype == arg_dt + + r = dpt.empty_like(X, dtype=arg_dt) + dpt.square(X, out=r) + assert np.allclose(dpt.asnumpy(r), dpt.asnumpy(dpt.square(X))) + + +@pytest.mark.parametrize("usm_type", _usm_types) +def test_square_usm_type(usm_type): + q = get_queue_or_skip() + + arg_dt = np.dtype("i4") + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q) + X[..., 0::2] = 1 + X[..., 1::2] = 0 + + Y = dpt.square(X) + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == X.sycl_queue + assert Y.flags.c_contiguous + + expected_Y = dpt.asnumpy(X) + assert np.allclose(dpt.asnumpy(Y), expected_Y) + + +@pytest.mark.parametrize("dtype", _all_dtypes[1:]) +def test_square_order(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q) + X[..., 0::2] = 2 + X[..., 1::2] = 0 + + for ord in ["C", "F", "A", "K"]: + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms) + Y = dpt.square(U, order=ord) + expected_Y = np.full(Y.shape, 4, dtype=Y.dtype) + expected_Y[..., 1::2] = 0 + expected_Y = np.transpose(expected_Y, perms) + assert np.allclose(dpt.asnumpy(Y), expected_Y) + + +@pytest.mark.parametrize("dtype", ["c8", "c16"]) +def test_square_special_cases(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + vals = [np.nan, np.inf, -np.inf, 0.0, -0.0] + X = dpt.asarray(vals, dtype=dtype, sycl_queue=q) + X_np = dpt.asnumpy(X) + + tol = 8 * dpt.finfo(dtype).resolution + with np.errstate(all="ignore"): + assert np.allclose( + dpt.asnumpy(dpt.square(X)), + np.square(X_np), + atol=tol, + rtol=tol, + equal_nan=True, + ) From 
602eef11c868508d494edeb51a16cfcefc819971 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Tue, 4 Jul 2023 03:28:57 -0700 Subject: [PATCH 3/4] pow no longer uses std::pow for integers - Not portable to devices without 64-bit precision --- .../kernels/elementwise_functions/pow.hpp | 27 +++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp index 9c5727c6d4..d4249c7574 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp @@ -27,6 +27,7 @@ #include #include #include +#include #include #include "utils/offset_utils.hpp" @@ -60,9 +61,31 @@ template struct PowFunctor std::is_integral, std::is_integral>>; - resT operator()(const argT1 &in1, const argT2 &in2) + resT operator()(argT1 in1, argT2 in2) { - return std::pow(in1, in2); + if constexpr (std::is_integral_v || std::is_integral_v) { + if constexpr (std::is_signed_v) { + if (in2 < 0) { + // invalid; return 0 + return resT(0); + } + } + resT res = 1; + if (in1 == 1 || in2 == 0) { + return res; + } + while (in2 > 0) { + if (in2 & 1) { + res *= in1; + } + in2 >>= 1; + in1 *= in1; + } + return res; + } + else { + return std::pow(in1, in2); + } } template From 47482334c2912d5914e0e0cb8d4e7d6a47e157f2 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Tue, 4 Jul 2023 04:30:22 -0700 Subject: [PATCH 4/4] Fixed docstrings for negative, positive, pow, and square --- dpctl/tensor/_elementwise_funcs.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/dpctl/tensor/_elementwise_funcs.py b/dpctl/tensor/_elementwise_funcs.py index 49e7309998..335e0350ac 100644 --- a/dpctl/tensor/_elementwise_funcs.py +++ b/dpctl/tensor/_elementwise_funcs.py @@ -718,7 +718,7 @@ _negative_docstring_ = """ negative(x, out=None, order='K') -Computes the numerical negative elementwise. +Computes the numerical negative for each element `x_i` of input array `x`. Args: x (usm_ndarray): Input array, expected to have numeric data type. @@ -730,7 +730,7 @@ Default: "K". Return: usm_ndarray: - An array containing the element-wise negative values. + An array containing the negative of `x`. """ negative = UnaryElementwiseFunc( @@ -770,7 +770,7 @@ _positive_docstring_ = """ positive(x, out=None, order='K') -Computes the numerical positive element-wise. +Computes the numerical positive for each element `x_i` of input array `x`. Args: x (usm_ndarray): Input array, expected to have numeric data type. @@ -782,7 +782,7 @@ Default: "K". Return: usm_ndarray: - An array containing the element-wise positive values. + An array containing the values of `x`. """ positive = UnaryElementwiseFunc( @@ -802,7 +802,7 @@ x2 (usm_ndarray): Second input array, also expected to have a numeric data type. Returns: - usm_narray: + usm_ndarray: an array containing the element-wise result. The data type of the returned array is determined by the Type Promotion Rules. """ @@ -899,7 +899,21 @@ _square_docstring_ = """ square(x, out=None, order='K') -Computes `x_i**2` for each element `x_i` for input array `x`. +Computes `x_i**2` (or `x_i*x_i`) for each element `x_i` of input array `x`. +Args: + x (usm_ndarray): + Input array, expected to have numeric data type. + out ({None, usm_ndarray}, optional): + Output array to populate. + Array have the correct shape and the expected data type. 
+ order ("C","F","A","K", optional): + Memory layout of the newly output array, if parameter `out` is `None`. + Default: "K". +Returns: + usm_ndarray: + An array containing the square `x`. + The data type of the returned array is determined by + the Type Promotion Rules. """ square = UnaryElementwiseFunc(