diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index be9426a834..0bd7a30cd0 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -100,7 +100,9 @@ isfinite, isinf, isnan, + multiply, sqrt, + subtract, ) __all__ = [ @@ -186,5 +188,7 @@ "isfinite", "sqrt", "divide", + "multiply", + "subtract", "equal", ] diff --git a/dpctl/tensor/_elementwise_funcs.py b/dpctl/tensor/_elementwise_funcs.py index 27c549c034..29d1052c9c 100644 --- a/dpctl/tensor/_elementwise_funcs.py +++ b/dpctl/tensor/_elementwise_funcs.py @@ -34,7 +34,7 @@ # B01: ===== ADD (x1, x2) _add_docstring_ = """ -add(x1, x2, order='K') +add(x1, x2, out=None, order='K') Calculates the sum for each element `x1_i` of the input array `x1` with the respective element `x2_i` of the input array `x2`. @@ -94,7 +94,7 @@ # U11: ==== COS (x) _cos_docstring = """ -cos(x, order='K') +cos(x, out=None, order='K') Computes cosine for each element `x_i` for input array `x`. """ @@ -106,7 +106,7 @@ # B08: ==== DIVIDE (x1, x2) _divide_docstring_ = """ -divide(x1, x2, order='K') +divide(x1, x2, out=None, order='K') Calculates the ratio for each element `x1_i` of the input array `x1` with the respective element `x2_i` of the input array `x2`. @@ -128,7 +128,7 @@ # B09: ==== EQUAL (x1, x2) _equal_docstring_ = """ -equal(x1, x2, order='K') +equal(x1, x2, out=None, order='K') Calculates equality test results for each element `x1_i` of the input array `x1` with the respective element `x2_i` of the input array `x2`. @@ -172,6 +172,8 @@ # U17: ==== ISFINITE (x) _isfinite_docstring_ = """ +isfinite(x, out=None, order='K') + Computes if every element of input array is a finite number. """ @@ -181,6 +183,8 @@ # U18: ==== ISINF (x) _isinf_docstring_ = """ +isinf(x, out=None, order='K') + Computes if every element of input array is an infinity. """ @@ -190,6 +194,8 @@ # U19: ==== ISNAN (x) _isnan_docstring_ = """ +isnan(x, out=None, order='K') + Computes if every element of input array is a NaN. """ @@ -231,7 +237,25 @@ # FIXME: implement B18 # B19: ==== MULTIPLY (x1, x2) -# FIXME: implement B19 +_multiply_docstring_ = """ +multiply(x1, x2, out=None, order='K') + +Calculates the product for each element `x1_i` of the input array `x1` +with the respective element `x2_i` of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array, expected to have numeric data type. + x2 (usm_ndarray): + Second input array, also expected to have numeric data type. +Returns: + usm_ndarray: + an array containing the element-wise products. The data type of + the returned array is determined by the Type Promotion Rules. +""" +multiply = BinaryElementwiseFunc( + "multiply", ti._multiply_result_type, ti._multiply, _multiply_docstring_ +) # U25: ==== NEGATIVE (x) # FIXME: implement U25 @@ -268,6 +292,8 @@ # U33: ==== SQRT (x) _sqrt_docstring_ = """ +sqrt(x, out=None, order='K') + Computes sqrt for each element `x_i` for input array `x`. """ @@ -276,7 +302,26 @@ ) # B23: ==== SUBTRACT (x1, x2) -# FIXME: implement B23 +_subtract_docstring_ = """ +subtract(x1, x2, out=None, order='K') + +Calculates the difference between each element `x1_i` of the input +array `x1` and the respective element `x2_i` of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array, expected to have numeric data type. + x2 (usm_ndarray): + Second input array, also expected to have numeric data type. +Returns: + usm_ndarray: + an array containing the element-wise differences. The data type + of the returned array is determined by the Type Promotion Rules. 
+""" +subtract = BinaryElementwiseFunc( + "subtract", ti._subtract_result_type, ti._subtract, _subtract_docstring_ +) + # U34: ==== TAN (x) # FIXME: implement U34 diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp index 09f2995874..c734fd54ec 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp @@ -115,27 +115,9 @@ sycl::event abs_contig_impl(sycl::queue exec_q, char *res_p, const std::vector &depends = {}) { - sycl::event abs_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - - size_t lws = 64; - constexpr unsigned int vec_sz = 4; - constexpr unsigned int n_vecs = 2; - const size_t n_groups = - ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); - const auto gws_range = sycl::range<1>(n_groups * lws); - const auto lws_range = sycl::range<1>(lws); - - using resTy = typename AbsOutputType::value_type; - const argTy *arg_tp = reinterpret_cast(arg_p); - resTy *res_tp = reinterpret_cast(res_p); - - cgh.parallel_for>( - sycl::nd_range<1>(gws_range, lws_range), - AbsContigFunctor(arg_tp, res_tp, - nelems)); - }); - return abs_ev; + return elementwise_common::unary_contig_impl< + argTy, AbsOutputType, AbsContigFunctor, abs_contig_kernel>( + exec_q, nelems, arg_p, res_p, depends); } template struct AbsContigFactory @@ -182,24 +164,10 @@ sycl::event abs_strided_impl(sycl::queue exec_q, const std::vector &depends, const std::vector &additional_depends) { - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.depends_on(additional_depends); - - using resTy = typename AbsOutputType::value_type; - using IndexerT = - typename dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; - - IndexerT indexer{nd, arg_offset, res_offset, shape_and_strides}; - - const argTy *arg_tp = reinterpret_cast(arg_p); - resTy *res_tp = reinterpret_cast(res_p); - - cgh.parallel_for>( - {nelems}, - AbsStridedFunctor(arg_tp, res_tp, indexer)); - }); - return comp_ev; + return elementwise_common::unary_strided_impl< + argTy, AbsOutputType, AbsStridedFunctor, abs_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); } template struct AbsStridedFactory diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp index d0ab25d270..2f3dec3d14 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp @@ -184,32 +184,10 @@ sycl::event add_contig_impl(sycl::queue exec_q, py::ssize_t res_offset, const std::vector &depends = {}) { - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - - size_t lws = 64; - constexpr unsigned int vec_sz = 4; - constexpr unsigned int n_vecs = 2; - const size_t n_groups = - ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); - const auto gws_range = sycl::range<1>(n_groups * lws); - const auto lws_range = sycl::range<1>(lws); - - using resTy = typename AddOutputType::value_type; - - const argTy1 *arg1_tp = - reinterpret_cast(arg1_p) + arg1_offset; - const argTy2 *arg2_tp = - reinterpret_cast(arg2_p) + arg2_offset; - resTy *res_tp = reinterpret_cast(res_p) + res_offset; - - cgh.parallel_for< - add_contig_kernel>( - 
sycl::nd_range<1>(gws_range, lws_range), - AddContigFunctor( - arg1_tp, arg2_tp, res_tp, nelems)); - }); - return comp_ev; + return elementwise_common::binary_contig_impl< + argTy1, argTy2, AddOutputType, AddContigFunctor, add_contig_kernel>( + exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends); } template struct AddContigFactory @@ -256,28 +234,11 @@ sycl::event add_strided_impl(sycl::queue exec_q, const std::vector &depends, const std::vector &additional_depends) { - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.depends_on(additional_depends); - - using resTy = typename AddOutputType::value_type; - - using IndexerT = - typename dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer; - - IndexerT indexer{nd, arg1_offset, arg2_offset, res_offset, - shape_and_strides}; - - const argTy1 *arg1_tp = reinterpret_cast(arg1_p); - const argTy2 *arg2_tp = reinterpret_cast(arg2_p); - resTy *res_tp = reinterpret_cast(res_p); - - cgh.parallel_for< - add_strided_strided_kernel>( - {nelems}, AddStridedFunctor( - arg1_tp, arg2_tp, res_tp, indexer)); - }); - return comp_ev; + return elementwise_common::binary_strided_impl< + argTy1, argTy2, AddOutputType, AddStridedFunctor, + add_strided_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); } template struct AddStridedFactory @@ -322,62 +283,11 @@ sycl::event add_contig_matrix_contig_row_broadcast_impl( py::ssize_t res_offset, const std::vector &depends = {}) { - const argT1 *mat = reinterpret_cast(mat_p) + mat_offset; - const argT2 *vec = reinterpret_cast(vec_p) + vec_offset; - resT *res = reinterpret_cast(res_p) + res_offset; - - const auto &dev = exec_q.get_device(); - const auto &sg_sizes = dev.get_info(); - // Get device-specific kernel info max_sub_group_size - size_t max_sgSize = - *(std::max_element(std::begin(sg_sizes), std::end(sg_sizes))); - - size_t n1_padded = n1 + max_sgSize; - argT2 *padded_vec = sycl::malloc_device(n1_padded, exec_q); - - if (padded_vec == nullptr) { - throw std::runtime_error("Could not allocate memory on the device"); - } - sycl::event make_padded_vec_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); // ensure vec contains actual data - cgh.parallel_for({n1_padded}, [=](sycl::id<1> id) { - auto i = id[0]; - padded_vec[i] = vec[i % n1]; - }); - }); - - // sub-group spans work-items [I, I + sgSize) - // base = ndit.get_global_linear_id() - sg.get_local_id()[0] - // Generically, sg.load( &mat[base]) may load arrays from - // different rows of mat. The start corresponds to row (base / n0) - // We read sg.load(&padded_vec[(base / n0)]). 
The vector is padded to - // ensure that reads are accessible - - size_t lws = 64; - - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(make_padded_vec_ev); - - auto lwsRange = sycl::range<1>(lws); - size_t n_elems = n0 * n1; - size_t n_groups = (n_elems + lws - 1) / lws; - auto gwsRange = sycl::range<1>(n_groups * lws); - - cgh.parallel_for< - class add_matrix_row_broadcast_sg_krn>( - sycl::nd_range<1>(gwsRange, lwsRange), - AddContigMatrixContigRowBroadcastingFunctor( - mat, padded_vec, res, n_elems, n1)); - }); - - sycl::event tmp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(comp_ev); - sycl::context ctx = exec_q.get_context(); - cgh.host_task([ctx, padded_vec]() { sycl::free(padded_vec, ctx); }); - }); - host_tasks.push_back(tmp_cleanup_ev); - - return comp_ev; + return elementwise_common::binary_contig_matrix_contig_row_broadcast_impl< + argT1, argT2, resT, AddContigMatrixContigRowBroadcastingFunctor, + add_matrix_row_broadcast_sg_krn>(exec_q, host_tasks, n0, n1, mat_p, + mat_offset, vec_p, vec_offset, res_p, + res_offset, depends); } template diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp index a1108c541c..ed38f380df 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp @@ -249,6 +249,81 @@ struct UnaryStridedFunctor } }; +template + class UnaryOutputType, + template + class ContigFunctorT, + template + class kernel_name, + unsigned int vec_sz = 4, + unsigned int n_vecs = 2> +sycl::event unary_contig_impl(sycl::queue exec_q, + size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + size_t lws = 64; + const size_t n_groups = + ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); + const auto gws_range = sycl::range<1>(n_groups * lws); + const auto lws_range = sycl::range<1>(lws); + + using resTy = typename UnaryOutputType::value_type; + const argTy *arg_tp = reinterpret_cast(arg_p); + resTy *res_tp = reinterpret_cast(res_p); + + cgh.parallel_for>( + sycl::nd_range<1>(gws_range, lws_range), + ContigFunctorT(arg_tp, res_tp, + nelems)); + }); + return comp_ev; +} + +template + class UnaryOutputType, + template + class StridedFunctorT, + template + class kernel_name> +sycl::event +unary_strided_impl(sycl::queue exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + using resTy = typename UnaryOutputType::value_type; + using IndexerT = + typename dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + + IndexerT indexer{nd, arg_offset, res_offset, shape_and_strides}; + + const argTy *arg_tp = reinterpret_cast(arg_p); + resTy *res_tp = reinterpret_cast(res_p); + + cgh.parallel_for>( + {nelems}, + StridedFunctorT(arg_tp, res_tp, indexer)); + }); + return comp_ev; +} + template &); +template + class BinaryOutputType, + template + class BinaryContigFunctorT, + template + class kernel_name, + unsigned int vec_sz = 4, + unsigned int n_vecs = 2> +sycl::event 
binary_contig_impl(sycl::queue exec_q, + size_t nelems, + const char *arg1_p, + py::ssize_t arg1_offset, + const char *arg2_p, + py::ssize_t arg2_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + size_t lws = 64; + const size_t n_groups = + ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); + const auto gws_range = sycl::range<1>(n_groups * lws); + const auto lws_range = sycl::range<1>(lws); + + using resTy = typename BinaryOutputType::value_type; + + const argTy1 *arg1_tp = + reinterpret_cast(arg1_p) + arg1_offset; + const argTy2 *arg2_tp = + reinterpret_cast(arg2_p) + arg2_offset; + resTy *res_tp = reinterpret_cast(res_p) + res_offset; + + cgh.parallel_for>( + sycl::nd_range<1>(gws_range, lws_range), + BinaryContigFunctorT( + arg1_tp, arg2_tp, res_tp, nelems)); + }); + return comp_ev; +} + +template + class BinaryOutputType, + template + class BinaryStridedFunctorT, + template + class kernel_name> +sycl::event +binary_strided_impl(sycl::queue exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg1_p, + py::ssize_t arg1_offset, + const char *arg2_p, + py::ssize_t arg2_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + using resTy = typename BinaryOutputType::value_type; + + using IndexerT = + typename dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer; + + IndexerT indexer{nd, arg1_offset, arg2_offset, res_offset, + shape_and_strides}; + + const argTy1 *arg1_tp = reinterpret_cast(arg1_p); + const argTy2 *arg2_tp = reinterpret_cast(arg2_p); + resTy *res_tp = reinterpret_cast(res_p); + + cgh.parallel_for>( + {nelems}, BinaryStridedFunctorT( + arg1_tp, arg2_tp, res_tp, indexer)); + }); + return comp_ev; +} + +template + class BinaryContigMatrixContigRowBroadcastFunctorT, + template + class kernel_name> +sycl::event binary_contig_matrix_contig_row_broadcast_impl( + sycl::queue exec_q, + std::vector &host_tasks, + size_t n0, + size_t n1, + const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + py::ssize_t mat_offset, + const char *vec_p, // typeless pointer to (n1,) contiguous row + py::ssize_t vec_offset, + char *res_p, // typeless pointer to (n0, n1) result C-contig. 
matrix, + // res[i,j] = op(mat[i,j], vec[j]) + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + const argT1 *mat = reinterpret_cast(mat_p) + mat_offset; + const argT2 *vec = reinterpret_cast(vec_p) + vec_offset; + resT *res = reinterpret_cast(res_p) + res_offset; + + const auto &dev = exec_q.get_device(); + const auto &sg_sizes = dev.get_info(); + // Get device-specific kernel info max_sub_group_size + size_t max_sgSize = + *(std::max_element(std::begin(sg_sizes), std::end(sg_sizes))); + + size_t n1_padded = n1 + max_sgSize; + argT2 *padded_vec = sycl::malloc_device(n1_padded, exec_q); + + if (padded_vec == nullptr) { + throw std::runtime_error("Could not allocate memory on the device"); + } + sycl::event make_padded_vec_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); // ensure vec contains actual data + cgh.parallel_for({n1_padded}, [=](sycl::id<1> id) { + auto i = id[0]; + padded_vec[i] = vec[i % n1]; + }); + }); + + // sub-group spans work-items [I, I + sgSize) + // base = ndit.get_global_linear_id() - sg.get_local_id()[0] + // Generically, sg.load( &mat[base]) may load arrays from + // different rows of mat. The start corresponds to row (base / n1) + // We read sg.load(&padded_vec[(base % n1)]). The vector is padded to + // ensure that such reads stay in bounds + + size_t lws = 64; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(make_padded_vec_ev); + + auto lwsRange = sycl::range<1>(lws); + size_t n_elems = n0 * n1; + size_t n_groups = (n_elems + lws - 1) / lws; + auto gwsRange = sycl::range<1>(n_groups * lws); + + cgh.parallel_for>( + sycl::nd_range<1>(gwsRange, lwsRange), + BinaryContigMatrixContigRowBroadcastFunctorT( + mat, padded_vec, res, n_elems, n1)); + }); + + sycl::event tmp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(comp_ev); + sycl::context ctx = exec_q.get_context(); + cgh.host_task([ctx, padded_vec]() { sycl::free(padded_vec, ctx); }); + }); + host_tasks.push_back(tmp_cleanup_ev); + + return comp_ev; +} + +template + class BinaryContigRowContigMatrixBroadcastFunctorT, + template + class kernel_name> +sycl::event binary_contig_row_contig_matrix_broadcast_impl( + sycl::queue exec_q, + std::vector &host_tasks, + size_t n0, + size_t n1, + const char *vec_p, // typeless pointer to (n1,) contiguous row + py::ssize_t vec_offset, + const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + py::ssize_t mat_offset, + char *res_p, // typeless pointer to (n0, n1) result C-contig. 
matrix, + // res[i,j] = op(vec[j], mat[i,j]) + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + const argT1 *vec = reinterpret_cast(vec_p) + vec_offset; + const argT2 *mat = reinterpret_cast(mat_p) + mat_offset; + resT *res = reinterpret_cast(res_p) + res_offset; + + const auto &dev = exec_q.get_device(); + const auto &sg_sizes = dev.get_info(); + // Get device-specific kernel info max_sub_group_size + size_t max_sgSize = + *(std::max_element(std::begin(sg_sizes), std::end(sg_sizes))); + + size_t n1_padded = n1 + max_sgSize; + argT1 *padded_vec = sycl::malloc_device(n1_padded, exec_q); + + if (padded_vec == nullptr) { + throw std::runtime_error("Could not allocate memory on the device"); + } + + sycl::event make_padded_vec_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); // ensure vec contains actual data + cgh.parallel_for({n1_padded}, [=](sycl::id<1> id) { + auto i = id[0]; + padded_vec[i] = vec[i % n1]; + }); + }); + + // sub-group spans work-items [I, I + sgSize) + // base = ndit.get_global_linear_id() - sg.get_local_id()[0] + // Generically, sg.load( &mat[base]) may load arrays from + // different rows of mat. The start corresponds to row (base / n1) + // We read sg.load(&padded_vec[(base % n1)]). The vector is padded to + // ensure that such reads stay in bounds + + size_t lws = 64; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(make_padded_vec_ev); + + auto lwsRange = sycl::range<1>(lws); + size_t n_elems = n0 * n1; + size_t n_groups = (n_elems + lws - 1) / lws; + auto gwsRange = sycl::range<1>(n_groups * lws); + + cgh.parallel_for>( + sycl::nd_range<1>(gwsRange, lwsRange), + BinaryContigRowContigMatrixBroadcastFunctorT( + padded_vec, mat, res, n_elems, n1)); + }); + + sycl::event tmp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(comp_ev); + sycl::context ctx = exec_q.get_context(); + cgh.host_task([ctx, padded_vec]() { sycl::free(padded_vec, ctx); }); + }); + host_tasks.push_back(tmp_cleanup_ev); + + return comp_ev; +} + } // namespace elementwise_common } // namespace kernels } // namespace tensor diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp index b6859910e3..7639facb07 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp @@ -103,27 +103,9 @@ sycl::event cos_contig_impl(sycl::queue exec_q, char *res_p, const std::vector &depends = {}) { - sycl::event cos_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - constexpr size_t lws = 64; - constexpr unsigned int vec_sz = 4; - constexpr unsigned int n_vecs = 2; - static_assert(lws % vec_sz == 0); - auto gws_range = sycl::range<1>( - ((nelems + n_vecs * lws * vec_sz - 1) / (lws * n_vecs * vec_sz)) * - lws); - auto lws_range = sycl::range<1>(lws); - - using resTy = typename CosOutputType::value_type; - const argTy *arg_tp = reinterpret_cast(arg_p); - resTy *res_tp = reinterpret_cast(res_p); - - cgh.parallel_for>( - sycl::nd_range<1>(gws_range, lws_range), - CosContigFunctor(arg_tp, res_tp, - nelems)); - }); - return cos_ev; + return elementwise_common::unary_contig_impl< + argTy, CosOutputType, CosContigFunctor, cos_contig_kernel>( + exec_q, nelems, arg_p, res_p, depends); } template struct CosContigFactory @@ -166,26 +148,10 @@ sycl::event cos_strided_impl(sycl::queue exec_q, const std::vector &depends, const 
std::vector &additional_depends) { - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.depends_on(additional_depends); - - using resTy = typename CosOutputType::value_type; - using IndexerT = - typename dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; - - IndexerT arg_res_indexer(nd, arg_offset, res_offset, shape_and_strides); - - const argTy *arg_tp = reinterpret_cast(arg_p); - resTy *res_tp = reinterpret_cast(res_p); - - sycl::range<1> gRange{nelems}; - - cgh.parallel_for>( - gRange, CosStridedFunctor(arg_tp, res_tp, - arg_res_indexer)); - }); - return comp_ev; + return elementwise_common::unary_strided_impl< + argTy, CosOutputType, CosStridedFunctor, cos_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); } template struct CosStridedFactory diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp index edbbc393c5..a77ad54e85 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp @@ -167,32 +167,10 @@ sycl::event equal_contig_impl(sycl::queue exec_q, py::ssize_t res_offset, const std::vector &depends = {}) { - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - - size_t lws = 64; - constexpr unsigned int vec_sz = 4; - constexpr unsigned int n_vecs = 2; - const size_t n_groups = - ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); - const auto gws_range = sycl::range<1>(n_groups * lws); - const auto lws_range = sycl::range<1>(lws); - - using resTy = typename EqualOutputType::value_type; - - const argTy1 *arg1_tp = - reinterpret_cast(arg1_p) + arg1_offset; - const argTy2 *arg2_tp = - reinterpret_cast(arg2_p) + arg2_offset; - resTy *res_tp = reinterpret_cast(res_p) + res_offset; - - cgh.parallel_for< - equal_contig_kernel>( - sycl::nd_range<1>(gws_range, lws_range), - EqualContigFunctor( - arg1_tp, arg2_tp, res_tp, nelems)); - }); - return comp_ev; + return elementwise_common::binary_contig_impl< + argTy1, argTy2, EqualOutputType, EqualContigFunctor, + equal_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends); } template struct EqualContigFactory @@ -240,28 +218,11 @@ equal_strided_impl(sycl::queue exec_q, const std::vector &depends, const std::vector &additional_depends) { - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.depends_on(additional_depends); - - using resTy = typename EqualOutputType::value_type; - - using IndexerT = - typename dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer; - - IndexerT indexer{nd, arg1_offset, arg2_offset, res_offset, - shape_and_strides}; - - const argTy1 *arg1_tp = reinterpret_cast(arg1_p); - const argTy2 *arg2_tp = reinterpret_cast(arg2_p); - resTy *res_tp = reinterpret_cast(res_p); - - cgh.parallel_for< - equal_strided_strided_kernel>( - {nelems}, EqualStridedFunctor( - arg1_tp, arg2_tp, res_tp, indexer)); - }); - return comp_ev; + return elementwise_common::binary_strided_impl< + argTy1, argTy2, EqualOutputType, EqualStridedFunctor, + equal_strided_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); } template struct EqualStridedFactory diff --git 
a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp index 86340329da..cc91bdbb97 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp @@ -113,7 +113,7 @@ template struct IsFiniteOutputType using value_type = bool; }; -template +template class isfinite_contig_kernel; template @@ -123,29 +123,10 @@ sycl::event isfinite_contig_impl(sycl::queue exec_q, char *res_p, const std::vector &depends = {}) { - sycl::event isfinite_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - - constexpr size_t lws = 64; - constexpr std::uint8_t vec_sz = 4; - constexpr std::uint8_t n_vecs = 2; - static_assert(lws % vec_sz == 0); - size_t n_groups = - ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); - auto gws_range = sycl::range<1>(n_groups * lws); - auto lws_range = sycl::range<1>(lws); - - using resTy = typename IsFiniteOutputType::value_type; - const argTy *arg_tp = reinterpret_cast(arg_p); - resTy *res_tp = reinterpret_cast(res_p); - - cgh.parallel_for< - class isfinite_contig_kernel>( - sycl::nd_range<1>(gws_range, lws_range), - IsFiniteContigFunctor(arg_tp, res_tp, - nelems)); - }); - return isfinite_ev; + return elementwise_common::unary_contig_impl( + exec_q, nelems, arg_p, res_p, depends); } template struct IsFiniteContigFactory @@ -182,24 +163,11 @@ isfinite_strided_impl(sycl::queue exec_q, const std::vector &depends, const std::vector &additional_depends) { - sycl::event isfinite_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.depends_on(additional_depends); - - using resTy = typename IsFiniteOutputType::value_type; - using IndexerT = - typename dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; - - IndexerT arg_res_indexer{nd, arg_offset, res_offset, shape_and_strides}; - - const argTy *arg_tptr = reinterpret_cast(arg_p); - resTy *res_tptr = reinterpret_cast(res_p); - - cgh.parallel_for>( - {nelems}, IsFiniteStridedFunctor( - arg_tptr, res_tptr, arg_res_indexer)); - }); - return isfinite_ev; + return elementwise_common::unary_strided_impl( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); } template struct IsFiniteStridedFactory diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp index 8f7dcca6c7..8cb93ea456 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp @@ -111,7 +111,7 @@ template struct IsInfOutputType using value_type = bool; }; -template +template class isinf_contig_kernel; template @@ -121,28 +121,9 @@ sycl::event isinf_contig_impl(sycl::queue exec_q, char *res_p, const std::vector &depends = {}) { - sycl::event isinf_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - constexpr size_t lws = 64; - constexpr std::uint8_t vec_sz = 4; - constexpr std::uint8_t n_vecs = 2; - static_assert(lws % vec_sz == 0); - auto gws_range = sycl::range<1>( - ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)) * - lws); - auto lws_range = sycl::range<1>(lws); - - using resTy = typename IsInfOutputType::value_type; - const argTy *arg_tp = reinterpret_cast(arg_p); - resTy *res_tp = reinterpret_cast(res_p); - - 
cgh.parallel_for< - class isinf_contig_kernel>( - sycl::nd_range<1>(gws_range, lws_range), - IsInfContigFunctor(arg_tp, res_tp, - nelems)); - }); - return isinf_ev; + return elementwise_common::unary_contig_impl< + argTy, IsInfOutputType, IsInfContigFunctor, isinf_contig_kernel>( + exec_q, nelems, arg_p, res_p, depends); } template struct IsInfContigFactory @@ -179,26 +160,10 @@ isinf_strided_impl(sycl::queue exec_q, const std::vector &depends, const std::vector &additional_depends) { - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.depends_on(additional_depends); - - using resTy = typename IsInfOutputType::value_type; - using IndexerT = - typename dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; - - IndexerT arg_res_indexer{nd, arg_offset, res_offset, shape_and_strides}; - - const argTy *arg_tptr = reinterpret_cast(arg_p); - resTy *res_tptr = reinterpret_cast(res_p); - - sycl::range<1> gRange{nelems}; - - cgh.parallel_for>( - gRange, IsInfStridedFunctor( - arg_tptr, res_tptr, arg_res_indexer)); - }); - return comp_ev; + return elementwise_common::unary_strided_impl< + argTy, IsInfOutputType, IsInfStridedFunctor, isinf_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); } template struct IsInfStridedFactory diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp index 3e4f68ed57..5d628437ed 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp @@ -104,119 +104,12 @@ template using IsNanStridedFunctor = elementwise_common:: UnaryStridedFunctor>; -template -struct IsNanContigFunctorOld -{ -private: - const argT *in = nullptr; - resT *out = nullptr; - const size_t nelems_; - -public: - IsNanContigFunctorOld(const argT *inp, resT *res, const size_t nelems) - : in(inp), out(res), nelems_(nelems) - { - } - - void operator()(sycl::nd_item<1> ndit) const - { - using dpctl::tensor::type_utils::is_complex; - using dpctl::tensor::type_utils::vec_cast; - - if constexpr (is_complex::value) { - std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0]; - size_t base = ndit.get_global_linear_id(); - - base = (base / sgSize) * sgSize * n_vecs * vec_sz + (base % sgSize); - for (size_t offset = base; - offset < std::min(nelems_, base + sgSize * (n_vecs * vec_sz)); - offset += sgSize) - { - const bool real_isnan = sycl::isnan(std::real(in[offset])); - const bool imag_isnan = sycl::isnan(std::imag(in[offset])); - out[offset] = real_isnan || imag_isnan; - } - } - else if constexpr (std::is_same::value || - std::is_integral::value) - { - using out_ptrT = - sycl::multi_ptr; - - auto sg = ndit.get_sub_group(); - std::uint8_t sgSize = sg.get_local_range()[0]; - std::uint8_t max_sgSize = sg.get_max_local_range()[0]; - size_t base = n_vecs * vec_sz * - (ndit.get_group(0) * ndit.get_local_range(0) + - sg.get_group_id()[0] * sgSize); - if (base + n_vecs * vec_sz * max_sgSize < nelems_ && - max_sgSize == sgSize) { - sycl::vec res_vec(false); -#pragma unroll - for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - sg.store(out_ptrT(&out[base + it * sgSize]), - res_vec); - } - } - else { - for (size_t k = base + sg.get_local_id()[0]; k < nelems_; - k += sgSize) { - out[k] = false; - } - } - } - else { - using in_ptrT = - sycl::multi_ptr; - using out_ptrT = - 
sycl::multi_ptr; - static_assert(std::is_same::value); - - auto sg = ndit.get_sub_group(); - std::uint16_t sgSize = sg.get_local_range()[0]; - std::uint16_t max_sgSize = sg.get_max_local_range()[0]; - size_t base = n_vecs * vec_sz * - (ndit.get_group(0) * ndit.get_local_range(0) + - sg.get_group_id()[0] * max_sgSize); - if (base + n_vecs * vec_sz * max_sgSize < nelems_ && - sgSize == max_sgSize) { - sycl::vec x; - -#pragma unroll - for (std::uint16_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - x = sg.load(in_ptrT(&in[base + it * sgSize])); - // returns vec - auto res_vec = sycl::isnan(x); - // cast it to bool - sycl::vec res_bool = - vec_cast(res_vec); - sg.store(out_ptrT(&out[base + it * sgSize]), - res_bool); - } - } - else { - for (size_t k = base + sg.get_local_id()[0]; k < nelems_; - k += sgSize) { - out[k] = static_cast(sycl::isnan(in[k])); - } - } - } - } -}; - template struct IsNanOutputType { using value_type = bool; }; -template +template class isnan_contig_kernel; template @@ -226,28 +119,9 @@ sycl::event isnan_contig_impl(sycl::queue exec_q, char *res_p, const std::vector &depends = {}) { - sycl::event isnan_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - constexpr size_t lws = 64; - constexpr std::uint8_t vec_sz = 4; - constexpr std::uint8_t n_vecs = 2; - static_assert(lws % vec_sz == 0); - auto gws_range = sycl::range<1>( - ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)) * - lws); - auto lws_range = sycl::range<1>(lws); - - using resTy = typename IsNanOutputType::value_type; - const argTy *arg_tp = reinterpret_cast(arg_p); - resTy *res_tp = reinterpret_cast(res_p); - - cgh.parallel_for< - class isnan_contig_kernel>( - sycl::nd_range<1>(gws_range, lws_range), - IsNanContigFunctor(arg_tp, res_tp, - nelems)); - }); - return isnan_ev; + return elementwise_common::unary_contig_impl< + argTy, IsNanOutputType, IsNanContigFunctor, isnan_contig_kernel>( + exec_q, nelems, arg_p, res_p, depends); } template struct IsNanContigFactory @@ -269,48 +143,6 @@ template struct IsNanTypeMapFactory } }; -template -struct IsNanStridedFunctorOld -{ -private: - const argT *inp_ = nullptr; - resT *res_ = nullptr; - IndexerT inp_out_indexer_; - -public: - IsNanStridedFunctorOld(const argT *inp_p, - resT *res_p, - IndexerT inp_out_indexer) - : inp_(inp_p), res_(res_p), inp_out_indexer_(inp_out_indexer) - { - } - - void operator()(sycl::id<1> wid) const - { - const argT *const &in = inp_; - resT *const &out = res_; - - auto offsets_ = inp_out_indexer_(wid.get(0)); - const py::ssize_t &inp_offset = offsets_.get_first_offset(); - const py::ssize_t &out_offset = offsets_.get_second_offset(); - - using dpctl::tensor::type_utils::is_complex; - if constexpr (std::is_same_v || - (std::is_integral::value)) { - out[out_offset] = false; - } - else if constexpr (is_complex::value) { - const bool real_isnan = sycl::isnan(std::real(in[inp_offset])); - const bool imag_isnan = sycl::isnan(std::imag(in[inp_offset])); - - out[out_offset] = real_isnan || imag_isnan; - } - else { - out[out_offset] = sycl::isnan(in[inp_offset]); - } - } -}; - template class isnan_strided_kernel; template @@ -326,24 +158,10 @@ isnan_strided_impl(sycl::queue exec_q, const std::vector &depends, const std::vector &additional_depends) { - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.depends_on(additional_depends); - - using resTy = typename IsNanOutputType::value_type; - using IndexerT = - typename 
dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; - - IndexerT arg_res_indexer{nd, arg_offset, res_offset, shape_and_strides}; - - const argTy *arg_tptr = reinterpret_cast(arg_p); - resTy *res_tptr = reinterpret_cast(res_p); - - cgh.parallel_for>( - {nelems}, IsNanStridedFunctor( - arg_tptr, res_tptr, arg_res_indexer)); - }); - return comp_ev; + return elementwise_common::unary_strided_impl< + argTy, IsNanOutputType, IsNanStridedFunctor, isnan_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); } template struct IsNanStridedFactory diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp new file mode 100644 index 0000000000..71998e0dac --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp @@ -0,0 +1,377 @@ +//=== multiply.hpp - Binary function MUL ------ *-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of MUL(x1, x2) +/// function. 
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace multiply +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template struct MultiplyFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::negation< + std::disjunction, tu_ns::is_complex>>; + + resT operator()(const argT1 &in1, const argT2 &in2) + { + return in1 * in2; + } + + template + sycl::vec operator()(const sycl::vec &in1, + const sycl::vec &in2) + { + auto tmp = in1 * in2; + if constexpr (std::is_same_v) { + return tmp; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + tmp); + } + } +}; + +template +using MultiplyContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs>; + +template +using MultiplyStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + MultiplyFunctor>; + +template struct MultiplyOutputType +{ + using value_type = typename std::disjunction< // disjunction is C++17 + // feature, supported by DPC++ + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::DefaultResultEntry>::result_type; +}; + +template +class multiply_contig_kernel; + +template +sycl::event multiply_contig_impl(sycl::queue exec_q, + size_t nelems, + const char *arg1_p, + py::ssize_t arg1_offset, + const char *arg2_p, + py::ssize_t arg2_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_contig_impl< + argTy1, argTy2, MultiplyOutputType, MultiplyContigFunctor, + multiply_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends); +} + +template struct MultiplyContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename MultiplyOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = multiply_contig_impl; + return fn; + } + } +}; + +template struct MultiplyTypeMapFactory +{ + /*! 
@brief get typeid for output type of multiply(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + using rT = typename MultiplyOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class multiply_strided_strided_kernel; + +template +sycl::event +multiply_strided_impl(sycl::queue exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg1_p, + py::ssize_t arg1_offset, + const char *arg2_p, + py::ssize_t arg2_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, MultiplyOutputType, MultiplyStridedFunctor, + multiply_strided_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); +} + +template struct MultiplyStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename MultiplyOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = multiply_strided_impl; + return fn; + } + } +}; + +template +class multiply_matrix_row_broadcast_sg_krn; + +template +using MultiplyContigMatrixContigRowBroadcastingFunctor = + elementwise_common::BinaryContigMatrixContigRowBroadcastingFunctor< + argT1, + argT2, + resT, + MultiplyFunctor>; + +template +sycl::event multiply_contig_matrix_contig_row_broadcast_impl( + sycl::queue exec_q, + std::vector &host_tasks, + size_t n0, + size_t n1, + const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + py::ssize_t mat_offset, + const char *vec_p, // typeless pointer to (n1,) contiguous row + py::ssize_t vec_offset, + char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix, + // res[i,j] = mat[i,j] * vec[j] + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_contig_matrix_contig_row_broadcast_impl< + argT1, argT2, resT, MultiplyContigMatrixContigRowBroadcastingFunctor, + multiply_matrix_row_broadcast_sg_krn>(exec_q, host_tasks, n0, n1, mat_p, + mat_offset, vec_p, vec_offset, + res_p, res_offset, depends); +} + +template +struct MultiplyContigMatrixContigRowBroadcastFactory +{ + fnT get() + { + using resT = typename MultiplyOutputType::value_type; + if constexpr (std::is_same_v) { + fnT fn = nullptr; + return fn; + } + else { + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = + multiply_contig_matrix_contig_row_broadcast_impl; + return fn; + } + } + } +}; + +template +sycl::event multiply_contig_row_contig_matrix_broadcast_impl( + sycl::queue exec_q, + std::vector &host_tasks, + size_t n0, + size_t n1, + const char *vec_p, // typeless pointer to (n1,) contiguous row + py::ssize_t vec_offset, + const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + py::ssize_t mat_offset, + char *res_p, // typeless pointer to (n0, n1) result C-contig. 
matrix, + // res[i,j] = mat[i,j] * vec[j] + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return multiply_contig_matrix_contig_row_broadcast_impl( + exec_q, host_tasks, n0, n1, mat_p, mat_offset, vec_p, vec_offset, res_p, + res_offset, depends); +}; + +template +struct MultiplyContigRowContigMatrixBroadcastFactory +{ + fnT get() + { + using resT = typename MultiplyOutputType::value_type; + if constexpr (std::is_same_v) { + fnT fn = nullptr; + return fn; + } + else { + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = + multiply_contig_row_contig_matrix_broadcast_impl; + return fn; + } + } + } +}; + +} // namespace multiply +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp index 7e576c8746..7853428227 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp @@ -104,28 +104,9 @@ sycl::event sqrt_contig_impl(sycl::queue exec_q, char *res_p, const std::vector &depends = {}) { - sycl::event sqrt_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - constexpr size_t lws = 64; - constexpr unsigned int vec_sz = 4; - constexpr unsigned int n_vecs = 2; - static_assert(lws % vec_sz == 0); - auto gws_range = sycl::range<1>( - ((nelems + n_vecs * lws * vec_sz - 1) / (lws * n_vecs * vec_sz)) * - lws); - auto lws_range = sycl::range<1>(lws); - - using resTy = typename SqrtOutputType::value_type; - const argTy *arg_tp = reinterpret_cast(arg_p); - resTy *res_tp = reinterpret_cast(res_p); - - cgh.parallel_for< - class sqrt_contig_kernel>( - sycl::nd_range<1>(gws_range, lws_range), - SqrtContigFunctor(arg_tp, res_tp, - nelems)); - }); - return sqrt_ev; + return elementwise_common::unary_contig_impl< + argTy, SqrtOutputType, SqrtContigFunctor, sqrt_contig_kernel>( + exec_q, nelems, arg_p, res_p, depends); } template struct SqrtContigFactory @@ -169,26 +150,10 @@ sqrt_strided_impl(sycl::queue exec_q, const std::vector &depends, const std::vector &additional_depends) { - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.depends_on(additional_depends); - - using resTy = typename SqrtOutputType::value_type; - using IndexerT = - typename dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; - - IndexerT arg_res_indexer(nd, arg_offset, res_offset, shape_and_strides); - - const argTy *arg_tp = reinterpret_cast(arg_p); - resTy *res_tp = reinterpret_cast(res_p); - - sycl::range<1> gRange{nelems}; - - cgh.parallel_for>( - gRange, SqrtStridedFunctor( - arg_tp, res_tp, arg_res_indexer)); - }); - return comp_ev; + return elementwise_common::unary_strided_impl< + argTy, SqrtOutputType, SqrtStridedFunctor, sqrt_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); } template struct SqrtStridedFactory diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp new file mode 100644 index 0000000000..d6fb44d072 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp @@ -0,0 +1,391 @@ +//=== 
subtract.hpp - Binary function SUBTRACT ------ *-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of SUBTRACT(x1, x2) +/// function. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace subtract +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template struct SubtractFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::negation< + std::disjunction, tu_ns::is_complex>>; + + resT operator()(const argT1 &in1, const argT2 &in2) + { + return in1 - in2; + } + + template + sycl::vec operator()(const sycl::vec &in1, + const sycl::vec &in2) + { + auto tmp = in1 - in2; + if constexpr (std::is_same_v) { + return tmp; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + tmp); + } + } +}; + +template +using SubtractContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs>; + +template +using SubtractStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + SubtractFunctor>; + +template struct SubtractOutputType +{ + using value_type = typename std::disjunction< // disjunction is C++17 + // feature, supported by DPC++ + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::DefaultResultEntry>::result_type; +}; + +template +class subtract_contig_kernel; + +template +sycl::event subtract_contig_impl(sycl::queue exec_q, + size_t nelems, + const char *arg1_p, + py::ssize_t arg1_offset, + const char *arg2_p, + py::ssize_t arg2_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_contig_impl< + argTy1, argTy2, SubtractOutputType, SubtractContigFunctor, + subtract_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends); +} + +template struct SubtractContigFactory +{ + fnT get() + { + if constexpr 
(std::is_same_v< + typename SubtractOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = subtract_contig_impl; + return fn; + } + } +}; + +template struct SubtractTypeMapFactory +{ + /*! @brief get typeid for output type of subtract(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + using rT = typename SubtractOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class subtract_strided_strided_kernel; + +template +sycl::event +subtract_strided_impl(sycl::queue exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg1_p, + py::ssize_t arg1_offset, + const char *arg2_p, + py::ssize_t arg2_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, SubtractOutputType, SubtractStridedFunctor, + subtract_strided_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); +} + +template struct SubtractStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename SubtractOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = subtract_strided_impl; + return fn; + } + } +}; + +template +using SubtractContigMatrixContigRowBroadcastingFunctor = + elementwise_common::BinaryContigMatrixContigRowBroadcastingFunctor< + argT1, + argT2, + resT, + SubtractFunctor>; + +template +using SubtractContigRowContigMatrixBroadcastingFunctor = + elementwise_common::BinaryContigRowContigMatrixBroadcastingFunctor< + argT1, + argT2, + resT, + SubtractFunctor>; + +template +class subtract_matrix_row_broadcast_sg_krn; + +template +class subtract_row_matrix_broadcast_sg_krn; + +template +sycl::event subtract_contig_matrix_contig_row_broadcast_impl( + sycl::queue exec_q, + std::vector &host_tasks, + size_t n0, + size_t n1, + const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + py::ssize_t mat_offset, + const char *vec_p, // typeless pointer to (n1,) contiguous row + py::ssize_t vec_offset, + char *res_p, // typeless pointer to (n0, n1) result C-contig. 
matrix, + // res[i,j] = mat[i,j] - vec[j] + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_contig_matrix_contig_row_broadcast_impl< + argT1, argT2, resT, SubtractContigMatrixContigRowBroadcastingFunctor, + subtract_matrix_row_broadcast_sg_krn>(exec_q, host_tasks, n0, n1, mat_p, + mat_offset, vec_p, vec_offset, + res_p, res_offset, depends); +} + +template +struct SubtractContigMatrixContigRowBroadcastFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename SubtractOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + using resT = typename SubtractOutputType::value_type; + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = + subtract_contig_matrix_contig_row_broadcast_impl; + return fn; + } + } + } +}; + +template +sycl::event subtract_contig_row_contig_matrix_broadcast_impl( + sycl::queue exec_q, + std::vector &host_tasks, + size_t n0, + size_t n1, + const char *vec_p, // typeless pointer to (n1,) contiguous row + py::ssize_t vec_offset, + const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + py::ssize_t mat_offset, + char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix, + // res[i,j] = op(vec[j], mat[i,j]) + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_contig_row_contig_matrix_broadcast_impl< + argT1, argT2, resT, SubtractContigRowContigMatrixBroadcastingFunctor, + subtract_row_matrix_broadcast_sg_krn>(exec_q, host_tasks, n0, n1, vec_p, + vec_offset, mat_p, mat_offset, + res_p, res_offset, depends); +} + +template +struct SubtractContigRowContigMatrixBroadcastFactory +{ + fnT get() + { + using resT = typename SubtractOutputType::value_type; + if constexpr (std::is_same_v) { + fnT fn = nullptr; + return fn; + } + else { + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = + subtract_contig_row_contig_matrix_broadcast_impl; + return fn; + } + } + } +}; + +} // namespace subtract +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp index f34da1b415..96ca231584 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp @@ -165,32 +165,10 @@ true_divide_contig_impl(sycl::queue exec_q, py::ssize_t res_offset, const std::vector &depends = {}) { - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - - size_t lws = 64; - constexpr unsigned int vec_sz = 4; - constexpr unsigned int n_vecs = 2; - const size_t n_groups = - ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); - const auto gws_range = sycl::range<1>(n_groups * lws); - const auto lws_range = sycl::range<1>(lws); - - using resTy = typename TrueDivideOutputType::value_type; - - const argTy1 *arg1_tp = - reinterpret_cast(arg1_p) + arg1_offset; - const argTy2 *arg2_tp = - reinterpret_cast(arg2_p) + arg2_offset; - resTy *res_tp = reinterpret_cast(res_p) + res_offset; - - 
cgh.parallel_for< - true_divide_contig_kernel>( - sycl::nd_range<1>(gws_range, lws_range), - TrueDivideContigFunctor( - arg1_tp, arg2_tp, res_tp, nelems)); - }); - return comp_ev; + return elementwise_common::binary_contig_impl< + argTy1, argTy2, TrueDivideOutputType, TrueDivideContigFunctor, + true_divide_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends); } template struct TrueDivideContigFactory @@ -240,30 +218,11 @@ true_divide_strided_impl(sycl::queue exec_q, const std::vector &depends, const std::vector &additional_depends) { - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.depends_on(additional_depends); - - using resTy = typename TrueDivideOutputType::value_type; - - using IndexerT = - typename dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer; - - IndexerT indexer{nd, arg1_offset, arg2_offset, res_offset, - shape_and_strides}; - - const argTy1 *arg1_tp = reinterpret_cast(arg1_p); - const argTy2 *arg2_tp = reinterpret_cast(arg2_p); - resTy *res_tp = reinterpret_cast(res_p); - - sycl::range<1> gRange(nelems); - - cgh.parallel_for>( - gRange, TrueDivideStridedFunctor( - arg1_tp, arg2_tp, res_tp, indexer)); - }); - return comp_ev; + return elementwise_common::binary_strided_impl< + argTy1, argTy2, TrueDivideOutputType, TrueDivideStridedFunctor, + true_divide_strided_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); } template @@ -322,63 +281,11 @@ sycl::event true_divide_contig_matrix_contig_row_broadcast_impl( py::ssize_t res_offset, const std::vector &depends = {}) { - const argT1 *mat = reinterpret_cast(mat_p) + mat_offset; - const argT2 *vec = reinterpret_cast(vec_p) + vec_offset; - resT *res = reinterpret_cast(res_p) + res_offset; - - const auto &dev = exec_q.get_device(); - const auto &sg_sizes = dev.get_info(); - // Get device-specific kernel info max_sub_group_size - size_t max_sgSize = - *(std::max_element(std::begin(sg_sizes), std::end(sg_sizes))); - - size_t n1_padded = n1 + max_sgSize; - argT2 *padded_vec = sycl::malloc_device(n1_padded, exec_q); - - if (padded_vec == nullptr) { - throw std::runtime_error("Could not allocate memory on the device"); - } - sycl::event make_padded_vec_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); // ensure vec contains actual data - cgh.parallel_for({n1_padded}, [=](sycl::id<1> id) { - auto i = id[0]; - padded_vec[i] = vec[i % n1]; - }); - }); - - // sub-group spans work-items [I, I + sgSize) - // base = ndit.get_global_linear_id() - sg.get_local_id()[0] - // Generically, sg.load( &mat[base]) may load arrays from - // different rows of mat. The start corresponds to row (base / n0) - // We read sg.load(&padded_vec[(base / n0)]). 
The vector is padded to - // ensure that reads are accessible - - size_t lws = 64; - - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(make_padded_vec_ev); - - auto lwsRange = sycl::range<1>(lws); - size_t n_elems = n0 * n1; - size_t n_groups = (n_elems + lws - 1) / lws; - auto gwsRange = sycl::range<1>(n_groups * lws); - - cgh.parallel_for< - class true_divide_matrix_row_broadcast_sg_krn>( - sycl::nd_range<1>(gwsRange, lwsRange), - TrueDivideContigMatrixContigRowBroadcastingFunctor( - mat, padded_vec, res, n_elems, n1)); - }); - - sycl::event tmp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(comp_ev); - sycl::context ctx = exec_q.get_context(); - cgh.host_task([ctx, padded_vec]() { sycl::free(padded_vec, ctx); }); - }); - host_tasks.push_back(tmp_cleanup_ev); - - return comp_ev; + return elementwise_common::binary_contig_matrix_contig_row_broadcast_impl< + argT1, argT2, resT, TrueDivideContigMatrixContigRowBroadcastingFunctor, + true_divide_matrix_row_broadcast_sg_krn>( + exec_q, host_tasks, n0, n1, mat_p, mat_offset, vec_p, vec_offset, res_p, + res_offset, depends); } template @@ -427,63 +334,11 @@ sycl::event true_divide_contig_row_contig_matrix_broadcast_impl( py::ssize_t res_offset, const std::vector &depends = {}) { - const argT1 *vec = reinterpret_cast(vec_p) + vec_offset; - const argT2 *mat = reinterpret_cast(mat_p) + mat_offset; - resT *res = reinterpret_cast(res_p) + res_offset; - - const auto &dev = exec_q.get_device(); - const auto &sg_sizes = dev.get_info(); - // Get device-specific kernel info max_sub_group_size - size_t max_sgSize = - *(std::max_element(std::begin(sg_sizes), std::end(sg_sizes))); - - size_t n1_padded = n1 + max_sgSize; - argT2 *padded_vec = sycl::malloc_device(n1_padded, exec_q); - - if (padded_vec == nullptr) { - throw std::runtime_error("Could not allocate memory on the device"); - } - sycl::event make_padded_vec_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); // ensure vec contains actual data - cgh.parallel_for({n1_padded}, [=](sycl::id<1> id) { - auto i = id[0]; - padded_vec[i] = vec[i % n1]; - }); - }); - - // sub-group spans work-items [I, I + sgSize) - // base = ndit.get_global_linear_id() - sg.get_local_id()[0] - // Generically, sg.load( &mat[base]) may load arrays from - // different rows of mat. The start corresponds to row (base / n0) - // We read sg.load(&padded_vec[(base / n0)]). 
The vector is padded to - // ensure that reads are accessible - - size_t lws = 64; - - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(make_padded_vec_ev); - - auto lwsRange = sycl::range<1>(lws); - size_t n_elems = n0 * n1; - size_t n_groups = (n_elems + lws - 1) / lws; - auto gwsRange = sycl::range<1>(n_groups * lws); - - cgh.parallel_for< - class true_divide_row_matrix_broadcast_sg_krn>( - sycl::nd_range<1>(gwsRange, lwsRange), - TrueDivideContigRowContigMatrixBroadcastingFunctor( - padded_vec, mat, res, n_elems, n1)); - }); - - sycl::event tmp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(comp_ev); - sycl::context ctx = exec_q.get_context(); - cgh.host_task([ctx, padded_vec]() { sycl::free(padded_vec, ctx); }); - }); - host_tasks.push_back(tmp_cleanup_ev); - - return comp_ev; + return elementwise_common::binary_contig_row_contig_matrix_broadcast_impl< + argT1, argT2, resT, TrueDivideContigRowContigMatrixBroadcastingFunctor, + true_divide_row_matrix_broadcast_sg_krn>( + exec_q, host_tasks, n0, n1, vec_p, vec_offset, mat_p, mat_offset, res_p, + res_offset, depends); }; template diff --git a/dpctl/tensor/libtensor/source/elementwise_functions.cpp b/dpctl/tensor/libtensor/source/elementwise_functions.cpp index de209fdb1c..c943f28c97 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions.cpp @@ -39,7 +39,9 @@ #include "kernels/elementwise_functions/isfinite.hpp" #include "kernels/elementwise_functions/isinf.hpp" #include "kernels/elementwise_functions/isnan.hpp" +#include "kernels/elementwise_functions/multiply.hpp" #include "kernels/elementwise_functions/sqrt.hpp" +#include "kernels/elementwise_functions/subtract.hpp" #include "kernels/elementwise_functions/true_divide.hpp" namespace dpctl @@ -663,7 +665,71 @@ namespace impl // B19: ==== MULTIPLY (x1, x2) namespace impl { -// FIXME: add code for B19 + +namespace multiply_fn_ns = dpctl::tensor::kernels::multiply; + +static binary_contig_impl_fn_ptr_t + multiply_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int multiply_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + multiply_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +// mul(matrix, row) +static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t + multiply_contig_matrix_contig_row_broadcast_dispatch_table + [td_ns::num_types][td_ns::num_types]; + +// mul(row, matrix) +static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t + multiply_contig_row_contig_matrix_broadcast_dispatch_table + [td_ns::num_types][td_ns::num_types]; + +void populate_multiply_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = multiply_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::MultiplyTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(multiply_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::MultiplyStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(multiply_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::MultiplyContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(multiply_contig_dispatch_table); + + // function pointers for operation on contiguous matrix, contiguous row + // with contiguous matrix output + using 
fn_ns::MultiplyContigMatrixContigRowBroadcastFactory; + DispatchTableBuilder< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t, + MultiplyContigMatrixContigRowBroadcastFactory, num_types> + dtb4; + dtb4.populate_dispatch_table( + multiply_contig_matrix_contig_row_broadcast_dispatch_table); + + // function pointers for operation on contiguous row, contiguous matrix + // with contiguous matrix output + using fn_ns::MultiplyContigRowContigMatrixBroadcastFactory; + DispatchTableBuilder< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t, + MultiplyContigRowContigMatrixBroadcastFactory, num_types> + dtb5; + dtb5.populate_dispatch_table( + multiply_contig_row_contig_matrix_broadcast_dispatch_table); +}; + } // namespace impl // U25: ==== NEGATIVE (x) @@ -770,7 +836,70 @@ void populate_sqrt_dispatch_vectors(void) // B23: ==== SUBTRACT (x1, x2) namespace impl { -// FIXME: add code for B23 +namespace subtract_fn_ns = dpctl::tensor::kernels::subtract; + +static binary_contig_impl_fn_ptr_t + subtract_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int subtract_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + subtract_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +// sub(matrix, row) +static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t + subtract_contig_matrix_contig_row_broadcast_dispatch_table + [td_ns::num_types][td_ns::num_types]; + +// sub(row, matrix) +static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t + subtract_contig_row_contig_matrix_broadcast_dispatch_table + [td_ns::num_types][td_ns::num_types]; + +void populate_subtract_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = subtract_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::SubtractTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(subtract_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::SubtractStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(subtract_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::SubtractContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(subtract_contig_dispatch_table); + + // function pointers for operation on contiguous matrix, contiguous row + // with contiguous matrix output + using fn_ns::SubtractContigMatrixContigRowBroadcastFactory; + DispatchTableBuilder< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t, + SubtractContigMatrixContigRowBroadcastFactory, num_types> + dtb4; + dtb4.populate_dispatch_table( + subtract_contig_matrix_contig_row_broadcast_dispatch_table); + + // function pointers for operation on contiguous row, contiguous matrix + // with contiguous matrix output + using fn_ns::SubtractContigRowContigMatrixBroadcastFactory; + DispatchTableBuilder< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t, + SubtractContigRowContigMatrixBroadcastFactory, num_types> + dtb5; + dtb5.populate_dispatch_table( + subtract_contig_row_contig_matrix_broadcast_dispatch_table); +}; + } // namespace impl // U34: ==== TAN (x) @@ -1140,7 +1269,44 @@ void init_elementwise_functions(py::module_ m) // FIXME: // B19: ==== MULTIPLY (x1, x2) - // FIXME: + { + impl::populate_multiply_dispatch_tables(); + using impl::multiply_contig_dispatch_table; + using impl::multiply_contig_matrix_contig_row_broadcast_dispatch_table; + using 
impl::multiply_contig_row_contig_matrix_broadcast_dispatch_table; + using impl::multiply_output_id_table; + using impl::multiply_strided_dispatch_table; + + auto multiply_pyapi = + [&](dpctl::tensor::usm_ndarray src1, + dpctl::tensor::usm_ndarray src2, dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + const std::vector &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, multiply_output_id_table, + // function pointers to handle operation on contiguous + // arrays (pointers may be nullptr) + multiply_contig_dispatch_table, + // function pointers to handle operation on strided arrays + // (most general case) + multiply_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix + // and c-contig row with broadcasting (may be nullptr) + multiply_contig_matrix_contig_row_broadcast_dispatch_table, + // function pointers to handle operation of c-contig row + // and c-contig matrix with broadcasting (may be nullptr) + multiply_contig_row_contig_matrix_broadcast_dispatch_table); + }; + auto multiply_result_type_pyapi = [&](py::dtype dtype1, + py::dtype dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + multiply_output_id_table); + }; + m.def("_multiply", multiply_pyapi, "", py::arg("src1"), py::arg("src2"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_multiply_result_type", multiply_result_type_pyapi, ""); + } // U25: ==== NEGATIVE (x) // FIXME: @@ -1198,7 +1364,44 @@ void init_elementwise_functions(py::module_ m) } // B23: ==== SUBTRACT (x1, x2) - // FIXME: + { + impl::populate_subtract_dispatch_tables(); + using impl::subtract_contig_dispatch_table; + using impl::subtract_contig_matrix_contig_row_broadcast_dispatch_table; + using impl::subtract_contig_row_contig_matrix_broadcast_dispatch_table; + using impl::subtract_output_id_table; + using impl::subtract_strided_dispatch_table; + + auto subtract_pyapi = + [&](dpctl::tensor::usm_ndarray src1, + dpctl::tensor::usm_ndarray src2, dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + const std::vector &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, subtract_output_id_table, + // function pointers to handle operation on contiguous + // arrays (pointers may be nullptr) + subtract_contig_dispatch_table, + // function pointers to handle operation on strided arrays + // (most general case) + subtract_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix + // and c-contig row with broadcasting (may be nullptr) + subtract_contig_matrix_contig_row_broadcast_dispatch_table, + // function pointers to handle operation of c-contig row + // and c-contig matrix with broadcasting (may be nullptr) + subtract_contig_row_contig_matrix_broadcast_dispatch_table); + }; + auto subtract_result_type_pyapi = [&](py::dtype dtype1, + py::dtype dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + subtract_output_id_table); + }; + m.def("_subtract", subtract_pyapi, "", py::arg("src1"), py::arg("src2"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_subtract_result_type", subtract_result_type_pyapi, ""); + } // U34: ==== TAN (x) // FIXME: diff --git a/dpctl/tests/elementwise/test_multiply.py b/dpctl/tests/elementwise/test_multiply.py new file mode 100644 index 0000000000..cd506cd182 --- /dev/null +++ b/dpctl/tests/elementwise/test_multiply.py @@ -0,0 +1,154 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed 
under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import ctypes + +import numpy as np +import pytest + +import dpctl +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +from .utils import _all_dtypes, _compare_dtypes, _usm_types + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes) +@pytest.mark.parametrize("op2_dtype", _all_dtypes) +def test_multiply_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.multiply(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.multiply( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.ones(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.multiply(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.multiply( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_multiply_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type) + + r = dpt.multiply(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpctl.utils.get_coerced_usm_type( + (op1_usm_type, op2_usm_type) + ) + assert r.usm_type == expected_usm_type + + +def test_multiply_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + ar2 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.multiply(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.multiply(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.multiply(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.multiply(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones((20, 20), dtype="i4", order="F") + ar2 = dpt.ones((20, 20), dtype="i4", order="F") + r1 = dpt.multiply(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.multiply(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.multiply(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.multiply(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.multiply(ar1, ar2, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + ar2 = 
dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.multiply(ar1, ar2, order="K") + assert r4.strides == (-1, 20) + + +def test_multiply_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(1, 6, dtype="i4") + + r = dpt.multiply(m, v) + + expected = np.multiply( + np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4") + ) + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + r2 = dpt.multiply(v, m) + expected2 = np.multiply( + np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4") + ) + assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all() + + +@pytest.mark.parametrize("arr_dt", _all_dtypes) +def test_multiply_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q) + py_ones = ( + bool(1), + int(1), + float(1), + complex(1), + np.float32(1), + ctypes.c_int(1), + ) + for sc in py_ones: + R = dpt.multiply(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.multiply(sc, X) + assert isinstance(R, dpt.usm_ndarray) diff --git a/dpctl/tests/elementwise/test_subtract.py b/dpctl/tests/elementwise/test_subtract.py new file mode 100644 index 0000000000..f524151e97 --- /dev/null +++ b/dpctl/tests/elementwise/test_subtract.py @@ -0,0 +1,171 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
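+#
+# A minimal usage sketch of the function under test (assuming a default
+# SYCL device is available; the `out=` keyword writes into a preallocated
+# array, as exercised in the tests below):
+#
+#     import dpctl.tensor as dpt
+#     x = dpt.ones(4, dtype="i4")
+#     y = dpt.ones(4, dtype="i4")
+#     r = dpt.subtract(x, y)        # zeros of dtype i4
+#     buf = dpt.empty_like(x)
+#     dpt.subtract(x, y, out=buf)   # reuses buf, no extra allocation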
+ +import ctypes + +import numpy as np +import pytest + +import dpctl +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +from .utils import _all_dtypes, _compare_dtypes, _usm_types + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes[1:]) +@pytest.mark.parametrize("op2_dtype", _all_dtypes[1:]) +def test_subtract_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.subtract(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_dtype = np.subtract( + np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype) + ).dtype + assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == np.full(r.shape, 0, dtype=r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + r2 = dpt.empty_like(ar1, dtype=r.dtype) + dpt.subtract(ar1, ar2, out=r2) + assert (dpt.asnumpy(r2) == np.full(r2.shape, 0, dtype=r2.dtype)).all() + + ar3 = dpt.ones(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.subtract(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected_dtype = np.subtract( + np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype) + ).dtype + assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == np.full(r.shape, 0, dtype=r.dtype)).all() + + r2 = dpt.empty_like(ar1, dtype=r.dtype) + dpt.subtract(ar3[::-1], ar4[::2], out=r2) + assert (dpt.asnumpy(r2) == np.full(r2.shape, 0, dtype=r2.dtype)).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_subtract_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type) + + r = dpt.subtract(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpctl.utils.get_coerced_usm_type( + (op1_usm_type, op2_usm_type) + ) + assert r.usm_type == expected_usm_type + + +def test_subtract_order(): + get_queue_or_skip() + + test_shape = ( + 20, + 20, + ) + test_shape2 = tuple(2 * dim for dim in test_shape) + n = test_shape[-1] + + for dt1, dt2 in zip(["i4", "i4", "f4"], ["i4", "f4", "i4"]): + ar1 = dpt.ones(test_shape, dtype=dt1, order="C") + ar2 = dpt.ones(test_shape, dtype=dt2, order="C") + r1 = dpt.subtract(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.subtract(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.subtract(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.subtract(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones(test_shape, dtype=dt1, order="F") + ar2 = dpt.ones(test_shape, dtype=dt2, order="F") + r1 = dpt.subtract(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.subtract(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.subtract(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.subtract(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2] + ar2 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2] + r4 = dpt.subtract(ar1, ar2, order="K") + assert r4.strides == (n, -1) + r5 = dpt.subtract(ar1, ar2, order="C") + assert r5.strides == (n, 1) + 
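+ # The transposed views below (via .mT) reverse the stride order of the
+ # reverse-sliced C-order inputs: order="K" is expected to preserve that
+ # layout (strides (-1, n)), while order="C" must still force a
+ # C-contiguous result (strides (n, 1)).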
+ ar1 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2].mT + ar2 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2].mT + r4 = dpt.subtract(ar1, ar2, order="K") + assert r4.strides == (-1, n) + r5 = dpt.subtract(ar1, ar2, order="C") + assert r5.strides == (n, 1) + + +def test_subtract_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(5, dtype="i4") + + r = dpt.subtract(m, v) + assert ( + dpt.asnumpy(r) == np.arange(1, -4, step=-1, dtype="i4")[np.newaxis, :] + ).all() + + r2 = dpt.subtract(v, m) + assert ( + dpt.asnumpy(r2) == np.arange(-1, 4, dtype="i4")[np.newaxis, :] + ).all() + + +@pytest.mark.parametrize("arr_dt", _all_dtypes[1:]) +def test_subtract_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.zeros((10, 10), dtype=arr_dt, sycl_queue=q) + py_zeros = ( + bool(0), + int(0), + float(0), + complex(0), + np.float32(0), + ctypes.c_int(0), + ) + for sc in py_zeros: + R = dpt.subtract(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.subtract(sc, X) + assert isinstance(R, dpt.usm_ndarray)
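Taken together, these changes route `dpt.multiply` and `dpt.subtract` through the same binary-ufunc machinery as `add` and `divide`: the type-map table determines the result type, and the contiguous, strided, and matrix/row broadcast kernels are selected from the dispatch tables at call time. A minimal end-to-end sketch (assuming a default SYCL device is available; all names are public `dpctl.tensor` API used elsewhere in this diff):

    import dpctl.tensor as dpt

    x = dpt.arange(6, dtype="i4")
    y = dpt.ones(6, dtype="i4")

    p = dpt.multiply(x, y)            # contiguous fast path
    d = dpt.subtract(x[::2], y[::2])  # strided kernel (non-unit strides)

    m = dpt.ones((4, 6), dtype="i4")
    b = dpt.multiply(m, x)            # matrix-row broadcast path, shape (4, 6)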