From 13445cde9eb5ba3a24df19571cd2fe451a0261be Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Wed, 26 Jul 2023 12:15:19 -0500
Subject: [PATCH 1/6] Adds implementation of 6 bitwise elementwise functions

Implements bitwise_invert, bitwise_and, bitwise_or, bitwise_xor,
bitwise_left_shift, and bitwise_right_shift

Implements Python API in _tensor_impl for these functions.
---
 .../elementwise_functions/bitwise_and.hpp     | 262 ++++++++++
 .../elementwise_functions/bitwise_invert.hpp  | 213 +++++++++
 .../bitwise_left_shift.hpp                    | 266 +++++++++++
 .../elementwise_functions/bitwise_or.hpp      | 258 ++++++++++
 .../bitwise_right_shift.hpp                   | 266 +++++++++++
 .../elementwise_functions/bitwise_xor.hpp     | 262 ++++++++++
 .../source/elementwise_functions.cpp          | 447 +++++++++++++++++-
 7 files changed, 1962 insertions(+), 12 deletions(-)
 create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
 create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp
 create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
 create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp
 create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
 create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
new file mode 100644
index 0000000000..12dece02bd
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
@@ -0,0 +1,262 @@
+//=== bitwise_and.hpp -   Binary function BITWISE_AND  -------- *-C++-*--/===//
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain in1 copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise bitwise_and(ar1, ar2) operation.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <CL/sycl.hpp>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_utils.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include <pybind11/pybind11.h>
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace kernels
+{
+namespace bitwise_and
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT>
+struct BitwiseAndFunctor
+{
+    static_assert(std::is_same_v<resT, argT1>);
+    static_assert(std::is_same_v<resT, argT2>);
+
+    using supports_sg_loadstore = typename std::true_type;
+    using supports_vec = typename std::true_type;
+
+    resT operator()(const argT1 &in1, const argT2 &in2)
+    {
+        using tu_ns::convert_impl;
+
+        if constexpr (std::is_same_v<resT, bool>) {
+            return in1 && in2;
+        }
+        else {
+            return (in1 & in2);
+        }
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz> operator()(const sycl::vec<argT1, vec_sz> &in1,
+                                       const sycl::vec<argT2, vec_sz> &in2)
+    {
+
+        if constexpr (std::is_same_v<resT, bool>) {
+            using dpctl::tensor::type_utils::vec_cast;
+
+            auto tmp = (in1 && in2);
+            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
+                tmp);
+        }
+        else {
+            return (in1 & in2);
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          unsigned int vec_sz = 4,
+          unsigned int n_vecs = 2>
+using BitwiseAndContigFunctor = elementwise_common::BinaryContigFunctor<
+    argT1,
+    argT2,
+    resT,
+    BitwiseAndFunctor<argT1, argT2, resT>,
+    vec_sz,
+    n_vecs>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using BitwiseAndStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    BitwiseAndFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2> struct BitwiseAndOutputType
+{
+    using value_type = typename std::disjunction< // disjunction is C++17
+                                                  // feature, supported by
+                                                  // DPC++
+        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint8_t,
+                                        T2,
+                                        std::uint8_t,
+                                        std::uint8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int8_t,
+                                        T2,
+                                        std::int8_t,
+                                        std::int8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint16_t,
+                                        T2,
+                                        std::uint16_t,
+                                        std::uint16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int16_t,
+                                        T2,
+                                        std::int16_t,
+                                        std::int16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        std::uint32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int32_t,
+                                        T2,
+                                        std::int32_t,
+                                        std::int32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint64_t,
+                                        T2,
+                                        std::uint64_t,
+                                        std::uint64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int64_t,
+                                        T2,
+                                        std::int64_t,
+                                        std::int64_t>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          unsigned int vec_sz,
+          unsigned int n_vecs>
+class bitwise_and_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+bitwise_and_contig_impl(sycl::queue exec_q,
+                        size_t nelems,
+                        const char *arg1_p,
+                        py::ssize_t arg1_offset,
+                        const char *arg2_p,
+                        py::ssize_t arg2_offset,
+                        char *res_p,
+                        py::ssize_t res_offset,
+                        const std::vector<sycl::event> &depends = {})
+{
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, BitwiseAndOutputType, BitwiseAndContigFunctor,
+        bitwise_and_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
+                                   arg2_offset, res_p, res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2> struct BitwiseAndContigFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<
+                          typename BitwiseAndOutputType<T1, T2>::value_type,
+                          void>)
+        {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = bitwise_and_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct BitwiseAndTypeMapFactory
+{
+    /*! @brief get typeid for output type of operator()>(x, y), always bool
+     */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename BitwiseAndOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class bitwise_and_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+bitwise_and_strided_impl(sycl::queue exec_q,
+                         size_t nelems,
+                         int nd,
+                         const py::ssize_t *shape_and_strides,
+                         const char *arg1_p,
+                         py::ssize_t arg1_offset,
+                         const char *arg2_p,
+                         py::ssize_t arg2_offset,
+                         char *res_p,
+                         py::ssize_t res_offset,
+                         const std::vector<sycl::event> &depends,
+                         const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, BitwiseAndOutputType, BitwiseAndStridedFunctor,
+        bitwise_and_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
+        arg2_offset, res_p, res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct BitwiseAndStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<
+                          typename BitwiseAndOutputType<T1, T2>::value_type,
+                          void>)
+        {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = bitwise_and_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace bitwise_and
+} // namespace kernels
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp
new file mode 100644
index 0000000000..44ac214b27
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp
@@ -0,0 +1,213 @@
+//=== bitwise_invert.hpp -   Unary function bitwise_invert      *-C++-*--/===//
+//
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of bitwise_invert(x)
+/// function that inverts bits of binary representation of the argument.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <CL/sycl.hpp>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_utils.hpp"
+#include <pybind11/pybind11.h>
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace kernels
+{
+namespace bitwise_invert
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+using dpctl::tensor::type_utils::vec_cast;
+
+template <typename argT, typename resT> struct BitwiseInvertFunctor
+{
+    static_assert(std::is_same_v<argT, resT>);
+    static_assert(std::is_integral_v<argT> || std::is_same_v<argT, bool>);
+
+    using is_constant = typename std::false_type;
+    // constexpr resT constant_value = resT{};
+    using supports_vec = typename std::true_type;
+    using supports_sg_loadstore = typename std::true_type;
+    ;
+
+    resT operator()(const argT &in) const
+    {
+        if constexpr (std::is_same_v<argT, bool>) {
+            return !in;
+        }
+        else {
+            return ~in;
+        }
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz> operator()(const sycl::vec<argT, vec_sz> &in)
+    {
+        if constexpr (std::is_same_v<argT, bool>) {
+            auto res_vec = !in;
+
+            using deducedT = typename std::remove_cv_t<
+                std::remove_reference_t<decltype(res_vec)>>::element_type;
+            return vec_cast<resT, deducedT, vec_sz>(res_vec);
+        }
+        else {
+            return ~in;
+        }
+    }
+};
+
+template <typename argT,
+          typename resT = argT,
+          unsigned int vec_sz = 4,
+          unsigned int n_vecs = 2>
+using BitwiseInvertContigFunctor =
+    elementwise_common::UnaryContigFunctor<argT,
+                                           resT,
+                                           BitwiseInvertFunctor<argT, resT>,
+                                           vec_sz,
+                                           n_vecs>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using BitwiseInvertStridedFunctor =
+    elementwise_common::UnaryStridedFunctor<argTy,
+                                            resTy,
+                                            IndexerT,
+                                            BitwiseInvertFunctor<argTy, resTy>>;
+
+template <typename argTy> struct BitwiseInvertOutputType
+{
+    using value_type = typename std::disjunction< // disjunction is C++17
+                                                  // feature, supported by DPC++
+        td_ns::TypeMapResultEntry<argTy, bool>,
+        td_ns::TypeMapResultEntry<argTy, std::uint8_t>,
+        td_ns::TypeMapResultEntry<argTy, std::uint16_t>,
+        td_ns::TypeMapResultEntry<argTy, std::uint32_t>,
+        td_ns::TypeMapResultEntry<argTy, std::uint64_t>,
+        td_ns::TypeMapResultEntry<argTy, std::int8_t>,
+        td_ns::TypeMapResultEntry<argTy, std::int16_t>,
+        td_ns::TypeMapResultEntry<argTy, std::int32_t>,
+        td_ns::TypeMapResultEntry<argTy, std::int64_t>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+};
+
+template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+class bitwise_invert_contig_kernel;
+
+template <typename argTy>
+sycl::event
+bitwise_invert_contig_impl(sycl::queue exec_q,
+                           size_t nelems,
+                           const char *arg_p,
+                           char *res_p,
+                           const std::vector<sycl::event> &depends = {})
+{
+    return elementwise_common::unary_contig_impl<argTy, BitwiseInvertOutputType,
+                                                 BitwiseInvertContigFunctor,
+                                                 bitwise_invert_contig_kernel>(
+        exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T> struct BitwiseInvertContigFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<
+                          typename BitwiseInvertOutputType<T>::value_type,
+                          void>)
+        {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = bitwise_invert_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T> struct BitwiseInvertTypeMapFactory
+{
+    /*! @brief get typeid for output type of sycl::logical_not(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename BitwiseInvertOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class bitwise_invert_strided_kernel;
+
+template <typename argTy>
+sycl::event
+bitwise_invert_strided_impl(sycl::queue exec_q,
+                            size_t nelems,
+                            int nd,
+                            const py::ssize_t *shape_and_strides,
+                            const char *arg_p,
+                            py::ssize_t arg_offset,
+                            char *res_p,
+                            py::ssize_t res_offset,
+                            const std::vector<sycl::event> &depends,
+                            const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, BitwiseInvertOutputType, BitwiseInvertStridedFunctor,
+        bitwise_invert_strided_kernel>(exec_q, nelems, nd, shape_and_strides,
+                                       arg_p, arg_offset, res_p, res_offset,
+                                       depends, additional_depends);
+}
+
+template <typename fnT, typename T> struct BitwiseInvertStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<
+                          typename BitwiseInvertOutputType<T>::value_type,
+                          void>)
+        {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = bitwise_invert_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace bitwise_invert
+} // namespace kernels
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
new file mode 100644
index 0000000000..1a1f75bb2a
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
@@ -0,0 +1,266 @@
+//=== bitwise_left-shift.hpp - Binary func. BITWISE_LEFT_SHIFT -*-C++-*--/===//
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain in1 copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise bitwise_left_shift(ar1, ar2)
+/// operation.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <CL/sycl.hpp>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_utils.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include <pybind11/pybind11.h>
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace kernels
+{
+namespace bitwise_left_shift
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT>
+struct BitwiseLeftShiftFunctor
+{
+    static_assert(std::is_integral_v<argT1>);
+    static_assert(std::is_integral_v<argT2>);
+    static_assert(!std::is_same_v<argT1, bool>);
+    static_assert(!std::is_same_v<argT2, bool>);
+
+    using supports_sg_loadstore = typename std::true_type;
+    using supports_vec = typename std::true_type;
+
+    resT operator()(const argT1 &in1, const argT2 &in2)
+    {
+        if constexpr (std::is_unsigned_v<argT2>) {
+            return (in1 << in2);
+        }
+        else {
+            return (in2 < argT2(0)) ? resT(0) : (in1 << in2);
+        }
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz> operator()(const sycl::vec<argT1, vec_sz> &in1,
+                                       const sycl::vec<argT2, vec_sz> &in2)
+    {
+        if constexpr (std::is_same_v<argT1, argT2> && std::is_unsigned_v<argT2>)
+        {
+            return (in1 << in2);
+        }
+        else {
+            sycl::vec<resT, vec_sz> res;
+#pragma unroll
+            for (int i = 0; i < vec_sz; ++i) {
+                res[i] = (in2[i] < argT2(0)) ? resT(0) : (in1[i] << in2[i]);
+            }
+            return res;
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          unsigned int vec_sz = 4,
+          unsigned int n_vecs = 2>
+using BitwiseLeftShiftContigFunctor = elementwise_common::BinaryContigFunctor<
+    argT1,
+    argT2,
+    resT,
+    BitwiseLeftShiftFunctor<argT1, argT2, resT>,
+    vec_sz,
+    n_vecs>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using BitwiseLeftShiftStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    BitwiseLeftShiftFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2> struct BitwiseLeftShiftOutputType
+{
+    using ResT = T1;
+    using value_type = typename std::disjunction< // disjunction is C++17
+                                                  // feature, supported by
+                                                  // DPC++
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int8_t,
+                                        T2,
+                                        std::int8_t,
+                                        std::int8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint8_t,
+                                        T2,
+                                        std::uint8_t,
+                                        std::uint8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int16_t,
+                                        T2,
+                                        std::int16_t,
+                                        std::int16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint16_t,
+                                        T2,
+                                        std::uint16_t,
+                                        std::uint16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int32_t,
+                                        T2,
+                                        std::int32_t,
+                                        std::int32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        std::uint32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int64_t,
+                                        T2,
+                                        std::int64_t,
+                                        std::int64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint64_t,
+                                        T2,
+                                        std::uint64_t,
+                                        std::uint64_t>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          unsigned int vec_sz,
+          unsigned int n_vecs>
+class bitwise_left_shift_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+bitwise_left_shift_contig_impl(sycl::queue exec_q,
+                               size_t nelems,
+                               const char *arg1_p,
+                               py::ssize_t arg1_offset,
+                               const char *arg2_p,
+                               py::ssize_t arg2_offset,
+                               char *res_p,
+                               py::ssize_t res_offset,
+                               const std::vector<sycl::event> &depends = {})
+{
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, BitwiseLeftShiftOutputType,
+        BitwiseLeftShiftContigFunctor, bitwise_left_shift_contig_kernel>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct BitwiseLeftShiftContigFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<typename BitwiseLeftShiftOutputType<
+                                         T1, T2>::value_type,
+                                     void>)
+        {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = bitwise_left_shift_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct BitwiseLeftShiftTypeMapFactory
+{
+    /*! @brief get typeid for output type of operator()>(x, y), always bool
+     */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename BitwiseLeftShiftOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class bitwise_left_shift_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event bitwise_left_shift_strided_impl(
+    sycl::queue exec_q,
+    size_t nelems,
+    int nd,
+    const py::ssize_t *shape_and_strides,
+    const char *arg1_p,
+    py::ssize_t arg1_offset,
+    const char *arg2_p,
+    py::ssize_t arg2_offset,
+    char *res_p,
+    py::ssize_t res_offset,
+    const std::vector<sycl::event> &depends,
+    const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, BitwiseLeftShiftOutputType,
+        BitwiseLeftShiftStridedFunctor, bitwise_left_shift_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
+        arg2_offset, res_p, res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct BitwiseLeftShiftStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<typename BitwiseLeftShiftOutputType<
+                                         T1, T2>::value_type,
+                                     void>)
+        {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = bitwise_left_shift_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace bitwise_left_shift
+} // namespace kernels
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp
new file mode 100644
index 0000000000..47d34ac182
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp
@@ -0,0 +1,258 @@
+//=== bitwise_or.hpp -   Binary function BITWISE_OR    -------- *-C++-*--/===//
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain in1 copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise bitwise_or(ar1, ar2) operation.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <CL/sycl.hpp>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_utils.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include <pybind11/pybind11.h>
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace kernels
+{
+namespace bitwise_or
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT> struct BitwiseOrFunctor
+{
+    static_assert(std::is_same_v<resT, argT1>);
+    static_assert(std::is_same_v<resT, argT2>);
+
+    using supports_sg_loadstore = typename std::true_type;
+    using supports_vec = typename std::true_type;
+
+    resT operator()(const argT1 &in1, const argT2 &in2)
+    {
+        using tu_ns::convert_impl;
+
+        if constexpr (std::is_same_v<resT, bool>) {
+            return in1 || in2;
+        }
+        else {
+            return (in1 | in2);
+        }
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz> operator()(const sycl::vec<argT1, vec_sz> &in1,
+                                       const sycl::vec<argT2, vec_sz> &in2)
+    {
+
+        if constexpr (std::is_same_v<resT, bool>) {
+            using dpctl::tensor::type_utils::vec_cast;
+
+            auto tmp = (in1 || in2);
+            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
+                tmp);
+        }
+        else {
+            return (in1 | in2);
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          unsigned int vec_sz = 4,
+          unsigned int n_vecs = 2>
+using BitwiseOrContigFunctor = elementwise_common::BinaryContigFunctor<
+    argT1,
+    argT2,
+    resT,
+    BitwiseOrFunctor<argT1, argT2, resT>,
+    vec_sz,
+    n_vecs>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using BitwiseOrStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    BitwiseOrFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2> struct BitwiseOrOutputType
+{
+    using value_type = typename std::disjunction< // disjunction is C++17
+                                                  // feature, supported by
+                                                  // DPC++
+        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint8_t,
+                                        T2,
+                                        std::uint8_t,
+                                        std::uint8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int8_t,
+                                        T2,
+                                        std::int8_t,
+                                        std::int8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint16_t,
+                                        T2,
+                                        std::uint16_t,
+                                        std::uint16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int16_t,
+                                        T2,
+                                        std::int16_t,
+                                        std::int16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        std::uint32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int32_t,
+                                        T2,
+                                        std::int32_t,
+                                        std::int32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint64_t,
+                                        T2,
+                                        std::uint64_t,
+                                        std::uint64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int64_t,
+                                        T2,
+                                        std::int64_t,
+                                        std::int64_t>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          unsigned int vec_sz,
+          unsigned int n_vecs>
+class bitwise_or_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event bitwise_or_contig_impl(sycl::queue exec_q,
+                                   size_t nelems,
+                                   const char *arg1_p,
+                                   py::ssize_t arg1_offset,
+                                   const char *arg2_p,
+                                   py::ssize_t arg2_offset,
+                                   char *res_p,
+                                   py::ssize_t res_offset,
+                                   const std::vector<sycl::event> &depends = {})
+{
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, BitwiseOrOutputType, BitwiseOrContigFunctor,
+        bitwise_or_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
+                                  arg2_offset, res_p, res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2> struct BitwiseOrContigFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<
+                          typename BitwiseOrOutputType<T1, T2>::value_type,
+                          void>)
+        {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = bitwise_or_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2> struct BitwiseOrTypeMapFactory
+{
+    /*! @brief get typeid for output type of operator()>(x, y), always bool
+     */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename BitwiseOrOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class bitwise_or_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+bitwise_or_strided_impl(sycl::queue exec_q,
+                        size_t nelems,
+                        int nd,
+                        const py::ssize_t *shape_and_strides,
+                        const char *arg1_p,
+                        py::ssize_t arg1_offset,
+                        const char *arg2_p,
+                        py::ssize_t arg2_offset,
+                        char *res_p,
+                        py::ssize_t res_offset,
+                        const std::vector<sycl::event> &depends,
+                        const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, BitwiseOrOutputType, BitwiseOrStridedFunctor,
+        bitwise_or_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
+        arg2_offset, res_p, res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2> struct BitwiseOrStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<
+                          typename BitwiseOrOutputType<T1, T2>::value_type,
+                          void>)
+        {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = bitwise_or_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace bitwise_or
+} // namespace kernels
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
new file mode 100644
index 0000000000..7f34af8c4a
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
@@ -0,0 +1,266 @@
+//=== bitwise_right_shift.hpp - Binary func. BITWISE_RIGHT_SHIFT *-C++-*-/===//
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain in1 copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise bitwise_right_shift(ar1, ar2)
+/// operation.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <CL/sycl.hpp>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_utils.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include <pybind11/pybind11.h>
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace kernels
+{
+namespace bitwise_right_shift
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT>
+struct BitwiseRightShiftFunctor
+{
+    static_assert(std::is_same_v<resT, argT1>);
+    static_assert(std::is_integral_v<argT1>);
+    static_assert(std::is_integral_v<argT2>);
+
+    using supports_sg_loadstore = typename std::true_type;
+    using supports_vec = typename std::true_type;
+
+    resT operator()(const argT1 &in1, const argT2 &in2)
+    {
+        if constexpr (std::is_unsigned_v<argT2>) {
+            return (in1 >> in2);
+        }
+        else {
+            return (in2 < argT2(0)) ? resT(0) : (in1 >> in2);
+        }
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz> operator()(const sycl::vec<argT1, vec_sz> &in1,
+                                       const sycl::vec<argT2, vec_sz> &in2)
+    {
+        if constexpr (std::is_same_v<argT1, argT2> && std::is_unsigned_v<argT2>)
+        {
+            return (in1 >> in2);
+        }
+        else {
+            sycl::vec<resT, vec_sz> res;
+#pragma unroll
+            for (int i = 0; i < vec_sz; ++i) {
+                res[i] = (in2[i] < argT2(0)) ? resT(0) : (in1[i] >> in2[i]);
+            }
+            return res;
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          unsigned int vec_sz = 4,
+          unsigned int n_vecs = 2>
+using BitwiseRightShiftContigFunctor = elementwise_common::BinaryContigFunctor<
+    argT1,
+    argT2,
+    resT,
+    BitwiseRightShiftFunctor<argT1, argT2, resT>,
+    vec_sz,
+    n_vecs>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using BitwiseRightShiftStridedFunctor =
+    elementwise_common::BinaryStridedFunctor<
+        argT1,
+        argT2,
+        resT,
+        IndexerT,
+        BitwiseRightShiftFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2> struct BitwiseRightShiftOutputType
+{
+    using ResT = T1;
+    using value_type = typename std::disjunction< // disjunction is C++17
+                                                  // feature, supported by
+                                                  // DPC++
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int8_t,
+                                        T2,
+                                        std::int8_t,
+                                        std::int8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint8_t,
+                                        T2,
+                                        std::uint8_t,
+                                        std::uint8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int16_t,
+                                        T2,
+                                        std::int16_t,
+                                        std::int16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint16_t,
+                                        T2,
+                                        std::uint16_t,
+                                        std::uint16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int32_t,
+                                        T2,
+                                        std::int32_t,
+                                        std::int32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        std::uint32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int64_t,
+                                        T2,
+                                        std::int64_t,
+                                        std::int64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint64_t,
+                                        T2,
+                                        std::uint64_t,
+                                        std::uint64_t>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          unsigned int vec_sz,
+          unsigned int n_vecs>
+class bitwise_right_shift_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+bitwise_right_shift_contig_impl(sycl::queue exec_q,
+                                size_t nelems,
+                                const char *arg1_p,
+                                py::ssize_t arg1_offset,
+                                const char *arg2_p,
+                                py::ssize_t arg2_offset,
+                                char *res_p,
+                                py::ssize_t res_offset,
+                                const std::vector<sycl::event> &depends = {})
+{
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, BitwiseRightShiftOutputType,
+        BitwiseRightShiftContigFunctor, bitwise_right_shift_contig_kernel>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct BitwiseRightShiftContigFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<typename BitwiseRightShiftOutputType<
+                                         T1, T2>::value_type,
+                                     void>)
+        {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = bitwise_right_shift_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct BitwiseRightShiftTypeMapFactory
+{
+    /*! @brief get typeid for output type of operator()>(x, y), always bool
+     */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename BitwiseRightShiftOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class bitwise_right_shift_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event bitwise_right_shift_strided_impl(
+    sycl::queue exec_q,
+    size_t nelems,
+    int nd,
+    const py::ssize_t *shape_and_strides,
+    const char *arg1_p,
+    py::ssize_t arg1_offset,
+    const char *arg2_p,
+    py::ssize_t arg2_offset,
+    char *res_p,
+    py::ssize_t res_offset,
+    const std::vector<sycl::event> &depends,
+    const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, BitwiseRightShiftOutputType,
+        BitwiseRightShiftStridedFunctor, bitwise_right_shift_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
+        arg2_offset, res_p, res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct BitwiseRightShiftStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<typename BitwiseRightShiftOutputType<
+                                         T1, T2>::value_type,
+                                     void>)
+        {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = bitwise_right_shift_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace bitwise_right_shift
+} // namespace kernels
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp
new file mode 100644
index 0000000000..2e5568fe8c
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp
@@ -0,0 +1,262 @@
+//=== bitwise_xor.hpp -   Binary function BITWISE_XOR  -------- *-C++-*--/===//
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain in1 copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise bitwise_xor(ar1, ar2) operation.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <CL/sycl.hpp>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_utils.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include <pybind11/pybind11.h>
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace kernels
+{
+namespace bitwise_xor
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT>
+struct BitwiseXorFunctor
+{
+    static_assert(std::is_same_v<resT, argT1>);
+    static_assert(std::is_same_v<resT, argT2>);
+
+    using supports_sg_loadstore = typename std::true_type;
+    using supports_vec = typename std::true_type;
+
+    resT operator()(const argT1 &in1, const argT2 &in2)
+    {
+        if constexpr (std::is_same_v<resT, bool>) {
+            // (false != false) -> false, (false != true) -> true
+            // (true != false) -> true,  (true != true) -> false
+            return (in1 != in2);
+        }
+        else {
+            return (in1 ^ in2);
+        }
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz> operator()(const sycl::vec<argT1, vec_sz> &in1,
+                                       const sycl::vec<argT2, vec_sz> &in2)
+    {
+
+        if constexpr (std::is_same_v<resT, bool>) {
+            using dpctl::tensor::type_utils::vec_cast;
+
+            auto tmp = (in1 != in2);
+            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
+                tmp);
+        }
+        else {
+            return (in1 ^ in2);
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          unsigned int vec_sz = 4,
+          unsigned int n_vecs = 2>
+using BitwiseXorContigFunctor = elementwise_common::BinaryContigFunctor<
+    argT1,
+    argT2,
+    resT,
+    BitwiseXorFunctor<argT1, argT2, resT>,
+    vec_sz,
+    n_vecs>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using BitwiseXorStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    BitwiseXorFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2> struct BitwiseXorOutputType
+{
+    using value_type = typename std::disjunction< // disjunction is C++17
+                                                  // feature, supported by
+                                                  // DPC++
+        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint8_t,
+                                        T2,
+                                        std::uint8_t,
+                                        std::uint8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int8_t,
+                                        T2,
+                                        std::int8_t,
+                                        std::int8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint16_t,
+                                        T2,
+                                        std::uint16_t,
+                                        std::uint16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int16_t,
+                                        T2,
+                                        std::int16_t,
+                                        std::int16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        std::uint32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int32_t,
+                                        T2,
+                                        std::int32_t,
+                                        std::int32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint64_t,
+                                        T2,
+                                        std::uint64_t,
+                                        std::uint64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int64_t,
+                                        T2,
+                                        std::int64_t,
+                                        std::int64_t>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          unsigned int vec_sz,
+          unsigned int n_vecs>
+class bitwise_xor_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+bitwise_xor_contig_impl(sycl::queue exec_q,
+                        size_t nelems,
+                        const char *arg1_p,
+                        py::ssize_t arg1_offset,
+                        const char *arg2_p,
+                        py::ssize_t arg2_offset,
+                        char *res_p,
+                        py::ssize_t res_offset,
+                        const std::vector<sycl::event> &depends = {})
+{
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, BitwiseXorOutputType, BitwiseXorContigFunctor,
+        bitwise_xor_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
+                                   arg2_offset, res_p, res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2> struct BitwiseXorContigFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<
+                          typename BitwiseXorOutputType<T1, T2>::value_type,
+                          void>)
+        {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = bitwise_xor_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct BitwiseXorTypeMapFactory
+{
+    /*! @brief get typeid for output type of operator()>(x, y), always bool
+     */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename BitwiseXorOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class bitwise_xor_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+bitwise_xor_strided_impl(sycl::queue exec_q,
+                         size_t nelems,
+                         int nd,
+                         const py::ssize_t *shape_and_strides,
+                         const char *arg1_p,
+                         py::ssize_t arg1_offset,
+                         const char *arg2_p,
+                         py::ssize_t arg2_offset,
+                         char *res_p,
+                         py::ssize_t res_offset,
+                         const std::vector<sycl::event> &depends,
+                         const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, BitwiseXorOutputType, BitwiseXorStridedFunctor,
+        bitwise_xor_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
+        arg2_offset, res_p, res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct BitwiseXorStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<
+                          typename BitwiseXorOutputType<T1, T2>::value_type,
+                          void>)
+        {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = bitwise_xor_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace bitwise_xor
+} // namespace kernels
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions.cpp b/dpctl/tensor/libtensor/source/elementwise_functions.cpp
index e02afcd358..6659db9979 100644
--- a/dpctl/tensor/libtensor/source/elementwise_functions.cpp
+++ b/dpctl/tensor/libtensor/source/elementwise_functions.cpp
@@ -40,6 +40,12 @@
 #include "kernels/elementwise_functions/asinh.hpp"
 #include "kernels/elementwise_functions/atan.hpp"
 #include "kernels/elementwise_functions/atanh.hpp"
+#include "kernels/elementwise_functions/bitwise_and.hpp"
+#include "kernels/elementwise_functions/bitwise_invert.hpp"
+#include "kernels/elementwise_functions/bitwise_left_shift.hpp"
+#include "kernels/elementwise_functions/bitwise_or.hpp"
+#include "kernels/elementwise_functions/bitwise_right_shift.hpp"
+#include "kernels/elementwise_functions/bitwise_xor.hpp"
 #include "kernels/elementwise_functions/ceil.hpp"
 #include "kernels/elementwise_functions/conj.hpp"
 #include "kernels/elementwise_functions/cos.hpp"
@@ -509,37 +515,237 @@ void populate_atanh_dispatch_vectors(void)
 // B03: ===== BITWISE_AND           (x1, x2)
 namespace impl
 {
-// FIXME: add code for B03
+namespace bitwise_and_fn_ns = dpctl::tensor::kernels::bitwise_and;
+
+static binary_contig_impl_fn_ptr_t
+    bitwise_and_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int bitwise_and_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    bitwise_and_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_bitwise_and_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = bitwise_and_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::BitwiseAndTypeMapFactory;
+    DispatchTableBuilder<int, BitwiseAndTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(bitwise_and_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::BitwiseAndStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, BitwiseAndStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(bitwise_and_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::BitwiseAndContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, BitwiseAndContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(bitwise_and_contig_dispatch_table);
+};
+
 } // namespace impl
 
 // B04: ===== BITWISE_LEFT_SHIFT    (x1, x2)
 namespace impl
 {
-// FIXME: add code for B04
+namespace bitwise_left_shift_fn_ns = dpctl::tensor::kernels::bitwise_left_shift;
+
+static binary_contig_impl_fn_ptr_t
+    bitwise_left_shift_contig_dispatch_table[td_ns::num_types]
+                                            [td_ns::num_types];
+static int bitwise_left_shift_output_id_table[td_ns::num_types]
+                                             [td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    bitwise_left_shift_strided_dispatch_table[td_ns::num_types]
+                                             [td_ns::num_types];
+
+void populate_bitwise_left_shift_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = bitwise_left_shift_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::BitwiseLeftShiftTypeMapFactory;
+    DispatchTableBuilder<int, BitwiseLeftShiftTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(bitwise_left_shift_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::BitwiseLeftShiftStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
+                         BitwiseLeftShiftStridedFactory, num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(bitwise_left_shift_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::BitwiseLeftShiftContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t,
+                         BitwiseLeftShiftContigFactory, num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(bitwise_left_shift_contig_dispatch_table);
+};
+
 } // namespace impl
 
 // U08: ===== BITWISE_INVERT        (x)
 namespace impl
 {
-// FIXME: add code for U08
+
+namespace bitwise_invert_fn_ns = dpctl::tensor::kernels::bitwise_invert;
+
+static unary_contig_impl_fn_ptr_t
+    bitwise_invert_contig_dispatch_vector[td_ns::num_types];
+static int bitwise_invert_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    bitwise_invert_strided_dispatch_vector[td_ns::num_types];
+
+void populate_bitwise_invert_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = bitwise_invert_fn_ns;
+
+    using fn_ns::BitwiseInvertContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t,
+                          BitwiseInvertContigFactory, num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(bitwise_invert_contig_dispatch_vector);
+
+    using fn_ns::BitwiseInvertStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t,
+                          BitwiseInvertStridedFactory, num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(bitwise_invert_strided_dispatch_vector);
+
+    using fn_ns::BitwiseInvertTypeMapFactory;
+    DispatchVectorBuilder<int, BitwiseInvertTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(bitwise_invert_output_typeid_vector);
+};
+
 } // namespace impl
 
 // B05: ===== BITWISE_OR            (x1, x2)
 namespace impl
 {
-// FIXME: add code for B05
+namespace bitwise_or_fn_ns = dpctl::tensor::kernels::bitwise_or;
+
+static binary_contig_impl_fn_ptr_t
+    bitwise_or_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int bitwise_or_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    bitwise_or_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_bitwise_or_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = bitwise_or_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::BitwiseOrTypeMapFactory;
+    DispatchTableBuilder<int, BitwiseOrTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(bitwise_or_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::BitwiseOrStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, BitwiseOrStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(bitwise_or_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::BitwiseOrContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, BitwiseOrContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(bitwise_or_contig_dispatch_table);
+};
 } // namespace impl
 
 // B06: ===== BITWISE_RIGHT_SHIFT   (x1, x2)
 namespace impl
 {
-// FIXME: add code for B06
+namespace bitwise_right_shift_fn_ns =
+    dpctl::tensor::kernels::bitwise_right_shift;
+
+static binary_contig_impl_fn_ptr_t
+    bitwise_right_shift_contig_dispatch_table[td_ns::num_types]
+                                             [td_ns::num_types];
+static int bitwise_right_shift_output_id_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    bitwise_right_shift_strided_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+void populate_bitwise_right_shift_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = bitwise_right_shift_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::BitwiseRightShiftTypeMapFactory;
+    DispatchTableBuilder<int, BitwiseRightShiftTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(bitwise_right_shift_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::BitwiseRightShiftStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
+                         BitwiseRightShiftStridedFactory, num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(bitwise_right_shift_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::BitwiseRightShiftContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t,
+                         BitwiseRightShiftContigFactory, num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(bitwise_right_shift_contig_dispatch_table);
+};
+
 } // namespace impl
 
 // B07: ===== BITWISE_XOR           (x1, x2)
 namespace impl
 {
-// FIXME: add code for B07
+namespace bitwise_xor_fn_ns = dpctl::tensor::kernels::bitwise_xor;
+
+static binary_contig_impl_fn_ptr_t
+    bitwise_xor_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int bitwise_xor_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    bitwise_xor_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_bitwise_xor_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = bitwise_xor_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::BitwiseXorTypeMapFactory;
+    DispatchTableBuilder<int, BitwiseXorTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(bitwise_xor_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::BitwiseXorStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, BitwiseXorStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(bitwise_xor_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::BitwiseXorContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, BitwiseXorContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(bitwise_xor_contig_dispatch_table);
+};
 } // namespace impl
 
 // U09: ==== CEIL          (x)
@@ -2602,22 +2808,239 @@ void init_elementwise_functions(py::module_ m)
     }
 
     // B03: ===== BITWISE_AND           (x1, x2)
-    // FIXME:
+    {
+        impl::populate_bitwise_and_dispatch_tables();
+        using impl::bitwise_and_contig_dispatch_table;
+        using impl::bitwise_and_output_id_table;
+        using impl::bitwise_and_strided_dispatch_table;
+
+        auto bitwise_and_pyapi = [&](dpctl::tensor::usm_ndarray src1,
+                                     dpctl::tensor::usm_ndarray src2,
+                                     dpctl::tensor::usm_ndarray dst,
+                                     sycl::queue exec_q,
+                                     const std::vector<sycl::event> &depends =
+                                         {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, bitwise_and_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                bitwise_and_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                bitwise_and_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto bitwise_and_result_type_pyapi = [&](py::dtype dtype1,
+                                                 py::dtype dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               bitwise_and_output_id_table);
+        };
+        m.def("_bitwise_and", bitwise_and_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_bitwise_and_result_type", bitwise_and_result_type_pyapi, "");
+    }
 
     // B04: ===== BITWISE_LEFT_SHIFT    (x1, x2)
-    // FIXME:
+    {
+        impl::populate_bitwise_left_shift_dispatch_tables();
+        using impl::bitwise_left_shift_contig_dispatch_table;
+        using impl::bitwise_left_shift_output_id_table;
+        using impl::bitwise_left_shift_strided_dispatch_table;
+
+        auto bitwise_left_shift_pyapi = [&](dpctl::tensor::usm_ndarray src1,
+                                            dpctl::tensor::usm_ndarray src2,
+                                            dpctl::tensor::usm_ndarray dst,
+                                            sycl::queue exec_q,
+                                            const std::vector<sycl::event>
+                                                &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends,
+                bitwise_left_shift_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                bitwise_left_shift_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                bitwise_left_shift_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto bitwise_left_shift_result_type_pyapi = [&](py::dtype dtype1,
+                                                        py::dtype dtype2) {
+            return py_binary_ufunc_result_type(
+                dtype1, dtype2, bitwise_left_shift_output_id_table);
+        };
+        m.def("_bitwise_left_shift", bitwise_left_shift_pyapi, "",
+              py::arg("src1"), py::arg("src2"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+        m.def("_bitwise_left_shift_result_type",
+              bitwise_left_shift_result_type_pyapi, "");
+    }
 
     // U08: ===== BITWISE_INVERT        (x)
-    // FIXME:
+    {
+        impl::populate_bitwise_invert_dispatch_vectors();
+        using impl::bitwise_invert_contig_dispatch_vector;
+        using impl::bitwise_invert_output_typeid_vector;
+        using impl::bitwise_invert_strided_dispatch_vector;
+
+        auto bitwise_invert_pyapi = [&](arrayT src, arrayT dst,
+                                        sycl::queue exec_q,
+                                        const event_vecT &depends = {}) {
+            return py_unary_ufunc(src, dst, exec_q, depends,
+                                  bitwise_invert_output_typeid_vector,
+                                  bitwise_invert_contig_dispatch_vector,
+                                  bitwise_invert_strided_dispatch_vector);
+        };
+        m.def("_bitwise_invert", bitwise_invert_pyapi, "", py::arg("src"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+
+        auto bitwise_invert_result_type_pyapi = [&](py::dtype dtype) {
+            return py_unary_ufunc_result_type(
+                dtype, bitwise_invert_output_typeid_vector);
+        };
+        m.def("_bitwise_invert_result_type", bitwise_invert_result_type_pyapi);
+    }
 
     // B05: ===== BITWISE_OR            (x1, x2)
-    // FIXME:
+    {
+        impl::populate_bitwise_or_dispatch_tables();
+        using impl::bitwise_or_contig_dispatch_table;
+        using impl::bitwise_or_output_id_table;
+        using impl::bitwise_or_strided_dispatch_table;
+
+        auto bitwise_or_pyapi = [&](dpctl::tensor::usm_ndarray src1,
+                                    dpctl::tensor::usm_ndarray src2,
+                                    dpctl::tensor::usm_ndarray dst,
+                                    sycl::queue exec_q,
+                                    const std::vector<sycl::event> &depends =
+                                        {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, bitwise_or_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                bitwise_or_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                bitwise_or_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto bitwise_or_result_type_pyapi = [&](py::dtype dtype1,
+                                                py::dtype dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               bitwise_or_output_id_table);
+        };
+        m.def("_bitwise_or", bitwise_or_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_bitwise_or_result_type", bitwise_or_result_type_pyapi, "");
+    }
 
     // B06: ===== BITWISE_RIGHT_SHIFT   (x1, x2)
-    // FIXME:
+    {
+        impl::populate_bitwise_right_shift_dispatch_tables();
+        using impl::bitwise_right_shift_contig_dispatch_table;
+        using impl::bitwise_right_shift_output_id_table;
+        using impl::bitwise_right_shift_strided_dispatch_table;
+
+        auto bitwise_right_shift_pyapi = [&](dpctl::tensor::usm_ndarray src1,
+                                             dpctl::tensor::usm_ndarray src2,
+                                             dpctl::tensor::usm_ndarray dst,
+                                             sycl::queue exec_q,
+                                             const std::vector<sycl::event>
+                                                 &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends,
+                bitwise_right_shift_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                bitwise_right_shift_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                bitwise_right_shift_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto bitwise_right_shift_result_type_pyapi = [&](py::dtype dtype1,
+                                                         py::dtype dtype2) {
+            return py_binary_ufunc_result_type(
+                dtype1, dtype2, bitwise_right_shift_output_id_table);
+        };
+        m.def("_bitwise_right_shift", bitwise_right_shift_pyapi, "",
+              py::arg("src1"), py::arg("src2"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+        m.def("_bitwise_right_shift_result_type",
+              bitwise_right_shift_result_type_pyapi, "");
+    }
 
     // B07: ===== BITWISE_XOR           (x1, x2)
-    // FIXME:
+    {
+        impl::populate_bitwise_xor_dispatch_tables();
+        using impl::bitwise_xor_contig_dispatch_table;
+        using impl::bitwise_xor_output_id_table;
+        using impl::bitwise_xor_strided_dispatch_table;
+
+        auto bitwise_xor_pyapi = [&](dpctl::tensor::usm_ndarray src1,
+                                     dpctl::tensor::usm_ndarray src2,
+                                     dpctl::tensor::usm_ndarray dst,
+                                     sycl::queue exec_q,
+                                     const std::vector<sycl::event> &depends =
+                                         {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, bitwise_xor_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                bitwise_xor_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                bitwise_xor_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto bitwise_xor_result_type_pyapi = [&](py::dtype dtype1,
+                                                 py::dtype dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               bitwise_xor_output_id_table);
+        };
+        m.def("_bitwise_xor", bitwise_xor_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_bitwise_xor_result_type", bitwise_xor_result_type_pyapi, "");
+    }
 
     // U09: ==== CEIL          (x)
     {

From 7490f29704ac136c77b46be37eac86e36ba3043d Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Wed, 26 Jul 2023 12:17:45 -0500
Subject: [PATCH 2/6] Closes gh-1287

Adds logic in vec_cast to return the input if dstT is the same as
srcT.
---
 dpctl/tensor/libtensor/include/utils/type_utils.hpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/dpctl/tensor/libtensor/include/utils/type_utils.hpp b/dpctl/tensor/libtensor/include/utils/type_utils.hpp
index 36c00404c9..4ea17ac730 100644
--- a/dpctl/tensor/libtensor/include/utils/type_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/type_utils.hpp
@@ -113,7 +113,13 @@ template <typename dstT,
           typename Indices = std::make_index_sequence<N>>
 auto vec_cast(const sycl::vec<srcT, N> &s)
 {
-    return vec_cast_impl<sycl::vec<dstT, N>, sycl::vec<srcT, N>>(s, Indices{});
+    if constexpr (std::is_same_v<srcT, dstT>) {
+        return s;
+    }
+    else {
+        return vec_cast_impl<sycl::vec<dstT, N>, sycl::vec<srcT, N>>(s,
+                                                                     Indices{});
+    }
 }
 
 } // namespace type_utils

From a1a69be30ae8eb09840d216470d048e8acc21ecf Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Wed, 26 Jul 2023 13:22:34 -0500
Subject: [PATCH 3/6] Adds bitwise functions to dpctl.tensor namespace

Adds tensor.bitwise_invert, tensor.bitwise_and, tensor.bitwise_or,
tensor.bitwise_xor, tensor.bitwise_left_shift,
tensor.bitwise_right_shift
---
 dpctl/tensor/__init__.py           |  12 ++
 dpctl/tensor/_elementwise_funcs.py | 190 ++++++++++++++++++++++++++++-
 2 files changed, 196 insertions(+), 6 deletions(-)

diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py
index 60c95bc308..7ac58a7f67 100644
--- a/dpctl/tensor/__init__.py
+++ b/dpctl/tensor/__init__.py
@@ -100,6 +100,12 @@
     asinh,
     atan,
     atanh,
+    bitwise_and,
+    bitwise_invert,
+    bitwise_left_shift,
+    bitwise_or,
+    bitwise_right_shift,
+    bitwise_xor,
     ceil,
     conj,
     cos,
@@ -232,6 +238,12 @@
     "asinh",
     "atan",
     "atanh",
+    "bitwise_and",
+    "bitwise_invert",
+    "bitwise_left_shift",
+    "bitwise_or",
+    "bitwise_right_shift",
+    "bitwise_xor",
     "ceil",
     "conj",
     "cos",
diff --git a/dpctl/tensor/_elementwise_funcs.py b/dpctl/tensor/_elementwise_funcs.py
index 5a081197c5..14d69417bb 100644
--- a/dpctl/tensor/_elementwise_funcs.py
+++ b/dpctl/tensor/_elementwise_funcs.py
@@ -238,22 +238,200 @@
 )
 
 # B03: ===== BITWISE_AND           (x1, x2)
-# FIXME: implemetn B03
+_bitwise_and_docstring_ = """
+bitwise_and(x1, x2, out=None, order='K')
+
+Computes the bitwise AND of the underlying binary representation of each
+element `x1_i` of the input array `x1` with the respective element `x2_i`
+of the input array `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array, expected to have integer or boolean data type.
+    x2 (usm_ndarray):
+        Second input array, also expected to have integer or boolean data
+        type.
+    out ({None, usm_ndarray}, optional):
+        Output array to populate.
+        Array have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the newly output array, if parameter `out` is `None`.
+        Default: "K".
+Returns:
+    usm_narray:
+        An array containing the element-wise results. The data type
+        of the returned array is determined by the Type Promotion Rules.
+"""
+
+bitwise_and = BinaryElementwiseFunc(
+    "bitwise_and",
+    ti._bitwise_and_result_type,
+    ti._bitwise_and,
+    _bitwise_and_docstring_,
+)
 
 # B04: ===== BITWISE_LEFT_SHIFT    (x1, x2)
-# FIXME: implement B04
+_bitwise_left_shift_docstring_ = """
+bitwise_left_shift(x1, x2, out=None, order='K')
+
+Shifts the bits of each element `x1_i` of the input array x1 to the left by
+appending `x2_i` (i.e., the respective element in the input array `x2`) zeros to
+the right of `x1_i`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array, expected to have integer data type.
+    x2 (usm_ndarray):
+        Second input array, also expected to have integer data type.
+        Each element must be greater than or equal to 0.
+    out ({None, usm_ndarray}, optional):
+        Output array to populate.
+        Array have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the newly output array, if parameter `out` is `None`.
+        Default: "K".
+Returns:
+    usm_narray:
+        An array containing the element-wise results. The data type
+        of the returned array is determined by the Type Promotion Rules.
+"""
+
+bitwise_left_shift = BinaryElementwiseFunc(
+    "bitwise_left_shift",
+    ti._bitwise_left_shift_result_type,
+    ti._bitwise_left_shift,
+    _bitwise_left_shift_docstring_,
+)
+
 
 # U08: ===== BITWISE_INVERT        (x)
-# FIXME: implement U08
+_bitwise_invert_docstring = """
+bitwise_invert(x, out=None, order='K')
+
+Inverts (flips) each bit for each element `x_i` of the input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have integer or boolean data type.
+    out ({None, usm_ndarray}, optional):
+        Output array to populate.
+        Array have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the newly output array, if parameter `out` is `None`.
+        Default: "K".
+Returns:
+    usm_narray:
+        An array containing the element-wise results.
+        The data type of the returned array is same as the data type of the
+        input array.
+"""
+
+bitwise_invert = UnaryElementwiseFunc(
+    "bitwise_invert",
+    ti._bitwise_invert_result_type,
+    ti._bitwise_invert,
+    _bitwise_invert_docstring,
+)
 
 # B05: ===== BITWISE_OR            (x1, x2)
-# FIXME: implement B05
+_bitwise_or_docstring_ = """
+bitwise_or(x1, x2, out=None, order='K')
+
+Computes the bitwise OR of the underlying binary representation of each
+element `x1_i` of the input array `x1` with the respective element `x2_i`
+of the input array `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array, expected to have integer or boolean data type.
+    x2 (usm_ndarray):
+        Second input array, also expected to have integer or boolean data
+        type.
+    out ({None, usm_ndarray}, optional):
+        Output array to populate.
+        Array have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the newly output array, if parameter `out` is `None`.
+        Default: "K".
+Returns:
+    usm_narray:
+        An array containing the element-wise results. The data type
+        of the returned array is determined by the Type Promotion Rules.
+"""
+
+bitwise_or = BinaryElementwiseFunc(
+    "bitwise_or",
+    ti._bitwise_or_result_type,
+    ti._bitwise_or,
+    _bitwise_or_docstring_,
+)
 
 # B06: ===== BITWISE_RIGHT_SHIFT   (x1, x2)
-# FIXME: implement B06
+_bitwise_right_shift_docstring_ = """
+bitwise_right_shift(x1, x2, out=None, order='K')
+
+Shifts the bits of each element `x1_i` of the input array `x1` to the right
+according to the respective element `x2_i` of the input array `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array, expected to have integer data type.
+    x2 (usm_ndarray):
+        Second input array, also expected to have integer data type.
+        Each element must be greater than or equal to 0.
+    out ({None, usm_ndarray}, optional):
+        Output array to populate.
+        Array have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the newly output array, if parameter `out` is `None`.
+        Default: "K".
+Returns:
+    usm_narray:
+        An array containing the element-wise results. The data type
+        of the returned array is determined by the Type Promotion Rules.
+"""
+
+bitwise_right_shift = BinaryElementwiseFunc(
+    "bitwise_right_shift",
+    ti._bitwise_right_shift_result_type,
+    ti._bitwise_right_shift,
+    _bitwise_right_shift_docstring_,
+)
+
 
 # B07: ===== BITWISE_XOR           (x1, x2)
-# FIXME: implement B07
+_bitwise_xor_docstring_ = """
+bitwise_xor(x1, x2, out=None, order='K')
+
+Computes the bitwise XOR of the underlying binary representation of each
+element `x1_i` of the input array `x1` with the respective element `x2_i`
+of the input array `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array, expected to have integer or boolean data type.
+    x2 (usm_ndarray):
+        Second input array, also expected to have integer or boolean data
+        type.
+    out ({None, usm_ndarray}, optional):
+        Output array to populate.
+        Array have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the newly output array, if parameter `out` is `None`.
+        Default: "K".
+Returns:
+    usm_narray:
+        An array containing the element-wise results. The data type
+        of the returned array is determined by the Type Promotion Rules.
+"""
+
+bitwise_xor = BinaryElementwiseFunc(
+    "bitwise_xor",
+    ti._bitwise_xor_result_type,
+    ti._bitwise_xor,
+    _bitwise_xor_docstring_,
+)
+
 
 # U09: ==== CEIL          (x)
 _ceil_docstring = """

From d92695a7ca7690886ca9e2b98a3e7bc1e069e5e5 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Wed, 26 Jul 2023 21:09:07 -0500
Subject: [PATCH 4/6] Adding tests for bitwise functions

---
 dpctl/tests/elementwise/test_bitwise_and.py   |  87 +++++++++++++
 .../tests/elementwise/test_bitwise_invert.py  | 119 ++++++++++++++++++
 .../elementwise/test_bitwise_left_shift.py    |  87 +++++++++++++
 dpctl/tests/elementwise/test_bitwise_or.py    |  87 +++++++++++++
 .../elementwise/test_bitwise_right_shift.py   |  87 +++++++++++++
 dpctl/tests/elementwise/test_bitwise_xor.py   |  87 +++++++++++++
 dpctl/tests/elementwise/utils.py              |  28 ++---
 7 files changed, 563 insertions(+), 19 deletions(-)
 create mode 100644 dpctl/tests/elementwise/test_bitwise_and.py
 create mode 100644 dpctl/tests/elementwise/test_bitwise_invert.py
 create mode 100644 dpctl/tests/elementwise/test_bitwise_left_shift.py
 create mode 100644 dpctl/tests/elementwise/test_bitwise_or.py
 create mode 100644 dpctl/tests/elementwise/test_bitwise_right_shift.py
 create mode 100644 dpctl/tests/elementwise/test_bitwise_xor.py

diff --git a/dpctl/tests/elementwise/test_bitwise_and.py b/dpctl/tests/elementwise/test_bitwise_and.py
new file mode 100644
index 0000000000..b3a5bd665b
--- /dev/null
+++ b/dpctl/tests/elementwise/test_bitwise_and.py
@@ -0,0 +1,87 @@
+#                       Data Parallel Control (dpctl)
+#
+#  Copyright 2023 Intel Corporation
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless_equal required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import numpy as np
+import pytest
+
+import dpctl.tensor as dpt
+from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
+
+from .utils import _integral_dtypes
+
+
+@pytest.mark.parametrize("op_dtype", _integral_dtypes)
+def test_bitwise_and_dtype_matrix_contig(op_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op_dtype, q)
+
+    sz = 7
+    n = 2 * sz
+    dt1 = dpt.dtype(op_dtype)
+    dt2 = dpt.dtype(op_dtype)
+
+    x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0
+    x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)
+
+    x2_range_begin = -sz if dpt.iinfo(dt2).min < 0 else 0
+    x2 = dpt.arange(x2_range_begin, x2_range_begin + n, dtype=dt1)
+
+    r = dpt.bitwise_and(x1, x2)
+    assert isinstance(r, dpt.usm_ndarray)
+
+    x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op_dtype)
+    x2_np = np.arange(x2_range_begin, x2_range_begin + n, dtype=op_dtype)
+    r_np = np.bitwise_and(x1_np, x2_np)
+
+    assert (r_np == dpt.asnumpy(r)).all()
+
+
+@pytest.mark.parametrize("op_dtype", _integral_dtypes)
+def test_bitwise_and_dtype_matrix_strided(op_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op_dtype, q)
+
+    sz = 11
+    n = 2 * sz
+    dt1 = dpt.dtype(op_dtype)
+    dt2 = dpt.dtype(op_dtype)
+
+    x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0
+    x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)[::2]
+
+    x2_range_begin = -(sz // 2) if dpt.iinfo(dt2).min < 0 else 0
+    x2 = dpt.arange(x2_range_begin, x2_range_begin + n, dtype=dt1)[::-2]
+
+    r = dpt.bitwise_and(x1, x2)
+    assert isinstance(r, dpt.usm_ndarray)
+
+    x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op_dtype)[::2]
+    x2_np = np.arange(x2_range_begin, x2_range_begin + n, dtype=op_dtype)[::-2]
+    r_np = np.bitwise_and(x1_np, x2_np)
+
+    assert (r_np == dpt.asnumpy(r)).all()
+
+
+def test_bitwise_and_bool():
+    get_queue_or_skip()
+
+    x1 = dpt.asarray([True, False])
+    x2 = dpt.asarray([False, True])
+
+    r_bw = dpt.bitwise_and(x1[:, dpt.newaxis], x2[dpt.newaxis])
+    r_lo = dpt.logical_and(x1[:, dpt.newaxis], x2[dpt.newaxis])
+
+    assert dpt.all(dpt.equal(r_bw, r_lo))
diff --git a/dpctl/tests/elementwise/test_bitwise_invert.py b/dpctl/tests/elementwise/test_bitwise_invert.py
new file mode 100644
index 0000000000..1850dcfb10
--- /dev/null
+++ b/dpctl/tests/elementwise/test_bitwise_invert.py
@@ -0,0 +1,119 @@
+#                       Data Parallel Control (dpctl)
+#
+#  Copyright 2023 Intel Corporation
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless_equal required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import numpy as np
+import pytest
+
+import dpctl.tensor as dpt
+from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
+
+from .utils import _compare_dtypes, _integral_dtypes, _usm_types
+
+
+@pytest.mark.parametrize(
+    "op_dtype",
+    [
+        "b1",
+    ]
+    + _integral_dtypes,
+)
+def test_bitwise_invert_dtype_matrix(op_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op_dtype, q)
+
+    sz = 7
+    ar1 = dpt.asarray(np.random.randint(0, 2, sz), dtype=op_dtype)
+
+    r = dpt.bitwise_invert(ar1)
+    assert isinstance(r, dpt.usm_ndarray)
+    assert r.dtype == ar1.dtype
+
+    expected = np.bitwise_not(dpt.asnumpy(ar1))
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar1.shape
+    assert (dpt.asnumpy(r) == expected).all()
+    assert r.sycl_queue == ar1.sycl_queue
+
+    r2 = dpt.empty_like(r, dtype=r.dtype)
+    dpt.bitwise_invert(ar1, out=r2)
+    assert dpt.all(dpt.equal(r, r2))
+
+    ar2 = dpt.zeros(sz, dtype=op_dtype)
+    r = dpt.bitwise_invert(ar2[::-1])
+    assert isinstance(r, dpt.usm_ndarray)
+
+    expected = np.bitwise_not(np.zeros(ar2.shape, dtype=op_dtype))
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar2.shape
+    assert (dpt.asnumpy(r) == expected).all()
+
+    ar3 = dpt.ones(sz, dtype=op_dtype)
+    r2 = dpt.bitwise_invert(ar3[::2])
+    assert isinstance(r, dpt.usm_ndarray)
+
+    expected = np.bitwise_not(np.ones(ar3.shape, dtype=op_dtype)[::2])
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert (dpt.asnumpy(r2) == expected).all()
+
+    r3 = dpt.empty_like(r, dtype=r.dtype)
+    dpt.bitwise_invert(ar2[::-1], out=r3)
+    assert dpt.all(dpt.equal(r, r3))
+
+
+@pytest.mark.parametrize("op_usm_type", _usm_types)
+def test_bitwise_invert_usm_type_matrix(op_usm_type):
+    get_queue_or_skip()
+
+    sz = 128
+    ar1 = dpt.asarray(
+        np.random.randint(0, 2, sz), dtype="i4", usm_type=op_usm_type
+    )
+
+    r = dpt.bitwise_invert(ar1)
+    assert isinstance(r, dpt.usm_ndarray)
+    assert r.usm_type == op_usm_type
+
+
+def test_bitwise_invert_order():
+    get_queue_or_skip()
+
+    ar1 = dpt.ones((20, 20), dtype="i4", order="C")
+    r1 = dpt.bitwise_invert(ar1, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.bitwise_invert(ar1, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.bitwise_invert(ar1, order="A")
+    assert r3.flags.c_contiguous
+    r4 = dpt.bitwise_invert(ar1, order="K")
+    assert r4.flags.c_contiguous
+
+    ar1 = dpt.zeros((20, 20), dtype="i4", order="F")
+    r1 = dpt.bitwise_invert(ar1, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.bitwise_invert(ar1, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.bitwise_invert(ar1, order="A")
+    assert r3.flags.f_contiguous
+    r4 = dpt.bitwise_invert(ar1, order="K")
+    assert r4.flags.f_contiguous
+
+    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
+    r4 = dpt.bitwise_invert(ar1, order="K")
+    assert r4.strides == (20, -1)
+
+    ar1 = dpt.zeros((40, 40), dtype="i4", order="C")[:20, ::-2].mT
+    r4 = dpt.bitwise_invert(ar1, order="K")
+    assert r4.strides == (-1, 20)
diff --git a/dpctl/tests/elementwise/test_bitwise_left_shift.py b/dpctl/tests/elementwise/test_bitwise_left_shift.py
new file mode 100644
index 0000000000..34e08850cb
--- /dev/null
+++ b/dpctl/tests/elementwise/test_bitwise_left_shift.py
@@ -0,0 +1,87 @@
+#                       Data Parallel Control (dpctl)
+#
+#  Copyright 2023 Intel Corporation
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless_equal required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import numpy as np
+import pytest
+
+import dpctl.tensor as dpt
+from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
+
+from .utils import _integral_dtypes
+
+
+@pytest.mark.parametrize("op1_dtype", _integral_dtypes)
+@pytest.mark.parametrize("op2_dtype", _integral_dtypes)
+def test_bitwise_left_shift_dtype_matrix_contig(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    if op1_dtype != op2_dtype and "u8" in [op1_dtype, op2_dtype]:
+        return
+
+    sz = 7
+    n = 2 * sz
+    dt1 = dpt.dtype(op1_dtype)
+    dt2 = dpt.dtype(op2_dtype)
+
+    x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0
+    x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)
+    x2 = dpt.arange(0, n, dtype=dt2)
+
+    r = dpt.bitwise_left_shift(x1, x2)
+    assert isinstance(r, dpt.usm_ndarray)
+    assert r.sycl_queue == x1.sycl_queue
+    assert r.sycl_queue == x2.sycl_queue
+
+    x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op1_dtype)
+    x2_np = np.arange(0, n, dtype=op2_dtype)
+    r_np = np.left_shift(x1_np, x2_np)
+
+    assert r.dtype == r_np.dtype
+    assert (dpt.asnumpy(r) == r_np).all()
+
+
+@pytest.mark.parametrize("op1_dtype", _integral_dtypes)
+@pytest.mark.parametrize("op2_dtype", _integral_dtypes)
+def test_bitwise_left_shift_dtype_matrix_strided(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    if op1_dtype != op2_dtype and "u8" in [op1_dtype, op2_dtype]:
+        return
+
+    sz = 11
+    n = 2 * sz
+    dt1 = dpt.dtype(op1_dtype)
+    dt2 = dpt.dtype(op2_dtype)
+
+    x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0
+    x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)[::-2]
+    x2 = dpt.arange(0, n, dtype=dt2)[::2]
+
+    r = dpt.bitwise_left_shift(x1, x2)
+    assert isinstance(r, dpt.usm_ndarray)
+    assert r.sycl_queue == x1.sycl_queue
+    assert r.sycl_queue == x2.sycl_queue
+
+    x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)[::-2]
+    x2_np = np.arange(0, n, dtype=dt2)[::2]
+    r_np = np.left_shift(x1_np, x2_np)
+
+    assert r.dtype == r_np.dtype
+    assert (dpt.asnumpy(r) == r_np).all()
diff --git a/dpctl/tests/elementwise/test_bitwise_or.py b/dpctl/tests/elementwise/test_bitwise_or.py
new file mode 100644
index 0000000000..d273bd1507
--- /dev/null
+++ b/dpctl/tests/elementwise/test_bitwise_or.py
@@ -0,0 +1,87 @@
+#                       Data Parallel Control (dpctl)
+#
+#  Copyright 2023 Intel Corporation
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless_equal required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import numpy as np
+import pytest
+
+import dpctl.tensor as dpt
+from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
+
+from .utils import _integral_dtypes
+
+
+@pytest.mark.parametrize("op_dtype", _integral_dtypes)
+def test_bitwise_or_dtype_matrix_contig(op_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op_dtype, q)
+
+    sz = 7
+    n = 2 * sz
+    dt1 = dpt.dtype(op_dtype)
+    dt2 = dpt.dtype(op_dtype)
+
+    x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0
+    x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)
+
+    x2_range_begin = -sz if dpt.iinfo(dt2).min < 0 else 0
+    x2 = dpt.arange(x2_range_begin, x2_range_begin + n, dtype=dt1)
+
+    r = dpt.bitwise_or(x1, x2)
+    assert isinstance(r, dpt.usm_ndarray)
+
+    x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op_dtype)
+    x2_np = np.arange(x2_range_begin, x2_range_begin + n, dtype=op_dtype)
+    r_np = np.bitwise_or(x1_np, x2_np)
+
+    assert (r_np == dpt.asnumpy(r)).all()
+
+
+@pytest.mark.parametrize("op_dtype", _integral_dtypes)
+def test_bitwise_or_dtype_matrix_strided(op_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op_dtype, q)
+
+    sz = 11
+    n = 2 * sz
+    dt1 = dpt.dtype(op_dtype)
+    dt2 = dpt.dtype(op_dtype)
+
+    x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0
+    x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)[::2]
+
+    x2_range_begin = -(sz // 2) if dpt.iinfo(dt2).min < 0 else 0
+    x2 = dpt.arange(x2_range_begin, x2_range_begin + n, dtype=dt1)[::-2]
+
+    r = dpt.bitwise_or(x1, x2)
+    assert isinstance(r, dpt.usm_ndarray)
+
+    x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op_dtype)[::2]
+    x2_np = np.arange(x2_range_begin, x2_range_begin + n, dtype=op_dtype)[::-2]
+    r_np = np.bitwise_or(x1_np, x2_np)
+
+    assert (r_np == dpt.asnumpy(r)).all()
+
+
+def test_bitwise_or_bool():
+    get_queue_or_skip()
+
+    x1 = dpt.asarray([True, False])
+    x2 = dpt.asarray([False, True])
+
+    r_bw = dpt.bitwise_or(x1[:, dpt.newaxis], x2[dpt.newaxis])
+    r_lo = dpt.logical_or(x1[:, dpt.newaxis], x2[dpt.newaxis])
+
+    assert dpt.all(dpt.equal(r_bw, r_lo))
diff --git a/dpctl/tests/elementwise/test_bitwise_right_shift.py b/dpctl/tests/elementwise/test_bitwise_right_shift.py
new file mode 100644
index 0000000000..c69754c43e
--- /dev/null
+++ b/dpctl/tests/elementwise/test_bitwise_right_shift.py
@@ -0,0 +1,87 @@
+#                       Data Parallel Control (dpctl)
+#
+#  Copyright 2023 Intel Corporation
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless_equal required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import numpy as np
+import pytest
+
+import dpctl.tensor as dpt
+from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
+
+from .utils import _integral_dtypes
+
+
+@pytest.mark.parametrize("op1_dtype", _integral_dtypes)
+@pytest.mark.parametrize("op2_dtype", _integral_dtypes)
+def test_bitwise_right_shift_dtype_matrix_contig(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    if op1_dtype != op2_dtype and "u8" in [op1_dtype, op2_dtype]:
+        return
+
+    sz = 7
+    n = 2 * sz
+    dt1 = dpt.dtype(op1_dtype)
+    dt2 = dpt.dtype(op2_dtype)
+
+    x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0
+    x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)
+    x2 = dpt.arange(0, n, dtype=dt2)
+
+    r = dpt.bitwise_right_shift(x1, x2)
+    assert isinstance(r, dpt.usm_ndarray)
+    assert r.sycl_queue == x1.sycl_queue
+    assert r.sycl_queue == x2.sycl_queue
+
+    x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op1_dtype)
+    x2_np = np.arange(0, n, dtype=op2_dtype)
+    r_np = np.right_shift(x1_np, x2_np)
+
+    assert r.dtype == r_np.dtype
+    assert (dpt.asnumpy(r) == r_np).all()
+
+
+@pytest.mark.parametrize("op1_dtype", _integral_dtypes)
+@pytest.mark.parametrize("op2_dtype", _integral_dtypes)
+def test_bitwise_left_shift_dtype_matrix_strided(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    if op1_dtype != op2_dtype and "u8" in [op1_dtype, op2_dtype]:
+        return
+
+    sz = 11
+    n = 2 * sz
+    dt1 = dpt.dtype(op1_dtype)
+    dt2 = dpt.dtype(op2_dtype)
+
+    x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0
+    x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)[::-2]
+    x2 = dpt.arange(0, n, dtype=dt2)[::2]
+
+    r = dpt.bitwise_right_shift(x1, x2)
+    assert isinstance(r, dpt.usm_ndarray)
+    assert r.sycl_queue == x1.sycl_queue
+    assert r.sycl_queue == x2.sycl_queue
+
+    x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)[::-2]
+    x2_np = np.arange(0, n, dtype=dt2)[::2]
+    r_np = np.right_shift(x1_np, x2_np)
+
+    assert r.dtype == r_np.dtype
+    assert (dpt.asnumpy(r) == r_np).all()
diff --git a/dpctl/tests/elementwise/test_bitwise_xor.py b/dpctl/tests/elementwise/test_bitwise_xor.py
new file mode 100644
index 0000000000..b2cb11bc84
--- /dev/null
+++ b/dpctl/tests/elementwise/test_bitwise_xor.py
@@ -0,0 +1,87 @@
+#                       Data Parallel Control (dpctl)
+#
+#  Copyright 2023 Intel Corporation
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless_equal required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import numpy as np
+import pytest
+
+import dpctl.tensor as dpt
+from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
+
+from .utils import _integral_dtypes
+
+
+@pytest.mark.parametrize("op_dtype", _integral_dtypes)
+def test_bitwise_xor_dtype_matrix_contig(op_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op_dtype, q)
+
+    sz = 7
+    n = 2 * sz
+    dt1 = dpt.dtype(op_dtype)
+    dt2 = dpt.dtype(op_dtype)
+
+    x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0
+    x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)
+
+    x2_range_begin = -sz if dpt.iinfo(dt2).min < 0 else 0
+    x2 = dpt.arange(x2_range_begin, x2_range_begin + n, dtype=dt1)
+
+    r = dpt.bitwise_xor(x1, x2)
+    assert isinstance(r, dpt.usm_ndarray)
+
+    x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op_dtype)
+    x2_np = np.arange(x2_range_begin, x2_range_begin + n, dtype=op_dtype)
+    r_np = np.bitwise_xor(x1_np, x2_np)
+
+    assert (r_np == dpt.asnumpy(r)).all()
+
+
+@pytest.mark.parametrize("op_dtype", _integral_dtypes)
+def test_bitwise_xor_dtype_matrix_strided(op_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op_dtype, q)
+
+    sz = 11
+    n = 2 * sz
+    dt1 = dpt.dtype(op_dtype)
+    dt2 = dpt.dtype(op_dtype)
+
+    x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0
+    x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)[::2]
+
+    x2_range_begin = -(sz // 2) if dpt.iinfo(dt2).min < 0 else 0
+    x2 = dpt.arange(x2_range_begin, x2_range_begin + n, dtype=dt1)[::-2]
+
+    r = dpt.bitwise_xor(x1, x2)
+    assert isinstance(r, dpt.usm_ndarray)
+
+    x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op_dtype)[::2]
+    x2_np = np.arange(x2_range_begin, x2_range_begin + n, dtype=op_dtype)[::-2]
+    r_np = np.bitwise_xor(x1_np, x2_np)
+
+    assert (r_np == dpt.asnumpy(r)).all()
+
+
+def test_bitwise_xor_bool():
+    get_queue_or_skip()
+
+    x1 = dpt.asarray([True, False])
+    x2 = dpt.asarray([False, True])
+
+    r_bw = dpt.bitwise_xor(x1[:, dpt.newaxis], x2[dpt.newaxis])
+    r_lo = dpt.logical_xor(x1[:, dpt.newaxis], x2[dpt.newaxis])
+
+    assert dpt.all(dpt.equal(r_bw, r_lo))
diff --git a/dpctl/tests/elementwise/utils.py b/dpctl/tests/elementwise/utils.py
index 8ff339e7d3..69bb6e5b89 100644
--- a/dpctl/tests/elementwise/utils.py
+++ b/dpctl/tests/elementwise/utils.py
@@ -17,21 +17,7 @@
 import dpctl
 import dpctl.tensor._type_utils as tu
 
-_no_complex_dtypes = [
-    "b1",
-    "i1",
-    "u1",
-    "i2",
-    "u2",
-    "i4",
-    "u4",
-    "i8",
-    "u8",
-    "f2",
-    "f4",
-    "f8",
-]
-_real_value_dtypes = [
+_integral_dtypes = [
     "i1",
     "u1",
     "i2",
@@ -40,14 +26,18 @@
     "u4",
     "i8",
     "u8",
-    "f2",
-    "f4",
-    "f8",
 ]
-_all_dtypes = _no_complex_dtypes + [
+_real_fp_dtypes = ["f2", "f4", "f8"]
+_complex_fp_dtypes = [
     "c8",
     "c16",
 ]
+_real_value_dtypes = _integral_dtypes + _real_fp_dtypes
+_no_complex_dtypes = [
+    "b1",
+] + _real_value_dtypes
+_all_dtypes = _no_complex_dtypes + _complex_fp_dtypes
+
 _usm_types = ["device", "shared", "host"]
 
 

From a21b87186f343419e3cf6c65e227eed0859d5307 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Tue, 1 Aug 2023 09:34:08 -0500
Subject: [PATCH 5/6] Bitshift for second operand in excess of first arg bit
 count is undefined behavior

The array API extends the definition to defined bitshift operators to be zero for
shift-size outside of bitwidth range.

```
In [1]: import dpctl.tensor as dpt, numpy as np

In [2]:  x = dpt.asarray([1, 1])

In [3]: y = dpt.asarray(64)

In [4]: dpt.bitwise_right_shift(x, y)
Out[4]: usm_ndarray([0, 0])

In [5]: x_np, y_np = dpt.asnumpy(x), dpt.asnumpy(y)

In [6]: np.right_shift(x_np, y_np)
Out[6]: array([0, 0])
```
---
 .../bitwise_left_shift.hpp                    | 39 +++++++++-------
 .../bitwise_right_shift.hpp                   | 46 +++++++++++--------
 2 files changed, 51 insertions(+), 34 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
index 1a1f75bb2a..96deccc0a3 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
@@ -60,31 +60,38 @@ struct BitwiseLeftShiftFunctor
     using supports_sg_loadstore = typename std::true_type;
     using supports_vec = typename std::true_type;
 
-    resT operator()(const argT1 &in1, const argT2 &in2)
+    resT operator()(const argT1 &in1, const argT2 &in2) const
     {
-        if constexpr (std::is_unsigned_v<argT2>) {
-            return (in1 << in2);
-        }
-        else {
-            return (in2 < argT2(0)) ? resT(0) : (in1 << in2);
-        }
+        return impl(in1, in2);
     }
 
     template <int vec_sz>
     sycl::vec<resT, vec_sz> operator()(const sycl::vec<argT1, vec_sz> &in1,
                                        const sycl::vec<argT2, vec_sz> &in2)
     {
-        if constexpr (std::is_same_v<argT1, argT2> && std::is_unsigned_v<argT2>)
-        {
-            return (in1 << in2);
+        sycl::vec<resT, vec_sz> res;
+#pragma unroll
+        for (int i = 0; i < vec_sz; ++i) {
+            res[i] = impl(in1[i], in2[i]);
+        }
+        return res;
+    }
+
+private:
+    resT impl(const argT1 &in1, const argT2 &in2) const
+    {
+        constexpr argT2 in1_bitsize = static_cast<argT2>(sizeof(argT1) * 8);
+        constexpr resT zero = resT(0);
+
+        // bitshift op with second operand negative, or >= bitwidth(argT1) is UB
+        // array API spec mandates 0
+        if constexpr (std::is_unsigned_v<argT2>) {
+            return (in2 < in1_bitsize) ? (in1 << in2) : zero;
         }
         else {
-            sycl::vec<resT, vec_sz> res;
-#pragma unroll
-            for (int i = 0; i < vec_sz; ++i) {
-                res[i] = (in2[i] < argT2(0)) ? resT(0) : (in1[i] << in2[i]);
-            }
-            return res;
+            return (in2 < argT2(0))
+                       ? zero
+                       : ((in2 < in1_bitsize) ? (in1 << in2) : zero);
         }
     }
 };
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
index 7f34af8c4a..c2666bb1cd 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
@@ -59,31 +59,41 @@ struct BitwiseRightShiftFunctor
     using supports_sg_loadstore = typename std::true_type;
     using supports_vec = typename std::true_type;
 
-    resT operator()(const argT1 &in1, const argT2 &in2)
+    resT operator()(const argT1 &in1, const argT2 &in2) const
     {
-        if constexpr (std::is_unsigned_v<argT2>) {
-            return (in1 >> in2);
-        }
-        else {
-            return (in2 < argT2(0)) ? resT(0) : (in1 >> in2);
-        }
+        return impl(in1, in2);
     }
 
     template <int vec_sz>
-    sycl::vec<resT, vec_sz> operator()(const sycl::vec<argT1, vec_sz> &in1,
-                                       const sycl::vec<argT2, vec_sz> &in2)
+    sycl::vec<resT, vec_sz>
+    operator()(const sycl::vec<argT1, vec_sz> &in1,
+               const sycl::vec<argT2, vec_sz> &in2) const
     {
-        if constexpr (std::is_same_v<argT1, argT2> && std::is_unsigned_v<argT2>)
-        {
-            return (in1 >> in2);
+        sycl::vec<resT, vec_sz> res;
+#pragma unroll
+        for (int i = 0; i < vec_sz; ++i) {
+            res[i] = impl(in1[i], in2[i]);
+        }
+        return res;
+    }
+
+private:
+    resT impl(const argT1 &in1, const argT2 &in2) const
+    {
+        constexpr argT2 in1_bitsize = static_cast<argT2>(sizeof(argT1) * 8);
+        constexpr resT zero = resT(0);
+
+        // bitshift op with second operand negative, or >= bitwidth(argT1) is UB
+        // array API spec mandates 0
+        if constexpr (std::is_unsigned_v<argT2>) {
+            return (in2 < in1_bitsize) ? (in1 >> in2) : zero;
         }
         else {
-            sycl::vec<resT, vec_sz> res;
-#pragma unroll
-            for (int i = 0; i < vec_sz; ++i) {
-                res[i] = (in2[i] < argT2(0)) ? resT(0) : (in1[i] >> in2[i]);
-            }
-            return res;
+            return (in2 < argT2(0))
+                       ? zero
+                       : ((in2 < in1_bitsize)
+                              ? (in1 >> in2)
+                              : (in1 < argT1(0) ? resT(-1) : zero));
         }
     }
 };

From 9a99003fe206860140b2a3cf02da32e9e9e75f13 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Tue, 1 Aug 2023 10:16:43 -0500
Subject: [PATCH 6/6] Add tests for bitshift of large size

---
 dpctl/tests/elementwise/test_bitwise_left_shift.py | 12 ++++++++++++
 .../tests/elementwise/test_bitwise_right_shift.py  | 14 +++++++++++++-
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/dpctl/tests/elementwise/test_bitwise_left_shift.py b/dpctl/tests/elementwise/test_bitwise_left_shift.py
index 34e08850cb..cee1019353 100644
--- a/dpctl/tests/elementwise/test_bitwise_left_shift.py
+++ b/dpctl/tests/elementwise/test_bitwise_left_shift.py
@@ -85,3 +85,15 @@ def test_bitwise_left_shift_dtype_matrix_strided(op1_dtype, op2_dtype):
 
     assert r.dtype == r_np.dtype
     assert (dpt.asnumpy(r) == r_np).all()
+
+
+@pytest.mark.parametrize("op_dtype", _integral_dtypes)
+def test_bitwise_left_shift_range(op_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op_dtype, q)
+
+    x = dpt.ones(255, dtype=op_dtype)
+    y = dpt.asarray(64, dtype=op_dtype)
+
+    z = dpt.bitwise_left_shift(x, y)
+    assert dpt.all(dpt.equal(z, 0))
diff --git a/dpctl/tests/elementwise/test_bitwise_right_shift.py b/dpctl/tests/elementwise/test_bitwise_right_shift.py
index c69754c43e..ceadb9414d 100644
--- a/dpctl/tests/elementwise/test_bitwise_right_shift.py
+++ b/dpctl/tests/elementwise/test_bitwise_right_shift.py
@@ -57,7 +57,7 @@ def test_bitwise_right_shift_dtype_matrix_contig(op1_dtype, op2_dtype):
 
 @pytest.mark.parametrize("op1_dtype", _integral_dtypes)
 @pytest.mark.parametrize("op2_dtype", _integral_dtypes)
-def test_bitwise_left_shift_dtype_matrix_strided(op1_dtype, op2_dtype):
+def test_bitwise_right_shift_dtype_matrix_strided(op1_dtype, op2_dtype):
     q = get_queue_or_skip()
     skip_if_dtype_not_supported(op1_dtype, q)
     skip_if_dtype_not_supported(op2_dtype, q)
@@ -85,3 +85,15 @@ def test_bitwise_left_shift_dtype_matrix_strided(op1_dtype, op2_dtype):
 
     assert r.dtype == r_np.dtype
     assert (dpt.asnumpy(r) == r_np).all()
+
+
+@pytest.mark.parametrize("op_dtype", _integral_dtypes)
+def test_bitwise_right_shift_range(op_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op_dtype, q)
+
+    x = dpt.ones(255, dtype=op_dtype)
+    y = dpt.asarray(64, dtype=op_dtype)
+
+    z = dpt.bitwise_right_shift(x, y)
+    assert dpt.all(dpt.equal(z, 0))