diff --git a/dpctl/.gitignore b/dpctl/.gitignore index 3e23a8af25..3376d3081c 100644 --- a/dpctl/.gitignore +++ b/dpctl/.gitignore @@ -1,5 +1,5 @@ *.so -*.cpp +_*.cpp *.cxx *.c *.h diff --git a/dpctl/apis/include/dpctl4pybind11.hpp b/dpctl/apis/include/dpctl4pybind11.hpp index f55ef3ec93..5b40f222ce 100644 --- a/dpctl/apis/include/dpctl4pybind11.hpp +++ b/dpctl/apis/include/dpctl4pybind11.hpp @@ -34,6 +34,295 @@ namespace py = pybind11; +namespace dpctl +{ +namespace detail +{ + +// Lookup a type according to its size, and return a value corresponding to the +// NumPy typenum. +template constexpr int platform_typeid_lookup() +{ + return -1; +} + +template +constexpr int platform_typeid_lookup(int I, Ints... Is) +{ + return sizeof(Concrete) == sizeof(T) + ? I + : platform_typeid_lookup(Is...); +} + +struct dpctl_capi +{ + + // dpctl type objects + PyTypeObject *Py_SyclDeviceType_; + PyTypeObject *PySyclDeviceType_; + PyTypeObject *Py_SyclContextType_; + PyTypeObject *PySyclContextType_; + PyTypeObject *Py_SyclEventType_; + PyTypeObject *PySyclEventType_; + PyTypeObject *Py_SyclQueueType_; + PyTypeObject *PySyclQueueType_; + PyTypeObject *Py_MemoryType_; + PyTypeObject *PyMemoryUSMDeviceType_; + PyTypeObject *PyMemoryUSMSharedType_; + PyTypeObject *PyMemoryUSMHostType_; + PyTypeObject *PyUSMArrayType_; + + DPCTLSyclDeviceRef (*SyclDevice_GetDeviceRef_)(PySyclDeviceObject *); + PySyclDeviceObject *(*SyclDevice_Make_)(DPCTLSyclDeviceRef); + + DPCTLSyclContextRef (*SyclContext_GetContextRef_)(PySyclContextObject *); + PySyclContextObject *(*SyclContext_Make_)(DPCTLSyclContextRef); + + DPCTLSyclEventRef (*SyclEvent_GetEventRef_)(PySyclEventObject *); + PySyclEventObject *(*SyclEvent_Make_)(DPCTLSyclEventRef); + + DPCTLSyclQueueRef (*SyclQueue_GetQueueRef_)(PySyclQueueObject *); + PySyclQueueObject *(*SyclQueue_Make_)(DPCTLSyclQueueRef); + + // memory + DPCTLSyclUSMRef (*Memory_GetUsmPointer_)(Py_MemoryObject *); + DPCTLSyclContextRef (*Memory_GetContextRef_)(Py_MemoryObject *); + DPCTLSyclQueueRef (*Memory_GetQueueRef_)(Py_MemoryObject *); + size_t (*Memory_GetNumBytes_)(Py_MemoryObject *); + PyObject *(*Memory_Make_)(DPCTLSyclUSMRef, + size_t, + DPCTLSyclQueueRef, + PyObject *); + + // tensor + char *(*UsmNDArray_GetData_)(PyUSMArrayObject *); + int (*UsmNDArray_GetNDim_)(PyUSMArrayObject *); + py::ssize_t *(*UsmNDArray_GetShape_)(PyUSMArrayObject *); + py::ssize_t *(*UsmNDArray_GetStrides_)(PyUSMArrayObject *); + int (*UsmNDArray_GetTypenum_)(PyUSMArrayObject *); + int (*UsmNDArray_GetElementSize_)(PyUSMArrayObject *); + int (*UsmNDArray_GetFlags_)(PyUSMArrayObject *); + DPCTLSyclQueueRef (*UsmNDArray_GetQueueRef_)(PyUSMArrayObject *); + py::ssize_t (*UsmNDArray_GetOffset_)(PyUSMArrayObject *); + + int USM_ARRAY_C_CONTIGUOUS_; + int USM_ARRAY_F_CONTIGUOUS_; + int USM_ARRAY_WRITABLE_; + int UAR_BOOL_, UAR_BYTE_, UAR_UBYTE_, UAR_SHORT_, UAR_USHORT_, UAR_INT_, + UAR_UINT_, UAR_LONG_, UAR_ULONG_, UAR_LONGLONG_, UAR_ULONGLONG_, + UAR_FLOAT_, UAR_DOUBLE_, UAR_CFLOAT_, UAR_CDOUBLE_, UAR_TYPE_SENTINEL_, + UAR_HALF_; + int UAR_INT8_, UAR_UINT8_, UAR_INT16_, UAR_UINT16_, UAR_INT32_, UAR_UINT32_, + UAR_INT64_, UAR_UINT64_; + + bool PySyclDevice_Check_(PyObject *obj) const + { + return PyObject_TypeCheck(obj, PySyclDeviceType_) != 0; + } + bool PySyclContext_Check_(PyObject *obj) const + { + return PyObject_TypeCheck(obj, PySyclContextType_) != 0; + } + bool PySyclEvent_Check_(PyObject *obj) const + { + return PyObject_TypeCheck(obj, PySyclEventType_) != 0; + } + bool PySyclQueue_Check_(PyObject *obj) 
const + { + return PyObject_TypeCheck(obj, PySyclQueueType_) != 0; + } + + ~dpctl_capi(){}; + + static auto &get() + { + static dpctl_capi api = lookup(); + return api; + } + + py::object default_sycl_queue_pyobj() + { + return *default_sycl_queue; + } + py::object default_usm_memory_pyobj() + { + return *default_usm_memory; + } + py::object default_usm_ndarray_pyobj() + { + return *default_usm_ndarray; + } + py::object as_usm_memory_pyobj() + { + return *as_usm_memory; + } + +private: + struct Deleter + { + void operator()(py::object *p) const + { + bool guard = (Py_IsInitialized() && !_Py_IsFinalizing()); + + if (guard) { + delete p; + } + } + }; + + std::shared_ptr default_sycl_queue; + std::shared_ptr default_usm_memory; + std::shared_ptr default_usm_ndarray; + std::shared_ptr as_usm_memory; + + dpctl_capi() + : default_sycl_queue{}, default_usm_memory{}, default_usm_ndarray{}, + as_usm_memory{} + { + // Import Cython-generated C-API for dpctl + // This imports python modules and initializes + // static variables such as function pointers for C-API, + // e.g. SyclDevice_GetDeviceRef, etc. + // pointers to Python types, i.e. PySyclDeviceType, etc. + // and exported constants, i.e. USM_ARRAY_C_CONTIGUOUS, etc. + import_dpctl(); + + // Python type objects for classes implemented by dpctl + this->Py_SyclDeviceType_ = &Py_SyclDeviceType; + this->PySyclDeviceType_ = &PySyclDeviceType; + this->Py_SyclContextType_ = &Py_SyclContextType; + this->PySyclContextType_ = &PySyclContextType; + this->Py_SyclEventType_ = &Py_SyclEventType; + this->PySyclEventType_ = &PySyclEventType; + this->Py_SyclQueueType_ = &Py_SyclQueueType; + this->PySyclQueueType_ = &PySyclQueueType; + this->Py_MemoryType_ = &Py_MemoryType; + this->PyMemoryUSMDeviceType_ = &PyMemoryUSMDeviceType; + this->PyMemoryUSMSharedType_ = &PyMemoryUSMSharedType; + this->PyMemoryUSMHostType_ = &PyMemoryUSMHostType; + this->PyUSMArrayType_ = &PyUSMArrayType; + + // SyclDevice API + this->SyclDevice_GetDeviceRef_ = SyclDevice_GetDeviceRef; + this->SyclDevice_Make_ = SyclDevice_Make; + + // SyclContext API + this->SyclContext_GetContextRef_ = SyclContext_GetContextRef; + this->SyclContext_Make_ = SyclContext_Make; + + // SyclEvent API + this->SyclEvent_GetEventRef_ = SyclEvent_GetEventRef; + this->SyclEvent_Make_ = SyclEvent_Make; + + // SyclQueue API + this->SyclQueue_GetQueueRef_ = SyclQueue_GetQueueRef; + this->SyclQueue_Make_ = SyclQueue_Make; + + // dpctl.memory API + this->Memory_GetUsmPointer_ = Memory_GetUsmPointer; + this->Memory_GetContextRef_ = Memory_GetContextRef; + this->Memory_GetQueueRef_ = Memory_GetQueueRef; + this->Memory_GetNumBytes_ = Memory_GetNumBytes; + this->Memory_Make_ = Memory_Make; + + // dpctl.tensor.usm_ndarray API + this->UsmNDArray_GetData_ = UsmNDArray_GetData; + this->UsmNDArray_GetNDim_ = UsmNDArray_GetNDim; + this->UsmNDArray_GetShape_ = UsmNDArray_GetShape; + this->UsmNDArray_GetStrides_ = UsmNDArray_GetStrides; + this->UsmNDArray_GetTypenum_ = UsmNDArray_GetTypenum; + this->UsmNDArray_GetElementSize_ = UsmNDArray_GetElementSize; + this->UsmNDArray_GetFlags_ = UsmNDArray_GetFlags; + this->UsmNDArray_GetQueueRef_ = UsmNDArray_GetQueueRef; + this->UsmNDArray_GetOffset_ = UsmNDArray_GetOffset; + + // constants + this->USM_ARRAY_C_CONTIGUOUS_ = USM_ARRAY_C_CONTIGUOUS; + this->USM_ARRAY_F_CONTIGUOUS_ = USM_ARRAY_F_CONTIGUOUS; + this->USM_ARRAY_WRITABLE_ = USM_ARRAY_WRITABLE; + this->UAR_BOOL_ = UAR_BOOL; + this->UAR_SHORT_ = UAR_SHORT; + this->UAR_USHORT_ = UAR_USHORT; + this->UAR_INT_ = UAR_INT; + 
this->UAR_UINT_ = UAR_UINT; + this->UAR_LONG_ = UAR_LONG; + this->UAR_ULONG_ = UAR_ULONG; + this->UAR_LONGLONG_ = UAR_LONGLONG; + this->UAR_ULONGLONG_ = UAR_ULONGLONG; + this->UAR_FLOAT_ = UAR_FLOAT; + this->UAR_DOUBLE_ = UAR_DOUBLE; + this->UAR_CFLOAT_ = UAR_CFLOAT; + this->UAR_CDOUBLE_ = UAR_CDOUBLE; + this->UAR_TYPE_SENTINEL_ = UAR_TYPE_SENTINEL; + this->UAR_HALF_ = UAR_HALF; + + // deduced disjoint types + this->UAR_INT8_ = UAR_BYTE; + this->UAR_UINT8_ = UAR_UBYTE; + this->UAR_INT16_ = UAR_SHORT; + this->UAR_UINT16_ = UAR_USHORT; + this->UAR_INT32_ = + platform_typeid_lookup( + UAR_LONG, UAR_INT, UAR_SHORT); + this->UAR_UINT32_ = + platform_typeid_lookup(UAR_ULONG, UAR_UINT, + UAR_USHORT); + this->UAR_INT64_ = + platform_typeid_lookup( + UAR_LONG, UAR_LONGLONG, UAR_INT); + this->UAR_UINT64_ = + platform_typeid_lookup( + UAR_ULONG, UAR_ULONGLONG, UAR_UINT); + + // create shared pointers to python objects used in type-casters + // for dpctl::memory::usm_memory and dpctl::tensor::usm_ndarray + sycl::queue q_{}; + PySyclQueueObject *py_q_tmp = + SyclQueue_Make(reinterpret_cast(&q_)); + py::object py_sycl_queue = py::reinterpret_steal( + reinterpret_cast(py_q_tmp)); + + default_sycl_queue = std::shared_ptr( + new py::object(py_sycl_queue), Deleter{}); + + py::module_ mod_memory = py::module_::import("dpctl.memory"); + py::object py_as_usm_memory = mod_memory.attr("as_usm_memory"); + as_usm_memory = std::shared_ptr( + new py::object{py_as_usm_memory}, Deleter{}); + + auto mem_kl = mod_memory.attr("MemoryUSMHost"); + py::object py_default_usm_memory = + mem_kl(1, py::arg("queue") = py_sycl_queue); + default_usm_memory = std::shared_ptr( + new py::object{py_default_usm_memory}, Deleter{}); + + py::module_ mod_usmarray = + py::module_::import("dpctl.tensor._usmarray"); + auto tensor_kl = mod_usmarray.attr("usm_ndarray"); + + py::object py_default_usm_ndarray = + tensor_kl(py::tuple(), py::arg("dtype") = py::str("u1"), + py::arg("buffer") = py_default_usm_memory); + + default_usm_ndarray = std::shared_ptr( + new py::object{py_default_usm_ndarray}, Deleter{}); + } + + dpctl_capi(dpctl_capi const &) = default; + dpctl_capi &operator=(dpctl_capi const &) = default; + + static dpctl_capi lookup() + { + static dpctl_capi api; + return api; + } + +}; // struct dpctl_capi +} // namespace detail +} // namespace dpctl + namespace pybind11 { namespace detail @@ -88,8 +377,9 @@ template <> struct type_caster bool load(handle src, bool) { PyObject *source = src.ptr(); - if (PyObject_TypeCheck(source, &PySyclQueueType)) { - DPCTLSyclQueueRef QRef = SyclQueue_GetQueueRef( + auto &api = ::dpctl::detail::dpctl_capi::get(); + if (api.PySyclQueue_Check_(source)) { + DPCTLSyclQueueRef QRef = api.SyclQueue_GetQueueRef_( reinterpret_cast(source)); value = std::make_unique( *(reinterpret_cast(QRef))); @@ -103,7 +393,9 @@ template <> struct type_caster static handle cast(sycl::queue src, return_value_policy, handle) { - auto tmp = SyclQueue_Make(reinterpret_cast(&src)); + auto &api = ::dpctl::detail::dpctl_capi::get(); + auto tmp = + api.SyclQueue_Make_(reinterpret_cast(&src)); return handle(reinterpret_cast(tmp)); } @@ -120,8 +412,9 @@ template <> struct type_caster bool load(handle src, bool) { PyObject *source = src.ptr(); - if (PyObject_TypeCheck(source, &PySyclDeviceType)) { - DPCTLSyclDeviceRef DRef = SyclDevice_GetDeviceRef( + auto &api = ::dpctl::detail::dpctl_capi::get(); + if (api.PySyclDevice_Check_(source)) { + DPCTLSyclDeviceRef DRef = api.SyclDevice_GetDeviceRef_( reinterpret_cast(source)); value = 
std::make_unique( *(reinterpret_cast(DRef))); @@ -135,7 +428,9 @@ template <> struct type_caster static handle cast(sycl::device src, return_value_policy, handle) { - auto tmp = SyclDevice_Make(reinterpret_cast(&src)); + auto &api = ::dpctl::detail::dpctl_capi::get(); + auto tmp = + api.SyclDevice_Make_(reinterpret_cast(&src)); return handle(reinterpret_cast(tmp)); } @@ -152,8 +447,9 @@ template <> struct type_caster bool load(handle src, bool) { PyObject *source = src.ptr(); - if (PyObject_TypeCheck(source, &PySyclContextType)) { - DPCTLSyclContextRef CRef = SyclContext_GetContextRef( + auto &api = ::dpctl::detail::dpctl_capi::get(); + if (api.PySyclContext_Check_(source)) { + DPCTLSyclContextRef CRef = api.SyclContext_GetContextRef_( reinterpret_cast(source)); value = std::make_unique( *(reinterpret_cast(CRef))); @@ -167,8 +463,9 @@ template <> struct type_caster static handle cast(sycl::context src, return_value_policy, handle) { + auto &api = ::dpctl::detail::dpctl_capi::get(); auto tmp = - SyclContext_Make(reinterpret_cast(&src)); + api.SyclContext_Make_(reinterpret_cast(&src)); return handle(reinterpret_cast(tmp)); } @@ -185,8 +482,9 @@ template <> struct type_caster bool load(handle src, bool) { PyObject *source = src.ptr(); - if (PyObject_TypeCheck(source, &PySyclEventType)) { - DPCTLSyclEventRef ERef = SyclEvent_GetEventRef( + auto &api = ::dpctl::detail::dpctl_capi::get(); + if (api.PySyclEvent_Check_(source)) { + DPCTLSyclEventRef ERef = api.SyclEvent_GetEventRef_( reinterpret_cast(source)); value = std::make_unique( *(reinterpret_cast(ERef))); @@ -200,7 +498,9 @@ template <> struct type_caster static handle cast(sycl::event src, return_value_policy, handle) { - auto tmp = SyclEvent_Make(reinterpret_cast(&src)); + auto &api = ::dpctl::detail::dpctl_capi::get(); + auto tmp = + api.SyclEvent_Make_(reinterpret_cast(&src)); return handle(reinterpret_cast(tmp)); } @@ -212,93 +512,6 @@ template <> struct type_caster namespace dpctl { -namespace detail -{ - -struct dpctl_api -{ -public: - static dpctl_api &get() - { - static dpctl_api api; - return api; - } - - py::object sycl_queue_() - { - return *sycl_queue; - } - py::object default_usm_memory_() - { - return *default_usm_memory; - } - py::object default_usm_ndarray_() - { - return *default_usm_ndarray; - } - py::object as_usm_memory_() - { - return *as_usm_memory; - } - -private: - struct Deleter - { - void operator()(py::object *p) const - { - bool guard = (Py_IsInitialized() && !_Py_IsFinalizing()); - - if (guard) { - delete p; - } - } - }; - - std::shared_ptr sycl_queue; - std::shared_ptr default_usm_memory; - std::shared_ptr default_usm_ndarray; - std::shared_ptr as_usm_memory; - - dpctl_api() : sycl_queue{}, default_usm_memory{}, default_usm_ndarray{} - { - import_dpctl(); - - sycl::queue q_; - py::object py_sycl_queue = py::cast(q_); - sycl_queue = std::shared_ptr(new py::object{py_sycl_queue}, - Deleter{}); - - py::module_ mod_memory = py::module_::import("dpctl.memory"); - py::object py_as_usm_memory = mod_memory.attr("as_usm_memory"); - as_usm_memory = std::shared_ptr( - new py::object{py_as_usm_memory}, Deleter{}); - - auto mem_kl = mod_memory.attr("MemoryUSMHost"); - py::object py_default_usm_memory = - mem_kl(1, py::arg("queue") = py_sycl_queue); - default_usm_memory = std::shared_ptr( - new py::object{py_default_usm_memory}, Deleter{}); - - py::module_ mod_usmarray = - py::module_::import("dpctl.tensor._usmarray"); - auto tensor_kl = mod_usmarray.attr("usm_ndarray"); - - py::object py_default_usm_ndarray = - 
tensor_kl(py::tuple(), py::arg("dtype") = py::str("u1"), - py::arg("buffer") = py_default_usm_memory); - - default_usm_ndarray = std::shared_ptr( - new py::object{py_default_usm_ndarray}, Deleter{}); - } - -public: - dpctl_api(dpctl_api const &) = delete; - void operator=(dpctl_api const &) = delete; - ~dpctl_api(){}; -}; - -} // namespace detail - namespace memory { @@ -313,13 +526,16 @@ class usm_memory : public py::object usm_memory, py::object, [](PyObject *o) -> bool { - return PyObject_TypeCheck(o, &Py_MemoryType) != 0; + return PyObject_TypeCheck( + o, ::dpctl::detail::dpctl_capi::get().Py_MemoryType_) != + 0; }, [](PyObject *o) -> PyObject * { return as_usm_memory(o); }) usm_memory() - : py::object(::dpctl::detail::dpctl_api::get().default_usm_memory_(), - borrowed_t{}) + : py::object( + ::dpctl::detail::dpctl_capi::get().default_usm_memory_pyobj(), + borrowed_t{}) { if (!m_ptr) throw py::error_already_set(); @@ -328,7 +544,8 @@ class usm_memory : public py::object sycl::queue get_queue() const { Py_MemoryObject *mem_obj = reinterpret_cast(m_ptr); - DPCTLSyclQueueRef QRef = Memory_GetQueueRef(mem_obj); + auto &api = ::dpctl::detail::dpctl_capi::get(); + DPCTLSyclQueueRef QRef = api.Memory_GetQueueRef_(mem_obj); sycl::queue *obj_q = reinterpret_cast(QRef); return *obj_q; } @@ -336,14 +553,16 @@ class usm_memory : public py::object char *get_pointer() const { Py_MemoryObject *mem_obj = reinterpret_cast(m_ptr); - DPCTLSyclUSMRef MRef = Memory_GetUsmPointer(mem_obj); + auto &api = ::dpctl::detail::dpctl_capi::get(); + DPCTLSyclUSMRef MRef = api.Memory_GetUsmPointer_(mem_obj); return reinterpret_cast(MRef); } size_t get_nbytes() const { + auto &api = ::dpctl::detail::dpctl_capi::get(); Py_MemoryObject *mem_obj = reinterpret_cast(m_ptr); - return Memory_GetNumBytes(mem_obj); + return api.Memory_GetNumBytes_(mem_obj); } protected: @@ -355,7 +574,8 @@ class usm_memory : public py::object return nullptr; } - auto convertor = ::dpctl::detail::dpctl_api::get().as_usm_memory_(); + auto convertor = + ::dpctl::detail::dpctl_capi::get().as_usm_memory_pyobj(); py::object res; try { @@ -371,16 +591,69 @@ class usm_memory : public py::object namespace tensor { + +inline std::vector +c_contiguous_strides(int nd, + const py::ssize_t *shape, + py::ssize_t element_size = 1) +{ + if (nd > 0) { + std::vector c_strides(nd, element_size); + for (int ic = nd - 1; ic > 0;) { + py::ssize_t next_v = c_strides[ic] * shape[ic]; + c_strides[--ic] = next_v; + } + return c_strides; + } + else { + return std::vector(); + } +} + +inline std::vector +f_contiguous_strides(int nd, + const py::ssize_t *shape, + py::ssize_t element_size = 1) +{ + if (nd > 0) { + std::vector f_strides(nd, element_size); + for (int i = 0; i < nd - 1;) { + py::ssize_t next_v = f_strides[i] * shape[i]; + f_strides[++i] = next_v; + } + return f_strides; + } + else { + return std::vector(); + } +} + +inline std::vector +c_contiguous_strides(const std::vector &shape, + py::ssize_t element_size = 1) +{ + return c_contiguous_strides(shape.size(), shape.data(), element_size); +} + +inline std::vector +f_contiguous_strides(const std::vector &shape, + py::ssize_t element_size = 1) +{ + return f_contiguous_strides(shape.size(), shape.data(), element_size); +} + class usm_ndarray : public py::object { public: PYBIND11_OBJECT(usm_ndarray, py::object, [](PyObject *o) -> bool { - return PyObject_TypeCheck(o, &PyUSMArrayType) != 0; + return PyObject_TypeCheck( + o, ::dpctl::detail::dpctl_capi::get().PyUSMArrayType_) != 0; }) usm_ndarray() - : 
py::object(::dpctl::detail::dpctl_api::get().default_usm_ndarray_(), - borrowed_t{}) + : py::object( + ::dpctl::detail::dpctl_capi::get().default_usm_ndarray_pyobj(), + borrowed_t{}) { if (!m_ptr) throw py::error_already_set(); @@ -388,10 +661,10 @@ class usm_ndarray : public py::object char *get_data() const { - PyObject *raw_o = this->ptr(); - PyUSMArrayObject *raw_ar = reinterpret_cast(raw_o); + PyUSMArrayObject *raw_ar = this->usm_array_ptr(); - return UsmNDArray_GetData(raw_ar); + auto &api = ::dpctl::detail::dpctl_capi::get(); + return api.UsmNDArray_GetData_(raw_ar); } template T *get_data() const @@ -401,18 +674,18 @@ class usm_ndarray : public py::object int get_ndim() const { - PyObject *raw_o = this->ptr(); - PyUSMArrayObject *raw_ar = reinterpret_cast(raw_o); + PyUSMArrayObject *raw_ar = this->usm_array_ptr(); - return UsmNDArray_GetNDim(raw_ar); + auto &api = ::dpctl::detail::dpctl_capi::get(); + return api.UsmNDArray_GetNDim_(raw_ar); } const py::ssize_t *get_shape_raw() const { - PyObject *raw_o = this->ptr(); - PyUSMArrayObject *raw_ar = reinterpret_cast(raw_o); + PyUSMArrayObject *raw_ar = this->usm_array_ptr(); - return UsmNDArray_GetShape(raw_ar); + auto &api = ::dpctl::detail::dpctl_capi::get(); + return api.UsmNDArray_GetShape_(raw_ar); } py::ssize_t get_shape(int i) const @@ -423,19 +696,19 @@ class usm_ndarray : public py::object const py::ssize_t *get_strides_raw() const { - PyObject *raw_o = this->ptr(); - PyUSMArrayObject *raw_ar = reinterpret_cast(raw_o); + PyUSMArrayObject *raw_ar = this->usm_array_ptr(); - return UsmNDArray_GetStrides(raw_ar); + auto &api = ::dpctl::detail::dpctl_capi::get(); + return api.UsmNDArray_GetStrides_(raw_ar); } py::ssize_t get_size() const { - PyObject *raw_o = this->ptr(); - PyUSMArrayObject *raw_ar = reinterpret_cast(raw_o); + PyUSMArrayObject *raw_ar = this->usm_array_ptr(); - int ndim = UsmNDArray_GetNDim(raw_ar); - const py::ssize_t *shape = UsmNDArray_GetShape(raw_ar); + auto &api = ::dpctl::detail::dpctl_capi::get(); + int ndim = api.UsmNDArray_GetNDim_(raw_ar); + const py::ssize_t *shape = api.UsmNDArray_GetShape_(raw_ar); py::ssize_t nelems = 1; for (int i = 0; i < ndim; ++i) { @@ -448,12 +721,12 @@ class usm_ndarray : public py::object std::pair get_minmax_offsets() const { - PyObject *raw_o = this->ptr(); - PyUSMArrayObject *raw_ar = reinterpret_cast(raw_o); + PyUSMArrayObject *raw_ar = this->usm_array_ptr(); - int nd = UsmNDArray_GetNDim(raw_ar); - const py::ssize_t *shape = UsmNDArray_GetShape(raw_ar); - const py::ssize_t *strides = UsmNDArray_GetStrides(raw_ar); + auto &api = ::dpctl::detail::dpctl_capi::get(); + int nd = api.UsmNDArray_GetNDim_(raw_ar); + const py::ssize_t *shape = api.UsmNDArray_GetShape_(raw_ar); + const py::ssize_t *strides = api.UsmNDArray_GetStrides_(raw_ar); py::ssize_t offset_min = 0; py::ssize_t offset_max = 0; @@ -465,7 +738,7 @@ class usm_ndarray : public py::object } } else { - offset_min = UsmNDArray_GetOffset(raw_ar); + offset_min = api.UsmNDArray_GetOffset_(raw_ar); offset_max = offset_min; for (int i = 0; i < nd; ++i) { py::ssize_t delta = strides[i] * (shape[i] - 1); @@ -482,35 +755,62 @@ class usm_ndarray : public py::object sycl::queue get_queue() const { - PyObject *raw_o = this->ptr(); - PyUSMArrayObject *raw_ar = reinterpret_cast(raw_o); + PyUSMArrayObject *raw_ar = this->usm_array_ptr(); - DPCTLSyclQueueRef QRef = UsmNDArray_GetQueueRef(raw_ar); + auto &api = ::dpctl::detail::dpctl_capi::get(); + DPCTLSyclQueueRef QRef = api.UsmNDArray_GetQueueRef_(raw_ar); return 
*(reinterpret_cast(QRef)); } int get_typenum() const { - PyObject *raw_o = this->ptr(); - PyUSMArrayObject *raw_ar = reinterpret_cast(raw_o); + PyUSMArrayObject *raw_ar = this->usm_array_ptr(); - return UsmNDArray_GetTypenum(raw_ar); + auto &api = ::dpctl::detail::dpctl_capi::get(); + return api.UsmNDArray_GetTypenum_(raw_ar); } int get_flags() const { - PyObject *raw_o = this->ptr(); - PyUSMArrayObject *raw_ar = reinterpret_cast(raw_o); + PyUSMArrayObject *raw_ar = this->usm_array_ptr(); - return UsmNDArray_GetFlags(raw_ar); + auto &api = ::dpctl::detail::dpctl_capi::get(); + return api.UsmNDArray_GetFlags_(raw_ar); } int get_elemsize() const { - PyObject *raw_o = this->ptr(); - PyUSMArrayObject *raw_ar = reinterpret_cast(raw_o); + PyUSMArrayObject *raw_ar = this->usm_array_ptr(); - return UsmNDArray_GetElementSize(raw_ar); + auto &api = ::dpctl::detail::dpctl_capi::get(); + return api.UsmNDArray_GetElementSize_(raw_ar); + } + + bool is_c_contiguous() const + { + int flags = this->get_flags(); + auto &api = ::dpctl::detail::dpctl_capi::get(); + return static_cast(flags & api.USM_ARRAY_C_CONTIGUOUS_); + } + + bool is_f_contiguous() const + { + int flags = this->get_flags(); + auto &api = ::dpctl::detail::dpctl_capi::get(); + return static_cast(flags & api.USM_ARRAY_F_CONTIGUOUS_); + } + + bool is_writable() const + { + int flags = this->get_flags(); + auto &api = ::dpctl::detail::dpctl_capi::get(); + return static_cast(flags & api.USM_ARRAY_WRITABLE_); + } + +private: + PyUSMArrayObject *usm_array_ptr() const + { + return reinterpret_cast(m_ptr); } }; diff --git a/dpctl/apis/include/dpctl_capi.h b/dpctl/apis/include/dpctl_capi.h index af4a5fd951..d6c104581a 100644 --- a/dpctl/apis/include/dpctl_capi.h +++ b/dpctl/apis/include/dpctl_capi.h @@ -47,8 +47,11 @@ * C functions can use dpctl's C-API functions without linking to * shared objects defining this symbols, if they call `import_dpctl()` * prior to using those symbols. 
+ * + * It is declared inline to allow multiple definitions in + * different translation units */ -void import_dpctl(void) +static inline void import_dpctl(void) { import_dpctl___sycl_device(); import_dpctl___sycl_context(); diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index 23a99304d8..a587d84609 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -18,12 +18,22 @@ add_custom_target(_dpctl4pybind11_deps set(python_module_name _tensor_impl) pybind11_add_module(${python_module_name} MODULE ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_py.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp ) target_link_options(${python_module_name} PRIVATE -fsycl-device-code-split=per_kernel) target_include_directories(${python_module_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../include ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/ ) add_dependencies(${python_module_name} _dpctl4pybind11_deps) install(TARGETS ${python_module_name} DESTINATION "dpctl/tensor") diff --git a/dpctl/tensor/libtensor/include/kernels/constructors.hpp b/dpctl/tensor/libtensor/include/kernels/constructors.hpp new file mode 100644 index 0000000000..4023d291af --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/constructors.hpp @@ -0,0 +1,657 @@ +//=== constructors.hpp - -----------------------------------*-C++-*--/===// +//=== Implementation of tensor constructors kernels ------===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor constructors. +//===----------------------------------------------------------------------===// + +#pragma once +#include "utils/strided_iters.hpp" +#include "utils/type_utils.hpp" +#include +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace constructors +{ + +/*! + @defgroup CtorKernels + */ + +template class linear_sequence_step_kernel; +template class linear_sequence_affine_kernel; +template class eye_kernel; + +namespace py = pybind11; + +/* =========== Unboxing Python scalar =============== */ + +/*! + * @brief Cast pybind11 class managing Python object to specified type `T`. 
+ * @defgroup CtorKernels
+ */
+template <typename T> T unbox_py_scalar(py::object o)
+{
+    return py::cast<T>(o);
+}
+
+template <> inline sycl::half unbox_py_scalar<sycl::half>(py::object o)
+{
+    float tmp = py::cast<float>(o);
+    return static_cast<sycl::half>(tmp);
+}
+
+// Constructor to populate tensor with linear sequence defined by
+// start and step data
+
+typedef sycl::event (*lin_space_step_fn_ptr_t)(
+    sycl::queue,
+    size_t, // num_elements
+    py::object start,
+    py::object step,
+    char *, // dst_data_ptr
+    const std::vector<sycl::event> &);
+
+template <typename Ty> class LinearSequenceStepFunctor
+{
+private:
+    Ty *p = nullptr;
+    Ty start_v;
+    Ty step_v;
+
+public:
+    LinearSequenceStepFunctor(char *dst_p, Ty v0, Ty dv)
+        : p(reinterpret_cast<Ty *>(dst_p)), start_v(v0), step_v(dv)
+    {
+    }
+
+    void operator()(sycl::id<1> wiid) const
+    {
+        auto i = wiid.get(0);
+        using dpctl::tensor::type_utils::is_complex;
+        if constexpr (is_complex<Ty>::value) {
+            p[i] = Ty{start_v.real() + i * step_v.real(),
+                      start_v.imag() + i * step_v.imag()};
+        }
+        else {
+            p[i] = start_v + i * step_v;
+        }
+    }
+};
+
+/*!
+ * @brief Function to submit kernel to populate given contiguous memory
+ * allocation with linear sequence specified by typed starting value and
+ * increment.
+ *
+ * @param q Sycl queue to which the kernel is submitted
+ * @param nelems Length of the sequence
+ * @param start_v Typed starting value of the sequence
+ * @param step_v Typed increment of the sequence
+ * @param array_data Kernel accessible USM pointer to the start of array to be
+ * populated.
+ * @param depends List of events to wait for before starting computations, if
+ * any.
+ *
+ * @return Event to wait on to ensure that computation completes.
+ * @defgroup CtorKernels
+ */
+template <typename Ty>
+sycl::event lin_space_step_impl(sycl::queue exec_q,
+                                size_t nelems,
+                                Ty start_v,
+                                Ty step_v,
+                                char *array_data,
+                                const std::vector<sycl::event> &depends)
+{
+    sycl::event lin_space_step_event = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+        cgh.parallel_for<linear_sequence_step_kernel<Ty>>(
+            sycl::range<1>{nelems},
+            LinearSequenceStepFunctor<Ty>(array_data, start_v, step_v));
+    });
+
+    return lin_space_step_event;
+}
+
+/*!
+ * @brief Function to submit kernel to populate given contiguous memory
+ * allocation with linear sequence specified by starting value and increment
+ * given as Python objects.
+ *
+ * @param q Sycl queue to which the kernel is submitted
+ * @param nelems Length of the sequence
+ * @param start Starting value of the sequence as Python object. Must be
+ * convertible to array element data type `Ty`.
+ * @param step Increment of the sequence as Python object. Must be convertible
+ * to array element data type `Ty`.
+ * @param array_data Kernel accessible USM pointer to the start of array to be
+ * populated.
+ * @param depends List of events to wait for before starting computations, if
+ * any.
+ *
+ * @return Event to wait on to ensure that computation completes.
+ * @defgroup CtorKernels
+ */
+template <typename Ty>
+sycl::event lin_space_step_impl(sycl::queue exec_q,
+                                size_t nelems,
+                                py::object start,
+                                py::object step,
+                                char *array_data,
+                                const std::vector<sycl::event> &depends)
+{
+    Ty start_v;
+    Ty step_v;
+    try {
+        start_v = unbox_py_scalar<Ty>(start);
+        step_v = unbox_py_scalar<Ty>(step);
+    } catch (const py::error_already_set &e) {
+        throw;
+    }
+
+    auto lin_space_step_event = lin_space_step_impl<Ty>(
+        exec_q, nelems, start_v, step_v, array_data, depends);
+
+    return lin_space_step_event;
+}
+
+/*!
+ * @brief Factory to get function pointer of type `fnT` for array with elements
+ * of type `Ty`.
+ * @defgroup CtorKernels
+ */
+template <typename fnT, typename Ty> struct LinSpaceStepFactory
+{
+    fnT get()
+    {
+        fnT f = lin_space_step_impl<Ty>;
+        return f;
+    }
+};
+
+// Constructor to populate tensor with linear sequence defined by
+// start and end data
+
+typedef sycl::event (*lin_space_affine_fn_ptr_t)(
+    sycl::queue,
+    size_t, // num_elements
+    py::object start,
+    py::object end,
+    bool include_endpoint,
+    char *, // dst_data_ptr
+    const std::vector<sycl::event> &);
+
+template <typename Ty, typename wTy> class LinearSequenceAffineFunctor
+{
+private:
+    Ty *p = nullptr;
+    Ty start_v;
+    Ty end_v;
+    size_t n;
+
+public:
+    LinearSequenceAffineFunctor(char *dst_p, Ty v0, Ty v1, size_t den)
+        : p(reinterpret_cast<Ty *>(dst_p)), start_v(v0), end_v(v1),
+          n((den == 0) ? 1 : den)
+    {
+    }
+
+    void operator()(sycl::id<1> wiid) const
+    {
+        auto i = wiid.get(0);
+        wTy wc = wTy(i) / n;
+        wTy w = wTy(n - i) / n;
+        using dpctl::tensor::type_utils::is_complex;
+        if constexpr (is_complex<Ty>::value) {
+            auto _w = static_cast<typename Ty::value_type>(w);
+            auto _wc = static_cast<typename Ty::value_type>(wc);
+            auto re_comb = start_v.real() * _w + end_v.real() * _wc;
+            auto im_comb = start_v.imag() * _w + end_v.imag() * _wc;
+            Ty affine_comb = Ty{re_comb, im_comb};
+            p[i] = affine_comb;
+        }
+        else {
+            using dpctl::tensor::type_utils::convert_impl;
+            auto affine_comb = start_v * w + end_v * wc;
+            p[i] = convert_impl<Ty, decltype(affine_comb)>(affine_comb);
+        }
+    }
+};
+
+/*!
+ * @brief Function to submit kernel to populate given contiguous memory
+ * allocation with linear sequence specified by typed starting and end values.
+ *
+ * @param exec_q Sycl queue to which kernel is submitted for execution.
+ * @param nelems Length of the sequence.
+ * @param start_v Starting value of the sequence.
+ * @param end_v End-value of the sequence.
+ * @param include_endpoint Whether the end-value is included in the sequence.
+ * @param array_data Kernel accessible USM pointer to the start of array to be
+ * populated.
+ * @param depends List of events to wait for before starting computations, if
+ * any.
+ *
+ * @return Event to wait on to ensure that computation completes.
+ * @defgroup CtorKernels
+ */
+template <typename Ty>
+sycl::event lin_space_affine_impl(sycl::queue exec_q,
+                                  size_t nelems,
+                                  Ty start_v,
+                                  Ty end_v,
+                                  bool include_endpoint,
+                                  char *array_data,
+                                  const std::vector<sycl::event> &depends)
+{
+    bool device_supports_doubles = exec_q.get_device().has(sycl::aspect::fp64);
+    sycl::event lin_space_affine_event = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+        if (device_supports_doubles) {
+            cgh.parallel_for<linear_sequence_affine_kernel<Ty, double>>(
+                sycl::range<1>{nelems},
+                LinearSequenceAffineFunctor<Ty, double>(
+                    array_data, start_v, end_v,
+                    (include_endpoint) ? nelems - 1 : nelems));
+        }
+        else {
+            cgh.parallel_for<linear_sequence_affine_kernel<Ty, float>>(
+                sycl::range<1>{nelems},
+                LinearSequenceAffineFunctor<Ty, float>(
+                    array_data, start_v, end_v,
+                    (include_endpoint) ? nelems - 1 : nelems));
+        }
+    });
+
+    return lin_space_affine_event;
+}
+
+/*!
+ * @brief Function to submit kernel to populate given contiguous memory
+ * allocation with linear sequence specified by starting and end values given
+ * as Python objects.
+ *
+ * @param exec_q Sycl queue to which kernel is submitted for execution.
+ * @param nelems Length of the sequence
+ * @param start Starting value of the sequence as Python object. Must be
+ * convertible to array data element type `Ty`.
+ * @param end End-value of the sequence as Python object. Must be convertible
+ * to array data element type `Ty`.
+ * @param include_endpoint Whether the end-value is included in the sequence
+ * @param array_data Kernel accessible USM pointer to the start of array to be
+ * populated.
+ * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event lin_space_affine_impl(sycl::queue exec_q, + size_t nelems, + py::object start, + py::object end, + bool include_endpoint, + char *array_data, + const std::vector &depends) +{ + Ty start_v, end_v; + try { + start_v = unbox_py_scalar(start); + end_v = unbox_py_scalar(end); + } catch (const py::error_already_set &e) { + throw; + } + + auto lin_space_affine_event = lin_space_affine_impl( + exec_q, nelems, start_v, end_v, include_endpoint, array_data, depends); + + return lin_space_affine_event; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for array data type + * `Ty`. + */ +template struct LinSpaceAffineFactory +{ + fnT get() + { + fnT f = lin_space_affine_impl; + return f; + } +}; + +/* ================ Full ================== */ + +typedef sycl::event (*full_contig_fn_ptr_t)(sycl::queue, + size_t, + py::object, + char *, + const std::vector &); + +/*! + * @brief Function to submit kernel to fill given contiguous memory allocation + * with specified value. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence + * @param fill_v Value to fill the array with + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event full_contig_impl(sycl::queue q, + size_t nelems, + dstTy fill_v, + char *dst_p, + const std::vector &depends) +{ + sycl::event fill_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + dstTy *p = reinterpret_cast(dst_p); + cgh.fill(p, fill_v, nelems); + }); + + return fill_ev; +} + +/*! + * @brief Function to submit kernel to fill given contiguous memory allocation + * with specified value. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence + * @param py_value Python object representing the value to fill the array with. + * Must be convertible to `dstTy`. + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @defgroup CtorKernels + */ +template +sycl::event full_contig_impl(sycl::queue exec_q, + size_t nelems, + py::object py_value, + char *dst_p, + const std::vector &depends) +{ + dstTy fill_v; + try { + fill_v = unbox_py_scalar(py_value); + } catch (const py::error_already_set &e) { + throw; + } + + sycl::event fill_ev = + full_contig_impl(exec_q, nelems, fill_v, dst_p, depends); + + return fill_ev; +} + +template struct FullContigFactory +{ + fnT get() + { + fnT f = full_contig_impl; + return f; + } +}; + +/* ================ Eye ================== */ + +typedef sycl::event (*eye_fn_ptr_t)(sycl::queue, + size_t nelems, // num_elements + py::ssize_t start, + py::ssize_t end, + py::ssize_t step, + char *, // dst_data_ptr + const std::vector &); + +template class EyeFunctor +{ +private: + Ty *p = nullptr; + py::ssize_t start_v; + py::ssize_t end_v; + py::ssize_t step_v; + +public: + EyeFunctor(char *dst_p, + const py::ssize_t v0, + const py::ssize_t v1, + const py::ssize_t dv) + : p(reinterpret_cast(dst_p)), start_v(v0), end_v(v1), step_v(dv) + { + } + + void operator()(sycl::id<1> wiid) const + { + Ty set_v = 0; + py::ssize_t i = static_cast(wiid.get(0)); + if (i >= start_v and i <= end_v) { + if ((i - start_v) % step_v == 0) { + set_v = 1; + } + } + p[i] = set_v; + } +}; + +/*! + * @brief Function to populate 2D array with eye matrix. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Number of elements to assign. + * @param start Position of the first non-zero value. + * @param end Position of the last non-zero value. + * @param step Number of array elements between non-zeros. + * @param array_data Kernel accessible USM pointer for the destination array. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event eye_impl(sycl::queue exec_q, + size_t nelems, + const py::ssize_t start, + const py::ssize_t end, + const py::ssize_t step, + char *array_data, + const std::vector &depends) +{ + sycl::event eye_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.parallel_for>( + sycl::range<1>{nelems}, + EyeFunctor(array_data, start, end, step)); + }); + + return eye_event; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for data type `Ty`. + * @ingroup CtorKernels + */ +template struct EyeFactory +{ + fnT get() + { + fnT f = eye_impl; + return f; + } +}; + +/* =========================== Tril and triu ============================== */ + +// define function type +typedef sycl::event (*tri_fn_ptr_t)(sycl::queue, + py::ssize_t, // inner_range //py::ssize_t + py::ssize_t, // outer_range + char *, // src_data_ptr + char *, // dst_data_ptr + py::ssize_t, // nd + py::ssize_t *, // shape_and_strides + py::ssize_t, // k + const std::vector &, + const std::vector &); + +/*! + * @brief Function to copy triangular matrices from source stack to destination + * stack. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param inner_range Number of elements in each matrix. + * @param outer_range Number of matrices to copy. + * @param src_p Kernel accessible USM pointer for the source array. + * @param dst_p Kernel accessible USM pointer for the destination array. + * @param nd The array dimensionality of source and destination arrays. + * @param shape_and_strides Kernel accessible USM pointer to packed shape and + * strides of arrays. 
+ * @param k Position of the diagonal above/below which to copy filling the rest + * with zero elements. + * @param depends List of events to wait for before starting computations, if + * any. + * @param additional_depends List of additional events to wait for before + * starting computations, if any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template class tri_kernel; +template +sycl::event tri_impl(sycl::queue exec_q, + py::ssize_t inner_range, + py::ssize_t outer_range, + char *src_p, + char *dst_p, + py::ssize_t nd, + py::ssize_t *shape_and_strides, + py::ssize_t k, + const std::vector &depends, + const std::vector &additional_depends) +{ + constexpr int d2 = 2; + py::ssize_t src_s = nd; + py::ssize_t dst_s = 2 * nd; + py::ssize_t nd_1 = nd - 1; + py::ssize_t nd_2 = nd - 2; + Ty *src = reinterpret_cast(src_p); + Ty *dst = reinterpret_cast(dst_p); + + sycl::event tri_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + cgh.parallel_for>( + sycl::range<1>(inner_range * outer_range), [=](sycl::id<1> idx) { + py::ssize_t outer_gid = idx[0] / inner_range; + py::ssize_t inner_gid = idx[0] - inner_range * outer_gid; + + py::ssize_t src_inner_offset, dst_inner_offset; + bool to_copy; + + { + // py::ssize_t inner_gid = idx.get_id(0); + CIndexer_array indexer_i( + {shape_and_strides[nd_2], shape_and_strides[nd_1]}); + indexer_i.set(inner_gid); + const std::array &inner = indexer_i.get(); + src_inner_offset = + inner[0] * shape_and_strides[src_s + nd_2] + + inner[1] * shape_and_strides[src_s + nd_1]; + dst_inner_offset = + inner[0] * shape_and_strides[dst_s + nd_2] + + inner[1] * shape_and_strides[dst_s + nd_1]; + + if constexpr (upper) + to_copy = (inner[0] + k >= inner[1]); + else + to_copy = (inner[0] + k <= inner[1]); + } + + py::ssize_t src_offset = 0; + py::ssize_t dst_offset = 0; + { + // py::ssize_t outer_gid = idx.get_id(1); + CIndexer_vector outer(nd - d2); + outer.get_displacement( + outer_gid, shape_and_strides, shape_and_strides + src_s, + shape_and_strides + dst_s, src_offset, dst_offset); + } + + src_offset += src_inner_offset; + dst_offset += dst_inner_offset; + + dst[dst_offset] = (to_copy) ? src[src_offset] : Ty(0); + }); + }); + return tri_ev; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for data type `Ty`. + * @ingroup CtorKernels + */ +template struct TrilGenericFactory +{ + fnT get() + { + fnT f = tri_impl; + return f; + } +}; + +/*! + * @brief Factory to get function pointer of type `fnT` for data type `Ty`. + * @ingroup CtorKernels + */ +template struct TriuGenericFactory +{ + fnT get() + { + fnT f = tri_impl; + return f; + } +}; + +} // namespace constructors +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp new file mode 100644 index 0000000000..0d5a1d21ca --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp @@ -0,0 +1,669 @@ +//=== copy_and_cast.hpp - Implementation of copy-and-cast kernels *-C++-*/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor copying and value casting. +//===----------------------------------------------------------------------===// + +#pragma once +#include "utils/strided_iters.hpp" +#include "utils/type_utils.hpp" +#include +#include +#include +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace copy_and_cast +{ + +namespace py = pybind11; + +template class copy_cast_generic_kernel; +template class copy_cast_from_host_kernel; +template class copy_cast_spec_kernel; +template class copy_for_reshape_generic_kernel; + +template class Caster +{ +public: + Caster() = default; + void operator()(char *src, + std::ptrdiff_t src_offset, + char *dst, + std::ptrdiff_t dst_offset) const + { + using dpctl::tensor::type_utils::convert_impl; + + srcT *src_ = reinterpret_cast(src) + src_offset; + dstT *dst_ = reinterpret_cast(dst) + dst_offset; + *dst_ = convert_impl(*src_); + } +}; + +template class GenericCopyFunctor +{ +private: + char *src_ = nullptr; + char *dst_ = nullptr; + py::ssize_t *shape_strides_ = nullptr; + int nd_ = 0; + py::ssize_t src_offset0 = 0; + py::ssize_t dst_offset0 = 0; + +public: + GenericCopyFunctor(char *src_cp, + char *dst_cp, + py::ssize_t *shape_strides, + int nd, + py::ssize_t src_offset, + py::ssize_t dst_offset) + : src_(src_cp), dst_(dst_cp), shape_strides_(shape_strides), nd_(nd), + src_offset0(src_offset), dst_offset0(dst_offset) + { + } + + void operator()(sycl::id<1> wiid) const + { + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + CIndexer_vector indxr(nd_); + indxr.get_displacement( + static_cast(wiid.get(0)), + const_cast(shape_strides_), // common shape + const_cast(shape_strides_ + + nd_), // src strides + const_cast(shape_strides_ + + 2 * nd_), // dst strides + src_offset, // modified by reference + dst_offset // modified by reference + ); + CastFnT fn{}; + fn(src_, src_offset0 + src_offset, dst_, dst_offset0 + dst_offset); + } +}; + +template class NDSpecializedCopyFunctor +{ +private: + char *src_ = nullptr; + char *dst_ = nullptr; + CIndexer_array indxr; + const std::array src_strides_; + const std::array dst_strides_; + static const int nd_ = nd; + py::ssize_t src_offset0 = 0; + py::ssize_t dst_offset0 = 0; + +public: + NDSpecializedCopyFunctor(char *src_cp, // USM pointer + char *dst_cp, // USM pointer + const std::array shape, + const std::array src_strides, + const std::array dst_strides, + py::ssize_t src_offset, + py::ssize_t dst_offset) + : src_(src_cp), dst_(dst_cp), indxr(shape), src_strides_(src_strides), + dst_strides_(dst_strides), src_offset0(src_offset), + dst_offset0(dst_offset) + { + } + + void operator()(sycl::id<1> wiid) const + { + py::ssize_t src_offset = 0; + py::ssize_t dst_offset = 0; + CIndexer_array local_indxr(std::move(indxr)); + + local_indxr.set(wiid.get(0)); + auto mi = local_indxr.get(); + for (int i = 0; i < nd; ++i) + src_offset += mi[i] * src_strides_[i]; + for (int i = 0; i < nd; ++i) + dst_offset += mi[i] * dst_strides_[i]; + + 
CastFnT fn{};
+        fn(src_, src_offset0 + src_offset, dst_, dst_offset0 + dst_offset);
+    }
+};
+
+/*!
+  @defgroup CopyAndCastKernels
+ */
+
+/*!
+ * @brief Function pointer type for generic array cast and copying function.
+ */
+typedef sycl::event (*copy_and_cast_generic_fn_ptr_t)(
+    sycl::queue,
+    size_t,
+    int,
+    py::ssize_t *,
+    char *,
+    py::ssize_t,
+    char *,
+    py::ssize_t,
+    const std::vector<sycl::event> &,
+    const std::vector<sycl::event> &);
+
+/*!
+ * @brief Generic function to copy `nelems` elements from `src` usm_ndarray to
+ `dst` usm_ndarray while casting from `srcTy` to `dstTy`.
+
+  Both arrays have array dimensionality specified via argument `nd`. The
+  `shape_and_strides` is kernel accessible USM array of length `3*nd`, where the
+  first `nd` elements encode common shape, second `nd` elements contain strides
+  of `src` array, and the trailing `nd` elements contain strides of `dst` array.
+  `src_p` and `dst_p` represent pointers into respective arrays, but the start of
+  iteration begins at offset of `src_offset` elements for `src` array and at
+  offset `dst_offset` elements for `dst` array. Kernel is submitted to sycl queue
+  `q` with events `depends` and `additional_depends` as dependencies.
+
+  @param q Sycl queue to which the kernel is submitted.
+  @param nelems Number of elements to cast and copy.
+  @param nd Array dimensionality, i.e. number of indices needed to
+  identify an element of each array.
+  @param shape_and_strides Kernel accessible USM pointer to packed shape and
+  strides.
+  @param src_p Kernel accessible USM pointer for the source array
+  @param src_offset Offset to the beginning of iteration in number of
+  elements of source array from `src_p`.
+  @param dst_p Kernel accessible USM pointer for the destination array
+  @param dst_offset Offset to the beginning of iteration in number of
+  elements of destination array from `dst_p`.
+  @param depends List of events to wait for before starting computations, if
+  any.
+  @param additional_depends Additional list of events to wait for before
+  starting computations, if any.
+
+  @return Event to wait on to ensure that computation completes.
+  @ingroup CopyAndCastKernels
+ */
+template <typename dstTy, typename srcTy>
+sycl::event
+copy_and_cast_generic_impl(sycl::queue q,
+                           size_t nelems,
+                           int nd,
+                           py::ssize_t *shape_and_strides,
+                           char *src_p,
+                           py::ssize_t src_offset,
+                           char *dst_p,
+                           py::ssize_t dst_offset,
+                           const std::vector<sycl::event> &depends,
+                           const std::vector<sycl::event> &additional_depends)
+{
+    sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+        cgh.depends_on(additional_depends);
+        cgh.parallel_for<copy_cast_generic_kernel<srcTy, dstTy>>(
+            sycl::range<1>(nelems),
+            GenericCopyFunctor<Caster<srcTy, dstTy>>(
+                src_p, dst_p, shape_and_strides, nd, src_offset, dst_offset));
+    });
+
+    return copy_and_cast_ev;
+}
+
+/*!
+ * @brief Factory to get generic function pointer of type `fnT` for given source
+ * data type `S` and destination data type `D`.
+ * @ingroup CopyAndCastKernels
+ */
+template <typename fnT, typename D, typename S> struct CopyAndCastGenericFactory
+{
+    fnT get()
+    {
+        fnT f = copy_and_cast_generic_impl<D, S>;
+        return f;
+    }
+};
+
+// Specialization of copy_and_cast for 1D arrays
+
+/*!
+ * @brief Function pointer type for casting and copying 1D arrays.
+ * @ingroup CopyAndCastKernels
+ */
+typedef sycl::event (*copy_and_cast_1d_fn_ptr_t)(
+    sycl::queue,
+    size_t,
+    const std::array<py::ssize_t, 1>,
+    const std::array<py::ssize_t, 1>,
+    const std::array<py::ssize_t, 1>,
+    char *,
+    py::ssize_t,
+    char *,
+    py::ssize_t,
+    const std::vector<sycl::event> &);
+
+/*!
+ * @brief Function pointer type for casting and copying 2D arrays.
+ * @ingroup CopyAndCastKernels + */ +typedef sycl::event (*copy_and_cast_2d_fn_ptr_t)( + sycl::queue, + size_t, + const std::array, + const std::array, + const std::array, + char *, + py::ssize_t, + char *, + py::ssize_t, + const std::vector &); + +/*! + * @brief Specialized for given array dimension function to copy `nelems` + elements from `src` usm_ndarray to `dst` usm_ndarray while casting from `srcTy` + to `dstTy`. + + Both arrays have array dimensionality known at compile time and specified in + template parameters `nd`. Arrays' shape and strides are provided as + `std::array`. `src_p` and `dst_p` represent pointers into respective arrays, + but the start of iteration begins at offset of `src_offset` elements for `src` + array and at offset `dst_offset` elements for `dst` array. Kernel is submitted + to sycl queue `q` with events `depends` as dependencies. + + @param q The queue where the routine should be executed. + @param nelems Number of elements to cast and copy. + @param shape Common shape of the arrays. + @param src_strides Strides of the source array. + @param dst_strides Strides of the destination array. + @param src_p Kernel accessible USM pointer for the source array + @param src_offset Offset to the beginning of iteration in number of elements + of the source array from `src_p`. + @param dst_p Kernel accessible USM pointer for the destination array + @param dst_offset Offset to the beginning of iteration in number of elements + of the destination array from `src_p`. + @param depends List of events to wait for before starting computations, if + any. + + @return Event to wait on to ensure that computation completes. + * @ingroup CopyAndCastKernels + */ +template +sycl::event +copy_and_cast_nd_specialized_impl(sycl::queue q, + size_t nelems, + const std::array shape, + const std::array src_strides, + const std::array dst_strides, + char *src_p, + py::ssize_t src_offset, + char *dst_p, + py::ssize_t dst_offset, + const std::vector &depends) +{ + sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.parallel_for>( + sycl::range<1>(nelems), + NDSpecializedCopyFunctor>( + src_p, dst_p, shape, src_strides, dst_strides, src_offset, + dst_offset)); + }); + + return copy_and_cast_ev; +} + +/*! + * @brief Factory to get 1D-specialized function pointer of type `fnT` for given + * source data type `S` and destination data type `D`. + * @ingroup CopyAndCastKernels + */ +template struct CopyAndCast1DFactory +{ + fnT get() + { + fnT f = copy_and_cast_nd_specialized_impl; + return f; + } +}; + +/*! + * @brief Factory to get 2D-specialized function pointer of type `fnT` for given + * source data type `S` and destination data type `D`. 
+ * @ingroup CopyAndCastKernels + */ +template struct CopyAndCast2DFactory +{ + fnT get() + { + fnT f = copy_and_cast_nd_specialized_impl; + return f; + } +}; + +// ====================== Copying from host to USM + +template +class CasterForAccessor +{ +public: + CasterForAccessor() = default; + void operator()(AccessorT src, + std::ptrdiff_t src_offset, + char *dst, + std::ptrdiff_t dst_offset) const + { + using dpctl::tensor::type_utils::convert_impl; + + dstT *dst_ = reinterpret_cast(dst) + dst_offset; + *dst_ = convert_impl(src[src_offset]); + } +}; + +template class GenericCopyFromHostFunctor +{ +private: + AccessorT src_acc_; + char *dst_ = nullptr; + py::ssize_t *shape_strides_ = nullptr; + int nd_ = 0; + py::ssize_t src_offset0 = 0; + py::ssize_t dst_offset0 = 0; + +public: + GenericCopyFromHostFunctor(AccessorT src_acc, + char *dst_cp, + py::ssize_t *shape_strides, + int nd, + py::ssize_t src_offset, + py::ssize_t dst_offset) + : src_acc_(src_acc), dst_(dst_cp), shape_strides_(shape_strides), + nd_(nd), src_offset0(src_offset), dst_offset0(dst_offset) + { + } + + void operator()(sycl::id<1> wiid) const + { + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + CIndexer_vector indxr(nd_); + indxr.get_displacement( + static_cast(wiid.get(0)), + const_cast(shape_strides_), // common shape + const_cast(shape_strides_ + + nd_), // src strides + const_cast(shape_strides_ + + 2 * nd_), // dst strides + src_offset, // modified by reference + dst_offset // modified by reference + ); + CastFnT fn{}; + fn(src_acc_, src_offset0 + src_offset, dst_, dst_offset0 + dst_offset); + } +}; + +typedef void (*copy_and_cast_from_host_blocking_fn_ptr_t)( + sycl::queue, + size_t, + int, + py::ssize_t *, + const char *, + py::ssize_t, + py::ssize_t, + py::ssize_t, + char *, + py::ssize_t, + const std::vector &, + const std::vector &); + +/*! + * @brief Function to copy from NumPy's ndarray with elements of type `srcTy` + * into usm_ndarray with elements of type `srcTy`. + * + * Function to cast and copy elements from numpy.ndarray specified by typeless + * `host_src_p` and the `src_offset` given in the number of array elements. + * Arrays' metadata are given in packed USM vector of length `3*nd` whose first + * `nd` elements contain arrays' shape, next `nd` elements specify source + * strides in elements (not bytes), and trailing `nd` elements specify + * destination array strides. Kernel dependencies are given by two vectors of + * events: `depends` and `additional_depends`. The function execution is + * complete at the return. + * + * @param q The queue where the routine should be executed. + * @param nelems Number of elements to cast and copy. + * @param nd The dimensionality of arrays + * @param shape_and_strides Kernel accessible USM pointer to packed shape and + * strides. + * @param host_src_p Host (not USM allocated) pointer associated with the + * source array. + * @param src_offset Offset to the beginning of iteration in number of elements + * of the source array from `host_src_p`. + * @param src_min_nelem_offset Smallest value of offset relative to + * `host_src_p` in number of elements attained while iterating over elements of + * the source array. + * @param src_max_nelem_offset Largest value of offset relative to `host_src_p` + * in number of elements attained while iterating over elements of the source + * array. + * @param dst_p USM pointer associated with the destination array. 
+ * @param dst_offset Offset to the beginning of iteration in number of elements + * of the destination array from `dst_p`. + * @param depends List of events to wait for before starting computations, if + * any. + * @param additional_depends List of additional events to wait for before + * starting computations, if any. + * + * @ingroup CopyAndCastKernels + */ +template +void copy_and_cast_from_host_impl( + sycl::queue q, + size_t nelems, + int nd, + py::ssize_t *shape_and_strides, + const char *host_src_p, + py::ssize_t src_offset, + py::ssize_t src_min_nelem_offset, + py::ssize_t src_max_nelem_offset, + char *dst_p, + py::ssize_t dst_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + py::ssize_t nelems_range = src_max_nelem_offset - src_min_nelem_offset + 1; + sycl::buffer npy_buf( + reinterpret_cast(host_src_p) + src_min_nelem_offset, + sycl::range<1>(nelems_range), {sycl::property::buffer::use_host_ptr{}}); + + sycl::event copy_and_cast_from_host_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + sycl::accessor npy_acc(npy_buf, cgh, sycl::read_only); + + cgh.parallel_for>( + sycl::range<1>(nelems), + GenericCopyFromHostFunctor< + CasterForAccessor, + decltype(npy_acc)>(npy_acc, dst_p, shape_and_strides, nd, + src_offset - src_min_nelem_offset, + dst_offset)); + }); + + // perform explicit synchronization. Implicit synchronization would be + // performed by sycl::buffer destructor. + copy_and_cast_from_host_ev.wait_and_throw(); + + return; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for given NumPy array + * source data type `S` and destination data type `D`. + * @defgroup CopyAndCastKernels + */ +template +struct CopyAndCastFromHostFactory +{ + fnT get() + { + fnT f = copy_and_cast_from_host_impl; + return f; + } +}; + +// =============== Copying for reshape ================== // + +template class GenericCopyForReshapeFunctor +{ +private: + py::ssize_t offset = 0; + py::ssize_t size = 1; + int src_nd = -1; + int dst_nd = -1; + // USM array of size 2*(src_nd + dst_nd) + // [ src_shape; src_strides; dst_shape; dst_strides ] + const py::ssize_t *src_dst_shapes_and_strides = nullptr; + Ty *src_p = nullptr; + Ty *dst_p = nullptr; + +public: + GenericCopyForReshapeFunctor(py::ssize_t shift, + py::ssize_t nelems, + int src_ndim, + int dst_ndim, + const py::ssize_t *packed_shapes_and_strides, + char *src_ptr, + char *dst_ptr) + : offset(shift), size(nelems), src_nd(src_ndim), dst_nd(dst_ndim), + src_dst_shapes_and_strides(packed_shapes_and_strides), + src_p(reinterpret_cast(src_ptr)), + dst_p(reinterpret_cast(dst_ptr)) + { + } + + void operator()(sycl::id<1> wiid) const + { + py::ssize_t this_src_offset(0); + CIndexer_vector src_indxr(src_nd); + + src_indxr.get_displacement( + static_cast(wiid.get(0)), + const_cast( + src_dst_shapes_and_strides), // src shape + const_cast(src_dst_shapes_and_strides + + src_nd), // src strides + this_src_offset // modified by reference + ); + const Ty *in = src_p + this_src_offset; + + py::ssize_t this_dst_offset(0); + CIndexer_vector dst_indxr(dst_nd); + py::ssize_t shifted_wiid = + (static_cast(wiid.get(0)) + offset) % size; + shifted_wiid = (shifted_wiid >= 0) ? 
shifted_wiid : shifted_wiid + size; + dst_indxr.get_displacement( + shifted_wiid, + const_cast(src_dst_shapes_and_strides + + 2 * src_nd), // dst shape + const_cast(src_dst_shapes_and_strides + + 2 * src_nd + dst_nd), // dst strides + this_dst_offset // modified by reference + ); + + Ty *out = dst_p + this_dst_offset; + *out = *in; + } +}; + +// define function type +typedef sycl::event (*copy_for_reshape_fn_ptr_t)( + sycl::queue, + py::ssize_t, // shift + size_t, // num_elements + int, + int, // src_nd, dst_nd + py::ssize_t *, // packed shapes and strides + char *, // src_data_ptr + char *, // dst_data_ptr + const std::vector &); + +/*! + * @brief Function to copy content of array while reshaping. + * + * Submits a kernel to perform a copy `dst[unravel_index((i + shift) % nelems , + * dst.shape)] = src[unravel_undex(i, src.shape)]`. + * + * @param q The execution queue where kernel is submitted. + * @param shift The shift in flat indexing. + * @param nelems The number of elements to copy + * @param src_nd Array dimension of the source array + * @param dst_nd Array dimension of the destination array + * @param packed_shapes_and_strides Kernel accessible USM array of size + * `2*src_nd + 2*dst_nd` with contant `[src_shape, src_strides, dst_shape, + * dst_strides]`. + * @param src_p Typeless USM pointer to the buffer of the source array + * @param dst_p Typeless USM pointer to the buffer of the destination array + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @ingroup CopyAndCastKernels + */ +template +sycl::event +copy_for_reshape_generic_impl(sycl::queue q, + py::ssize_t shift, + size_t nelems, + int src_nd, + int dst_nd, + py::ssize_t *packed_shapes_and_strides, + char *src_p, + char *dst_p, + const std::vector &depends) +{ + sycl::event copy_for_reshape_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.parallel_for>( + sycl::range<1>(nelems), + GenericCopyForReshapeFunctor(shift, nelems, src_nd, dst_nd, + packed_shapes_and_strides, src_p, + dst_p)); + }); + + return copy_for_reshape_ev; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for given array data + * type `Ty`. 
+ * @ingroup CopyAndCastKernels + */ +template struct CopyForReshapeGenericFactory +{ + fnT get() + { + fnT f = copy_for_reshape_generic_impl; + return f; + } +}; + +} // namespace copy_and_cast +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/utils/strided_iters.hpp b/dpctl/tensor/libtensor/include/utils/strided_iters.hpp index 9725e5fb6b..c24ed54941 100644 --- a/dpctl/tensor/libtensor/include/utils/strided_iters.hpp +++ b/dpctl/tensor/libtensor/include/utils/strided_iters.hpp @@ -499,16 +499,16 @@ int simplify_iteration_two_strides(const int nd, return nd_; } -using vecT = std::vector; -std::tuple contract_iter(vecT shape, vecT strides) +template > +std::tuple contract_iter(vecT shape, vecT strides) { const size_t dim = shape.size(); if (dim != strides.size()) { - throw py::value_error("Shape and strides must be of equal size."); + throw Error("Shape and strides must be of equal size."); } vecT out_shape = shape; vecT out_strides = strides; - py::ssize_t disp(0); + T disp(0); int nd = simplify_iteration_stride(dim, out_shape.data(), out_strides.data(), disp); @@ -517,18 +517,19 @@ std::tuple contract_iter(vecT shape, vecT strides) return std::make_tuple(out_shape, out_strides, disp); } -std::tuple +template > +std::tuple contract_iter2(vecT shape, vecT strides1, vecT strides2) { const size_t dim = shape.size(); if (dim != strides1.size() || dim != strides2.size()) { - throw py::value_error("Shape and strides must be of equal size."); + throw Error("Shape and strides must be of equal size."); } vecT out_shape = shape; vecT out_strides1 = strides1; vecT out_strides2 = strides2; - py::ssize_t disp1(0); - py::ssize_t disp2(0); + T disp1(0); + T disp2(0); int nd = simplify_iteration_two_strides(dim, out_shape.data(), out_strides1.data(), diff --git a/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp b/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp index a617daf975..feea1308cb 100644 --- a/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp +++ b/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp @@ -159,83 +159,64 @@ class DispatchVectorBuilder } }; -// Lookup a type according to its size, and return a value corresponding to the -// NumPy typenum. -template constexpr int platform_typeid_lookup() -{ - return -1; -} - -template -constexpr int platform_typeid_lookup(int I, Ints... Is) -{ - return sizeof(Concrete) == sizeof(T) - ? 
I - : platform_typeid_lookup(Is...); -} - struct usm_ndarray_types { - static usm_ndarray_types &get() - { - static usm_ndarray_types singleton = populate_fields(); - return singleton; - } int typenum_to_lookup_id(int typenum) { using typenum_t = dpctl::tensor::detail::typenum_t; + auto &api = ::dpctl::detail::dpctl_capi::get(); - if (typenum == UAR_DOUBLE_) { + if (typenum == api.UAR_DOUBLE_) { return static_cast(typenum_t::DOUBLE); } - else if (typenum == UAR_INT64_) { + else if (typenum == api.UAR_INT64_) { return static_cast(typenum_t::INT64); } - else if (typenum == UAR_INT32_) { + else if (typenum == api.UAR_INT32_) { return static_cast(typenum_t::INT32); } - else if (typenum == UAR_BOOL_) { + else if (typenum == api.UAR_BOOL_) { return static_cast(typenum_t::BOOL); } - else if (typenum == UAR_CDOUBLE_) { + else if (typenum == api.UAR_CDOUBLE_) { return static_cast(typenum_t::CDOUBLE); } - else if (typenum == UAR_FLOAT_) { + else if (typenum == api.UAR_FLOAT_) { return static_cast(typenum_t::FLOAT); } - else if (typenum == UAR_INT16_) { + else if (typenum == api.UAR_INT16_) { return static_cast(typenum_t::INT16); } - else if (typenum == UAR_INT8_) { + else if (typenum == api.UAR_INT8_) { return static_cast(typenum_t::INT8); } - else if (typenum == UAR_UINT64_) { + else if (typenum == api.UAR_UINT64_) { return static_cast(typenum_t::UINT64); } - else if (typenum == UAR_UINT32_) { + else if (typenum == api.UAR_UINT32_) { return static_cast(typenum_t::UINT32); } - else if (typenum == UAR_UINT16_) { + else if (typenum == api.UAR_UINT16_) { return static_cast(typenum_t::UINT16); } - else if (typenum == UAR_UINT8_) { + else if (typenum == api.UAR_UINT8_) { return static_cast(typenum_t::UINT8); } - else if (typenum == UAR_CFLOAT_) { + else if (typenum == api.UAR_CFLOAT_) { return static_cast(typenum_t::CFLOAT); } - else if (typenum == UAR_HALF_) { + else if (typenum == api.UAR_HALF_) { return static_cast(typenum_t::HALF); } - else if (typenum == UAR_INT || typenum == UAR_UINT) { + else if (typenum == api.UAR_INT_ || typenum == api.UAR_UINT_) { switch (sizeof(int)) { case sizeof(std::int32_t): - return ((typenum == UAR_INT) + return ((typenum == api.UAR_INT_) ? static_cast(typenum_t::INT32) : static_cast(typenum_t::UINT32)); case sizeof(std::int64_t): - return ((typenum == UAR_INT) + return ((typenum == api.UAR_INT_) ? 
static_cast(typenum_t::INT64) : static_cast(typenum_t::UINT64)); default: @@ -251,58 +232,6 @@ struct usm_ndarray_types } private: - int UAR_BOOL_ = -1; - // Platform-dependent normalization - int UAR_INT8_ = -1; - int UAR_UINT8_ = -1; - int UAR_INT16_ = -1; - int UAR_UINT16_ = -1; - int UAR_INT32_ = -1; - int UAR_UINT32_ = -1; - int UAR_INT64_ = -1; - int UAR_UINT64_ = -1; - int UAR_HALF_ = -1; - int UAR_FLOAT_ = -1; - int UAR_DOUBLE_ = -1; - int UAR_CFLOAT_ = -1; - int UAR_CDOUBLE_ = -1; - int UAR_TYPE_SENTINEL_ = -1; - - void init_constants() - { - UAR_BOOL_ = UAR_BOOL; - UAR_INT8_ = UAR_BYTE; - UAR_UINT8_ = UAR_UBYTE; - UAR_INT16_ = UAR_SHORT; - UAR_UINT16_ = UAR_USHORT; - UAR_INT32_ = platform_typeid_lookup( - UAR_LONG, UAR_INT, UAR_SHORT); - UAR_UINT32_ = platform_typeid_lookup( - UAR_ULONG, UAR_UINT, UAR_USHORT); - UAR_INT64_ = platform_typeid_lookup( - UAR_LONG, UAR_LONGLONG, UAR_INT); - UAR_UINT64_ = platform_typeid_lookup( - UAR_ULONG, UAR_ULONGLONG, UAR_UINT); - UAR_HALF_ = UAR_HALF; - UAR_FLOAT_ = UAR_FLOAT; - UAR_DOUBLE_ = UAR_DOUBLE; - UAR_CFLOAT_ = UAR_CFLOAT; - UAR_CDOUBLE_ = UAR_CDOUBLE; - UAR_TYPE_SENTINEL_ = UAR_TYPE_SENTINEL; - } - - static usm_ndarray_types populate_fields() - { - import_dpctl(); - - usm_ndarray_types types; - types.init_constants(); - - return types; - } - void throw_unrecognized_typenum_error(int typenum) { throw std::runtime_error("Unrecogized typenum " + diff --git a/dpctl/tensor/libtensor/include/utils/type_utils.hpp b/dpctl/tensor/libtensor/include/utils/type_utils.hpp new file mode 100644 index 0000000000..181ff89adc --- /dev/null +++ b/dpctl/tensor/libtensor/include/utils/type_utils.hpp @@ -0,0 +1,73 @@ +//===------ type_utils.hpp - Implementation of types utils ----*-C++-*/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions for value casting. 
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace type_utils
+{
+
+template <class T> struct is_complex : std::false_type
+{
+};
+template <class T> struct is_complex<std::complex<T>> : std::true_type
+{
+};
+
+template <typename dstTy, typename srcTy> dstTy convert_impl(const srcTy &v)
+{
+    if constexpr (std::is_same<dstTy, srcTy>::value) {
+        return v;
+    }
+    else if constexpr (std::is_same_v<dstTy, bool> && is_complex<srcTy>::value)
+    {
+        // bool(complex_v) == (complex_v.real() != 0) || (complex_v.imag() != 0)
+        return (convert_impl<bool, typename srcTy::value_type>(v.real()) ||
+                convert_impl<bool, typename srcTy::value_type>(v.imag()));
+    }
+    else if constexpr (is_complex<srcTy>::value && !is_complex<dstTy>::value) {
+        // real_t(complex_v) == real_t(complex_v.real())
+        return convert_impl<dstTy, typename srcTy::value_type>(v.real());
+    }
+    else if constexpr (!std::is_integral<srcTy>::value &&
+                       !std::is_same<dstTy, bool>::value &&
+                       std::is_integral<dstTy>::value &&
+                       std::is_unsigned<dstTy>::value)
+    {
+        // first cast to signed variant, then cast to unsigned one
+        using signedT = typename std::make_signed<dstTy>::type;
+        return static_cast<dstTy>(convert_impl<signedT, srcTy>(v));
+    }
+    else {
+        return static_cast<dstTy>(v);
+    }
+}
+
+} // namespace type_utils
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp b/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp
new file mode 100644
index 0000000000..c81430d54b
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp
@@ -0,0 +1,359 @@
+//===-- tensor_py.cpp - Implementation of _tensor_impl module --*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
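As a quick illustration of the casting rules `convert_impl` implements (complex to bool is true when either component is non-zero, complex to real keeps only the real part, floating point to unsigned goes through the signed type first), here is a host-only sketch; the relative include path is an assumption and the snippet is not part of the patch:

#include <complex>
#include <cstdint>
#include <iostream>
#include "utils/type_utils.hpp"

int main()
{
    using dpctl::tensor::type_utils::convert_impl;

    std::complex<float> z{0.0f, 2.5f};
    // complex -> bool: true because the imaginary part is non-zero
    bool b = convert_impl<bool, std::complex<float>>(z);
    // complex -> double: keeps the real part only
    double r = convert_impl<double, std::complex<float>>(z);
    // double -> uint32_t: -1.0 is cast to int32_t first, then to unsigned
    std::uint32_t u = convert_impl<std::uint32_t, double>(-1.0);

    std::cout << b << " " << r << " " << u << "\n"; // prints: 1 0 4294967295
    return 0;
}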
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include "kernels/copy_and_cast.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "simplify_iteration_space.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace _ns = dpctl::tensor::detail; + +using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_1d_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_2d_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_generic_fn_ptr_t; + +static copy_and_cast_generic_fn_ptr_t + copy_and_cast_generic_dispatch_table[_ns::num_types][_ns::num_types]; +static copy_and_cast_1d_fn_ptr_t + copy_and_cast_1d_dispatch_table[_ns::num_types][_ns::num_types]; +static copy_and_cast_2d_fn_ptr_t + copy_and_cast_2d_dispatch_table[_ns::num_types][_ns::num_types]; + +namespace py = pybind11; + +using dpctl::tensor::c_contiguous_strides; +using dpctl::tensor::f_contiguous_strides; + +using dpctl::utils::keep_args_alive; + +sycl::event _populate_packed_shape_strides_for_copycast_kernel( + sycl::queue exec_q, + py::ssize_t *device_shape_strides, // to be populated + const std::vector &common_shape, + const std::vector &src_strides, + const std::vector &dst_strides) +{ + // memory transfer optimization, use USM-host for temporary speeds up + // tranfer to device, especially on dGPUs + using usm_host_allocatorT = + sycl::usm_allocator; + using shT = std::vector; + size_t nd = common_shape.size(); + + usm_host_allocatorT allocator(exec_q); + + // create host temporary for packed shape and strides managed by shared + // pointer. 
Packed vector is a concatenation of common_shape, src_strides and
+    // dst_strides
+    std::shared_ptr<shT> shp_host_shape_strides =
+        std::make_shared<shT>(3 * nd, allocator);
+    std::copy(common_shape.begin(), common_shape.end(),
+              shp_host_shape_strides->begin());
+
+    std::copy(src_strides.begin(), src_strides.end(),
+              shp_host_shape_strides->begin() + nd);
+
+    std::copy(dst_strides.begin(), dst_strides.end(),
+              shp_host_shape_strides->begin() + 2 * nd);
+
+    sycl::event copy_shape_ev = exec_q.copy<py::ssize_t>(
+        shp_host_shape_strides->data(), device_shape_strides,
+        shp_host_shape_strides->size());
+
+    exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(copy_shape_ev);
+        cgh.host_task([shp_host_shape_strides]() {
+            // increment shared pointer ref-count to keep it alive
+            // till copy operation completes;
+        });
+    });
+
+    return copy_shape_ev;
+}
+
+std::pair<sycl::event, sycl::event>
+copy_usm_ndarray_into_usm_ndarray(dpctl::tensor::usm_ndarray src,
+                                  dpctl::tensor::usm_ndarray dst,
+                                  sycl::queue exec_q,
+                                  const std::vector<sycl::event> &depends = {})
+{
+    // array dimensions must be the same
+    int src_nd = src.get_ndim();
+    int dst_nd = dst.get_ndim();
+
+    if (src_nd != dst_nd) {
+        throw py::value_error("Array dimensions are not the same.");
+    }
+
+    // shapes must be the same
+    const py::ssize_t *src_shape = src.get_shape_raw();
+    const py::ssize_t *dst_shape = dst.get_shape_raw();
+
+    bool shapes_equal(true);
+    size_t src_nelems(1);
+
+    for (int i = 0; i < src_nd; ++i) {
+        src_nelems *= static_cast<size_t>(src_shape[i]);
+        shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]);
+    }
+    if (!shapes_equal) {
+        throw py::value_error("Array shapes are not the same.");
+    }
+
+    if (src_nelems == 0) {
+        // nothing to do
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    auto dst_offsets = dst.get_minmax_offsets();
+    // destination must be ample enough to accommodate all elements
+    {
+        size_t range =
+            static_cast<size_t>(dst_offsets.second - dst_offsets.first);
+        if (range + 1 < src_nelems) {
+            throw py::value_error(
+                "Destination array can not accommodate all the "
+                "elements of source array.");
+        }
+    }
+
+    // check compatibility of execution queue and allocation queue
+    sycl::queue src_q = src.get_queue();
+    sycl::queue dst_q = dst.get_queue();
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src_q, dst_q})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    auto array_types = dpctl::tensor::detail::usm_ndarray_types();
+    int src_type_id = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum);
+
+    char *src_data = src.get_data();
+    char *dst_data = dst.get_data();
+
+    // check that arrays do not overlap, and concurrent copying is safe.
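The check below compares the byte ranges spanned by the two arrays, derived from their minimal and maximal element offsets and their element sizes. As a standalone sketch of that interval test (hypothetical helpers, not part of the patch): two half-open byte ranges intersect exactly when each begins before the other ends.

#include <cstddef>
#include <functional>

// Byte range spanned by an array:
// [data + min_offset * elem_size, data + (max_offset + 1) * elem_size)
struct byte_range
{
    const char *begin;
    const char *end;
};

inline byte_range make_range(const char *data,
                             std::ptrdiff_t min_elem_offset,
                             std::ptrdiff_t max_elem_offset,
                             std::ptrdiff_t elem_size)
{
    return {data + min_elem_offset * elem_size,
            data + (max_elem_offset + 1) * elem_size};
}

inline bool ranges_overlap(const byte_range &a, const byte_range &b)
{
    // Half-open intervals intersect iff each starts before the other ends.
    // std::less gives a total order even for pointers into unrelated allocations.
    std::less<const char *> lt;
    return lt(a.begin, b.end) && lt(b.begin, a.end);
}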
+ auto src_offsets = src.get_minmax_offsets(); + int src_elem_size = src.get_elemsize(); + int dst_elem_size = dst.get_elemsize(); + + bool memory_overlap = + ((dst_data - src_data > src_offsets.second * src_elem_size - + dst_offsets.first * dst_elem_size) && + (src_data - dst_data > dst_offsets.second * dst_elem_size - + src_offsets.first * src_elem_size)); + if (memory_overlap) { + // TODO: could use a temporary, but this is done by the caller + throw py::value_error("Arrays index overlapping segments of memory"); + } + + bool is_src_c_contig = src.is_c_contiguous(); + bool is_src_f_contig = src.is_f_contiguous(); + + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_dst_f_contig = dst.is_f_contiguous(); + + // check for applicability of special cases: + // (same type && (both C-contiguous || both F-contiguous) + bool both_c_contig = (is_src_c_contig && is_dst_c_contig); + bool both_f_contig = (is_src_f_contig && is_dst_f_contig); + if (both_c_contig || both_f_contig) { + if (src_type_id == dst_type_id) { + + sycl::event copy_ev = + exec_q.memcpy(static_cast(dst_data), + static_cast(src_data), + src_nelems * src_elem_size, depends); + + // make sure src and dst are not GC-ed before copy_ev is complete + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {copy_ev}), copy_ev); + } + // With contract_iter2 in place, there is no need to write + // dedicated kernels for casting between contiguous arrays + } + + const py::ssize_t *src_strides = src.get_strides_raw(); + const py::ssize_t *dst_strides = dst.get_strides_raw(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd; + const py::ssize_t *shape = src_shape; + + constexpr py::ssize_t src_itemsize = 1; // in elements + constexpr py::ssize_t dst_itemsize = 1; // in elements + + // all args except itemsizes and is_?_contig bools can be modified by + // reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, shape, src_strides, src_itemsize, is_src_c_contig, is_src_f_contig, + dst_strides, dst_itemsize, is_dst_c_contig, is_dst_f_contig, + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (nd < 3) { + if (nd == 1) { + std::array shape_arr = {shape[0]}; + // strides may be null + std::array src_strides_arr = { + (src_strides ? src_strides[0] : 1)}; + std::array dst_strides_arr = { + (dst_strides ? 
dst_strides[0] : 1)}; + + auto fn = copy_and_cast_1d_dispatch_table[dst_type_id][src_type_id]; + sycl::event copy_and_cast_1d_event = fn( + exec_q, src_nelems, shape_arr, src_strides_arr, dst_strides_arr, + src_data, src_offset, dst_data, dst_offset, depends); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {copy_and_cast_1d_event}), + copy_and_cast_1d_event); + } + else if (nd == 2) { + std::array shape_arr = {shape[0], shape[1]}; + std::array src_strides_arr = {src_strides[0], + src_strides[1]}; + std::array dst_strides_arr = {dst_strides[0], + dst_strides[1]}; + + auto fn = copy_and_cast_2d_dispatch_table[dst_type_id][src_type_id]; + + sycl::event copy_and_cast_2d_event = fn( + exec_q, src_nelems, shape_arr, src_strides_arr, dst_strides_arr, + src_data, src_offset, dst_data, dst_offset, depends); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {copy_and_cast_2d_event}), + copy_and_cast_2d_event); + } + else if (nd == 0) { // case of a scalar + assert(src_nelems == 1); + std::array shape_arr = {1}; + std::array src_strides_arr = {1}; + std::array dst_strides_arr = {1}; + + auto fn = copy_and_cast_1d_dispatch_table[dst_type_id][src_type_id]; + + sycl::event copy_and_cast_0d_event = fn( + exec_q, src_nelems, shape_arr, src_strides_arr, dst_strides_arr, + src_data, src_offset, dst_data, dst_offset, depends); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {copy_and_cast_0d_event}), + copy_and_cast_0d_event); + } + } + + // Generic implementation + auto copy_and_cast_fn = + copy_and_cast_generic_dispatch_table[dst_type_id][src_type_id]; + + // If shape/strides are accessed with accessors, buffer destructor + // will force syncronization. + py::ssize_t *shape_strides = + sycl::malloc_device(3 * nd, exec_q); + + if (shape_strides == nullptr) { + throw std::runtime_error("Unabled to allocate device memory"); + } + + sycl::event copy_shape_ev = + _populate_packed_shape_strides_for_copycast_kernel( + exec_q, shape_strides, simplified_shape, simplified_src_strides, + simplified_dst_strides); + + sycl::event copy_and_cast_generic_ev = copy_and_cast_fn( + exec_q, src_nelems, nd, shape_strides, src_data, src_offset, dst_data, + dst_offset, depends, {copy_shape_ev}); + + // async free of shape_strides temporary + auto ctx = exec_q.get_context(); + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(copy_and_cast_generic_ev); + cgh.host_task( + [ctx, shape_strides]() { sycl::free(shape_strides, ctx); }); + }); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {copy_and_cast_generic_ev}), + copy_and_cast_generic_ev); +} + +void init_copy_and_cast_usm_to_usm_dispatch_tables(void) +{ + using namespace dpctl::tensor::detail; + + using dpctl::tensor::kernels::copy_and_cast::CopyAndCastGenericFactory; + DispatchTableBuilder + dtb_generic; + dtb_generic.populate_dispatch_table(copy_and_cast_generic_dispatch_table); + + using dpctl::tensor::kernels::copy_and_cast::CopyAndCast1DFactory; + DispatchTableBuilder + dtb_1d; + dtb_1d.populate_dispatch_table(copy_and_cast_1d_dispatch_table); + + using dpctl::tensor::kernels::copy_and_cast::CopyAndCast2DFactory; + DispatchTableBuilder + dtb_2d; + dtb_2d.populate_dispatch_table(copy_and_cast_2d_dispatch_table); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp b/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp new file mode 100644 index 0000000000..192d70c0f2 --- /dev/null +++ 
b/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp @@ -0,0 +1,50 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair +copy_usm_ndarray_into_usm_ndarray(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + const std::vector &depends = {}); + +extern void init_copy_and_cast_usm_to_usm_dispatch_tables(); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/copy_for_reshape.cpp b/dpctl/tensor/libtensor/source/copy_for_reshape.cpp new file mode 100644 index 0000000000..7f7e866bb1 --- /dev/null +++ b/dpctl/tensor/libtensor/source/copy_for_reshape.cpp @@ -0,0 +1,258 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include + +#include "copy_for_reshape.hpp" +#include "dpctl4pybind11.hpp" +#include "kernels/copy_and_cast.hpp" +#include "utils/type_dispatch.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace _ns = dpctl::tensor::detail; + +using dpctl::tensor::kernels::copy_and_cast::copy_for_reshape_fn_ptr_t; +using dpctl::utils::keep_args_alive; + +// define static vector +static copy_for_reshape_fn_ptr_t + copy_for_reshape_generic_dispatch_vector[_ns::num_types]; + +/* + * Copies src into dst (same data type) of different shapes by using flat + * iterations. 
+ * + * Equivalent to the following loop: + * + * for i for range(src.size): + * dst[np.multi_index(i, dst.shape)] = src[np.multi_index(i, src.shape)] + */ +std::pair +copy_usm_ndarray_for_reshape(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray dst, + py::ssize_t shift, + sycl::queue exec_q, + const std::vector &depends) +{ + py::ssize_t src_nelems = src.get_size(); + py::ssize_t dst_nelems = dst.get_size(); + + // Must have the same number of elements + if (src_nelems != dst_nelems) { + throw py::value_error( + "copy_usm_ndarray_for_reshape requires src and dst to " + "have the same number of elements."); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + // typenames must be the same + if (src_typenum != dst_typenum) { + throw py::value_error( + "copy_usm_ndarray_for_reshape requires src and dst to " + "have the same type."); + } + + if (src_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + // destination must be ample enough to accomodate all elements + { + auto dst_offsets = dst.get_minmax_offsets(); + py::ssize_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < src_nelems) { + throw py::value_error( + "Destination array can not accomodate all the " + "elements of source array."); + } + } + + // check same contexts + sycl::queue src_q = src.get_queue(); + sycl::queue dst_q = dst.get_queue(); + + if (!dpctl::utils::queues_are_compatible(exec_q, {src_q, dst_q})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if (src_nelems == 1) { + // handle special case of 1-element array + int src_elemsize = src.get_elemsize(); + char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + sycl::event copy_ev = + exec_q.copy(src_data, dst_data, src_elemsize); + return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}), + copy_ev); + } + + // dimensions may be different + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + + auto array_types = dpctl::tensor::detail::usm_ndarray_types(); + int type_id = array_types.typenum_to_lookup_id(src_typenum); + + auto fn = copy_for_reshape_generic_dispatch_vector[type_id]; + + // packed_shape_strides = [src_shape, src_strides, dst_shape, dst_strides] + py::ssize_t *packed_shapes_strides = + sycl::malloc_device(2 * (src_nd + dst_nd), exec_q); + + if (packed_shapes_strides == nullptr) { + throw std::runtime_error("Unabled to allocate device memory"); + } + + using usm_host_allocatorT = + sycl::usm_allocator; + using shT = std::vector; + usm_host_allocatorT allocator(exec_q); + std::shared_ptr packed_host_shapes_strides_shp = + std::make_shared(2 * (src_nd + dst_nd), allocator); + + std::copy(src_shape, src_shape + src_nd, + packed_host_shapes_strides_shp->begin()); + std::copy(dst_shape, dst_shape + dst_nd, + packed_host_shapes_strides_shp->begin() + 2 * src_nd); + + const py::ssize_t *src_strides = src.get_strides_raw(); + if (src_strides == nullptr) { + if (src.is_c_contiguous()) { + const auto &src_contig_strides = + c_contiguous_strides(src_nd, src_shape); + std::copy(src_contig_strides.begin(), src_contig_strides.end(), + packed_host_shapes_strides_shp->begin() + src_nd); + } + else if (src.is_f_contiguous()) { + const auto &src_contig_strides = + f_contiguous_strides(src_nd, src_shape); + std::copy(src_contig_strides.begin(), src_contig_strides.end(), + 
packed_host_shapes_strides_shp->begin() + src_nd); + } + else { + sycl::free(packed_shapes_strides, exec_q); + throw std::runtime_error( + "Invalid src array encountered: in copy_for_reshape function"); + } + } + else { + std::copy(src_strides, src_strides + src_nd, + packed_host_shapes_strides_shp->begin() + src_nd); + } + + const py::ssize_t *dst_strides = dst.get_strides_raw(); + if (dst_strides == nullptr) { + if (dst.is_c_contiguous()) { + const auto &dst_contig_strides = + c_contiguous_strides(dst_nd, dst_shape); + std::copy(dst_contig_strides.begin(), dst_contig_strides.end(), + packed_host_shapes_strides_shp->begin() + 2 * src_nd + + dst_nd); + } + else if (dst.is_f_contiguous()) { + const auto &dst_contig_strides = + f_contiguous_strides(dst_nd, dst_shape); + std::copy(dst_contig_strides.begin(), dst_contig_strides.end(), + packed_host_shapes_strides_shp->begin() + 2 * src_nd + + dst_nd); + } + else { + sycl::free(packed_shapes_strides, exec_q); + throw std::runtime_error( + "Invalid dst array encountered: in copy_for_reshape function"); + } + } + else { + std::copy(dst_strides, dst_strides + dst_nd, + packed_host_shapes_strides_shp->begin() + 2 * src_nd + + dst_nd); + } + + // copy packed shapes and strides from host to devices + sycl::event packed_shape_strides_copy_ev = exec_q.copy( + packed_host_shapes_strides_shp->data(), packed_shapes_strides, + packed_host_shapes_strides_shp->size()); + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(packed_shape_strides_copy_ev); + cgh.host_task([packed_host_shapes_strides_shp] { + // Capturing shared pointer ensures that the underlying vector is + // not destroyed until after its data are copied into packed USM + // vector + }); + }); + + char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + std::vector all_deps(depends.size() + 1); + all_deps.push_back(packed_shape_strides_copy_ev); + all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); + + sycl::event copy_for_reshape_event = + fn(exec_q, shift, src_nelems, src_nd, dst_nd, packed_shapes_strides, + src_data, dst_data, all_deps); + + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(copy_for_reshape_event); + auto ctx = exec_q.get_context(); + cgh.host_task([packed_shapes_strides, ctx]() { + sycl::free(packed_shapes_strides, ctx); + }); + }); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {copy_for_reshape_event}), + copy_for_reshape_event); +} + +void init_copy_for_reshape_dispatch_vectors(void) +{ + using namespace dpctl::tensor::detail; + using dpctl::tensor::kernels::copy_and_cast::CopyForReshapeGenericFactory; + + DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(copy_for_reshape_generic_dispatch_vector); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/copy_for_reshape.hpp b/dpctl/tensor/libtensor/source/copy_for_reshape.hpp new file mode 100644 index 0000000000..51c3719b97 --- /dev/null +++ b/dpctl/tensor/libtensor/source/copy_for_reshape.hpp @@ -0,0 +1,51 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
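For reference, the flat-index mapping performed by `copy_usm_ndarray_for_reshape` above reduces, for contiguous C-ordered buffers, to a rolled element-wise copy; the kernel generalizes this by unraveling both flat indices through the packed shapes and strides. A host-only sketch (hypothetical helper name, not part of the patch):

#include <cstddef>

// Reference semantics for contiguous (C-order) src and dst buffers holding
// nelems elements each: element i of src lands at position (i + shift) % nelems
// of dst.
template <typename T>
void copy_for_reshape_reference(const T *src,
                                T *dst,
                                std::ptrdiff_t shift,
                                std::ptrdiff_t nelems)
{
    for (std::ptrdiff_t i = 0; i < nelems; ++i) {
        std::ptrdiff_t j = (i + shift) % nelems;
        if (j < 0) {
            j += nelems; // mirror the kernel's wrap-around for negative shifts
        }
        dst[j] = src[i];
    }
}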
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair +copy_usm_ndarray_for_reshape(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray dst, + py::ssize_t shift, + sycl::queue exec_q, + const std::vector &depends = {}); + +extern void init_copy_for_reshape_dispatch_vectors(); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp new file mode 100644 index 0000000000..c6b42e48ff --- /dev/null +++ b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp @@ -0,0 +1,276 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include + +#include "dpctl4pybind11.hpp" +#include +#include + +#include "kernels/copy_and_cast.hpp" +#include "utils/strided_iters.hpp" +#include "utils/type_dispatch.hpp" + +#include "copy_numpy_ndarray_into_usm_ndarray.hpp" +#include "simplify_iteration_space.hpp" + +namespace py = pybind11; +namespace _ns = dpctl::tensor::detail; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +using dpctl::tensor::kernels::copy_and_cast:: + copy_and_cast_from_host_blocking_fn_ptr_t; + +static copy_and_cast_from_host_blocking_fn_ptr_t + copy_and_cast_from_host_blocking_dispatch_table[_ns::num_types] + [_ns::num_types]; + +void copy_numpy_ndarray_into_usm_ndarray( + py::array npy_src, + dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + const std::vector &depends) +{ + int src_ndim = npy_src.ndim(); + int dst_ndim = dst.get_ndim(); + + if (src_ndim != dst_ndim) { + throw py::value_error("Source ndarray and destination usm_ndarray have " + "different array ranks, " + "i.e. 
different number of indices needed to " + "address array elements."); + } + + const py::ssize_t *src_shape = npy_src.shape(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + bool shapes_equal(true); + size_t src_nelems(1); + for (int i = 0; i < src_ndim; ++i) { + shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]); + src_nelems *= static_cast(src_shape[i]); + } + + if (!shapes_equal) { + throw py::value_error("Source ndarray and destination usm_ndarray have " + "difference shapes."); + } + + if (src_nelems == 0) { + // nothing to do + return; + } + + auto dst_offsets = dst.get_minmax_offsets(); + // destination must be ample enough to accomodate all elements of source + // array + { + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < src_nelems) { + throw py::value_error( + "Destination array can not accomodate all the " + "elements of source array."); + } + } + + sycl::queue dst_q = dst.get_queue(); + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst_q})) { + throw py::value_error("Execution queue is not compatible with the " + "allocation queue"); + } + + // here we assume that NumPy's type numbers agree with ours for types + // supported in both + int src_typenum = + py::detail::array_descriptor_proxy(npy_src.dtype().ptr())->type_num; + int dst_typenum = dst.get_typenum(); + + auto array_types = dpctl::tensor::detail::usm_ndarray_types(); + int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + py::buffer_info src_pybuf = npy_src.request(); + const char *const src_data = static_cast(src_pybuf.ptr); + char *dst_data = dst.get_data(); + + int src_flags = npy_src.flags(); + + // check for applicability of special cases: + // (same type && (both C-contiguous || both F-contiguous) + bool both_c_contig = + ((src_flags & py::array::c_style) && dst.is_c_contiguous()); + bool both_f_contig = + ((src_flags & py::array::f_style) && dst.is_f_contiguous()); + if (both_c_contig || both_f_contig) { + if (src_type_id == dst_type_id) { + int src_elem_size = npy_src.itemsize(); + + sycl::event copy_ev = + exec_q.memcpy(static_cast(dst_data), + static_cast(src_data), + src_nelems * src_elem_size, depends); + + // wait for copy_ev to complete + copy_ev.wait_and_throw(); + + return; + } + // With contract_iter2 in place, there is no need to write + // dedicated kernels for casting between contiguous arrays + } + + const py::ssize_t *src_strides = + npy_src.strides(); // N.B.: strides in bytes + const py::ssize_t *dst_strides = + dst.get_strides_raw(); // N.B.: strides in elements + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + py::ssize_t src_itemsize = npy_src.itemsize(); // item size in bytes + constexpr py::ssize_t dst_itemsize = 1; // item size in elements + + int nd = src_ndim; + const py::ssize_t *shape = src_shape; + + bool is_src_c_contig = ((src_flags & py::array::c_style) != 0); + bool is_src_f_contig = ((src_flags & py::array::f_style) != 0); + + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_dst_f_contig = dst.is_f_contiguous(); + + // all args except itemsizes and is_?_contig bools can be modified by + // reference + simplify_iteration_space(nd, shape, src_strides, src_itemsize, + is_src_c_contig, is_src_f_contig, dst_strides, + dst_itemsize, is_dst_c_contig, is_dst_f_contig, + simplified_shape, simplified_src_strides, + 
simplified_dst_strides, src_offset, dst_offset); + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_src_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + + // handle nd == 0 + if (nd == 0) { + nd = 1; + simplified_shape.reserve(nd); + simplified_shape.push_back(1); + + simplified_src_strides.reserve(nd); + simplified_src_strides.push_back(src_itemsize); + + simplified_dst_strides.reserve(nd); + simplified_dst_strides.push_back(dst_itemsize); + } + + // Minumum and maximum element offsets for source np.ndarray + py::ssize_t npy_src_min_nelem_offset(0); + py::ssize_t npy_src_max_nelem_offset(0); + for (int i = 0; i < nd; ++i) { + // convert source strides from bytes to elements + simplified_src_strides[i] = simplified_src_strides[i] / src_itemsize; + if (simplified_src_strides[i] < 0) { + npy_src_min_nelem_offset += + simplified_src_strides[i] * (simplified_shape[i] - 1); + } + else { + npy_src_max_nelem_offset += + simplified_src_strides[i] * (simplified_shape[i] - 1); + } + } + + // Create shared pointers with shape and src/dst strides, copy into device + // memory + using shT = std::vector; + + // Get implementation function pointer + auto copy_and_cast_from_host_blocking_fn = + copy_and_cast_from_host_blocking_dispatch_table[dst_type_id] + [src_type_id]; + + // If shape/strides are accessed with accessors, buffer destructor + // will force syncronization. + py::ssize_t *shape_strides = + sycl::malloc_device(3 * nd, exec_q); + + if (shape_strides == nullptr) { + throw std::runtime_error("Unabled to allocate device memory"); + } + + using usm_host_allocatorT = + sycl::usm_allocator; + using usmshT = std::vector; + usm_host_allocatorT alloc(exec_q); + + auto host_shape_strides_shp = std::make_shared(3 * nd, alloc); + std::copy(simplified_shape.begin(), simplified_shape.end(), + host_shape_strides_shp->begin()); + std::copy(simplified_src_strides.begin(), simplified_src_strides.end(), + host_shape_strides_shp->begin() + nd); + std::copy(simplified_dst_strides.begin(), simplified_dst_strides.end(), + host_shape_strides_shp->begin() + 2 * nd); + + sycl::event copy_packed_ev = + exec_q.copy(host_shape_strides_shp->data(), shape_strides, + host_shape_strides_shp->size()); + + copy_and_cast_from_host_blocking_fn( + exec_q, src_nelems, nd, shape_strides, src_data, src_offset, + npy_src_min_nelem_offset, npy_src_max_nelem_offset, dst_data, + dst_offset, depends, {copy_packed_ev}); + + sycl::free(shape_strides, exec_q); + + return; +} + +void init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(void) +{ + using namespace dpctl::tensor::detail; + using dpctl::tensor::kernels::copy_and_cast::CopyAndCastFromHostFactory; + + DispatchTableBuilder + dtb_copy_from_numpy; + + dtb_copy_from_numpy.populate_dispatch_table( + copy_and_cast_from_host_blocking_dispatch_table); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp new file mode 100644 index 0000000000..16adb921ee --- /dev/null +++ b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp @@ -0,0 +1,50 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in 
compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include "dpctl4pybind11.hpp" +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void copy_numpy_ndarray_into_usm_ndarray( + py::array npy_src, + dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + const std::vector &depends = {}); + +extern void init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(void); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/device_support_queries.cpp b/dpctl/tensor/libtensor/source/device_support_queries.cpp new file mode 100644 index 0000000000..0b15754f56 --- /dev/null +++ b/dpctl/tensor/libtensor/source/device_support_queries.cpp @@ -0,0 +1,118 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
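The `npy_src_min_nelem_offset` / `npy_src_max_nelem_offset` computation in `copy_numpy_ndarray_into_usm_ndarray` above, used to size the host buffer for the blocking copy, sums each dimension's extreme contribution: negative strides lower the minimum, non-negative strides raise the maximum. A standalone sketch (hypothetical helper, not part of the patch):

#include <cstddef>
#include <utility>
#include <vector>

// Smallest and largest element offsets reachable from the base pointer of a
// strided view, with strides given in elements.
inline std::pair<std::ptrdiff_t, std::ptrdiff_t>
minmax_element_offsets(const std::vector<std::ptrdiff_t> &shape,
                       const std::vector<std::ptrdiff_t> &strides)
{
    std::ptrdiff_t min_off = 0;
    std::ptrdiff_t max_off = 0;
    for (std::size_t d = 0; d < shape.size(); ++d) {
        const std::ptrdiff_t span = strides[d] * (shape[d] - 1);
        (span < 0 ? min_off : max_off) += span;
    }
    // e.g. shape {2, 3}, strides {3, -1}  ->  {-2, 3}
    return {min_off, max_off};
}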
+// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include + +#include "dpctl4pybind11.hpp" +#include +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace +{ + +std::string _default_device_fp_type(sycl::device d) +{ + if (d.has(sycl::aspect::fp64)) { + return "f8"; + } + else { + return "f4"; + } +} + +std::string _default_device_int_type(sycl::device) +{ + return "i8"; +} + +std::string _default_device_complex_type(sycl::device d) +{ + if (d.has(sycl::aspect::fp64)) { + return "c16"; + } + else { + return "c8"; + } +} + +std::string _default_device_bool_type(sycl::device) +{ + return "b1"; +} + +sycl::device _extract_device(py::object arg) +{ + auto &api = dpctl::detail::dpctl_capi::get(); + + PyObject *source = arg.ptr(); + if (api.PySyclQueue_Check_(source)) { + sycl::queue q = py::cast(arg); + return q.get_device(); + } + else if (api.PySyclDevice_Check_(source)) { + return py::cast(arg); + } + else { + throw py::type_error( + "Expected type `dpctl.SyclQueue` or `dpctl.SyclDevice`."); + } +} + +} // namespace + +std::string default_device_fp_type(py::object arg) +{ + sycl::device d = _extract_device(arg); + return _default_device_fp_type(d); +} + +std::string default_device_int_type(py::object arg) +{ + sycl::device d = _extract_device(arg); + return _default_device_int_type(d); +} + +std::string default_device_bool_type(py::object arg) +{ + sycl::device d = _extract_device(arg); + return _default_device_bool_type(d); +} + +std::string default_device_complex_type(py::object arg) +{ + sycl::device d = _extract_device(arg); + return _default_device_complex_type(d); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/device_support_queries.hpp b/dpctl/tensor/libtensor/source/device_support_queries.hpp new file mode 100644 index 0000000000..905ba4b535 --- /dev/null +++ b/dpctl/tensor/libtensor/source/device_support_queries.hpp @@ -0,0 +1,47 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
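A minimal usage sketch of the device-capability logic above (assumes a working SYCL runtime; it mirrors `_default_device_fp_type` and `_default_device_complex_type` and is not part of the patch):

#include <CL/sycl.hpp>
#include <iostream>
#include <string>

int main()
{
    sycl::queue q; // default-selected device
    const sycl::device &d = q.get_device();
    // Without the fp64 aspect the defaults degrade to single precision.
    std::string fp = d.has(sycl::aspect::fp64) ? "f8" : "f4";
    std::string cx = d.has(sycl::aspect::fp64) ? "c16" : "c8";
    std::cout << "fp: " << fp << ", complex: " << cx
              << ", int: i8, bool: b1\n";
    return 0;
}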
+// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include + +#include "dpctl4pybind11.hpp" +#include +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::string default_device_fp_type(py::object); +extern std::string default_device_int_type(py::object); +extern std::string default_device_bool_type(py::object); +extern std::string default_device_complex_type(py::object); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/eye_ctor.cpp b/dpctl/tensor/libtensor/source/eye_ctor.cpp new file mode 100644 index 0000000000..d36447749a --- /dev/null +++ b/dpctl/tensor/libtensor/source/eye_ctor.cpp @@ -0,0 +1,133 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include + +#include "eye_ctor.hpp" +#include "kernels/constructors.hpp" +#include "utils/type_dispatch.hpp" + +namespace py = pybind11; +namespace _ns = dpctl::tensor::detail; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +using dpctl::utils::keep_args_alive; + +using dpctl::tensor::kernels::constructors::eye_fn_ptr_t; +static eye_fn_ptr_t eye_dispatch_vector[_ns::num_types]; + +std::pair +usm_ndarray_eye(py::ssize_t k, + dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + const std::vector &depends) +{ + // dst must be 2D + + if (dst.get_ndim() != 2) { + throw py::value_error( + "usm_ndarray_eye: Expecting 2D array to populate"); + } + + sycl::queue dst_q = dst.get_queue(); + if (!dpctl::utils::queues_are_compatible(exec_q, {dst_q})) { + throw py::value_error("Execution queue is not compatible with the " + "allocation queue"); + } + + auto array_types = dpctl::tensor::detail::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + const py::ssize_t nelem = dst.get_size(); + const py::ssize_t rows = dst.get_shape(0); + const py::ssize_t cols = dst.get_shape(1); + if (rows == 0 || cols == 0) { + // nothing to do + return std::make_pair(sycl::event{}, sycl::event{}); + } + + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_dst_f_contig = dst.is_f_contiguous(); + if (!is_dst_c_contig && !is_dst_f_contig) { + throw py::value_error("USM array is not contiguous"); + } + + py::ssize_t start; + if (is_dst_c_contig) { + start = 
(k < 0) ? -k * cols : k; + } + else { + start = (k < 0) ? -k : k * rows; + } + + const py::ssize_t *strides = dst.get_strides_raw(); + py::ssize_t step; + if (strides == nullptr) { + step = (is_dst_c_contig) ? cols + 1 : rows + 1; + } + else { + step = strides[0] + strides[1]; + } + + const py::ssize_t length = std::min({rows, cols, rows + k, cols - k}); + const py::ssize_t end = start + step * (length - 1); + + char *dst_data = dst.get_data(); + sycl::event eye_event; + + auto fn = eye_dispatch_vector[dst_typeid]; + + eye_event = fn(exec_q, static_cast(nelem), start, end, step, + dst_data, depends); + + return std::make_pair(keep_args_alive(exec_q, {dst}, {eye_event}), + eye_event); +} + +void init_eye_ctor_dispatch_vectors(void) +{ + using namespace dpctl::tensor::detail; + using dpctl::tensor::kernels::constructors::EyeFactory; + + DispatchVectorBuilder dvb; + dvb.populate_dispatch_vector(eye_dispatch_vector); + + return; +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/eye_ctor.hpp b/dpctl/tensor/libtensor/source/eye_ctor.hpp new file mode 100644 index 0000000000..1067ed8d8b --- /dev/null +++ b/dpctl/tensor/libtensor/source/eye_ctor.hpp @@ -0,0 +1,50 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair +usm_ndarray_eye(py::ssize_t k, + dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + const std::vector &depends = {}); + +extern void init_eye_ctor_dispatch_vectors(void); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/full_ctor.cpp b/dpctl/tensor/libtensor/source/full_ctor.cpp new file mode 100644 index 0000000000..e5b1da362b --- /dev/null +++ b/dpctl/tensor/libtensor/source/full_ctor.cpp @@ -0,0 +1,116 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
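The diagonal bookkeeping in `usm_ndarray_eye` above comes down to flat-index arithmetic; a small host-only worked check for the C-contiguous case (not part of the patch):

#include <algorithm>
#include <cassert>
#include <cstddef>

int main()
{
    // C-contiguous rows x cols matrix with diagonal offset k (k > 0 above the
    // main diagonal): the first diagonal element sits at flat index k (or
    // -k * cols for k < 0), consecutive elements are cols + 1 apart, and the
    // diagonal holds min(rows, cols, rows + k, cols - k) elements.
    const std::ptrdiff_t rows = 3, cols = 4, k = 1;
    const std::ptrdiff_t start = (k < 0) ? -k * cols : k;
    const std::ptrdiff_t step = cols + 1;
    const std::ptrdiff_t length = std::min({rows, cols, rows + k, cols - k});
    // For a 3x4 matrix and k = 1 the diagonal occupies flat indices 1, 6, 11.
    assert(start == 1 && step == 5 && length == 3);
    return 0;
}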
+// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include +#include + +#include "kernels/constructors.hpp" +#include "utils/strided_iters.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "full_ctor.hpp" + +namespace py = pybind11; +namespace _ns = dpctl::tensor::detail; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +using dpctl::tensor::kernels::constructors::lin_space_step_fn_ptr_t; +using dpctl::utils::keep_args_alive; + +using dpctl::tensor::kernels::constructors::full_contig_fn_ptr_t; + +static full_contig_fn_ptr_t full_contig_dispatch_vector[_ns::num_types]; + +std::pair +usm_ndarray_full(py::object py_value, + dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + const std::vector &depends) +{ + // start, end should be coercible into data type of dst + + py::ssize_t dst_nelems = dst.get_size(); + + if (dst_nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + sycl::queue dst_q = dst.get_queue(); + if (!dpctl::utils::queues_are_compatible(exec_q, {dst_q})) { + throw py::value_error( + "Execution queue is not compatible with the allocation queue"); + } + + auto array_types = dpctl::tensor::detail::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + char *dst_data = dst.get_data(); + sycl::event full_event; + + if (dst_nelems == 1 || dst.is_c_contiguous() || dst.is_f_contiguous()) { + auto fn = full_contig_dispatch_vector[dst_typeid]; + + sycl::event full_contig_event = + fn(exec_q, static_cast(dst_nelems), py_value, dst_data, + depends); + + return std::make_pair( + keep_args_alive(exec_q, {dst}, {full_contig_event}), + full_contig_event); + } + else { + throw std::runtime_error( + "Only population of contiguous usm_ndarray objects is supported."); + } +} + +void init_full_ctor_dispatch_vectors(void) +{ + using namespace dpctl::tensor::detail; + using dpctl::tensor::kernels::constructors::FullContigFactory; + + DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(full_contig_dispatch_vector); + + return; +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/full_ctor.hpp b/dpctl/tensor/libtensor/source/full_ctor.hpp new file mode 100644 index 0000000000..4a620a03db --- /dev/null +++ b/dpctl/tensor/libtensor/source/full_ctor.hpp @@ -0,0 +1,50 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
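The dispatch tables and vectors used throughout these sources map the lookup id returned by `typenum_to_lookup_id` to a per-type function pointer; a toy analogue of what `DispatchVectorBuilder` populates (hypothetical names, not part of the patch):

#include <array>
#include <cstddef>

// One function pointer per supported element type, indexed by lookup id.
// A toy fill function stands in for the factory-generated implementations.
using fill_fn_ptr_t = void (*)(void *dst, std::size_t nelems);

template <typename T> void fill_zero(void *dst, std::size_t nelems)
{
    T *p = static_cast<T *>(dst);
    for (std::size_t i = 0; i < nelems; ++i) {
        p[i] = T(0);
    }
}

// Hand-rolled three-type dispatch vector; DispatchVectorBuilder generates the
// real one by instantiating a factory template over all supported types.
static const std::array<fill_fn_ptr_t, 3> toy_dispatch_vector = {
    &fill_zero<bool>, &fill_zero<int>, &fill_zero<double>};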
+// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair +usm_ndarray_full(py::object py_value, + dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + const std::vector &depends = {}); + +extern void init_full_ctor_dispatch_vectors(void); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/linear_sequences.cpp b/dpctl/tensor/libtensor/source/linear_sequences.cpp new file mode 100644 index 0000000000..8b72923679 --- /dev/null +++ b/dpctl/tensor/libtensor/source/linear_sequences.cpp @@ -0,0 +1,178 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include +#include + +#include "kernels/constructors.hpp" +#include "utils/strided_iters.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "linear_sequences.hpp" + +namespace py = pybind11; +namespace _ns = dpctl::tensor::detail; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +using dpctl::utils::keep_args_alive; + +using dpctl::tensor::kernels::constructors::lin_space_step_fn_ptr_t; + +static lin_space_step_fn_ptr_t lin_space_step_dispatch_vector[_ns::num_types]; + +using dpctl::tensor::kernels::constructors::lin_space_affine_fn_ptr_t; + +static lin_space_affine_fn_ptr_t + lin_space_affine_dispatch_vector[_ns::num_types]; + +std::pair +usm_ndarray_linear_sequence_step(py::object start, + py::object dt, + dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + const std::vector &depends) +{ + // dst must be 1D and C-contiguous + // start, end should be coercible into data type of dst + + if (dst.get_ndim() != 1) { + throw py::value_error( + "usm_ndarray_linspace: Expecting 1D array to populate"); + } + + if (!dst.is_c_contiguous()) { + throw py::value_error( + "usm_ndarray_linspace: Non-contiguous arrays are not supported"); + } + + sycl::queue dst_q = dst.get_queue(); + if (!dpctl::utils::queues_are_compatible(exec_q, {dst_q})) { + throw py::value_error( + "Execution queue is not compatible with the allocation queue"); + } + + auto array_types = dpctl::tensor::detail::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + py::ssize_t len = dst.get_shape(0); + if (len == 0) { + // nothing to do + return std::make_pair(sycl::event{}, sycl::event{}); + } + + char *dst_data = dst.get_data(); + sycl::event linspace_step_event; + + auto fn = lin_space_step_dispatch_vector[dst_typeid]; + + linspace_step_event = + fn(exec_q, static_cast(len), start, dt, dst_data, depends); + + return std::make_pair(keep_args_alive(exec_q, {dst}, {linspace_step_event}), + linspace_step_event); +} + +std::pair +usm_ndarray_linear_sequence_affine(py::object start, + py::object end, + dpctl::tensor::usm_ndarray dst, + bool include_endpoint, + sycl::queue exec_q, + const std::vector &depends) +{ + // dst must be 1D and C-contiguous + // start, end should be coercible into data type of dst + + if (dst.get_ndim() != 1) { + throw py::value_error( + "usm_ndarray_linspace: Expecting 1D array to populate"); + } + + if (!dst.is_c_contiguous()) { + throw py::value_error( + "usm_ndarray_linspace: Non-contiguous arrays are not supported"); + } + + sycl::queue dst_q = dst.get_queue(); + if (!dpctl::utils::queues_are_compatible(exec_q, {dst_q})) { + throw py::value_error( + "Execution queue context is not the same as allocation context"); + } + + auto array_types = dpctl::tensor::detail::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + py::ssize_t len = dst.get_shape(0); + if (len == 0) { + // nothing to do + return std::make_pair(sycl::event{}, sycl::event{}); + } + + char *dst_data = dst.get_data(); + sycl::event linspace_affine_event; + + auto fn = lin_space_affine_dispatch_vector[dst_typeid]; 
+ + linspace_affine_event = fn(exec_q, static_cast(len), start, end, + include_endpoint, dst_data, depends); + + return std::make_pair( + keep_args_alive(exec_q, {dst}, {linspace_affine_event}), + linspace_affine_event); +} + +void init_linear_sequences_dispatch_vectors(void) +{ + using namespace dpctl::tensor::detail; + using dpctl::tensor::kernels::constructors::LinSpaceAffineFactory; + using dpctl::tensor::kernels::constructors::LinSpaceStepFactory; + + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(lin_space_step_dispatch_vector); + + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(lin_space_affine_dispatch_vector); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/linear_sequences.hpp b/dpctl/tensor/libtensor/source/linear_sequences.hpp new file mode 100644 index 0000000000..b463fdf533 --- /dev/null +++ b/dpctl/tensor/libtensor/source/linear_sequences.hpp @@ -0,0 +1,59 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair +usm_ndarray_linear_sequence_step(py::object start, + py::object dt, + dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + const std::vector &depends = {}); + +extern std::pair usm_ndarray_linear_sequence_affine( + py::object start, + py::object end, + dpctl::tensor::usm_ndarray dst, + bool include_endpoint, + sycl::queue exec_q, + const std::vector &depends = {}); + +extern void init_linear_sequences_dispatch_vectors(void); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp b/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp new file mode 100644 index 0000000000..8937300047 --- /dev/null +++ b/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp @@ -0,0 +1,178 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
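The affine kernels dispatched above compute element i as start*(n - i)/n + end*i/n, where n is nelems - 1 when the endpoint is included (see LinearSequenceAffineFunctor further down in this patch). A host-only restatement with illustrative values:

// Host-only sketch of the affine combination used by the linspace kernels.
#include <cstddef>
#include <iostream>

int main()
{
    const std::size_t nelems = 5;
    const bool include_endpoint = true;
    const double start = 0.0, end = 2.0;

    const std::size_t n = include_endpoint ? nelems - 1 : nelems;
    for (std::size_t i = 0; i < nelems; ++i) {
        const double w = static_cast<double>(n - i) / n;
        const double wc = static_cast<double>(i) / n;
        std::cout << start * w + end * wc << ' '; // 0 0.5 1 1.5 2
    }
    std::cout << '\n';
    return 0;
}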
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "simplify_iteration_space.hpp" +#include "dpctl4pybind11.hpp" +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace py = pybind11; + +using dpctl::tensor::c_contiguous_strides; +using dpctl::tensor::f_contiguous_strides; + +void simplify_iteration_space(int &nd, + const py::ssize_t *&shape, + const py::ssize_t *&src_strides, + py::ssize_t src_itemsize, + bool is_src_c_contig, + bool is_src_f_contig, + const py::ssize_t *&dst_strides, + py::ssize_t dst_itemsize, + bool is_dst_c_contig, + bool is_dst_f_contig, + std::vector &simplified_shape, + std::vector &simplified_src_strides, + std::vector &simplified_dst_strides, + py::ssize_t &src_offset, + py::ssize_t &dst_offset) +{ + if (nd > 1) { + // Simplify iteration space to reduce dimensionality + // and improve access pattern + simplified_shape.reserve(nd); + for (int i = 0; i < nd; ++i) { + simplified_shape.push_back(shape[i]); + } + + simplified_src_strides.reserve(nd); + simplified_dst_strides.reserve(nd); + if (src_strides == nullptr) { + if (is_src_c_contig) { + simplified_src_strides = + c_contiguous_strides(nd, shape, src_itemsize); + } + else if (is_src_f_contig) { + simplified_src_strides = + f_contiguous_strides(nd, shape, src_itemsize); + } + else { + throw std::runtime_error( + "Source array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + for (int i = 0; i < nd; ++i) { + simplified_src_strides.push_back(src_strides[i]); + } + } + if (dst_strides == nullptr) { + if (is_dst_c_contig) { + simplified_dst_strides = + c_contiguous_strides(nd, shape, dst_itemsize); + } + else if (is_dst_f_contig) { + simplified_dst_strides = + f_contiguous_strides(nd, shape, dst_itemsize); + } + else { + throw std::runtime_error( + "Destination array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + for (int i = 0; i < nd; ++i) { + simplified_dst_strides.push_back(dst_strides[i]); + } + } + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_src_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + int contracted_nd = simplify_iteration_two_strides( + nd, simplified_shape.data(), simplified_src_strides.data(), + simplified_dst_strides.data(), + src_offset, // modified by reference + dst_offset // modified by reference + ); + simplified_shape.resize(contracted_nd); + simplified_src_strides.resize(contracted_nd); + simplified_dst_strides.resize(contracted_nd); + + nd = contracted_nd; + shape = const_cast(simplified_shape.data()); + src_strides = + const_cast(simplified_src_strides.data()); + dst_strides = + const_cast(simplified_dst_strides.data()); + } + else if (nd == 1) { + // Populate vectors + simplified_shape.reserve(nd); + simplified_shape.push_back(shape[0]); + + 
simplified_src_strides.reserve(nd); + simplified_dst_strides.reserve(nd); + + if (src_strides == nullptr) { + if (is_src_c_contig) { + simplified_src_strides.push_back(src_itemsize); + } + else if (is_src_f_contig) { + simplified_src_strides.push_back(src_itemsize); + } + else { + throw std::runtime_error( + "Source array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + simplified_src_strides.push_back(src_strides[0]); + } + if (dst_strides == nullptr) { + if (is_dst_c_contig) { + simplified_dst_strides.push_back(dst_itemsize); + } + else if (is_dst_f_contig) { + simplified_dst_strides.push_back(dst_itemsize); + } + else { + throw std::runtime_error( + "Destination array has null strides " + "but has neither C- nor F- contiguous flag set"); + } + } + else { + simplified_dst_strides.push_back(dst_strides[0]); + } + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_src_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp b/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp new file mode 100644 index 0000000000..515e795d20 --- /dev/null +++ b/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp @@ -0,0 +1,57 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
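simplify_iteration_space above delegates the actual contraction to simplify_iteration_two_strides from utils/strided_iters.hpp. As a rough illustration of the kind of fusion it performs (a simplified stand-in, not the real implementation), two adjacent dimensions can be merged when, for both operands, the outer stride equals inner_shape * inner_stride:

// Simplified host-only stand-in for the dimension contraction: a 3x4
// C-contiguous copy collapses into a single 12-element 1D copy.
#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    std::vector<std::ptrdiff_t> shape{3, 4};
    std::vector<std::ptrdiff_t> src_strides{4, 1};
    std::vector<std::ptrdiff_t> dst_strides{4, 1};

    bool can_fuse = (src_strides[0] == shape[1] * src_strides[1]) &&
                    (dst_strides[0] == shape[1] * dst_strides[1]);
    if (can_fuse) {
        shape = {shape[0] * shape[1]}; // 12
        src_strides = {src_strides[1]};
        dst_strides = {dst_strides[1]};
    }

    std::cout << "nd=" << shape.size() << ", shape[0]=" << shape[0]
              << ", stride=" << src_strides[0] << '\n'; // nd=1, shape[0]=12
    return 0;
}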
+// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include "utils/strided_iters.hpp" +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace py = pybind11; + +void simplify_iteration_space(int &, + const py::ssize_t *&, + const py::ssize_t *&, + py::ssize_t, + bool, + bool, + const py::ssize_t *&, + py::ssize_t, + bool, + bool, + std::vector &, + std::vector &, + std::vector &, + py::ssize_t &, + py::ssize_t &); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp index 7a6c1fcca6..460855e76e 100644 --- a/dpctl/tensor/libtensor/source/tensor_py.cpp +++ b/dpctl/tensor/libtensor/source/tensor_py.cpp @@ -26,2303 +26,91 @@ #include #include #include -#include -#include #include #include #include #include #include "dpctl4pybind11.hpp" + +#include "copy_and_cast_usm_to_usm.hpp" +#include "copy_for_reshape.hpp" +#include "copy_numpy_ndarray_into_usm_ndarray.hpp" +#include "device_support_queries.hpp" +#include "eye_ctor.hpp" +#include "full_ctor.hpp" +#include "linear_sequences.hpp" +#include "triul_ctor.hpp" #include "utils/strided_iters.hpp" -#include "utils/type_dispatch.hpp" namespace py = pybind11; -template class copy_cast_generic_kernel; -template class copy_cast_from_host_kernel; -template class copy_cast_spec_kernel; -template class copy_for_reshape_generic_kernel; -template class linear_sequence_step_kernel; -template class linear_sequence_affine_kernel; -template class eye_kernel; - -static dpctl::tensor::detail::usm_ndarray_types array_types; - namespace { -template struct is_complex : std::false_type -{ -}; -template struct is_complex> : std::true_type -{ -}; -template dstTy convert_impl(const srcTy &v) -{ - if constexpr (std::is_same::value) { - return v; - } - else if constexpr (std::is_same_v && is_complex::value) - { - // bool(complex_v) == (complex_v.real() != 0) && (complex_v.imag() !=0) - return (convert_impl(v.real()) || - convert_impl(v.imag())); - } - else if constexpr (is_complex::value && !is_complex::value) { - // real_t(complex_v) == real_t(complex_v.real()) - return convert_impl(v.real()); - } - else if constexpr (!std::is_integral::value && - !std::is_same::value && - std::is_integral::value && - std::is_unsigned::value) - { - // first cast to signed variant, the cast to unsigned one - using signedT = typename std::make_signed::type; - return static_cast(convert_impl(v)); - } - else { - return static_cast(v); - } -} - -template class Caster -{ -public: - Caster() = default; - void operator()(char *src, - std::ptrdiff_t src_offset, - char *dst, - std::ptrdiff_t dst_offset) const - { - srcT *src_ = reinterpret_cast(src) + src_offset; - dstT *dst_ = reinterpret_cast(dst) + dst_offset; - *dst_ = convert_impl(*src_); - } -}; - -template class GenericCopyFunctor -{ -private: - char *src_ = nullptr; - char *dst_ = nullptr; - py::ssize_t *shape_strides_ = nullptr; - int nd_ = 0; - py::ssize_t src_offset0 = 0; - py::ssize_t dst_offset0 = 0; - -public: - GenericCopyFunctor(char *src_cp, - char *dst_cp, - py::ssize_t *shape_strides, - int nd, - py::ssize_t src_offset, - py::ssize_t dst_offset) - : src_(src_cp), dst_(dst_cp), shape_strides_(shape_strides), nd_(nd), - src_offset0(src_offset), 
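The convert_impl template removed from tensor_py.cpp here encodes a few non-obvious casting rules: complex to bool tests both components, complex to real keeps the real part, and floating point to unsigned integer is routed through the signed type first. A host-only restatement in plain C++ (values are illustrative):

// Host-only restatement of the casting rules in the removed convert_impl.
#include <complex>
#include <cstdint>
#include <iostream>

int main()
{
    std::complex<float> z{0.0f, 2.5f};

    bool as_bool = (z.real() != 0.0f) || (z.imag() != 0.0f); // complex -> bool
    float as_real = z.real();                                // complex -> real

    double negative = -1.0;
    // first cast to the signed variant, then to the unsigned one
    std::uint32_t as_u32 =
        static_cast<std::uint32_t>(static_cast<std::int32_t>(negative));

    std::cout << as_bool << ' ' << as_real << ' ' << as_u32 << '\n';
    // prints: 1 0 4294967295
    return 0;
}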
dst_offset0(dst_offset) - { - } - - void operator()(sycl::id<1> wiid) const - { - py::ssize_t src_offset(0); - py::ssize_t dst_offset(0); - CIndexer_vector indxr(nd_); - indxr.get_displacement( - static_cast(wiid.get(0)), - const_cast(shape_strides_), // common shape - const_cast(shape_strides_ + - nd_), // src strides - const_cast(shape_strides_ + - 2 * nd_), // dst strides - src_offset, // modified by reference - dst_offset // modified by reference - ); - CastFnT fn{}; - fn(src_, src_offset0 + src_offset, dst_, dst_offset0 + dst_offset); - } -}; - -template class NDSpecializedCopyFunctor -{ -private: - char *src_ = nullptr; - char *dst_ = nullptr; - CIndexer_array indxr; - const std::array src_strides_; - const std::array dst_strides_; - static const int nd_ = nd; - py::ssize_t src_offset0 = 0; - py::ssize_t dst_offset0 = 0; - -public: - NDSpecializedCopyFunctor(char *src_cp, // USM pointer - char *dst_cp, // USM pointer - const std::array shape, - const std::array src_strides, - const std::array dst_strides, - py::ssize_t src_offset, - py::ssize_t dst_offset) - : src_(src_cp), dst_(dst_cp), indxr(shape), src_strides_(src_strides), - dst_strides_(dst_strides), src_offset0(src_offset), - dst_offset0(dst_offset) - { - } - - void operator()(sycl::id<1> wiid) const - { - py::ssize_t src_offset = 0; - py::ssize_t dst_offset = 0; - CIndexer_array local_indxr(std::move(indxr)); - - local_indxr.set(wiid.get(0)); - auto mi = local_indxr.get(); - for (int i = 0; i < nd; ++i) - src_offset += mi[i] * src_strides_[i]; - for (int i = 0; i < nd; ++i) - dst_offset += mi[i] * dst_strides_[i]; - - CastFnT fn{}; - fn(src_, src_offset0 + src_offset, dst_, dst_offset0 + dst_offset); - } -}; - -typedef sycl::event (*copy_and_cast_generic_fn_ptr_t)( - sycl::queue, - size_t, - int, - py::ssize_t *, - char *, - py::ssize_t, - char *, - py::ssize_t, - const std::vector &, - const std::vector &); - -template -sycl::event -copy_and_cast_generic_impl(sycl::queue q, - size_t nelems, - int nd, - py::ssize_t *shape_and_strides, - char *src_p, - py::ssize_t src_offset, - char *dst_p, - py::ssize_t dst_offset, - const std::vector &depends, - const std::vector &additional_depends) -{ - sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.depends_on(additional_depends); - cgh.parallel_for>( - sycl::range<1>(nelems), - GenericCopyFunctor>( - src_p, dst_p, shape_and_strides, nd, src_offset, dst_offset)); - }); - - return copy_and_cast_ev; -} - -typedef sycl::event (*copy_and_cast_1d_fn_ptr_t)( - sycl::queue, - size_t, - const std::array, - const std::array, - const std::array, - char *, - py::ssize_t, - char *, - py::ssize_t, - const std::vector &); - -typedef sycl::event (*copy_and_cast_2d_fn_ptr_t)( - sycl::queue, - size_t, - const std::array, - const std::array, - const std::array, - char *, - py::ssize_t, - char *, - py::ssize_t, - const std::vector &); - -template -sycl::event -copy_and_cast_nd_specialized_impl(sycl::queue q, - size_t nelems, - const std::array shape, - const std::array src_strides, - const std::array dst_strides, - char *src_p, - py::ssize_t src_offset, - char *dst_p, - py::ssize_t dst_offset, - const std::vector &depends) -{ - sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.parallel_for>( - sycl::range<1>(nelems), - NDSpecializedCopyFunctor>( - src_p, dst_p, shape, src_strides, dst_strides, src_offset, - dst_offset)); - }); - - return copy_and_cast_ev; -} - -namespace _ns = dpctl::tensor::detail; - -static 
copy_and_cast_generic_fn_ptr_t - copy_and_cast_generic_dispatch_table[_ns::num_types][_ns::num_types]; -static copy_and_cast_1d_fn_ptr_t - copy_and_cast_1d_dispatch_table[_ns::num_types][_ns::num_types]; -static copy_and_cast_2d_fn_ptr_t - copy_and_cast_2d_dispatch_table[_ns::num_types][_ns::num_types]; - -template struct CopyAndCastGenericFactory -{ - fnT get() - { - fnT f = copy_and_cast_generic_impl; - return f; - } -}; - -template struct CopyAndCast1DFactory -{ - fnT get() - { - fnT f = copy_and_cast_nd_specialized_impl; - return f; - } -}; - -template struct CopyAndCast2DFactory -{ - fnT get() - { - fnT f = copy_and_cast_nd_specialized_impl; - return f; - } -}; - -std::vector c_contiguous_strides(int nd, - const py::ssize_t *shape, - py::ssize_t element_size = 1) -{ - if (nd > 0) { - std::vector c_strides(nd, element_size); - for (int ic = nd - 1; ic > 0;) { - py::ssize_t next_v = c_strides[ic] * shape[ic]; - c_strides[--ic] = next_v; - } - return c_strides; - } - else { - return std::vector(); - } -} - -std::vector f_contiguous_strides(int nd, - const py::ssize_t *shape, - py::ssize_t element_size = 1) -{ - if (nd > 0) { - std::vector f_strides(nd, element_size); - for (int i = 0; i < nd - 1;) { - py::ssize_t next_v = f_strides[i] * shape[i]; - f_strides[++i] = next_v; - } - return f_strides; - } - else { - return std::vector(); - } -} - -using dpctl::utils::keep_args_alive; - -void simplify_iteration_space(int &nd, - const py::ssize_t *&shape, - const py::ssize_t *&src_strides, - py::ssize_t src_itemsize, - bool is_src_c_contig, - bool is_src_f_contig, - const py::ssize_t *&dst_strides, - py::ssize_t dst_itemsize, - bool is_dst_c_contig, - bool is_dst_f_contig, - std::vector &simplified_shape, - std::vector &simplified_src_strides, - std::vector &simplified_dst_strides, - py::ssize_t &src_offset, - py::ssize_t &dst_offset) -{ - if (nd > 1) { - // Simplify iteration space to reduce dimensionality - // and improve access pattern - simplified_shape.reserve(nd); - for (int i = 0; i < nd; ++i) { - simplified_shape.push_back(shape[i]); - } - - simplified_src_strides.reserve(nd); - simplified_dst_strides.reserve(nd); - if (src_strides == nullptr) { - if (is_src_c_contig) { - simplified_src_strides = - c_contiguous_strides(nd, shape, src_itemsize); - } - else if (is_src_f_contig) { - simplified_src_strides = - f_contiguous_strides(nd, shape, src_itemsize); - } - else { - throw std::runtime_error( - "Source array has null strides " - "but has neither C- nor F- contiguous flag set"); - } - } - else { - for (int i = 0; i < nd; ++i) { - simplified_src_strides.push_back(src_strides[i]); - } - } - if (dst_strides == nullptr) { - if (is_dst_c_contig) { - simplified_dst_strides = - c_contiguous_strides(nd, shape, dst_itemsize); - } - else if (is_dst_f_contig) { - simplified_dst_strides = - f_contiguous_strides(nd, shape, dst_itemsize); - } - else { - throw std::runtime_error( - "Destination array has null strides " - "but has neither C- nor F- contiguous flag set"); - } - } - else { - for (int i = 0; i < nd; ++i) { - simplified_dst_strides.push_back(dst_strides[i]); - } - } - - assert(simplified_shape.size() == static_cast(nd)); - assert(simplified_src_strides.size() == static_cast(nd)); - assert(simplified_dst_strides.size() == static_cast(nd)); - int contracted_nd = simplify_iteration_two_strides( - nd, simplified_shape.data(), simplified_src_strides.data(), - simplified_dst_strides.data(), - src_offset, // modified by reference - dst_offset // modified by reference - ); - 
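c_contiguous_strides and f_contiguous_strides, removed from tensor_py.cpp here (the new sources pull them in via using dpctl::tensor::c_contiguous_strides), synthesize strides from a shape: C-order strides are accumulated right to left starting from the element size, F-order strides left to right. A host-only sketch with an illustrative shape:

// Host-only sketch of C- and F-order stride synthesis for shape {2, 3, 4}.
#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    const std::vector<std::ptrdiff_t> shape{2, 3, 4};
    const std::ptrdiff_t element_size = 1; // strides expressed in elements

    std::vector<std::ptrdiff_t> c_strides(shape.size(), element_size);
    for (std::size_t ic = shape.size() - 1; ic > 0; --ic) {
        c_strides[ic - 1] = c_strides[ic] * shape[ic];
    }

    std::vector<std::ptrdiff_t> f_strides(shape.size(), element_size);
    for (std::size_t i = 0; i + 1 < shape.size(); ++i) {
        f_strides[i + 1] = f_strides[i] * shape[i];
    }

    for (auto s : c_strides)
        std::cout << s << ' '; // 12 4 1
    std::cout << '\n';
    for (auto s : f_strides)
        std::cout << s << ' '; // 1 2 6
    std::cout << '\n';
    return 0;
}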
simplified_shape.resize(contracted_nd); - simplified_src_strides.resize(contracted_nd); - simplified_dst_strides.resize(contracted_nd); - - nd = contracted_nd; - shape = const_cast(simplified_shape.data()); - src_strides = - const_cast(simplified_src_strides.data()); - dst_strides = - const_cast(simplified_dst_strides.data()); - } - else if (nd == 1) { - // Populate vectors - simplified_shape.reserve(nd); - simplified_shape.push_back(shape[0]); - - simplified_src_strides.reserve(nd); - simplified_dst_strides.reserve(nd); - - if (src_strides == nullptr) { - if (is_src_c_contig) { - simplified_src_strides.push_back(src_itemsize); - } - else if (is_src_f_contig) { - simplified_src_strides.push_back(src_itemsize); - } - else { - throw std::runtime_error( - "Source array has null strides " - "but has neither C- nor F- contiguous flag set"); - } - } - else { - simplified_src_strides.push_back(src_strides[0]); - } - if (dst_strides == nullptr) { - if (is_dst_c_contig) { - simplified_dst_strides.push_back(dst_itemsize); - } - else if (is_dst_f_contig) { - simplified_dst_strides.push_back(dst_itemsize); - } - else { - throw std::runtime_error( - "Destination array has null strides " - "but has neither C- nor F- contiguous flag set"); - } - } - else { - simplified_dst_strides.push_back(dst_strides[0]); - } - - assert(simplified_shape.size() == static_cast(nd)); - assert(simplified_src_strides.size() == static_cast(nd)); - assert(simplified_dst_strides.size() == static_cast(nd)); - } -} - -sycl::event _populate_packed_shape_strides_for_copycast_kernel( - sycl::queue exec_q, - py::ssize_t *device_shape_strides, // to be populated - const std::vector &common_shape, - const std::vector &src_strides, - const std::vector &dst_strides) -{ - // memory transfer optimization, use USM-host for temporary speeds up - // tranfer to device, especially on dGPUs - using usm_host_allocatorT = - sycl::usm_allocator; - using shT = std::vector; - size_t nd = common_shape.size(); - - usm_host_allocatorT allocator(exec_q); - - // create host temporary for packed shape and strides managed by shared - // pointer. 
Packed vector is concatenation of common_shape, src_stride and - // std_strides - std::shared_ptr shp_host_shape_strides = - std::make_shared(3 * nd, allocator); - std::copy(common_shape.begin(), common_shape.end(), - shp_host_shape_strides->begin()); - - std::copy(src_strides.begin(), src_strides.end(), - shp_host_shape_strides->begin() + nd); - - std::copy(dst_strides.begin(), dst_strides.end(), - shp_host_shape_strides->begin() + 2 * nd); - - sycl::event copy_shape_ev = exec_q.copy( - shp_host_shape_strides->data(), device_shape_strides, - shp_host_shape_strides->size()); - - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(copy_shape_ev); - cgh.host_task([shp_host_shape_strides]() { - // increment shared pointer ref-count to keep it alive - // till copy operation completes; - }); - }); - - return copy_shape_ev; -} - -std::pair -copy_usm_ndarray_into_usm_ndarray(dpctl::tensor::usm_ndarray src, - dpctl::tensor::usm_ndarray dst, - sycl::queue exec_q, - const std::vector &depends = {}) -{ - - // array dimensions must be the same - int src_nd = src.get_ndim(); - int dst_nd = dst.get_ndim(); - if (src_nd != dst_nd) { - throw py::value_error("Array dimensions are not the same."); - } - - // shapes must be the same - const py::ssize_t *src_shape = src.get_shape_raw(); - const py::ssize_t *dst_shape = dst.get_shape_raw(); - - bool shapes_equal(true); - size_t src_nelems(1); - - for (int i = 0; i < src_nd; ++i) { - src_nelems *= static_cast(src_shape[i]); - shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]); - } - if (!shapes_equal) { - throw py::value_error("Array shapes are not the same."); - } - - if (src_nelems == 0) { - // nothing to do - return std::make_pair(sycl::event(), sycl::event()); - } - - auto dst_offsets = dst.get_minmax_offsets(); - // destination must be ample enough to accomodate all elements - { - size_t range = - static_cast(dst_offsets.second - dst_offsets.first); - if (range + 1 < src_nelems) { - throw py::value_error( - "Destination array can not accomodate all the " - "elements of source array."); - } - } - - // check compatibility of execution queue and allocation queue - sycl::queue src_q = src.get_queue(); - sycl::queue dst_q = dst.get_queue(); - - if (!dpctl::utils::queues_are_compatible(exec_q, {src_q, dst_q})) { - throw py::value_error( - "Execution queue is not compatible with allocation queues"); - } - - int src_typenum = src.get_typenum(); - int dst_typenum = dst.get_typenum(); - - int src_type_id = array_types.typenum_to_lookup_id(src_typenum); - int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); - - char *src_data = src.get_data(); - char *dst_data = dst.get_data(); - - // check that arrays do not overlap, and concurrent copying is safe. 
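The packing helper above concatenates the common shape and both stride vectors into a single buffer of length 3*nd, builds it in USM-host memory, and copies it to the device in one transfer, keeping the host temporary alive in a host_task until the copy completes. The layout itself is simple enough to show on the host (illustrative values):

// Host-only sketch of the [common_shape | src_strides | dst_strides] layout.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    std::vector<std::ptrdiff_t> common_shape{3, 4};
    std::vector<std::ptrdiff_t> src_strides{4, 1};
    std::vector<std::ptrdiff_t> dst_strides{1, 3}; // e.g. F-contiguous dst

    const std::size_t nd = common_shape.size();
    std::vector<std::ptrdiff_t> packed(3 * nd);

    std::copy(common_shape.begin(), common_shape.end(), packed.begin());
    std::copy(src_strides.begin(), src_strides.end(), packed.begin() + nd);
    std::copy(dst_strides.begin(), dst_strides.end(), packed.begin() + 2 * nd);

    for (std::ptrdiff_t v : packed) {
        std::cout << v << ' '; // 3 4 4 1 1 3
    }
    std::cout << '\n';
    return 0;
}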
- auto src_offsets = src.get_minmax_offsets(); - int src_elem_size = src.get_elemsize(); - int dst_elem_size = dst.get_elemsize(); - - bool memory_overlap = - ((dst_data - src_data > src_offsets.second * src_elem_size - - dst_offsets.first * dst_elem_size) && - (src_data - dst_data > dst_offsets.second * dst_elem_size - - src_offsets.first * src_elem_size)); - if (memory_overlap) { - // TODO: could use a temporary, but this is done by the caller - throw py::value_error("Arrays index overlapping segments of memory"); - } - - int src_flags = src.get_flags(); - int dst_flags = dst.get_flags(); - - // check for applicability of special cases: - // (same type && (both C-contiguous || both F-contiguous) - bool both_c_contig = ((src_flags & USM_ARRAY_C_CONTIGUOUS) && - (dst_flags & USM_ARRAY_C_CONTIGUOUS)); - bool both_f_contig = ((src_flags & USM_ARRAY_F_CONTIGUOUS) && - (dst_flags & USM_ARRAY_F_CONTIGUOUS)); - if (both_c_contig || both_f_contig) { - if (src_type_id == dst_type_id) { - - sycl::event copy_ev = - exec_q.memcpy(static_cast(dst_data), - static_cast(src_data), - src_nelems * src_elem_size, depends); - - // make sure src and dst are not GC-ed before copy_ev is complete - return std::make_pair( - keep_args_alive(exec_q, {src, dst}, {copy_ev}), copy_ev); - } - // With contract_iter2 in place, there is no need to write - // dedicated kernels for casting between contiguous arrays - } - - const py::ssize_t *src_strides = src.get_strides_raw(); - const py::ssize_t *dst_strides = dst.get_strides_raw(); - - using shT = std::vector; - shT simplified_shape; - shT simplified_src_strides; - shT simplified_dst_strides; - py::ssize_t src_offset(0); - py::ssize_t dst_offset(0); - - int nd = src_nd; - const py::ssize_t *shape = src_shape; - - bool is_src_c_contig = ((src_flags & USM_ARRAY_C_CONTIGUOUS) != 0); - bool is_src_f_contig = ((src_flags & USM_ARRAY_F_CONTIGUOUS) != 0); - - bool is_dst_c_contig = ((dst_flags & USM_ARRAY_C_CONTIGUOUS) != 0); - bool is_dst_f_contig = ((dst_flags & USM_ARRAY_F_CONTIGUOUS) != 0); - - constexpr py::ssize_t src_itemsize = 1; // in elements - constexpr py::ssize_t dst_itemsize = 1; // in elements - - // all args except itemsizes and is_?_contig bools can be modified by - // reference - simplify_iteration_space(nd, shape, src_strides, src_itemsize, - is_src_c_contig, is_src_f_contig, dst_strides, - dst_itemsize, is_dst_c_contig, is_dst_f_contig, - simplified_shape, simplified_src_strides, - simplified_dst_strides, src_offset, dst_offset); - - if (nd < 3) { - if (nd == 1) { - std::array shape_arr = {shape[0]}; - // strides may be null - std::array src_strides_arr = { - (src_strides ? src_strides[0] : 1)}; - std::array dst_strides_arr = { - (dst_strides ? 
dst_strides[0] : 1)}; - - auto fn = copy_and_cast_1d_dispatch_table[dst_type_id][src_type_id]; - sycl::event copy_and_cast_1d_event = fn( - exec_q, src_nelems, shape_arr, src_strides_arr, dst_strides_arr, - src_data, src_offset, dst_data, dst_offset, depends); - - return std::make_pair( - keep_args_alive(exec_q, {src, dst}, {copy_and_cast_1d_event}), - copy_and_cast_1d_event); - } - else if (nd == 2) { - std::array shape_arr = {shape[0], shape[1]}; - std::array src_strides_arr = {src_strides[0], - src_strides[1]}; - std::array dst_strides_arr = {dst_strides[0], - dst_strides[1]}; - - auto fn = copy_and_cast_2d_dispatch_table[dst_type_id][src_type_id]; - sycl::event copy_and_cast_2d_event = fn( - exec_q, src_nelems, shape_arr, src_strides_arr, dst_strides_arr, - src_data, src_offset, dst_data, dst_offset, depends); - - return std::make_pair( - keep_args_alive(exec_q, {src, dst}, {copy_and_cast_2d_event}), - copy_and_cast_2d_event); - } - else if (nd == 0) { // case of a scalar - assert(src_nelems == 1); - std::array shape_arr = {1}; - std::array src_strides_arr = {1}; - std::array dst_strides_arr = {1}; - - auto fn = copy_and_cast_1d_dispatch_table[dst_type_id][src_type_id]; - sycl::event copy_and_cast_0d_event = fn( - exec_q, src_nelems, shape_arr, src_strides_arr, dst_strides_arr, - src_data, src_offset, dst_data, dst_offset, depends); - - return std::make_pair( - keep_args_alive(exec_q, {src, dst}, {copy_and_cast_0d_event}), - copy_and_cast_0d_event); - } - } - - // Generic implementation - auto copy_and_cast_fn = - copy_and_cast_generic_dispatch_table[dst_type_id][src_type_id]; +using dpctl::tensor::c_contiguous_strides; +using dpctl::tensor::f_contiguous_strides; - // If shape/strides are accessed with accessors, buffer destructor - // will force syncronization. 
- py::ssize_t *shape_strides = - sycl::malloc_device(3 * nd, exec_q); +using dpctl::tensor::py_internal::copy_usm_ndarray_into_usm_ndarray; - if (shape_strides == nullptr) { - throw std::runtime_error("Unabled to allocate device memory"); - } +/* =========================== Copy for reshape ============================= */ - sycl::event copy_shape_ev = - _populate_packed_shape_strides_for_copycast_kernel( - exec_q, shape_strides, simplified_shape, simplified_src_strides, - simplified_dst_strides); - - sycl::event copy_and_cast_generic_ev = copy_and_cast_fn( - exec_q, src_nelems, nd, shape_strides, src_data, src_offset, dst_data, - dst_offset, depends, {copy_shape_ev}); - - // async free of shape_strides temporary - auto ctx = exec_q.get_context(); - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(copy_and_cast_generic_ev); - cgh.host_task( - [ctx, shape_strides]() { sycl::free(shape_strides, ctx); }); - }); - - return std::make_pair( - keep_args_alive(exec_q, {src, dst}, {copy_and_cast_generic_ev}), - copy_and_cast_generic_ev); -} - -/* =========================== Copy for reshape ============================== - */ - -template class GenericCopyForReshapeFunctor -{ -private: - py::ssize_t offset = 0; - py::ssize_t size = 1; - int src_nd = -1; - int dst_nd = -1; - // USM array of size 2*(src_nd + dst_nd) - // [ src_shape; src_strides; dst_shape; dst_strides ] - const py::ssize_t *src_dst_shapes_and_strides = nullptr; - Ty *src_p = nullptr; - Ty *dst_p = nullptr; - -public: - GenericCopyForReshapeFunctor(py::ssize_t shift, - py::ssize_t nelems, - int src_ndim, - int dst_ndim, - const py::ssize_t *packed_shapes_and_strides, - char *src_ptr, - char *dst_ptr) - : offset(shift), size(nelems), src_nd(src_ndim), dst_nd(dst_ndim), - src_dst_shapes_and_strides(packed_shapes_and_strides), - src_p(reinterpret_cast(src_ptr)), - dst_p(reinterpret_cast(dst_ptr)) - { - } - - void operator()(sycl::id<1> wiid) const - { - py::ssize_t this_src_offset(0); - CIndexer_vector src_indxr(src_nd); - - src_indxr.get_displacement( - static_cast(wiid.get(0)), - const_cast( - src_dst_shapes_and_strides), // src shape - const_cast(src_dst_shapes_and_strides + - src_nd), // src strides - this_src_offset // modified by reference - ); - const Ty *in = src_p + this_src_offset; - - py::ssize_t this_dst_offset(0); - CIndexer_vector dst_indxr(dst_nd); - py::ssize_t shifted_wiid = - (static_cast(wiid.get(0)) + offset) % size; - shifted_wiid = (shifted_wiid >= 0) ? 
shifted_wiid : shifted_wiid + size; - dst_indxr.get_displacement( - shifted_wiid, - const_cast(src_dst_shapes_and_strides + - 2 * src_nd), // dst shape - const_cast(src_dst_shapes_and_strides + - 2 * src_nd + dst_nd), // dst strides - this_dst_offset // modified by reference - ); - - Ty *out = dst_p + this_dst_offset; - *out = *in; - } -}; - -// define function type -typedef sycl::event (*copy_for_reshape_fn_ptr_t)( - sycl::queue, - py::ssize_t, // shift - size_t, // num_elements - int, - int, // src_nd, dst_nd - py::ssize_t *, // packed shapes and strides - char *, // src_data_ptr - char *, // dst_data_ptr - const std::vector &); - -template -sycl::event -copy_for_reshape_generic_impl(sycl::queue q, - py::ssize_t shift, - size_t nelems, - int src_nd, - int dst_nd, - py::ssize_t *packed_shapes_and_strides, - char *src_p, - char *dst_p, - const std::vector &depends) -{ - sycl::event copy_for_reshape_ev = q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.parallel_for>( - sycl::range<1>(nelems), - GenericCopyForReshapeFunctor(shift, nelems, src_nd, dst_nd, - packed_shapes_and_strides, src_p, - dst_p)); - }); - - return copy_for_reshape_ev; -} - -// define static vector -static copy_for_reshape_fn_ptr_t - copy_for_reshape_generic_dispatch_vector[_ns::num_types]; - -template struct CopyForReshapeGenericFactory -{ - fnT get() - { - fnT f = copy_for_reshape_generic_impl; - return f; - } -}; - -/* - * Copies src into dst (same data type) of different shapes by using flat - * iterations. - * - * Equivalent to the following loop: - * - * for i for range(src.size): - * dst[np.multi_index(i, dst.shape)] = src[np.multi_index(i, src.shape)] - */ -std::pair -copy_usm_ndarray_for_reshape(dpctl::tensor::usm_ndarray src, - dpctl::tensor::usm_ndarray dst, - py::ssize_t shift, - sycl::queue exec_q, - const std::vector &depends = {}) -{ - py::ssize_t src_nelems = src.get_size(); - py::ssize_t dst_nelems = dst.get_size(); - - // Must have the same number of elements - if (src_nelems != dst_nelems) { - throw py::value_error( - "copy_usm_ndarray_for_reshape requires src and dst to " - "have the same number of elements."); - } - - int src_typenum = src.get_typenum(); - int dst_typenum = dst.get_typenum(); - - // typenames must be the same - if (src_typenum != dst_typenum) { - throw py::value_error( - "copy_usm_ndarray_for_reshape requires src and dst to " - "have the same type."); - } - - if (src_nelems == 0) { - return std::make_pair(sycl::event(), sycl::event()); - } - - // destination must be ample enough to accomodate all elements - { - auto dst_offsets = dst.get_minmax_offsets(); - py::ssize_t range = - static_cast(dst_offsets.second - dst_offsets.first); - if (range + 1 < src_nelems) { - throw py::value_error( - "Destination array can not accomodate all the " - "elements of source array."); - } - } - - // check same contexts - sycl::queue src_q = src.get_queue(); - sycl::queue dst_q = dst.get_queue(); - - if (!dpctl::utils::queues_are_compatible(exec_q, {src_q, dst_q})) { - throw py::value_error( - "Execution queue is not compatible with allocation queues"); - } - - if (src_nelems == 1) { - // handle special case of 1-element array - int src_elemsize = src.get_elemsize(); - char *src_data = src.get_data(); - char *dst_data = dst.get_data(); - sycl::event copy_ev = - exec_q.copy(src_data, dst_data, src_elemsize); - return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}), - copy_ev); - } - - // dimensions may be different - int src_nd = src.get_ndim(); - int dst_nd = 
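The reshape copy described by the comment above walks a single flat index over both arrays and converts it into per-array displacements. The plain C++ sketch below mimics that with a hypothetical unravel() helper standing in for CIndexer_vector::get_displacement; shapes, strides and the shift are illustrative:

// Host-only sketch: dst[unravel(shifted_i, dst_shape)] = src[unravel(i, src_shape)].
#include <cstddef>
#include <iostream>
#include <vector>

static std::ptrdiff_t unravel(std::ptrdiff_t flat,
                              const std::vector<std::ptrdiff_t> &shape,
                              const std::vector<std::ptrdiff_t> &strides)
{
    std::ptrdiff_t offset = 0;
    for (std::size_t d = shape.size(); d-- > 0;) {
        offset += (flat % shape[d]) * strides[d]; // last dimension varies fastest
        flat /= shape[d];
    }
    return offset;
}

int main()
{
    const std::vector<int> src{1, 2, 3, 4, 5, 6}; // 2x3, C-order
    std::vector<int> dst(6, 0);                   // 3x2, C-order
    const std::vector<std::ptrdiff_t> src_shape{2, 3}, src_strides{3, 1};
    const std::vector<std::ptrdiff_t> dst_shape{3, 2}, dst_strides{2, 1};
    const std::ptrdiff_t size = 6, shift = 2;

    for (std::ptrdiff_t i = 0; i < size; ++i) {
        std::ptrdiff_t j = (i + shift) % size;
        if (j < 0) {
            j += size; // same adjustment as the negative-shift branch above
        }
        dst[unravel(j, dst_shape, dst_strides)] =
            src[unravel(i, src_shape, src_strides)];
    }

    for (int v : dst) {
        std::cout << v << ' '; // 5 6 1 2 3 4
    }
    std::cout << '\n';
    return 0;
}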
dst.get_ndim(); - - const py::ssize_t *src_shape = src.get_shape_raw(); - const py::ssize_t *dst_shape = dst.get_shape_raw(); - - int type_id = array_types.typenum_to_lookup_id(src_typenum); - - auto fn = copy_for_reshape_generic_dispatch_vector[type_id]; - - // packed_shape_strides = [src_shape, src_strides, dst_shape, dst_strides] - py::ssize_t *packed_shapes_strides = - sycl::malloc_device(2 * (src_nd + dst_nd), exec_q); - - if (packed_shapes_strides == nullptr) { - throw std::runtime_error("Unabled to allocate device memory"); - } - - using usm_host_allocatorT = - sycl::usm_allocator; - using shT = std::vector; - usm_host_allocatorT allocator(exec_q); - std::shared_ptr packed_host_shapes_strides_shp = - std::make_shared(2 * (src_nd + dst_nd), allocator); - - std::copy(src_shape, src_shape + src_nd, - packed_host_shapes_strides_shp->begin()); - std::copy(dst_shape, dst_shape + dst_nd, - packed_host_shapes_strides_shp->begin() + 2 * src_nd); - - const py::ssize_t *src_strides = src.get_strides_raw(); - if (src_strides == nullptr) { - int src_flags = src.get_flags(); - if (src_flags & USM_ARRAY_C_CONTIGUOUS) { - const auto &src_contig_strides = - c_contiguous_strides(src_nd, src_shape); - std::copy(src_contig_strides.begin(), src_contig_strides.end(), - packed_host_shapes_strides_shp->begin() + src_nd); - } - else if (src_flags & USM_ARRAY_F_CONTIGUOUS) { - const auto &src_contig_strides = - c_contiguous_strides(src_nd, src_shape); - std::copy(src_contig_strides.begin(), src_contig_strides.end(), - packed_host_shapes_strides_shp->begin() + src_nd); - } - else { - sycl::free(packed_shapes_strides, exec_q); - throw std::runtime_error( - "Invalid src array encountered: in copy_for_reshape function"); - } - } - else { - std::copy(src_strides, src_strides + src_nd, - packed_host_shapes_strides_shp->begin() + src_nd); - } - - const py::ssize_t *dst_strides = dst.get_strides_raw(); - if (dst_strides == nullptr) { - int dst_flags = dst.get_flags(); - if (dst_flags & USM_ARRAY_C_CONTIGUOUS) { - const auto &dst_contig_strides = - c_contiguous_strides(dst_nd, dst_shape); - std::copy(dst_contig_strides.begin(), dst_contig_strides.end(), - packed_host_shapes_strides_shp->begin() + 2 * src_nd + - dst_nd); - } - else if (dst_flags & USM_ARRAY_F_CONTIGUOUS) { - const auto &dst_contig_strides = - f_contiguous_strides(dst_nd, dst_shape); - std::copy(dst_contig_strides.begin(), dst_contig_strides.end(), - packed_host_shapes_strides_shp->begin() + 2 * src_nd + - dst_nd); - } - else { - sycl::free(packed_shapes_strides, exec_q); - throw std::runtime_error( - "Invalid dst array encountered: in copy_for_reshape function"); - } - } - else { - std::copy(dst_strides, dst_strides + dst_nd, - packed_host_shapes_strides_shp->begin() + 2 * src_nd + - dst_nd); - } - - // copy packed shapes and strides from host to devices - sycl::event packed_shape_strides_copy_ev = exec_q.copy( - packed_host_shapes_strides_shp->data(), packed_shapes_strides, - packed_host_shapes_strides_shp->size()); - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(packed_shape_strides_copy_ev); - cgh.host_task([packed_host_shapes_strides_shp] { - // Capturing shared pointer ensures that the underlying vector is - // not destroyed until after its data are copied into packed USM - // vector - }); - }); - - char *src_data = src.get_data(); - char *dst_data = dst.get_data(); - - std::vector all_deps(depends.size() + 1); - all_deps.push_back(packed_shape_strides_copy_ev); - all_deps.insert(std::end(all_deps), std::begin(depends), 
std::end(depends)); - - sycl::event copy_for_reshape_event = - fn(exec_q, shift, src_nelems, src_nd, dst_nd, packed_shapes_strides, - src_data, dst_data, all_deps); - - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(copy_for_reshape_event); - auto ctx = exec_q.get_context(); - cgh.host_task([packed_shapes_strides, ctx]() { - sycl::free(packed_shapes_strides, ctx); - }); - }); - - return std::make_pair( - keep_args_alive(exec_q, {src, dst}, {copy_for_reshape_event}), - copy_for_reshape_event); -} +using dpctl::tensor::py_internal::copy_usm_ndarray_for_reshape; /* ============= Copy from numpy.ndarray to usm_ndarray ==================== */ -template -class CasterForAccessor -{ -public: - CasterForAccessor() = default; - void operator()(AccessorT src, - std::ptrdiff_t src_offset, - char *dst, - std::ptrdiff_t dst_offset) const - { - dstT *dst_ = reinterpret_cast(dst) + dst_offset; - *dst_ = convert_impl(src[src_offset]); - } -}; - -template class GenericCopyFromHostFunctor -{ -private: - AccessorT src_acc_; - char *dst_ = nullptr; - py::ssize_t *shape_strides_ = nullptr; - int nd_ = 0; - py::ssize_t src_offset0 = 0; - py::ssize_t dst_offset0 = 0; - -public: - GenericCopyFromHostFunctor(AccessorT src_acc, - char *dst_cp, - py::ssize_t *shape_strides, - int nd, - py::ssize_t src_offset, - py::ssize_t dst_offset) - : src_acc_(src_acc), dst_(dst_cp), shape_strides_(shape_strides), - nd_(nd), src_offset0(src_offset), dst_offset0(dst_offset) - { - } - - void operator()(sycl::id<1> wiid) const - { - py::ssize_t src_offset(0); - py::ssize_t dst_offset(0); - CIndexer_vector indxr(nd_); - indxr.get_displacement( - static_cast(wiid.get(0)), - const_cast(shape_strides_), // common shape - const_cast(shape_strides_ + - nd_), // src strides - const_cast(shape_strides_ + - 2 * nd_), // dst strides - src_offset, // modified by reference - dst_offset // modified by reference - ); - CastFnT fn{}; - fn(src_acc_, src_offset0 + src_offset, dst_, dst_offset0 + dst_offset); - } -}; - -typedef void (*copy_and_cast_from_host_blocking_fn_ptr_t)( - sycl::queue, - size_t, - int, - py::ssize_t *, - const char *, - py::ssize_t, - py::ssize_t, - py::ssize_t, - char *, - py::ssize_t, - const std::vector &, - const std::vector &); - -template -void copy_and_cast_from_host_impl( - sycl::queue q, - size_t nelems, - int nd, - py::ssize_t *shape_and_strides, - const char *host_src_p, - py::ssize_t src_offset, - py::ssize_t src_min_nelem_offset, - py::ssize_t src_max_nelem_offset, - char *dst_p, - py::ssize_t dst_offset, - const std::vector &depends, - const std::vector &additional_depends) -{ - py::ssize_t nelems_range = src_max_nelem_offset - src_min_nelem_offset + 1; - sycl::buffer npy_buf( - reinterpret_cast(host_src_p) + src_min_nelem_offset, - sycl::range<1>(nelems_range), {sycl::property::buffer::use_host_ptr{}}); - - sycl::event copy_and_cast_from_host_ev = q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.depends_on(additional_depends); - - sycl::accessor npy_acc(npy_buf, cgh, sycl::read_only); - - cgh.parallel_for>( - sycl::range<1>(nelems), - GenericCopyFromHostFunctor< - CasterForAccessor, - decltype(npy_acc)>(npy_acc, dst_p, shape_and_strides, nd, - src_offset - src_min_nelem_offset, - dst_offset)); - }); - - copy_and_cast_from_host_ev.wait_and_throw(); - - return; -} - -static copy_and_cast_from_host_blocking_fn_ptr_t - copy_and_cast_from_host_blocking_dispatch_table[_ns::num_types] - [_ns::num_types]; - -template -struct CopyAndCastFromHostFactory -{ - fnT get() - { - fnT f = 
copy_and_cast_from_host_impl; - return f; - } -}; - -void copy_numpy_ndarray_into_usm_ndarray( - py::array npy_src, - dpctl::tensor::usm_ndarray dst, - sycl::queue exec_q, - const std::vector &depends = {}) -{ - int src_ndim = npy_src.ndim(); - int dst_ndim = dst.get_ndim(); - - if (src_ndim != dst_ndim) { - throw py::value_error("Source ndarray and destination usm_ndarray have " - "different array ranks, " - "i.e. different number of indices needed to " - "address array elements."); - } - - const py::ssize_t *src_shape = npy_src.shape(); - const py::ssize_t *dst_shape = dst.get_shape_raw(); - bool shapes_equal(true); - size_t src_nelems(1); - for (int i = 0; i < src_ndim; ++i) { - shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]); - src_nelems *= static_cast(src_shape[i]); - } - - if (!shapes_equal) { - throw py::value_error("Source ndarray and destination usm_ndarray have " - "difference shapes."); - } - - if (src_nelems == 0) { - // nothing to do - return; - } - - auto dst_offsets = dst.get_minmax_offsets(); - // destination must be ample enough to accomodate all elements of source - // array - { - size_t range = - static_cast(dst_offsets.second - dst_offsets.first); - if (range + 1 < src_nelems) { - throw py::value_error( - "Destination array can not accomodate all the " - "elements of source array."); - } - } - - sycl::queue dst_q = dst.get_queue(); - - if (!dpctl::utils::queues_are_compatible(exec_q, {dst_q})) { - throw py::value_error("Execution queue is not compatible with the " - "allocation queue"); - } - - // here we assume that NumPy's type numbers agree with ours for types - // supported in both - int src_typenum = - py::detail::array_descriptor_proxy(npy_src.dtype().ptr())->type_num; - int dst_typenum = dst.get_typenum(); - - int src_type_id = array_types.typenum_to_lookup_id(src_typenum); - int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); - - py::buffer_info src_pybuf = npy_src.request(); - const char *const src_data = static_cast(src_pybuf.ptr); - char *dst_data = dst.get_data(); - - int src_flags = npy_src.flags(); - int dst_flags = dst.get_flags(); - - // check for applicability of special cases: - // (same type && (both C-contiguous || both F-contiguous) - bool both_c_contig = ((src_flags & py::array::c_style) && - (dst_flags & USM_ARRAY_C_CONTIGUOUS)); - bool both_f_contig = ((src_flags & py::array::f_style) && - (dst_flags & USM_ARRAY_F_CONTIGUOUS)); - if (both_c_contig || both_f_contig) { - if (src_type_id == dst_type_id) { - int src_elem_size = npy_src.itemsize(); - - sycl::event copy_ev = - exec_q.memcpy(static_cast(dst_data), - static_cast(src_data), - src_nelems * src_elem_size, depends); - - // wait for copy_ev to complete - copy_ev.wait_and_throw(); - - return; - } - // With contract_iter2 in place, there is no need to write - // dedicated kernels for casting between contiguous arrays - } - - const py::ssize_t *src_strides = - npy_src.strides(); // N.B.: strides in bytes - const py::ssize_t *dst_strides = - dst.get_strides_raw(); // N.B.: strides in elements - - using shT = std::vector; - shT simplified_shape; - shT simplified_src_strides; - shT simplified_dst_strides; - py::ssize_t src_offset(0); - py::ssize_t dst_offset(0); - - py::ssize_t src_itemsize = npy_src.itemsize(); // item size in bytes - constexpr py::ssize_t dst_itemsize = 1; // item size in elements - - int nd = src_ndim; - const py::ssize_t *shape = src_shape; - - bool is_src_c_contig = ((src_flags & py::array::c_style) != 0); - bool is_src_f_contig = ((src_flags & 
py::array::f_style) != 0); - - bool is_dst_c_contig = ((dst_flags & USM_ARRAY_C_CONTIGUOUS) != 0); - bool is_dst_f_contig = ((dst_flags & USM_ARRAY_F_CONTIGUOUS) != 0); - - // all args except itemsizes and is_?_contig bools can be modified by - // reference - simplify_iteration_space(nd, shape, src_strides, src_itemsize, - is_src_c_contig, is_src_f_contig, dst_strides, - dst_itemsize, is_dst_c_contig, is_dst_f_contig, - simplified_shape, simplified_src_strides, - simplified_dst_strides, src_offset, dst_offset); - - assert(simplified_shape.size() == static_cast(nd)); - assert(simplified_src_strides.size() == static_cast(nd)); - assert(simplified_dst_strides.size() == static_cast(nd)); - - // handle nd == 0 - if (nd == 0) { - nd = 1; - simplified_shape.reserve(nd); - simplified_shape.push_back(1); - - simplified_src_strides.reserve(nd); - simplified_src_strides.push_back(src_itemsize); - - simplified_dst_strides.reserve(nd); - simplified_dst_strides.push_back(dst_itemsize); - } - - // Minumum and maximum element offsets for source np.ndarray - py::ssize_t npy_src_min_nelem_offset(0); - py::ssize_t npy_src_max_nelem_offset(0); - for (int i = 0; i < nd; ++i) { - // convert source strides from bytes to elements - simplified_src_strides[i] = simplified_src_strides[i] / src_itemsize; - if (simplified_src_strides[i] < 0) { - npy_src_min_nelem_offset += - simplified_src_strides[i] * (simplified_shape[i] - 1); - } - else { - npy_src_max_nelem_offset += - simplified_src_strides[i] * (simplified_shape[i] - 1); - } - } - - // Create shared pointers with shape and src/dst strides, copy into device - // memory - using shT = std::vector; - - // Get implementation function pointer - auto copy_and_cast_from_host_blocking_fn = - copy_and_cast_from_host_blocking_dispatch_table[dst_type_id] - [src_type_id]; - - // If shape/strides are accessed with accessors, buffer destructor - // will force syncronization. 
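Before wrapping the NumPy host pointer, the removed code above converts the source's byte strides to element strides and accumulates the minimum and maximum element offsets, which bound the host range that must be exposed to the kernel. A host-only sketch of that bookkeeping (illustrative shape and strides, including a negative stride):

// Host-only sketch of the min/max element-offset computation for a host array.
#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    const std::ptrdiff_t itemsize = 8;                 // e.g. float64
    std::vector<std::ptrdiff_t> shape{3, 4};
    std::vector<std::ptrdiff_t> strides_bytes{-32, 8}; // negative outer stride

    std::ptrdiff_t min_offset = 0, max_offset = 0;
    for (std::size_t i = 0; i < shape.size(); ++i) {
        const std::ptrdiff_t stride_elems = strides_bytes[i] / itemsize;
        if (stride_elems < 0) {
            min_offset += stride_elems * (shape[i] - 1);
        }
        else {
            max_offset += stride_elems * (shape[i] - 1);
        }
    }

    std::cout << min_offset << ' ' << max_offset << '\n'; // prints: -8 3
    return 0;
}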
- py::ssize_t *shape_strides = - sycl::malloc_device(3 * nd, exec_q); - - if (shape_strides == nullptr) { - throw std::runtime_error("Unabled to allocate device memory"); - } - - using usm_host_allocatorT = - sycl::usm_allocator; - using usmshT = std::vector; - usm_host_allocatorT alloc(exec_q); - - auto host_shape_strides_shp = std::make_shared(3 * nd, alloc); - std::copy(simplified_shape.begin(), simplified_shape.end(), - host_shape_strides_shp->begin()); - std::copy(simplified_src_strides.begin(), simplified_src_strides.end(), - host_shape_strides_shp->begin() + nd); - std::copy(simplified_dst_strides.begin(), simplified_dst_strides.end(), - host_shape_strides_shp->begin() + 2 * nd); - - sycl::event copy_packed_ev = - exec_q.copy(host_shape_strides_shp->data(), shape_strides, - host_shape_strides_shp->size()); - - copy_and_cast_from_host_blocking_fn( - exec_q, src_nelems, nd, shape_strides, src_data, src_offset, - npy_src_min_nelem_offset, npy_src_max_nelem_offset, dst_data, - dst_offset, depends, {copy_packed_ev}); - - sycl::free(shape_strides, exec_q); - - return; -} - -/* =========== Unboxing Python scalar =============== */ - -template T unbox_py_scalar(py::object o) -{ - return py::cast(o); -} - -template <> sycl::half unbox_py_scalar(py::object o) -{ - float tmp = py::cast(o); - return static_cast(tmp); -} +using dpctl::tensor::py_internal::copy_numpy_ndarray_into_usm_ndarray; /* ============= linear-sequence ==================== */ -typedef sycl::event (*lin_space_step_fn_ptr_t)( - sycl::queue, - size_t, // num_elements - py::object start, - py::object step, - char *, // dst_data_ptr - const std::vector &); - -static lin_space_step_fn_ptr_t lin_space_step_dispatch_vector[_ns::num_types]; - -template class LinearSequenceStepFunctor -{ -private: - Ty *p = nullptr; - Ty start_v; - Ty step_v; - -public: - LinearSequenceStepFunctor(char *dst_p, Ty v0, Ty dv) - : p(reinterpret_cast(dst_p)), start_v(v0), step_v(dv) - { - } - - void operator()(sycl::id<1> wiid) const - { - auto i = wiid.get(0); - if constexpr (is_complex::value) { - p[i] = Ty{start_v.real() + i * step_v.real(), - start_v.imag() + i * step_v.imag()}; - } - else { - p[i] = start_v + i * step_v; - } - } -}; - -template -sycl::event lin_space_step_impl(sycl::queue exec_q, - size_t nelems, - py::object start, - py::object step, - char *array_data, - const std::vector &depends) -{ - Ty start_v; - Ty step_v; - try { - start_v = unbox_py_scalar(start); - step_v = unbox_py_scalar(step); - } catch (const py::error_already_set &e) { - throw; - } - - sycl::event lin_space_step_event = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.parallel_for>( - sycl::range<1>{nelems}, - LinearSequenceStepFunctor(array_data, start_v, step_v)); - }); - - return lin_space_step_event; -} - -template struct LinSpaceStepFactory -{ - fnT get() - { - fnT f = lin_space_step_impl; - return f; - } -}; - -typedef sycl::event (*lin_space_affine_fn_ptr_t)( - sycl::queue, - size_t, // num_elements - py::object start, - py::object end, - bool include_endpoint, - char *, // dst_data_ptr - const std::vector &); - -static lin_space_affine_fn_ptr_t - lin_space_affine_dispatch_vector[_ns::num_types]; - -template class LinearSequenceAffineFunctor -{ -private: - Ty *p = nullptr; - Ty start_v; - Ty end_v; - size_t n; - -public: - LinearSequenceAffineFunctor(char *dst_p, Ty v0, Ty v1, size_t den) - : p(reinterpret_cast(dst_p)), start_v(v0), end_v(v1), - n((den == 0) ? 
1 : den) - { - } - - void operator()(sycl::id<1> wiid) const - { - auto i = wiid.get(0); - wTy wc = wTy(i) / n; - wTy w = wTy(n - i) / n; - if constexpr (is_complex::value) { - auto _w = static_cast(w); - auto _wc = static_cast(wc); - auto re_comb = start_v.real() * _w + end_v.real() * _wc; - auto im_comb = start_v.imag() * _w + end_v.imag() * _wc; - Ty affine_comb = Ty{re_comb, im_comb}; - p[i] = affine_comb; - } - else { - auto affine_comb = start_v * w + end_v * wc; - p[i] = convert_impl(affine_comb); - } - } -}; - -template -sycl::event lin_space_affine_impl(sycl::queue exec_q, - size_t nelems, - py::object start, - py::object end, - bool include_endpoint, - char *array_data, - const std::vector &depends) -{ - Ty start_v, end_v; - try { - start_v = unbox_py_scalar(start); - end_v = unbox_py_scalar(end); - } catch (const py::error_already_set &e) { - throw; - } - - bool device_supports_doubles = exec_q.get_device().has(sycl::aspect::fp64); - sycl::event lin_space_affine_event = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - if (device_supports_doubles) { - cgh.parallel_for>( - sycl::range<1>{nelems}, - LinearSequenceAffineFunctor( - array_data, start_v, end_v, - (include_endpoint) ? nelems - 1 : nelems)); - } - else { - cgh.parallel_for>( - sycl::range<1>{nelems}, - LinearSequenceAffineFunctor( - array_data, start_v, end_v, - (include_endpoint) ? nelems - 1 : nelems)); - } - }); - - return lin_space_affine_event; -} - -template struct LinSpaceAffineFactory -{ - fnT get() - { - fnT f = lin_space_affine_impl; - return f; - } -}; - -std::pair -usm_ndarray_linear_sequence_step(py::object start, - py::object dt, - dpctl::tensor::usm_ndarray dst, - sycl::queue exec_q, - const std::vector &depends = {}) -{ - // dst must be 1D and C-contiguous - // start, end should be coercible into data type of dst - - if (dst.get_ndim() != 1) { - throw py::value_error( - "usm_ndarray_linspace: Expecting 1D array to populate"); - } - - int flags = dst.get_flags(); - if (!(flags & USM_ARRAY_C_CONTIGUOUS)) { - throw py::value_error( - "usm_ndarray_linspace: Non-contiguous arrays are not supported"); - } - - sycl::queue dst_q = dst.get_queue(); - if (dst_q != exec_q && dst_q.get_context() != exec_q.get_context()) { - throw py::value_error( - "Execution queue context is not the same as allocation context"); - } - - int dst_typenum = dst.get_typenum(); - int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); - - py::ssize_t len = dst.get_shape(0); - if (len == 0) { - // nothing to do - return std::make_pair(sycl::event{}, sycl::event{}); - } - - char *dst_data = dst.get_data(); - sycl::event linspace_step_event; - - auto fn = lin_space_step_dispatch_vector[dst_typeid]; - - linspace_step_event = - fn(exec_q, static_cast(len), start, dt, dst_data, depends); - - return std::make_pair(keep_args_alive(exec_q, {dst}, {linspace_step_event}), - linspace_step_event); -} - -std::pair -usm_ndarray_linear_sequence_affine(py::object start, - py::object end, - dpctl::tensor::usm_ndarray dst, - bool include_endpoint, - sycl::queue exec_q, - const std::vector &depends = {}) -{ - // dst must be 1D and C-contiguous - // start, end should be coercible into data type of dst - - if (dst.get_ndim() != 1) { - throw py::value_error( - "usm_ndarray_linspace: Expecting 1D array to populate"); - } - - int flags = dst.get_flags(); - if (!(flags & USM_ARRAY_C_CONTIGUOUS)) { - throw py::value_error( - "usm_ndarray_linspace: Non-contiguous arrays are not supported"); - } - - sycl::queue dst_q = dst.get_queue(); - 
if (dst_q != exec_q && dst_q.get_context() != exec_q.get_context()) { - throw py::value_error( - "Execution queue context is not the same as allocation context"); - } - - int dst_typenum = dst.get_typenum(); - int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); - - py::ssize_t len = dst.get_shape(0); - if (len == 0) { - // nothing to do - return std::make_pair(sycl::event{}, sycl::event{}); - } - - char *dst_data = dst.get_data(); - sycl::event linspace_affine_event; - - auto fn = lin_space_affine_dispatch_vector[dst_typeid]; - - linspace_affine_event = fn(exec_q, static_cast(len), start, end, - include_endpoint, dst_data, depends); - - return std::make_pair( - keep_args_alive(exec_q, {dst}, {linspace_affine_event}), - linspace_affine_event); -} +using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_affine; +using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step; /* ================ Full ================== */ -typedef sycl::event (*full_contig_fn_ptr_t)(sycl::queue, - size_t, - py::object, - char *, - const std::vector &); - -static full_contig_fn_ptr_t full_contig_dispatch_vector[_ns::num_types]; - -template -sycl::event full_contig_impl(sycl::queue q, - size_t nelems, - py::object py_value, - char *dst_p, - const std::vector &depends) -{ - dstTy fill_v; - try { - fill_v = unbox_py_scalar(py_value); - } catch (const py::error_already_set &e) { - throw; - } - - sycl::event fill_ev = q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - dstTy *p = reinterpret_cast(dst_p); - cgh.fill(p, fill_v, nelems); - }); - - return fill_ev; -} - -template struct FullContigFactory -{ - fnT get() - { - fnT f = full_contig_impl; - return f; - } -}; - -std::pair -usm_ndarray_full(py::object py_value, - dpctl::tensor::usm_ndarray dst, - sycl::queue exec_q, - const std::vector &depends = {}) -{ - // start, end should be coercible into data type of dst - - py::ssize_t dst_nelems = dst.get_size(); - - if (dst_nelems == 0) { - // nothing to do - return std::make_pair(sycl::event(), sycl::event()); - } - - int dst_flags = dst.get_flags(); - - sycl::queue dst_q = dst.get_queue(); - if (dst_q != exec_q && dst_q.get_context() != exec_q.get_context()) { - throw py::value_error( - "Execution queue context is not the same as allocation context"); - } - - int dst_typenum = dst.get_typenum(); - int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); - - char *dst_data = dst.get_data(); - sycl::event full_event; - - if (dst_nelems == 1 || (dst_flags & USM_ARRAY_C_CONTIGUOUS) || - (dst_flags & USM_ARRAY_F_CONTIGUOUS)) - { - auto fn = full_contig_dispatch_vector[dst_typeid]; - - sycl::event full_contig_event = - fn(exec_q, static_cast(dst_nelems), py_value, dst_data, - depends); - - return std::make_pair( - keep_args_alive(exec_q, {dst}, {full_contig_event}), - full_contig_event); - } - else { - throw std::runtime_error( - "Only population of contiguous usm_ndarray objects is supported."); - } -} +using dpctl::tensor::py_internal::usm_ndarray_full; /* ================ Eye ================== */ -typedef sycl::event (*eye_fn_ptr_t)(sycl::queue, - size_t nelems, // num_elements - py::ssize_t start, - py::ssize_t end, - py::ssize_t step, - char *, // dst_data_ptr - const std::vector &); - -static eye_fn_ptr_t eye_dispatch_vector[_ns::num_types]; - -template class EyeFunctor -{ -private: - Ty *p = nullptr; - py::ssize_t start_v; - py::ssize_t end_v; - py::ssize_t step_v; - -public: - EyeFunctor(char *dst_p, - const py::ssize_t v0, - const py::ssize_t v1, - const py::ssize_t dv) - 
: p(reinterpret_cast(dst_p)), start_v(v0), end_v(v1), step_v(dv) - { - } - - void operator()(sycl::id<1> wiid) const - { - Ty set_v = 0; - py::ssize_t i = static_cast(wiid.get(0)); - if (i >= start_v and i <= end_v) { - if ((i - start_v) % step_v == 0) { - set_v = 1; - } - } - p[i] = set_v; - } -}; - -template -sycl::event eye_impl(sycl::queue exec_q, - size_t nelems, - const py::ssize_t start, - const py::ssize_t end, - const py::ssize_t step, - char *array_data, - const std::vector &depends) -{ - sycl::event eye_event = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.parallel_for>( - sycl::range<1>{nelems}, - EyeFunctor(array_data, start, end, step)); - }); - - return eye_event; -} - -template struct EyeFactory -{ - fnT get() - { - fnT f = eye_impl; - return f; - } -}; - -std::pair -eye(py::ssize_t k, - dpctl::tensor::usm_ndarray dst, - sycl::queue exec_q, - const std::vector &depends = {}) -{ - // dst must be 2D - - if (dst.get_ndim() != 2) { - throw py::value_error( - "usm_ndarray_eye: Expecting 2D array to populate"); - } - - sycl::queue dst_q = dst.get_queue(); - if (!dpctl::utils::queues_are_compatible(exec_q, {dst_q})) { - throw py::value_error("Execution queue is not compatible with the " - "allocation queue"); - } - - int dst_typenum = dst.get_typenum(); - int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); - - const py::ssize_t nelem = dst.get_size(); - const py::ssize_t rows = dst.get_shape(0); - const py::ssize_t cols = dst.get_shape(1); - if (rows == 0 || cols == 0) { - // nothing to do - return std::make_pair(sycl::event{}, sycl::event{}); - } - - bool is_dst_c_contig = ((dst.get_flags() & USM_ARRAY_C_CONTIGUOUS) != 0); - bool is_dst_f_contig = ((dst.get_flags() & USM_ARRAY_F_CONTIGUOUS) != 0); - if (!is_dst_c_contig && !is_dst_f_contig) { - throw py::value_error("USM array is not contiguous"); - } - - py::ssize_t start; - if (is_dst_c_contig) { - start = (k < 0) ? -k * cols : k; - } - else { - start = (k < 0) ? -k : k * rows; - } - - const py::ssize_t *strides = dst.get_strides_raw(); - py::ssize_t step; - if (strides == nullptr) { - step = (is_dst_c_contig) ? 
cols + 1 : rows + 1; - } - else { - step = strides[0] + strides[1]; - } - - const py::ssize_t length = std::min({rows, cols, rows + k, cols - k}); - const py::ssize_t end = start + step * (length - 1); - - char *dst_data = dst.get_data(); - sycl::event eye_event; - - auto fn = eye_dispatch_vector[dst_typeid]; - - eye_event = fn(exec_q, static_cast(nelem), start, end, step, - dst_data, depends); - - return std::make_pair(keep_args_alive(exec_q, {dst}, {eye_event}), - eye_event); -} +using dpctl::tensor::py_internal::usm_ndarray_eye; /* =========================== Tril and triu ============================== */ -// define function type -typedef sycl::event (*tri_fn_ptr_t)(sycl::queue, - py::ssize_t, // inner_range //py::ssize_t - py::ssize_t, // outer_range - char *, // src_data_ptr - char *, // dst_data_ptr - py::ssize_t, // nd - py::ssize_t *, // shape_and_strides - py::ssize_t, // k - const std::vector &, - const std::vector &); - -template class tri_kernel; -template -sycl::event tri_impl(sycl::queue exec_q, - py::ssize_t inner_range, - py::ssize_t outer_range, - char *src_p, - char *dst_p, - py::ssize_t nd, - py::ssize_t *shape_and_strides, - py::ssize_t k, - const std::vector &depends, - const std::vector &additional_depends) -{ - constexpr int d2 = 2; - py::ssize_t src_s = nd; - py::ssize_t dst_s = 2 * nd; - py::ssize_t nd_1 = nd - 1; - py::ssize_t nd_2 = nd - 2; - Ty *src = reinterpret_cast(src_p); - Ty *dst = reinterpret_cast(dst_p); - - sycl::event tri_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.depends_on(additional_depends); - cgh.parallel_for>( - sycl::range<1>(inner_range * outer_range), [=](sycl::id<1> idx) { - py::ssize_t outer_gid = idx[0] / inner_range; - py::ssize_t inner_gid = idx[0] - inner_range * outer_gid; - - py::ssize_t src_inner_offset, dst_inner_offset; - bool to_copy; - - { - // py::ssize_t inner_gid = idx.get_id(0); - CIndexer_array indexer_i( - {shape_and_strides[nd_2], shape_and_strides[nd_1]}); - indexer_i.set(inner_gid); - const std::array &inner = indexer_i.get(); - src_inner_offset = - inner[0] * shape_and_strides[src_s + nd_2] + - inner[1] * shape_and_strides[src_s + nd_1]; - dst_inner_offset = - inner[0] * shape_and_strides[dst_s + nd_2] + - inner[1] * shape_and_strides[dst_s + nd_1]; - - if (l) - to_copy = (inner[0] + k >= inner[1]); - else - to_copy = (inner[0] + k <= inner[1]); - } - - py::ssize_t src_offset = 0; - py::ssize_t dst_offset = 0; - { - // py::ssize_t outer_gid = idx.get_id(1); - CIndexer_vector outer(nd - d2); - outer.get_displacement( - outer_gid, shape_and_strides, shape_and_strides + src_s, - shape_and_strides + dst_s, src_offset, dst_offset); - } - - src_offset += src_inner_offset; - dst_offset += dst_inner_offset; - - dst[dst_offset] = (to_copy) ? 
src[src_offset] : Ty(0); - }); - }); - return tri_ev; -} - -static tri_fn_ptr_t tril_generic_dispatch_vector[_ns::num_types]; - -template struct TrilGenericFactory -{ - fnT get() - { - fnT f = tri_impl; - return f; - } -}; - -static tri_fn_ptr_t triu_generic_dispatch_vector[_ns::num_types]; - -template struct TriuGenericFactory -{ - fnT get() - { - fnT f = tri_impl; - return f; - } -}; - -std::pair -tri(sycl::queue &exec_q, - dpctl::tensor::usm_ndarray src, - dpctl::tensor::usm_ndarray dst, - char part, - py::ssize_t k = 0, - const std::vector &depends = {}) -{ - // array dimensions must be the same - int src_nd = src.get_ndim(); - int dst_nd = dst.get_ndim(); - if (src_nd != dst_nd) { - throw py::value_error("Array dimensions are not the same."); - } - - if (src_nd < 2) { - throw py::value_error("Array dimensions less than 2."); - } - - // shapes must be the same - const py::ssize_t *src_shape = src.get_shape_raw(); - const py::ssize_t *dst_shape = dst.get_shape_raw(); - - bool shapes_equal(true); - size_t src_nelems(1); - - for (int i = 0; shapes_equal && i < src_nd; ++i) { - src_nelems *= static_cast(src_shape[i]); - shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]); - } - if (!shapes_equal) { - throw py::value_error("Array shapes are not the same."); - } - - if (src_nelems == 0) { - // nothing to do - return std::make_pair(sycl::event(), sycl::event()); - } - - char *src_data = src.get_data(); - char *dst_data = dst.get_data(); - - // check that arrays do not overlap, and concurrent copying is safe. - auto src_offsets = src.get_minmax_offsets(); - auto dst_offsets = dst.get_minmax_offsets(); - int src_elem_size = src.get_elemsize(); - int dst_elem_size = dst.get_elemsize(); - - bool memory_overlap = - ((dst_data - src_data > src_offsets.second * src_elem_size - - dst_offsets.first * dst_elem_size) && - (src_data - dst_data > dst_offsets.second * dst_elem_size - - src_offsets.first * src_elem_size)); - if (memory_overlap) { - // TODO: could use a temporary, but this is done by the caller - throw py::value_error("Arrays index overlapping segments of memory"); - } - - int src_typenum = src.get_typenum(); - int dst_typenum = dst.get_typenum(); - int src_typeid = array_types.typenum_to_lookup_id(src_typenum); - int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); - - if (dst_typeid != src_typeid) { - throw py::value_error("Array dtype are not the same."); - } - - // check same contexts - sycl::queue src_q = src.get_queue(); - sycl::queue dst_q = dst.get_queue(); - - if (!dpctl::utils::queues_are_compatible(exec_q, {src_q, dst_q})) { - throw py::value_error( - "Execution queue context is not the same as allocation contexts"); - } - - using shT = std::vector; - shT src_strides(src_nd); - - int src_flags = src.get_flags(); - bool is_src_c_contig = ((src_flags & USM_ARRAY_C_CONTIGUOUS) != 0); - bool is_src_f_contig = ((src_flags & USM_ARRAY_F_CONTIGUOUS) != 0); - - const py::ssize_t *src_strides_raw = src.get_strides_raw(); - if (src_strides_raw == nullptr) { - if (is_src_c_contig) { - src_strides = c_contiguous_strides(src_nd, src_shape); - } - else if (is_src_f_contig) { - src_strides = f_contiguous_strides(src_nd, src_shape); - } - else { - throw std::runtime_error("Source array has null strides but has " - "neither C- nor F- contiguous flag set"); - } - } - else { - std::copy(src_strides_raw, src_strides_raw + src_nd, - src_strides.begin()); - } - - shT dst_strides(src_nd); - - int dst_flags = dst.get_flags(); - bool is_dst_c_contig = ((dst_flags & 
USM_ARRAY_C_CONTIGUOUS) != 0); - bool is_dst_f_contig = ((dst_flags & USM_ARRAY_F_CONTIGUOUS) != 0); - - const py::ssize_t *dst_strides_raw = dst.get_strides_raw(); - if (dst_strides_raw == nullptr) { - if (is_dst_c_contig) { - dst_strides = c_contiguous_strides(src_nd, src_shape); - } - else if (is_dst_f_contig) { - dst_strides = f_contiguous_strides(src_nd, src_shape); - } - else { - throw std::runtime_error("Source array has null strides but has " - "neither C- nor F- contiguous flag set"); - } - } - else { - std::copy(dst_strides_raw, dst_strides_raw + dst_nd, - dst_strides.begin()); - } - shT simplified_shape; - shT simplified_src_strides; - shT simplified_dst_strides; - py::ssize_t src_offset(0); - py::ssize_t dst_offset(0); - - constexpr py::ssize_t src_itemsize = 1; // item size in elements - constexpr py::ssize_t dst_itemsize = 1; // item size in elements - - int nd = src_nd - 2; - const py::ssize_t *shape = src_shape; - const py::ssize_t *p_src_strides = src_strides.data(); - const py::ssize_t *p_dst_strides = dst_strides.data(); - - simplify_iteration_space(nd, shape, p_src_strides, src_itemsize, - is_src_c_contig, is_src_f_contig, p_dst_strides, - dst_itemsize, is_dst_c_contig, is_dst_f_contig, - simplified_shape, simplified_src_strides, - simplified_dst_strides, src_offset, dst_offset); - - if (src_offset != 0 || dst_offset != 0) { - throw py::value_error("Reversed slice for dst is not supported"); - } - - nd += 2; - - using usm_host_allocatorT = - sycl::usm_allocator; - using usmshT = std::vector; - - usm_host_allocatorT allocator(exec_q); - auto shp_host_shape_and_strides = - std::make_shared(3 * nd, allocator); - - std::copy(simplified_shape.begin(), simplified_shape.end(), - shp_host_shape_and_strides->begin()); - (*shp_host_shape_and_strides)[nd - 2] = src_shape[src_nd - 2]; - (*shp_host_shape_and_strides)[nd - 1] = src_shape[src_nd - 1]; - - std::copy(simplified_src_strides.begin(), simplified_src_strides.end(), - shp_host_shape_and_strides->begin() + nd); - (*shp_host_shape_and_strides)[2 * nd - 2] = src_strides[src_nd - 2]; - (*shp_host_shape_and_strides)[2 * nd - 1] = src_strides[src_nd - 1]; - - std::copy(simplified_dst_strides.begin(), simplified_dst_strides.end(), - shp_host_shape_and_strides->begin() + 2 * nd); - (*shp_host_shape_and_strides)[3 * nd - 2] = dst_strides[src_nd - 2]; - (*shp_host_shape_and_strides)[3 * nd - 1] = dst_strides[src_nd - 1]; - - py::ssize_t *dev_shape_and_strides = - sycl::malloc_device(3 * nd, exec_q); - if (dev_shape_and_strides == nullptr) { - throw std::runtime_error("Unabled to allocate device memory"); - } - sycl::event copy_shape_and_strides = exec_q.copy( - shp_host_shape_and_strides->data(), dev_shape_and_strides, 3 * nd); - - py::ssize_t inner_range = src_shape[src_nd - 1] * src_shape[src_nd - 2]; - py::ssize_t outer_range = src_nelems / inner_range; - - sycl::event tri_ev; - if (part == 'l') { - auto fn = tril_generic_dispatch_vector[src_typeid]; - tri_ev = - fn(exec_q, inner_range, outer_range, src_data, dst_data, nd, - dev_shape_and_strides, k, depends, {copy_shape_and_strides}); - } - else { - auto fn = triu_generic_dispatch_vector[src_typeid]; - tri_ev = - fn(exec_q, inner_range, outer_range, src_data, dst_data, nd, - dev_shape_and_strides, k, depends, {copy_shape_and_strides}); - } - - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on({tri_ev}); - auto ctx = exec_q.get_context(); - cgh.host_task( - [shp_host_shape_and_strides, dev_shape_and_strides, ctx]() { - // capture of shp_host_shape_and_strides ensure the 
underlying - // vector exists for the entire execution of copying kernel - sycl::free(dev_shape_and_strides, ctx); - }); - }); - - return std::make_pair(keep_args_alive(exec_q, {src, dst}, {tri_ev}), - tri_ev); -} +using dpctl::tensor::py_internal::usm_ndarray_triul; // populate dispatch tables -void init_copy_and_cast_dispatch_tables(void) +void init_dispatch_tables(void) { - using namespace dpctl::tensor::detail; - - DispatchTableBuilder - dtb_generic; - dtb_generic.populate_dispatch_table(copy_and_cast_generic_dispatch_table); - - DispatchTableBuilder - dtb_1d; - dtb_1d.populate_dispatch_table(copy_and_cast_1d_dispatch_table); - - DispatchTableBuilder - dtb_2d; - dtb_2d.populate_dispatch_table(copy_and_cast_2d_dispatch_table); - - DispatchTableBuilder - dtb_copy_from_numpy; - - dtb_copy_from_numpy.populate_dispatch_table( - copy_and_cast_from_host_blocking_dispatch_table); + using namespace dpctl::tensor::py_internal; + init_copy_and_cast_usm_to_usm_dispatch_tables(); + init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(); return; } // populate dispatch vectors -void init_copy_for_reshape_dispatch_vector(void) +void init_dispatch_vectors(void) { - using namespace dpctl::tensor::detail; - - DispatchVectorBuilder - dvb; - dvb.populate_dispatch_vector(copy_for_reshape_generic_dispatch_vector); - - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(lin_space_step_dispatch_vector); - - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(lin_space_affine_dispatch_vector); + using namespace dpctl::tensor::py_internal; - DispatchVectorBuilder - dvb3; - dvb3.populate_dispatch_vector(full_contig_dispatch_vector); - - DispatchVectorBuilder dvb4; - dvb4.populate_dispatch_vector(eye_dispatch_vector); - - DispatchVectorBuilder dvb5; - dvb5.populate_dispatch_vector(tril_generic_dispatch_vector); - - DispatchVectorBuilder dvb6; - dvb6.populate_dispatch_vector(triu_generic_dispatch_vector); + init_copy_for_reshape_dispatch_vectors(); + init_linear_sequences_dispatch_vectors(); + init_full_ctor_dispatch_vectors(); + init_eye_ctor_dispatch_vectors(); + init_triul_ctor_dispatch_vectors(); return; } -std::string get_default_device_fp_type(sycl::device d) -{ - if (d.has(sycl::aspect::fp64)) { - return "f8"; - } - else { - return "f4"; - } -} - -std::string get_default_device_int_type(sycl::device) -{ - return "i8"; -} - -std::string get_default_device_complex_type(sycl::device d) -{ - if (d.has(sycl::aspect::fp64)) { - return "c16"; - } - else { - return "c8"; - } -} - -std::string get_default_device_bool_type(sycl::device) -{ - return "b1"; -} - } // namespace PYBIND11_MODULE(_tensor_impl, m) { - - init_copy_and_cast_dispatch_tables(); - init_copy_for_reshape_dispatch_vector(); - import_dpctl(); - - // populate types constants for type dispatching functions - array_types = dpctl::tensor::detail::usm_ndarray_types::get(); + init_dispatch_tables(); + init_dispatch_vectors(); m.def( - "_contract_iter", &contract_iter, + "_contract_iter", &contract_iter, "Simplifies iteration of array of given shape & stride. Returns " "a triple: shape, stride and offset for the new iterator of possible " "smaller dimension, which traverses the same elements as the original " @@ -2337,7 +125,7 @@ PYBIND11_MODULE(_tensor_impl, m) py::arg("depends") = py::list()); m.def( - "_contract_iter2", &contract_iter2, + "_contract_iter2", &contract_iter2, "Simplifies iteration over elements of pair of arrays of given shape " "with strides stride1 and stride2. 
Returns " "a 5-tuple: shape, stride and offset for the new iterator of possible " @@ -2381,7 +169,7 @@ PYBIND11_MODULE(_tensor_impl, m) py::arg("fill_value"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list()); - m.def("_eye", &eye, + m.def("_eye", &usm_ndarray_eye, "Fills input 2D contiguous usm_ndarray `dst` with " "zeros outside of the diagonal " "specified by " @@ -2391,57 +179,43 @@ PYBIND11_MODULE(_tensor_impl, m) py::arg("k"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list()); - m.def("default_device_fp_type", [](sycl::queue q) -> std::string { - return get_default_device_fp_type(q.get_device()); - }); - m.def("default_device_fp_type_device", [](sycl::device dev) -> std::string { - return get_default_device_fp_type(dev); - }); - - m.def("default_device_int_type", [](sycl::queue q) -> std::string { - return get_default_device_int_type(q.get_device()); - }); - m.def("default_device_int_type_device", - [](sycl::device dev) -> std::string { - return get_default_device_int_type(dev); - }); - - m.def("default_device_bool_type", [](sycl::queue q) -> std::string { - return get_default_device_bool_type(q.get_device()); - }); - m.def("default_device_bool_type_device", - [](sycl::device dev) -> std::string { - return get_default_device_bool_type(dev); - }); - - m.def("default_device_complex_type", [](sycl::queue q) -> std::string { - return get_default_device_complex_type(q.get_device()); - }); - m.def("default_device_complex_type_device", - [](sycl::device dev) -> std::string { - return get_default_device_complex_type(dev); - }); - m.def( - "_tril", - [](dpctl::tensor::usm_ndarray src, dpctl::tensor::usm_ndarray dst, - py::ssize_t k, sycl::queue exec_q, - const std::vector depends) - -> std::pair { - return tri(exec_q, src, dst, 'l', k, depends); - }, - "Tril helper function.", py::arg("src"), py::arg("dst"), - py::arg("k") = 0, py::arg("sycl_queue"), - py::arg("depends") = py::list()); + m.def("default_device_fp_type", + dpctl::tensor::py_internal::default_device_fp_type, + "Gives default floating point type supported by device.", + py::arg("dev")); + + m.def("default_device_int_type", + dpctl::tensor::py_internal::default_device_int_type, + "Gives default integer type supported by device.", py::arg("dev")); + + m.def("default_device_bool_type", + dpctl::tensor::py_internal::default_device_bool_type, + "Gives default boolean type supported by device.", py::arg("dev")); + + m.def("default_device_complex_type", + dpctl::tensor::py_internal::default_device_complex_type, + "Gives default complex floating point type support by device.", + py::arg("dev")); + + auto tril_fn = [](dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray dst, py::ssize_t k, + sycl::queue exec_q, + const std::vector depends) + -> std::pair { + return usm_ndarray_triul(exec_q, src, dst, 'l', k, depends); + }; + m.def("_tril", tril_fn, "Tril helper function.", py::arg("src"), + py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), + py::arg("depends") = py::list()); - m.def( - "_triu", - [](dpctl::tensor::usm_ndarray src, dpctl::tensor::usm_ndarray dst, - py::ssize_t k, sycl::queue exec_q, - const std::vector depends) - -> std::pair { - return tri(exec_q, src, dst, 'u', k, depends); - }, - "Triu helper function.", py::arg("src"), py::arg("dst"), - py::arg("k") = 0, py::arg("sycl_queue"), - py::arg("depends") = py::list()); + auto triu_fn = [](dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray dst, py::ssize_t k, + sycl::queue exec_q, + const std::vector depends) 
+ -> std::pair<sycl::event, sycl::event> { + return usm_ndarray_triul(exec_q, src, dst, 'u', k, depends); + }; + m.def("_triu", triu_fn, "Triu helper function.", py::arg("src"), + py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), + py::arg("depends") = py::list()); } diff --git a/dpctl/tensor/libtensor/source/triul_ctor.cpp b/dpctl/tensor/libtensor/source/triul_ctor.cpp new file mode 100644 index 0000000000..60e5572d6d --- /dev/null +++ b/dpctl/tensor/libtensor/source/triul_ctor.cpp @@ -0,0 +1,287 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include <CL/sycl.hpp> +#include <utility> +#include <vector> + +#include "dpctl4pybind11.hpp" +#include <pybind11/pybind11.h> + +#include "kernels/constructors.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/type_dispatch.hpp" + +namespace py = pybind11; +namespace _ns = dpctl::tensor::detail; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +using dpctl::tensor::c_contiguous_strides; +using dpctl::tensor::f_contiguous_strides; +using dpctl::utils::keep_args_alive; + +using dpctl::tensor::kernels::constructors::tri_fn_ptr_t; + +static tri_fn_ptr_t tril_generic_dispatch_vector[_ns::num_types]; +static tri_fn_ptr_t triu_generic_dispatch_vector[_ns::num_types]; + +std::pair<sycl::event, sycl::event> +usm_ndarray_triul(sycl::queue exec_q, + dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray dst, + char part, + py::ssize_t k = 0, + const std::vector<sycl::event> &depends = {}) +{ + // array dimensions must be the same + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + if (src_nd != dst_nd) { + throw py::value_error("Array dimensions are not the same."); + } + + if (src_nd < 2) { + throw py::value_error("Array dimensions less than 2."); + } + + // shapes must be the same + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + + bool shapes_equal(true); + size_t src_nelems(1); + + for (int i = 0; shapes_equal && i < src_nd; ++i) { + src_nelems *= static_cast<size_t>(src_shape[i]); + shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]); + } + if (!shapes_equal) { + throw py::value_error("Array shapes are not the same."); + } + + if (src_nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + // check that arrays do not overlap, and concurrent copying is safe.
+ auto src_offsets = src.get_minmax_offsets(); + auto dst_offsets = dst.get_minmax_offsets(); + int src_elem_size = src.get_elemsize(); + int dst_elem_size = dst.get_elemsize(); + + bool memory_overlap = + ((dst_data - src_data > src_offsets.second * src_elem_size - + dst_offsets.first * dst_elem_size) && + (src_data - dst_data > dst_offsets.second * dst_elem_size - + src_offsets.first * src_elem_size)); + if (memory_overlap) { + // TODO: could use a temporary, but this is done by the caller + throw py::value_error("Arrays index overlapping segments of memory"); + } + + auto array_types = dpctl::tensor::detail::usm_ndarray_types(); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + if (dst_typeid != src_typeid) { + throw py::value_error("Array dtypes are not the same."); + } + + // check same contexts + sycl::queue src_q = src.get_queue(); + sycl::queue dst_q = dst.get_queue(); + + if (!dpctl::utils::queues_are_compatible(exec_q, {src_q, dst_q})) { + throw py::value_error( + "Execution queue context is not the same as allocation contexts"); + } + + using shT = std::vector<py::ssize_t>; + shT src_strides(src_nd); + + bool is_src_c_contig = src.is_c_contiguous(); + bool is_src_f_contig = src.is_f_contiguous(); + + const py::ssize_t *src_strides_raw = src.get_strides_raw(); + if (src_strides_raw == nullptr) { + if (is_src_c_contig) { + src_strides = c_contiguous_strides(src_nd, src_shape); + } + else if (is_src_f_contig) { + src_strides = f_contiguous_strides(src_nd, src_shape); + } + else { + throw std::runtime_error("Source array has null strides but has " + "neither C- nor F- contiguous flag set"); + } + } + else { + std::copy(src_strides_raw, src_strides_raw + src_nd, + src_strides.begin()); + } + + shT dst_strides(src_nd); + + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_dst_f_contig = dst.is_f_contiguous(); + + const py::ssize_t *dst_strides_raw = dst.get_strides_raw(); + if (dst_strides_raw == nullptr) { + if (is_dst_c_contig) { + dst_strides = + dpctl::tensor::c_contiguous_strides(src_nd, src_shape); + } + else if (is_dst_f_contig) { + dst_strides = + dpctl::tensor::f_contiguous_strides(src_nd, src_shape); + } + else { + throw std::runtime_error("Destination array has null strides but " + "has neither C- nor F- contiguous flag set"); + } + } + else { + std::copy(dst_strides_raw, dst_strides_raw + dst_nd, + dst_strides.begin()); + } + + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + constexpr py::ssize_t src_itemsize = 1; // item size in elements + constexpr py::ssize_t dst_itemsize = 1; // item size in elements + + int nd = src_nd - 2; + const py::ssize_t *shape = src_shape; + const py::ssize_t *p_src_strides = src_strides.data(); + const py::ssize_t *p_dst_strides = dst_strides.data(); + + simplify_iteration_space(nd, shape, p_src_strides, src_itemsize, + is_src_c_contig, is_src_f_contig, p_dst_strides, + dst_itemsize, is_dst_c_contig, is_dst_f_contig, + simplified_shape, simplified_src_strides, + simplified_dst_strides, src_offset, dst_offset); + + if (src_offset != 0 || dst_offset != 0) { + throw py::value_error("Reversed slice for dst is not supported"); + } + + nd += 2; + + using usm_host_allocatorT = + sycl::usm_allocator<py::ssize_t, sycl::usm::alloc::host>; + using usmshT = std::vector<py::ssize_t, usm_host_allocatorT>; + + usm_host_allocatorT allocator(exec_q); + auto shp_host_shape_and_strides =
+ std::make_shared<usmshT>(3 * nd, allocator); + + std::copy(simplified_shape.begin(), simplified_shape.end(), + shp_host_shape_and_strides->begin()); + (*shp_host_shape_and_strides)[nd - 2] = src_shape[src_nd - 2]; + (*shp_host_shape_and_strides)[nd - 1] = src_shape[src_nd - 1]; + + std::copy(simplified_src_strides.begin(), simplified_src_strides.end(), + shp_host_shape_and_strides->begin() + nd); + (*shp_host_shape_and_strides)[2 * nd - 2] = src_strides[src_nd - 2]; + (*shp_host_shape_and_strides)[2 * nd - 1] = src_strides[src_nd - 1]; + + std::copy(simplified_dst_strides.begin(), simplified_dst_strides.end(), + shp_host_shape_and_strides->begin() + 2 * nd); + (*shp_host_shape_and_strides)[3 * nd - 2] = dst_strides[src_nd - 2]; + (*shp_host_shape_and_strides)[3 * nd - 1] = dst_strides[src_nd - 1]; + + py::ssize_t *dev_shape_and_strides = + sycl::malloc_device<py::ssize_t>(3 * nd, exec_q); + if (dev_shape_and_strides == nullptr) { + throw std::runtime_error("Unable to allocate device memory"); + } + sycl::event copy_shape_and_strides = exec_q.copy<py::ssize_t>( + shp_host_shape_and_strides->data(), dev_shape_and_strides, 3 * nd); + + py::ssize_t inner_range = src_shape[src_nd - 1] * src_shape[src_nd - 2]; + py::ssize_t outer_range = src_nelems / inner_range; + + sycl::event tri_ev; + if (part == 'l') { + auto fn = tril_generic_dispatch_vector[src_typeid]; + tri_ev = + fn(exec_q, inner_range, outer_range, src_data, dst_data, nd, + dev_shape_and_strides, k, depends, {copy_shape_and_strides}); + } + else { + auto fn = triu_generic_dispatch_vector[src_typeid]; + tri_ev = + fn(exec_q, inner_range, outer_range, src_data, dst_data, nd, + dev_shape_and_strides, k, depends, {copy_shape_and_strides}); + } + + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on({tri_ev}); + auto ctx = exec_q.get_context(); + cgh.host_task( + [shp_host_shape_and_strides, dev_shape_and_strides, ctx]() { + // capture of shp_host_shape_and_strides ensures the underlying + // vector exists for the entire execution of the copy kernel + sycl::free(dev_shape_and_strides, ctx); + }); + }); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, {tri_ev}), + tri_ev); +} + +void init_triul_ctor_dispatch_vectors(void) +{ + + using namespace dpctl::tensor::detail; + using dpctl::tensor::kernels::constructors::TrilGenericFactory; + using dpctl::tensor::kernels::constructors::TriuGenericFactory; + + DispatchVectorBuilder<tri_fn_ptr_t, TrilGenericFactory, num_types> dvb1; + dvb1.populate_dispatch_vector(tril_generic_dispatch_vector); + + DispatchVectorBuilder<tri_fn_ptr_t, TriuGenericFactory, num_types> dvb2; + dvb2.populate_dispatch_vector(triu_generic_dispatch_vector); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/triul_ctor.hpp b/dpctl/tensor/libtensor/source/triul_ctor.hpp new file mode 100644 index 0000000000..3789df80c5 --- /dev/null +++ b/dpctl/tensor/libtensor/source/triul_ctor.hpp @@ -0,0 +1,52 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include <CL/sycl.hpp> +#include <utility> +#include <vector> + +#include "dpctl4pybind11.hpp" +#include <pybind11/pybind11.h> + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair<sycl::event, sycl::event> +usm_ndarray_triul(sycl::queue exec_q, + dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray dst, + char part, + py::ssize_t k = 0, + const std::vector<sycl::event> &depends = {}); + +extern void init_triul_ctor_dispatch_vectors(void); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl
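
The element-selection rule that the relocated tril/triu kernels apply to the trailing two-dimensional block is shown in the removed tri_impl above: for tril an element at (row, col) is copied when row + k >= col, for triu when row + k <= col, and every other destination element is set to zero. The kernels now live in kernels/constructors.hpp, which this patch does not reproduce, so the following is only a standalone host-side C++ sketch of that predicate; the 4x4 shape and the diagonal offset k = 1 are arbitrary illustrative choices, not values taken from the patch.

#include <cstdio>

int main()
{
    const int rows = 4, cols = 4, k = 1;
    for (int part = 0; part < 2; ++part) {
        std::printf(part == 0 ? "tril, k=%d\n" : "triu, k=%d\n", k);
        for (int r = 0; r < rows; ++r) {
            for (int c = 0; c < cols; ++c) {
                // mirrors the kernel's to_copy test: keep (1) or zero out (0)
                const bool keep = (part == 0) ? (r + k >= c) : (r + k <= c);
                std::printf("%d ", keep ? 1 : 0);
            }
            std::printf("\n");
        }
    }
    return 0;
}

Printing the two masks makes it easy to check that the boundary handling (the k-th diagonal itself is always kept by both variants) matches what the dispatch into tril_generic_dispatch_vector or triu_generic_dispatch_vector produces for contiguous inputs.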