diff --git a/examples/pybind11/external_usm_allocation/CMakeLists.txt b/examples/pybind11/external_usm_allocation/CMakeLists.txt
new file mode 100644
index 0000000000..6f42a3e6d3
--- /dev/null
+++ b/examples/pybind11/external_usm_allocation/CMakeLists.txt
@@ -0,0 +1,36 @@
+cmake_minimum_required(VERSION 3.21)
+
+project(external_usm_allocation LANGUAGES CXX)
+
+set(DPCTL_CMAKE_MODULES_PATH "${CMAKE_SOURCE_DIR}/../../../cmake")
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${DPCTL_CMAKE_MODULES_PATH})
+find_package(IntelDPCPP REQUIRED PATHS ${DPCTL_CMAKE_MODULES_PATH} NO_DEFAULT_PATH)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED True)
+
+# Fetch pybind11
+include(FetchContent)
+FetchContent_Declare(
+  pybind11
+  URL https://github.com/pybind/pybind11/archive/refs/tags/v2.9.2.tar.gz
+  URL_HASH SHA256=6bd528c4dbe2276635dc787b6b1f2e5316cf6b49ee3e150264e455a0d68d19c1
+)
+FetchContent_MakeAvailable(pybind11)
+
+find_package(PythonExtensions REQUIRED)
+find_package(Dpctl REQUIRED)
+find_package(NumPy REQUIRED)
+
+set(py_module_name _external_usm_alloc)
+pybind11_add_module(${py_module_name}
+    MODULE
+    external_usm_allocation/_usm_alloc_example.cpp
+)
+target_include_directories(${py_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS})
+target_compile_options(${py_module_name} PRIVATE -Wno-deprecated-declarations)
+install(TARGETS ${py_module_name}
+  DESTINATION external_usm_allocation
+)
+
+set(ignoreMe "${SKBUILD}")
diff --git a/examples/pybind11/external_usm_allocation/README.md b/examples/pybind11/external_usm_allocation/README.md
index 7431ba4ab6..0b8dd62dc2 100644
--- a/examples/pybind11/external_usm_allocation/README.md
+++ b/examples/pybind11/external_usm_allocation/README.md
@@ -10,6 +10,7 @@ to dpctl.memory entities using `__sycl_usm_array_interface__`.
 ```
 source /opt/intel/oneapi/compiler/latest/env/vars.sh
 CXX=dpcpp CC=dpcpp python setup.py build_ext --inplace
+python -m pytest tests
 python example.py
 ```
 
@@ -17,7 +18,7 @@ python example.py
 
 ```
 (idp) [12:43:20 ansatnuc04 external_usm_allocation]$ python example.py
-<external_usm_alloc.DMatrix object at 0x7f2b98b4cef0>
+<external_usm_allocation._external_usm_alloc.DMatrix object at 0x7f2b98b4cef0>
 {'data': [94846745444352, True], 'shape': (5, 5), 'strides': None, 'version': 1, 'typestr': '|f8', 'syclobj': <capsule object "SyclQueueRef" at 0x7f2b9b941d80>}
 shared
 
diff --git a/examples/pybind11/external_usm_allocation/example.py b/examples/pybind11/external_usm_allocation/example.py
index 4455616146..6001d87eca 100644
--- a/examples/pybind11/external_usm_allocation/example.py
+++ b/examples/pybind11/external_usm_allocation/example.py
@@ -16,7 +16,7 @@
 
 # coding: utf-8
 
-import external_usm_alloc as eua
+import external_usm_allocation as eua
 import numpy as np
 
 import dpctl
diff --git a/examples/pybind11/external_usm_allocation/external_usm_allocation/__init__.py b/examples/pybind11/external_usm_allocation/external_usm_allocation/__init__.py
new file mode 100644
index 0000000000..1af16147fa
--- /dev/null
+++ b/examples/pybind11/external_usm_allocation/external_usm_allocation/__init__.py
@@ -0,0 +1,27 @@
+#                      Data Parallel Control (dpctl)
+#
+# Copyright 2020-2021 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# coding: utf-8
+
+from ._external_usm_alloc import DMatrix
+
+__all__ = ["DMatrix"]
+
+__doc__ = """
+   Example of implementing C++ class with its own USM memory allocation logic
+and interfacing that allocation with `dpctl` by implementing
+`__sycl_usm_array_interface__`.
+"""
diff --git a/examples/pybind11/external_usm_allocation/_usm_alloc_example.cpp b/examples/pybind11/external_usm_allocation/external_usm_allocation/_usm_alloc_example.cpp
similarity index 99%
rename from examples/pybind11/external_usm_allocation/_usm_alloc_example.cpp
rename to examples/pybind11/external_usm_allocation/external_usm_allocation/_usm_alloc_example.cpp
index 30216dac55..2b8212076b 100644
--- a/examples/pybind11/external_usm_allocation/_usm_alloc_example.cpp
+++ b/examples/pybind11/external_usm_allocation/external_usm_allocation/_usm_alloc_example.cpp
@@ -132,7 +132,7 @@ py::list tolist(DMatrix &m)
     return rows;
 }
 
-PYBIND11_MODULE(external_usm_alloc, m)
+PYBIND11_MODULE(_external_usm_alloc, m)
 {
     // Import the dpctl extensions
     import_dpctl();
diff --git a/examples/pybind11/external_usm_allocation/setup.py b/examples/pybind11/external_usm_allocation/setup.py
index 2802075d41..86abd2854a 100644
--- a/examples/pybind11/external_usm_allocation/setup.py
+++ b/examples/pybind11/external_usm_allocation/setup.py
@@ -14,21 +14,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from pybind11.setup_helpers import Pybind11Extension
-from setuptools import setup
+from skbuild import setup
 
-import dpctl
-
-ext_modules = [
-    Pybind11Extension(
-        "external_usm_alloc",
-        ["./_usm_alloc_example.cpp"],
-        include_dirs=[dpctl.get_include()],
-        extra_compile_args=["-fPIC"],
-        extra_link_args=["-fPIC"],
-        libraries=["sycl"],
-        language="c++",
-    )
-]
-
-setup(name="external_usm_alloc", ext_modules=ext_modules)
+setup(
+    name="external_usm_allocation",
+    version="0.0.1",
+    description="an example of SYCL-powered Python package (with pybind11)",
+    author="Intel Scripting",
+    license="Apache 2.0",
+    packages=["external_usm_allocation"],
+)
diff --git a/examples/pybind11/external_usm_allocation/tests/test_dmatrix.py b/examples/pybind11/external_usm_allocation/tests/test_dmatrix.py
new file mode 100644
index 0000000000..96bfb951e8
--- /dev/null
+++ b/examples/pybind11/external_usm_allocation/tests/test_dmatrix.py
@@ -0,0 +1,48 @@
+#                      Data Parallel Control (dpctl)
+#
+# Copyright 2020-2021 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# coding: utf-8
+
+import external_usm_allocation as eua
+import numpy as np
+
+import dpctl
+import dpctl.memory as dpm
+
+
+def test_dmatrix():
+    q = dpctl.SyclQueue()
+    matr = eua.DMatrix(q, 5, 5)
+    assert hasattr(matr, "__sycl_usm_array_interface__")
+
+    blob = dpm.as_usm_memory(matr)
+    assert blob.get_usm_type() == "shared"
+
+    Xh = np.array(
+        [
+            [1, 1, 1, 2, 2],
+            [1, 0, 1, 2, 2],
+            [1, 1, 0, 2, 2],
+            [0, 0, 0, 3, -1],
+            [0, 0, 0, -1, 5],
+        ],
+        dtype="d",
+    )
+    host_bytes_view = Xh.reshape((-1)).view(np.ubyte)
+    blob.copy_from_host(host_bytes_view)
+
+    list_of_lists = matr.tolist()
+    assert list_of_lists == Xh.tolist()
diff --git a/examples/pybind11/onemkl_gemv/CMakeLists.txt b/examples/pybind11/onemkl_gemv/CMakeLists.txt
index 3ba71c6252..848e8c7727 100644
--- a/examples/pybind11/onemkl_gemv/CMakeLists.txt
+++ b/examples/pybind11/onemkl_gemv/CMakeLists.txt
@@ -38,7 +38,7 @@ pybind11_add_module(${py_module_name}
     sycl_gemm/_onemkl.cpp
 )
 target_include_directories(${py_module_name}
-    PUBLIC ${MKL_INCLUDE_DIR} ${TBB_INCLUDE_DIR}
+    PUBLIC ${MKL_INCLUDE_DIR} ${TBB_INCLUDE_DIR} sycl_gemm
 )
 target_link_libraries(${py_module_name}
     PRIVATE ${mkl_sycl} ${mkl_intel_ilp64} ${mkl_tbb_thread} ${mkl_core} ${tbb}
@@ -53,4 +53,18 @@ set_source_files_properties(${_sycl_gemm_sources}
   COMPILE_OPTIONS "-O3;-Wno-deprecated-declarations"
 )
 
+add_executable(standalone_cpp
+  EXCLUDE_FROM_ALL
+  cpp/main.cpp
+)
+target_compile_options(standalone_cpp
+  PRIVATE -O3 -Wno-deprecated-declarations
+)
+target_include_directories(standalone_cpp
+  PUBLIC ${MKL_INCLUDE_DIR} ${TBB_INCLUDE_DIR} sycl_gemm
+  )
+target_link_libraries(standalone_cpp
+    PRIVATE ${mkl_sycl} ${mkl_intel_ilp64} ${mkl_tbb_thread} ${mkl_core} ${tbb}
+)
+
 set(ignoreMe "${SKBUILD}")
diff --git a/examples/pybind11/onemkl_gemv/README.md b/examples/pybind11/onemkl_gemv/README.md
index 9645a4dd92..e984a5e013 100644
--- a/examples/pybind11/onemkl_gemv/README.md
+++ b/examples/pybind11/onemkl_gemv/README.md
@@ -1,9 +1,15 @@
 Example of SYCL built pybind11 extension
 
-To build, use (assumes scikit-build and dpcpp) is installed
+To build, use (assumes scikit-build and dpcpp is installed):
 
 ```sh
-python setup.py develop -- -G "Ninja" -DCMAKE_C_COMPILER:PATH=icx -DCMAKE_CXX_COMPILER:PATH=icpx -DTBB_LIBRARY_DIR=$CONDA_PREFIX/lib -DMKL_LIBRARY_DIR=${CONDA_PREFIX}/lib -DMKL_INCLUDE_DIR=${CONDA_PREFIX}/include -DTBB_INCLUDE_DIR=${CONDA_PREFIX}/include
+python setup.py develop -- -G "Ninja" \
+     -DCMAKE_C_COMPILER:PATH=icx \
+     -DCMAKE_CXX_COMPILER:PATH=icpx \
+     -DTBB_LIBRARY_DIR=$CONDA_PREFIX/lib \
+     -DMKL_LIBRARY_DIR=${CONDA_PREFIX}/lib \
+     -DMKL_INCLUDE_DIR=${CONDA_PREFIX}/include \
+     -DTBB_INCLUDE_DIR=${CONDA_PREFIX}/include
 ```
 
 To run test suite
@@ -11,3 +17,16 @@ To run test suite
 ```sh
 python -m pytest tests
 ```
+
+To compare Python overhead,
+
+```
+# build standad-alone executable
+cmake --build $(find . -name cmake-build) --target standalone_cpp
+# execute it
+$(find . -name cmake-build)/standalone_cpp 1000 11
+# launch Python computatin
+python sycl_timing_solver.py 1000 11
+```
+
+Compare host times vs. C++ wall-clock times while making sure that the number of iterations is the same
diff --git a/examples/pybind11/onemkl_gemv/cpp/main.cpp b/examples/pybind11/onemkl_gemv/cpp/main.cpp
new file mode 100644
index 0000000000..c0f9a6115b
--- /dev/null
+++ b/examples/pybind11/onemkl_gemv/cpp/main.cpp
@@ -0,0 +1,128 @@
+#include "cg_solver.hpp"
+#include <CL/sycl.hpp>
+#include <chrono>
+#include <iostream>
+#include <oneapi/mkl.hpp>
+
+using T = double;
+
+int main(int argc, char *argv[])
+{
+    size_t n = 1000;
+    size_t rank = 16;
+
+    if (argc > 1) {
+        n = std::stoi(argv[1]);
+    }
+
+    if (argc > 2) {
+        rank = std::stoi(argv[2]);
+    }
+
+    std::cout << "Solving " << n << " by " << n << " diagonal system with rank-"
+              << rank << " perturbation." << std::endl;
+
+    sycl::queue q;
+
+    // USM allocation for data needed by program
+    size_t buf_size = n * n + rank * n + 2 * n;
+    T *buf = sycl::malloc_device<T>(buf_size, q);
+    sycl::event memset_ev = q.fill<T>(buf, T(0), buf_size);
+
+    T *Amat = buf;
+    T *umat = buf + n * n;
+    T *bvec = umat + rank * n;
+    T *sol_vec = bvec + n;
+
+    sycl::event set_diag_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on({memset_ev});
+        cgh.parallel_for({n}, [=](sycl::id<1> id) {
+            auto i = id[0];
+            Amat[i * (n + 1)] = T(1);
+        });
+    });
+
+    oneapi::mkl::rng::philox4x32x10 engine(q, 7777);
+    oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::icdf>
+        distr(0.0, 1.0);
+
+    // populate umat and bvec in one call
+    sycl::event umat_rand_ev =
+        oneapi::mkl::rng::generate(distr, engine, n * rank + n, umat);
+
+    sycl::event syrk_ev = oneapi::mkl::blas::row_major::syrk(
+        q, oneapi::mkl::uplo::U, oneapi::mkl::transpose::N, n, rank, T(1), umat,
+        rank, T(1), Amat, n, {umat_rand_ev, set_diag_ev});
+
+    // need to transpose
+    sycl::event transpose_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(syrk_ev);
+        cgh.parallel_for({n * n}, [=](sycl::id<1> id) {
+            size_t i = id[0];
+            size_t i0 = i / n;
+            size_t i1 = i - i0 * n;
+            if (i0 > i1) {
+                Amat[i] = Amat[i1 * n + i0];
+            }
+        });
+    });
+
+    q.wait();
+
+    constexpr int reps = 6;
+
+    std::vector<double> time;
+    std::vector<int> conv_iters;
+
+    time.reserve(reps);
+    conv_iters.reserve(reps);
+    for (int i = 0; i < reps; ++i) {
+        auto start = std::chrono::high_resolution_clock::now();
+        int conv_iter_count = cg_solver::cg_solve(q, n, Amat, bvec, sol_vec);
+        auto end = std::chrono::high_resolution_clock::now();
+
+        time.push_back(
+            std::chrono::duration_cast<std::chrono::nanoseconds>(end - start)
+                .count() *
+            1e-06);
+
+        conv_iters.push_back(conv_iter_count);
+    }
+
+    std::cout << "Converged in : ";
+    for (auto &el : conv_iters) {
+        std::cout << el << " , ";
+    }
+    std::cout << std::endl;
+
+    std::cout << "Wall-clock cg_solve execution times: ";
+    for (auto &el : time) {
+        std::cout << el << " , ";
+    }
+    std::cout << std::endl;
+
+    T *Ax = sycl::malloc_device<T>(2 * n + 1, q);
+    T *delta = Ax + n;
+
+    sycl::event gemv_ev = oneapi::mkl::blas::row_major::gemv(
+        q, oneapi::mkl::transpose::N, n, n, T(1), Amat, n, sol_vec, 1, T(0), Ax,
+        1);
+
+    sycl::event sub_ev = oneapi::mkl::vm::sub(q, n, Ax, bvec, delta, {gemv_ev},
+                                              oneapi::mkl::vm::mode::ha);
+
+    T *n2 = delta + n;
+    sycl::event dot_ev = oneapi::mkl::blas::row_major::dot(
+        q, n, delta, 1, delta, 1, n2, {sub_ev});
+
+    T n2_host{};
+    q.copy<T>(n2, &n2_host, 1, {dot_ev}).wait_and_throw();
+
+    std::cout << "Redisual norm squared: " << n2_host << std::endl;
+
+    q.wait_and_throw();
+    sycl::free(Ax, q);
+    sycl::free(buf, q);
+
+    return 0;
+}
diff --git a/examples/pybind11/onemkl_gemv/setup.py b/examples/pybind11/onemkl_gemv/setup.py
index dd00d8e1a9..1ed65e3af8 100644
--- a/examples/pybind11/onemkl_gemv/setup.py
+++ b/examples/pybind11/onemkl_gemv/setup.py
@@ -1,3 +1,19 @@
+#                      Data Parallel Control (dpctl)
+#
+# Copyright 2020-2021 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from skbuild import setup
 
 setup(
diff --git a/examples/pybind11/onemkl_gemv/solve.py b/examples/pybind11/onemkl_gemv/solve.py
new file mode 100644
index 0000000000..6304808078
--- /dev/null
+++ b/examples/pybind11/onemkl_gemv/solve.py
@@ -0,0 +1,258 @@
+#                      Data Parallel Control (dpctl)
+#
+# Copyright 2020-2021 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import sycl_gemm
+
+import dpctl
+import dpctl.tensor as dpt
+
+
+def empty_like(A):
+    return dpt.empty(A.shape, A.dtype, device=A.device)
+
+
+def chebyshev(A, b, x0, nIters, lMax, lMin, depends=[]):
+    """Chebyshev iterative solver using SYCL routines"""
+    d = (lMax + lMin) / 2
+    c = (lMax - lMin) / 2
+
+    x = dpt.copy(x0)
+    exec_queue = A.sycl_queue
+    assert exec_queue == x.sycl_queue
+    Ax = empty_like(A[:, 0])
+    r = empty_like(Ax)
+    p = empty_like(Ax)
+
+    e_x = dpctl.SyclEvent()
+    # Ax = A @ x
+    _, e_dot = sycl_gemm.gemv(exec_queue, A, x, Ax, depends=depends)
+    # r = b - Ax
+    _, e_sub = sycl_gemm.sub(exec_queue, b, Ax, r, depends=[e_dot])
+    r_ev = e_sub
+    for i in range(nIters):
+        z = r
+        z_ev = r_ev
+        if i == 0:
+            p[:] = z
+            alpha = 1 / d
+            _, e_axpby = dpctl.SyclEvent(), dpctl.SyclEvent()
+        elif i == 1:
+            beta = 0.5 * (c * alpha) ** 2
+            alpha = 1 / (d - beta / alpha)
+            # p = z + beta * p
+            _, e_axpby = sycl_gemm.axpby_inplace(
+                exec_queue, 1, z, beta, p, depends=[z_ev]
+            )
+        else:
+            beta = (c / 2 * alpha) ** 2
+            alpha = 1 / (d - beta / alpha)
+            # p = z + beta * p
+            _, e_axpby = sycl_gemm.axpby_inplace(
+                exec_queue, 1, z, beta, p, depends=[z_ev]
+            )
+        # x = x + alpha * p
+        _, e_x = sycl_gemm.axpby_inplace(
+            exec_queue, alpha, p, 1, x, depends=[e_axpby, e_x]
+        )
+        # Ax = A @ x
+        _, e_dot = sycl_gemm.gemv(exec_queue, A, x, Ax, depends=[e_x])
+        # r = b - Ax
+        _, e_sub = sycl_gemm.sub(exec_queue, b, Ax, r, depends=[e_dot])
+        # residual = dot(r, r)
+        residual = sycl_gemm.norm_squared_blocking(
+            exec_queue, r, depends=[e_sub]
+        )
+        if residual <= 1e-29:
+            print(f"chebyshev: converged in {i} iters")
+            break
+    exec_queue.wait()  # wait for all host tasks to complete
+    return x
+
+
+def check_with_numpy(A, b):
+    """Direct solver using numpy"""
+    import numpy as np
+
+    return np.linalg.solve(Anp, bnp)
+
+
+def chebyshev_numpy(A, b, x0, nIters, lMax, lMin):
+    """Chebyshev iterative solver using numpy"""
+    d = (lMax + lMin) / 2
+    c = (lMax - lMin) / 2
+
+    x = x0
+
+    Ax = np.dot(A, x)
+    r = b - Ax
+    for i in range(nIters):
+        z = r
+        if i == 0:
+            p = z
+            alpha = 1 / d
+        elif i == 1:
+            beta = 0.5 * (c * alpha) ** 2
+            alpha = 1 / (d - beta / alpha)
+            p = z + beta * p
+        else:
+            beta = (c / 2 * alpha) ** 2
+            alpha = 1 / (d - beta / alpha)
+            p = z + beta * p
+        x = x + alpha * p
+        Ax = np.dot(A, x)
+        r = b - Ax
+        residual = np.dot(r, r)
+        if residual <= 1e-29:
+            print(f"chebyshev_numpy: converged in {i} iters")
+            break
+    return x
+
+
+def cg_solve(A, b):
+    """
+    Conjugate gradient solver for A @ x == b.
+
+    Returns tuple: (x, converged)
+
+    converged is False if solver has not converged, or the iteration number
+    """
+    exec_queue = A.sycl_queue
+    x = dpt.zeros(b.shape, dtype=b.dtype)
+    Ap = empty_like(x)
+
+    all_host_tasks = []
+    r = dpt.copy(b)
+    p = dpt.copy(b)
+    rsold = sycl_gemm.norm_squared_blocking(exec_queue, r)
+    if rsold < 1e-20:
+        return (b, 0)
+    converged = False
+    max_iters = b.shape[0]
+
+    e_p = dpctl.SyclEvent()
+    e_x = dpctl.SyclEvent()
+    for i in range(max_iters):
+        # Ap = A @ p
+        he_dot, e_dot = sycl_gemm.gemv(exec_queue, A, p, Ap, depends=[e_p])
+        all_host_tasks.append(he_dot)
+        # alpha = rsold / dot(p, Ap)
+        alpha = rsold / sycl_gemm.dot_blocking(
+            exec_queue, p, Ap, depends=[e_dot]
+        )
+        # x = x + alpha * p
+        he1_x_update, e1_x_update = sycl_gemm.axpby_inplace(
+            exec_queue, alpha, p, 1, x, depends=[e_p, e_x]
+        )
+        all_host_tasks.append(he1_x_update)
+        e_x = e1_x_update
+
+        # r = r - alpha * Ap
+        he2_r_update, e2_r_update = sycl_gemm.axpby_inplace(
+            exec_queue, -alpha, Ap, 1, r, depends=[e_p]
+        )
+        all_host_tasks.append(he2_r_update)
+
+        # rsnew = dot(r, r)
+        rsnew = sycl_gemm.norm_squared_blocking(
+            exec_queue, r, depends=[e2_r_update]
+        )
+        if rsnew < 1e-20:
+            e1_x_update.wait()
+            converged = i
+            break
+        beta = rsnew / rsold
+
+        # p = r + beta * p
+        he3_p_update, e3_p_update = sycl_gemm.axpby_inplace(
+            exec_queue, 1, r, beta, p, depends=[e2_r_update]
+        )
+
+        rsold = rsnew
+        all_host_tasks.append(he3_p_update)
+        e_p = e3_p_update
+        e_x = e1_x_update
+
+    dpctl.SyclEvent.wait_for(all_host_tasks)
+    return x, converged
+
+
+def cg_solve_numpy(A, b):
+    x = np.zeros_like(b)
+    r = b - np.dot(A, x)
+    p = r
+    rsold = np.dot(r, r)
+    converged = False
+    max_iters = b.shape[0]
+
+    for i in range(max_iters):
+        Ap = np.dot(A, p)
+        alpha = rsold / np.dot(p, Ap)
+        x = x + alpha * p
+        r = r - alpha * Ap
+        rsnew = np.dot(r, r)
+
+        if rsnew < 1e-20:
+            converged = i
+            break
+
+        beta = rsnew / rsold
+        p = r + beta * p
+        rsold = rsnew
+
+    return (x, converged)
+
+
+if __name__ == "__main__":
+    n = 32
+    Anp = (
+        2 * np.eye(n, n, k=0, dtype="d")
+        + np.eye(n, n, k=1, dtype="d")
+        + np.eye(n, n, k=-1, dtype="d")
+    )
+    bnp = np.geomspace(0.5, 2, n, dtype="d")
+    # bounds on eigenvalues of cartan matrix are, needed only
+    # for the Chebyshev solver
+    lambda_max = 4
+    lambda_min = 4 * np.square(np.sin(np.pi / (2 * (n + 2))))
+
+    q = dpctl.SyclQueue(property="enable_profiling")
+    q.print_device_info()
+    A = dpt.asarray(Anp, dtype="d", usm_type="device", sycl_queue=q)
+    dev = A.device
+    b = dpt.asarray(bnp, dtype="d", usm_type="device", device=dev)
+    x0 = b
+    t = dpctl.SyclTimer()
+    with t(dev.sycl_queue):
+        x, conv = cg_solve(A, b)
+    print("SYCL solver, 1st run: ", (conv, t.dt))
+    with t(dev.sycl_queue):
+        x, conv = cg_solve(A, b)
+    print("SYCL solver, 2nd run: ", (conv, t.dt))
+
+    x_ref = check_with_numpy(Anp, bnp)  # solve usign LU solver
+
+    with t(dev.sycl_queue):
+        x_np, conv = cg_solve_numpy(dpt.asnumpy(A), dpt.asnumpy(b))
+    print("NumPy's powered CG solver: ", (conv, t.dt))
+    print(
+        "SYCL cg-solver solution close to reference: ",
+        np.allclose(dpt.asnumpy(x), x_ref),
+    )
+    print(
+        "NumPy's cg-solver solution close to reference: ",
+        np.allclose(x_np, x_ref),
+    )
diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py b/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py
index 07d7ec6ced..defce5b76f 100644
--- a/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py
+++ b/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py
@@ -1,3 +1,33 @@
-from ._onemkl import gemv
+#                      Data Parallel Control (dpctl)
+#
+# Copyright 2020-2021 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
-__all__ = ["gemv"]
+from ._onemkl import (
+    axpby_inplace,
+    cpp_cg_solve,
+    dot_blocking,
+    gemv,
+    norm_squared_blocking,
+    sub,
+)
+
+__all__ = [
+    "gemv",
+    "sub",
+    "axpby_inplace",
+    "norm_squared_blocking",
+    "dot_blocking",
+    "cpp_cg_solve",
+]
diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp
index 569428004f..0531b8880f 100644
--- a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp
+++ b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp
@@ -1,8 +1,38 @@
+//==- _onemkl.cpp - Example of Pybind11 extension working with  -===//
+//  dpctl Python objects.
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2021 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements Pybind11-generated extension exposing functions that
+/// take dpctl Python objects, such as dpctl.SyclQueue, dpctl.SyclDevice, and
+/// dpctl.tensor.usm_ndarray as arguments.
+///
+//===----------------------------------------------------------------------===//
+
 // clang-format off
 #include <CL/sycl.hpp>
 #include <oneapi/mkl.hpp>
+#include "cg_solver.hpp"
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
+#include <pybind11/complex.h>
 #include "dpctl4pybind11.hpp"
 // clang-format on
 
@@ -11,11 +41,11 @@ namespace py = pybind11;
 using dpctl::utils::keep_args_alive;
 
 std::pair<sycl::event, sycl::event>
-gemv(sycl::queue q,
-     dpctl::tensor::usm_ndarray matrix,
-     dpctl::tensor::usm_ndarray vector,
-     dpctl::tensor::usm_ndarray result,
-     const std::vector<sycl::event> &depends = {})
+py_gemv(sycl::queue q,
+        dpctl::tensor::usm_ndarray matrix,
+        dpctl::tensor::usm_ndarray vector,
+        dpctl::tensor::usm_ndarray result,
+        const std::vector<sycl::event> &depends = {})
 {
     if (matrix.get_ndim() != 2 || vector.get_ndim() != 1 ||
         result.get_ndim() != 1) {
@@ -32,6 +62,15 @@ gemv(sycl::queue q,
         throw std::runtime_error("Inconsistent shapes.");
     }
 
+    auto q_ctx = q.get_context();
+    if (q_ctx != matrix.get_queue().get_context() ||
+        q_ctx != vector.get_queue().get_context() ||
+        q_ctx != result.get_queue().get_context())
+    {
+        throw std::runtime_error(
+            "USM allocation is not bound to the context in execution queue.");
+    }
+
     int mat_flags = matrix.get_flags();
     int v_flags = vector.get_flags();
     int r_flags = result.get_flags();
@@ -110,9 +149,489 @@ gemv(sycl::queue q,
     return std::make_pair(ht_event, res_ev);
 }
 
+template <typename T>
+sycl::event sub_impl(sycl::queue q,
+                     size_t n,
+                     const char *v1_i,
+                     const char *v2_i,
+                     char *r_i,
+                     const std::vector<sycl::event> &depends = {})
+{
+    const T *v1 = reinterpret_cast<const T *>(v1_i);
+    const T *v2 = reinterpret_cast<const T *>(v2_i);
+    T *r = reinterpret_cast<T *>(r_i);
+
+    sycl::event r_ev = cg_solver::detail::sub(q, n, v1, v2, r, depends);
+
+    return r_ev;
+}
+
+// out_r = in_v1 - in_v2
+std::pair<sycl::event, sycl::event>
+py_sub(sycl::queue q,
+       dpctl::tensor::usm_ndarray in_v1,
+       dpctl::tensor::usm_ndarray in_v2,
+       dpctl::tensor::usm_ndarray out_r,
+       const std::vector<sycl::event> &depends = {})
+{
+    if (in_v1.get_ndim() != 1 || in_v2.get_ndim() != 1 || out_r.get_ndim() != 1)
+    {
+        throw std::runtime_error("Inconsistent dimensions, expecting vectors");
+    }
+
+    py::ssize_t n = in_v1.get_shape(0); // get length of the vector
+
+    if (n != in_v2.get_shape(0) || n != out_r.get_shape(0)) {
+        throw std::runtime_error("Vectors must have the same length");
+    }
+
+    if (q.get_context() != in_v1.get_queue().get_context() ||
+        q.get_context() != in_v2.get_queue().get_context() ||
+        q.get_context() != out_r.get_queue().get_context())
+    {
+        throw std::runtime_error(
+            "USM allocation is not bound to the context in execution queue");
+    }
+
+    int in_v1_flags = in_v1.get_flags();
+    int in_v2_flags = in_v2.get_flags();
+    int out_r_flags = out_r.get_flags();
+
+    if (!((in_v1_flags & (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS)) &&
+          (in_v2_flags & (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS)) &&
+          (out_r_flags & (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS))))
+    {
+        throw std::runtime_error("Vectors must be contiguous.");
+    }
+
+    int in_v1_typenum = in_v1.get_typenum();
+    int in_v2_typenum = in_v2.get_typenum();
+    int out_r_typenum = out_r.get_typenum();
+
+    if ((in_v2_typenum != in_v1_typenum) || (out_r_typenum != in_v1_typenum) ||
+        !((in_v1_typenum == UAR_DOUBLE) || (in_v1_typenum == UAR_FLOAT) ||
+          (in_v1_typenum == UAR_CDOUBLE) || (in_v1_typenum == UAR_CFLOAT)))
+    {
+        throw std::runtime_error(
+            "Only real and complex floating point arrays are supported.");
+    }
+
+    const char *in_v1_typeless_ptr = in_v1.get_data();
+    const char *in_v2_typeless_ptr = in_v2.get_data();
+    char *out_r_typeless_ptr = out_r.get_data();
+
+    sycl::event res_ev;
+    if (out_r_typenum == UAR_DOUBLE) {
+        using T = double;
+        res_ev = sub_impl<T>(q, n, in_v1_typeless_ptr, in_v2_typeless_ptr,
+                             out_r_typeless_ptr, depends);
+    }
+    else if (out_r_typenum == UAR_FLOAT) {
+        using T = float;
+        res_ev = sub_impl<T>(q, n, in_v1_typeless_ptr, in_v2_typeless_ptr,
+                             out_r_typeless_ptr, depends);
+    }
+    else if (out_r_typenum == UAR_CDOUBLE) {
+        using T = std::complex<double>;
+        res_ev = sub_impl<T>(q, n, in_v1_typeless_ptr, in_v2_typeless_ptr,
+                             out_r_typeless_ptr, depends);
+    }
+    else if (out_r_typenum == UAR_CFLOAT) {
+        using T = std::complex<float>;
+        res_ev = sub_impl<T>(q, n, in_v1_typeless_ptr, in_v2_typeless_ptr,
+                             out_r_typeless_ptr, depends);
+    }
+    else {
+        throw std::runtime_error("Type dispatch ran into trouble.");
+    }
+
+    sycl::event ht_event = keep_args_alive(q, {in_v1, in_v2, out_r}, {res_ev});
+
+    return std::make_pair(ht_event, res_ev);
+}
+
+template <typename T> class axpy_inplace_kern;
+template <typename T> class axpby_inplace_kern;
+
+template <typename T>
+sycl::event axpby_inplace_impl(sycl::queue q,
+                               size_t nelems,
+                               py::object pyobj_a,
+                               const char *x_typeless,
+                               py::object pyobj_b,
+                               char *y_typeless,
+                               const std::vector<sycl::event> depends = {})
+{
+    T a = py::cast<T>(pyobj_a);
+    T b = py::cast<T>(pyobj_b);
+
+    const T *x = reinterpret_cast<const T *>(x_typeless);
+    T *y = reinterpret_cast<T *>(y_typeless);
+
+    sycl::event res_ev =
+        cg_solver::detail::axpby_inplace(q, nelems, a, x, b, y, depends);
+
+    return res_ev;
+}
+
+// y = a * x + b * y
+std::pair<sycl::event, sycl::event>
+py_axpby_inplace(sycl::queue q,
+                 py::object a,
+                 dpctl::tensor::usm_ndarray x,
+                 py::object b,
+                 dpctl::tensor::usm_ndarray y,
+                 const std::vector<sycl::event> &depends = {})
+{
+
+    if (x.get_ndim() != 1 || y.get_ndim() != 1) {
+        throw std::runtime_error("Inconsistent dimensions, expecting vectors");
+    }
+
+    py::ssize_t n = x.get_shape(0); // get length of the vector
+
+    if (n != y.get_shape(0)) {
+        throw std::runtime_error("Vectors must have the same length");
+    }
+
+    if (q.get_context() != x.get_queue().get_context() ||
+        q.get_context() != y.get_queue().get_context())
+    {
+        throw std::runtime_error(
+            "USM allocation is not bound to the context in execution queue");
+    }
+
+    int x_flags = x.get_flags();
+    int y_flags = y.get_flags();
+
+    if (!((x_flags & (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS)) &&
+          (y_flags & (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS))))
+    {
+        throw std::runtime_error("Vectors must be contiguous.");
+    }
+
+    int x_typenum = x.get_typenum();
+    int y_typenum = y.get_typenum();
+
+    if ((x_typenum != y_typenum) ||
+        !((x_typenum == UAR_DOUBLE) || (x_typenum == UAR_FLOAT) ||
+          (x_typenum == UAR_CDOUBLE) || (x_typenum == UAR_CFLOAT)))
+    {
+        throw std::runtime_error(
+            "Only real and complex floating point arrays are supported.");
+    }
+
+    const char *x_typeless_ptr = x.get_data();
+    char *y_typeless_ptr = y.get_data();
+
+    sycl::event res_ev;
+    if (x_typenum == UAR_DOUBLE) {
+        using T = double;
+        res_ev = axpby_inplace_impl<T>(q, n, a, x_typeless_ptr, b,
+                                       y_typeless_ptr, depends);
+    }
+    else if (x_typenum == UAR_FLOAT) {
+        using T = float;
+        res_ev = axpby_inplace_impl<T>(q, n, a, x_typeless_ptr, b,
+                                       y_typeless_ptr, depends);
+    }
+    else if (x_typenum == UAR_CDOUBLE) {
+        using T = std::complex<double>;
+        res_ev = axpby_inplace_impl<T>(q, n, a, x_typeless_ptr, b,
+                                       y_typeless_ptr, depends);
+    }
+    else if (x_typenum == UAR_CFLOAT) {
+        using T = std::complex<float>;
+        res_ev = axpby_inplace_impl<T>(q, n, a, x_typeless_ptr, b,
+                                       y_typeless_ptr, depends);
+    }
+    else {
+        throw std::runtime_error("Type dispatch ran into trouble.");
+    }
+
+    sycl::event ht_event = keep_args_alive(q, {x, y}, {res_ev});
+
+    return std::make_pair(ht_event, res_ev);
+}
+
+template <typename T>
+T norm_squared_blocking_impl(sycl::queue q,
+                             size_t nelems,
+                             const char *r_typeless,
+                             const std::vector<sycl::event> &depends = {})
+{
+    const T *r = reinterpret_cast<const T *>(r_typeless);
+
+    return cg_solver::detail::norm_squared_blocking(q, nelems, r, depends);
+}
+
+template <typename T> class complex_norm_squared_blocking_kern;
+
+template <typename T>
+T complex_norm_squared_blocking_impl(
+    sycl::queue q,
+    size_t nelems,
+    const char *r_typeless,
+    const std::vector<sycl::event> &depends = {})
+{
+    const std::complex<T> *r =
+        reinterpret_cast<const std::complex<T> *>(r_typeless);
+
+    return cg_solver::detail::complex_norm_squared_blocking(q, nelems, r,
+                                                            depends);
+}
+
+py::object py_norm_squared_blocking(sycl::queue q,
+                                    dpctl::tensor::usm_ndarray r,
+                                    const std::vector<sycl::event> depends = {})
+{
+    if (r.get_ndim() != 1) {
+        throw std::runtime_error("Expecting a vector");
+    }
+
+    py::ssize_t n = r.get_shape(0); // get length of the vector
+
+    int r_flags = r.get_flags();
+
+    if (!(r_flags & (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS))) {
+        throw std::runtime_error("Vector must be contiguous.");
+    }
+
+    if (q.get_context() != r.get_queue().get_context()) {
+        throw std::runtime_error(
+            "USM allocation is not bound to the context in execution queue");
+    }
+
+    int r_typenum = r.get_typenum();
+    if ((r_typenum != UAR_DOUBLE) && (r_typenum != UAR_FLOAT) &&
+        (r_typenum != UAR_CDOUBLE) && (r_typenum != UAR_CFLOAT))
+    {
+        throw std::runtime_error(
+            "Only real and complex floating point arrays are supported.");
+    }
+
+    const char *r_typeless_ptr = r.get_data();
+    py::object res;
+
+    if (r_typenum == UAR_DOUBLE) {
+        using T = double;
+        T n_sq = norm_squared_blocking_impl<T>(q, n, r_typeless_ptr, depends);
+        res = py::float_(n_sq);
+    }
+    else if (r_typenum == UAR_FLOAT) {
+        using T = float;
+        T n_sq = norm_squared_blocking_impl<T>(q, n, r_typeless_ptr, depends);
+        res = py::float_(n_sq);
+    }
+    else if (r_typenum == UAR_CDOUBLE) {
+        using T = std::complex<double>;
+        double n_sq = complex_norm_squared_blocking_impl<double>(
+            q, n, r_typeless_ptr, depends);
+        res = py::float_(n_sq);
+    }
+    else if (r_typenum == UAR_CFLOAT) {
+        using T = std::complex<float>;
+        float n_sq = complex_norm_squared_blocking_impl<float>(
+            q, n, r_typeless_ptr, depends);
+        res = py::float_(n_sq);
+    }
+    else {
+        throw std::runtime_error("Type dispatch ran into trouble.");
+    }
+
+    return res;
+}
+
+py::object py_dot_blocking(sycl::queue q,
+                           dpctl::tensor::usm_ndarray v1,
+                           dpctl::tensor::usm_ndarray v2,
+                           const std::vector<sycl::event> &depends = {})
+{
+    if (v1.get_ndim() != 1 || v2.get_ndim() != 1) {
+        throw std::runtime_error("Expecting two vectors");
+    }
+
+    py::ssize_t n = v1.get_shape(0); // get length of the vector
+
+    if (n != v2.get_shape(0)) {
+        throw std::runtime_error("Length of vectors are not the same");
+    }
+
+    int v1_flags = v1.get_flags();
+    int v2_flags = v2.get_flags();
+
+    if (!(v1_flags & (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS)) ||
+        !(v2_flags & (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS)))
+    {
+        throw std::runtime_error("Vectors must be contiguous.");
+    }
+
+    if (q.get_context() != v1.get_queue().get_context() ||
+        q.get_context() != v2.get_queue().get_context())
+    {
+        throw std::runtime_error(
+            "USM allocation is not bound to the context in execution queue");
+    }
+
+    int v1_typenum = v1.get_typenum();
+    int v2_typenum = v2.get_typenum();
+
+    if ((v1_typenum != v2_typenum) ||
+        ((v1_typenum != UAR_DOUBLE) && (v1_typenum != UAR_FLOAT) &&
+         (v1_typenum != UAR_CDOUBLE) && (v1_typenum != UAR_CFLOAT)))
+    {
+        throw py::value_error(
+            "Data types of vectors must be the same. "
+            "Only real and complex floating types are supported.");
+    }
+
+    const char *v1_typeless_ptr = v1.get_data();
+    const char *v2_typeless_ptr = v2.get_data();
+    py::object res;
+
+    if (v1_typenum == UAR_DOUBLE) {
+        using T = double;
+        T *res_usm = sycl::malloc_device<T>(1, q);
+        sycl::event dot_ev = oneapi::mkl::blas::row_major::dot(
+            q, n, reinterpret_cast<const T *>(v1_typeless_ptr), 1,
+            reinterpret_cast<const T *>(v2_typeless_ptr), 1, res_usm, depends);
+        T res_v{};
+        q.copy<T>(res_usm, &res_v, 1, {dot_ev}).wait_and_throw();
+        res = py::float_(res_v);
+    }
+    else if (v1_typenum == UAR_FLOAT) {
+        using T = float;
+        T *res_usm = sycl::malloc_device<T>(1, q);
+        sycl::event dot_ev = oneapi::mkl::blas::row_major::dot(
+            q, n, reinterpret_cast<const T *>(v1_typeless_ptr), 1,
+            reinterpret_cast<const T *>(v2_typeless_ptr), 1, res_usm, depends);
+        T res_v(0);
+        q.copy<T>(res_usm, &res_v, 1, {dot_ev}).wait_and_throw();
+        res = py::float_(res_v);
+    }
+    else if (v1_typenum == UAR_CDOUBLE) {
+        using T = std::complex<double>;
+        T *res_usm = sycl::malloc_device<T>(1, q);
+        sycl::event dotc_ev = oneapi::mkl::blas::row_major::dotc(
+            q, n, reinterpret_cast<const T *>(v1_typeless_ptr), 1,
+            reinterpret_cast<const T *>(v2_typeless_ptr), 1, res_usm, depends);
+        T res_v{};
+        q.copy<T>(res_usm, &res_v, 1, {dotc_ev}).wait_and_throw();
+        res = py::cast(res_v);
+    }
+    else if (v1_typenum == UAR_CFLOAT) {
+        using T = std::complex<float>;
+        T *res_usm = sycl::malloc_device<T>(1, q);
+        sycl::event dotc_ev = oneapi::mkl::blas::row_major::dotc(
+            q, n, reinterpret_cast<const T *>(v1_typeless_ptr), 1,
+            reinterpret_cast<const T *>(v2_typeless_ptr), 1, res_usm, depends);
+        T res_v{};
+        q.copy<T>(res_usm, &res_v, 1, {dotc_ev}).wait_and_throw();
+        res = py::cast(res_v);
+    }
+    else {
+        throw std::runtime_error("Type dispatch ran into trouble.");
+    }
+
+    return res;
+}
+
+int py_cg_solve(sycl::queue exec_q,
+                dpctl::tensor::usm_ndarray Amat,
+                dpctl::tensor::usm_ndarray bvec,
+                dpctl::tensor::usm_ndarray xvec,
+                double rs_tol,
+                const std::vector<sycl::event> &depends = {})
+{
+    if (Amat.get_ndim() != 2 || bvec.get_ndim() != 1 || xvec.get_ndim() != 1) {
+        throw py::value_error("Expecting a matrix and two vectors");
+    }
+
+    py::ssize_t n0 = Amat.get_shape(0);
+    py::ssize_t n1 = Amat.get_shape(1);
+
+    if (n0 != n1) {
+        throw py::value_error("Matrix must be square.");
+    }
+
+    if (n0 != bvec.get_shape(0) || n0 != xvec.get_shape(0)) {
+        throw py::value_error(
+            "Dimensions of the matrix and vectors are not consistent.");
+    }
+
+    bool all_contig = (Amat.get_flags() & USM_ARRAY_C_CONTIGUOUS) &&
+                      (bvec.get_flags() & USM_ARRAY_C_CONTIGUOUS) &&
+                      (xvec.get_flags() & USM_ARRAY_C_CONTIGUOUS);
+    if (!all_contig) {
+        throw py::value_error("All inputs must be C-contiguous");
+    }
+
+    int A_typenum = Amat.get_typenum();
+    int b_typenum = bvec.get_typenum();
+    int x_typenum = xvec.get_typenum();
+
+    if (A_typenum != b_typenum || A_typenum != x_typenum) {
+        throw py::value_error("All arrays must have the same type");
+    }
+
+    if (exec_q.get_context() != Amat.get_queue().get_context() ||
+        exec_q.get_context() != bvec.get_queue().get_context() ||
+        exec_q.get_context() != xvec.get_queue().get_context())
+    {
+        throw std::runtime_error(
+            "USM allocations are not bound to context in execution queue");
+    }
+
+    const char *A_ch = Amat.get_data();
+    const char *b_ch = bvec.get_data();
+    char *x_ch = xvec.get_data();
+
+    if (A_typenum == UAR_DOUBLE) {
+        using T = double;
+        int iters = cg_solver::cg_solve<T>(
+            exec_q, n0, reinterpret_cast<const T *>(A_ch),
+            reinterpret_cast<const T *>(b_ch), reinterpret_cast<T *>(x_ch),
+            depends, static_cast<T>(rs_tol));
+
+        return iters;
+    }
+    else if (A_typenum == UAR_FLOAT) {
+        using T = float;
+        int iters = cg_solver::cg_solve<T>(
+            exec_q, n0, reinterpret_cast<const T *>(A_ch),
+            reinterpret_cast<const T *>(b_ch), reinterpret_cast<T *>(x_ch),
+            depends, static_cast<T>(rs_tol));
+
+        return iters;
+    }
+    else {
+        throw std::runtime_error(
+            "Unsupported data type. Use single or double precision.");
+    }
+}
+
 PYBIND11_MODULE(_onemkl, m)
 {
     // Import the dpctl extensions
     import_dpctl();
-    m.def("gemv", &gemv, "Uses oneMKL to compute dot(matrix, vector)");
+
+    m.def("gemv", &py_gemv, "Uses oneMKL to compute dot(matrix, vector)",
+          py::arg("exec_queue"), py::arg("Amatrix"), py::arg("xvec"),
+          py::arg("resvec"), py::arg("depends") = py::list());
+    m.def("sub", &py_sub, "Subtraction: out = v1 - v2", py::arg("exec_queue"),
+          py::arg("in1"), py::arg("in2"), py::arg("out"),
+          py::arg("depends") = py::list());
+    m.def("axpby_inplace", &py_axpby_inplace, "y = a * x + b * y",
+          py::arg("exec_queue"), py::arg("a"), py::arg("x"), py::arg("b"),
+          py::arg("y"), py::arg("depends") = py::list());
+    m.def("norm_squared_blocking", &py_norm_squared_blocking, "norm(r)**2",
+          py::arg("exec_queue"), py::arg("r"), py::arg("depends") = py::list());
+    m.def("dot_blocking", &py_dot_blocking, "<v1, v2>", py::arg("exec_queue"),
+          py::arg("v1"), py::arg("v2"), py::arg("depends") = py::list());
+
+    m.def("cpp_cg_solve", &py_cg_solve,
+          "Dispatch to call C++ implementation of cg_solve",
+          py::arg("exec_queue"), py::arg("Amat"), py::arg("bvec"),
+          py::arg("xvec"), py::arg("rs_squared_tolerance") = py::float_(1e-20),
+          py::arg("depends") = py::list());
 }
diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/cg_solver.hpp b/examples/pybind11/onemkl_gemv/sycl_gemm/cg_solver.hpp
new file mode 100644
index 0000000000..6e0396c7a1
--- /dev/null
+++ b/examples/pybind11/onemkl_gemv/sycl_gemm/cg_solver.hpp
@@ -0,0 +1,226 @@
+//==- _cg_solver.cpp - C++ impl of conjugate gradient linear solver  -==//
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2021 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements Pybind11-generated extension exposing functions that
+/// take dpctl Python objects, such as dpctl.SyclQueue, dpctl.SyclDevice, and
+/// dpctl.tensor.usm_ndarray as arguments.
+///
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <CL/sycl.hpp>
+#include <oneapi/mkl.hpp>
+
+namespace cg_solver
+{
+namespace detail
+{
+
+template <typename T> class sub_kern;
+
+template <typename T>
+sycl::event sub(sycl::queue &q,
+                size_t n,
+                const T *v1,
+                const T *v2,
+                T *r,
+                const std::vector<sycl::event> &depends = {})
+{
+    sycl::event r_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+        cgh.parallel_for<sub_kern<T>>(sycl::range<1>{n}, [=](sycl::id<1> id) {
+            auto i = id.get(0);
+            r[i] = v1[i] - v2[i];
+        });
+    });
+
+    return r_ev;
+}
+
+template <typename T> class axpy_inplace_kern;
+template <typename T> class axpby_inplace_kern;
+
+template <typename T>
+sycl::event axpby_inplace(sycl::queue &q,
+                          size_t nelems,
+                          T a,
+                          const T *x,
+                          T b,
+                          T *y,
+                          const std::vector<sycl::event> depends = {})
+{
+
+    sycl::event res_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+        if (b == T(1)) {
+            cgh.parallel_for<axpy_inplace_kern<T>>(sycl::range<1>{nelems},
+                                                   [=](sycl::id<1> id) {
+                                                       auto i = id.get(0);
+                                                       y[i] += a * x[i];
+                                                   });
+        }
+        else {
+            cgh.parallel_for<axpby_inplace_kern<T>>(
+                sycl::range<1>{nelems}, [=](sycl::id<1> id) {
+                    auto i = id.get(0);
+                    y[i] = b * y[i] + a * x[i];
+                });
+        }
+    });
+
+    return res_ev;
+}
+
+template <typename T> class norm_squared_blocking_kern;
+
+template <typename T>
+T norm_squared_blocking(sycl::queue &q,
+                        size_t nelems,
+                        const T *r,
+                        const std::vector<sycl::event> &depends = {})
+{
+    sycl::buffer<T, 1> sum_sq_buf(sycl::range<1>{1});
+
+    q.submit([&](sycl::handler &cgh) {
+         cgh.depends_on(depends);
+         auto sum_sq_reduction = sycl::reduction(
+             sum_sq_buf, cgh, sycl::plus<T>(),
+             {sycl::property::reduction::initialize_to_identity{}});
+         cgh.parallel_for<norm_squared_blocking_kern<T>>(
+             sycl::range<1>{nelems}, sum_sq_reduction,
+             [=](sycl::id<1> id, auto &sum_sq) {
+                 auto i = id.get(0);
+                 sum_sq += r[i] * r[i];
+             });
+     }).wait_and_throw();
+
+    sycl::host_accessor ha(sum_sq_buf);
+    return ha[0];
+}
+
+template <typename T> class complex_norm_squared_blocking_kern;
+
+template <typename T>
+T complex_norm_squared_blocking(sycl::queue &q,
+                                size_t nelems,
+                                const std::complex<T> *r,
+                                const std::vector<sycl::event> &depends = {})
+{
+    sycl::buffer<T, 1> sum_sq_buf(sycl::range<1>{1});
+
+    q.submit([&](sycl::handler &cgh) {
+         cgh.depends_on(depends);
+         auto sum_sq_reduction = sycl::reduction(
+             sum_sq_buf, cgh, sycl::plus<T>(),
+             {sycl::property::reduction::initialize_to_identity{}});
+         cgh.parallel_for<complex_norm_squared_blocking_kern<T>>(
+             sycl::range<1>{nelems}, sum_sq_reduction,
+             [=](sycl::id<1> id, auto &sum_sq) {
+                 auto i = id.get(0);
+                 sum_sq +=
+                     r[i].real() * r[i].real() + r[i].imag() * r[i].imag();
+             });
+     }).wait_and_throw();
+
+    sycl::host_accessor ha(sum_sq_buf);
+    return ha[0];
+}
+
+} // namespace detail
+
+template <typename T>
+int cg_solve(sycl::queue exec_q,
+             std::int64_t n,
+             const T *Amat,
+             const T *bvec,
+             T *sol_vec,
+             const std::vector<sycl::event> &depends = {},
+             T rs_threshold = T(1e-20))
+{
+    T *x_vec = sol_vec;
+    sycl::event fill_ev = exec_q.fill<T>(x_vec, T(0), n, depends);
+
+    // n for r, n for p, n for Ap and 1 for pAp_dot_dev
+    T *r = sycl::malloc_device<T>(3 * n + 1, exec_q);
+    T *p = r + n;
+    T *Ap = p + n;
+    T *pAp_dot_dev = Ap + n;
+
+    sycl::event copy_to_r_ev = exec_q.copy<T>(bvec, r, n, depends);
+    sycl::event copy_to_p_ev = exec_q.copy<T>(bvec, p, n, depends);
+    T rs_old = detail::norm_squared_blocking(exec_q, n, r, {copy_to_r_ev});
+
+    std::int64_t max_iters = n;
+
+    if (rs_old < rs_threshold) {
+        sycl::free(r, exec_q);
+        return 0;
+    }
+
+    int converged_at = max_iters;
+    sycl::event prev_dep = copy_to_p_ev;
+    sycl::event e_x = fill_ev;
+
+    for (std::int64_t i = 0; i < max_iters; ++i) {
+        sycl::event gemv_ev = oneapi::mkl::blas::row_major::gemv(
+            exec_q, oneapi::mkl::transpose::N, n, n, T(1), Amat, n, p, 1, T(0),
+            Ap, 1, {prev_dep});
+
+        sycl::event pAp_dot_ev = oneapi::mkl::blas::row_major::dot(
+            exec_q, n, p, 1, Ap, 1, pAp_dot_dev, {prev_dep, gemv_ev});
+
+        T pAp_dot_host{};
+        exec_q.copy<T>(pAp_dot_dev, &pAp_dot_host, 1, {pAp_dot_ev})
+            .wait_and_throw();
+        T alpha = rs_old / pAp_dot_host;
+
+        // x = x + alpha * p
+        sycl::event x_update_ev =
+            detail::axpby_inplace(exec_q, n, alpha, p, T(1), x_vec, {e_x});
+
+        // r = r - alpha * Ap
+        sycl::event r_update_ev =
+            detail::axpby_inplace(exec_q, n, -alpha, Ap, T(1), r);
+
+        T rs_new = detail::norm_squared_blocking(exec_q, n, r, {r_update_ev});
+
+        if (rs_new < rs_threshold) {
+            converged_at = i;
+            x_update_ev.wait();
+            break;
+        }
+
+        T beta = rs_new / rs_old;
+
+        // p = r + beta * p
+        prev_dep =
+            detail::axpby_inplace(exec_q, n, T(1), r, beta, p, {r_update_ev});
+        e_x = x_update_ev;
+
+        rs_old = rs_new;
+    }
+
+    sycl::free(r, exec_q);
+    return converged_at;
+}
+
+} // namespace cg_solver
diff --git a/examples/pybind11/onemkl_gemv/sycl_timing_solver.py b/examples/pybind11/onemkl_gemv/sycl_timing_solver.py
new file mode 100644
index 0000000000..42178ee0ed
--- /dev/null
+++ b/examples/pybind11/onemkl_gemv/sycl_timing_solver.py
@@ -0,0 +1,92 @@
+#                      Data Parallel Control (dpctl)
+#
+# Copyright 2020-2021 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+
+import numpy as np
+
+# coding: utf-8
+import solve
+import sycl_gemm
+
+import dpctl
+import dpctl.tensor as dpt
+
+argv = sys.argv
+
+n = 1000
+rank = 11
+
+if len(argv) > 1:
+    n = int(argv[1])
+if len(argv) > 2:
+    rank = int(argv[2])
+
+
+print(
+    f"Solving {n} by {n} diagonal linear "
+    f"system with rank {rank} perturbation."
+)
+
+Anp = np.eye(n, n) + (lambda x: x.T @ x)(np.random.randn(rank, n))
+bnp = np.random.rand(n)
+
+q = dpctl.SyclQueue(property=["enable_profiling"])
+q.print_device_info()
+if q.is_in_order:
+    print("Using in-order queue")
+else:
+    print("Using not in-order queue")
+
+api_dev = dpctl.tensor.Device.create_device(q)
+A = dpt.asarray(Anp, "d", device=api_dev)
+b = dpt.asarray(bnp, "d", device=api_dev)
+
+timer = dpctl.SyclTimer(time_scale=1e3)
+
+iters = []
+for i in range(6):
+    with timer(api_dev.sycl_queue):
+        x, conv_in = solve.cg_solve(A, b)
+
+    print(i, "(host_dt, device_dt)=", timer.dt)
+    iters.append(conv_in)
+
+print("Converged in: ", iters)
+
+r = dpt.empty_like(b)
+hev, ev = sycl_gemm.gemv(q, A, x, r)
+delta = dpt.empty_like(b)
+hev2, ev2 = sycl_gemm.sub(q, r, b, delta, [ev])
+rs = sycl_gemm.norm_squared_blocking(q, delta)
+dpctl.SyclEvent.wait_for([hev, hev2])
+print(f"Python solution residual norm squared: {rs}")
+
+x_cpp = dpt.empty_like(b)
+iters = []
+for i in range(6):
+    with timer(api_dev.sycl_queue):
+        conv_in = sycl_gemm.cpp_cg_solve(q, A, b, x_cpp)
+
+    print(i, "(host_dt, device_dt)=", timer.dt)
+    iters.append(conv_in)
+
+print("Converged in: ", iters)
+hev, ev = sycl_gemm.gemv(q, A, x_cpp, r)
+hev2, ev2 = sycl_gemm.sub(q, r, b, delta, [ev])
+rs = sycl_gemm.norm_squared_blocking(q, delta)
+dpctl.SyclEvent.wait_for([hev, hev2])
+print(f"cpp_cg_solve solution residual norm squared: {rs}")
diff --git a/examples/pybind11/onemkl_gemv/tests/test_gemm.py b/examples/pybind11/onemkl_gemv/tests/test_gemm.py
index 7a78fba346..6090322c75 100644
--- a/examples/pybind11/onemkl_gemv/tests/test_gemm.py
+++ b/examples/pybind11/onemkl_gemv/tests/test_gemm.py
@@ -1,6 +1,12 @@
 import numpy as np
 import pytest
-from sycl_gemm import gemv
+from sycl_gemm import (
+    axpby_inplace,
+    dot_blocking,
+    gemv,
+    norm_squared_blocking,
+    sub,
+)
 
 import dpctl
 import dpctl.tensor as dpt
@@ -15,7 +21,59 @@ def test_gemv():
     r = dpt.empty((5,), dtype="d", sycl_queue=q)
     M = dpt.asarray(Mnp, sycl_queue=q)
     v = dpt.asarray(vnp, sycl_queue=q)
-    hev, ev = gemv(M.sycl_queue, M, v, r, [])
+    hev, ev = gemv(q, M, v, r, [])
     hev.wait()
     rnp = dpt.asnumpy(r)
     assert np.allclose(rnp, Mnp @ vnp)
+
+
+def test_sub():
+    try:
+        q = dpctl.SyclQueue()
+    except dpctl.SyclQueueCreationError:
+        pytest.skip("Queue could not be created")
+    anp, bnp = np.random.randn(5), np.random.randn(5)
+    r = dpt.empty((5,), dtype="d", sycl_queue=q)
+    a = dpt.asarray(anp, sycl_queue=q)
+    b = dpt.asarray(bnp, sycl_queue=q)
+    hev, ev = sub(q, a, b, r, [])
+    hev.wait()
+    rnp = dpt.asnumpy(r)
+    assert np.allclose(rnp + bnp, anp)
+
+
+def test_axpby():
+    try:
+        q = dpctl.SyclQueue()
+    except dpctl.SyclQueueCreationError:
+        pytest.skip("Queue could not be created")
+    xnp, pnp = np.random.randn(5), np.random.randn(5)
+    x = dpt.asarray(xnp, sycl_queue=q)
+    p = dpt.asarray(pnp, sycl_queue=q)
+    hev, ev = axpby_inplace(q, 0.5, x, -0.7, p, [])
+    hev.wait()
+    rnp = dpt.asnumpy(p)
+    assert np.allclose(rnp, 0.5 * xnp - 0.7 * pnp)
+
+
+def test_dot():
+    try:
+        q = dpctl.SyclQueue()
+    except dpctl.SyclQueueCreationError:
+        pytest.skip("Queue could not be created")
+    anp, bnp = np.random.randn(5), np.random.randn(5)
+    a = dpt.asarray(anp, sycl_queue=q)
+    b = dpt.asarray(bnp, sycl_queue=q)
+    dot_res = dot_blocking(q, a, b)
+    assert np.allclose(dot_res, np.dot(anp, bnp))
+
+
+def test_norm_squared():
+    try:
+        q = dpctl.SyclQueue()
+    except dpctl.SyclQueueCreationError:
+        pytest.skip("Queue could not be created")
+    anp = np.random.randn(5)
+    a = dpt.asarray(anp, sycl_queue=q)
+    dot_res = norm_squared_blocking(q, a)
+    assert np.allclose(dot_res, np.dot(anp, anp))
diff --git a/examples/pybind11/use_dpctl_syclqueue/CMakeLists.txt b/examples/pybind11/use_dpctl_syclqueue/CMakeLists.txt
new file mode 100644
index 0000000000..a788c56bae
--- /dev/null
+++ b/examples/pybind11/use_dpctl_syclqueue/CMakeLists.txt
@@ -0,0 +1,36 @@
+cmake_minimum_required(VERSION 3.21)
+
+project(use_queue_device LANGUAGES CXX)
+
+set(DPCTL_CMAKE_MODULES_PATH "${CMAKE_SOURCE_DIR}/../../../cmake")
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${DPCTL_CMAKE_MODULES_PATH})
+find_package(IntelDPCPP REQUIRED PATHS ${DPCTL_CMAKE_MODULES_PATH} NO_DEFAULT_PATH)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED True)
+
+# Fetch pybind11
+include(FetchContent)
+FetchContent_Declare(
+  pybind11
+  URL https://github.com/pybind/pybind11/archive/refs/tags/v2.9.2.tar.gz
+  URL_HASH SHA256=6bd528c4dbe2276635dc787b6b1f2e5316cf6b49ee3e150264e455a0d68d19c1
+)
+FetchContent_MakeAvailable(pybind11)
+
+find_package(PythonExtensions REQUIRED)
+find_package(Dpctl REQUIRED)
+find_package(NumPy REQUIRED)
+
+set(py_module_name _use_queue_device)
+pybind11_add_module(${py_module_name}
+    MODULE
+    use_queue_device/_example.cpp
+)
+target_include_directories(${py_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS})
+target_compile_options(${py_module_name} PRIVATE -Wno-deprecated-declarations)
+install(TARGETS ${py_module_name}
+  DESTINATION use_queue_device
+)
+
+set(ignoreMe "${SKBUILD}")
diff --git a/examples/pybind11/use_dpctl_syclqueue/README.md b/examples/pybind11/use_dpctl_syclqueue/README.md
index 3e7b6804a9..f3ef6e92e5 100644
--- a/examples/pybind11/use_dpctl_syclqueue/README.md
+++ b/examples/pybind11/use_dpctl_syclqueue/README.md
@@ -9,7 +9,8 @@ extensions.
 
 ```
 source /opt/intel/oneapi/compiler/latest/env/vars.sh
-CXX=dpcpp CC=dpcpp python setup.py build_ext --inplace
+CXX=icpx python setup.py build_ext --inplace
+python -m pytest tests
 python example.py
 ```
 
diff --git a/examples/pybind11/use_dpctl_syclqueue/example.py b/examples/pybind11/use_dpctl_syclqueue/example.py
index 7189adecb8..698e43baf9 100644
--- a/examples/pybind11/use_dpctl_syclqueue/example.py
+++ b/examples/pybind11/use_dpctl_syclqueue/example.py
@@ -17,7 +17,7 @@
 # coding: utf-8
 
 import numpy as np
-import use_queue_device_ext as eg
+import use_queue_device as eg
 
 import dpctl
 
diff --git a/examples/pybind11/use_dpctl_syclqueue/setup.py b/examples/pybind11/use_dpctl_syclqueue/setup.py
index 34eeebe38c..4d09be820e 100644
--- a/examples/pybind11/use_dpctl_syclqueue/setup.py
+++ b/examples/pybind11/use_dpctl_syclqueue/setup.py
@@ -14,20 +14,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from pybind11.setup_helpers import Pybind11Extension
-from setuptools import setup
+from skbuild import setup
 
-import dpctl
-
-exts = [
-    Pybind11Extension(
-        "use_queue_device_ext",
-        ["./_example.cpp"],
-        include_dirs=[dpctl.get_include()],
-        extra_compile_args=["-fPIC"],
-        extra_link_args=["-fPIC"],
-        language="c++",
-    ),
-]
-
-setup(name="pybind11_example", ext_modules=exts)
+setup(
+    name="use_queue_device",
+    version="0.0.1",
+    description="an example of SYCL-powered Python package (with pybind11)",
+    author="Intel Scripting",
+    license="Apache 2.0",
+    packages=["use_queue_device"],
+)
diff --git a/examples/pybind11/use_dpctl_syclqueue/tests/test_queue_device.py b/examples/pybind11/use_dpctl_syclqueue/tests/test_queue_device.py
new file mode 100644
index 0000000000..120cc1fd9e
--- /dev/null
+++ b/examples/pybind11/use_dpctl_syclqueue/tests/test_queue_device.py
@@ -0,0 +1,57 @@
+#                      Data Parallel Control (dpctl)
+#
+# Copyright 2020-2021 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# coding: utf-8
+
+import numpy as np
+import use_queue_device as uqd
+
+import dpctl
+
+
+def test_compute_units():
+    q = dpctl.SyclQueue()
+    mcu = uqd.get_max_compute_units(q)
+
+    assert type(mcu) is int
+    assert mcu == q.sycl_device.max_compute_units
+
+
+def test_global_memory():
+    d = dpctl.SyclDevice()
+    gm = uqd.get_device_global_mem_size(d)
+    assert type(gm) is int
+    assert gm == d.global_mem_size
+
+
+def test_local_memory():
+    d = dpctl.SyclDevice()
+    lm = uqd.get_device_local_mem_size(d)
+    assert type(lm) is int
+    assert lm == d.local_mem_size
+
+
+def test_offload_array_mod():
+    execution_queue = dpctl.SyclQueue()
+    X = np.random.randint(low=1, high=2**16 - 1, size=10**6, dtype=np.int64)
+    modulus_p = 347
+
+    # Y is a regular NumPy array with NumPy allocated host memory
+    Y = uqd.offloaded_array_mod(execution_queue, X, modulus_p)
+
+    Ynp = X % modulus_p
+
+    assert np.array_equal(Y, Ynp)
diff --git a/examples/pybind11/use_dpctl_syclqueue/use_queue_device/__init__.py b/examples/pybind11/use_dpctl_syclqueue/use_queue_device/__init__.py
new file mode 100644
index 0000000000..ccbca8fd73
--- /dev/null
+++ b/examples/pybind11/use_dpctl_syclqueue/use_queue_device/__init__.py
@@ -0,0 +1,42 @@
+#                      Data Parallel Control (dpctl)
+#
+# Copyright 2020-2021 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# coding: utf-8
+
+from ._use_queue_device import (
+    get_device_global_mem_size,
+    get_device_local_mem_size,
+    get_max_compute_units,
+    offloaded_array_mod,
+)
+
+__all__ = [
+    "get_max_compute_units",
+    "get_device_global_mem_size",
+    "get_device_local_mem_size",
+    "offloaded_array_mod",
+]
+
+__doc__ = """
+Example pybind11 extension demonstrating binding of dpctl entities to
+SYCL entities.
+
+dpctl provides type casters that bind ``sycl::queue`` to `dpctl.SyclQueue`,
+``sycl::device`` to `dpctl.SyclDevice`, etc.
+
+Use of these type casters simplifies writing of Python extensions and compile
+then using SYCL C++ compilers, such as Intel(R) oneAPI DPC++ compiler.
+"""
diff --git a/examples/pybind11/use_dpctl_syclqueue/_example.cpp b/examples/pybind11/use_dpctl_syclqueue/use_queue_device/_example.cpp
similarity index 98%
rename from examples/pybind11/use_dpctl_syclqueue/_example.cpp
rename to examples/pybind11/use_dpctl_syclqueue/use_queue_device/_example.cpp
index 7b26a35858..54474a0ca1 100644
--- a/examples/pybind11/use_dpctl_syclqueue/_example.cpp
+++ b/examples/pybind11/use_dpctl_syclqueue/use_queue_device/_example.cpp
@@ -84,7 +84,7 @@ offloaded_array_mod(sycl::queue &q,
     return res;
 }
 
-PYBIND11_MODULE(use_queue_device_ext, m)
+PYBIND11_MODULE(_use_queue_device, m)
 {
     // Import the dpctl extensions
     import_dpctl();