From 7dd904d9ab71b7fef650569f8ff60c1d495cb0ce Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Fri, 4 Dec 2020 15:31:43 +0300 Subject: [PATCH 01/40] Add support for dpctl.dparray. --- numba_dppy/dparray.py | 354 +++++++++++++++++++++++++++++++ numba_dppy/dppy_rt.c | 89 ++++++++ numba_dppy/tests/test_dparray.py | 228 ++++++++++++++++++++ setup.py | 12 ++ 4 files changed, 683 insertions(+) create mode 100644 numba_dppy/dparray.py create mode 100644 numba_dppy/dppy_rt.c create mode 100644 numba_dppy/tests/test_dparray.py diff --git a/numba_dppy/dparray.py b/numba_dppy/dparray.py new file mode 100644 index 0000000000..654230fc81 --- /dev/null +++ b/numba_dppy/dparray.py @@ -0,0 +1,354 @@ +# This class creates a type in Numba. +class DPArrayType(types.Array): + def __init__( + self, + dtype, + ndim, + layout, + readonly=False, + name=None, + aligned=True, + addrspace=None, + ): + # This name defines how this type will be shown in Numba's type dumps. + name = "DPArray:ndarray(%s, %sd, %s)" % (dtype, ndim, layout) + super(DPArrayType, self).__init__( + dtype, + ndim, + layout, + py_type=ndarray, + readonly=readonly, + name=name, + addrspace=addrspace, + ) + + # Tell Numba typing how to combine DPArrayType with other ndarray types. + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + if method == "__call__": + for inp in inputs: + if not isinstance(inp, (DPArrayType, types.Array, types.Number)): + return None + + return DPArrayType + else: + return None + + +# This tells Numba how to create a DPArrayType when a dparray is passed +# into a njit function. +@typeof_impl.register(ndarray) +def typeof_ta_ndarray(val, c): + try: + dtype = numpy_support.from_dtype(val.dtype) + except NotImplementedError: + raise ValueError("Unsupported array dtype: %s" % (val.dtype,)) + layout = numpy_support.map_layout(val) + readonly = not val.flags.writeable + return DPArrayType(dtype, val.ndim, layout, readonly=readonly) + + +# This tells Numba to use the default Numpy ndarray data layout for +# object of type DPArray. +register_model(DPArrayType)(numba.core.datamodel.models.ArrayModel) + +# This tells Numba how to convert from its native representation +# of a DPArray in a njit function back to a Python DPArray. +@box(DPArrayType) +def box_array(typ, val, c): + nativearycls = c.context.make_array(typ) + nativeary = nativearycls(c.context, c.builder, value=val) + if c.context.enable_nrt: + np_dtype = numpy_support.as_dtype(typ.dtype) + dtypeptr = c.env_manager.read_const(c.env_manager.add_const(np_dtype)) + # Steals NRT ref + newary = c.pyapi.nrt_adapt_ndarray_to_python(typ, val, dtypeptr) + return newary + else: + parent = nativeary.parent + c.pyapi.incref(parent) + return parent + + +# This tells Numba to use this function when it needs to allocate a +# DPArray in a njit function. +@allocator(DPArrayType) +def allocator_DPArray(context, builder, size, align): + context.nrt._require_nrt() + + mod = builder.module + u32 = ir.IntType(32) + + # Get the Numba external allocator for USM memory. + ext_allocator_fnty = ir.FunctionType(cgutils.voidptr_t, []) + ext_allocator_fn = mod.get_or_insert_function( + ext_allocator_fnty, name="dparray_get_ext_allocator" + ) + ext_allocator = builder.call(ext_allocator_fn, []) + # Get the Numba function to allocate an aligned array with an external allocator. 
+ fnty = ir.FunctionType(cgutils.voidptr_t, [cgutils.intp_t, u32, cgutils.voidptr_t]) + fn = mod.get_or_insert_function( + fnty, name="NRT_MemInfo_alloc_safe_aligned_external" + ) + fn.return_value.add_attribute("noalias") + if isinstance(align, builtins.int): + align = context.get_constant(types.uint32, align) + else: + assert align.type == u32, "align must be a uint32" + return builder.call(fn, [size, align, ext_allocator]) + + +registered = False + + +def numba_register(): + global registered + if not registered: + registered = True + numba_register_typing() + numba_register_lower_builtin() + + +# Copy a function registered as a lowerer in Numba but change the +# "np" import in Numba to point to dparray instead of NumPy. +def copy_func_for_dparray(f, dparray_mod): + import copy as cc + + # Make a copy so our change below doesn't affect anything else. + gglobals = cc.copy(f.__globals__) + # Make the "np"'s in the code use dparray instead of Numba's default NumPy. + gglobals["np"] = dparray_mod + # Create a new function using the original code but the new globals. + g = ftype(f.__code__, gglobals, None, f.__defaults__, f.__closure__) + # Some other tricks to make sure the function copy works. + g = functools.update_wrapper(g, f) + g.__kwdefaults__ = f.__kwdefaults__ + return g + + +def types_replace_array(x): + return tuple([z if z != types.Array else DPArrayType for z in x]) + + +def numba_register_lower_builtin(): + todo = [] + todo_builtin = [] + todo_getattr = [] + + # For all Numpy identifiers that have been registered for typing in Numba... + # this registry contains functions, getattrs, setattrs, casts and constants...need to do them all? FIX FIX FIX + for ig in lower_registry.functions: + impl, func, types = ig + # If it is a Numpy function... + if isinstance(func, ftype): + if func.__module__ == np.__name__: + # If we have overloaded that function in the dparray module (always True right now)... + if func.__name__ in functions_list: + todo.append(ig) + if isinstance(func, bftype): + if func.__module__ == np.__name__: + # If we have overloaded that function in the dparray module (always True right now)... + if func.__name__ in functions_list: + todo.append(ig) + + for lg in lower_registry.getattrs: + func, attr, types = lg + types_with_dparray = types_replace_array(types) + if DPArrayType in types_with_dparray: + dprint( + "lower_getattr:", func, type(func), attr, type(attr), types, type(types) + ) + todo_getattr.append((func, attr, types_with_dparray)) + + for lg in todo_getattr: + lower_registry.getattrs.append(lg) + + cur_mod = importlib.import_module(__name__) + for impl, func, types in todo + todo_builtin: + dparray_func = eval(func.__name__) + dprint( + "need to re-register lowerer for dparray", impl, func, types, dparray_func + ) + new_impl = copy_func_for_dparray(impl, cur_mod) + lower_registry.functions.append((new_impl, dparray_func, types)) + + +def argspec_to_string(argspec): + first_default_arg = len(argspec.args) - len(argspec.defaults) + non_def = argspec.args[:first_default_arg] + arg_zip = list(zip(argspec.args[first_default_arg:], argspec.defaults)) + combined = [a + "=" + str(b) for a, b in arg_zip] + return ",".join(non_def + combined) + + +def numba_register_typing(): + todo = [] + todo_classes = [] + todo_getattr = [] + + # For all Numpy identifiers that have been registered for typing in Numba... + for ig in typing_registry.globals: + val, typ = ig + # If it is a Numpy function... 
+ if isinstance(val, (ftype, bftype)): + # If we have overloaded that function in the dparray module (always True right now)... + if val.__name__ in functions_list: + todo.append(ig) + if isinstance(val, type): + todo_classes.append(ig) + + for tgetattr in templates_registry.attributes: + if tgetattr.key == types.Array: + todo_getattr.append(tgetattr) + + for val, typ in todo: + assert len(typ.templates) == 1 + # template is the typing class to invoke generic() upon. + template = typ.templates[0] + dpval = eval(val.__name__) + dprint("need to re-register for dparray", val, typ, typ.typing_key) + """ + if debug: + print("--------------------------------------------------------------") + print("need to re-register for dparray", val, typ, typ.typing_key) + print("val:", val, type(val), "dir val", dir(val)) + print("typ:", typ, type(typ), "dir typ", dir(typ)) + print("typing key:", typ.typing_key) + print("name:", typ.name) + print("key:", typ.key) + print("templates:", typ.templates) + print("template:", template, type(template)) + print("dpval:", dpval, type(dpval)) + print("--------------------------------------------------------------") + """ + + class_name = "DparrayTemplate_" + val.__name__ + + @classmethod + def set_key_original(cls, key, original): + cls.key = key + cls.original = original + + def generic_impl(self): + original_typer = self.__class__.original.generic(self.__class__.original) + ot_argspec = inspect.getfullargspec(original_typer) + # print("ot_argspec:", ot_argspec) + astr = argspec_to_string(ot_argspec) + # print("astr:", astr) + + typer_func = """def typer({}): + original_res = original_typer({}) + #print("original_res:", original_res) + if isinstance(original_res, types.Array): + return DPArrayType(dtype=original_res.dtype, ndim=original_res.ndim, layout=original_res.layout) + + return original_res""".format( + astr, ",".join(ot_argspec.args) + ) + + # print("typer_func:", typer_func) + + try: + gs = globals() + ls = locals() + gs["original_typer"] = ls["original_typer"] + exec(typer_func, globals(), locals()) + except NameError as ne: + print("NameError in exec:", ne) + sys.exit(0) + except: + print("exec failed!", sys.exc_info()[0]) + sys.exit(0) + + try: + exec_res = eval("typer") + except NameError as ne: + print("NameError in eval:", ne) + sys.exit(0) + except: + print("eval failed!", sys.exc_info()[0]) + sys.exit(0) + + # print("exec_res:", exec_res) + return exec_res + + new_dparray_template = type( + class_name, + (template,), + {"set_class_vars": set_key_original, "generic": generic_impl}, + ) + + new_dparray_template.set_class_vars(dpval, template) + + assert callable(dpval) + type_handler = types.Function(new_dparray_template) + typing_registry.register_global(dpval, type_handler) + + # Handle dparray attribute typing. 
+ for tgetattr in todo_getattr: + class_name = tgetattr.__name__ + "_dparray" + dprint("tgetattr:", tgetattr, type(tgetattr), class_name) + + @classmethod + def set_key(cls, key): + cls.key = key + + def getattr_impl(self, attr): + if attr.startswith("resolve_"): + # print("getattr_impl starts with resolve_:", self, type(self), attr) + def wrapper(*args, **kwargs): + attr_res = tgetattr.__getattribute__(self, attr)(*args, **kwargs) + if isinstance(attr_res, types.Array): + return DPArrayType( + dtype=attr_res.dtype, + ndim=attr_res.ndim, + layout=attr_res.layout, + ) + + return wrapper + else: + return tgetattr.__getattribute__(self, attr) + + new_dparray_template = type( + class_name, + (tgetattr,), + {"set_class_vars": set_key, "__getattribute__": getattr_impl}, + ) + + new_dparray_template.set_class_vars(DPArrayType) + templates_registry.register_attr(new_dparray_template) + + +def from_ndarray(x): + return copy(x) + + +def as_ndarray(x): + return np.copy(x) + + +@typing_registry.register_global(as_ndarray) +class DparrayAsNdarray(CallableTemplate): + def generic(self): + def typer(arg): + return types.Array(dtype=arg.dtype, ndim=arg.ndim, layout=arg.layout) + + return typer + + +@typing_registry.register_global(from_ndarray) +class DparrayFromNdarray(CallableTemplate): + def generic(self): + def typer(arg): + return DPArrayType(dtype=arg.dtype, ndim=arg.ndim, layout=arg.layout) + + return typer + + +@lower_registry.lower(as_ndarray, DPArrayType) +def dparray_conversion_as(context, builder, sig, args): + return _array_copy(context, builder, sig, args) + + +@lower_registry.lower(from_ndarray, types.Array) +def dparray_conversion_from(context, builder, sig, args): + return _array_copy(context, builder, sig, args) diff --git a/numba_dppy/dppy_rt.c b/numba_dppy/dppy_rt.c new file mode 100644 index 0000000000..75c05ff585 --- /dev/null +++ b/numba_dppy/dppy_rt.c @@ -0,0 +1,89 @@ +#include "../_pymodule.h" +#include "../core/runtime/nrt_external.h" +#include "assert.h" +#include +#include + +NRT_ExternalAllocator dparray_allocator; + +void dparray_memsys_init(void) { + void *(*get_queue)(void); + char *lib_name = "libDPPLSyclInterface.so"; + char *malloc_name = "DPPLmalloc_shared"; + char *free_name = "DPPLfree_with_queue"; + char *get_queue_name = "DPPLQueueMgr_GetCurrentQueue"; + + void *sycldl = dlopen(lib_name, RTLD_NOW); + assert(sycldl != NULL); + dparray_allocator.malloc = (NRT_external_malloc_func)dlsym(sycldl, malloc_name); + if (dparray_allocator.malloc == NULL) { + printf("Did not find %s in %s\n", malloc_name, lib_name); + exit(-1); + } + dparray_allocator.realloc = NULL; + dparray_allocator.free = (NRT_external_free_func)dlsym(sycldl, free_name); + if (dparray_allocator.free == NULL) { + printf("Did not find %s in %s\n", free_name, lib_name); + exit(-1); + } + get_queue = (void *(*))dlsym(sycldl, get_queue_name); + if (get_queue == NULL) { + printf("Did not find %s in %s\n", get_queue_name, lib_name); + exit(-1); + } + dparray_allocator.opaque_data = get_queue(); +// printf("dparray_memsys_init: %p %p %p\n", dparray_allocator.malloc, dparray_allocator.free, dparray_allocator.opaque_data); +} + +void * dparray_get_ext_allocator(void) { + printf("dparray_get_ext_allocator %p\n", &dparray_allocator); + return (void*)&dparray_allocator; +} + +static PyObject * +get_external_allocator(PyObject *self, PyObject *args) { + return PyLong_FromVoidPtr(dparray_get_ext_allocator()); +} + +static PyMethodDef ext_methods[] = { +#define declmethod_noargs(func) { #func , ( PyCFunction )func , 
METH_NOARGS, NULL } + declmethod_noargs(get_external_allocator), + {NULL}, +#undef declmethod_noargs +}; + +static PyObject * +build_c_helpers_dict(void) +{ + PyObject *dct = PyDict_New(); + if (dct == NULL) + goto error; + +#define _declpointer(name, value) do { \ + PyObject *o = PyLong_FromVoidPtr(value); \ + if (o == NULL) goto error; \ + if (PyDict_SetItemString(dct, name, o)) { \ + Py_DECREF(o); \ + goto error; \ + } \ + Py_DECREF(o); \ +} while (0) + + _declpointer("dparray_get_ext_allocator", &dparray_get_ext_allocator); + +#undef _declpointer + return dct; +error: + Py_XDECREF(dct); + return NULL; +} + +MOD_INIT(_dppl_rt) { + PyObject *m; + MOD_DEF(m, "numba.dppl._dppl_rt", "No docs", ext_methods) + if (m == NULL) + return MOD_ERROR_VAL; + dparray_memsys_init(); + PyModule_AddObject(m, "c_helpers", build_c_helpers_dict()); + return MOD_SUCCESS_VAL(m); +} diff --git a/numba_dppy/tests/test_dparray.py b/numba_dppy/tests/test_dparray.py new file mode 100644 index 0000000000..24dbea43c4 --- /dev/null +++ b/numba_dppy/tests/test_dparray.py @@ -0,0 +1,228 @@ +from __future__ import print_function, division, absolute_import + +import numba +import numba.dppl.dparray as dparray +import numpy +import sys + + +def p1(a): + return a * 2.0 + 13 + + +f1 = numba.njit(p1) + + +@numba.njit() +def f2(a): + return a + + +@numba.njit() +def f3(a, b): # a is dparray, b is numpy + return a * dparray.asarray(b) + + +@numba.njit() +def f4(): + return dparray.ones(10) + + +def p5(a, b): # a is dparray, b is numpy + return a * b + + +f5 = numba.njit(p5) + + +@numba.njit() +def f6(a): + return a + 13 + + +@numba.njit() +def f7(a): # a is dparray + # implicit conversion of a to numpy.ndarray + b = numpy.ones(10) + c = a * b + d = a.argsort() # with no implicit conversion this fails + + +@numba.njit +def f8(a): + return dparray.as_ndarray(a) + + +@numba.njit +def f9(a): + return dparray.from_ndarray(a) + + +@numba.njit +def f10(): + return dparray.empty((10, 10)) + + +@numba.njit +def f11(x): + return x.shape + + +@numba.njit +def f12(x): + return x.T + + +# -------------------------------------------------------------------------------- + +print("------------------- Testing Python Numpy") +sys.stdout.flush() +z1 = numpy.ones(10) +z2 = p1(z1) +print("z2:", z2, type(z2)) +assert type(z2) == numpy.ndarray + +print("------------------- Testing Numba Numpy") +sys.stdout.flush() +z1 = numpy.ones(10) +z2 = f1(z1) +print("z2:", z2, type(z2)) +assert type(z2) == numpy.ndarray + +print("------------------- Testing dparray ones") +sys.stdout.flush() +a = dparray.ones(10) +print("a:", a, type(a)) +assert isinstance(a, dparray.ndarray) +assert dparray.has_array_interface(a) + +print("------------------- Testing dparray.dparray.as_ndarray") +sys.stdout.flush() +nd1 = a.as_ndarray() +print("nd1:", nd1, type(nd1)) +assert type(nd1) == numpy.ndarray + +print("------------------- Testing dparray.as_ndarray") +sys.stdout.flush() +nd2 = dparray.as_ndarray(a) +print("nd2:", nd2, type(nd2)) +assert type(nd2) == numpy.ndarray + +print("------------------- Testing dparray.from_ndarray") +sys.stdout.flush() +dp1 = dparray.from_ndarray(nd2) +print("dp1:", dp1, type(dp1)) +assert isinstance(dp1, dparray.ndarray) +assert dparray.has_array_interface(dp1) + +print("------------------- Testing dparray multiplication") +sys.stdout.flush() +c = a * 5 +print("c", c, type(c)) +assert isinstance(c, dparray.ndarray) +assert dparray.has_array_interface(c) + +print("------------------- Testing Python dparray") +sys.stdout.flush() +b = p1(c) 
+print("b:", b, type(b)) +assert isinstance(b, dparray.ndarray) +assert dparray.has_array_interface(b) +del b + +print("------------------- Testing Python mixing dparray and numpy.ndarray") +sys.stdout.flush() +h = p5(a, z1) +print("h:", h, type(h)) +assert isinstance(h, dparray.ndarray) +assert dparray.has_array_interface(h) +del h + +print("------------------- Testing Numba dparray 2") +sys.stdout.flush() +d = f2(a) +print("d:", d, type(d)) +assert isinstance(d, dparray.ndarray) +assert dparray.has_array_interface(d) +del d + +print("------------------- Testing Numba dparray") +sys.stdout.flush() +b = f1(c) +print("b:", b, type(b)) +assert isinstance(b, dparray.ndarray) +assert dparray.has_array_interface(b) +del b + +""" +print("------------------- Testing Numba dparray constructor from numpy.ndarray") +sys.stdout.flush() +e = f3(a, z1) +print("e:", e, type(e)) +assert(isinstance(e, dparray.ndarray)) +""" + +print("------------------- Testing Numba mixing dparray and constant") +sys.stdout.flush() +g = f6(a) +print("g:", g, type(g)) +assert isinstance(g, dparray.ndarray) +assert dparray.has_array_interface(g) +del g + +print("------------------- Testing Numba mixing dparray and numpy.ndarray") +sys.stdout.flush() +h = f5(a, z1) +print("h:", h, type(h)) +assert isinstance(h, dparray.ndarray) +assert dparray.has_array_interface(h) +del h + +print("------------------- Testing Numba dparray functions") +sys.stdout.flush() +f = f4() +print("f:", f, type(f)) +assert isinstance(f, dparray.ndarray) +assert dparray.has_array_interface(f) +del f + +print("------------------- Testing Numba dparray.as_ndarray") +sys.stdout.flush() +nd3 = f8(a) +print("nd3:", nd3, type(nd3)) +assert type(nd3) == numpy.ndarray + +print("------------------- Testing Numba dparray.from_ndarray") +sys.stdout.flush() +dp2 = f9(nd3) +print("dp2:", dp2, type(dp2)) +assert isinstance(dp2, dparray.ndarray) +assert dparray.has_array_interface(dp2) +del nd3 +del dp2 + +print("------------------- Testing Numba dparray.empty") +sys.stdout.flush() +dp3 = f10() +print("dp3:", dp3, type(dp3)) +assert isinstance(dp3, dparray.ndarray) +assert dparray.has_array_interface(dp3) + +print("------------------- Testing Numba dparray.shape") +sys.stdout.flush() +s1 = f11(dp3) +print("s1:", s1, type(s1)) + +print("------------------- Testing Numba dparray.T") +sys.stdout.flush() +dp4 = f12(dp3) +print("dp4:", dp4, type(dp4)) +assert isinstance(dp4, dparray.ndarray) +assert dparray.has_array_interface(dp4) +del dp3 +del dp4 + +# ------------------------------- +del a + +print("SUCCESS") diff --git a/setup.py b/setup.py index 13f3d782d9..857eca49b1 100644 --- a/setup.py +++ b/setup.py @@ -8,6 +8,13 @@ def get_ext_modules(): ext_modules = [] + ext_dppy = Extension( + name="numba_dppy._dppy_rt", + sources=["numba_dppy/dppl_rt.c"], + depends=["numba/core/runtime/nrt_external.h", "numba/core/runtime/nrt.h"], + ) + ext_modules += [ext_modules] + dpnp_present = False try: import dpnp @@ -65,6 +72,11 @@ def get_ext_modules(): "Topic :: Software Development :: Compilers", ], cmdclass=versioneer.get_cmdclass(), + entry_points={ + "numba_extensions": [ + "init = numba_dppy.dparray:numba_register", + ]}, + ) ) setup(**metadata) From 26d5e2e82fc1019aa40ea4ac948e7dc629e27573 Mon Sep 17 00:00:00 2001 From: "Todd A. 
Anderson" Date: Wed, 9 Dec 2020 15:41:16 -0600 Subject: [PATCH 02/40] Fix build issues for dppy_rt.c --- numba_dppy/dppy_rt.c | 4 ++-- setup.py | 11 +++++------ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/numba_dppy/dppy_rt.c b/numba_dppy/dppy_rt.c index 75c05ff585..140c3fcaf5 100644 --- a/numba_dppy/dppy_rt.c +++ b/numba_dppy/dppy_rt.c @@ -1,5 +1,5 @@ -#include "../_pymodule.h" -#include "../core/runtime/nrt_external.h" +#include "_pymodule.h" +#include "core/runtime/nrt_external.h" #include "assert.h" #include #include diff --git a/setup.py b/setup.py index 857eca49b1..83c7153456 100644 --- a/setup.py +++ b/setup.py @@ -10,10 +10,11 @@ def get_ext_modules(): ext_dppy = Extension( name="numba_dppy._dppy_rt", - sources=["numba_dppy/dppl_rt.c"], - depends=["numba/core/runtime/nrt_external.h", "numba/core/runtime/nrt.h"], + sources=["numba_dppy/dppy_rt.c"], + include_dirs=["../numba/numba"], # Need to get rid of relative paths. + depends=["../numba/numba/core/runtime/nrt_external.h", "../numba/numba/core/runtime/nrt.h", "../numba/numba/_pymodule.h"], ) - ext_modules += [ext_modules] + ext_modules += [ext_dppy] dpnp_present = False try: @@ -45,7 +46,6 @@ def get_ext_modules(): build_requires = ["cython"] install_requires = [ "numba", - "cffi", "dpctl", ] @@ -75,8 +75,7 @@ def get_ext_modules(): entry_points={ "numba_extensions": [ "init = numba_dppy.dparray:numba_register", - ]}, - ) + ]}, ) setup(**metadata) From a320b501974e112af5d2d28ffc97d0d3fc1b53ae Mon Sep 17 00:00:00 2001 From: "Todd A. Anderson" Date: Wed, 9 Dec 2020 16:08:38 -0600 Subject: [PATCH 03/40] Name changes from dparray things to usmarray. --- numba_dppy/dppy_rt.c | 41 ++-- .../{dparray.py => numpy_usm_shared.py} | 94 ++++---- numba_dppy/tests/test_dparray.py | 228 ------------------ numba_dppy/tests/test_usmarray.py | 228 ++++++++++++++++++ setup.py | 2 +- 5 files changed, 296 insertions(+), 297 deletions(-) rename numba_dppy/{dparray.py => numpy_usm_shared.py} (76%) delete mode 100644 numba_dppy/tests/test_dparray.py create mode 100644 numba_dppy/tests/test_usmarray.py diff --git a/numba_dppy/dppy_rt.c b/numba_dppy/dppy_rt.c index 140c3fcaf5..dd892055bf 100644 --- a/numba_dppy/dppy_rt.c +++ b/numba_dppy/dppy_rt.c @@ -4,25 +4,25 @@ #include #include -NRT_ExternalAllocator dparray_allocator; +NRT_ExternalAllocator usmarray_allocator; -void dparray_memsys_init(void) { +void usmarray_memsys_init(void) { void *(*get_queue)(void); - char *lib_name = "libDPPLSyclInterface.so"; - char *malloc_name = "DPPLmalloc_shared"; - char *free_name = "DPPLfree_with_queue"; - char *get_queue_name = "DPPLQueueMgr_GetCurrentQueue"; + char *lib_name = "libDPCTLSyclInterface.so"; + char *malloc_name = "DPCTLmalloc_shared"; + char *free_name = "DPCTLfree_with_queue"; + char *get_queue_name = "DPCTLQueueMgr_GetCurrentQueue"; void *sycldl = dlopen(lib_name, RTLD_NOW); assert(sycldl != NULL); - dparray_allocator.malloc = (NRT_external_malloc_func)dlsym(sycldl, malloc_name); - if (dparray_allocator.malloc == NULL) { + usmarray_allocator.malloc = (NRT_external_malloc_func)dlsym(sycldl, malloc_name); + if (usmarray_allocator.malloc == NULL) { printf("Did not find %s in %s\n", malloc_name, lib_name); exit(-1); } - dparray_allocator.realloc = NULL; - dparray_allocator.free = (NRT_external_free_func)dlsym(sycldl, free_name); - if (dparray_allocator.free == NULL) { + usmarray_allocator.realloc = NULL; + usmarray_allocator.free = (NRT_external_free_func)dlsym(sycldl, free_name); + if (usmarray_allocator.free == NULL) { printf("Did not find 
%s in %s\n", free_name, lib_name); exit(-1); } @@ -31,18 +31,17 @@ void dparray_memsys_init(void) { printf("Did not find %s in %s\n", get_queue_name, lib_name); exit(-1); } - dparray_allocator.opaque_data = get_queue(); -// printf("dparray_memsys_init: %p %p %p\n", dparray_allocator.malloc, dparray_allocator.free, dparray_allocator.opaque_data); + usmarray_allocator.opaque_data = get_queue(); } -void * dparray_get_ext_allocator(void) { - printf("dparray_get_ext_allocator %p\n", &dparray_allocator); - return (void*)&dparray_allocator; +void * usmarray_get_ext_allocator(void) { + printf("usmarray_get_ext_allocator %p\n", &usmarray_allocator); + return (void*)&usmarray_allocator; } static PyObject * get_external_allocator(PyObject *self, PyObject *args) { - return PyLong_FromVoidPtr(dparray_get_ext_allocator()); + return PyLong_FromVoidPtr(usmarray_get_ext_allocator()); } static PyMethodDef ext_methods[] = { @@ -69,7 +68,7 @@ build_c_helpers_dict(void) Py_DECREF(o); \ } while (0) - _declpointer("dparray_get_ext_allocator", &dparray_get_ext_allocator); + _declpointer("usmarray_get_ext_allocator", &usmarray_get_ext_allocator); #undef _declpointer return dct; @@ -78,12 +77,12 @@ build_c_helpers_dict(void) return NULL; } -MOD_INIT(_dppl_rt) { +MOD_INIT(_dppy_rt) { PyObject *m; - MOD_DEF(m, "numba.dppl._dppl_rt", "No docs", ext_methods) + MOD_DEF(m, "numba_dppy._dppy_rt", "No docs", ext_methods) if (m == NULL) return MOD_ERROR_VAL; - dparray_memsys_init(); + usmarray_memsys_init(); PyModule_AddObject(m, "c_helpers", build_c_helpers_dict()); return MOD_SUCCESS_VAL(m); } diff --git a/numba_dppy/dparray.py b/numba_dppy/numpy_usm_shared.py similarity index 76% rename from numba_dppy/dparray.py rename to numba_dppy/numpy_usm_shared.py index 654230fc81..0d190b1317 100644 --- a/numba_dppy/dparray.py +++ b/numba_dppy/numpy_usm_shared.py @@ -1,5 +1,5 @@ # This class creates a type in Numba. -class DPArrayType(types.Array): +class UsmSharedArrayType(types.Array): def __init__( self, dtype, @@ -11,8 +11,8 @@ def __init__( addrspace=None, ): # This name defines how this type will be shown in Numba's type dumps. - name = "DPArray:ndarray(%s, %sd, %s)" % (dtype, ndim, layout) - super(DPArrayType, self).__init__( + name = "UsmArray:ndarray(%s, %sd, %s)" % (dtype, ndim, layout) + super(UsmSharedArrayType, self).__init__( dtype, ndim, layout, @@ -22,19 +22,19 @@ def __init__( addrspace=addrspace, ) - # Tell Numba typing how to combine DPArrayType with other ndarray types. + # Tell Numba typing how to combine UsmSharedArrayType with other ndarray types. def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): if method == "__call__": for inp in inputs: - if not isinstance(inp, (DPArrayType, types.Array, types.Number)): + if not isinstance(inp, (UsmSharedArrayType, types.Array, types.Number)): return None - return DPArrayType + return UsmSharedArrayType else: return None -# This tells Numba how to create a DPArrayType when a dparray is passed +# This tells Numba how to create a UsmSharedArrayType when a usmarray is passed # into a njit function. 
@typeof_impl.register(ndarray) def typeof_ta_ndarray(val, c): @@ -44,16 +44,16 @@ def typeof_ta_ndarray(val, c): raise ValueError("Unsupported array dtype: %s" % (val.dtype,)) layout = numpy_support.map_layout(val) readonly = not val.flags.writeable - return DPArrayType(dtype, val.ndim, layout, readonly=readonly) + return UsmSharedArrayType(dtype, val.ndim, layout, readonly=readonly) # This tells Numba to use the default Numpy ndarray data layout for -# object of type DPArray. -register_model(DPArrayType)(numba.core.datamodel.models.ArrayModel) +# object of type UsmArray. +register_model(UsmSharedArrayType)(numba.core.datamodel.models.ArrayModel) # This tells Numba how to convert from its native representation -# of a DPArray in a njit function back to a Python DPArray. -@box(DPArrayType) +# of a UsmArray in a njit function back to a Python UsmArray. +@box(UsmSharedArrayType) def box_array(typ, val, c): nativearycls = c.context.make_array(typ) nativeary = nativearycls(c.context, c.builder, value=val) @@ -70,9 +70,9 @@ def box_array(typ, val, c): # This tells Numba to use this function when it needs to allocate a -# DPArray in a njit function. -@allocator(DPArrayType) -def allocator_DPArray(context, builder, size, align): +# UsmArray in a njit function. +@allocator(UsmSharedArrayType) +def allocator_UsmArray(context, builder, size, align): context.nrt._require_nrt() mod = builder.module @@ -81,7 +81,7 @@ def allocator_DPArray(context, builder, size, align): # Get the Numba external allocator for USM memory. ext_allocator_fnty = ir.FunctionType(cgutils.voidptr_t, []) ext_allocator_fn = mod.get_or_insert_function( - ext_allocator_fnty, name="dparray_get_ext_allocator" + ext_allocator_fnty, name="usmarray_get_ext_allocator" ) ext_allocator = builder.call(ext_allocator_fn, []) # Get the Numba function to allocate an aligned array with an external allocator. @@ -109,14 +109,14 @@ def numba_register(): # Copy a function registered as a lowerer in Numba but change the -# "np" import in Numba to point to dparray instead of NumPy. -def copy_func_for_dparray(f, dparray_mod): +# "np" import in Numba to point to usmarray instead of NumPy. +def copy_func_for_usmarray(f, usmarray_mod): import copy as cc # Make a copy so our change below doesn't affect anything else. gglobals = cc.copy(f.__globals__) - # Make the "np"'s in the code use dparray instead of Numba's default NumPy. - gglobals["np"] = dparray_mod + # Make the "np"'s in the code use usmarray instead of Numba's default NumPy. + gglobals["np"] = usmarray_mod # Create a new function using the original code but the new globals. g = ftype(f.__code__, gglobals, None, f.__defaults__, f.__closure__) # Some other tricks to make sure the function copy works. @@ -126,7 +126,7 @@ def copy_func_for_dparray(f, dparray_mod): def types_replace_array(x): - return tuple([z if z != types.Array else DPArrayType for z in x]) + return tuple([z if z != types.Array else UsmSharedArrayType for z in x]) def numba_register_lower_builtin(): @@ -141,35 +141,35 @@ def numba_register_lower_builtin(): # If it is a Numpy function... if isinstance(func, ftype): if func.__module__ == np.__name__: - # If we have overloaded that function in the dparray module (always True right now)... + # If we have overloaded that function in the usmarray module (always True right now)... if func.__name__ in functions_list: todo.append(ig) if isinstance(func, bftype): if func.__module__ == np.__name__: - # If we have overloaded that function in the dparray module (always True right now)... 
+ # If we have overloaded that function in the usmarray module (always True right now)... if func.__name__ in functions_list: todo.append(ig) for lg in lower_registry.getattrs: func, attr, types = lg - types_with_dparray = types_replace_array(types) - if DPArrayType in types_with_dparray: + types_with_usmarray = types_replace_array(types) + if UsmSharedArrayType in types_with_usmarray: dprint( "lower_getattr:", func, type(func), attr, type(attr), types, type(types) ) - todo_getattr.append((func, attr, types_with_dparray)) + todo_getattr.append((func, attr, types_with_usmarray)) for lg in todo_getattr: lower_registry.getattrs.append(lg) cur_mod = importlib.import_module(__name__) for impl, func, types in todo + todo_builtin: - dparray_func = eval(func.__name__) + usmarray_func = eval(func.__name__) dprint( - "need to re-register lowerer for dparray", impl, func, types, dparray_func + "need to re-register lowerer for usmarray", impl, func, types, usmarray_func ) - new_impl = copy_func_for_dparray(impl, cur_mod) - lower_registry.functions.append((new_impl, dparray_func, types)) + new_impl = copy_func_for_usmarray(impl, cur_mod) + lower_registry.functions.append((new_impl, usmarray_func, types)) def argspec_to_string(argspec): @@ -190,7 +190,7 @@ def numba_register_typing(): val, typ = ig # If it is a Numpy function... if isinstance(val, (ftype, bftype)): - # If we have overloaded that function in the dparray module (always True right now)... + # If we have overloaded that function in the usmarray module (always True right now)... if val.__name__ in functions_list: todo.append(ig) if isinstance(val, type): @@ -205,11 +205,11 @@ def numba_register_typing(): # template is the typing class to invoke generic() upon. template = typ.templates[0] dpval = eval(val.__name__) - dprint("need to re-register for dparray", val, typ, typ.typing_key) + dprint("need to re-register for usmarray", val, typ, typ.typing_key) """ if debug: print("--------------------------------------------------------------") - print("need to re-register for dparray", val, typ, typ.typing_key) + print("need to re-register for usmarray", val, typ, typ.typing_key) print("val:", val, type(val), "dir val", dir(val)) print("typ:", typ, type(typ), "dir typ", dir(typ)) print("typing key:", typ.typing_key) @@ -239,7 +239,7 @@ def generic_impl(self): original_res = original_typer({}) #print("original_res:", original_res) if isinstance(original_res, types.Array): - return DPArrayType(dtype=original_res.dtype, ndim=original_res.ndim, layout=original_res.layout) + return UsmSharedArrayType(dtype=original_res.dtype, ndim=original_res.ndim, layout=original_res.layout) return original_res""".format( astr, ",".join(ot_argspec.args) @@ -271,21 +271,21 @@ def generic_impl(self): # print("exec_res:", exec_res) return exec_res - new_dparray_template = type( + new_usmarray_template = type( class_name, (template,), {"set_class_vars": set_key_original, "generic": generic_impl}, ) - new_dparray_template.set_class_vars(dpval, template) + new_usmarray_template.set_class_vars(dpval, template) assert callable(dpval) - type_handler = types.Function(new_dparray_template) + type_handler = types.Function(new_usmarray_template) typing_registry.register_global(dpval, type_handler) - # Handle dparray attribute typing. + # Handle usmarray attribute typing. 
for tgetattr in todo_getattr: - class_name = tgetattr.__name__ + "_dparray" + class_name = tgetattr.__name__ + "_usmarray" dprint("tgetattr:", tgetattr, type(tgetattr), class_name) @classmethod @@ -298,7 +298,7 @@ def getattr_impl(self, attr): def wrapper(*args, **kwargs): attr_res = tgetattr.__getattribute__(self, attr)(*args, **kwargs) if isinstance(attr_res, types.Array): - return DPArrayType( + return UsmSharedArrayType( dtype=attr_res.dtype, ndim=attr_res.ndim, layout=attr_res.layout, @@ -308,14 +308,14 @@ def wrapper(*args, **kwargs): else: return tgetattr.__getattribute__(self, attr) - new_dparray_template = type( + new_usmarray_template = type( class_name, (tgetattr,), {"set_class_vars": set_key, "__getattribute__": getattr_impl}, ) - new_dparray_template.set_class_vars(DPArrayType) - templates_registry.register_attr(new_dparray_template) + new_usmarray_template.set_class_vars(UsmSharedArrayType) + templates_registry.register_attr(new_usmarray_template) def from_ndarray(x): @@ -339,16 +339,16 @@ def typer(arg): class DparrayFromNdarray(CallableTemplate): def generic(self): def typer(arg): - return DPArrayType(dtype=arg.dtype, ndim=arg.ndim, layout=arg.layout) + return UsmSharedArrayType(dtype=arg.dtype, ndim=arg.ndim, layout=arg.layout) return typer -@lower_registry.lower(as_ndarray, DPArrayType) -def dparray_conversion_as(context, builder, sig, args): +@lower_registry.lower(as_ndarray, UsmSharedArrayType) +def usmarray_conversion_as(context, builder, sig, args): return _array_copy(context, builder, sig, args) @lower_registry.lower(from_ndarray, types.Array) -def dparray_conversion_from(context, builder, sig, args): +def usmarray_conversion_from(context, builder, sig, args): return _array_copy(context, builder, sig, args) diff --git a/numba_dppy/tests/test_dparray.py b/numba_dppy/tests/test_dparray.py deleted file mode 100644 index 24dbea43c4..0000000000 --- a/numba_dppy/tests/test_dparray.py +++ /dev/null @@ -1,228 +0,0 @@ -from __future__ import print_function, division, absolute_import - -import numba -import numba.dppl.dparray as dparray -import numpy -import sys - - -def p1(a): - return a * 2.0 + 13 - - -f1 = numba.njit(p1) - - -@numba.njit() -def f2(a): - return a - - -@numba.njit() -def f3(a, b): # a is dparray, b is numpy - return a * dparray.asarray(b) - - -@numba.njit() -def f4(): - return dparray.ones(10) - - -def p5(a, b): # a is dparray, b is numpy - return a * b - - -f5 = numba.njit(p5) - - -@numba.njit() -def f6(a): - return a + 13 - - -@numba.njit() -def f7(a): # a is dparray - # implicit conversion of a to numpy.ndarray - b = numpy.ones(10) - c = a * b - d = a.argsort() # with no implicit conversion this fails - - -@numba.njit -def f8(a): - return dparray.as_ndarray(a) - - -@numba.njit -def f9(a): - return dparray.from_ndarray(a) - - -@numba.njit -def f10(): - return dparray.empty((10, 10)) - - -@numba.njit -def f11(x): - return x.shape - - -@numba.njit -def f12(x): - return x.T - - -# -------------------------------------------------------------------------------- - -print("------------------- Testing Python Numpy") -sys.stdout.flush() -z1 = numpy.ones(10) -z2 = p1(z1) -print("z2:", z2, type(z2)) -assert type(z2) == numpy.ndarray - -print("------------------- Testing Numba Numpy") -sys.stdout.flush() -z1 = numpy.ones(10) -z2 = f1(z1) -print("z2:", z2, type(z2)) -assert type(z2) == numpy.ndarray - -print("------------------- Testing dparray ones") -sys.stdout.flush() -a = dparray.ones(10) -print("a:", a, type(a)) -assert isinstance(a, dparray.ndarray) -assert 
dparray.has_array_interface(a) - -print("------------------- Testing dparray.dparray.as_ndarray") -sys.stdout.flush() -nd1 = a.as_ndarray() -print("nd1:", nd1, type(nd1)) -assert type(nd1) == numpy.ndarray - -print("------------------- Testing dparray.as_ndarray") -sys.stdout.flush() -nd2 = dparray.as_ndarray(a) -print("nd2:", nd2, type(nd2)) -assert type(nd2) == numpy.ndarray - -print("------------------- Testing dparray.from_ndarray") -sys.stdout.flush() -dp1 = dparray.from_ndarray(nd2) -print("dp1:", dp1, type(dp1)) -assert isinstance(dp1, dparray.ndarray) -assert dparray.has_array_interface(dp1) - -print("------------------- Testing dparray multiplication") -sys.stdout.flush() -c = a * 5 -print("c", c, type(c)) -assert isinstance(c, dparray.ndarray) -assert dparray.has_array_interface(c) - -print("------------------- Testing Python dparray") -sys.stdout.flush() -b = p1(c) -print("b:", b, type(b)) -assert isinstance(b, dparray.ndarray) -assert dparray.has_array_interface(b) -del b - -print("------------------- Testing Python mixing dparray and numpy.ndarray") -sys.stdout.flush() -h = p5(a, z1) -print("h:", h, type(h)) -assert isinstance(h, dparray.ndarray) -assert dparray.has_array_interface(h) -del h - -print("------------------- Testing Numba dparray 2") -sys.stdout.flush() -d = f2(a) -print("d:", d, type(d)) -assert isinstance(d, dparray.ndarray) -assert dparray.has_array_interface(d) -del d - -print("------------------- Testing Numba dparray") -sys.stdout.flush() -b = f1(c) -print("b:", b, type(b)) -assert isinstance(b, dparray.ndarray) -assert dparray.has_array_interface(b) -del b - -""" -print("------------------- Testing Numba dparray constructor from numpy.ndarray") -sys.stdout.flush() -e = f3(a, z1) -print("e:", e, type(e)) -assert(isinstance(e, dparray.ndarray)) -""" - -print("------------------- Testing Numba mixing dparray and constant") -sys.stdout.flush() -g = f6(a) -print("g:", g, type(g)) -assert isinstance(g, dparray.ndarray) -assert dparray.has_array_interface(g) -del g - -print("------------------- Testing Numba mixing dparray and numpy.ndarray") -sys.stdout.flush() -h = f5(a, z1) -print("h:", h, type(h)) -assert isinstance(h, dparray.ndarray) -assert dparray.has_array_interface(h) -del h - -print("------------------- Testing Numba dparray functions") -sys.stdout.flush() -f = f4() -print("f:", f, type(f)) -assert isinstance(f, dparray.ndarray) -assert dparray.has_array_interface(f) -del f - -print("------------------- Testing Numba dparray.as_ndarray") -sys.stdout.flush() -nd3 = f8(a) -print("nd3:", nd3, type(nd3)) -assert type(nd3) == numpy.ndarray - -print("------------------- Testing Numba dparray.from_ndarray") -sys.stdout.flush() -dp2 = f9(nd3) -print("dp2:", dp2, type(dp2)) -assert isinstance(dp2, dparray.ndarray) -assert dparray.has_array_interface(dp2) -del nd3 -del dp2 - -print("------------------- Testing Numba dparray.empty") -sys.stdout.flush() -dp3 = f10() -print("dp3:", dp3, type(dp3)) -assert isinstance(dp3, dparray.ndarray) -assert dparray.has_array_interface(dp3) - -print("------------------- Testing Numba dparray.shape") -sys.stdout.flush() -s1 = f11(dp3) -print("s1:", s1, type(s1)) - -print("------------------- Testing Numba dparray.T") -sys.stdout.flush() -dp4 = f12(dp3) -print("dp4:", dp4, type(dp4)) -assert isinstance(dp4, dparray.ndarray) -assert dparray.has_array_interface(dp4) -del dp3 -del dp4 - -# ------------------------------- -del a - -print("SUCCESS") diff --git a/numba_dppy/tests/test_usmarray.py b/numba_dppy/tests/test_usmarray.py new 
file mode 100644 index 0000000000..fe1be71c9e --- /dev/null +++ b/numba_dppy/tests/test_usmarray.py @@ -0,0 +1,228 @@ +from __future__ import print_function, division, absolute_import + +import numba +import dpctl.dptensor.numpy_usm_shared as usmarray +import numpy +import sys + + +def p1(a): + return a * 2.0 + 13 + + +f1 = numba.njit(p1) + + +@numba.njit() +def f2(a): + return a + + +@numba.njit() +def f3(a, b): # a is usmarray, b is numpy + return a * usmarray.asarray(b) + + +@numba.njit() +def f4(): + return usmarray.ones(10) + + +def p5(a, b): # a is usmarray, b is numpy + return a * b + + +f5 = numba.njit(p5) + + +@numba.njit() +def f6(a): + return a + 13 + + +@numba.njit() +def f7(a): # a is usmarray + # implicit conversion of a to numpy.ndarray + b = numpy.ones(10) + c = a * b + d = a.argsort() # with no implicit conversion this fails + + +@numba.njit +def f8(a): + return usmarray.as_ndarray(a) + + +@numba.njit +def f9(a): + return usmarray.from_ndarray(a) + + +@numba.njit +def f10(): + return usmarray.empty((10, 10)) + + +@numba.njit +def f11(x): + return x.shape + + +@numba.njit +def f12(x): + return x.T + + +# -------------------------------------------------------------------------------- + +print("------------------- Testing Python Numpy") +sys.stdout.flush() +z1 = numpy.ones(10) +z2 = p1(z1) +print("z2:", z2, type(z2)) +assert type(z2) == numpy.ndarray + +print("------------------- Testing Numba Numpy") +sys.stdout.flush() +z1 = numpy.ones(10) +z2 = f1(z1) +print("z2:", z2, type(z2)) +assert type(z2) == numpy.ndarray + +print("------------------- Testing usmarray ones") +sys.stdout.flush() +a = usmarray.ones(10) +print("a:", a, type(a)) +assert isinstance(a, usmarray.ndarray) +assert usmarray.has_array_interface(a) + +print("------------------- Testing usmarray.usmarray.as_ndarray") +sys.stdout.flush() +nd1 = a.as_ndarray() +print("nd1:", nd1, type(nd1)) +assert type(nd1) == numpy.ndarray + +print("------------------- Testing usmarray.as_ndarray") +sys.stdout.flush() +nd2 = usmarray.as_ndarray(a) +print("nd2:", nd2, type(nd2)) +assert type(nd2) == numpy.ndarray + +print("------------------- Testing usmarray.from_ndarray") +sys.stdout.flush() +dp1 = usmarray.from_ndarray(nd2) +print("dp1:", dp1, type(dp1)) +assert isinstance(dp1, usmarray.ndarray) +assert usmarray.has_array_interface(dp1) + +print("------------------- Testing usmarray multiplication") +sys.stdout.flush() +c = a * 5 +print("c", c, type(c)) +assert isinstance(c, usmarray.ndarray) +assert usmarray.has_array_interface(c) + +print("------------------- Testing Python usmarray") +sys.stdout.flush() +b = p1(c) +print("b:", b, type(b)) +assert isinstance(b, usmarray.ndarray) +assert usmarray.has_array_interface(b) +del b + +print("------------------- Testing Python mixing usmarray and numpy.ndarray") +sys.stdout.flush() +h = p5(a, z1) +print("h:", h, type(h)) +assert isinstance(h, usmarray.ndarray) +assert usmarray.has_array_interface(h) +del h + +print("------------------- Testing Numba usmarray 2") +sys.stdout.flush() +d = f2(a) +print("d:", d, type(d)) +assert isinstance(d, usmarray.ndarray) +assert usmarray.has_array_interface(d) +del d + +print("------------------- Testing Numba usmarray") +sys.stdout.flush() +b = f1(c) +print("b:", b, type(b)) +assert isinstance(b, usmarray.ndarray) +assert usmarray.has_array_interface(b) +del b + +""" +print("------------------- Testing Numba usmarray constructor from numpy.ndarray") +sys.stdout.flush() +e = f3(a, z1) +print("e:", e, type(e)) +assert(isinstance(e, 
usmarray.ndarray)) +""" + +print("------------------- Testing Numba mixing usmarray and constant") +sys.stdout.flush() +g = f6(a) +print("g:", g, type(g)) +assert isinstance(g, usmarray.ndarray) +assert usmarray.has_array_interface(g) +del g + +print("------------------- Testing Numba mixing usmarray and numpy.ndarray") +sys.stdout.flush() +h = f5(a, z1) +print("h:", h, type(h)) +assert isinstance(h, usmarray.ndarray) +assert usmarray.has_array_interface(h) +del h + +print("------------------- Testing Numba usmarray functions") +sys.stdout.flush() +f = f4() +print("f:", f, type(f)) +assert isinstance(f, usmarray.ndarray) +assert usmarray.has_array_interface(f) +del f + +print("------------------- Testing Numba usmarray.as_ndarray") +sys.stdout.flush() +nd3 = f8(a) +print("nd3:", nd3, type(nd3)) +assert type(nd3) == numpy.ndarray + +print("------------------- Testing Numba usmarray.from_ndarray") +sys.stdout.flush() +dp2 = f9(nd3) +print("dp2:", dp2, type(dp2)) +assert isinstance(dp2, usmarray.ndarray) +assert usmarray.has_array_interface(dp2) +del nd3 +del dp2 + +print("------------------- Testing Numba usmarray.empty") +sys.stdout.flush() +dp3 = f10() +print("dp3:", dp3, type(dp3)) +assert isinstance(dp3, usmarray.ndarray) +assert usmarray.has_array_interface(dp3) + +print("------------------- Testing Numba usmarray.shape") +sys.stdout.flush() +s1 = f11(dp3) +print("s1:", s1, type(s1)) + +print("------------------- Testing Numba usmarray.T") +sys.stdout.flush() +dp4 = f12(dp3) +print("dp4:", dp4, type(dp4)) +assert isinstance(dp4, usmarray.ndarray) +assert usmarray.has_array_interface(dp4) +del dp3 +del dp4 + +# ------------------------------- +del a + +print("SUCCESS") diff --git a/setup.py b/setup.py index 83c7153456..b870c50a8f 100644 --- a/setup.py +++ b/setup.py @@ -74,7 +74,7 @@ def get_ext_modules(): cmdclass=versioneer.get_cmdclass(), entry_points={ "numba_extensions": [ - "init = numba_dppy.dparray:numba_register", + "init = numba_dppy.usmarray:numba_register", ]}, ) From f9de97a5099b44b4f9033966232b6d38fc437ea6 Mon Sep 17 00:00:00 2001 From: Elena Totmenina Date: Fri, 4 Dec 2020 15:39:40 +0300 Subject: [PATCH 04/40] Delete old backup file (#45) Co-authored-by: etotmeni --- .../parfor_loop_invariant_hoisting.py.bkp | 213 ------------------ 1 file changed, 213 deletions(-) delete mode 100644 numba_dppy/parfor_loop_invariant_hoisting.py.bkp diff --git a/numba_dppy/parfor_loop_invariant_hoisting.py.bkp b/numba_dppy/parfor_loop_invariant_hoisting.py.bkp deleted file mode 100644 index fb37a1c97b..0000000000 --- a/numba_dppy/parfor_loop_invariant_hoisting.py.bkp +++ /dev/null @@ -1,213 +0,0 @@ -from __future__ import print_function, division, absolute_import - -def add_to_def_once_sets(a_def, def_once, def_more): - '''If the variable is already defined more than once, do nothing. - Else if defined exactly once previously then transition this - variable to the defined more than once set (remove it from - def_once set and add to def_more set). - Else this must be the first time we've seen this variable defined - so add to def_once set. - ''' - if a_def in def_more: - pass - elif a_def in def_once: - def_more.add(a_def) - def_once.remove(a_def) - else: - def_once.add(a_def) - -def compute_def_once_block(block, def_once, def_more, getattr_taken, typemap, module_assigns): - '''Effect changes to the set of variables defined once or more than once - for a single block. 
- block - the block to process - def_once - set of variable names known to be defined exactly once - def_more - set of variable names known to be defined more than once - getattr_taken - dict mapping variable name to tuple of object and attribute taken - module_assigns - dict mapping variable name to the Global that they came from - ''' - # The only "defs" occur in assignments, so find such instructions. - assignments = block.find_insts(ir.Assign) - # For each assignment... - for one_assign in assignments: - # Get the LHS/target of the assignment. - a_def = one_assign.target.name - # Add variable to def sets. - add_to_def_once_sets(a_def, def_once, def_more) - - rhs = one_assign.value - if isinstance(rhs, ir.Global): - # Remember assignments of the form "a = Global(...)" - # Is this a module? - if isinstance(rhs.value, pytypes.ModuleType): - module_assigns[a_def] = rhs.value.__name__ - if isinstance(rhs, ir.Expr) and rhs.op == 'getattr' and rhs.value.name in def_once: - # Remember assignments of the form "a = b.c" - getattr_taken[a_def] = (rhs.value.name, rhs.attr) - if isinstance(rhs, ir.Expr) and rhs.op == 'call' and rhs.func.name in getattr_taken: - # If "a" is being called then lookup the getattr definition of "a" - # as above, getting the module variable "b" (base_obj) - # and the attribute "c" (base_attr). - base_obj, base_attr = getattr_taken[rhs.func.name] - if base_obj in module_assigns: - # If we know the definition of the module variable then get the module - # name from module_assigns. - base_mod_name = module_assigns[base_obj] - if not is_const_call(base_mod_name, base_attr): - # Calling a method on an object could modify the object and is thus - # like a def of that object. We call is_const_call to see if this module/attribute - # combination is known to not modify the module state. If we don't know that - # the combination is safe then we have to assume there could be a modification to - # the module and thus add the module variable as defined more than once. - add_to_def_once_sets(base_obj, def_once, def_more) - else: - # Assume the worst and say that base_obj could be modified by the call. - add_to_def_once_sets(base_obj, def_once, def_more) - if isinstance(rhs, ir.Expr) and rhs.op == 'call': - # If a mutable object is passed to a function, then it may be changed and - # therefore can't be hoisted. - # For each argument to the function... - for argvar in rhs.args: - # Get the argument's type. - if isinstance(argvar, ir.Var): - argvar = argvar.name - avtype = typemap[argvar] - # If that type doesn't have a mutable attribute or it does and it's set to - # not mutable then this usage is safe for hoisting. - if getattr(avtype, 'mutable', False): - # Here we have a mutable variable passed to a function so add this variable - # to the def lists. - add_to_def_once_sets(argvar, def_once, def_more) - -def compute_def_once_internal(loop_body, def_once, def_more, getattr_taken, typemap, module_assigns): - '''Compute the set of variables defined exactly once in the given set of blocks - and use the given sets for storing which variables are defined once, more than - once and which have had a getattr call on them. - ''' - # For each block... - for label, block in loop_body.items(): - # Scan this block and effect changes to def_once, def_more, and getattr_taken - # based on the instructions in that block. - compute_def_once_block(block, def_once, def_more, getattr_taken, typemap, module_assigns) - # Have to recursively process parfors manually here. 
- for inst in block.body: - if isinstance(inst, parfor.Parfor): - # Recursively compute for the parfor's init block. - compute_def_once_block(inst.init_block, def_once, def_more, getattr_taken, typemap, module_assigns) - # Recursively compute for the parfor's loop body. - compute_def_once_internal(inst.loop_body, def_once, def_more, getattr_taken, typemap, module_assigns) - -def compute_def_once(loop_body, typemap): - '''Compute the set of variables defined exactly once in the given set of blocks. - ''' - def_once = set() # set to hold variables defined exactly once - def_more = set() # set to hold variables defined more than once - getattr_taken = {} - module_assigns = {} - compute_def_once_internal(loop_body, def_once, def_more, getattr_taken, typemap, module_assigns) - return def_once - -def find_vars(var, varset): - assert isinstance(var, ir.Var) - varset.add(var.name) - return var - -def _hoist_internal(inst, dep_on_param, call_table, hoisted, not_hoisted, - typemap, stored_arrays): - if inst.target.name in stored_arrays: - not_hoisted.append((inst, "stored array")) - if config.DEBUG_ARRAY_OPT >= 1: - print("Instruction", inst, " could not be hoisted because the created array is stored.") - return False - - uses = set() - visit_vars_inner(inst.value, find_vars, uses) - diff = uses.difference(dep_on_param) - if config.DEBUG_ARRAY_OPT >= 1: - print("_hoist_internal:", inst, "uses:", uses, "diff:", diff) - if len(diff) == 0 and is_pure(inst.value, None, call_table): - if config.DEBUG_ARRAY_OPT >= 1: - print("Will hoist instruction", inst, typemap[inst.target.name]) - hoisted.append(inst) - if not isinstance(typemap[inst.target.name], types.npytypes.Array): - dep_on_param += [inst.target.name] - return True - else: - if len(diff) > 0: - not_hoisted.append((inst, "dependency")) - if config.DEBUG_ARRAY_OPT >= 1: - print("Instruction", inst, " could not be hoisted because of a dependency.") - else: - not_hoisted.append((inst, "not pure")) - if config.DEBUG_ARRAY_OPT >= 1: - print("Instruction", inst, " could not be hoisted because it isn't pure.") - return False - -def find_setitems_block(setitems, itemsset, block, typemap): - for inst in block.body: - if isinstance(inst, ir.StaticSetItem) or isinstance(inst, ir.SetItem): - setitems.add(inst.target.name) - # If we store a non-mutable object into an array then that is safe to hoist. - # If the stored object is mutable and you hoist then multiple entries in the - # outer array could reference the same object and changing one index would then - # change other indices. - if getattr(typemap[inst.value.name], "mutable", False): - itemsset.add(inst.value.name) - elif isinstance(inst, parfor.Parfor): - find_setitems_block(setitems, itemsset, inst.init_block, typemap) - find_setitems_body(setitems, itemsset, inst.loop_body, typemap) - -def find_setitems_body(setitems, itemsset, loop_body, typemap): - """ - Find the arrays that are written into (goes into setitems) and the - mutable objects (mostly arrays) that are written into other arrays - (goes into itemsset). - """ - for label, block in loop_body.items(): - find_setitems_block(setitems, itemsset, block, typemap) - -def hoist(parfor_params, loop_body, typemap, wrapped_blocks): - dep_on_param = copy.copy(parfor_params) - hoisted = [] - not_hoisted = [] - - # Compute the set of variable defined exactly once in the loop body. 
- def_once = compute_def_once(loop_body, typemap) - (call_table, reverse_call_table) = get_call_table(wrapped_blocks) - - setitems = set() - itemsset = set() - find_setitems_body(setitems, itemsset, loop_body, typemap) - dep_on_param = list(set(dep_on_param).difference(setitems)) - if config.DEBUG_ARRAY_OPT >= 1: - print("hoist - def_once:", def_once, "setitems:", - setitems, "itemsset:", itemsset, "dep_on_param:", - dep_on_param, "parfor_params:", parfor_params) - - for label, block in loop_body.items(): - new_block = [] - for inst in block.body: - if isinstance(inst, ir.Assign) and inst.target.name in def_once: - if _hoist_internal(inst, dep_on_param, call_table, - hoisted, not_hoisted, typemap, itemsset): - # don't add this instruction to the block since it is - # hoisted - continue - elif isinstance(inst, parfor.Parfor): - new_init_block = [] - if config.DEBUG_ARRAY_OPT >= 1: - print("parfor") - inst.dump() - for ib_inst in inst.init_block.body: - if (isinstance(ib_inst, ir.Assign) and - ib_inst.target.name in def_once): - if _hoist_internal(ib_inst, dep_on_param, call_table, - hoisted, not_hoisted, typemap, itemsset): - # don't add this instuction to the block since it is hoisted - continue - new_init_block.append(ib_inst) - inst.init_block.body = new_init_block - - new_block.append(inst) - block.body = new_block - return hoisted, not_hoisted - From b8c11f77285ad77db688854cc039f74a29a8d0c3 Mon Sep 17 00:00:00 2001 From: Elena Totmenina Date: Fri, 4 Dec 2020 15:41:15 +0300 Subject: [PATCH 05/40] Del dppl dir in tests (#43) * Del dppl dir in tests * Del unused var Co-authored-by: etotmeni --- numba_dppy/tests/__init__.py | 3 +-- numba_dppy/tests/dppl/__init__.py | 6 ------ numba_dppy/tests/{dppl => }/test_arg_accessor.py | 0 numba_dppy/tests/{dppl => }/test_arg_types.py | 0 numba_dppy/tests/{dppl => }/test_atomic_op.py | 0 numba_dppy/tests/{dppl => }/test_barrier.py | 0 numba_dppy/tests/{dppl => }/test_black_scholes.py | 0 numba_dppy/tests/{dppl => }/test_caching.py | 0 numba_dppy/tests/{dppl => }/test_device_array_args.py | 0 numba_dppy/tests/{dppl => }/test_dpctl_api.py | 0 numba_dppy/tests/{dppl => }/test_dpnp_functions.py | 0 numba_dppy/tests/{dppl => }/test_dppl_fallback.py | 0 numba_dppy/tests/{dppl => }/test_dppl_func.py | 0 numba_dppy/tests/{dppl => }/test_math_functions.py | 0 .../tests/{dppl => }/test_numpy_bit_twiddling_functions.py | 0 .../tests/{dppl => }/test_numpy_comparison_functions.py | 0 .../tests/{dppl => }/test_numpy_floating_functions.py | 0 numba_dppy/tests/{dppl => }/test_numpy_math_functions.py | 0 .../tests/{dppl => }/test_numpy_trigonomteric_functions.py | 0 numba_dppy/tests/{dppl => }/test_parfor_lower_message.py | 0 numba_dppy/tests/{dppl => }/test_prange.py | 0 numba_dppy/tests/{dppl => }/test_print.py | 0 numba_dppy/tests/{dppl => }/test_sum_reduction.py | 0 numba_dppy/tests/{dppl => }/test_vectorize.py | 0 numba_dppy/tests/{dppl => }/test_with_context.py | 0 25 files changed, 1 insertion(+), 8 deletions(-) delete mode 100644 numba_dppy/tests/dppl/__init__.py rename numba_dppy/tests/{dppl => }/test_arg_accessor.py (100%) rename numba_dppy/tests/{dppl => }/test_arg_types.py (100%) rename numba_dppy/tests/{dppl => }/test_atomic_op.py (100%) rename numba_dppy/tests/{dppl => }/test_barrier.py (100%) rename numba_dppy/tests/{dppl => }/test_black_scholes.py (100%) rename numba_dppy/tests/{dppl => }/test_caching.py (100%) rename numba_dppy/tests/{dppl => }/test_device_array_args.py (100%) rename numba_dppy/tests/{dppl => }/test_dpctl_api.py (100%) rename 
numba_dppy/tests/{dppl => }/test_dpnp_functions.py (100%) rename numba_dppy/tests/{dppl => }/test_dppl_fallback.py (100%) rename numba_dppy/tests/{dppl => }/test_dppl_func.py (100%) rename numba_dppy/tests/{dppl => }/test_math_functions.py (100%) rename numba_dppy/tests/{dppl => }/test_numpy_bit_twiddling_functions.py (100%) rename numba_dppy/tests/{dppl => }/test_numpy_comparison_functions.py (100%) rename numba_dppy/tests/{dppl => }/test_numpy_floating_functions.py (100%) rename numba_dppy/tests/{dppl => }/test_numpy_math_functions.py (100%) rename numba_dppy/tests/{dppl => }/test_numpy_trigonomteric_functions.py (100%) rename numba_dppy/tests/{dppl => }/test_parfor_lower_message.py (100%) rename numba_dppy/tests/{dppl => }/test_prange.py (100%) rename numba_dppy/tests/{dppl => }/test_print.py (100%) rename numba_dppy/tests/{dppl => }/test_sum_reduction.py (100%) rename numba_dppy/tests/{dppl => }/test_vectorize.py (100%) rename numba_dppy/tests/{dppl => }/test_with_context.py (100%) diff --git a/numba_dppy/tests/__init__.py b/numba_dppy/tests/__init__.py index d29208fb91..5a2199f149 100644 --- a/numba_dppy/tests/__init__.py +++ b/numba_dppy/tests/__init__.py @@ -8,10 +8,9 @@ def load_tests(loader, tests, pattern): suite = SerialSuite() - this_dir = dirname(__file__) if dppy_config.dppy_present: - suite.addTests(load_testsuite(loader, join(this_dir, 'dppl'))) + suite.addTests(load_testsuite(loader, dirname(__file__))) else: print("skipped DPPL tests") diff --git a/numba_dppy/tests/dppl/__init__.py b/numba_dppy/tests/dppl/__init__.py deleted file mode 100644 index cff5a36cc2..0000000000 --- a/numba_dppy/tests/dppl/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from numba.testing import SerialSuite -from numba.testing import load_testsuite -import os - -def load_tests(loader, tests, pattern): - return SerialSuite(load_testsuite(loader, os.path.dirname(__file__))) diff --git a/numba_dppy/tests/dppl/test_arg_accessor.py b/numba_dppy/tests/test_arg_accessor.py similarity index 100% rename from numba_dppy/tests/dppl/test_arg_accessor.py rename to numba_dppy/tests/test_arg_accessor.py diff --git a/numba_dppy/tests/dppl/test_arg_types.py b/numba_dppy/tests/test_arg_types.py similarity index 100% rename from numba_dppy/tests/dppl/test_arg_types.py rename to numba_dppy/tests/test_arg_types.py diff --git a/numba_dppy/tests/dppl/test_atomic_op.py b/numba_dppy/tests/test_atomic_op.py similarity index 100% rename from numba_dppy/tests/dppl/test_atomic_op.py rename to numba_dppy/tests/test_atomic_op.py diff --git a/numba_dppy/tests/dppl/test_barrier.py b/numba_dppy/tests/test_barrier.py similarity index 100% rename from numba_dppy/tests/dppl/test_barrier.py rename to numba_dppy/tests/test_barrier.py diff --git a/numba_dppy/tests/dppl/test_black_scholes.py b/numba_dppy/tests/test_black_scholes.py similarity index 100% rename from numba_dppy/tests/dppl/test_black_scholes.py rename to numba_dppy/tests/test_black_scholes.py diff --git a/numba_dppy/tests/dppl/test_caching.py b/numba_dppy/tests/test_caching.py similarity index 100% rename from numba_dppy/tests/dppl/test_caching.py rename to numba_dppy/tests/test_caching.py diff --git a/numba_dppy/tests/dppl/test_device_array_args.py b/numba_dppy/tests/test_device_array_args.py similarity index 100% rename from numba_dppy/tests/dppl/test_device_array_args.py rename to numba_dppy/tests/test_device_array_args.py diff --git a/numba_dppy/tests/dppl/test_dpctl_api.py b/numba_dppy/tests/test_dpctl_api.py similarity index 100% rename from 
numba_dppy/tests/dppl/test_dpctl_api.py rename to numba_dppy/tests/test_dpctl_api.py diff --git a/numba_dppy/tests/dppl/test_dpnp_functions.py b/numba_dppy/tests/test_dpnp_functions.py similarity index 100% rename from numba_dppy/tests/dppl/test_dpnp_functions.py rename to numba_dppy/tests/test_dpnp_functions.py diff --git a/numba_dppy/tests/dppl/test_dppl_fallback.py b/numba_dppy/tests/test_dppl_fallback.py similarity index 100% rename from numba_dppy/tests/dppl/test_dppl_fallback.py rename to numba_dppy/tests/test_dppl_fallback.py diff --git a/numba_dppy/tests/dppl/test_dppl_func.py b/numba_dppy/tests/test_dppl_func.py similarity index 100% rename from numba_dppy/tests/dppl/test_dppl_func.py rename to numba_dppy/tests/test_dppl_func.py diff --git a/numba_dppy/tests/dppl/test_math_functions.py b/numba_dppy/tests/test_math_functions.py similarity index 100% rename from numba_dppy/tests/dppl/test_math_functions.py rename to numba_dppy/tests/test_math_functions.py diff --git a/numba_dppy/tests/dppl/test_numpy_bit_twiddling_functions.py b/numba_dppy/tests/test_numpy_bit_twiddling_functions.py similarity index 100% rename from numba_dppy/tests/dppl/test_numpy_bit_twiddling_functions.py rename to numba_dppy/tests/test_numpy_bit_twiddling_functions.py diff --git a/numba_dppy/tests/dppl/test_numpy_comparison_functions.py b/numba_dppy/tests/test_numpy_comparison_functions.py similarity index 100% rename from numba_dppy/tests/dppl/test_numpy_comparison_functions.py rename to numba_dppy/tests/test_numpy_comparison_functions.py diff --git a/numba_dppy/tests/dppl/test_numpy_floating_functions.py b/numba_dppy/tests/test_numpy_floating_functions.py similarity index 100% rename from numba_dppy/tests/dppl/test_numpy_floating_functions.py rename to numba_dppy/tests/test_numpy_floating_functions.py diff --git a/numba_dppy/tests/dppl/test_numpy_math_functions.py b/numba_dppy/tests/test_numpy_math_functions.py similarity index 100% rename from numba_dppy/tests/dppl/test_numpy_math_functions.py rename to numba_dppy/tests/test_numpy_math_functions.py diff --git a/numba_dppy/tests/dppl/test_numpy_trigonomteric_functions.py b/numba_dppy/tests/test_numpy_trigonomteric_functions.py similarity index 100% rename from numba_dppy/tests/dppl/test_numpy_trigonomteric_functions.py rename to numba_dppy/tests/test_numpy_trigonomteric_functions.py diff --git a/numba_dppy/tests/dppl/test_parfor_lower_message.py b/numba_dppy/tests/test_parfor_lower_message.py similarity index 100% rename from numba_dppy/tests/dppl/test_parfor_lower_message.py rename to numba_dppy/tests/test_parfor_lower_message.py diff --git a/numba_dppy/tests/dppl/test_prange.py b/numba_dppy/tests/test_prange.py similarity index 100% rename from numba_dppy/tests/dppl/test_prange.py rename to numba_dppy/tests/test_prange.py diff --git a/numba_dppy/tests/dppl/test_print.py b/numba_dppy/tests/test_print.py similarity index 100% rename from numba_dppy/tests/dppl/test_print.py rename to numba_dppy/tests/test_print.py diff --git a/numba_dppy/tests/dppl/test_sum_reduction.py b/numba_dppy/tests/test_sum_reduction.py similarity index 100% rename from numba_dppy/tests/dppl/test_sum_reduction.py rename to numba_dppy/tests/test_sum_reduction.py diff --git a/numba_dppy/tests/dppl/test_vectorize.py b/numba_dppy/tests/test_vectorize.py similarity index 100% rename from numba_dppy/tests/dppl/test_vectorize.py rename to numba_dppy/tests/test_vectorize.py diff --git a/numba_dppy/tests/dppl/test_with_context.py b/numba_dppy/tests/test_with_context.py similarity index 100% 
rename from numba_dppy/tests/dppl/test_with_context.py rename to numba_dppy/tests/test_with_context.py From 16fb9b8967a5814aeba2e5fb9459465abd668bf4 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Fri, 4 Dec 2020 09:04:32 -0600 Subject: [PATCH 06/40] Revert "numba-dppy requires cffi" This reverts commit 776bf2228e2aef77ea9767ce2ae90ff204482230. --- conda-recipe/meta.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index 4967295c05..d8f6c1ecbb 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -23,7 +23,6 @@ requirements: run: - python - numba >=0.51 - - cffi - dpctl - spirv-tools - llvm-spirv From 56e969a0904c112b4a55cd1a75f76bb18a58f261 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Fri, 4 Dec 2020 09:06:12 -0600 Subject: [PATCH 07/40] Remove use of cffi --- numba_dppy/dppl_lowerer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/numba_dppy/dppl_lowerer.py b/numba_dppy/dppl_lowerer.py index 51fb072551..a317c990a6 100644 --- a/numba_dppy/dppl_lowerer.py +++ b/numba_dppy/dppl_lowerer.py @@ -979,14 +979,13 @@ def relatively_deep_copy(obj, memo): from numba.core.compiler import CompileResult from numba.np.ufunc.dufunc import DUFunc from ctypes import _CFuncPtr - from cffi.api import FFI from types import ModuleType from numba.core.types.abstract import Type # objects which shouldn't or can't be copied and it's ok not to copy it. if isinstance(obj, (FunctionIdentity, _DispatcherBase, Function, Type, Dispatcher, ModuleType, Signature, DPPLFunctionTemplate, CompileResult, - DUFunc, _CFuncPtr, FFI, + DUFunc, _CFuncPtr, type, str, bool, type(None))): return obj From 0a56e08cd9d01c33be5f2b90de6bcece106d480d Mon Sep 17 00:00:00 2001 From: Elena Totmenina Date: Mon, 7 Dec 2020 22:44:44 +0300 Subject: [PATCH 08/40] Rename dppl to dppy (#42) Co-authored-by: etotmeni Co-authored-by: Diptorup Deb --- .gitignore | 27 +++ HowTo.rst | 6 +- numba_dppy/CHANGE_LOG | 4 +- numba_dppy/__init__.py | 52 +++--- numba_dppy/compiler.py | 103 +++++------ numba_dppy/decorators.py | 14 +- numba_dppy/descriptor.py | 20 +-- numba_dppy/dispatcher.py | 22 +-- ...n_call_gen.py => dppy_host_fn_call_gen.py} | 14 +- .../{dppl_lowerer.py => dppy_lowerer.py} | 52 +++--- ...spatcher.py => dppy_offload_dispatcher.py} | 12 +- ...ppl_passbuilder.py => dppy_passbuilder.py} | 34 ++-- numba_dppy/{dppl_passes.py => dppy_passes.py} | 30 ++-- .../examples/{dppl_func.py => dppy_func.py} | 10 +- ...l_with_context.py => dppy_with_context.py} | 2 +- numba_dppy/examples/matmul.py | 12 +- numba_dppy/examples/pairwise_distance.py | 6 +- numba_dppy/examples/sum-hybrid.py | 10 +- numba_dppy/examples/sum.py | 8 +- numba_dppy/examples/sum2D.py | 10 +- numba_dppy/examples/sum_ndarray.py | 6 +- numba_dppy/examples/sum_reduction.py | 8 +- numba_dppy/examples/sum_reduction_ocl.py | 16 +- .../examples/sum_reduction_recursive_ocl.py | 16 +- .../experimental_numpy_lowering_overload.py | 12 +- numba_dppy/initialize.py | 8 +- numba_dppy/ocl/atomics/atomic_ops.cl | 56 +++--- numba_dppy/ocl/ocldecl.py | 44 ++--- numba_dppy/ocl/oclimpl.py | 22 +-- numba_dppy/ocl/stubs.py | 6 +- numba_dppy/printimpl.py | 4 +- numba_dppy/target.py | 26 +-- numba_dppy/target_dispatcher.py | 12 +- numba_dppy/testing.py | 12 +- numba_dppy/tests/__init__.py | 6 +- numba_dppy/tests/test_arg_accessor.py | 18 +- numba_dppy/tests/test_arg_types.py | 26 +-- numba_dppy/tests/test_atomic_op.py | 168 +++++++++--------- numba_dppy/tests/test_barrier.py | 34 ++-- 
numba_dppy/tests/test_black_scholes.py | 14 +- numba_dppy/tests/test_caching.py | 14 +- numba_dppy/tests/test_device_array_args.py | 16 +- numba_dppy/tests/test_dpctl_api.py | 4 +- numba_dppy/tests/test_dpnp_functions.py | 6 +- numba_dppy/tests/test_dppl_fallback.py | 26 +-- numba_dppy/tests/test_dppl_func.py | 32 ++-- numba_dppy/tests/test_math_functions.py | 76 ++++---- .../test_numpy_bit_twiddling_functions.py | 6 +- .../tests/test_numpy_comparison_functions.py | 6 +- .../tests/test_numpy_floating_functions.py | 6 +- numba_dppy/tests/test_numpy_math_functions.py | 6 +- .../test_numpy_trigonomteric_functions.py | 6 +- numba_dppy/tests/test_parfor_lower_message.py | 8 +- numba_dppy/tests/test_prange.py | 14 +- numba_dppy/tests/test_print.py | 16 +- numba_dppy/tests/test_sum_reduction.py | 12 +- numba_dppy/tests/test_vectorize.py | 6 +- numba_dppy/tests/test_with_context.py | 16 +- 58 files changed, 634 insertions(+), 604 deletions(-) create mode 100644 .gitignore rename numba_dppy/{dppl_host_fn_call_gen.py => dppy_host_fn_call_gen.py} (98%) rename numba_dppy/{dppl_lowerer.py => dppy_lowerer.py} (97%) rename numba_dppy/{dppl_offload_dispatcher.py => dppy_offload_dispatcher.py} (73%) rename numba_dppy/{dppl_passbuilder.py => dppy_passbuilder.py} (82%) rename numba_dppy/{dppl_passes.py => dppy_passes.py} (95%) rename numba_dppy/examples/{dppl_func.py => dppy_func.py} (81%) rename numba_dppy/examples/{dppl_with_context.py => dppy_with_context.py} (94%) diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000..340ae2678b --- /dev/null +++ b/.gitignore @@ -0,0 +1,27 @@ +*.pyc +*.o +*.so +*.dylib +*.pyd +*.pdb +*.egg-info +*.sw[po] +*.out +*.ll +.coverage +.nfs* +tags +MANIFEST + +build/ +docs/_build/ +docs/gh-pages/ +dist/ +htmlcov/ +.idea/ +.vscode/ +.mypy_cache/ +.ipynb_checkpoints/ +__pycache__/ + +docs/source/developer/autogen* diff --git a/HowTo.rst b/HowTo.rst index 03927c0ea7..7689bc52bf 100644 --- a/HowTo.rst +++ b/HowTo.rst @@ -7,7 +7,7 @@ are listed below with the help of sample code snippets. In this release we have the implementation of the OAK approach described in MS138 in section 4.3.2. The new decorator is described below. -To access the features driver module have to be imported from numba_dppy.dppl_driver +To access the features driver module have to be imported from numba_dppy.dppy_driver New Decorator ============= @@ -61,7 +61,7 @@ Primitive types are passed by value to the kernel, currently supported are int, Math Kernels ============ -This release has support for math kernels. See numba_dppy/tests/dppl/test_math_functions.py +This release has support for math kernels. See numba_dppy/tests/dppy/test_math_functions.py for more details. @@ -170,6 +170,6 @@ Testing All examples can be found in numba_dppy/examples/ -All tests can be found in numba_dppy/tests/dppl and can be triggered by the following command: +All tests can be found in numba_dppy/tests/dppy and can be triggered by the following command: ``python -m numba.runtests numba_dppy.tests`` diff --git a/numba_dppy/CHANGE_LOG b/numba_dppy/CHANGE_LOG index e3cb06522c..2a1fcdee40 100644 --- a/numba_dppy/CHANGE_LOG +++ b/numba_dppy/CHANGE_LOG @@ -1,7 +1,7 @@ -NUMBA Version 0.48.0 + DPPL Version 0.3.0 (June 29, 2020) +NUMBA Version 0.48.0 + DPPY Version 0.3.0 (June 29, 2020) -------------------------------------------------------- This release includes: -* Caching of dppl.kernels which will improve performance. +* Caching of dppy.kernels which will improve performance. 
* Addition of support for Intel Advisor which will help in profiling applications. diff --git a/numba_dppy/__init__.py b/numba_dppy/__init__.py index 6eff949d16..ac4e898889 100644 --- a/numba_dppy/__init__.py +++ b/numba_dppy/__init__.py @@ -4,9 +4,9 @@ Extensions to Numba for Intel GPUs introduce two new features into Numba: - a. A new backend that has a new decorator called @dppl.kernel that + a. A new backend that has a new decorator called @dppy.kernel that exposes an explicit kernel programming interface similar to the - existing Numba GPU code-generation backends. The @dppl.kernel + existing Numba GPU code-generation backends. The @dppy.kernel decorator currently implements a subset of OpenCL’s API through Numba’s intrinsic functions. @@ -20,48 +20,48 @@ Explicit Kernel Prgoramming with new Docorators: -@dppl.kernel +@dppy.kernel - The @dppl.kernel decorator can be used with or without extra arguments. + The @dppy.kernel decorator can be used with or without extra arguments. Optionally, users can pass the signature of the arguments to the decorator. When a signature is provided to the DK decorator the version of the OpenCL kernel generated gets specialized for that type signature. --------------------------------------------------------------------------- - @dppl.kernel + @dppy.kernel def data_parallel_sum(a, b, c): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) c[i] = a[i] + b[i] --------------------------------------------------------------------------- To invoke the above function users will need to provide a global size (OpenCL) which is the size of a (same as b and c) and a - local size (dppl.DEFAULT_LOCAL_SIZE if user don't want to specify). + local size (dppy.DEFAULT_LOCAL_SIZE if user don't want to specify). Example shown below: --------------------------------------------------------------------------- - data_parallel_sum[len(a), dppl.DEFAULT_LOCAL_SIZE](dA, dB, dC) + data_parallel_sum[len(a), dppy.DEFAULT_LOCAL_SIZE](dA, dB, dC) --------------------------------------------------------------------------- -@dppl.func +@dppy.func - The @dppl.func decorator is the other decorator provided in the explicit + The @dppy.func decorator is the other decorator provided in the explicit kernel programming model. This decorator allows users to write “device” functions that can be invoked from inside DK functions but cannot be invoked from the host. The decorator also supports type specialization as with the - DK decorator. Functions decorated with @dppl.func will also be JIT compiled - and inlined into the OpenCL Program containing the @dppl.kernel function - calling it. A @dppl.func will not be launched as an OpenCL kernel. + DK decorator. Functions decorated with @dppy.func will also be JIT compiled + and inlined into the OpenCL Program containing the @dppy.kernel function + calling it. A @dppy.func will not be launched as an OpenCL kernel. --------------------------------------------------------------------------- - @dppl.func + @dppy.func def bar(a): return a*a - @dppl.kernel + @dppy.kernel def foo(in, out): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) out[i] = bar(in[i]) --------------------------------------------------------------------------- @@ -71,13 +71,13 @@ def foo(in, out): The following table has the list of intrinsic functions that can be directly used inside a DK function. All the functions are equivalent to the similarly named OpenCL function. 
Wherever there is an implementation difference - between the Numba-PyDPPL version and the OpenCL version, the difference is + between the Numba-DPPY version and the OpenCL version, the difference is explained in table. Note that these functions cannot be used anywhere else outside of a DK function in a Numba application. Readers are referred to the OpenCL API specs to review the functionality of each function. +----------------------+----------------------------+----------------------+ - | Numba-DPPL intrinsic | Equivalent OpenCL function | Notes | + | Numba-DPPY intrinsic | Equivalent OpenCL function | Notes | +----------------------+----------------------------+----------------------+ | get_global_id | get_global_id | | +----------------------+----------------------------+----------------------+ @@ -121,7 +121,7 @@ def foo(in, out): |print |print(varargs) |The print function is a | | | |subset of the OpenCL | | | |printf function. The | - | | |Numba-DPPL version of | + | | |Numba-DPPY version of | | | |print supports only int, | | | |string, and float | | | |arguments. | @@ -160,16 +160,16 @@ def foo(in, out): -Complete Example using @dppl.kernel: +Complete Example using @dppy.kernel: --------------------------------------------------------------------------- import numpy as np - import numba_dppy, numba_dppy as dppl + import numba_dppy, numba_dppy as dppy import dpctl - @dppl.kernel + @dppy.kernel def data_parallel_sum(a, b, c): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) c[i] = a[i] + b[i] def driver(device_env, a, b, c, global_size): @@ -181,7 +181,7 @@ def driver(device_env, a, b, c, global_size): print("before : ", dA._ndarray) print("before : ", dB._ndarray) print("before : ", dC._ndarray) - data_parallel_sum[global_size, dppl.DEFAULT_LOCAL_SIZE](dA, dB, dC) + data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](dA, dB, dC) device_env.copy_array_from_device(dC) print("after : ", dC._ndarray) @@ -509,11 +509,11 @@ def main(): if dppy_present: from .device_init import * else: - raise ImportError("Importing dppl failed") + raise ImportError("Importing numba-dppy failed") def test(*args, **kwargs): if not dppy_present and not is_available(): - dppl_error() + dppy_error() return numba.testing.test("numba_dppy.tests", *args, **kwargs) diff --git a/numba_dppy/compiler.py b/numba_dppy/compiler.py index 736cd96a26..c8a329738a 100644 --- a/numba_dppy/compiler.py +++ b/numba_dppy/compiler.py @@ -2,7 +2,7 @@ import copy from collections import namedtuple -from .dppl_passbuilder import DPPLPassBuilder +from .dppy_passbuilder import DPPYPassBuilder from numba.core.typing.templates import ConcreteTemplate from numba.core import types, compiler, ir from numba.core.typing.templates import AbstractTemplate @@ -12,6 +12,7 @@ import dpctl import dpctl.memory as dpctl_mem +import dpctl.program as dpctl_prog import numpy as np from . import spirv_generator @@ -19,10 +20,10 @@ import os from numba.core.compiler import DefaultPassBuilder, CompilerBase -DEBUG=os.environ.get('NUMBA_DPPL_DEBUG', None) -_NUMBA_DPPL_READ_ONLY = "read_only" -_NUMBA_DPPL_WRITE_ONLY = "write_only" -_NUMBA_DPPL_READ_WRITE = "read_write" +DEBUG=os.environ.get('NUMBA_DPPY_DEBUG', None) +_NUMBA_DPPY_READ_ONLY = "read_only" +_NUMBA_DPPY_WRITE_ONLY = "write_only" +_NUMBA_DPPY_READ_WRITE = "read_write" def _raise_no_device_found_error(): error_message = ("No OpenCL device specified. 
" @@ -30,7 +31,7 @@ def _raise_no_device_found_error(): raise ValueError(error_message) def _raise_invalid_kernel_enqueue_args(): - error_message = ("Incorrect number of arguments for enquing dppl.kernel. " + error_message = ("Incorrect number of arguments for enquing dppy.kernel. " "Usage: device_env, global size, local size. " "The local size argument is optional.") raise ValueError(error_message) @@ -51,15 +52,15 @@ def get_ordered_arg_access_types(pyfunc, access_types): return ordered_arg_access_types -class DPPLCompiler(CompilerBase): - """ DPPL Compiler """ +class DPPYCompiler(CompilerBase): + """ DPPY Compiler """ def define_pipelines(self): # this maintains the objmode fallback behaviour pms = [] if not self.state.flags.force_pyobject: - #print("Numba-DPPL [INFO]: Using Numba-DPPL pipeline") - pms.append(DPPLPassBuilder.define_nopython_pipeline(self.state)) + #print("Numba-DPPY [INFO]: Using Numba-DPPY pipeline") + pms.append(DPPYPassBuilder.define_nopython_pipeline(self.state)) if self.state.status.can_fallback or self.state.flags.force_pyobject: pms.append( DefaultPassBuilder.define_objectmode_pipeline(self.state) @@ -71,12 +72,12 @@ def define_pipelines(self): return pms -def compile_with_dppl(pyfunc, return_type, args, debug): +def compile_with_dppy(pyfunc, return_type, args, debug): # First compilation will trigger the initialization of the OpenCL backend. - from .descriptor import dppl_target + from .descriptor import dppy_target - typingctx = dppl_target.typing_context - targetctx = dppl_target.target_context + typingctx = dppy_target.typing_context + targetctx = dppy_target.target_context # TODO handle debug flag flags = compiler.Flags() # Do not compile (generate native code), just lower (to LLVM) @@ -93,7 +94,7 @@ def compile_with_dppl(pyfunc, return_type, args, debug): return_type=return_type, flags=flags, locals={}, - pipeline_class=DPPLCompiler) + pipeline_class=DPPYCompiler) elif isinstance(pyfunc, ir.FunctionIR): cres = compiler.compile_ir(typingctx=typingctx, targetctx=targetctx, @@ -102,7 +103,7 @@ def compile_with_dppl(pyfunc, return_type, args, debug): return_type=return_type, flags=flags, locals={}, - pipeline_class=DPPLCompiler) + pipeline_class=DPPYCompiler) else: assert(0) # Linking depending libraries @@ -120,7 +121,7 @@ def compile_kernel(sycl_queue, pyfunc, args, access_types, debug=False): # This will be get_current_queue sycl_queue = dpctl.get_current_queue() - cres = compile_with_dppl(pyfunc, None, args, debug=debug) + cres = compile_with_dppy(pyfunc, None, args, debug=debug) func = cres.library.get_function(cres.fndesc.llvm_func_name) kernel = cres.target_context.prepare_ocl_kernel(func, cres.signature.args) # The kernel objet should have a reference to the target context it is compiled for. @@ -128,7 +129,7 @@ def compile_kernel(sycl_queue, pyfunc, args, access_types, debug=False): # depending on the target context. For example, we want to link our kernel object # with implementation containing atomic operations only when atomic operations # are being used in the kernel. 
- oclkern = DPPLKernel(context=cres.target_context, + oclkern = DPPYKernel(context=cres.target_context, sycl_queue=sycl_queue, llvm_module=kernel.module, name=kernel.name, @@ -146,7 +147,7 @@ def compile_kernel_parfor(sycl_queue, func_ir, args, args_with_addrspaces, if isinstance(a, types.npytypes.Array): print("addrspace:", a.addrspace) - cres = compile_with_dppl(func_ir, None, args_with_addrspaces, + cres = compile_with_dppy(func_ir, None, args_with_addrspaces, debug=debug) func = cres.library.get_function(cres.fndesc.llvm_func_name) @@ -159,7 +160,7 @@ def compile_kernel_parfor(sycl_queue, func_ir, args, args_with_addrspaces, kernel = cres.target_context.prepare_ocl_kernel(func, cres.signature.args) #kernel = cres.target_context.prepare_ocl_kernel(func, args_with_addrspaces) - oclkern = DPPLKernel(context=cres.target_context, + oclkern = DPPYKernel(context=cres.target_context, sycl_queue=sycl_queue, llvm_module=kernel.module, name=kernel.name, @@ -168,44 +169,44 @@ def compile_kernel_parfor(sycl_queue, func_ir, args, args_with_addrspaces, return oclkern -def compile_dppl_func(pyfunc, return_type, args, debug=False): - cres = compile_with_dppl(pyfunc, return_type, args, debug=debug) +def compile_dppy_func(pyfunc, return_type, args, debug=False): + cres = compile_with_dppy(pyfunc, return_type, args, debug=debug) func = cres.library.get_function(cres.fndesc.llvm_func_name) cres.target_context.mark_ocl_device(func) - devfn = DPPLFunction(cres) + devfn = DPPYFunction(cres) - class dppl_function_template(ConcreteTemplate): + class dppy_function_template(ConcreteTemplate): key = devfn cases = [cres.signature] - cres.typing_context.insert_user_function(devfn, dppl_function_template) + cres.typing_context.insert_user_function(devfn, dppy_function_template) libs = [cres.library] cres.target_context.insert_user_function(devfn, cres.fndesc, libs) return devfn -# Compile dppl function template -def compile_dppl_func_template(pyfunc): - """Compile a DPPLFunctionTemplate +# Compile dppy function template +def compile_dppy_func_template(pyfunc): + """Compile a DPPYFunctionTemplate """ - from .descriptor import dppl_target + from .descriptor import dppy_target - dft = DPPLFunctionTemplate(pyfunc) + dft = DPPYFunctionTemplate(pyfunc) - class dppl_function_template(AbstractTemplate): + class dppy_function_template(AbstractTemplate): key = dft def generic(self, args, kws): assert not kws return dft.compile(args) - typingctx = dppl_target.typing_context - typingctx.insert_user_function(dft, dppl_function_template) + typingctx = dppy_target.typing_context + typingctx.insert_user_function(dft, dppy_function_template) return dft -class DPPLFunctionTemplate(object): - """Unmaterialized dppl function +class DPPYFunctionTemplate(object): + """Unmaterialized dppy function """ def __init__(self, pyfunc, debug=False): self.py_func = pyfunc @@ -220,7 +221,7 @@ def compile(self, args): this object. 
""" if args not in self._compileinfos: - cres = compile_with_dppl(self.py_func, None, args, debug=self.debug) + cres = compile_with_dppy(self.py_func, None, args, debug=self.debug) func = cres.library.get_function(cres.fndesc.llvm_func_name) cres.target_context.mark_ocl_device(func) first_definition = not self._compileinfos @@ -240,7 +241,7 @@ def compile(self, args): return cres.signature -class DPPLFunction(object): +class DPPYFunction(object): def __init__(self, cres): self.cres = cres @@ -282,7 +283,7 @@ def _ensure_valid_work_group_size(val, work_item_grid): return list(val[::-1]) # reversing due to sycl and opencl interop kernel range mismatch semantic -class DPPLKernelBase(object): +class DPPYKernelBase(object): """Define interface for configurable kernels """ @@ -293,9 +294,9 @@ def __init__(self): # list of supported access types, stored in dict for fast lookup self.valid_access_types = { - _NUMBA_DPPL_READ_ONLY: _NUMBA_DPPL_READ_ONLY, - _NUMBA_DPPL_WRITE_ONLY: _NUMBA_DPPL_WRITE_ONLY, - _NUMBA_DPPL_READ_WRITE: _NUMBA_DPPL_READ_WRITE} + _NUMBA_DPPY_READ_ONLY: _NUMBA_DPPY_READ_ONLY, + _NUMBA_DPPY_WRITE_ONLY: _NUMBA_DPPY_WRITE_ONLY, + _NUMBA_DPPY_READ_WRITE: _NUMBA_DPPY_READ_WRITE} def copy(self): return copy.copy(self) @@ -331,14 +332,14 @@ def __getitem__(self, args): return self.configure(sycl_queue, gs, ls) -class DPPLKernel(DPPLKernelBase): +class DPPYKernel(DPPYKernelBase): """ A OCL kernel object """ def __init__(self, context, sycl_queue, llvm_module, name, argtypes, ordered_arg_access_types=None): - super(DPPLKernel, self).__init__() + super(DPPYKernel, self).__init__() self._llvm_module = llvm_module self.assembly = self.binary = llvm_module.__str__() self.entry_name = name @@ -355,7 +356,7 @@ def __init__(self, context, sycl_queue, llvm_module, name, argtypes, self.spirv_bc = spirv_generator.llvm_to_spirv(self.context, self.binary) # create a program - self.program = dpctl.create_program_from_spirv(self.sycl_queue, self.spirv_bc) + self.program = dpctl_prog.create_program_from_spirv(self.sycl_queue, self.spirv_bc) # create a kernel self.kernel = self.program.get_sycl_kernel(self.entry_name) @@ -385,7 +386,7 @@ def _pack_argument(self, ty, val, sycl_queue, device_arr, access_type): """ if (device_arr and (access_type not in self.valid_access_types or access_type in self.valid_access_types and - self.valid_access_types[access_type] != _NUMBA_DPPL_READ_ONLY)): + self.valid_access_types[access_type] != _NUMBA_DPPY_READ_ONLY)): # we get the date back to host if have created a # device_array or if access_type of this device_array # is not of type read_only and read_write @@ -431,8 +432,8 @@ def _unpack_argument(self, ty, val, sycl_queue, retr, kernelargs, usm_ndarr = np.ndarray(val.shape, buffer=usm_buf, dtype=val.dtype) if (default_behavior or - self.valid_access_types[access_type] == _NUMBA_DPPL_READ_ONLY or - self.valid_access_types[access_type] == _NUMBA_DPPL_READ_WRITE): + self.valid_access_types[access_type] == _NUMBA_DPPY_READ_ONLY or + self.valid_access_types[access_type] == _NUMBA_DPPY_READ_WRITE): np.copyto(usm_ndarr, val) device_arrs[-1] = (usm_buf, usm_ndarr, val) @@ -486,18 +487,18 @@ def check_for_invalid_access_type(self, access_type): return False -class JitDPPLKernel(DPPLKernelBase): +class JitDPPYKernel(DPPYKernelBase): def __init__(self, func, access_types): - super(JitDPPLKernel, self).__init__() + super(JitDPPYKernel, self).__init__() self.py_func = func self.definitions = {} self.access_types = access_types - from .descriptor import dppl_target + from 
.descriptor import dppy_target - self.typingctx = dppl_target.typing_context + self.typingctx = dppy_target.typing_context def __call__(self, *args, **kwargs): assert not kwargs, "Keyword Arguments are not supported" diff --git a/numba_dppy/decorators.py b/numba_dppy/decorators.py index a8b6bbba36..641d924134 100644 --- a/numba_dppy/decorators.py +++ b/numba_dppy/decorators.py @@ -1,11 +1,11 @@ from __future__ import print_function, absolute_import, division from numba.core import sigutils, types -from .compiler import (compile_kernel, JitDPPLKernel, compile_dppl_func_template, - compile_dppl_func, get_ordered_arg_access_types) +from .compiler import (compile_kernel, JitDPPYKernel, compile_dppy_func_template, + compile_dppy_func, get_ordered_arg_access_types) def kernel(signature=None, access_types=None, debug=False): - """JIT compile a python function conforming using the DPPL backend. + """JIT compile a python function conforming using the DPPY backend. A kernel is equvalent to an OpenCL kernel function, and has the same restrictions as definined by SPIR_KERNEL calling convention. @@ -22,14 +22,14 @@ def kernel(signature=None, access_types=None, debug=False): def autojit(debug=False, access_types=None): def _kernel_autojit(pyfunc): ordered_arg_access_types = get_ordered_arg_access_types(pyfunc, access_types) - return JitDPPLKernel(pyfunc, ordered_arg_access_types) + return JitDPPYKernel(pyfunc, ordered_arg_access_types) return _kernel_autojit def _kernel_jit(signature, debug, access_types): argtypes, restype = sigutils.normalize_signature(signature) if restype is not None and restype != types.void: - msg = ("DPPL kernel must have void return type but got {restype}") + msg = ("DPPY kernel must have void return type but got {restype}") raise TypeError(msg.format(restype=restype)) def _wrapped(pyfunc): @@ -54,9 +54,9 @@ def _func_jit(signature): argtypes, restype = sigutils.normalize_signature(signature) def _wrapped(pyfunc): - return compile_dppl_func(pyfunc, restype, argtypes) + return compile_dppy_func(pyfunc, restype, argtypes) return _wrapped def _func_autojit(pyfunc): - return compile_dppl_func_template(pyfunc) + return compile_dppy_func_template(pyfunc) diff --git a/numba_dppy/descriptor.py b/numba_dppy/descriptor.py index c0a24868c2..c8e6a58ec7 100644 --- a/numba_dppy/descriptor.py +++ b/numba_dppy/descriptor.py @@ -3,41 +3,41 @@ from numba.core.options import TargetOptions from numba.core import dispatcher, utils, typing -from .target import DPPLTargetContext, DPPLTypingContext +from .target import DPPYTargetContext, DPPYTypingContext from numba.core.cpu import CPUTargetOptions -class DPPLTarget(TargetDescriptor): +class DPPYTarget(TargetDescriptor): options = CPUTargetOptions - #typingctx = DPPLTypingContext() - #targetctx = DPPLTargetContext(typingctx) + #typingctx = DPPYTypingContext() + #targetctx = DPPYTargetContext(typingctx) @utils.cached_property def _toplevel_target_context(self): # Lazily-initialized top-level target context, for all threads - return DPPLTargetContext(self.typing_context) + return DPPYTargetContext(self.typing_context) @utils.cached_property def _toplevel_typing_context(self): # Lazily-initialized top-level typing context, for all threads - return DPPLTypingContext() + return DPPYTypingContext() @property def target_context(self): """ - The target context for DPPL targets. + The target context for DPPY targets. """ return self._toplevel_target_context @property def typing_context(self): """ - The typing context for DPPL targets. 
+ The typing context for DPPY targets. """ return self._toplevel_typing_context -# The global DPPL target -dppl_target = DPPLTarget() +# The global DPPY target +dppy_target = DPPYTarget() diff --git a/numba_dppy/dispatcher.py b/numba_dppy/dispatcher.py index a4c32ec7ec..d00a597875 100644 --- a/numba_dppy/dispatcher.py +++ b/numba_dppy/dispatcher.py @@ -4,17 +4,17 @@ #from numba.targets.descriptors import TargetDescriptor #from numba.targets.options import TargetOptions -#import numba_dppy, numba_dppy as dppl +#import numba_dppy, numba_dppy as dppy from numba_dppy import kernel, autojit -from .descriptor import dppl_target +from .descriptor import dppy_target #from numba.npyufunc.deviceufunc import (UFuncMechanism, GenerializedUFunc, # GUFuncCallSteps) from .. import dispatcher, utils, typing -from .compiler import DPPLCompiler +from .compiler import DPPYCompiler -class DPPLDispatcher(dispatcher.Dispatcher): - targetdescr = dppl_target +class DPPYDispatcher(dispatcher.Dispatcher): + targetdescr = dppy_target def __init__(self, py_func, locals={}, targetoptions={}): @@ -58,7 +58,7 @@ def __getitem__(self, *args): def __getattr__(self, key): return getattr(self.compiled, key) -class DPPLUFuncDispatcher(object): +class DPPYUFuncDispatcher(object): """ Invoke the OpenCL ufunc specialization for the given inputs. """ @@ -86,7 +86,7 @@ def __call__(self, *args, **kws): depending on the input arguments. Type must match the input arguments. """ - return DPPLUFuncMechanism.call(self.functions, args, kws) + return DPPYUFuncMechanism.call(self.functions, args, kws) def reduce(self, arg, stream=0): assert len(list(self.functions.keys())[0]) == 2, "must be a binary " \ @@ -142,7 +142,7 @@ def __reduce(self, mem, gpu_mems, stream): return left -class _DPPLGUFuncCallSteps(GUFuncCallSteps): +class _DPPYGUFuncCallSteps(GUFuncCallSteps): __slots__ = [ '_stream', ] @@ -167,10 +167,10 @@ def launch_kernel(self, kernel, nelem, args): kernel.forall(nelem, queue=self._stream)(*args) -class DPPLGenerializedUFunc(GenerializedUFunc): +class DPPYGenerializedUFunc(GenerializedUFunc): @property def _call_steps(self): - return _DPPLGUFuncCallSteps + return _DPPYGUFuncCallSteps def _broadcast_scalar_input(self, ary, shape): return devicearray.DeviceNDArray(shape=shape, @@ -188,7 +188,7 @@ def _broadcast_add_axis(self, ary, newshape): gpu_data=ary.gpu_data) -class DPPLUFuncMechanism(UFuncMechanism): +class DPPYUFuncMechanism(UFuncMechanism): """ Provide OpenCL specialization """ diff --git a/numba_dppy/dppl_host_fn_call_gen.py b/numba_dppy/dppy_host_fn_call_gen.py similarity index 98% rename from numba_dppy/dppl_host_fn_call_gen.py rename to numba_dppy/dppy_host_fn_call_gen.py index 10a4820906..7d1c9bcea4 100644 --- a/numba_dppy/dppl_host_fn_call_gen.py +++ b/numba_dppy/dppy_host_fn_call_gen.py @@ -9,7 +9,7 @@ from numba.core.ir_utils import legalize_names -class DPPLHostFunctionCallsGenerator(object): +class DPPYHostFunctionCallsGenerator(object): def __init__(self, lowerer, cres, num_inputs): self.lowerer = lowerer self.context = self.lowerer.context @@ -70,31 +70,31 @@ def _init_llvm_types_and_constants(self): def _declare_functions(self): get_queue_fnty = lc.Type.function(self.void_ptr_t, ()) self.get_queue = self.builder.module.get_or_insert_function(get_queue_fnty, - name="DPPLQueueMgr_GetCurrentQueue") + name="DPCTLQueueMgr_GetCurrentQueue") submit_range_fnty = lc.Type.function(self.void_ptr_t, [self.void_ptr_t, self.void_ptr_t, self.void_ptr_ptr_t, self.int32_ptr_t, self.intp_t, self.intp_ptr_t, self.intp_t, 
self.void_ptr_t, self.intp_t]) self.submit_range = self.builder.module.get_or_insert_function(submit_range_fnty, - name="DPPLQueue_SubmitRange") + name="DPCTLQueue_SubmitRange") queue_memcpy_fnty = lc.Type.function(lir.VoidType(), [self.void_ptr_t, self.void_ptr_t, self.void_ptr_t, self.intp_t]) self.queue_memcpy = self.builder.module.get_or_insert_function(queue_memcpy_fnty, - name="DPPLQueue_Memcpy") + name="DPCTLQueue_Memcpy") queue_wait_fnty = lc.Type.function(lir.VoidType(), [self.void_ptr_t]) self.queue_wait = self.builder.module.get_or_insert_function(queue_wait_fnty, - name="DPPLQueue_Wait") + name="DPCTLQueue_Wait") usm_shared_fnty = lc.Type.function(self.void_ptr_t, [self.intp_t, self.void_ptr_t]) self.usm_shared = self.builder.module.get_or_insert_function(usm_shared_fnty, - name="DPPLmalloc_shared") + name="DPCTLmalloc_shared") usm_free_fnty = lc.Type.function(lir.VoidType(), [self.void_ptr_t, self.void_ptr_t]) self.usm_free = self.builder.module.get_or_insert_function(usm_free_fnty, - name="DPPLfree_with_queue") + name="DPCTLfree_with_queue") def allocate_kenrel_arg_array(self, num_kernel_args): self.sycl_queue_val = cgutils.alloca_once(self.builder, self.void_ptr_t) diff --git a/numba_dppy/dppl_lowerer.py b/numba_dppy/dppy_lowerer.py similarity index 97% rename from numba_dppy/dppl_lowerer.py rename to numba_dppy/dppy_lowerer.py index a317c990a6..1561a6d85e 100644 --- a/numba_dppy/dppl_lowerer.py +++ b/numba_dppy/dppy_lowerer.py @@ -12,7 +12,7 @@ from numba.core import (compiler, ir, types, sigutils, lowering, funcdesc, config) from numba.parfors import parfor -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba.core.ir_utils import (add_offset_to_labels, replace_var_names, remove_dels, @@ -38,9 +38,9 @@ from numba.core.errors import NumbaParallelSafetyWarning, NumbaPerformanceWarning from .dufunc_inliner import dufunc_inliner -from . import dppl_host_fn_call_gen as dppl_call_gen +from . import dppy_host_fn_call_gen as dppy_call_gen import dpctl -from numba_dppy.target import DPPLTargetContext +from numba_dppy.target import DPPYTargetContext def _print_block(block): @@ -72,7 +72,7 @@ def _schedule_loop(parfor_dim, legal_loop_indices, loop_ranges, param_dict): for eachdim in range(global_id_dim): gufunc_txt += (" " + legal_loop_indices[eachdim] + " = " - + "dppl.get_global_id(" + str(eachdim) + ")\n") + + "dppy.get_global_id(" + str(eachdim) + ")\n") for eachdim in range(global_id_dim, for_loop_dim): @@ -444,7 +444,7 @@ def print_arg_with_addrspaces(args): print("gufunc_txt = ", type(gufunc_txt), "\n", gufunc_txt) sys.stdout.flush() # Force gufunc outline into existence. - globls = {"np": np, "numba": numba, "dppl": dppl} + globls = {"np": np, "numba": numba, "dppy": dppy} locls = {} exec(gufunc_txt, globls, locls) gufunc_func = locls[gufunc_name] @@ -740,7 +740,7 @@ def _lower_parfor_gufunc(lowerer, parfor): parfor.races, typemap) - generate_dppl_host_wrapper( + generate_dppy_host_wrapper( lowerer, func, gu_signature, @@ -828,10 +828,10 @@ def bump_alpha(c, class_map): return (gu_sin, gu_sout) -# Keep all the dppl kernels and programs created alive indefinitely. +# Keep all the dppy kernels and programs created alive indefinitely. 
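# _schedule_loop above builds the kernel source as a plain string: each parfor
# index that maps onto the global iteration space is bound to
# dppy.get_global_id(dim), and the assembled text is later exec'd with "dppy"
# in its globals.  Roughly the kind of text it produces for a 2-D element-wise
# parfor (an illustrative reconstruction, not the exact generated source or
# function name):
example_gufunc_txt = (
    "def __gufunc_kernel(a, b, c):\n"
    "    i = dppy.get_global_id(0)\n"
    "    j = dppy.get_global_id(1)\n"
    "    c[i, j] = a[i, j] + b[i, j]\n"
)
print(example_gufunc_txt)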
keep_alive_kernels = [] -def generate_dppl_host_wrapper(lowerer, +def generate_dppy_host_wrapper(lowerer, cres, gu_signature, outer_sig, @@ -852,7 +852,7 @@ def generate_dppl_host_wrapper(lowerer, num_dim = len(loop_ranges) if config.DEBUG_ARRAY_OPT: - print("generate_dppl_host_wrapper") + print("generate_dppy_host_wrapper") print("args = ", expr_args) print("outer_sig = ", outer_sig.args, outer_sig.return_type, outer_sig.recvr, outer_sig.pysig) @@ -868,8 +868,8 @@ def generate_dppl_host_wrapper(lowerer, # print("cres.fndesc", cres.fndesc, type(cres.fndesc)) - # get dppl_cpu_portion_lowerer object - dppl_cpu_lowerer = dppl_call_gen.DPPLHostFunctionCallsGenerator( + # get dppy_cpu_portion_lowerer object + dppy_cpu_lowerer = dppy_call_gen.DPPYHostFunctionCallsGenerator( lowerer, cres, num_inputs) # Compute number of args ------------------------------------------------ @@ -886,7 +886,7 @@ def generate_dppl_host_wrapper(lowerer, # now that we know the total number of kernel args, lets allocate # a kernel_arg array - dppl_cpu_lowerer.allocate_kenrel_arg_array(num_expanded_args) + dppy_cpu_lowerer.allocate_kenrel_arg_array(num_expanded_args) ninouts = len(expr_args) @@ -931,7 +931,7 @@ def val_type_or_none(context, lowerer, x): "\n\tval_type:", val_type, type(val_type), "\n\tindex:", index) - dppl_cpu_lowerer.process_kernel_arg(var, llvm_arg, arg_type, gu_sig, + dppy_cpu_lowerer.process_kernel_arg(var, llvm_arg, arg_type, gu_sig, val_type, index, modified_arrays) # ----------------------------------------------------------------------- @@ -951,7 +951,7 @@ def load_range(v): step = load_range(step) loop_ranges[i] = (start, stop, step) - dppl_cpu_lowerer.enqueue_kernel_and_read_back(loop_ranges) + dppy_cpu_lowerer.enqueue_kernel_and_read_back(loop_ranges) from numba.core.lowering import Lower @@ -975,7 +975,7 @@ def relatively_deep_copy(obj, memo): from numba.core.types.functions import Function, Dispatcher from numba.core.bytecode import FunctionIdentity from numba.core.typing.templates import Signature - from numba_dppy.compiler import DPPLFunctionTemplate + from numba_dppy.compiler import DPPYFunctionTemplate from numba.core.compiler import CompileResult from numba.np.ufunc.dufunc import DUFunc from ctypes import _CFuncPtr @@ -983,9 +983,9 @@ def relatively_deep_copy(obj, memo): from numba.core.types.abstract import Type # objects which shouldn't or can't be copied and it's ok not to copy it. 
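# relatively_deep_copy above returns objects like modules, dispatchers and
# Numba types unchanged (they are shared between the GPU and CPU lowering
# attempts) and deep-copies everything else.  A tiny standalone sketch of that
# share-versus-copy split; the helper name is illustrative and not part of
# this patch.
import copy
import types as pytypes

def copy_or_share(obj, memo):
    # Objects whose identity matters (modules, classes, immutables) are shared.
    if isinstance(obj, (pytypes.ModuleType, type, str, bool, type(None))):
        return obj
    # Everything else gets an independent copy so one lowering attempt cannot
    # mutate state needed by the other.
    return copy.deepcopy(obj, memo)

mod = pytypes.ModuleType("example_mod")
data = [1, 2, 3]
assert copy_or_share(mod, {}) is mod
assert copy_or_share(data, {}) is not data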
- if isinstance(obj, (FunctionIdentity, _DispatcherBase, Function, Type, Dispatcher, ModuleType, - Signature, DPPLFunctionTemplate, CompileResult, - DUFunc, _CFuncPtr, + if isinstance(obj, (FunctionIdentity, _DispatcherBase, Function, Type, + Dispatcher, ModuleType, Signature, + DPPYFunctionTemplate, CompileResult, DUFunc, _CFuncPtr, type, str, bool, type(None))): return obj @@ -1132,7 +1132,7 @@ def get_slots_members(obj): return cpy -class DPPLLower(Lower): +class DPPYLower(Lower): def __init__(self, context, library, fndesc, func_ir, metadata=None): Lower.__init__(self, context, library, fndesc, func_ir, metadata) memo = {} @@ -1141,7 +1141,7 @@ def __init__(self, context, library, fndesc, func_ir, metadata=None): func_ir_cpu = relatively_deep_copy(func_ir, memo) - cpu_context = context.cpu_context if isinstance(context, DPPLTargetContext) else context + cpu_context = context.cpu_context if isinstance(context, DPPYTargetContext) else context self.gpu_lower = Lower(context, library, fndesc, func_ir, metadata) self.cpu_lower = Lower(cpu_context, library, fndesc_cpu, func_ir_cpu, metadata) @@ -1151,11 +1151,11 @@ def lower(self): # 1. Start lowering of parent function # 2. Try to lower parfor on GPU # 2.a. enter lower_parfor_rollback and prepare function to lower on GPU - insert get_global_id. - # 2.a.a. starting lower parfor body - enter this point (DPPLLower.lower()) second time. + # 2.a.a. starting lower parfor body - enter this point (DPPYLower.lower()) second time. # 2.a.b. If lowering on GPU failed - try on CPU. # 2.a.d. Since get_global_id is NOT supported with CPU context - fail and throw exception # 2.b. in lower_parfor_rollback catch exception and restore parfor body and other to its initial state - # 2.c. in lower_parfor_rollback throw expeption to catch it here (DPPLLower.lower()) + # 2.c. in lower_parfor_rollback throw expeption to catch it here (DPPYLower.lower()) # 3. Catch exception and start parfor lowering with CPU context. # WARNING: this approach only works in case no device specific modifications were added to @@ -1169,7 +1169,7 @@ def lower(self): lowering.lower_extensions[parfor.Parfor].pop() except Exception as e: if numba_dppy.compiler.DEBUG: - print("Failed to lower parfor on DPPL-device. Due to:\n", e) + print("Failed to lower parfor on DPPY-device. 
Due to:\n", e) lowering.lower_extensions[parfor.Parfor].pop() if (lowering.lower_extensions[parfor.Parfor][-1] == numba.parfors.parfor_lowering._lower_parfor_parallel): self.cpu_lower.lower() @@ -1195,13 +1195,13 @@ def lower_parfor_rollback(lowerer, parfor): try: _lower_parfor_gufunc(lowerer, parfor) if numba_dppy.compiler.DEBUG: - msg = "Parfor lowered on DPPL-device" + msg = "Parfor lowered on DPPY-device" print(msg, parfor.loc) except Exception as e: - msg = "Failed to lower parfor on DPPL-device.\nTo see details set environment variable NUMBA_DPPL_DEBUG=1" + msg = "Failed to lower parfor on DPPY-device.\nTo see details set environment variable NUMBA_DPPY_DEBUG=1" warnings.warn(NumbaPerformanceWarning(msg, parfor.loc)) raise e -def dppl_lower_array_expr(lowerer, expr): +def dppy_lower_array_expr(lowerer, expr): raise NotImplementedError(expr) diff --git a/numba_dppy/dppl_offload_dispatcher.py b/numba_dppy/dppy_offload_dispatcher.py similarity index 73% rename from numba_dppy/dppl_offload_dispatcher.py rename to numba_dppy/dppy_offload_dispatcher.py index db841bef06..0c5fe10f5e 100644 --- a/numba_dppy/dppl_offload_dispatcher.py +++ b/numba_dppy/dppy_offload_dispatcher.py @@ -3,21 +3,21 @@ import numba_dppy.config as dppy_config -class DpplOffloadDispatcher(dispatcher.Dispatcher): +class DppyOffloadDispatcher(dispatcher.Dispatcher): targetdescr = cpu_target def __init__(self, py_func, locals={}, targetoptions={}, impl_kind='direct', pipeline_class=compiler.Compiler): if dppy_config.dppy_present: - from numba_dppy.compiler import DPPLCompiler + from numba_dppy.compiler import DPPYCompiler targetoptions['parallel'] = True dispatcher.Dispatcher.__init__(self, py_func, locals=locals, - targetoptions=targetoptions, impl_kind=impl_kind, pipeline_class=DPPLCompiler) + targetoptions=targetoptions, impl_kind=impl_kind, pipeline_class=DPPYCompiler) else: print("---------------------------------------------------------------------") - print("WARNING : DPPL pipeline ignored. Ensure OpenCL drivers are installed.") + print("WARNING : DPPY pipeline ignored. 
Ensure OpenCL drivers are installed.") print("---------------------------------------------------------------------") dispatcher.Dispatcher.__init__(self, py_func, locals=locals, targetoptions=targetoptions, impl_kind=impl_kind, pipeline_class=pipeline_class) -dispatcher_registry['__dppl_offload_gpu__'] = DpplOffloadDispatcher -dispatcher_registry['__dppl_offload_cpu__'] = DpplOffloadDispatcher +dispatcher_registry['__dppy_offload_gpu__'] = DppyOffloadDispatcher +dispatcher_registry['__dppy_offload_cpu__'] = DppyOffloadDispatcher diff --git a/numba_dppy/dppl_passbuilder.py b/numba_dppy/dppy_passbuilder.py similarity index 82% rename from numba_dppy/dppl_passbuilder.py rename to numba_dppy/dppy_passbuilder.py index 0ddaea6d0b..0a32a099cf 100644 --- a/numba_dppy/dppl_passbuilder.py +++ b/numba_dppy/dppy_passbuilder.py @@ -17,19 +17,19 @@ DumpParforDiagnostics, IRLegalization, InlineOverloads, PreLowerStripPhis) -from .dppl_passes import ( - DPPLConstantSizeStaticLocalMemoryPass, - DPPLPreParforPass, - DPPLParforPass, +from .dppy_passes import ( + DPPYConstantSizeStaticLocalMemoryPass, + DPPYPreParforPass, + DPPYParforPass, SpirvFriendlyLowering, - DPPLAddNumpyOverloadPass, - DPPLAddNumpyRemoveOverloadPass, - DPPLNoPythonBackend + DPPYAddNumpyOverloadPass, + DPPYAddNumpyRemoveOverloadPass, + DPPYNoPythonBackend ) -class DPPLPassBuilder(object): +class DPPYPassBuilder(object): """ - This is the DPPL pass builder to run Intel GPU/CPU specific + This is the DPPY pass builder to run Intel GPU/CPU specific code-generation and optimization passes. This pass builder does not offer objectmode and interpreted passes. """ @@ -46,12 +46,12 @@ def default_numba_nopython_pipeline(state, pm): # this pass adds required logic to overload default implementation of # Numpy functions - pm.add_pass(DPPLAddNumpyOverloadPass, "dppl add typing template for Numpy functions") + pm.add_pass(DPPYAddNumpyOverloadPass, "dppy add typing template for Numpy functions") # Add pass to ensure when users are allocating static # constant memory the size is a constant and can not # come from a closure variable - pm.add_pass(DPPLConstantSizeStaticLocalMemoryPass, "dppl constant size for static local memory") + pm.add_pass(DPPYConstantSizeStaticLocalMemoryPass, "dppy constant size for static local memory") # pre typing if not state.flags.no_rewrites: @@ -90,24 +90,24 @@ def default_numba_nopython_pipeline(state, pm): @staticmethod - def define_nopython_pipeline(state, name='dppl_nopython'): + def define_nopython_pipeline(state, name='dppy_nopython'): """Returns an nopython mode pipeline based PassManager """ pm = PassManager(name) - DPPLPassBuilder.default_numba_nopython_pipeline(state, pm) + DPPYPassBuilder.default_numba_nopython_pipeline(state, pm) # Intel GPU/CPU specific optimizations - pm.add_pass(DPPLPreParforPass, "Preprocessing for parfors") + pm.add_pass(DPPYPreParforPass, "Preprocessing for parfors") if not state.flags.no_rewrites: pm.add_pass(NopythonRewrites, "nopython rewrites") - pm.add_pass(DPPLParforPass, "convert to parfors") + pm.add_pass(DPPYParforPass, "convert to parfors") # legalise pm.add_pass(IRLegalization, "ensure IR is legal prior to lowering") # lower pm.add_pass(SpirvFriendlyLowering, "SPIRV-friendly lowering pass") - pm.add_pass(DPPLNoPythonBackend, "nopython mode backend") - pm.add_pass(DPPLAddNumpyRemoveOverloadPass, "dppl remove typing template for Numpy functions") + pm.add_pass(DPPYNoPythonBackend, "nopython mode backend") + pm.add_pass(DPPYAddNumpyRemoveOverloadPass, "dppy remove typing template 
for Numpy functions") pm.finalize() return pm diff --git a/numba_dppy/dppl_passes.py b/numba_dppy/dppy_passes.py similarity index 95% rename from numba_dppy/dppl_passes.py rename to numba_dppy/dppy_passes.py index f9e2633c3c..0bb2eadb48 100644 --- a/numba_dppy/dppl_passes.py +++ b/numba_dppy/dppy_passes.py @@ -24,7 +24,7 @@ from numba.core.compiler_machinery import FunctionPass, LoweringPass, register_pass -from .dppl_lowerer import DPPLLower +from .dppy_lowerer import DPPYLower from numba.parfors.parfor import PreParforPass as _parfor_PreParforPass, replace_functions_map from numba.parfors.parfor import ParforPass as _parfor_ParforPass @@ -40,8 +40,8 @@ def dpnp_available(): @register_pass(mutates_CFG=False, analysis_only=True) -class DPPLAddNumpyOverloadPass(FunctionPass): - _name = "dppl_add_numpy_overload_pass" +class DPPYAddNumpyOverloadPass(FunctionPass): + _name = "dppy_add_numpy_overload_pass" def __init__(self): FunctionPass.__init__(self) @@ -122,8 +122,8 @@ def generic(self, args, kws): return True @register_pass(mutates_CFG=False, analysis_only=True) -class DPPLAddNumpyRemoveOverloadPass(FunctionPass): - _name = "dppl_remove_numpy_overload_pass" +class DPPYAddNumpyRemoveOverloadPass(FunctionPass): + _name = "dppy_remove_numpy_overload_pass" def __init__(self): FunctionPass.__init__(self) @@ -143,9 +143,9 @@ def run_pass(self, state): return True @register_pass(mutates_CFG=True, analysis_only=False) -class DPPLConstantSizeStaticLocalMemoryPass(FunctionPass): +class DPPYConstantSizeStaticLocalMemoryPass(FunctionPass): - _name = "dppl_constant_size_static_local_memory_pass" + _name = "dppy_constant_size_static_local_memory_pass" def __init__(self): FunctionPass.__init__(self) @@ -218,9 +218,9 @@ def run_pass(self, state): @register_pass(mutates_CFG=True, analysis_only=False) -class DPPLPreParforPass(FunctionPass): +class DPPYPreParforPass(FunctionPass): - _name = "dppl_pre_parfor_pass" + _name = "dppy_pre_parfor_pass" def __init__(self): FunctionPass.__init__(self) @@ -262,9 +262,9 @@ def run_pass(self, state): @register_pass(mutates_CFG=True, analysis_only=False) -class DPPLParforPass(FunctionPass): +class DPPYParforPass(FunctionPass): - _name = "dppl_parfor_pass" + _name = "dppy_parfor_pass" def __init__(self): FunctionPass.__init__(self) @@ -343,9 +343,9 @@ def run_pass(self, state): targetctx = state.targetctx # This should not happen here, after we have the notion of context in Numba - # we should have specialized dispatcher for dppl context and that dispatcher + # we should have specialized dispatcher for dppy context and that dispatcher # should be a cpu dispatcher that will overload the lowering functions for - # linalg for dppl.cpu_dispatcher and the dppl.gpu_dipatcher should be the + # linalg for dppy.cpu_dispatcher and the dppy.gpu_dipatcher should be the # current target context we have to launch kernels. 
# This is broken as this essentially adds the new lowering in a list which # means it does not get replaced with the new lowering_buitins @@ -373,7 +373,7 @@ def run_pass(self, state): noalias=flags.noalias) with targetctx.push_code_library(library): - lower = DPPLLower(targetctx, library, fndesc, interp, + lower = DPPYLower(targetctx, library, fndesc, interp, metadata=metadata) lower.lower() if not flags.no_cpython_wrapper: @@ -400,7 +400,7 @@ def run_pass(self, state): @register_pass(mutates_CFG=True, analysis_only=False) -class DPPLNoPythonBackend(FunctionPass): +class DPPYNoPythonBackend(FunctionPass): _name = "nopython_backend" diff --git a/numba_dppy/examples/dppl_func.py b/numba_dppy/examples/dppy_func.py similarity index 81% rename from numba_dppy/examples/dppl_func.py rename to numba_dppy/examples/dppy_func.py index ec86681457..353ba48995 100644 --- a/numba_dppy/examples/dppl_func.py +++ b/numba_dppy/examples/dppy_func.py @@ -1,26 +1,26 @@ import sys import numpy as np -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import math import dpctl -@dppl.func +@dppy.func def g(a): return a + 1 -@dppl.kernel +@dppy.kernel def f(a, b): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) b[i] = g(a[i]) def driver(a, b, N): print(b) print("--------") - f[N, dppl.DEFAULT_LOCAL_SIZE](a, b) + f[N, dppy.DEFAULT_LOCAL_SIZE](a, b) print(b) diff --git a/numba_dppy/examples/dppl_with_context.py b/numba_dppy/examples/dppy_with_context.py similarity index 94% rename from numba_dppy/examples/dppl_with_context.py rename to numba_dppy/examples/dppy_with_context.py index c830e81ec6..6df025f5ca 100644 --- a/numba_dppy/examples/dppl_with_context.py +++ b/numba_dppy/examples/dppy_with_context.py @@ -1,6 +1,6 @@ import numpy as np from numba import njit, prange -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import dpctl @njit diff --git a/numba_dppy/examples/matmul.py b/numba_dppy/examples/matmul.py index 35bef5be8a..b97ac49ca1 100644 --- a/numba_dppy/examples/matmul.py +++ b/numba_dppy/examples/matmul.py @@ -4,14 +4,14 @@ import sys import numpy as np -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import dpctl -@dppl.kernel -def dppl_gemm(a, b, c): - i = dppl.get_global_id(0) - j = dppl.get_global_id(1) +@dppy.kernel +def dppy_gemm(a, b, c): + i = dppy.get_global_id(0) + j = dppy.get_global_id(1) if i >= c.shape[0] or j >= c.shape[1]: return c[i,j] = 0 @@ -30,7 +30,7 @@ def dppl_gemm(a, b, c): def driver(a, b, c): # Invoke the kernel - dppl_gemm[griddim,blockdim](a, b, c) + dppy_gemm[griddim,blockdim](a, b, c) def main(): diff --git a/numba_dppy/examples/pairwise_distance.py b/numba_dppy/examples/pairwise_distance.py index cc5c232c92..b72c41ba9c 100644 --- a/numba_dppy/examples/pairwise_distance.py +++ b/numba_dppy/examples/pairwise_distance.py @@ -6,7 +6,7 @@ import argparse import timeit -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import dpctl import dpctl._memory as dpctl_mem @@ -28,9 +28,9 @@ D = np.empty((args.n, args.n)) -@dppl.kernel +@dppy.kernel def pairwise_distance(X, D, xshape0, xshape1): - idx = dppl.get_global_id(0) + idx = dppy.get_global_id(0) #for i in range(xshape0): for j in range(X.shape[0]): diff --git a/numba_dppy/examples/sum-hybrid.py b/numba_dppy/examples/sum-hybrid.py index 418976f53a..e66c51ae2c 100644 --- a/numba_dppy/examples/sum-hybrid.py +++ b/numba_dppy/examples/sum-hybrid.py @@ -4,13 +4,13 @@ import sys import numpy as np -import numba_dppy, 
numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import dpctl -@dppl.kernel +@dppy.kernel def data_parallel_sum(a, b, c): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) c[i] = a[i] + b[i] @@ -27,7 +27,7 @@ def main(): c = np.ones_like(a) print("before A: ", a) print("before B: ", b) - data_parallel_sum[global_size, dppl.DEFAULT_LOCAL_SIZE](a, b, c) + data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c) print("after C: ", c) else: print("CPU device not found") @@ -40,7 +40,7 @@ def main(): c = np.ones_like(a) print("before A: ", a) print("before B: ", b) - data_parallel_sum[global_size, dppl.DEFAULT_LOCAL_SIZE](a, b, c) + data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c) print("after C: ", c) else: print("GPU device not found") diff --git a/numba_dppy/examples/sum.py b/numba_dppy/examples/sum.py index f97b8243cb..fdc1623fa7 100644 --- a/numba_dppy/examples/sum.py +++ b/numba_dppy/examples/sum.py @@ -4,13 +4,13 @@ import sys import numpy as np -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import dpctl -@dppl.kernel +@dppy.kernel def data_parallel_sum(a, b, c): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) c[i] = a[i] + b[i] @@ -18,7 +18,7 @@ def driver(a, b, c, global_size): print("before : ", a) print("before : ", b) print("before : ", c) - data_parallel_sum[global_size, dppl.DEFAULT_LOCAL_SIZE](a, b, c) + data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c) print("after : ", c) diff --git a/numba_dppy/examples/sum2D.py b/numba_dppy/examples/sum2D.py index 00be613d2b..90959c8bdf 100644 --- a/numba_dppy/examples/sum2D.py +++ b/numba_dppy/examples/sum2D.py @@ -4,21 +4,21 @@ import sys import numpy as np -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import dpctl -@dppl.kernel +@dppy.kernel def data_parallel_sum(a, b, c): - i = dppl.get_global_id(0) - j = dppl.get_global_id(1) + i = dppy.get_global_id(0) + j = dppy.get_global_id(1) c[i,j] = a[i,j] + b[i,j] def driver(a, b, c, global_size): print("before A: ", a) print("before B: ", b) - data_parallel_sum[global_size, dppl.DEFAULT_LOCAL_SIZE](a, b, c) + data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c) print("after C : ", c) diff --git a/numba_dppy/examples/sum_ndarray.py b/numba_dppy/examples/sum_ndarray.py index 6486be0275..2aea8e080a 100644 --- a/numba_dppy/examples/sum_ndarray.py +++ b/numba_dppy/examples/sum_ndarray.py @@ -4,13 +4,13 @@ import sys import numpy as np -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import dpctl -@dppl.kernel(access_types={"read_only": ['a', 'b'], "write_only": ['c'], "read_write": []}) +@dppy.kernel(access_types={"read_only": ['a', 'b'], "write_only": ['c'], "read_write": []}) def data_parallel_sum(a, b, c): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) c[i] = a[i] + b[i] diff --git a/numba_dppy/examples/sum_reduction.py b/numba_dppy/examples/sum_reduction.py index 3e00f95631..367fa37952 100644 --- a/numba_dppy/examples/sum_reduction.py +++ b/numba_dppy/examples/sum_reduction.py @@ -4,13 +4,13 @@ import math import time -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import dpctl -@dppl.kernel +@dppy.kernel def reduction_kernel(A, R, stride): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) # sum two element R[i] = A[i] + A[i+stride] # store the sum to be used in nex iteration @@ -34,7 +34,7 @@ def test_sum_reduction(): while (total > 1): # call kernel global_size = total // 2 - 
reduction_kernel[global_size, dppl.DEFAULT_LOCAL_SIZE](A, R, global_size) + reduction_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, R, global_size) total = total // 2 else: diff --git a/numba_dppy/examples/sum_reduction_ocl.py b/numba_dppy/examples/sum_reduction_ocl.py index e2605a7bbc..8d8e0411aa 100644 --- a/numba_dppy/examples/sum_reduction_ocl.py +++ b/numba_dppy/examples/sum_reduction_ocl.py @@ -1,20 +1,20 @@ import sys import numpy as np from numba import int32 -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import math import dpctl def sum_reduction_device_plus_host(): - @dppl.kernel + @dppy.kernel def sum_reduction_kernel(inp, partial_sums): - local_id = dppl.get_local_id(0) - global_id = dppl.get_global_id(0) - group_size = dppl.get_local_size(0) - group_id = dppl.get_group_id(0) + local_id = dppy.get_local_id(0) + global_id = dppy.get_global_id(0) + group_size = dppy.get_local_size(0) + group_id = dppy.get_group_id(0) - local_sums = dppl.local.static_alloc(64, int32) + local_sums = dppy.local.static_alloc(64, int32) # Copy from global to local memory local_sums[local_id] = inp[global_id] @@ -23,7 +23,7 @@ def sum_reduction_kernel(inp, partial_sums): stride = group_size // 2 while (stride > 0): # Waiting for each 2x2 addition into given workgroup - dppl.barrier(dppl.CLK_LOCAL_MEM_FENCE) + dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE) # Add elements 2 by 2 between local_id and local_id + stride if (local_id < stride): diff --git a/numba_dppy/examples/sum_reduction_recursive_ocl.py b/numba_dppy/examples/sum_reduction_recursive_ocl.py index 11f5023a3b..c5dd6daa47 100644 --- a/numba_dppy/examples/sum_reduction_recursive_ocl.py +++ b/numba_dppy/examples/sum_reduction_recursive_ocl.py @@ -1,7 +1,7 @@ import sys import numpy as np from numba import int32 -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import math import dpctl @@ -11,15 +11,15 @@ def recursive_reduction(size, group_size, Dinp, Dpartial_sums): - @dppl.kernel + @dppy.kernel def sum_reduction_kernel(inp, input_size, partial_sums): - local_id = dppl.get_local_id(0) - global_id = dppl.get_global_id(0) - group_size = dppl.get_local_size(0) - group_id = dppl.get_group_id(0) + local_id = dppy.get_local_id(0) + global_id = dppy.get_global_id(0) + group_size = dppy.get_local_size(0) + group_id = dppy.get_group_id(0) - local_sums = dppl.local.static_alloc(64, int32) + local_sums = dppy.local.static_alloc(64, int32) local_sums[local_id] = 0 @@ -30,7 +30,7 @@ def sum_reduction_kernel(inp, input_size, stride = group_size // 2 while (stride > 0): # Waiting for each 2x2 addition into given workgroup - dppl.barrier(dppl.CLK_LOCAL_MEM_FENCE) + dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE) # Add elements 2 by 2 between local_id and local_id + stride if (local_id < stride): diff --git a/numba_dppy/experimental_numpy_lowering_overload.py b/numba_dppy/experimental_numpy_lowering_overload.py index 2123e6667d..dd1e2a1eb6 100644 --- a/numba_dppy/experimental_numpy_lowering_overload.py +++ b/numba_dppy/experimental_numpy_lowering_overload.py @@ -77,7 +77,7 @@ def get_sycl_queue(context, builder): void_ptr_t = context.get_value_type(types.voidptr) get_queue_fnty = lc.Type.function(void_ptr_t, ()) get_queue = builder.module.get_or_insert_function(get_queue_fnty, - name="DPPLQueueMgr_GetCurrentQueue") + name="DPCTLQueueMgr_GetCurrentQueue") sycl_queue_val = cgutils.alloca_once(builder, void_ptr_t) builder.store(builder.call(get_queue, []), sycl_queue_val) @@ -87,7 +87,7 @@ def allocate_usm(context, 
builder, size, sycl_queue): void_ptr_t = context.get_value_type(types.voidptr) usm_shared_fnty = lc.Type.function(void_ptr_t, [ll_intp_t, void_ptr_t]) usm_shared = builder.module.get_or_insert_function(usm_shared_fnty, - name="DPPLmalloc_shared") + name="DPCTLmalloc_shared") buffer_ptr = cgutils.alloca_once(builder, void_ptr_t) args = [size, builder.load(sycl_queue)] @@ -100,7 +100,7 @@ def copy_usm(context, builder, src, dst, size, sycl_queue): queue_memcpy_fnty = lc.Type.function(ir.VoidType(), [void_ptr_t, void_ptr_t, void_ptr_t, ll_intp_t]) queue_memcpy = builder.module.get_or_insert_function(queue_memcpy_fnty, - name="DPPLQueue_Memcpy") + name="DPCTLQueue_Memcpy") args = [builder.load(sycl_queue), builder.bitcast(dst, void_ptr_t), builder.bitcast(src, void_ptr_t), @@ -113,7 +113,7 @@ def free_usm(context, builder, usm_buf, sycl_queue): usm_free_fnty = lc.Type.function(ir.VoidType(), [void_ptr_t, void_ptr_t]) usm_free = builder.module.get_or_insert_function(usm_free_fnty, - name="DPPLfree_with_queue") + name="DPCTLfree_with_queue") builder.call(usm_free, [usm_buf, builder.load(sycl_queue)]) @@ -350,7 +350,7 @@ def make_res(a, b): @lower_builtin(np.dot, types.Array, types.Array) -def dot_dppl(context, builder, sig, args): +def dot_dppy(context, builder, sig, args): """ np.dot(a, b) a @ b @@ -374,7 +374,7 @@ def dot_dppl(context, builder, sig, args): @lower_builtin("np.matmul", types.Array, types.Array) -def matmul_dppl(context, builder, sig, args): +def matmul_dppy(context, builder, sig, args): """ np.matmul(matrix, matrix) """ diff --git a/numba_dppy/initialize.py b/numba_dppy/initialize.py index c8ba56220a..745e8031eb 100644 --- a/numba_dppy/initialize.py +++ b/numba_dppy/initialize.py @@ -5,8 +5,8 @@ def init_jit(): - from numba_dppy.dispatcher import DPPLDispatcher - return DPPLDispatcher + from numba_dppy.dispatcher import DPPYDispatcher + return DPPYDispatcher def initialize_all(): from numba.core.registry import dispatcher_registry @@ -17,9 +17,9 @@ def initialize_all(): import platform as plt platform = plt.system() if platform == 'Windows': - paths = glob.glob(os.path.join(os.path.dirname(dpctl.__file__), '*DPPLSyclInterface.dll')) + paths = glob.glob(os.path.join(os.path.dirname(dpctl.__file__), '*DPCTLSyclInterface.dll')) else: - paths = glob.glob(os.path.join(os.path.dirname(dpctl.__file__), '*DPPLSyclInterface*')) + paths = glob.glob(os.path.join(os.path.dirname(dpctl.__file__), '*DPCTLSyclInterface*')) if len(paths) == 1: ll.load_library_permanently(find_library(paths[0])) diff --git a/numba_dppy/ocl/atomics/atomic_ops.cl b/numba_dppy/ocl/atomics/atomic_ops.cl index ad581716de..56228d8bf5 100644 --- a/numba_dppy/ocl/atomics/atomic_ops.cl +++ b/numba_dppy/ocl/atomics/atomic_ops.cl @@ -5,7 +5,7 @@ #ifdef cl_khr_int64_base_atomics #pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable - long numba_dppl_atomic_add_i64_local(volatile __generic long *p, long val) { + long numba_dppy_atomic_add_i64_local(volatile __generic long *p, long val) { long found = *p; long expected; do { @@ -15,7 +15,7 @@ return found; } - long numba_dppl_atomic_add_i64_global(volatile __generic long *p, long val) { + long numba_dppy_atomic_add_i64_global(volatile __generic long *p, long val) { long found = *p; long expected; do { @@ -25,7 +25,7 @@ return found; } - long numba_dppl_atomic_sub_i64_local(volatile __generic long *p, long val) { + long numba_dppy_atomic_sub_i64_local(volatile __generic long *p, long val) { long found = *p; long expected; do { @@ -35,7 +35,7 @@ return found; } - long 
numba_dppl_atomic_sub_i64_global(volatile __generic long *p, long val) { + long numba_dppy_atomic_sub_i64_global(volatile __generic long *p, long val) { long found = *p; long expected; do { @@ -48,7 +48,7 @@ #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64: enable - double numba_dppl_atomic_cmpxchg_f64_local(volatile __generic double *p, double cmp, double val) { + double numba_dppy_atomic_cmpxchg_f64_local(volatile __generic double *p, double cmp, double val) { union { ulong u64; double f64; @@ -60,7 +60,7 @@ return old_union.f64; } - double numba_dppl_atomic_cmpxchg_f64_global(volatile __generic double *p, double cmp, double val) { + double numba_dppy_atomic_cmpxchg_f64_global(volatile __generic double *p, double cmp, double val) { union { ulong u64; double f64; @@ -72,50 +72,50 @@ return old_union.f64; } - double numba_dppl_atomic_add_f64_local(volatile __generic double *p, double val) { + double numba_dppy_atomic_add_f64_local(volatile __generic double *p, double val) { double found = *p; double expected; do { expected = found; - found = numba_dppl_atomic_cmpxchg_f64_local(p, expected, expected + val); + found = numba_dppy_atomic_cmpxchg_f64_local(p, expected, expected + val); } while (found != expected); return found; } - double numba_dppl_atomic_add_f64_global(volatile __generic double *p, double val) { + double numba_dppy_atomic_add_f64_global(volatile __generic double *p, double val) { double found = *p; double expected; do { expected = found; - found = numba_dppl_atomic_cmpxchg_f64_global(p, expected, expected + val); + found = numba_dppy_atomic_cmpxchg_f64_global(p, expected, expected + val); } while (found != expected); return found; } - double numba_dppl_atomic_sub_f64_local(volatile __generic double *p, double val) { + double numba_dppy_atomic_sub_f64_local(volatile __generic double *p, double val) { double found = *p; double expected; do { expected = found; - found = numba_dppl_atomic_cmpxchg_f64_local(p, expected, expected - val); + found = numba_dppy_atomic_cmpxchg_f64_local(p, expected, expected - val); } while (found != expected); return found; } - double numba_dppl_atomic_sub_f64_global(volatile __generic double *p, double val) { + double numba_dppy_atomic_sub_f64_global(volatile __generic double *p, double val) { double found = *p; double expected; do { expected = found; - found = numba_dppl_atomic_cmpxchg_f64_global(p, expected, expected - val); + found = numba_dppy_atomic_cmpxchg_f64_global(p, expected, expected - val); } while (found != expected); return found; } #endif #endif -float numba_dppl_atomic_cmpxchg_f32_local(volatile __generic float *p, float cmp, float val) { +float numba_dppy_atomic_cmpxchg_f32_local(volatile __generic float *p, float cmp, float val) { union { unsigned int u32; float f32; @@ -127,7 +127,7 @@ float numba_dppl_atomic_cmpxchg_f32_local(volatile __generic float *p, float cmp return old_union.f32; } -float numba_dppl_atomic_cmpxchg_f32_global(volatile __generic float *p, float cmp, float val) { +float numba_dppy_atomic_cmpxchg_f32_global(volatile __generic float *p, float cmp, float val) { union { unsigned int u32; float f32; @@ -139,47 +139,47 @@ float numba_dppl_atomic_cmpxchg_f32_global(volatile __generic float *p, float cm return old_union.f32; } -float numba_dppl_atomic_add_f32_local(volatile __generic float *p, float val) { +float numba_dppy_atomic_add_f32_local(volatile __generic float *p, float val) { float found = *p; float expected; do { expected = found; - found = numba_dppl_atomic_cmpxchg_f32_local(p, expected, expected + 
val); + found = numba_dppy_atomic_cmpxchg_f32_local(p, expected, expected + val); } while (found != expected); return found; } -float numba_dppl_atomic_add_f32_global(volatile __generic float *p, float val) { +float numba_dppy_atomic_add_f32_global(volatile __generic float *p, float val) { float found = *p; float expected; do { expected = found; - found = numba_dppl_atomic_cmpxchg_f32_global(p, expected, expected + val); + found = numba_dppy_atomic_cmpxchg_f32_global(p, expected, expected + val); } while (found != expected); return found; } -float numba_dppl_atomic_sub_f32_local(volatile __generic float *p, float val) { +float numba_dppy_atomic_sub_f32_local(volatile __generic float *p, float val) { float found = *p; float expected; do { expected = found; - found = numba_dppl_atomic_cmpxchg_f32_local(p, expected, expected - val); + found = numba_dppy_atomic_cmpxchg_f32_local(p, expected, expected - val); } while (found != expected); return found; } -float numba_dppl_atomic_sub_f32_global(volatile __generic float *p, float val) { +float numba_dppy_atomic_sub_f32_global(volatile __generic float *p, float val) { float found = *p; float expected; do { expected = found; - found = numba_dppl_atomic_cmpxchg_f32_global(p, expected, expected - val); + found = numba_dppy_atomic_cmpxchg_f32_global(p, expected, expected - val); } while (found != expected); return found; } -int numba_dppl_atomic_add_i32_local(volatile __generic int *p, int val) { +int numba_dppy_atomic_add_i32_local(volatile __generic int *p, int val) { int found = *p; int expected; do { @@ -189,7 +189,7 @@ int numba_dppl_atomic_add_i32_local(volatile __generic int *p, int val) { return found; } -int numba_dppl_atomic_add_i32_global(volatile __generic int *p, int val) { +int numba_dppy_atomic_add_i32_global(volatile __generic int *p, int val) { int found = *p; int expected; do { @@ -199,7 +199,7 @@ int numba_dppl_atomic_add_i32_global(volatile __generic int *p, int val) { return found; } -int numba_dppl_atomic_sub_i32_local(volatile __generic int *p, int val) { +int numba_dppy_atomic_sub_i32_local(volatile __generic int *p, int val) { int found = *p; int expected; do { @@ -209,7 +209,7 @@ int numba_dppl_atomic_sub_i32_local(volatile __generic int *p, int val) { return found; } -int numba_dppl_atomic_sub_i32_global(volatile __generic int *p, int val) { +int numba_dppy_atomic_sub_i32_global(volatile __generic int *p, int val) { int found = *p; int expected; do { diff --git a/numba_dppy/ocl/ocldecl.py b/numba_dppy/ocl/ocldecl.py index 1af90a6884..adf14a1815 100644 --- a/numba_dppy/ocl/ocldecl.py +++ b/numba_dppy/ocl/ocldecl.py @@ -4,7 +4,7 @@ from numba.core.typing.templates import (AttributeTemplate, ConcreteTemplate, AbstractTemplate, MacroTemplate, signature, Registry) -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy registry = Registry() intrinsic = registry.register @@ -15,71 +15,71 @@ @intrinsic class Ocl_get_global_id(ConcreteTemplate): - key = dppl.get_global_id + key = dppy.get_global_id cases = [signature(types.intp, types.uint32)] @intrinsic class Ocl_get_local_id(ConcreteTemplate): - key = dppl.get_local_id + key = dppy.get_local_id cases = [signature(types.intp, types.uint32)] @intrinsic class Ocl_get_group_id(ConcreteTemplate): - key = dppl.get_group_id + key = dppy.get_group_id cases = [signature(types.intp, types.uint32)] @intrinsic class Ocl_get_num_groups(ConcreteTemplate): - key = dppl.get_num_groups + key = dppy.get_num_groups cases = [signature(types.intp, types.uint32)] @intrinsic class 
Ocl_get_work_dim(ConcreteTemplate): - key = dppl.get_work_dim + key = dppy.get_work_dim cases = [signature(types.uint32)] @intrinsic class Ocl_get_global_size(ConcreteTemplate): - key = dppl.get_global_size + key = dppy.get_global_size cases = [signature(types.intp, types.uint32)] @intrinsic class Ocl_get_local_size(ConcreteTemplate): - key = dppl.get_local_size + key = dppy.get_local_size cases = [signature(types.intp, types.uint32)] @intrinsic class Ocl_barrier(ConcreteTemplate): - key = dppl.barrier + key = dppy.barrier cases = [signature(types.void, types.uint32), signature(types.void)] @intrinsic class Ocl_mem_fence(ConcreteTemplate): - key = dppl.mem_fence + key = dppy.mem_fence cases = [signature(types.void, types.uint32)] @intrinsic class Ocl_sub_group_barrier(ConcreteTemplate): - key = dppl.sub_group_barrier + key = dppy.sub_group_barrier cases = [signature(types.void)] -# dppl.atomic submodule ------------------------------------------------------- +# dppy.atomic submodule ------------------------------------------------------- @intrinsic class Ocl_atomic_add(AbstractTemplate): - key = dppl.atomic.add + key = dppy.atomic.add def generic(self, args, kws): assert not kws @@ -92,7 +92,7 @@ def generic(self, args, kws): @intrinsic class Ocl_atomic_sub(AbstractTemplate): - key = dppl.atomic.sub + key = dppy.atomic.sub def generic(self, args, kws): assert not kws @@ -106,7 +106,7 @@ def generic(self, args, kws): @intrinsic_attr class OclAtomicTemplate(AttributeTemplate): - key = types.Module(dppl.atomic) + key = types.Module(dppy.atomic) def resolve_add(self, mod): return types.Function(Ocl_atomic_add) @@ -115,15 +115,15 @@ def resolve_sub(self, mod): return types.Function(Ocl_atomic_sub) -# dppl.local submodule ------------------------------------------------------- +# dppy.local submodule ------------------------------------------------------- class Ocl_local_alloc(MacroTemplate): - key = dppl.local.static_alloc + key = dppy.local.static_alloc @intrinsic_attr class OclLocalTemplate(AttributeTemplate): - key = types.Module(dppl.local) + key = types.Module(dppy.local) def resolve_static_alloc(self, mod): return types.Macro(Ocl_local_alloc) @@ -133,7 +133,7 @@ def resolve_static_alloc(self, mod): @intrinsic_attr class OclModuleTemplate(AttributeTemplate): - key = types.Module(dppl) + key = types.Module(dppy) def resolve_get_global_id(self, mod): return types.Function(Ocl_get_global_id) @@ -166,11 +166,11 @@ def resolve_sub_group_barrier(self, mod): return types.Function(Ocl_sub_group_barrier) def resolve_atomic(self, mod): - return types.Module(dppl.atomic) + return types.Module(dppy.atomic) def resolve_local(self, mod): - return types.Module(dppl.local) + return types.Module(dppy.local) # intrinsic -#intrinsic_global(dppl, types.Module(dppl)) +#intrinsic_global(dppy, types.Module(dppy)) diff --git a/numba_dppy/ocl/oclimpl.py b/numba_dppy/ocl/oclimpl.py index b92dca7bae..26f8482799 100644 --- a/numba_dppy/ocl/oclimpl.py +++ b/numba_dppy/ocl/oclimpl.py @@ -169,9 +169,9 @@ def insert_and_call_atomic_fn(context, builder, sig, fn_type, ll_val = ir.IntType(32) ll_p = ll_val.as_pointer() if fn_type == "add": - name = "numba_dppl_atomic_add_i32" + name = "numba_dppy_atomic_add_i32" elif fn_type == "sub": - name = "numba_dppl_atomic_sub_i32" + name = "numba_dppy_atomic_sub_i32" else: raise TypeError("Operation type is not supported %s" % (fn_type)) @@ -182,9 +182,9 @@ def insert_and_call_atomic_fn(context, builder, sig, fn_type, ll_val = ir.IntType(64) ll_p = ll_val.as_pointer() if fn_type == 
"add": - name = "numba_dppl_atomic_add_i64" + name = "numba_dppy_atomic_add_i64" elif fn_type == "sub": - name = "numba_dppl_atomic_sub_i64" + name = "numba_dppy_atomic_sub_i64" else: raise TypeError("Operation type is not supported %s" % (fn_type)) @@ -195,9 +195,9 @@ def insert_and_call_atomic_fn(context, builder, sig, fn_type, ll_val = ir.FloatType() ll_p = ll_val.as_pointer() if fn_type == "add": - name = "numba_dppl_atomic_add_f32" + name = "numba_dppy_atomic_add_f32" elif fn_type == "sub": - name = "numba_dppl_atomic_sub_f32" + name = "numba_dppy_atomic_sub_f32" else: raise TypeError("Operation type is not supported %s" % (fn_type)) @@ -208,9 +208,9 @@ def insert_and_call_atomic_fn(context, builder, sig, fn_type, ll_val = ir.DoubleType() ll_p = ll_val.as_pointer() if fn_type == "add": - name = "numba_dppl_atomic_add_f64" + name = "numba_dppy_atomic_add_f64" elif fn_type == "sub": - name = "numba_dppl_atomic_sub_f64" + name = "numba_dppy_atomic_sub_f64" else: raise TypeError("Operation type is not supported %s" % (fn_type)) @@ -331,11 +331,11 @@ def atomic_sub_tuple(context, builder, sig, args): raise ImportError("Atomic support is not present, can not perform atomic_add") -@lower('dppl.lmem.alloc', types.UniTuple, types.Any) -def dppl_lmem_alloc_array(context, builder, sig, args): +@lower('dppy.lmem.alloc', types.UniTuple, types.Any) +def dppy_lmem_alloc_array(context, builder, sig, args): shape, dtype = args return _generic_array(context, builder, shape=shape, dtype=dtype, - symbol_name='_dppl_lmem', + symbol_name='_dppy_lmem', addrspace=target.SPIR_LOCAL_ADDRSPACE) diff --git a/numba_dppy/ocl/stubs.py b/numba_dppy/ocl/stubs.py index 2ec95fa9c8..190b685955 100644 --- a/numba_dppy/ocl/stubs.py +++ b/numba_dppy/ocl/stubs.py @@ -83,9 +83,9 @@ def sub_group_barrier(): class Stub(object): """A stub object to represent special objects which is meaningless - outside the context of DPPL compilation context. + outside the context of DPPY compilation context. 
""" - _description_ = '' + _description_ = '' __slots__ = () # don't allocate __dict__ def __new__(cls): @@ -100,7 +100,7 @@ def __repr__(self): def local_alloc(shape, dtype): shape = _legalize_shape(shape) ndim = len(shape) - fname = "dppl.lmem.alloc" + fname = "dppy.lmem.alloc" restype = types.Array(dtype, ndim, 'C', addrspace=SPIR_LOCAL_ADDRSPACE) sig = typing.signature(restype, types.UniTuple(types.intp, ndim), types.Any) return ir.Intrinsic(fname, sig, args=(shape, dtype)) diff --git a/numba_dppy/printimpl.py b/numba_dppy/printimpl.py index 74319b1bdd..e5c9d4f793 100644 --- a/numba_dppy/printimpl.py +++ b/numba_dppy/printimpl.py @@ -79,8 +79,8 @@ def print_varargs(context, builder, sig, args): va_arg.extend(values) va_arg = tuple(va_arg) - dppl_print = declare_print(builder.module) + dppy_print = declare_print(builder.module) - builder.call(dppl_print, va_arg) + builder.call(dppy_print, va_arg) return context.get_dummy_value() diff --git a/numba_dppy/target.py b/numba_dppy/target.py index aac4efcd4b..6444a6e601 100644 --- a/numba_dppy/target.py +++ b/numba_dppy/target.py @@ -24,7 +24,7 @@ # Typing -class DPPLTypingContext(typing.BaseContext): +class DPPYTypingContext(typing.BaseContext): def load_additional_registries(self): # Declarations for OpenCL API functions and OpenCL Math functions from .ocl import ocldecl, mathdecl @@ -91,7 +91,7 @@ def _replace_numpy_ufunc_with_opencl_supported_functions(): ufunc_db[ufunc][sig] = lower_ocl_impl[(name, sig_mapper[sig])] -class DPPLTargetContext(BaseContext): +class DPPYTargetContext(BaseContext): implement_powi_as_math_call = True generic_addrspace = SPIR_GENERIC_ADDRSPACE @@ -153,7 +153,7 @@ def load_additional_registries(self): @cached_property def call_conv(self): - return DPPLCallConv(self) + return DPPYCallConv(self) def codegen(self): return self._internal_codegen @@ -169,7 +169,7 @@ def repl(m): qualified = name + '.' + '.'.join(str(a) for a in argtypes) mangled = VALID_CHARS.sub(repl, qualified) - return 'dppl_py_devfn_' + mangled + return 'dppy_py_devfn_' + mangled def prepare_ocl_kernel(self, func, argtypes): module = func.module @@ -208,8 +208,8 @@ def sub_gen_with_global(lty): llargtys = changed = () wrapperfnty = lc.Type.function(lc.Type.void(), llargtys) - wrapper_module = self.create_module("dppl.kernel.wrapper") - wrappername = 'dpplPy_{name}'.format(name=func.name) + wrapper_module = self.create_module("dppy.kernel.wrapper") + wrappername = 'dppyPy_{name}'.format(name=func.name) argtys = list(arginfo.argument_types) fnty = lc.Type.function(lc.Type.int(), @@ -239,7 +239,7 @@ def sub_gen_with_global(lty): argtypes, callargs) builder.ret_void() - set_dppl_kernel(wrapper) + set_dppy_kernel(wrapper) #print(str(wrapper_module)) # Link @@ -255,9 +255,9 @@ def declare_function(self, module, fndesc): fnty = self.call_conv.get_function_type(fndesc.restype, fndesc.argtypes) fn = module.get_or_insert_function(fnty, name=fndesc.mangled_name) fn.attributes.add('alwaysinline') - ret = super(DPPLTargetContext, self).declare_function(module, fndesc) + ret = super(DPPYTargetContext, self).declare_function(module, fndesc) # XXX: Refactor fndesc instead of this special case - if fndesc.llvm_func_name.startswith('dppl_py_devfn'): + if fndesc.llvm_func_name.startswith('dppy_py_devfn'): ret.calling_convention = CC_SPIR_FUNC return ret @@ -305,7 +305,7 @@ def addrspacecast(self, builder, src, addrspace): return builder.addrspacecast(src, ptras) -def set_dppl_kernel(fn): +def set_dppy_kernel(fn): """ Ensure `fn` is usable as a SPIR kernel. 
- Fix calling convention @@ -332,11 +332,11 @@ def set_dppl_kernel(fn): make_constant = lambda x: lc.Constant.int(lc.Type.int(), x) spir_version_constant = [make_constant(x) for x in SPIR_VERSION] - spir_version = mod.get_or_insert_named_metadata("dppl.spir.version") + spir_version = mod.get_or_insert_named_metadata("dppy.spir.version") if not spir_version.operands: spir_version.add(lc.MetaData.get(mod, spir_version_constant)) - ocl_version = mod.get_or_insert_named_metadata("dppl.ocl.version") + ocl_version = mod.get_or_insert_named_metadata("dppy.ocl.version") if not ocl_version.operands: ocl_version.add(lc.MetaData.get(mod, spir_version_constant)) @@ -414,7 +414,7 @@ def gen_arg_base_type(fn): return lc.MetaData.get(mod, [name] + consts) -class DPPLCallConv(MinimalCallConv): +class DPPYCallConv(MinimalCallConv): def call_function(self, builder, callee, resty, argtys, args, env=None): """ Call the Numba-compiled *callee*. diff --git a/numba_dppy/target_dispatcher.py b/numba_dppy/target_dispatcher.py index 40b9d589d9..dde38eb75b 100644 --- a/numba_dppy/target_dispatcher.py +++ b/numba_dppy/target_dispatcher.py @@ -8,9 +8,9 @@ class TargetDispatcher(serialize.ReduceMixin, metaclass=dispatcher.DispatcherMeta): __numba__ = 'py_func' - target_offload_gpu = '__dppl_offload_gpu__' - target_offload_cpu = '__dppl_offload_cpu__' - target_dppl = 'dppy' + target_offload_gpu = '__dppy_offload_gpu__' + target_offload_cpu = '__dppy_offload_cpu__' + target_dppy = 'dppy' def __init__(self, py_func, wrapper, target, parallel_options, compiled=None): @@ -53,7 +53,7 @@ def get_compiled(self, target=None): return self.__compiled[disp] def __is_with_context_target(self, target): - return target is None or target == TargetDispatcher.target_dppl + return target is None or target == TargetDispatcher.target_dppy def get_current_disp(self): target = self.__target @@ -66,7 +66,7 @@ def get_current_disp(self): if parallel is False or (isinstance(parallel, dict) and parallel.get('offload') is False): raise UnsupportedError(f"Can't use 'with' context with parallel option '{parallel}'") - from numba_dppy import dppl_offload_dispatcher + from numba_dppy import dppy_offload_dispatcher if target is None: if dpctl.get_current_device_type() == dpctl.device_type.gpu: @@ -75,7 +75,7 @@ def get_current_disp(self): return registry.dispatcher_registry[TargetDispatcher.target_offload_cpu] else: if dpctl.is_in_device_context(): - raise UnsupportedError('Unknown dppl device type') + raise UnsupportedError('Unknown dppy device type') if offload: if dpctl.has_gpu_queues(): return registry.dispatcher_registry[TargetDispatcher.target_offload_gpu] diff --git a/numba_dppy/testing.py b/numba_dppy/testing.py index 8da0b7b91e..e309b7f0c9 100644 --- a/numba_dppy/testing.py +++ b/numba_dppy/testing.py @@ -11,7 +11,7 @@ redirect_c_stdout, ) -class DPPLTestCase(SerialMixin, unittest.TestCase): +class DPPYTestCase(SerialMixin, unittest.TestCase): def setUp(self): #init() #TODO @@ -21,7 +21,7 @@ def tearDown(self): #TODO pass -class DPPLTextCapture(object): +class DPPYTextCapture(object): def __init__(self, stream): self._stream = stream @@ -36,16 +36,16 @@ def getvalue(self): return self._stream.getvalue() @contextlib.contextmanager -def captured_dppl_stdout(): +def captured_dppy_stdout(): """ - Return a minimal stream-like object capturing the text output of dppl + Return a minimal stream-like object capturing the text output of dppy """ # Prevent accidentally capturing previously output text sys.stdout.flush() - import numba_dppy, numba_dppy 
as dppl + import numba_dppy, numba_dppy as dppy with redirect_c_stdout() as stream: - yield DPPLTextCapture(stream) + yield DPPYTextCapture(stream) def _id(obj): diff --git a/numba_dppy/tests/__init__.py b/numba_dppy/tests/__init__.py index 5a2199f149..939c95c567 100644 --- a/numba_dppy/tests/__init__.py +++ b/numba_dppy/tests/__init__.py @@ -2,9 +2,11 @@ from numba.testing import load_testsuite from os.path import dirname, join - +import numba_dppy import numba_dppy.config as dppy_config +# from numba_dppy.tests.dppy import * + def load_tests(loader, tests, pattern): suite = SerialSuite() @@ -12,6 +14,6 @@ def load_tests(loader, tests, pattern): if dppy_config.dppy_present: suite.addTests(load_testsuite(loader, dirname(__file__))) else: - print("skipped DPPL tests") + print("skipped DPPY tests") return suite diff --git a/numba_dppy/tests/test_arg_accessor.py b/numba_dppy/tests/test_arg_accessor.py index ecc5d839bb..3de2d31770 100644 --- a/numba_dppy/tests/test_arg_accessor.py +++ b/numba_dppy/tests/test_arg_accessor.py @@ -2,25 +2,25 @@ import numpy as np -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase import dpctl -@dppl.kernel(access_types={"read_only": ['a', 'b'], "write_only": ['c'], "read_write": []}) +@dppy.kernel(access_types={"read_only": ['a', 'b'], "write_only": ['c'], "read_write": []}) def sum_with_accessor(a, b, c): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) c[i] = a[i] + b[i] -@dppl.kernel +@dppy.kernel def sum_without_accessor(a, b, c): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) c[i] = a[i] + b[i] def call_kernel(global_size, local_size, A, B, C, func): - func[global_size, dppl.DEFAULT_LOCAL_SIZE](A, B, C) + func[global_size, dppy.DEFAULT_LOCAL_SIZE](A, B, C) global_size = 10 @@ -33,7 +33,7 @@ def call_kernel(global_size, local_size, @unittest.skipUnless(dpctl.has_cpu_queues(), 'test only on CPU system') -class TestDPPLArgAccessorCPU(DPPLTestCase): +class TestDPPYArgAccessorCPU(DPPYTestCase): def test_arg_with_accessor(self): C = np.ones_like(A) with dpctl.device_context("opencl:cpu") as cpu_queue: @@ -50,7 +50,7 @@ def test_arg_without_accessor(self): @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPPLArgAccessorGPU(DPPLTestCase): +class TestDPPYArgAccessorGPU(DPPYTestCase): def test_arg_with_accessor(self): C = np.ones_like(A) with dpctl.device_context("opencl:gpu") as gpu_queue: diff --git a/numba_dppy/tests/test_arg_types.py b/numba_dppy/tests/test_arg_types.py index fc2eae105d..7b06ef11f8 100644 --- a/numba_dppy/tests/test_arg_types.py +++ b/numba_dppy/tests/test_arg_types.py @@ -2,19 +2,19 @@ import numpy as np -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase import dpctl -@dppl.kernel +@dppy.kernel def mul_kernel(A, B, test): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) B[i] = A[i] * test def call_mul_device_kernel(global_size, A, B, test): - mul_kernel[global_size, dppl.DEFAULT_LOCAL_SIZE](A, B, test) + mul_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, B, test) global_size = 10 @@ -24,7 +24,7 @@ def call_mul_device_kernel(global_size, A, B, test): @unittest.skipUnless(dpctl.has_cpu_queues(), 'test only on CPU system') -class TestDPPLArrayArgCPU(DPPLTestCase): +class 
TestDPPYArrayArgCPU(DPPYTestCase): def test_integer_arg(self): x = np.int32(2) with dpctl.device_context("opencl:cpu") as cpu_queue: @@ -42,7 +42,7 @@ def test_float_arg(self): self.assertTrue(np.all(A * x == B)) def test_bool_arg(self): - @dppl.kernel + @dppy.kernel def check_bool_kernel(A, test): if test: A[0] = 111 @@ -52,14 +52,14 @@ def check_bool_kernel(A, test): A = np.array([0], dtype='float64') with dpctl.device_context("opencl:cpu") as cpu_queue: - check_bool_kernel[global_size, dppl.DEFAULT_LOCAL_SIZE](A, True) + check_bool_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, True) self.assertTrue(A[0] == 111) - check_bool_kernel[global_size, dppl.DEFAULT_LOCAL_SIZE](A, False) + check_bool_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, False) self.assertTrue(A[0] == 222) @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPPLArrayArgGPU(DPPLTestCase): +class TestDPPYArrayArgGPU(DPPYTestCase): def test_integer_arg(self): x = np.int32(2) with dpctl.device_context("opencl:gpu") as gpu_queue: @@ -77,7 +77,7 @@ def test_float_arg(self): self.assertTrue(np.all(A * x == B)) def test_bool_arg(self): - @dppl.kernel + @dppy.kernel def check_bool_kernel(A, test): if test: A[0] = 111 @@ -87,9 +87,9 @@ def check_bool_kernel(A, test): A = np.array([0], dtype='float64') with dpctl.device_context("opencl:gpu") as gpu_queue: - check_bool_kernel[global_size, dppl.DEFAULT_LOCAL_SIZE](A, True) + check_bool_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, True) self.assertTrue(A[0] == 111) - check_bool_kernel[global_size, dppl.DEFAULT_LOCAL_SIZE](A, False) + check_bool_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, False) self.assertTrue(A[0] == 222) if __name__ == '__main__': diff --git a/numba_dppy/tests/test_atomic_op.py b/numba_dppy/tests/test_atomic_op.py index 9825c707d1..9d8e88def1 100644 --- a/numba_dppy/tests/test_atomic_op.py +++ b/numba_dppy/tests/test_atomic_op.py @@ -3,106 +3,106 @@ import numpy as np import numba -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase import dpctl def atomic_add_int32(ary): - tid = dppl.get_local_id(0) - lm = dppl.local.static_alloc(32, numba.uint32) + tid = dppy.get_local_id(0) + lm = dppy.local.static_alloc(32, numba.uint32) lm[tid] = 0 - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) bin = ary[tid] % 32 - dppl.atomic.add(lm, bin, 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.add(lm, bin, 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[tid] = lm[tid] def atomic_sub_int32(ary): - tid = dppl.get_local_id(0) - lm = dppl.local.static_alloc(32, numba.uint32) + tid = dppy.get_local_id(0) + lm = dppy.local.static_alloc(32, numba.uint32) lm[tid] = 0 - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) bin = ary[tid] % 32 - dppl.atomic.sub(lm, bin, 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.sub(lm, bin, 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[tid] = lm[tid] def atomic_add_float32(ary): - lm = dppl.local.static_alloc(1, numba.float32) + lm = dppy.local.static_alloc(1, numba.float32) lm[0] = ary[0] - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) - dppl.atomic.add(lm, 0, 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.add(lm, 0, 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[0] = lm[0] def atomic_sub_float32(ary): - lm = 
dppl.local.static_alloc(1, numba.float32) + lm = dppy.local.static_alloc(1, numba.float32) lm[0] = ary[0] - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) - dppl.atomic.sub(lm, 0, 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.sub(lm, 0, 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[0] = lm[0] def atomic_add_int64(ary): - lm = dppl.local.static_alloc(1, numba.int64) + lm = dppy.local.static_alloc(1, numba.int64) lm[0] = ary[0] - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) - dppl.atomic.add(lm, 0, 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.add(lm, 0, 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[0] = lm[0] def atomic_sub_int64(ary): - lm = dppl.local.static_alloc(1, numba.int64) + lm = dppy.local.static_alloc(1, numba.int64) lm[0] = ary[0] - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) - dppl.atomic.sub(lm, 0, 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.sub(lm, 0, 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[0] = lm[0] def atomic_add_float64(ary): - lm = dppl.local.static_alloc(1, numba.float64) + lm = dppy.local.static_alloc(1, numba.float64) lm[0] = ary[0] - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) - dppl.atomic.add(lm, 0, 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.add(lm, 0, 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[0] = lm[0] def atomic_sub_float64(ary): - lm = dppl.local.static_alloc(1, numba.float64) + lm = dppy.local.static_alloc(1, numba.float64) lm[0] = ary[0] - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) - dppl.atomic.sub(lm, 0, 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.sub(lm, 0, 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[0] = lm[0] def atomic_add2(ary): - tx = dppl.get_local_id(0) - ty = dppl.get_local_id(1) - lm = dppl.local.static_alloc((4, 8), numba.uint32) + tx = dppy.get_local_id(0) + ty = dppy.get_local_id(1) + lm = dppy.local.static_alloc((4, 8), numba.uint32) lm[tx, ty] = ary[tx, ty] - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) - dppl.atomic.add(lm, (tx, ty), 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.add(lm, (tx, ty), 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[tx, ty] = lm[tx, ty] def atomic_add3(ary): - tx = dppl.get_local_id(0) - ty = dppl.get_local_id(1) - lm = dppl.local.static_alloc((4, 8), numba.uint32) + tx = dppy.get_local_id(0) + ty = dppy.get_local_id(1) + lm = dppy.local.static_alloc((4, 8), numba.uint32) lm[tx, ty] = ary[tx, ty] - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) - dppl.atomic.add(lm, (tx, numba.uint64(ty)), 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.add(lm, (tx, numba.uint64(ty)), 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[tx, ty] = lm[tx, ty] @@ -118,18 +118,18 @@ def call_fn_for_datatypes(fn, result, input, global_size): # continue #if dtype == np.int64 and not device_env.device_support_int64_atomics(): # continue - fn[global_size, dppl.DEFAULT_LOCAL_SIZE](a) + fn[global_size, dppy.DEFAULT_LOCAL_SIZE](a) assert(a[0] == result) @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') @unittest.skipUnless(numba_dppy.ocl.atomic_support_present(), 'test only when atomic support is present') -class TestAtomicOp(DPPLTestCase): +class TestAtomicOp(DPPYTestCase): def test_atomic_add_global(self): - @dppl.kernel + @dppy.kernel def 
atomic_add(B): - dppl.atomic.add(B, 0, 1) + dppy.atomic.add(B, 0, 1) N = 100 B = np.array([0]) @@ -138,9 +138,9 @@ def atomic_add(B): def test_atomic_sub_global(self): - @dppl.kernel + @dppy.kernel def atomic_sub(B): - dppl.atomic.sub(B, 0, 1) + dppy.atomic.sub(B, 0, 1) N = 100 B = np.array([100]) @@ -152,10 +152,10 @@ def test_atomic_add_local_int32(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32) orig = ary.copy() - #dppl_atomic_add = dppl.kernel('void(uint32[:])')(atomic_add_int32) - dppl_atomic_add = dppl.kernel(atomic_add_int32) + #dppy_atomic_add = dppy.kernel('void(uint32[:])')(atomic_add_int32) + dppy_atomic_add = dppy.kernel(atomic_add_int32) with dpctl.device_context("opencl:gpu") as gpu_queue: - dppl_atomic_add[32, dppl.DEFAULT_LOCAL_SIZE](ary) + dppy_atomic_add[32, dppy.DEFAULT_LOCAL_SIZE](ary) gold = np.zeros(32, dtype=np.uint32) for i in range(orig.size): @@ -168,10 +168,10 @@ def test_atomic_sub_local_int32(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32) orig = ary.copy() - #dppl_atomic_sub = dppl.kernel('void(uint32[:])')(atomic_sub_int32) - dppl_atomic_sub = dppl.kernel(atomic_sub_int32) + #dppy_atomic_sub = dppy.kernel('void(uint32[:])')(atomic_sub_int32) + dppy_atomic_sub = dppy.kernel(atomic_sub_int32) with dpctl.device_context("opencl:gpu") as gpu_queue: - dppl_atomic_sub[32, dppl.DEFAULT_LOCAL_SIZE](ary) + dppy_atomic_sub[32, dppy.DEFAULT_LOCAL_SIZE](ary) gold = np.zeros(32, dtype=np.uint32) for i in range(orig.size): @@ -183,10 +183,10 @@ def test_atomic_sub_local_int32(self): def test_atomic_add_local_float32(self): ary = np.array([0], dtype=np.float32) - #dppl_atomic_add = dppl.kernel('void(float32[:])')(atomic_add_float32) - dppl_atomic_add = dppl.kernel(atomic_add_float32) + #dppy_atomic_add = dppy.kernel('void(float32[:])')(atomic_add_float32) + dppy_atomic_add = dppy.kernel(atomic_add_float32) with dpctl.device_context("opencl:gpu") as gpu_queue: - dppl_atomic_add[32, dppl.DEFAULT_LOCAL_SIZE](ary) + dppy_atomic_add[32, dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(ary[0] == 32) @@ -194,11 +194,11 @@ def test_atomic_add_local_float32(self): def test_atomic_sub_local_float32(self): ary = np.array([32], dtype=np.float32) - #dppl_atomic_sub = dppl.kernel('void(float32[:])')(atomic_sub_float32) - dppl_atomic_sub = dppl.kernel(atomic_sub_float32) + #dppy_atomic_sub = dppy.kernel('void(float32[:])')(atomic_sub_float32) + dppy_atomic_sub = dppy.kernel(atomic_sub_float32) with dpctl.device_context("opencl:gpu") as gpu_queue: - dppl_atomic_sub[32, dppl.DEFAULT_LOCAL_SIZE](ary) + dppy_atomic_sub[32, dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(ary[0] == 0) @@ -206,12 +206,12 @@ def test_atomic_sub_local_float32(self): def test_atomic_add_local_int64(self): ary = np.array([0], dtype=np.int64) - #dppl_atomic_add = dppl.kernel('void(int64[:])')(atomic_add_int64) - dppl_atomic_add = dppl.kernel(atomic_add_int64) + #dppy_atomic_add = dppy.kernel('void(int64[:])')(atomic_add_int64) + dppy_atomic_add = dppy.kernel(atomic_add_int64) with dpctl.device_context("opencl:gpu") as gpu_queue: # TODO: dpctl needs to expose this functions #if device_env.device_support_int64_atomics(): - dppl_atomic_add[32, dppl.DEFAULT_LOCAL_SIZE](ary) + dppy_atomic_add[32, dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(ary[0] == 32) #else: # return @@ -220,12 +220,12 @@ def test_atomic_add_local_int64(self): def test_atomic_sub_local_int64(self): ary = np.array([32], dtype=np.int64) - #fn = dppl.kernel('void(int64[:])')(atomic_sub_int64) - fn = 
dppl.kernel(atomic_sub_int64) + #fn = dppy.kernel('void(int64[:])')(atomic_sub_int64) + fn = dppy.kernel(atomic_sub_int64) with dpctl.device_context("opencl:gpu") as gpu_queue: # TODO: dpctl needs to expose this functions #if device_env.device_support_int64_atomics(): - fn[32, dppl.DEFAULT_LOCAL_SIZE](ary) + fn[32, dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(ary[0] == 0) #else: # return @@ -234,12 +234,12 @@ def test_atomic_sub_local_int64(self): def test_atomic_add_local_float64(self): ary = np.array([0], dtype=np.double) - #fn = dppl.kernel('void(float64[:])')(atomic_add_float64) - fn = dppl.kernel(atomic_add_float64) + #fn = dppy.kernel('void(float64[:])')(atomic_add_float64) + fn = dppy.kernel(atomic_add_float64) with dpctl.device_context("opencl:gpu") as gpu_queue: # TODO: dpctl needs to expose this functions #if device_env.device_support_float64_atomics(): - fn[32, dppl.DEFAULT_LOCAL_SIZE](ary) + fn[32, dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(ary[0] == 32) #else: # return @@ -248,12 +248,12 @@ def test_atomic_add_local_float64(self): def test_atomic_sub_local_float64(self): ary = np.array([32], dtype=np.double) - #fn = dppl.kernel('void(float64[:])')(atomic_sub_int64) - fn = dppl.kernel(atomic_sub_int64) + #fn = dppy.kernel('void(float64[:])')(atomic_sub_int64) + fn = dppy.kernel(atomic_sub_int64) with dpctl.device_context("opencl:gpu") as gpu_queue: # TODO: dpctl needs to expose this functions #if device_env.device_support_float64_atomics(): - fn[32, dppl.DEFAULT_LOCAL_SIZE](ary) + fn[32, dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(ary[0] == 0) #else: # return @@ -262,20 +262,20 @@ def test_atomic_sub_local_float64(self): def test_atomic_add2(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() - #dppl_atomic_add2 = dppl.kernel('void(uint32[:,:])')(atomic_add2) - dppl_atomic_add2 = dppl.kernel(atomic_add2) + #dppy_atomic_add2 = dppy.kernel('void(uint32[:,:])')(atomic_add2) + dppy_atomic_add2 = dppy.kernel(atomic_add2) with dpctl.device_context("opencl:gpu") as gpu_queue: - dppl_atomic_add2[(4, 8), dppl.DEFAULT_LOCAL_SIZE](ary) + dppy_atomic_add2[(4, 8), dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(np.all(ary == orig + 1)) def test_atomic_add3(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() - #dppl_atomic_add3 = dppl.kernel('void(uint32[:,:])')(atomic_add3) - dppl_atomic_add3 = dppl.kernel(atomic_add3) + #dppy_atomic_add3 = dppy.kernel('void(uint32[:,:])')(atomic_add3) + dppy_atomic_add3 = dppy.kernel(atomic_add3) with dpctl.device_context("opencl:gpu") as gpu_queue: - dppl_atomic_add3[(4, 8), dppl.DEFAULT_LOCAL_SIZE](ary) + dppy_atomic_add3[(4, 8), dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(np.all(ary == orig + 1)) diff --git a/numba_dppy/tests/test_barrier.py b/numba_dppy/tests/test_barrier.py index aeff16dd40..3657672240 100644 --- a/numba_dppy/tests/test_barrier.py +++ b/numba_dppy/tests/test_barrier.py @@ -3,21 +3,21 @@ import numpy as np from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase from numba import float32 -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import dpctl @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') class TestBarrier(unittest.TestCase): def test_proper_lowering(self): - #@dppl.kernel("void(float32[::1])") - @dppl.kernel + #@dppy.kernel("void(float32[::1])") + @dppy.kernel def twice(A): - i = dppl.get_global_id(0) + i = 
dppy.get_global_id(0) d = A[i] - dppl.barrier(dppl.CLK_LOCAL_MEM_FENCE) # local mem fence + dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE) # local mem fence A[i] = d * 2 N = 256 @@ -31,13 +31,13 @@ def twice(A): np.testing.assert_allclose(orig * 2, arr) def test_no_arg_barrier_support(self): - #@dppl.kernel("void(float32[::1])") - @dppl.kernel + #@dppy.kernel("void(float32[::1])") + @dppy.kernel def twice(A): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) d = A[i] # no argument defaults to global mem fence - dppl.barrier() + dppy.barrier() A[i] = d * 2 N = 256 @@ -45,7 +45,7 @@ def twice(A): orig = arr.copy() with dpctl.device_context("opencl:gpu") as gpu_queue: - twice[N, dppl.DEFAULT_LOCAL_SIZE](arr) + twice[N, dppy.DEFAULT_LOCAL_SIZE](arr) # The computation is correct? np.testing.assert_allclose(orig * 2, arr) @@ -54,16 +54,16 @@ def twice(A): def test_local_memory(self): blocksize = 10 - #@dppl.kernel("void(float32[::1])") - @dppl.kernel + #@dppy.kernel("void(float32[::1])") + @dppy.kernel def reverse_array(A): - lm = dppl.local.static_alloc(shape=10, dtype=float32) - i = dppl.get_global_id(0) + lm = dppy.local.static_alloc(shape=10, dtype=float32) + i = dppy.get_global_id(0) # preload lm[i] = A[i] # barrier local or global will both work as we only have one work group - dppl.barrier(dppl.CLK_LOCAL_MEM_FENCE) # local mem fence + dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE) # local mem fence # write A[i] += lm[blocksize - 1 - i] @@ -71,7 +71,7 @@ def reverse_array(A): orig = arr.copy() with dpctl.device_context("opencl:gpu") as gpu_queue: - reverse_array[blocksize, dppl.DEFAULT_LOCAL_SIZE](arr) + reverse_array[blocksize, dppy.DEFAULT_LOCAL_SIZE](arr) expected = orig[::-1] + orig np.testing.assert_allclose(expected, arr) diff --git a/numba_dppy/tests/test_black_scholes.py b/numba_dppy/tests/test_black_scholes.py index 3d9581bb54..312536d33a 100644 --- a/numba_dppy/tests/test_black_scholes.py +++ b/numba_dppy/tests/test_black_scholes.py @@ -4,9 +4,9 @@ import math import time -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase import dpctl @@ -49,7 +49,7 @@ def randfloat(rand_var, low, high): @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPPLBlackScholes(DPPLTestCase): +class TestDPPYBlackScholes(DPPYTestCase): def test_black_scholes(self): OPT_N = 400 iterations = 2 @@ -70,9 +70,9 @@ def test_black_scholes(self): optionStrike, optionYears, RISKFREE, VOLATILITY) - @dppl.kernel - def black_scholes_dppl(callResult, putResult, S, X, T, R, V): - i = dppl.get_global_id(0) + @dppy.kernel + def black_scholes_dppy(callResult, putResult, S, X, T, R, V): + i = dppy.get_global_id(0) if i >= S.shape[0]: return sqrtT = math.sqrt(T[i]) @@ -103,7 +103,7 @@ def black_scholes_dppl(callResult, putResult, S, X, T, R, V): with dpctl.device_context("opencl:gpu") as gpu_queue: time1 = time.time() for i in range(iterations): - black_scholes_dppl[blockdim, griddim]( + black_scholes_dppy[blockdim, griddim]( callResultNumbapro, putResultNumbapro, stockPrice, optionStrike, optionYears, RISKFREE, VOLATILITY) diff --git a/numba_dppy/tests/test_caching.py b/numba_dppy/tests/test_caching.py index 6a6a7967a5..ae693190a3 100644 --- a/numba_dppy/tests/test_caching.py +++ b/numba_dppy/tests/test_caching.py @@ -3,18 +3,18 @@ import sys import numpy as np -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import 
dpctl from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase def data_parallel_sum(a, b, c): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) c[i] = a[i] + b[i] -class TestCaching(DPPLTestCase): +class TestCaching(DPPYTestCase): def test_caching_kernel(self): global_size = 10 N = global_size @@ -25,11 +25,11 @@ def test_caching_kernel(self): with dpctl.device_context("opencl:gpu") as gpu_queue: - func = dppl.kernel(data_parallel_sum) - caching_kernel = func[global_size, dppl.DEFAULT_LOCAL_SIZE].specialize(a, b, c) + func = dppy.kernel(data_parallel_sum) + caching_kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize(a, b, c) for i in range(10): - cached_kernel = func[global_size, dppl.DEFAULT_LOCAL_SIZE].specialize(a, b, c) + cached_kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize(a, b, c) self.assertIs(caching_kernel, cached_kernel) diff --git a/numba_dppy/tests/test_device_array_args.py b/numba_dppy/tests/test_device_array_args.py index 024e3723a9..b38eac12fe 100644 --- a/numba_dppy/tests/test_device_array_args.py +++ b/numba_dppy/tests/test_device_array_args.py @@ -4,14 +4,14 @@ import sys import numpy as np -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import dpctl from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase -@dppl.kernel +@dppy.kernel def data_parallel_sum(a, b, c): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) c[i] = a[i] + b[i] @@ -24,23 +24,23 @@ def data_parallel_sum(a, b, c): @unittest.skipUnless(dpctl.has_cpu_queues(), 'test only on CPU system') -class TestDPPLDeviceArrayArgsGPU(DPPLTestCase): +class TestDPPYDeviceArrayArgsGPU(DPPYTestCase): def test_device_array_args_cpu(self): c = np.ones_like(a) with dpctl.device_context("opencl:cpu") as cpu_queue: - data_parallel_sum[global_size, dppl.DEFAULT_LOCAL_SIZE](a, b, c) + data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c) self.assertTrue(np.all(c == d)) @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPPLDeviceArrayArgsCPU(DPPLTestCase): +class TestDPPYDeviceArrayArgsCPU(DPPYTestCase): def test_device_array_args_gpu(self): c = np.ones_like(a) with dpctl.device_context("opencl:gpu") as gpu_queue: - data_parallel_sum[global_size, dppl.DEFAULT_LOCAL_SIZE](a, b, c) + data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c) self.assertTrue(np.all(c == d)) diff --git a/numba_dppy/tests/test_dpctl_api.py b/numba_dppy/tests/test_dpctl_api.py index bb72a35cf2..dcbb95e163 100644 --- a/numba_dppy/tests/test_dpctl_api.py +++ b/numba_dppy/tests/test_dpctl_api.py @@ -3,12 +3,12 @@ import numpy as np from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase import dpctl @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPCTLAPI(DPPLTestCase): +class TestDPCTLAPI(DPPYTestCase): def test_dpctl_api(self): with dpctl.device_context("opencl:gpu") as gpu_queue: dpctl.dump() diff --git a/numba_dppy/tests/test_dpnp_functions.py b/numba_dppy/tests/test_dpnp_functions.py index bbffb30c3f..b0837f5ba6 100644 --- a/numba_dppy/tests/test_dpnp_functions.py +++ b/numba_dppy/tests/test_dpnp_functions.py @@ -5,9 +5,9 @@ import sys import numpy as np from numba import njit -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba_dppy.testing 
import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase def test_for_different_datatypes(fn, test_fn, dims, arg_count, tys, np_all=False, matrix=None): @@ -76,7 +76,7 @@ def ensure_dpnp(): @unittest.skipUnless(ensure_dpnp(), 'test only when dpNP is available') -class Testdpnp_functions(DPPLTestCase): +class Testdpnp_functions(DPPYTestCase): N = 10 a = np.array(np.random.random(N), dtype=np.float32) diff --git a/numba_dppy/tests/test_dppl_fallback.py b/numba_dppy/tests/test_dppl_fallback.py index adb7ae868b..8519f4fb14 100644 --- a/numba_dppy/tests/test_dppl_fallback.py +++ b/numba_dppy/tests/test_dppl_fallback.py @@ -3,9 +3,9 @@ import numpy as np import numba -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase from numba.tests.support import captured_stderr import dpctl import sys @@ -13,8 +13,8 @@ @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPPLFallback(DPPLTestCase): - def test_dppl_fallback_inner_call(self): +class TestDPPYFallback(DPPYTestCase): + def test_dppy_fallback_inner_call(self): @numba.jit def fill_value(i): return i @@ -29,27 +29,27 @@ def inner_call_fallback(): return a with captured_stderr() as msg: - dppl = numba.njit(parallel={'offload':True})(inner_call_fallback) - dppl_result = dppl() + dppy = numba.njit(parallel={'offload':True})(inner_call_fallback) + dppy_result = dppy() ref_result = inner_call_fallback() - np.testing.assert_array_equal(dppl_result, ref_result) - self.assertTrue('Failed to lower parfor on DPPL-device' in msg.getvalue()) + np.testing.assert_array_equal(dppy_result, ref_result) + self.assertTrue('Failed to lower parfor on DPPY-device' in msg.getvalue()) - def test_dppl_fallback_reductions(self): + def test_dppy_fallback_reductions(self): def reduction(a): return np.amax(a) a = np.ones(10) with captured_stderr() as msg: - dppl = numba.njit(parallel={'offload':True})(reduction) - dppl_result = dppl(a) + dppy = numba.njit(parallel={'offload':True})(reduction) + dppy_result = dppy(a) ref_result = reduction(a) - np.testing.assert_array_equal(dppl_result, ref_result) - self.assertTrue('Failed to lower parfor on DPPL-device' in msg.getvalue()) + np.testing.assert_array_equal(dppy_result, ref_result) + self.assertTrue('Failed to lower parfor on DPPY-device' in msg.getvalue()) if __name__ == '__main__': diff --git a/numba_dppy/tests/test_dppl_func.py b/numba_dppy/tests/test_dppl_func.py index 0f64046082..c58908554e 100644 --- a/numba_dppy/tests/test_dppl_func.py +++ b/numba_dppy/tests/test_dppl_func.py @@ -2,59 +2,59 @@ import numpy as np -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase import dpctl @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPPLFunc(DPPLTestCase): +class TestDPPYFunc(DPPYTestCase): N = 257 - def test_dppl_func_device_array(self): - @dppl.func + def test_dppy_func_device_array(self): + @dppy.func def g(a): return a + 1 - @dppl.kernel + @dppy.kernel def f(a, b): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) b[i] = g(a[i]) a = np.ones(self.N) b = np.ones(self.N) with dpctl.device_context("opencl:gpu") as gpu_queue: - f[self.N, dppl.DEFAULT_LOCAL_SIZE](a, b) + f[self.N, dppy.DEFAULT_LOCAL_SIZE](a, b) 
self.assertTrue(np.all(b == 2)) - def test_dppl_func_ndarray(self): - @dppl.func + def test_dppy_func_ndarray(self): + @dppy.func def g(a): return a + 1 - @dppl.kernel + @dppy.kernel def f(a, b): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) b[i] = g(a[i]) - @dppl.kernel + @dppy.kernel def h(a, b): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) b[i] = g(a[i]) + 1 a = np.ones(self.N) b = np.ones(self.N) with dpctl.device_context("opencl:gpu") as gpu_queue: - f[self.N, dppl.DEFAULT_LOCAL_SIZE](a, b) + f[self.N, dppy.DEFAULT_LOCAL_SIZE](a, b) self.assertTrue(np.all(b == 2)) - h[self.N, dppl.DEFAULT_LOCAL_SIZE](a, b) + h[self.N, dppy.DEFAULT_LOCAL_SIZE](a, b) self.assertTrue(np.all(b == 3)) diff --git a/numba_dppy/tests/test_math_functions.py b/numba_dppy/tests/test_math_functions.py index 977fe85fef..f83fdd30ee 100644 --- a/numba_dppy/tests/test_math_functions.py +++ b/numba_dppy/tests/test_math_functions.py @@ -4,45 +4,45 @@ import sys import numpy as np -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import dpctl from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase import math -@dppl.kernel -def dppl_fabs(a,b): - i = dppl.get_global_id(0) +@dppy.kernel +def dppy_fabs(a,b): + i = dppy.get_global_id(0) b[i] = math.fabs(a[i]) -@dppl.kernel -def dppl_exp(a,b): - i = dppl.get_global_id(0) +@dppy.kernel +def dppy_exp(a,b): + i = dppy.get_global_id(0) b[i] = math.exp(a[i]) -@dppl.kernel -def dppl_log(a,b): - i = dppl.get_global_id(0) +@dppy.kernel +def dppy_log(a,b): + i = dppy.get_global_id(0) b[i] = math.log(a[i]) -@dppl.kernel -def dppl_sqrt(a,b): - i = dppl.get_global_id(0) +@dppy.kernel +def dppy_sqrt(a,b): + i = dppy.get_global_id(0) b[i] = math.sqrt(a[i]) -@dppl.kernel -def dppl_sin(a,b): - i = dppl.get_global_id(0) +@dppy.kernel +def dppy_sin(a,b): + i = dppy.get_global_id(0) b[i] = math.sin(a[i]) -@dppl.kernel -def dppl_cos(a,b): - i = dppl.get_global_id(0) +@dppy.kernel +def dppy_cos(a,b): + i = dppy.get_global_id(0) b[i] = math.cos(a[i]) -@dppl.kernel -def dppl_tan(a,b): - i = dppl.get_global_id(0) +@dppy.kernel +def dppy_tan(a,b): + i = dppy.get_global_id(0) b[i] = math.tan(a[i]) global_size = 10 @@ -53,7 +53,7 @@ def dppl_tan(a,b): def driver(a, jitfunc): b = np.ones_like(a) # Device buffers - jitfunc[global_size, dppl.DEFAULT_LOCAL_SIZE](a, b) + jitfunc[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b) return b @@ -73,67 +73,67 @@ def test_driver(input_arr, device_ty, jitfunc): @unittest.skipUnless(dpctl.has_cpu_queues(), 'test only on CPU system') -class TestDPPLMathFunctionsCPU(DPPLTestCase): +class TestDPPYMathFunctionsCPU(DPPYTestCase): def test_fabs_cpu(self): - b_actual = test_driver(a, "CPU", dppl_fabs) + b_actual = test_driver(a, "CPU", dppy_fabs) b_expected = np.fabs(a) self.assertTrue(np.all(b_actual == b_expected)) def test_sin_cpu(self): - b_actual = test_driver(a, "CPU", dppl_sin) + b_actual = test_driver(a, "CPU", dppy_sin) b_expected = np.sin(a) self.assertTrue(np.allclose(b_actual,b_expected)) def test_cos_cpu(self): - b_actual = test_driver(a, "CPU", dppl_cos) + b_actual = test_driver(a, "CPU", dppy_cos) b_expected = np.cos(a) self.assertTrue(np.allclose(b_actual,b_expected)) def test_exp_cpu(self): - b_actual = test_driver(a, "CPU", dppl_exp) + b_actual = test_driver(a, "CPU", dppy_exp) b_expected = np.exp(a) self.assertTrue(np.allclose(b_actual,b_expected)) def test_sqrt_cpu(self): - b_actual = test_driver(a, "CPU", dppl_sqrt) + b_actual = 
test_driver(a, "CPU", dppy_sqrt) b_expected = np.sqrt(a) self.assertTrue(np.allclose(b_actual,b_expected)) def test_log_cpu(self): - b_actual = test_driver(a, "CPU", dppl_log) + b_actual = test_driver(a, "CPU", dppy_log) b_expected = np.log(a) self.assertTrue(np.allclose(b_actual,b_expected)) @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPPLMathFunctionsGPU(DPPLTestCase): +class TestDPPYMathFunctionsGPU(DPPYTestCase): def test_fabs_gpu(self): - b_actual = test_driver(a, "GPU", dppl_fabs) + b_actual = test_driver(a, "GPU", dppy_fabs) b_expected = np.fabs(a) self.assertTrue(np.all(b_actual == b_expected)) def test_sin_gpu(self): - b_actual = test_driver(a, "GPU", dppl_sin) + b_actual = test_driver(a, "GPU", dppy_sin) b_expected = np.sin(a) self.assertTrue(np.allclose(b_actual,b_expected)) def test_cos_gpu(self): - b_actual = test_driver(a, "GPU", dppl_cos) + b_actual = test_driver(a, "GPU", dppy_cos) b_expected = np.cos(a) self.assertTrue(np.allclose(b_actual,b_expected)) def test_exp_gpu(self): - b_actual = test_driver(a, "GPU", dppl_exp) + b_actual = test_driver(a, "GPU", dppy_exp) b_expected = np.exp(a) self.assertTrue(np.allclose(b_actual,b_expected)) def test_sqrt_gpu(self): - b_actual = test_driver(a, "GPU", dppl_sqrt) + b_actual = test_driver(a, "GPU", dppy_sqrt) b_expected = np.sqrt(a) self.assertTrue(np.allclose(b_actual,b_expected)) def test_log_gpu(self): - b_actual = test_driver(a, "GPU", dppl_log) + b_actual = test_driver(a, "GPU", dppy_log) b_expected = np.log(a) self.assertTrue(np.allclose(b_actual,b_expected)) diff --git a/numba_dppy/tests/test_numpy_bit_twiddling_functions.py b/numba_dppy/tests/test_numpy_bit_twiddling_functions.py index 5e3cd9ba24..de6b7bc963 100644 --- a/numba_dppy/tests/test_numpy_bit_twiddling_functions.py +++ b/numba_dppy/tests/test_numpy_bit_twiddling_functions.py @@ -5,12 +5,12 @@ import sys import numpy as np from numba import njit -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase -class TestNumpy_bit_twiddling_functions(DPPLTestCase): +class TestNumpy_bit_twiddling_functions(DPPYTestCase): def test_bitwise_and(self): @njit(parallel={'offload':True}) def f(a, b): diff --git a/numba_dppy/tests/test_numpy_comparison_functions.py b/numba_dppy/tests/test_numpy_comparison_functions.py index 0bd7dcbb69..5daf1fc813 100644 --- a/numba_dppy/tests/test_numpy_comparison_functions.py +++ b/numba_dppy/tests/test_numpy_comparison_functions.py @@ -5,11 +5,11 @@ import sys import numpy as np from numba import njit -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase -class TestNumpy_comparison_functions(DPPLTestCase): +class TestNumpy_comparison_functions(DPPYTestCase): a = np.array([4,5,6]) b = np.array([2,6,6]) def test_greater(self): diff --git a/numba_dppy/tests/test_numpy_floating_functions.py b/numba_dppy/tests/test_numpy_floating_functions.py index 62b76b1ade..c05c10498d 100644 --- a/numba_dppy/tests/test_numpy_floating_functions.py +++ b/numba_dppy/tests/test_numpy_floating_functions.py @@ -4,12 +4,12 @@ import sys import numpy as np from numba import njit -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba_dppy.testing import unittest -from numba_dppy.testing import 
DPPLTestCase +from numba_dppy.testing import DPPYTestCase -class TestNumpy_floating_functions(DPPLTestCase): +class TestNumpy_floating_functions(DPPYTestCase): def test_isfinite(self): @njit(parallel={'offload':True}) def f(a): diff --git a/numba_dppy/tests/test_numpy_math_functions.py b/numba_dppy/tests/test_numpy_math_functions.py index ddbb568ede..155b352c7e 100644 --- a/numba_dppy/tests/test_numpy_math_functions.py +++ b/numba_dppy/tests/test_numpy_math_functions.py @@ -5,12 +5,12 @@ import sys import numpy as np from numba import njit -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase -class TestNumpy_math_functions(DPPLTestCase): +class TestNumpy_math_functions(DPPYTestCase): N = 10 a = np.array(np.random.random(N), dtype=np.float32) b = np.array(np.random.random(N), dtype=np.float32) diff --git a/numba_dppy/tests/test_numpy_trigonomteric_functions.py b/numba_dppy/tests/test_numpy_trigonomteric_functions.py index 8f61f941c9..7ce18b870a 100644 --- a/numba_dppy/tests/test_numpy_trigonomteric_functions.py +++ b/numba_dppy/tests/test_numpy_trigonomteric_functions.py @@ -5,12 +5,12 @@ import sys import numpy as np from numba import njit -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase -class TestNumpy_math_functions(DPPLTestCase): +class TestNumpy_math_functions(DPPYTestCase): N = 10 a = np.array(np.random.random(N), dtype=np.float32) b = np.array(np.random.random(N), dtype=np.float32) diff --git a/numba_dppy/tests/test_parfor_lower_message.py b/numba_dppy/tests/test_parfor_lower_message.py index fe8c85d356..591fd2cb0e 100644 --- a/numba_dppy/tests/test_parfor_lower_message.py +++ b/numba_dppy/tests/test_parfor_lower_message.py @@ -1,8 +1,8 @@ import numpy as np import numba from numba import njit, prange -import numba_dppy, numba_dppy as dppl -from numba_dppy.testing import unittest, DPPLTestCase +import numba_dppy, numba_dppy as dppy +from numba_dppy.testing import unittest, DPPYTestCase from numba.tests.support import captured_stdout import dpctl @@ -19,7 +19,7 @@ def prange_example(): @unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") -class TestParforMessage(DPPLTestCase): +class TestParforMessage(DPPYTestCase): def test_parfor_message(self): with dpctl.device_context("opencl:gpu") as gpu_queue: numba_dppy.compiler.DEBUG = 1 @@ -29,7 +29,7 @@ def test_parfor_message(self): jitted() numba_dppy.compiler.DEBUG = 0 - self.assertTrue("Parfor lowered on DPPL-device" in got.getvalue()) + self.assertTrue("Parfor lowered on DPPY-device" in got.getvalue()) if __name__ == '__main__': diff --git a/numba_dppy/tests/test_prange.py b/numba_dppy/tests/test_prange.py index 317c2cbb2f..f4c13c4b1f 100644 --- a/numba_dppy/tests/test_prange.py +++ b/numba_dppy/tests/test_prange.py @@ -6,13 +6,13 @@ import numpy as np import numba from numba import njit, prange -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba_dppy.testing import unittest, expectedFailureIf -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase from numba.tests.support import captured_stdout -class TestPrange(DPPLTestCase): +class TestPrange(DPPYTestCase): def test_one_prange(self): @njit(parallel={'offload':True}) def f(a, b): @@ 
-118,8 +118,8 @@ def prange_example(): numba_dppy.compiler.DEBUG = old_debug - self.assertEqual(stdout.getvalue().count('Parfor lowered on DPPL-device'), 2, stdout.getvalue()) - self.assertEqual(stdout.getvalue().count('Failed to lower parfor on DPPL-device'), 0, stdout.getvalue()) + self.assertEqual(stdout.getvalue().count('Parfor lowered on DPPY-device'), 2, stdout.getvalue()) + self.assertEqual(stdout.getvalue().count('Failed to lower parfor on DPPY-device'), 0, stdout.getvalue()) np.testing.assert_equal(res, jitted_res) @@ -146,8 +146,8 @@ def prange_example(): numba_dppy.compiler.DEBUG = old_debug - self.assertEqual(stdout.getvalue().count('Parfor lowered on DPPL-device'), 2, stdout.getvalue()) - self.assertEqual(stdout.getvalue().count('Failed to lower parfor on DPPL-device'), 0, stdout.getvalue()) + self.assertEqual(stdout.getvalue().count('Parfor lowered on DPPY-device'), 2, stdout.getvalue()) + self.assertEqual(stdout.getvalue().count('Failed to lower parfor on DPPY-device'), 0, stdout.getvalue()) np.testing.assert_equal(res, jitted_res) diff --git a/numba_dppy/tests/test_print.py b/numba_dppy/tests/test_print.py index ca1e47978a..0bc4a7cc2b 100644 --- a/numba_dppy/tests/test_print.py +++ b/numba_dppy/tests/test_print.py @@ -5,24 +5,24 @@ import sys import numpy as np from numba import njit, prange -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase import dpctl @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestPrint(DPPLTestCase): - def test_print_dppl_kernel(self): - @dppl.func +class TestPrint(DPPYTestCase): + def test_print_dppy_kernel(self): + @dppy.func def g(a): print("value of a:", a) return a + 1 - @dppl.kernel + @dppy.kernel def f(a, b): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) b[i] = g(a[i]) print("value of b at:", i, "is", b[i]) @@ -32,7 +32,7 @@ def f(a, b): b = np.ones(N) with dpctl.device_context("opencl:gpu") as gpu_queue: - f[N, dppl.DEFAULT_LOCAL_SIZE](a, b) + f[N, dppy.DEFAULT_LOCAL_SIZE](a, b) if __name__ == '__main__': diff --git a/numba_dppy/tests/test_sum_reduction.py b/numba_dppy/tests/test_sum_reduction.py index 3095497a66..8ec7b3d5a9 100644 --- a/numba_dppy/tests/test_sum_reduction.py +++ b/numba_dppy/tests/test_sum_reduction.py @@ -4,14 +4,14 @@ import math import time -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase import dpctl -@dppl.kernel +@dppy.kernel def reduction_kernel(A, R, stride): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) # sum two element R[i] = A[i] + A[i+stride] # store the sum to be used in nex iteration @@ -19,7 +19,7 @@ def reduction_kernel(A, R, stride): @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPPLSumReduction(DPPLTestCase): +class TestDPPYSumReduction(DPPYTestCase): def test_sum_reduction(self): # This test will only work for even case N = 1024 @@ -36,7 +36,7 @@ def test_sum_reduction(self): while (total > 1): # call kernel global_size = total // 2 - reduction_kernel[global_size, dppl.DEFAULT_LOCAL_SIZE](A, R, global_size) + reduction_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, R, global_size) total = total // 2 result = A_copy.sum() diff --git a/numba_dppy/tests/test_vectorize.py b/numba_dppy/tests/test_vectorize.py 
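The sum-reduction test just above is a host-driven tree reduction: each launch adds element i to element i+stride, the partial sums are carried forward, and the host halves the active range until one value remains. A compact sketch of that driver pattern, mirroring the test (illustrative only; assumes an OpenCL GPU queue and a power-of-two length):

    import numpy as np
    import dpctl
    import numba_dppy as dppy

    @dppy.kernel
    def pairwise_sum(A, R, stride):
        i = dppy.get_global_id(0)
        R[i] = A[i] + A[i + stride]   # fold the upper half onto the lower half
        A[i] = R[i]                   # keep the partial sum for the next round

    N = 1024                          # power of two keeps the halving exact
    A = np.arange(N, dtype=np.float64)
    expected = A.sum()                # reference value before A is overwritten
    R = np.zeros(N // 2, dtype=A.dtype)

    total = N
    with dpctl.device_context("opencl:gpu"):
        while total > 1:
            global_size = total // 2
            pairwise_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](A, R, global_size)
            total = global_size

    assert np.isclose(R[0], expected)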
index 12dc7b5ed3..04891ca296 100644 --- a/numba_dppy/tests/test_vectorize.py +++ b/numba_dppy/tests/test_vectorize.py @@ -5,12 +5,12 @@ import sys import numpy as np from numba import njit, vectorize -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase -class TestVectorize(DPPLTestCase): +class TestVectorize(DPPYTestCase): def test_vectorize(self): @vectorize(nopython=True) diff --git a/numba_dppy/tests/test_with_context.py b/numba_dppy/tests/test_with_context.py index 0749ff3e89..e025a77784 100644 --- a/numba_dppy/tests/test_with_context.py +++ b/numba_dppy/tests/test_with_context.py @@ -2,18 +2,18 @@ import numba import numpy as np from numba import njit -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba.core import errors from numba.tests.support import captured_stdout -from numba_dppy.testing import DPPLTestCase, unittest, expectedFailureIf +from numba_dppy.testing import DPPYTestCase, unittest, expectedFailureIf import dpctl -class TestWithDPPLContext(DPPLTestCase): +class TestWithDPPYContext(DPPYTestCase): @unittest.skipIf(not dpctl.has_gpu_queues(), "No GPU platforms available") @expectedFailureIf(sys.platform.startswith('win')) - def test_with_dppl_context_gpu(self): + def test_with_dppy_context_gpu(self): @njit def nested_func(a, b): @@ -36,11 +36,11 @@ def func(b): func(expected) np.testing.assert_array_equal(expected, got_gpu) - self.assertTrue('Parfor lowered on DPPL-device' in got_gpu_message.getvalue()) + self.assertTrue('Parfor lowered on DPPY-device' in got_gpu_message.getvalue()) @unittest.skipIf(not dpctl.has_cpu_queues(), "No CPU platforms available") @unittest.expectedFailure - def test_with_dppl_context_cpu(self): + def test_with_dppy_context_cpu(self): @njit def nested_func(a, b): @@ -63,11 +63,11 @@ def func(b): func(expected) np.testing.assert_array_equal(expected, got_cpu) - self.assertTrue('Parfor lowered on DPPL-device' in got_cpu_message.getvalue()) + self.assertTrue('Parfor lowered on DPPY-device' in got_cpu_message.getvalue()) @unittest.skipIf(not dpctl.has_gpu_queues(), "No GPU platforms available") - def test_with_dppl_context_target(self): + def test_with_dppy_context_target(self): @njit(target='cpu') def nested_func_target(a, b): From 023fef9518c72318084eea1ee7ae8fa79b4522d3 Mon Sep 17 00:00:00 2001 From: Reazul Hoque Date: Wed, 9 Dec 2020 00:46:00 -0600 Subject: [PATCH 09/40] Pass to rewrite Numpy function names to be able to overload them for Numba-dppy pipeline (#52) * Sum example * Moved from infer_type, lower_builtin to overload * Added two level module name functions * Remove cython generated file * Module name fix for moving to new extension * Incomplete linalg.eig implementation * Updted all dppl to dppy and moved rewrite_numpy_function_pass to it's own file * Import module at correct locations * Added comments * Added test and updated comments * Revert unneeded changes * Update Eigen implementation * Remove eig implementation * Add checking equivalent IR Co-authored-by: reazul.hoque --- numba_dppy/device_init.py | 13 +- numba_dppy/dpctl_functions.py | 30 +++++ numba_dppy/dpnp_glue/__init__.py | 0 numba_dppy/dpnp_glue/dpnp_fptr_interface.pyx | 44 +++++- numba_dppy/dpnp_glue/dpnpdecl.py | 10 ++ numba_dppy/dpnp_glue/dpnpimpl.py | 89 +++++++++++++ numba_dppy/dpnp_glue/stubs.py | 9 ++ numba_dppy/dppy_passbuilder.py | 7 + numba_dppy/dppy_passes.py | 3 +- 
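In user terms, the pass described in this commit message lets plain NumPy spellings reach the dpnp backend: inside the numba-dppy pipeline a call such as np.sum(a) is rewritten at the IR level into numba_dppy.dpnp.sum(a), whose @overload implementation then drives dpnp. A hedged sketch of the intended usage (illustrative; it assumes dpnp is installed, otherwise compilation falls back to the regular Numba path):

    import numpy as np
    from numba import njit

    @njit(parallel={'offload': True})
    def total(a):
        # Written as ordinary NumPy; the rewrite pass added by this patch
        # turns np.sum into numba_dppy.dpnp.sum before type inference runs.
        return np.sum(a)

    a = np.arange(10.0)
    assert total(a) == a.sum()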
numba_dppy/rename_numpy_functions_pass.py | 125 ++++++++++++++++++ .../tests/test_rename_numpy_function_pass.py | 67 ++++++++++ 11 files changed, 389 insertions(+), 8 deletions(-) create mode 100644 numba_dppy/dpctl_functions.py create mode 100644 numba_dppy/dpnp_glue/__init__.py create mode 100644 numba_dppy/dpnp_glue/dpnpdecl.py create mode 100644 numba_dppy/dpnp_glue/dpnpimpl.py create mode 100644 numba_dppy/dpnp_glue/stubs.py create mode 100644 numba_dppy/rename_numpy_functions_pass.py create mode 100644 numba_dppy/tests/test_rename_numpy_function_pass.py diff --git a/numba_dppy/device_init.py b/numba_dppy/device_init.py index c4506014a8..efec55ba83 100644 --- a/numba_dppy/device_init.py +++ b/numba_dppy/device_init.py @@ -18,6 +18,14 @@ CLK_GLOBAL_MEM_FENCE, ) +""" +We are importing dpnp stub module to make Numba recognize the +module when we rename Numpy functions. +""" +from .dpnp_glue.stubs import ( + dpnp +) + DEFAULT_LOCAL_SIZE = [] from . import initialize @@ -35,9 +43,4 @@ def is_available(): return dpctl.has_gpu_queues() -#def ocl_error(): -# """Returns None or an exception if the OpenCL driver fails to initialize. -# """ -# return driver.driver.initialization_error - initialize.initialize_all() diff --git a/numba_dppy/dpctl_functions.py b/numba_dppy/dpctl_functions.py new file mode 100644 index 0000000000..67bc358185 --- /dev/null +++ b/numba_dppy/dpctl_functions.py @@ -0,0 +1,30 @@ +from numba import types +from numba.core.typing import signature + + +class _DPCTL_FUNCTIONS: + @classmethod + def dpctl_get_current_queue(cls): + ret_type = types.voidptr + sig = signature(ret_type) + return types.ExternalFunction("DPCTLQueueMgr_GetCurrentQueue", sig) + + @classmethod + def dpctl_malloc_shared(cls): + ret_type = types.voidptr + sig = signature(ret_type, types.int64, types.voidptr) + return types.ExternalFunction("DPCTLmalloc_shared", sig) + + @classmethod + def dpctl_queue_memcpy(cls): + ret_type = types.void + sig = signature( + ret_type, types.voidptr, types.voidptr, types.voidptr, types.int64 + ) + return types.ExternalFunction("DPCTLQueue_Memcpy", sig) + + @classmethod + def dpctl_free_with_queue(cls): + ret_type = types.void + sig = signature(ret_type, types.voidptr, types.voidptr) + return types.ExternalFunction("DPCTLfree_with_queue", sig) diff --git a/numba_dppy/dpnp_glue/__init__.py b/numba_dppy/dpnp_glue/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/numba_dppy/dpnp_glue/dpnp_fptr_interface.pyx b/numba_dppy/dpnp_glue/dpnp_fptr_interface.pyx index 8eba8bf74c..a63d4fdafa 100644 --- a/numba_dppy/dpnp_glue/dpnp_fptr_interface.pyx +++ b/numba_dppy/dpnp_glue/dpnp_fptr_interface.pyx @@ -8,6 +8,7 @@ cdef extern from "backend_iface_fptr.hpp" namespace "DPNPFuncName": # need this cdef enum DPNPFuncName "DPNPFuncName": DPNP_FN_ABSOLUTE DPNP_FN_ADD + DPNP_FN_ARANGE DPNP_FN_ARCCOS DPNP_FN_ARCCOSH DPNP_FN_ARCSIN @@ -18,40 +19,77 @@ cdef extern from "backend_iface_fptr.hpp" namespace "DPNPFuncName": # need this DPNP_FN_ARGMAX DPNP_FN_ARGMIN DPNP_FN_ARGSORT + DPNP_FN_BITWISE_AND + DPNP_FN_BITWISE_OR + DPNP_FN_BITWISE_XOR DPNP_FN_CBRT DPNP_FN_CEIL + DPNP_FN_CHOLESKY + DPNP_FN_COPYSIGN + DPNP_FN_CORRELATE DPNP_FN_COS DPNP_FN_COSH DPNP_FN_COV DPNP_FN_DEGREES + DPNP_FN_DET DPNP_FN_DIVIDE DPNP_FN_DOT DPNP_FN_EIG + DPNP_FN_EIGVALS DPNP_FN_EXP DPNP_FN_EXP2 DPNP_FN_EXPM1 DPNP_FN_FABS + DPNP_FN_FFT_FFT DPNP_FN_FLOOR + DPNP_FN_FLOOR_DIVIDE DPNP_FN_FMOD - DPNP_FN_GAUSSIAN DPNP_FN_HYPOT + DPNP_FN_INVERT + DPNP_FN_LEFT_SHIFT DPNP_FN_LOG DPNP_FN_LOG10 DPNP_FN_LOG1P 
DPNP_FN_LOG2 DPNP_FN_MATMUL + DPNP_FN_MATRIX_RANK DPNP_FN_MAX DPNP_FN_MAXIMUM DPNP_FN_MEAN DPNP_FN_MEDIAN DPNP_FN_MIN DPNP_FN_MINIMUM + DPNP_FN_MODF DPNP_FN_MULTIPLY DPNP_FN_POWER DPNP_FN_PROD - DPNP_FN_UNIFORM DPNP_FN_RADIANS + DPNP_FN_REMAINDER DPNP_FN_RECIP + DPNP_FN_RIGHT_SHIFT + DPNP_FN_RNG_BETA + DPNP_FN_RNG_BINOMIAL + DPNP_FN_RNG_CHISQUARE + DPNP_FN_RNG_EXPONENTIAL + DPNP_FN_RNG_GAMMA + DPNP_FN_RNG_GAUSSIAN + DPNP_FN_RNG_GEOMETRIC + DPNP_FN_RNG_GUMBEL + DPNP_FN_RNG_HYPERGEOMETRIC + DPNP_FN_RNG_LAPLACE + DPNP_FN_RNG_LOGNORMAL + DPNP_FN_RNG_MULTINOMIAL + DPNP_FN_RNG_MULTIVARIATE_NORMAL + DPNP_FN_RNG_NEGATIVE_BINOMIAL + DPNP_FN_RNG_NORMAL + DPNP_FN_RNG_POISSON + DPNP_FN_RNG_RAYLEIGH + DPNP_FN_RNG_STANDARD_CAUCHY + DPNP_FN_RNG_STANDARD_EXPONENTIAL + DPNP_FN_RNG_STANDARD_GAMMA + DPNP_FN_RNG_STANDARD_NORMAL + DPNP_FN_RNG_UNIFORM + DPNP_FN_RNG_WEIBULL DPNP_FN_SIGN DPNP_FN_SIN DPNP_FN_SINH @@ -109,6 +147,8 @@ cdef DPNPFuncName get_DPNPFuncName_from_str(name): return DPNPFuncName.DPNP_FN_ARGSORT elif name == "dpnp_cov": return DPNPFuncName.DPNP_FN_COV + elif name == "dpnp_eig": + return DPNPFuncName.DPNP_FN_EIG else: return DPNPFuncName.DPNP_FN_DOT diff --git a/numba_dppy/dpnp_glue/dpnpdecl.py b/numba_dppy/dpnp_glue/dpnpdecl.py new file mode 100644 index 0000000000..e77739eeda --- /dev/null +++ b/numba_dppy/dpnp_glue/dpnpdecl.py @@ -0,0 +1,10 @@ +from numba.core.typing.templates import (AttributeTemplate, infer_getattr) +import numba_dppy +from numba import types + +@infer_getattr +class DppyDpnpTemplate(AttributeTemplate): + key = types.Module(numba_dppy) + + def resolve_dpnp(self, mod): + return types.Module(numba_dppy.dpnp) diff --git a/numba_dppy/dpnp_glue/dpnpimpl.py b/numba_dppy/dpnp_glue/dpnpimpl.py new file mode 100644 index 0000000000..d6e53c4b99 --- /dev/null +++ b/numba_dppy/dpnp_glue/dpnpimpl.py @@ -0,0 +1,89 @@ +from numba.core.imputils import lower_builtin +import numba_dppy.experimental_numpy_lowering_overload as dpnp_lowering +from numba import types +from numba.core.typing import signature +from numba.core.extending import overload, register_jitable +from . import stubs +import numpy as np +from numba_dppy.dpctl_functions import _DPCTL_FUNCTIONS + + +def get_dpnp_fptr(fn_name, type_names): + from . 
import dpnp_fptr_interface as dpnp_glue + + f_ptr = dpnp_glue.get_dpnp_fn_ptr(fn_name, type_names) + return f_ptr + + +@register_jitable +def _check_finite_matrix(a): + for v in np.nditer(a): + if not np.isfinite(v.item()): + raise np.linalg.LinAlgError("Array must not contain infs or NaNs.") + + +@register_jitable +def _dummy_liveness_func(a): + """pass a list of variables to be preserved through dead code elimination""" + return a[0] + + +class RetrieveDpnpFnPtr(types.ExternalFunctionPointer): + def __init__(self, fn_name, type_names, sig, get_pointer): + self.fn_name = fn_name + self.type_names = type_names + super(RetrieveDpnpFnPtr, self).__init__(sig, get_pointer) + + +class _DPNP_EXTENSION: + def __init__(self, name): + dpnp_lowering.ensure_dpnp(name) + + @classmethod + def dpnp_sum(cls, fn_name, type_names): + ret_type = types.void + sig = signature(ret_type, types.voidptr, types.voidptr, types.int64) + f_ptr = get_dpnp_fptr(fn_name, type_names) + + def get_pointer(obj): + return f_ptr + + return types.ExternalFunctionPointer(sig, get_pointer=get_pointer) + + +@overload(stubs.dpnp.sum) +def dpnp_sum_impl(a): + dpnp_extension = _DPNP_EXTENSION("sum") + dpctl_functions = _DPCTL_FUNCTIONS() + + dpnp_sum = dpnp_extension.dpnp_sum("dpnp_sum", [a.dtype.name, "NONE"]) + + get_sycl_queue = dpctl_functions.dpctl_get_current_queue() + allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() + copy_usm = dpctl_functions.dpctl_queue_memcpy() + free_usm = dpctl_functions.dpctl_free_with_queue() + + def dpnp_sum_impl(a): + if a.size == 0: + raise ValueError("Passed Empty array") + + sycl_queue = get_sycl_queue() + a_usm = allocate_usm_shared(a.size * a.itemsize, sycl_queue) + copy_usm(sycl_queue, a_usm, a.ctypes, a.size * a.itemsize) + + out_usm = allocate_usm_shared(a.itemsize, sycl_queue) + + dpnp_sum(a_usm, out_usm, a.size) + + out = np.empty(1, dtype=a.dtype) + copy_usm(sycl_queue, out.ctypes, out_usm, out.size * out.itemsize) + + free_usm(a_usm, sycl_queue) + free_usm(out_usm, sycl_queue) + + + _dummy_liveness_func([out.size]) + + return out[0] + + return dpnp_sum_impl diff --git a/numba_dppy/dpnp_glue/stubs.py b/numba_dppy/dpnp_glue/stubs.py new file mode 100644 index 0000000000..d51cd28ead --- /dev/null +++ b/numba_dppy/dpnp_glue/stubs.py @@ -0,0 +1,9 @@ +from numba_dppy.ocl.stubs import Stub + +class dpnp(Stub): + """dpnp namespace + """ + _description_ = '' + + class sum(Stub): + pass diff --git a/numba_dppy/dppy_passbuilder.py b/numba_dppy/dppy_passbuilder.py index 0a32a099cf..b3c632a85a 100644 --- a/numba_dppy/dppy_passbuilder.py +++ b/numba_dppy/dppy_passbuilder.py @@ -27,6 +27,8 @@ DPPYNoPythonBackend ) +from .rename_numpy_functions_pass import DPPYRewriteOverloadedFunctions + class DPPYPassBuilder(object): """ This is the DPPY pass builder to run Intel GPU/CPU specific @@ -44,6 +46,11 @@ def default_numba_nopython_pipeline(state, pm): pm.add_pass(IRProcessing, "processing IR") pm.add_pass(WithLifting, "Handle with contexts") + # this pass rewrites name of NumPy functions we intend to overload + pm.add_pass(DPPYRewriteOverloadedFunctions, + "Rewrite name of Numpy functions to overload already overloaded function", + ) + # this pass adds required logic to overload default implementation of # Numpy functions pm.add_pass(DPPYAddNumpyOverloadPass, "dppy add typing template for Numpy functions") diff --git a/numba_dppy/dppy_passes.py b/numba_dppy/dppy_passes.py index 0bb2eadb48..c73f5a7736 100644 --- a/numba_dppy/dppy_passes.py +++ b/numba_dppy/dppy_passes.py @@ -3,6 +3,7 @@ import 
warnings import numpy as np +import numba from numba.core import ir import weakref from collections import namedtuple, deque @@ -49,7 +50,7 @@ def __init__(self): def run_pass(self, state): if dpnp_available(): typingctx = state.typingctx - from numba.core.typing.templates import builtin_registry as reg, infer_global + from numba.core.typing.templates import (builtin_registry as reg, infer_global) from numba.core.typing.templates import (AbstractTemplate, CallableTemplate, signature) from numba.core.typing.npydecl import MatMulTyperMixin diff --git a/numba_dppy/rename_numpy_functions_pass.py b/numba_dppy/rename_numpy_functions_pass.py new file mode 100644 index 0000000000..a0c4b89b3e --- /dev/null +++ b/numba_dppy/rename_numpy_functions_pass.py @@ -0,0 +1,125 @@ +from numba.core import ir +from numba.core.compiler_machinery import FunctionPass, register_pass +from numba.core.ir_utils import ( + find_topo_order, + mk_unique_var, + remove_dead, + simplify_CFG, +) +import numba_dppy + +rewrite_function_name_map = {"sum": (["np"], "sum")} + + +class RewriteNumPyOverloadedFunctions(object): + def __init__(self, state, rewrite_function_name_map=rewrite_function_name_map): + self.state = state + self.function_name_map = rewrite_function_name_map + + def run(self): + """ + This function rewrites the name of NumPy functions that exist in self.function_name_map + e.g np.sum(a) would produce the following: + + np.sum() --> numba_dppy.dpnp.sum() + + --------------------------------------------------------------------------------------- + Numba IR Before Rewrite: + --------------------------------------------------------------------------------------- + + $2load_global.0 = global(np: ) ['$2load_global.0'] + $4load_method.1 = getattr(value=$2load_global.0, attr=sum) ['$2load_global.0', '$4load_method.1'] + $8call_method.3 = call $4load_method.1(a, func=$4load_method.1, args=[Var(a, test_rewrite.py:7)], + kws=(), vararg=None) ['$4load_method.1', '$8call_method.3', 'a'] + + --------------------------------------------------------------------------------------- + Numba IR After Rewrite: + --------------------------------------------------------------------------------------- + + $dppy_replaced_var.0 = global(numba_dppy: ) ['$dppy_replaced_var.0'] + $dpnp_var.1 = getattr(value=$dppy_replaced_var.0, attr=dpnp) ['$dpnp_var.1', '$dppy_replaced_var.0'] + $4load_method.1 = getattr(value=$dpnp_var.1, attr=sum) ['$4load_method.1', '$dpnp_var.1'] + $8call_method.3 = call $4load_method.1(a, func=$4load_method.1, args=[Var(a, test_rewrite.py:7)], + kws=(), vararg=None) ['$4load_method.1', '$8call_method.3', 'a'] + + --------------------------------------------------------------------------------------- + """ + func_ir = self.state.func_ir + blocks = func_ir.blocks + topo_order = find_topo_order(blocks) + + for label in topo_order: + block = blocks[label] + saved_arr_arg = {} + new_body = [] + for stmt in block.body: + if isinstance(stmt, ir.Assign) and isinstance(stmt.value, ir.Expr): + lhs = stmt.target.name + rhs = stmt.value + # replace np.FOO with name from self.function_name_map["FOO"] + # e.g. 
np.sum will be replaced with numba_dppy.dpnp.sum + if rhs.op == "getattr" and rhs.attr in self.function_name_map: + module_node = block.find_variable_assignment( + rhs.value.name + ).value + if ( + isinstance(module_node, ir.Global) + and module_node.name in self.function_name_map[rhs.attr][0] + ) or ( + isinstance(module_node, ir.Expr) + and module_node.attr in self.function_name_map[rhs.attr][0] + ): + rhs = stmt.value + rhs.attr = self.function_name_map[rhs.attr][1] + + global_module = rhs.value + saved_arr_arg[lhs] = global_module + + scope = global_module.scope + loc = global_module.loc + + g_dppy_var = ir.Var( + scope, mk_unique_var("$2load_global"), loc + ) + # We are trying to rename np.function_name/np.linalg.function_name with + # numba_dppy.dpnp.function_name. + # Hence, we need to have a global variable representing module numba_dppy. + # Next, we add attribute dpnp to global module numba_dppy to + # represent numba_dppy.dpnp. + g_dppy = ir.Global("numba_dppy", numba_dppy, loc) + g_dppy_assign = ir.Assign(g_dppy, g_dppy_var, loc) + + dpnp_var = ir.Var(scope, mk_unique_var("$4load_attr"), loc) + getattr_dpnp = ir.Expr.getattr(g_dppy_var, "dpnp", loc) + dpnp_assign = ir.Assign(getattr_dpnp, dpnp_var, loc) + + rhs.value = dpnp_var + new_body.append(g_dppy_assign) + new_body.append(dpnp_assign) + func_ir._definitions[dpnp_var.name] = [getattr_dpnp] + func_ir._definitions[g_dppy_var.name] = [g_dppy] + + new_body.append(stmt) + block.body = new_body + + +@register_pass(mutates_CFG=True, analysis_only=False) +class DPPYRewriteOverloadedFunctions(FunctionPass): + _name = "dppy_rewrite_overloaded_functions_pass" + + def __init__(self): + FunctionPass.__init__(self) + import numba_dppy.dpnp_glue.dpnpdecl + import numba_dppy.dpnp_glue.dpnpimpl + + def run_pass(self, state): + rewrite_function_name_pass = RewriteNumPyOverloadedFunctions( + state, rewrite_function_name_map + ) + + rewrite_function_name_pass.run() + + remove_dead(state.func_ir.blocks, state.func_ir.arg_names, state.func_ir) + state.func_ir.blocks = simplify_CFG(state.func_ir.blocks) + + return True diff --git a/numba_dppy/tests/test_rename_numpy_function_pass.py b/numba_dppy/tests/test_rename_numpy_function_pass.py new file mode 100644 index 0000000000..b06a03b5e0 --- /dev/null +++ b/numba_dppy/tests/test_rename_numpy_function_pass.py @@ -0,0 +1,67 @@ +#! 
/usr/bin/env python + +import unittest +import numpy as np + +import numba +from numba import njit, prange +import numba_dppy, numba_dppy as dppy + + +from numba.core import compiler +from numba_dppy.rename_numpy_functions_pass import DPPYRewriteOverloadedFunctions + + +class MyPipeline(object): + def __init__(self, test_ir): + self.state = compiler.StateDict() + self.state.func_ir = test_ir + + +def check_equivalent(expected_ir, got_ir): + expected_block_body = expected_ir.blocks[0].body + got_block_body = got_ir.blocks[0].body + + if len(expected_block_body) != len(got_block_body): + return False + + for i in range(len(expected_block_body)): + expected_stmt = expected_block_body[i] + got_stmt = got_block_body[i] + if type(expected_stmt) != type(got_stmt): + return False + else: + if isinstance(expected_stmt, numba.core.ir.Assign): + if isinstance(expected_stmt.value, numba.core.ir.Global): + if (expected_stmt.value.name != got_stmt.value.name and + expected_stmt.value.name != "numba_dppy"): + return False + elif isinstance(expected_stmt.value, numba.core.ir.Expr): + # should get "dpnp" and "sum" as attr + if expected_stmt.value.op == "getattr": + if expected_stmt.value.attr != got_stmt.value.attr: + return False + return True + + +class TestRenameNumpyFunctionsPass(unittest.TestCase): + def test_rename(self): + def expected(a): + return numba_dppy.dpnp.sum(a) + + def got(a): + return np.sum(a) + + expected_ir = compiler.run_frontend(expected) + got_ir = compiler.run_frontend(got) + + pipeline = MyPipeline(got_ir) + + rewrite_numpy_functions_pass = DPPYRewriteOverloadedFunctions() + rewrite_numpy_functions_pass.run_pass(pipeline.state) + + self.assertTrue(check_equivalent(expected_ir, pipeline.state.func_ir)) + + +if __name__ == "__main__": + unittest.main() From 059a5a3aa4e8a2b81ebcca34ba929a8f6320868f Mon Sep 17 00:00:00 2001 From: "Todd A. Anderson" Date: Wed, 9 Dec 2020 18:36:52 -0600 Subject: [PATCH 10/40] Store allocation queue on a per-object basis. --- numba_dppy/dppy_rt.c | 53 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 47 insertions(+), 6 deletions(-) diff --git a/numba_dppy/dppy_rt.c b/numba_dppy/dppy_rt.c index dd892055bf..610c45018e 100644 --- a/numba_dppy/dppy_rt.c +++ b/numba_dppy/dppy_rt.c @@ -5,33 +5,74 @@ #include NRT_ExternalAllocator usmarray_allocator; +NRT_external_malloc_func internal_allocator = NULL; +NRT_external_free_func internal_free = NULL; +void *(*get_queue_internal)(void) = NULL; +void (*free_queue_internal)(void*) = NULL; + +void * save_queue_allocator(size_t size, void *opaque) { + // Allocate a pointer-size more space than neded. + int new_size = size + sizeof(void*); + // Get the current queue + void *cur_queue = get_queue_internal(); // this makes a copy + // Use that queue to allocate. + void *data = internal_allocator(new_size, cur_queue); + // Set first pointer-sized data in allocated space to be the current queue. + *(void**)data = cur_queue; + // Return the pointer after this queue in memory. + return (char*)data + sizeof(void*); +} + +void save_queue_deallocator(void *data, void *opaque) { + // Compute original allocation location by subtracting the length + // of the queue pointer from the data location that Numba thinks + // starts the object. + void *orig_data = (char*)data - sizeof(void*); + // Get the queue from the original data by derefencing the first qword. + void *obj_queue = *(void**)orig_data; + // Free the space using the correct queue. + internal_free(orig_data, obj_queue); + // Free the queue itself. 
+ free_queue_internal(obj_queue); +} void usmarray_memsys_init(void) { - void *(*get_queue)(void); char *lib_name = "libDPCTLSyclInterface.so"; char *malloc_name = "DPCTLmalloc_shared"; char *free_name = "DPCTLfree_with_queue"; char *get_queue_name = "DPCTLQueueMgr_GetCurrentQueue"; + char *free_queue_name = "DPCTLQueue_Delete"; void *sycldl = dlopen(lib_name, RTLD_NOW); assert(sycldl != NULL); - usmarray_allocator.malloc = (NRT_external_malloc_func)dlsym(sycldl, malloc_name); + internal_allocator = (NRT_external_malloc_func)dlsym(sycldl, malloc_name); + usmarray_allocator.malloc = save_queue_allocator; if (usmarray_allocator.malloc == NULL) { printf("Did not find %s in %s\n", malloc_name, lib_name); exit(-1); } + usmarray_allocator.realloc = NULL; - usmarray_allocator.free = (NRT_external_free_func)dlsym(sycldl, free_name); + + internal_free = (NRT_external_free_func)dlsym(sycldl, free_name); + usmarray_allocator.free = save_queue_deallocator; if (usmarray_allocator.free == NULL) { printf("Did not find %s in %s\n", free_name, lib_name); exit(-1); } - get_queue = (void *(*))dlsym(sycldl, get_queue_name); - if (get_queue == NULL) { + + get_queue_internal = (void *(*)(void))dlsym(sycldl, get_queue_name); + if (get_queue_internal == NULL) { printf("Did not find %s in %s\n", get_queue_name, lib_name); exit(-1); } - usmarray_allocator.opaque_data = get_queue(); + usmarray_allocator.opaque_data = NULL; + + free_queue_internal = (void (*)(void*))dlsym(sycldl, free_queue_name); + if (free_queue_internal == NULL) { + printf("Did not find %s in %s\n", free_queue_name, lib_name); + exit(-1); + } } void * usmarray_get_ext_allocator(void) { From 4e99a557dae35b91c5cfce6775ce9b63ed97a6ca Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Thu, 10 Dec 2020 11:34:14 -0600 Subject: [PATCH 11/40] Add imports to usmarray module and fixed setup.py extension initialization --- numba_dppy/numpy_usm_shared.py | 53 ++++++++++++++++++++++++++++++++++ setup.py | 2 +- 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/numba_dppy/numpy_usm_shared.py b/numba_dppy/numpy_usm_shared.py index 0d190b1317..16b7f4fee8 100644 --- a/numba_dppy/numpy_usm_shared.py +++ b/numba_dppy/numpy_usm_shared.py @@ -1,3 +1,56 @@ +import numpy as np +from inspect import getmembers, isfunction, isclass, isbuiltin +from numbers import Number +import numba +from types import FunctionType as ftype, BuiltinFunctionType as bftype +from numba import types +from numba.extending import typeof_impl, register_model, type_callable, lower_builtin +from numba.np import numpy_support +from numba.core.pythonapi import box, allocator +from llvmlite import ir +import llvmlite.llvmpy.core as lc +import llvmlite.binding as llb +from numba.core import types, cgutils, config +import builtins +import sys +from ctypes.util import find_library +from numba.core.typing.templates import builtin_registry as templates_registry +from numba.core.typing.npydecl import registry as typing_registry +from numba.core.imputils import builtin_registry as lower_registry +import importlib +import functools +import inspect +from numba.core.typing.templates import CallableTemplate +from numba.np.arrayobj import _array_copy + +from dpctl.dptensor.numpy_usm_shared import ndarray, functions_list + + +debug = config.DEBUG + +def dprint(*args): + if debug: + print(*args) + sys.stdout.flush() + +# # This code makes it so that Numba can contain calls into the DPPLSyclInterface library. 
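The dppy_rt.c change above is an instance of a general tagging pattern: over-allocate by one pointer, stash the owning queue in that header, hand out the address just past it, and on free step back to recover the queue. The Python sketch below restates the idea only for illustration; raw_malloc, raw_free and the integer queue handle are hypothetical stand-ins, and the real implementation is the C code in this patch:

    import ctypes

    PTR_SIZE = ctypes.sizeof(ctypes.c_void_p)

    def tagged_alloc(nbytes, queue, raw_malloc):
        # Over-allocate by one pointer and store the queue in front of the payload.
        base = raw_malloc(nbytes + PTR_SIZE, queue)
        ctypes.c_void_p.from_address(base).value = queue
        return base + PTR_SIZE

    def tagged_free(ptr, raw_free):
        # Step back to the header, recover the queue, and free with that queue.
        base = ptr - PTR_SIZE
        queue = ctypes.c_void_p.from_address(base).value
        raw_free(base, queue)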
+# sycl_mem_lib = find_library('DPCTLSyclInterface') +# dprint("sycl_mem_lib:", sycl_mem_lib) +# # Load the symbols from the DPPL Sycl library. +# llb.load_library_permanently(sycl_mem_lib) + +import dpctl +from dpctl.memory import MemoryUSMShared +import numba_dppy._dppy_rt + +# functions_list = [o[0] for o in getmembers(np) if isfunction(o[1]) or isbuiltin(o[1])] +# class_list = [o for o in getmembers(np) if isclass(o[1])] + +# Register the helper function in dppl_rt so that we can insert calls to them via llvmlite. +for py_name, c_address in numba_dppy._dppy_rt.c_helpers.items(): + llb.add_symbol(py_name, c_address) + + # This class creates a type in Numba. class UsmSharedArrayType(types.Array): def __init__( diff --git a/setup.py b/setup.py index b870c50a8f..11db126686 100644 --- a/setup.py +++ b/setup.py @@ -74,7 +74,7 @@ def get_ext_modules(): cmdclass=versioneer.get_cmdclass(), entry_points={ "numba_extensions": [ - "init = numba_dppy.usmarray:numba_register", + "init = numba_dppy.numpy_usm_shared:numba_register", ]}, ) From 595f94b586cec629c9cd46bea143c8f61e721948 Mon Sep 17 00:00:00 2001 From: "Todd A. Anderson" Date: Thu, 10 Dec 2020 20:28:25 -0600 Subject: [PATCH 12/40] Register is_usm_callback with dpctl to say whether a given Python object is a USM MemInfo. --- numba_dppy/numpy_usm_shared.py | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/numba_dppy/numpy_usm_shared.py b/numba_dppy/numpy_usm_shared.py index 16b7f4fee8..8649826a53 100644 --- a/numba_dppy/numpy_usm_shared.py +++ b/numba_dppy/numpy_usm_shared.py @@ -23,6 +23,7 @@ from numba.core.typing.templates import CallableTemplate from numba.np.arrayobj import _array_copy +import dpctl.dptensor.numpy_usm_shared as numpy_usm_shared from dpctl.dptensor.numpy_usm_shared import ndarray, functions_list @@ -152,11 +153,25 @@ def allocator_UsmArray(context, builder, size, align): registered = False +def is_usm_callback(obj): + if isinstance(obj, numba.core.runtime._nrt_python._MemInfo): + mobj = obj + while isinstance(mobj, numba.core.runtime._nrt_python._MemInfo): + ea = mobj.external_allocator + d = mobj.data + dppl_rt_allocator = numba_dppy._dppy_rt.get_external_allocator() + if ea == dppl_rt_allocator: + return True + mobj = mobj.parent + if isinstance(mobj, ndarray): + mobj = mobj.base + return False def numba_register(): global registered if not registered: registered = True + ndarray.add_external_usm_checker(is_usm_callback) numba_register_typing() numba_register_lower_builtin() @@ -217,7 +232,11 @@ def numba_register_lower_builtin(): cur_mod = importlib.import_module(__name__) for impl, func, types in todo + todo_builtin: - usmarray_func = eval(func.__name__) + try: + usmarray_func = eval("numpy_usm_shared."+func.__name__) + except: + dprint("failed to eval", func.__name__) + continue dprint( "need to re-register lowerer for usmarray", impl, func, types, usmarray_func ) @@ -257,7 +276,11 @@ def numba_register_typing(): assert len(typ.templates) == 1 # template is the typing class to invoke generic() upon. template = typ.templates[0] - dpval = eval(val.__name__) + try: + dpval = eval("numpy_usm_shared."+val.__name__) + except: + dprint("failed to eval", val.__name__) + continue dprint("need to re-register for usmarray", val, typ, typ.typing_key) """ if debug: From c87a94b4bfb59ffe82eabd24d589aea536c7fd67 Mon Sep 17 00:00:00 2001 From: "Todd A. Anderson" Date: Fri, 11 Dec 2020 12:24:28 -0600 Subject: [PATCH 13/40] Remove printf. 
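With the typing and lowering re-registration plus the is_usm_callback hook above in place, arrays produced inside an njit function from USM inputs keep their USM identity: the result's MemInfo points at the external USM allocator, so dpctl accepts it as a usm_shared ndarray. A minimal sketch mirroring the tests in this series (illustrative only):

    import numba
    import dpctl.dptensor.numpy_usm_shared as usmarray

    @numba.njit
    def add_const(a):
        return a + 13            # lowering re-registered for UsmSharedArrayType

    x = usmarray.ones(10)        # USM-shared allocation made through dpctl
    y = add_const(x)             # result memory comes from the USM external allocator
    assert isinstance(y, usmarray.ndarray)
    assert usmarray.has_array_interface(y)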
--- numba_dppy/dppy_rt.c | 1 - 1 file changed, 1 deletion(-) diff --git a/numba_dppy/dppy_rt.c b/numba_dppy/dppy_rt.c index 610c45018e..d637064989 100644 --- a/numba_dppy/dppy_rt.c +++ b/numba_dppy/dppy_rt.c @@ -76,7 +76,6 @@ void usmarray_memsys_init(void) { } void * usmarray_get_ext_allocator(void) { - printf("usmarray_get_ext_allocator %p\n", &usmarray_allocator); return (void*)&usmarray_allocator; } From 12148936265938c31344cfaf83344cced3234d00 Mon Sep 17 00:00:00 2001 From: "Todd A. Anderson" Date: Fri, 11 Dec 2020 12:30:19 -0600 Subject: [PATCH 14/40] There were some spots where there was a silent assumption that the class and the Numba integration were in the same file. I changed those to explicitly refer to the usmarray module in dpctl. --- numba_dppy/numpy_usm_shared.py | 50 ++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/numba_dppy/numpy_usm_shared.py b/numba_dppy/numpy_usm_shared.py index 8649826a53..9c8c7855ba 100644 --- a/numba_dppy/numpy_usm_shared.py +++ b/numba_dppy/numpy_usm_shared.py @@ -23,8 +23,8 @@ from numba.core.typing.templates import CallableTemplate from numba.np.arrayobj import _array_copy -import dpctl.dptensor.numpy_usm_shared as numpy_usm_shared -from dpctl.dptensor.numpy_usm_shared import ndarray, functions_list +import dpctl.dptensor.numpy_usm_shared as nus +from dpctl.dptensor.numpy_usm_shared import ndarray, functions_list, class_list debug = config.DEBUG @@ -233,7 +233,7 @@ def numba_register_lower_builtin(): cur_mod = importlib.import_module(__name__) for impl, func, types in todo + todo_builtin: try: - usmarray_func = eval("numpy_usm_shared."+func.__name__) + usmarray_func = eval("dpctl.dptensor.numpy_usm_shared." + func.__name__) except: dprint("failed to eval", func.__name__) continue @@ -260,28 +260,44 @@ def numba_register_typing(): # For all Numpy identifiers that have been registered for typing in Numba... for ig in typing_registry.globals: val, typ = ig + dprint("Numpy registered:", val, type(val), typ, type(typ)) # If it is a Numpy function... if isinstance(val, (ftype, bftype)): # If we have overloaded that function in the usmarray module (always True right now)... if val.__name__ in functions_list: todo.append(ig) if isinstance(val, type): - todo_classes.append(ig) + if isinstance(typ, numba.core.types.functions.Function): + todo.append(ig) + elif isinstance(typ, numba.core.types.functions.NumberClass): + pass + #todo_classes.append(ig) for tgetattr in templates_registry.attributes: if tgetattr.key == types.Array: todo_getattr.append(tgetattr) + for val, typ in todo_classes: + dprint("todo_classes:", val, typ, type(typ)) + + try: + dptype = eval("dpctl.dptensor.numpy_usm_shared." + val.__name__) + except: + dprint("failed to eval", val.__name__) + continue + + typing_registry.register_global(dptype, numba.core.types.NumberClass(typ.instance_type)) + for val, typ in todo: assert len(typ.templates) == 1 # template is the typing class to invoke generic() upon. template = typ.templates[0] + dprint("need to re-register for usmarray", val, typ, typ.typing_key) try: - dpval = eval("numpy_usm_shared."+val.__name__) + dpval = eval("dpctl.dptensor.numpy_usm_shared." 
+ val.__name__) except: dprint("failed to eval", val.__name__) continue - dprint("need to re-register for usmarray", val, typ, typ.typing_key) """ if debug: print("--------------------------------------------------------------") @@ -307,9 +323,7 @@ def set_key_original(cls, key, original): def generic_impl(self): original_typer = self.__class__.original.generic(self.__class__.original) ot_argspec = inspect.getfullargspec(original_typer) - # print("ot_argspec:", ot_argspec) astr = argspec_to_string(ot_argspec) - # print("astr:", astr) typer_func = """def typer({}): original_res = original_typer({}) @@ -321,8 +335,6 @@ def generic_impl(self): astr, ",".join(ot_argspec.args) ) - # print("typer_func:", typer_func) - try: gs = globals() ls = locals() @@ -344,7 +356,6 @@ def generic_impl(self): print("eval failed!", sys.exc_info()[0]) sys.exit(0) - # print("exec_res:", exec_res) return exec_res new_usmarray_template = type( @@ -370,7 +381,6 @@ def set_key(cls, key): def getattr_impl(self, attr): if attr.startswith("resolve_"): - # print("getattr_impl starts with resolve_:", self, type(self), attr) def wrapper(*args, **kwargs): attr_res = tgetattr.__getattribute__(self, attr)(*args, **kwargs) if isinstance(attr_res, types.Array): @@ -394,15 +404,7 @@ def wrapper(*args, **kwargs): templates_registry.register_attr(new_usmarray_template) -def from_ndarray(x): - return copy(x) - - -def as_ndarray(x): - return np.copy(x) - - -@typing_registry.register_global(as_ndarray) +@typing_registry.register_global(nus.as_ndarray) class DparrayAsNdarray(CallableTemplate): def generic(self): def typer(arg): @@ -411,7 +413,7 @@ def typer(arg): return typer -@typing_registry.register_global(from_ndarray) +@typing_registry.register_global(nus.from_ndarray) class DparrayFromNdarray(CallableTemplate): def generic(self): def typer(arg): @@ -420,11 +422,11 @@ def typer(arg): return typer -@lower_registry.lower(as_ndarray, UsmSharedArrayType) +@lower_registry.lower(nus.as_ndarray, UsmSharedArrayType) def usmarray_conversion_as(context, builder, sig, args): return _array_copy(context, builder, sig, args) -@lower_registry.lower(from_ndarray, types.Array) +@lower_registry.lower(nus.from_ndarray, types.Array) def usmarray_conversion_from(context, builder, sig, args): return _array_copy(context, builder, sig, args) From a2b2bb75794179ba18173af43a85e229f5fc4248 Mon Sep 17 00:00:00 2001 From: "Todd A. Anderson" Date: Fri, 11 Dec 2020 14:45:54 -0600 Subject: [PATCH 15/40] Found another spot where the current module was being used rather than numpy_usm_shared in dpctl.dptensor. This fixes the ndindex issue. --- numba_dppy/numpy_usm_shared.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/numba_dppy/numpy_usm_shared.py b/numba_dppy/numpy_usm_shared.py index 9c8c7855ba..0f058bc778 100644 --- a/numba_dppy/numpy_usm_shared.py +++ b/numba_dppy/numpy_usm_shared.py @@ -230,7 +230,6 @@ def numba_register_lower_builtin(): for lg in todo_getattr: lower_registry.getattrs.append(lg) - cur_mod = importlib.import_module(__name__) for impl, func, types in todo + todo_builtin: try: usmarray_func = eval("dpctl.dptensor.numpy_usm_shared." 
+ func.__name__) @@ -240,7 +239,7 @@ def numba_register_lower_builtin(): dprint( "need to re-register lowerer for usmarray", impl, func, types, usmarray_func ) - new_impl = copy_func_for_usmarray(impl, cur_mod) + new_impl = copy_func_for_usmarray(impl, nus) lower_registry.functions.append((new_impl, usmarray_func, types)) @@ -327,7 +326,6 @@ def generic_impl(self): typer_func = """def typer({}): original_res = original_typer({}) - #print("original_res:", original_res) if isinstance(original_res, types.Array): return UsmSharedArrayType(dtype=original_res.dtype, ndim=original_res.ndim, layout=original_res.layout) From 0769a9724209b90028fc1cc916097d48429f6162 Mon Sep 17 00:00:00 2001 From: etotmeni Date: Tue, 15 Dec 2020 06:47:38 -0600 Subject: [PATCH 16/40] Fix numba path --- setup.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 11db126686..f5882cceb4 100644 --- a/setup.py +++ b/setup.py @@ -3,16 +3,26 @@ from Cython.Build import cythonize import versioneer +import sys + + +def find_numba(): + sys_packages = sys.path + for pcg in sys_packages: + if pcg.find("/numba-0") != -1: + numba_dir = pcg + return numba_dir def get_ext_modules(): ext_modules = [] + numba_dir = find_numba() ext_dppy = Extension( name="numba_dppy._dppy_rt", sources=["numba_dppy/dppy_rt.c"], - include_dirs=["../numba/numba"], # Need to get rid of relative paths. - depends=["../numba/numba/core/runtime/nrt_external.h", "../numba/numba/core/runtime/nrt.h", "../numba/numba/_pymodule.h"], + include_dirs=[numba_dir + "/numba"], + depends=[numba_dir + "/numba/core/runtime/nrt_external.h", numba_dir + "/numba/core/runtime/nrt.h", numba_dir + "/numba/_pymodule.h"], ) ext_modules += [ext_dppy] From eb53c26bc7ec73fa8275de15cf97cdda73d461ce Mon Sep 17 00:00:00 2001 From: etotmeni Date: Tue, 15 Dec 2020 06:51:35 -0600 Subject: [PATCH 17/40] fix --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index f5882cceb4..dc950e37d6 100644 --- a/setup.py +++ b/setup.py @@ -7,6 +7,7 @@ def find_numba(): + numba_dir = "" sys_packages = sys.path for pcg in sys_packages: if pcg.find("/numba-0") != -1: From f4333ddbcecadd29193219ac73aea4402ad74a1b Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Tue, 15 Dec 2020 16:31:00 +0300 Subject: [PATCH 18/40] Convert tests for USM array to unittest (#118) * Split tests for usmarray in separate unittest test cases * Remove prints and use unittest assertions * Move functions to tests * Give names to functions * Add expectedFailure for failed tests * Clean code --- numba_dppy/tests/test_usmarray.py | 349 ++++++++++++++---------------- 1 file changed, 162 insertions(+), 187 deletions(-) diff --git a/numba_dppy/tests/test_usmarray.py b/numba_dppy/tests/test_usmarray.py index fe1be71c9e..6ee21ab3da 100644 --- a/numba_dppy/tests/test_usmarray.py +++ b/numba_dppy/tests/test_usmarray.py @@ -1,228 +1,203 @@ -from __future__ import print_function, division, absolute_import - import numba -import dpctl.dptensor.numpy_usm_shared as usmarray import numpy -import sys - - -def p1(a): - return a * 2.0 + 13 +import unittest +import dpctl.dptensor.numpy_usm_shared as usmarray -f1 = numba.njit(p1) +from numba_dppy.testing import DPPYTestCase @numba.njit() -def f2(a): - return a +def numba_mul_add(a): + return a * 2.0 + 13 @numba.njit() -def f3(a, b): # a is usmarray, b is numpy - return a * usmarray.asarray(b) +def numba_add_const(a): + return a + 13 @numba.njit() -def f4(): - return usmarray.ones(10) - - -def p5(a, b): # a is 
usmarray, b is numpy +def numba_mul(a, b): # a is usmarray, b is numpy return a * b -f5 = numba.njit(p5) - - @numba.njit() -def f6(a): - return a + 13 +def numba_mul_usmarray_asarray(a, b): # a is usmarray, b is numpy + return a * usmarray.asarray(b) -@numba.njit() -def f7(a): # a is usmarray - # implicit conversion of a to numpy.ndarray - b = numpy.ones(10) - c = a * b - d = a.argsort() # with no implicit conversion this fails +# @numba.njit() +# def f7(a): # a is usmarray +# # implicit conversion of a to numpy.ndarray +# b = numpy.ones(10) +# c = a * b +# d = a.argsort() # with no implicit conversion this fails @numba.njit -def f8(a): +def numba_usmarray_as_ndarray(a): return usmarray.as_ndarray(a) @numba.njit -def f9(a): +def numba_usmarray_from_ndarray(a): return usmarray.from_ndarray(a) +@numba.njit() +def numba_usmarray_ones(): + return usmarray.ones(10) + + @numba.njit -def f10(): +def numba_usmarray_empty(): return usmarray.empty((10, 10)) +@numba.njit() +def numba_identity(a): + return a + + @numba.njit -def f11(x): +def numba_shape(x): return x.shape @numba.njit -def f12(x): +def numba_T(x): return x.T -# -------------------------------------------------------------------------------- - -print("------------------- Testing Python Numpy") -sys.stdout.flush() -z1 = numpy.ones(10) -z2 = p1(z1) -print("z2:", z2, type(z2)) -assert type(z2) == numpy.ndarray - -print("------------------- Testing Numba Numpy") -sys.stdout.flush() -z1 = numpy.ones(10) -z2 = f1(z1) -print("z2:", z2, type(z2)) -assert type(z2) == numpy.ndarray - -print("------------------- Testing usmarray ones") -sys.stdout.flush() -a = usmarray.ones(10) -print("a:", a, type(a)) -assert isinstance(a, usmarray.ndarray) -assert usmarray.has_array_interface(a) - -print("------------------- Testing usmarray.usmarray.as_ndarray") -sys.stdout.flush() -nd1 = a.as_ndarray() -print("nd1:", nd1, type(nd1)) -assert type(nd1) == numpy.ndarray - -print("------------------- Testing usmarray.as_ndarray") -sys.stdout.flush() -nd2 = usmarray.as_ndarray(a) -print("nd2:", nd2, type(nd2)) -assert type(nd2) == numpy.ndarray - -print("------------------- Testing usmarray.from_ndarray") -sys.stdout.flush() -dp1 = usmarray.from_ndarray(nd2) -print("dp1:", dp1, type(dp1)) -assert isinstance(dp1, usmarray.ndarray) -assert usmarray.has_array_interface(dp1) - -print("------------------- Testing usmarray multiplication") -sys.stdout.flush() -c = a * 5 -print("c", c, type(c)) -assert isinstance(c, usmarray.ndarray) -assert usmarray.has_array_interface(c) - -print("------------------- Testing Python usmarray") -sys.stdout.flush() -b = p1(c) -print("b:", b, type(b)) -assert isinstance(b, usmarray.ndarray) -assert usmarray.has_array_interface(b) -del b - -print("------------------- Testing Python mixing usmarray and numpy.ndarray") -sys.stdout.flush() -h = p5(a, z1) -print("h:", h, type(h)) -assert isinstance(h, usmarray.ndarray) -assert usmarray.has_array_interface(h) -del h - -print("------------------- Testing Numba usmarray 2") -sys.stdout.flush() -d = f2(a) -print("d:", d, type(d)) -assert isinstance(d, usmarray.ndarray) -assert usmarray.has_array_interface(d) -del d - -print("------------------- Testing Numba usmarray") -sys.stdout.flush() -b = f1(c) -print("b:", b, type(b)) -assert isinstance(b, usmarray.ndarray) -assert usmarray.has_array_interface(b) -del b - -""" -print("------------------- Testing Numba usmarray constructor from numpy.ndarray") -sys.stdout.flush() -e = f3(a, z1) -print("e:", e, type(e)) -assert(isinstance(e, usmarray.ndarray)) 
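The conversion helpers being exercised in these tests copy across memory domains: usmarray.as_ndarray(...) returns an ordinary numpy.ndarray copy, while usmarray.from_ndarray(...) copies back into USM shared memory, and both also work inside njit code. A minimal round trip, mirroring the assertions here (illustrative only):

    import numpy
    import dpctl.dptensor.numpy_usm_shared as usmarray

    a = usmarray.ones(10)               # USM-shared allocation
    nd = usmarray.as_ndarray(a)         # host copy as a plain numpy.ndarray
    assert type(nd) == numpy.ndarray

    b = usmarray.from_ndarray(nd)       # copy back into USM-shared memory
    assert isinstance(b, usmarray.ndarray)
    assert usmarray.has_array_interface(b)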
-""" - -print("------------------- Testing Numba mixing usmarray and constant") -sys.stdout.flush() -g = f6(a) -print("g:", g, type(g)) -assert isinstance(g, usmarray.ndarray) -assert usmarray.has_array_interface(g) -del g - -print("------------------- Testing Numba mixing usmarray and numpy.ndarray") -sys.stdout.flush() -h = f5(a, z1) -print("h:", h, type(h)) -assert isinstance(h, usmarray.ndarray) -assert usmarray.has_array_interface(h) -del h - -print("------------------- Testing Numba usmarray functions") -sys.stdout.flush() -f = f4() -print("f:", f, type(f)) -assert isinstance(f, usmarray.ndarray) -assert usmarray.has_array_interface(f) -del f - -print("------------------- Testing Numba usmarray.as_ndarray") -sys.stdout.flush() -nd3 = f8(a) -print("nd3:", nd3, type(nd3)) -assert type(nd3) == numpy.ndarray - -print("------------------- Testing Numba usmarray.from_ndarray") -sys.stdout.flush() -dp2 = f9(nd3) -print("dp2:", dp2, type(dp2)) -assert isinstance(dp2, usmarray.ndarray) -assert usmarray.has_array_interface(dp2) -del nd3 -del dp2 - -print("------------------- Testing Numba usmarray.empty") -sys.stdout.flush() -dp3 = f10() -print("dp3:", dp3, type(dp3)) -assert isinstance(dp3, usmarray.ndarray) -assert usmarray.has_array_interface(dp3) - -print("------------------- Testing Numba usmarray.shape") -sys.stdout.flush() -s1 = f11(dp3) -print("s1:", s1, type(s1)) - -print("------------------- Testing Numba usmarray.T") -sys.stdout.flush() -dp4 = f12(dp3) -print("dp4:", dp4, type(dp4)) -assert isinstance(dp4, usmarray.ndarray) -assert usmarray.has_array_interface(dp4) -del dp3 -del dp4 - -# ------------------------------- -del a - -print("SUCCESS") +class TestUsmArray(DPPYTestCase): + def ndarray(self): + """Create NumPy array""" + return numpy.ones(10) + + def usmarray(self): + """Create dpCtl USM array""" + return usmarray.ones(10) + + def test_python_numpy(self): + """Testing Python Numpy""" + z2 = numba_mul_add.py_func(self.ndarray()) + self.assertEqual(type(z2), numpy.ndarray, z2) + + def test_numba_numpy(self): + """Testing Numba Numpy""" + z2 = numba_mul_add(self.ndarray()) + self.assertEqual(type(z2), numpy.ndarray, z2) + + def test_usmarray_ones(self): + """Testing usmarray ones""" + a = usmarray.ones(10) + self.assertIsInstance(a, usmarray.ndarray, type(a)) + self.assertTrue(usmarray.has_array_interface(a)) + + def test_usmarray_usmarray_as_ndarray(self): + """Testing usmarray.usmarray.as_ndarray""" + nd1 = self.usmarray().as_ndarray() + self.assertEqual(type(nd1), numpy.ndarray, nd1) + + def test_usmarray_as_ndarray(self): + """Testing usmarray.as_ndarray""" + nd2 = usmarray.as_ndarray(self.usmarray()) + self.assertEqual(type(nd2), numpy.ndarray, nd2) + + def test_usmarray_from_ndarray(self): + """Testing usmarray.from_ndarray""" + nd2 = usmarray.as_ndarray(self.usmarray()) + dp1 = usmarray.from_ndarray(nd2) + self.assertIsInstance(dp1, usmarray.ndarray, type(dp1)) + self.assertTrue(usmarray.has_array_interface(dp1)) + + def test_usmarray_multiplication(self): + """Testing usmarray multiplication""" + c = self.usmarray() * 5 + self.assertIsInstance(c, usmarray.ndarray, type(c)) + self.assertTrue(usmarray.has_array_interface(c)) + + def test_python_usmarray_mul_add(self): + """Testing Python usmarray""" + c = self.usmarray() * 5 + b = numba_mul_add.py_func(c) + self.assertIsInstance(b, usmarray.ndarray, type(b)) + self.assertTrue(usmarray.has_array_interface(b)) + + @unittest.expectedFailure + def test_numba_usmarray_mul_add(self): + """Testing Numba usmarray""" + # fails if 
run tests in bunch + c = self.usmarray() * 5 + b = numba_mul_add(c) + self.assertIsInstance(b, usmarray.ndarray, type(b)) + self.assertTrue(usmarray.has_array_interface(b)) + + def test_python_mixing_usmarray_and_numpy_ndarray(self): + """Testing Python mixing usmarray and numpy.ndarray""" + h = numba_mul.py_func(self.usmarray(), self.ndarray()) + self.assertIsInstance(h, usmarray.ndarray, type(h)) + self.assertTrue(usmarray.has_array_interface(h)) + + def test_numba_usmarray_2(self): + """Testing Numba usmarray 2""" + + d = numba_identity(self.usmarray()) + self.assertIsInstance(d, usmarray.ndarray, type(d)) + self.assertTrue(usmarray.has_array_interface(d)) + + @unittest.expectedFailure + def test_numba_usmarray_constructor_from_numpy_ndarray(self): + """Testing Numba usmarray constructor from numpy.ndarray""" + e = numba_mul_usmarray_asarray(self.usmarray(), self.ndarray()) + self.assertIsInstance(e, usmarray.ndarray, type(e)) + + def test_numba_mixing_usmarray_and_constant(self): + """Testing Numba mixing usmarray and constant""" + g = numba_add_const(self.usmarray()) + self.assertIsInstance(g, usmarray.ndarray, type(g)) + self.assertTrue(usmarray.has_array_interface(g)) + + def test_numba_mixing_usmarray_and_numpy_ndarray(self): + """Testing Numba mixing usmarray and numpy.ndarray""" + h = numba_mul(self.usmarray(), self.ndarray()) + self.assertIsInstance(h, usmarray.ndarray, type(h)) + self.assertTrue(usmarray.has_array_interface(h)) + + def test_numba_usmarray_functions(self): + """Testing Numba usmarray functions""" + f = numba_usmarray_ones() + self.assertIsInstance(f, usmarray.ndarray, type(f)) + self.assertTrue(usmarray.has_array_interface(f)) + + def test_numba_usmarray_as_ndarray(self): + """Testing Numba usmarray.as_ndarray""" + nd3 = numba_usmarray_as_ndarray(self.usmarray()) + self.assertEqual(type(nd3), numpy.ndarray, nd3) + + def test_numba_usmarray_from_ndarray(self): + """Testing Numba usmarray.from_ndarray""" + nd3 = numba_usmarray_as_ndarray(self.usmarray()) + dp2 = numba_usmarray_from_ndarray(nd3) + self.assertIsInstance(dp2, usmarray.ndarray, type(dp2)) + self.assertTrue(usmarray.has_array_interface(dp2)) + + def test_numba_usmarray_empty(self): + """Testing Numba usmarray.empty""" + dp3 = numba_usmarray_empty() + self.assertIsInstance(dp3, usmarray.ndarray, type(dp3)) + self.assertTrue(usmarray.has_array_interface(dp3)) + + def test_numba_usmarray_shape(self): + """Testing Numba usmarray.shape""" + s1 = numba_shape(numba_usmarray_empty()) + self.assertIsInstance(s1, tuple, type(s1)) + self.assertEqual(s1, (10, 10)) + + @unittest.expectedFailure + def test_numba_usmarray_T(self): + """Testing Numba usmarray.T""" + dp4 = numba_T(numba_usmarray_empty()) + self.assertIsInstance(dp4, usmarray.ndarray, type(dp4)) + self.assertTrue(usmarray.has_array_interface(dp4)) From 313727bb318a4971e2eb8019d139d89dd25a26aa Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Tue, 15 Dec 2020 07:56:50 -0600 Subject: [PATCH 19/40] Small code fixes --- numba_dppy/tests/test_usmarray.py | 1 - 1 file changed, 1 deletion(-) diff --git a/numba_dppy/tests/test_usmarray.py b/numba_dppy/tests/test_usmarray.py index 6ee21ab3da..a6d428f80e 100644 --- a/numba_dppy/tests/test_usmarray.py +++ b/numba_dppy/tests/test_usmarray.py @@ -142,7 +142,6 @@ def test_python_mixing_usmarray_and_numpy_ndarray(self): def test_numba_usmarray_2(self): """Testing Numba usmarray 2""" - d = numba_identity(self.usmarray()) self.assertIsInstance(d, usmarray.ndarray, type(d)) 
self.assertTrue(usmarray.has_array_interface(d)) From ff877fc42bb016b56247e44b2f6f44c4db1559dd Mon Sep 17 00:00:00 2001 From: etotmeni Date: Tue, 15 Dec 2020 09:08:52 -0600 Subject: [PATCH 20/40] use include_path to find numba --- setup.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/setup.py b/setup.py index dc950e37d6..d833951745 100644 --- a/setup.py +++ b/setup.py @@ -1,23 +1,15 @@ import os from setuptools import Extension, find_packages, setup from Cython.Build import cythonize +from numba.core.extending import include_path import versioneer import sys -def find_numba(): - numba_dir = "" - sys_packages = sys.path - for pcg in sys_packages: - if pcg.find("/numba-0") != -1: - numba_dir = pcg - return numba_dir - - def get_ext_modules(): ext_modules = [] - numba_dir = find_numba() + numba_dir = include_path() ext_dppy = Extension( name="numba_dppy._dppy_rt", From f5b1989cf3f47e75122837fc05b1b8cfeb8fbd79 Mon Sep 17 00:00:00 2001 From: etotmeni Date: Wed, 16 Dec 2020 15:53:32 +0300 Subject: [PATCH 21/40] Added loader lib for win --- numba_dppy/dppy_rt.c | 117 +++++++++++++++++++++++++++++-------------- 1 file changed, 80 insertions(+), 37 deletions(-) diff --git a/numba_dppy/dppy_rt.c b/numba_dppy/dppy_rt.c index d637064989..83fd6949b2 100644 --- a/numba_dppy/dppy_rt.c +++ b/numba_dppy/dppy_rt.c @@ -1,8 +1,12 @@ #include "_pymodule.h" #include "core/runtime/nrt_external.h" #include "assert.h" -#include #include +#if !defined _WIN32 + #include +#else + #include +#endif NRT_ExternalAllocator usmarray_allocator; NRT_external_malloc_func internal_allocator = NULL; @@ -37,42 +41,81 @@ void save_queue_deallocator(void *data, void *opaque) { } void usmarray_memsys_init(void) { - char *lib_name = "libDPCTLSyclInterface.so"; - char *malloc_name = "DPCTLmalloc_shared"; - char *free_name = "DPCTLfree_with_queue"; - char *get_queue_name = "DPCTLQueueMgr_GetCurrentQueue"; - char *free_queue_name = "DPCTLQueue_Delete"; - - void *sycldl = dlopen(lib_name, RTLD_NOW); - assert(sycldl != NULL); - internal_allocator = (NRT_external_malloc_func)dlsym(sycldl, malloc_name); - usmarray_allocator.malloc = save_queue_allocator; - if (usmarray_allocator.malloc == NULL) { - printf("Did not find %s in %s\n", malloc_name, lib_name); - exit(-1); - } - - usmarray_allocator.realloc = NULL; - - internal_free = (NRT_external_free_func)dlsym(sycldl, free_name); - usmarray_allocator.free = save_queue_deallocator; - if (usmarray_allocator.free == NULL) { - printf("Did not find %s in %s\n", free_name, lib_name); - exit(-1); - } - - get_queue_internal = (void *(*)(void))dlsym(sycldl, get_queue_name); - if (get_queue_internal == NULL) { - printf("Did not find %s in %s\n", get_queue_name, lib_name); - exit(-1); - } - usmarray_allocator.opaque_data = NULL; - - free_queue_internal = (void (*)(void*))dlsym(sycldl, free_queue_name); - if (free_queue_internal == NULL) { - printf("Did not find %s in %s\n", free_queue_name, lib_name); - exit(-1); - } + #if !defined _WIN32 + char *lib_name = "libDPCTLSyclInterface.so"; + char *malloc_name = "DPCTLmalloc_shared"; + char *free_name = "DPCTLfree_with_queue"; + char *get_queue_name = "DPCTLQueueMgr_GetCurrentQueue"; + char *free_queue_name = "DPCTLQueue_Delete"; + + void *sycldl = dlopen(lib_name, RTLD_NOW); + assert(sycldl != NULL); + internal_allocator = (NRT_external_malloc_func)dlsym(sycldl, malloc_name); + usmarray_allocator.malloc = save_queue_allocator; + if (usmarray_allocator.malloc == NULL) { + printf("Did not find %s in %s\n", malloc_name, 
lib_name); + exit(-1); + } + + usmarray_allocator.realloc = NULL; + + internal_free = (NRT_external_free_func)dlsym(sycldl, free_name); + usmarray_allocator.free = save_queue_deallocator; + if (usmarray_allocator.free == NULL) { + printf("Did not find %s in %s\n", free_name, lib_name); + exit(-1); + } + + get_queue_internal = (void *(*)(void))dlsym(sycldl, get_queue_name); + if (get_queue_internal == NULL) { + printf("Did not find %s in %s\n", get_queue_name, lib_name); + exit(-1); + } + usmarray_allocator.opaque_data = NULL; + + free_queue_internal = (void (*)(void*))dlsym(sycldl, free_queue_name); + if (free_queue_internal == NULL) { + printf("Did not find %s in %s\n", free_queue_name, lib_name); + exit(-1); + } + #else + char *lib_name = "libDPCTLSyclInterface.dll"; + char *malloc_name = "DPCTLmalloc_shared"; + char *free_name = "DPCTLfree_with_queue"; + char *get_queue_name = "DPCTLQueueMgr_GetCurrentQueue"; + char *free_queue_name = "DPCTLQueue_Delete"; + + HMODULE sycldl = LoadLibrary(lib_name); + assert(sycldl != NULL); + internal_allocator = (NRT_external_malloc_func)GetProcAddress(sycldl, malloc_name); + usmarray_allocator.malloc = save_queue_allocator; + if (usmarray_allocator.malloc == NULL) { + printf("Did not find %s in %s\n", malloc_name, lib_name); + exit(-1); + } + + usmarray_allocator.realloc = NULL; + + internal_free = (NRT_external_free_func)GetProcAddress(sycldl, free_name); + usmarray_allocator.free = save_queue_deallocator; + if (usmarray_allocator.free == NULL) { + printf("Did not find %s in %s\n", free_name, lib_name); + exit(-1); + } + + get_queue_internal = (void *(*)(void))GetProcAddress(sycldl, get_queue_name); + if (get_queue_internal == NULL) { + printf("Did not find %s in %s\n", get_queue_name, lib_name); + exit(-1); + } + usmarray_allocator.opaque_data = NULL; + + free_queue_internal = (void (*)(void*))GetProcAddress(sycldl, free_queue_name); + if (free_queue_internal == NULL) { + printf("Did not find %s in %s\n", free_queue_name, lib_name); + exit(-1); + } + #endif } void * usmarray_get_ext_allocator(void) { From 5d8dd3985c3a0e79f646f65d5d91b6a217c915c8 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Wed, 16 Dec 2020 07:47:15 -0600 Subject: [PATCH 22/40] Use unittest.TestCase as base class for TestUsmArray --- numba_dppy/tests/test_usmarray.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/numba_dppy/tests/test_usmarray.py b/numba_dppy/tests/test_usmarray.py index a6d428f80e..abf1a78ec6 100644 --- a/numba_dppy/tests/test_usmarray.py +++ b/numba_dppy/tests/test_usmarray.py @@ -4,8 +4,6 @@ import dpctl.dptensor.numpy_usm_shared as usmarray -from numba_dppy.testing import DPPYTestCase - @numba.njit() def numba_mul_add(a): @@ -70,7 +68,7 @@ def numba_T(x): return x.T -class TestUsmArray(DPPYTestCase): +class TestUsmArray(unittest.TestCase): def ndarray(self): """Create NumPy array""" return numpy.ones(10) From be6a2ca16f65e18ee259b51058dcf3a2a61fb7b8 Mon Sep 17 00:00:00 2001 From: "Todd A. Anderson" Date: Wed, 16 Dec 2020 16:30:44 -0600 Subject: [PATCH 23/40] Copy Numba array attribute code and replace array type with usmarray type since Numba extension point runs before the array attribute registration. 
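
Background on the mechanism this patch relies on (not part of the patch itself): in Numba, attribute accesses on a value are typed by an AttributeTemplate subclass whose `key` names the type and whose `resolve_<attr>` methods return the Numba type of each attribute. The sketch below is a minimal, hypothetical example of that pattern; `MyArrayType` and `MyArrayAttribute` are placeholders, and the `@infer_getattr` decorator registers with Numba's builtin typing registry, whereas the patch registers its copied `UsmArrayAttribute` with numba-dppy's own `typing_registry`.

from numba.core import types
from numba.core.typing.templates import AttributeTemplate, infer_getattr


class MyArrayType(types.Array):
    """Placeholder for a custom array type such as UsmSharedArrayType."""


@infer_getattr
class MyArrayAttribute(AttributeTemplate):
    # Typing requests for attributes of MyArrayType values are routed here.
    key = MyArrayType

    def resolve_ndim(self, ary):
        # a.ndim is typed as a native integer.
        return types.intp

    def resolve_shape(self, ary):
        # a.shape is typed as a homogeneous tuple of intp of length ndim.
        return types.UniTuple(types.intp, ary.ndim)

    def resolve_T(self, ary):
        # a.T keeps the element type; only the layout flips for ndim > 1.
        if ary.ndim <= 1:
            return ary
        layout = {"C": "F", "F": "C"}.get(ary.layout, "A")
        return ary.copy(layout=layout)
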
--- numba_dppy/numpy_usm_shared.py | 273 ++++++++++++++++++++++++++++++++- 1 file changed, 272 insertions(+), 1 deletion(-) diff --git a/numba_dppy/numpy_usm_shared.py b/numba_dppy/numpy_usm_shared.py index 0f058bc778..d1a0782be9 100644 --- a/numba_dppy/numpy_usm_shared.py +++ b/numba_dppy/numpy_usm_shared.py @@ -20,7 +20,8 @@ import importlib import functools import inspect -from numba.core.typing.templates import CallableTemplate +from numba.core.typing.templates import (CallableTemplate, AttributeTemplate, + signature, bound_function) from numba.np.arrayobj import _array_copy import dpctl.dptensor.numpy_usm_shared as nus @@ -273,6 +274,7 @@ def numba_register_typing(): #todo_classes.append(ig) for tgetattr in templates_registry.attributes: + dprint("Numpy getattr:", tgetattr, type(tgetattr), tgetattr.key) if tgetattr.key == types.Array: todo_getattr.append(tgetattr) @@ -369,6 +371,8 @@ def generic_impl(self): typing_registry.register_global(dpval, type_handler) # Handle usmarray attribute typing. + templates_registry.register_attr(UsmArrayAttribute) + """ for tgetattr in todo_getattr: class_name = tgetattr.__name__ + "_usmarray" dprint("tgetattr:", tgetattr, type(tgetattr), class_name) @@ -378,6 +382,7 @@ def set_key(cls, key): cls.key = key def getattr_impl(self, attr): + dprint("getattr_impl:", class_name, attr) if attr.startswith("resolve_"): def wrapper(*args, **kwargs): attr_res = tgetattr.__getattribute__(self, attr)(*args, **kwargs) @@ -387,6 +392,8 @@ def wrapper(*args, **kwargs): ndim=attr_res.ndim, layout=attr_res.layout, ) + else: + return attr_res return wrapper else: @@ -400,6 +407,270 @@ def wrapper(*args, **kwargs): new_usmarray_template.set_class_vars(UsmSharedArrayType) templates_registry.register_attr(new_usmarray_template) + """ + +class UsmArrayAttribute(AttributeTemplate): + key = UsmSharedArrayType + + def resolve_dtype(self, ary): + return types.DType(ary.dtype) + + def resolve_itemsize(self, ary): + return types.intp + + def resolve_shape(self, ary): + return types.UniTuple(types.intp, ary.ndim) + + def resolve_strides(self, ary): + return types.UniTuple(types.intp, ary.ndim) + + def resolve_ndim(self, ary): + return types.intp + + def resolve_size(self, ary): + return types.intp + + def resolve_flat(self, ary): + return types.NumpyFlatType(ary) + + def resolve_ctypes(self, ary): + return types.ArrayCTypes(ary) + + def resolve_flags(self, ary): + return types.ArrayFlags(ary) + + def resolve_T(self, ary): + if ary.ndim <= 1: + retty = ary + else: + layout = {"C": "F", "F": "C"}.get(ary.layout, "A") + retty = ary.copy(layout=layout) + return retty + + def resolve_real(self, ary): + return self._resolve_real_imag(ary, attr='real') + + def resolve_imag(self, ary): + return self._resolve_real_imag(ary, attr='imag') + + def _resolve_real_imag(self, ary, attr): + if ary.dtype in types.complex_domain: + return ary.copy(dtype=ary.dtype.underlying_float, layout='A') + elif ary.dtype in types.number_domain: + res = ary.copy(dtype=ary.dtype) + if attr == 'imag': + res = res.copy(readonly=True) + return res + else: + msg = "cannot access .{} of array of {}" + raise TypingError(msg.format(attr, ary.dtype)) + +""" + @bound_function("array.transpose") + def resolve_transpose(self, ary, args, kws): + def sentry_shape_scalar(ty): + if ty in types.number_domain: + # Guard against non integer type + if not isinstance(ty, types.Integer): + raise TypeError("transpose() arg cannot be {0}".format(ty)) + return True + else: + return False + + assert not kws + if len(args) == 0: + return 
signature(self.resolve_T(ary)) + + if len(args) == 1: + shape, = args + + if sentry_shape_scalar(shape): + assert ary.ndim == 1 + return signature(ary, *args) + + if isinstance(shape, types.NoneType): + return signature(self.resolve_T(ary)) + + shape = normalize_shape(shape) + if shape is None: + return + + assert ary.ndim == shape.count + return signature(self.resolve_T(ary).copy(layout="A"), shape) + + else: + if any(not sentry_shape_scalar(a) for a in args): + raise TypeError("transpose({0}) is not supported".format( + ', '.join(args))) + assert ary.ndim == len(args) + return signature(self.resolve_T(ary).copy(layout="A"), *args) + + @bound_function("array.copy") + def resolve_copy(self, ary, args, kws): + assert not args + assert not kws + retty = ary.copy(layout="C", readonly=False) + return signature(retty) + + @bound_function("array.item") + def resolve_item(self, ary, args, kws): + assert not kws + # We don't support explicit arguments as that's exactly equivalent + # to regular indexing. The no-argument form is interesting to + # allow some degree of genericity when writing functions. + if not args: + return signature(ary.dtype) + + @bound_function("array.itemset") + def resolve_itemset(self, ary, args, kws): + assert not kws + # We don't support explicit arguments as that's exactly equivalent + # to regular indexing. The no-argument form is interesting to + # allow some degree of genericity when writing functions. + if len(args) == 1: + return signature(types.none, ary.dtype) + + @bound_function("array.nonzero") + def resolve_nonzero(self, ary, args, kws): + assert not args + assert not kws + # 0-dim arrays return one result array + ndim = max(ary.ndim, 1) + retty = types.UniTuple(UsmSharedArrayType(types.intp, 1, 'C'), ndim) + return signature(retty) + + @bound_function("array.reshape") + def resolve_reshape(self, ary, args, kws): + def sentry_shape_scalar(ty): + if ty in types.number_domain: + # Guard against non integer type + if not isinstance(ty, types.Integer): + raise TypeError("reshape() arg cannot be {0}".format(ty)) + return True + else: + return False + + assert not kws + if ary.layout not in 'CF': + # only work for contiguous array + raise TypeError("reshape() supports contiguous array only") + + if len(args) == 1: + # single arg + shape, = args + + if sentry_shape_scalar(shape): + ndim = 1 + else: + shape = normalize_shape(shape) + if shape is None: + return + ndim = shape.count + retty = ary.copy(ndim=ndim) + return signature(retty, shape) + + elif len(args) == 0: + # no arg + raise TypeError("reshape() take at least one arg") + + else: + # vararg case + if any(not sentry_shape_scalar(a) for a in args): + raise TypeError("reshape({0}) is not supported".format( + ', '.join(map(str, args)))) + + retty = ary.copy(ndim=len(args)) + return signature(retty, *args) + + @bound_function("array.sort") + def resolve_sort(self, ary, args, kws): + assert not args + assert not kws + if ary.ndim == 1: + return signature(types.none) + + @bound_function("array.argsort") + def resolve_argsort(self, ary, args, kws): + assert not args + kwargs = dict(kws) + kind = kwargs.pop('kind', types.StringLiteral('quicksort')) + if not isinstance(kind, types.StringLiteral): + raise errors.TypingError('"kind" must be a string literal') + if kwargs: + msg = "Unsupported keywords: {!r}" + raise TypingError(msg.format([k for k in kwargs.keys()])) + if ary.ndim == 1: + def argsort_stub(kind='quicksort'): + pass + pysig = utils.pysignature(argsort_stub) + sig = signature(UsmSharedArrayType(types.intp, 
1, 'C'), kind).replace(pysig=pysig) + return sig + + @bound_function("array.view") + def resolve_view(self, ary, args, kws): + from .npydecl import parse_dtype + assert not kws + dtype, = args + dtype = parse_dtype(dtype) + if dtype is None: + return + retty = ary.copy(dtype=dtype) + return signature(retty, *args) + + @bound_function("array.astype") + def resolve_astype(self, ary, args, kws): + from .npydecl import parse_dtype + assert not kws + dtype, = args + dtype = parse_dtype(dtype) + if dtype is None: + return + if not self.context.can_convert(ary.dtype, dtype): + raise TypeError("astype(%s) not supported on %s: " + "cannot convert from %s to %s" + % (dtype, ary, ary.dtype, dtype)) + layout = ary.layout if ary.layout in 'CF' else 'C' + # reset the write bit irrespective of whether the cast type is the same + # as the current dtype, this replicates numpy + retty = ary.copy(dtype=dtype, layout=layout, readonly=False) + return signature(retty, *args) + + @bound_function("array.ravel") + def resolve_ravel(self, ary, args, kws): + # Only support no argument version (default order='C') + assert not kws + assert not args + return signature(ary.copy(ndim=1, layout='C')) + + @bound_function("array.flatten") + def resolve_flatten(self, ary, args, kws): + # Only support no argument version (default order='C') + assert not kws + assert not args + return signature(ary.copy(ndim=1, layout='C')) + + @bound_function("array.take") + def resolve_take(self, ary, args, kws): + assert not kws + argty, = args + if isinstance(argty, types.Integer): + sig = signature(ary.dtype, *args) + elif isinstance(argty, UsmSharedArrayType): + sig = signature(argty.copy(layout='C', dtype=ary.dtype), *args) + elif isinstance(argty, types.List): # 1d lists only + sig = signature(UsmSharedArrayType(ary.dtype, 1, 'C'), *args) + elif isinstance(argty, types.BaseTuple): + sig = signature(UsmSharedArrayType(ary.dtype, np.ndim(argty), 'C'), *args) + else: + raise TypeError("take(%s) not supported for %s" % argty) + return sig + + def generic_resolve(self, ary, attr): + # Resolution of other attributes, for record arrays + if isinstance(ary.dtype, types.Record): + if attr in ary.dtype.fields: + return ary.copy(dtype=ary.dtype.typeof(attr), layout='A') +""" @typing_registry.register_global(nus.as_ndarray) From 5ddc665f04a3b23d8c2c06cbbba8e1dda3d76827 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Thu, 17 Dec 2020 20:51:28 +0300 Subject: [PATCH 24/40] Update meta.yaml --- conda-recipe/meta.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index 5e5b61a25c..7620596a16 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -28,6 +28,7 @@ requirements: - llvm-spirv - llvmdev - dpnp 0.4.* # [linux] + - cython # For testing only: dpCtl requires it. test: requires: From 883c713d2f5d6f5e7ca55cd3aeaa8bb392954c04 Mon Sep 17 00:00:00 2001 From: "Todd A. Anderson" Date: Mon, 21 Dec 2020 13:11:51 -0600 Subject: [PATCH 25/40] Fixed the attribute typing issue. So now, transpose works. --- numba_dppy/numpy_usm_shared.py | 18 +++++++++++++++--- numba_dppy/tests/test_usmarray.py | 1 - 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/numba_dppy/numpy_usm_shared.py b/numba_dppy/numpy_usm_shared.py index d1a0782be9..83358ac7e9 100644 --- a/numba_dppy/numpy_usm_shared.py +++ b/numba_dppy/numpy_usm_shared.py @@ -371,7 +371,13 @@ def generic_impl(self): typing_registry.register_global(dpval, type_handler) # Handle usmarray attribute typing. 
- templates_registry.register_attr(UsmArrayAttribute) + # This explicit register_attr of a copied/modified UsmArrayAttribute + # may be removed in the future in favor of the below commented out code + # once we get this registration code to run after everything is registered + # in Numba. Right now, the attribute registrations we need are happening + # after the registration callback that gets us here so we would miss the + # attribute registrations we need. + typing_registry.register_attr(UsmArrayAttribute) """ for tgetattr in todo_getattr: class_name = tgetattr.__name__ + "_usmarray" @@ -439,13 +445,19 @@ def resolve_ctypes(self, ary): def resolve_flags(self, ary): return types.ArrayFlags(ary) + def convert_array_to_usmarray(self, retty): + if isinstance(retty, types.Array): + return UsmSharedArrayType(dtype=retty.dtype, ndim=retty.ndim, layout=retty.layout) + else: + return retty + def resolve_T(self, ary): if ary.ndim <= 1: retty = ary else: layout = {"C": "F", "F": "C"}.get(ary.layout, "A") retty = ary.copy(layout=layout) - return retty + return self.convert_array_to_usmarray(retty) def resolve_real(self, ary): return self._resolve_real_imag(ary, attr='real') @@ -460,7 +472,7 @@ def _resolve_real_imag(self, ary, attr): res = ary.copy(dtype=ary.dtype) if attr == 'imag': res = res.copy(readonly=True) - return res + return self.convert_array_to_usmarray(res) else: msg = "cannot access .{} of array of {}" raise TypingError(msg.format(attr, ary.dtype)) diff --git a/numba_dppy/tests/test_usmarray.py b/numba_dppy/tests/test_usmarray.py index abf1a78ec6..3f838e743a 100644 --- a/numba_dppy/tests/test_usmarray.py +++ b/numba_dppy/tests/test_usmarray.py @@ -192,7 +192,6 @@ def test_numba_usmarray_shape(self): self.assertIsInstance(s1, tuple, type(s1)) self.assertEqual(s1, (10, 10)) - @unittest.expectedFailure def test_numba_usmarray_T(self): """Testing Numba usmarray.T""" dp4 = numba_T(numba_usmarray_empty()) From 602f7e45dcda4c60a3105eaeed68f8de5176f440 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Tue, 22 Dec 2020 05:52:42 -0600 Subject: [PATCH 26/40] dpCtl >=0.5.1rc1 --- README.md | 2 +- conda-recipe/meta.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 9d1969fe3f..4600642ba3 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ https://intelpython.github.io/dpnp/ ## Dependencies * numba >=0.51 (IntelPython/numba) -* dpCtl 0.5.* +* dpCtl >=0.5.1 * dpNP 0.4.* (optional) * llvm-spirv (SPIRV generation from LLVM IR) * llvmdev (LLVM IR generation) diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index 7620596a16..1675055cb2 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -18,12 +18,12 @@ requirements: - setuptools - cython - numba - - dpctl 0.5.* + - dpctl >=0.5.1rc1 - dpnp 0.4.* # [linux] run: - python - numba >=0.51 - - dpctl 0.5.* + - dpctl >=0.5.1rc1 - spirv-tools - llvm-spirv - llvmdev From 0355b25f63d8831a8fc06958ad146f1ca3e8d5d7 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Tue, 22 Dec 2020 06:15:50 -0600 Subject: [PATCH 27/40] Remove expectedFailure for test_numba_usmarray_mul_add. It could fail in different test order. 
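
For context on why the marker is dropped rather than kept (this is standard-library behaviour, not project code): a test decorated with @unittest.expectedFailure that happens to pass is reported as an "unexpected success", and since Python 3.4 that makes TestResult.wasSuccessful() return False. The minimal sketch below only illustrates that stdlib semantics.

import unittest


class ExpectedFailureSemantics(unittest.TestCase):
    @unittest.expectedFailure
    def test_marked(self):
        # Failing here is reported as an "expected failure" and the run stays
        # green; passing here is reported as an "unexpected success", which
        # makes TestResult.wasSuccessful() return False. A test whose outcome
        # depends on test ordering therefore flips the suite result
        # unpredictably while the decorator is present.
        self.assertTrue(True)


if __name__ == "__main__":
    unittest.main()
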
--- numba_dppy/tests/test_usmarray.py | 1 - 1 file changed, 1 deletion(-) diff --git a/numba_dppy/tests/test_usmarray.py b/numba_dppy/tests/test_usmarray.py index 3f838e743a..b79f13d650 100644 --- a/numba_dppy/tests/test_usmarray.py +++ b/numba_dppy/tests/test_usmarray.py @@ -123,7 +123,6 @@ def test_python_usmarray_mul_add(self): self.assertIsInstance(b, usmarray.ndarray, type(b)) self.assertTrue(usmarray.has_array_interface(b)) - @unittest.expectedFailure def test_numba_usmarray_mul_add(self): """Testing Numba usmarray""" # fails if run tests in bunch From cd4058c16a7c7803592108e5ed6244a9c1f36669 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Tue, 22 Dec 2020 07:53:37 -0600 Subject: [PATCH 28/40] Revert "Update meta.yaml" This reverts commit 5ddc665f04a3b23d8c2c06cbbba8e1dda3d76827. --- conda-recipe/meta.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index 1675055cb2..01e0f8b406 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -28,7 +28,6 @@ requirements: - llvm-spirv - llvmdev - dpnp 0.4.* # [linux] - - cython # For testing only: dpCtl requires it. test: requires: From 9b4e707b8657d7e5d988e07dd172f448df2c7679 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Tue, 22 Dec 2020 10:24:52 -0600 Subject: [PATCH 29/40] Fix dpCtl library name on Windows --- numba_dppy/dppy_rt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba_dppy/dppy_rt.c b/numba_dppy/dppy_rt.c index 83fd6949b2..d889cec3f5 100644 --- a/numba_dppy/dppy_rt.c +++ b/numba_dppy/dppy_rt.c @@ -79,7 +79,7 @@ void usmarray_memsys_init(void) { exit(-1); } #else - char *lib_name = "libDPCTLSyclInterface.dll"; + char *lib_name = "DPCTLSyclInterface.dll"; char *malloc_name = "DPCTLmalloc_shared"; char *free_name = "DPCTLfree_with_queue"; char *get_queue_name = "DPCTLQueueMgr_GetCurrentQueue"; From f7b56d7a244e136ea7306e0ab4fcc7d14eaa7c36 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Tue, 22 Dec 2020 12:21:01 -0600 Subject: [PATCH 30/40] Fix checks for imported functions from dpCtl library. 
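
The fix itself is in C (dppy_rt.c): the pre-patch code null-checked the wrapper it was about to install (save_queue_allocator / save_queue_deallocator), which is never NULL, instead of the pointer returned by dlsym / GetProcAddress. The Python ctypes sketch below mirrors the corrected logic for illustration only; the library and symbol names are taken from dppy_rt.c, and load_required is a hypothetical helper, not project code.

import ctypes


def load_required(lib, symbol, lib_name):
    # Check the looked-up symbol itself. A wrapper function that is about to
    # be installed is always non-NULL, so checking the wrapper (the pre-patch
    # behaviour) can never detect a missing symbol.
    fn = getattr(lib, symbol, None)
    if fn is None:
        raise SystemExit("Did not find %s in %s" % (symbol, lib_name))
    return fn


lib_name = "libDPCTLSyclInterface.so"  # "DPCTLSyclInterface.dll" on Windows
try:
    sycl = ctypes.CDLL(lib_name)
except OSError as exc:
    raise SystemExit("Could not load %s: %s" % (lib_name, exc))

internal_allocator = load_required(sycl, "DPCTLmalloc_shared", lib_name)
internal_free = load_required(sycl, "DPCTLfree_with_queue", lib_name)
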
--- numba_dppy/dppy_rt.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/numba_dppy/dppy_rt.c b/numba_dppy/dppy_rt.c index d889cec3f5..f4c79b2f4d 100644 --- a/numba_dppy/dppy_rt.c +++ b/numba_dppy/dppy_rt.c @@ -52,7 +52,7 @@ void usmarray_memsys_init(void) { assert(sycldl != NULL); internal_allocator = (NRT_external_malloc_func)dlsym(sycldl, malloc_name); usmarray_allocator.malloc = save_queue_allocator; - if (usmarray_allocator.malloc == NULL) { + if (internal_allocator == NULL) { printf("Did not find %s in %s\n", malloc_name, lib_name); exit(-1); } @@ -61,7 +61,7 @@ void usmarray_memsys_init(void) { internal_free = (NRT_external_free_func)dlsym(sycldl, free_name); usmarray_allocator.free = save_queue_deallocator; - if (usmarray_allocator.free == NULL) { + if (internal_free == NULL) { printf("Did not find %s in %s\n", free_name, lib_name); exit(-1); } @@ -89,7 +89,7 @@ void usmarray_memsys_init(void) { assert(sycldl != NULL); internal_allocator = (NRT_external_malloc_func)GetProcAddress(sycldl, malloc_name); usmarray_allocator.malloc = save_queue_allocator; - if (usmarray_allocator.malloc == NULL) { + if (internal_allocator == NULL) { printf("Did not find %s in %s\n", malloc_name, lib_name); exit(-1); } @@ -98,7 +98,7 @@ void usmarray_memsys_init(void) { internal_free = (NRT_external_free_func)GetProcAddress(sycldl, free_name); usmarray_allocator.free = save_queue_deallocator; - if (usmarray_allocator.free == NULL) { + if (internal_free == NULL) { printf("Did not find %s in %s\n", free_name, lib_name); exit(-1); } From 8fd8e5eefd7e1560c270d71ff336eb1164d4a96a Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Tue, 22 Dec 2020 06:08:26 -0600 Subject: [PATCH 31/40] Include numba/.../*.h in C code --- numba_dppy/dppy_rt.c | 4 ++-- setup.py | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/numba_dppy/dppy_rt.c b/numba_dppy/dppy_rt.c index f4c79b2f4d..6589a369df 100644 --- a/numba_dppy/dppy_rt.c +++ b/numba_dppy/dppy_rt.c @@ -1,5 +1,5 @@ -#include "_pymodule.h" -#include "core/runtime/nrt_external.h" +#include "numba/_pymodule.h" +#include "numba/core/runtime/nrt_external.h" #include "assert.h" #include #if !defined _WIN32 diff --git a/setup.py b/setup.py index d833951745..a6dcfd4d32 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,6 @@ import os from setuptools import Extension, find_packages, setup from Cython.Build import cythonize -from numba.core.extending import include_path import versioneer import sys @@ -9,13 +8,12 @@ def get_ext_modules(): ext_modules = [] - numba_dir = include_path() + import numba ext_dppy = Extension( name="numba_dppy._dppy_rt", sources=["numba_dppy/dppy_rt.c"], - include_dirs=[numba_dir + "/numba"], - depends=[numba_dir + "/numba/core/runtime/nrt_external.h", numba_dir + "/numba/core/runtime/nrt.h", numba_dir + "/numba/_pymodule.h"], + include_dirs=[numba.core.extending.include_path()], ) ext_modules += [ext_dppy] From 4aa05b7ddc02af683dc1529eff02c61f9b02c0d1 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Wed, 23 Dec 2020 09:19:15 -0600 Subject: [PATCH 32/40] black formatter --- numba_dppy/__init__.py | 6 +- numba_dppy/_version.py | 154 +++-- numba_dppy/codegen.py | 19 +- numba_dppy/compiler.py | 271 +++++---- numba_dppy/decorators.py | 14 +- numba_dppy/descriptor.py | 5 +- numba_dppy/device_init.py | 4 +- numba_dppy/dispatcher.py | 63 ++- numba_dppy/dpnp_glue/__init__.py | 2 +- numba_dppy/dpnp_glue/dpnp_linalgimpl.py | 38 +- .../dpnp_glue/dpnp_sort_search_countimpl.py | 10 +- 
numba_dppy/dpnp_glue/dpnp_statisticsimpl.py | 74 ++- .../dpnp_glue/dpnp_transcendentalsimpl.py | 4 +- numba_dppy/dpnp_glue/dpnpdecl.py | 6 +- numba_dppy/dpnp_glue/dpnpimpl.py | 14 +- numba_dppy/dpnp_glue/stubs.py | 7 +- numba_dppy/dppy_host_fn_call_gen.py | 387 ++++++++----- numba_dppy/dppy_lowerer.py | 524 ++++++++++-------- numba_dppy/dppy_offload_dispatcher.py | 49 +- numba_dppy/dppy_parfor_diagnostics.py | 59 +- numba_dppy/dppy_passbuilder.py | 97 ++-- numba_dppy/dppy_passes.py | 161 +++--- numba_dppy/dufunc_inliner.py | 93 +++- numba_dppy/examples/blacksholes_njit.py | 26 +- numba_dppy/examples/dppy_func.py | 2 +- numba_dppy/examples/dppy_with_context.py | 7 +- numba_dppy/examples/matmul.py | 14 +- numba_dppy/examples/pa_examples/test1.py | 2 +- numba_dppy/examples/pairwise_distance.py | 24 +- numba_dppy/examples/sum-hybrid.py | 6 +- numba_dppy/examples/sum.py | 2 +- numba_dppy/examples/sum2D.py | 12 +- numba_dppy/examples/sum_ndarray.py | 6 +- numba_dppy/examples/sum_reduction.py | 17 +- numba_dppy/examples/sum_reduction_ocl.py | 15 +- .../examples/sum_reduction_recursive_ocl.py | 48 +- numba_dppy/initialize.py | 18 +- numba_dppy/numpy_usm_shared.py | 32 +- numba_dppy/ocl/atomics/__init__.py | 9 +- numba_dppy/ocl/mathdecl.py | 11 +- numba_dppy/ocl/mathimpl.py | 145 ++--- numba_dppy/ocl/ocldecl.py | 26 +- numba_dppy/ocl/oclimpl.py | 207 ++++--- numba_dppy/ocl/stubs.py | 31 +- numba_dppy/printimpl.py | 5 +- numba_dppy/rename_numpy_functions_pass.py | 50 +- numba_dppy/spirv_generator.py | 80 +-- numba_dppy/target.py | 188 ++++--- numba_dppy/target_dispatcher.py | 46 +- numba_dppy/testing.py | 2 + numba_dppy/tests/__init__.py | 1 + numba_dppy/tests/skip_tests.py | 1 + numba_dppy/tests/test_arg_accessor.py | 29 +- numba_dppy/tests/test_arg_types.py | 12 +- numba_dppy/tests/test_atomic_op.py | 62 +-- numba_dppy/tests/test_barrier.py | 13 +- numba_dppy/tests/test_black_scholes.py | 70 ++- numba_dppy/tests/test_caching.py | 11 +- .../tests/test_controllable_fallback.py | 16 +- numba_dppy/tests/test_device_array_args.py | 7 +- numba_dppy/tests/test_dpctl_api.py | 4 +- numba_dppy/tests/test_dpnp_functions.py | 161 ++++-- numba_dppy/tests/test_dppy_fallback.py | 10 +- numba_dppy/tests/test_dppy_func.py | 5 +- numba_dppy/tests/test_math_functions.py | 49 +- .../test_numpy_bit_twiddling_functions.py | 4 +- .../tests/test_numpy_comparison_functions.py | 4 +- .../tests/test_numpy_floating_functions.py | 10 +- numba_dppy/tests/test_numpy_math_functions.py | 11 +- .../test_numpy_trigonomteric_functions.py | 4 +- numba_dppy/tests/test_offload_diagnostics.py | 4 +- numba_dppy/tests/test_parfor_lower_message.py | 4 +- numba_dppy/tests/test_prange.py | 38 +- numba_dppy/tests/test_print.py | 4 +- .../tests/test_rename_numpy_function_pass.py | 18 +- numba_dppy/tests/test_sum_reduction.py | 17 +- numba_dppy/tests/test_vectorize.py | 7 +- numba_dppy/tests/test_with_context.py | 16 +- setup.py | 20 +- 79 files changed, 2224 insertions(+), 1480 deletions(-) diff --git a/numba_dppy/__init__.py b/numba_dppy/__init__.py index ac4e898889..7bc3ea504a 100644 --- a/numba_dppy/__init__.py +++ b/numba_dppy/__init__.py @@ -506,17 +506,21 @@ def main(): import numba.testing from .config import dppy_present + if dppy_present: from .device_init import * else: raise ImportError("Importing numba-dppy failed") + def test(*args, **kwargs): if not dppy_present and not is_available(): dppy_error() return numba.testing.test("numba_dppy.tests", *args, **kwargs) + from ._version import get_versions -__version__ = 
get_versions()['version'] + +__version__ = get_versions()["version"] del get_versions diff --git a/numba_dppy/_version.py b/numba_dppy/_version.py index 165dbf4d17..dc6811b00f 100644 --- a/numba_dppy/_version.py +++ b/numba_dppy/_version.py @@ -1,4 +1,3 @@ - # This file helps to compute a version number in source trees obtained from # git-archive tarball (such as those provided by githubs download-from-tag # feature). Distribution tarballs (built by setup.py sdist) and build @@ -58,17 +57,18 @@ class NotThisMethod(Exception): def register_vcs_handler(vcs, method): # decorator """Create decorator to mark a method as the handler of a VCS.""" + def decorate(f): """Store f in HANDLERS[vcs][method].""" if vcs not in HANDLERS: HANDLERS[vcs] = {} HANDLERS[vcs][method] = f return f + return decorate -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, - env=None): +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): """Call the given command(s).""" assert isinstance(commands, list) p = None @@ -76,10 +76,13 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, try: dispcmd = str([c] + args) # remember shell=False, so use git.cmd on windows, not just git - p = subprocess.Popen([c] + args, cwd=cwd, env=env, - stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr - else None)) + p = subprocess.Popen( + [c] + args, + cwd=cwd, + env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr else None), + ) break except EnvironmentError: e = sys.exc_info()[1] @@ -114,16 +117,22 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): for i in range(3): dirname = os.path.basename(root) if dirname.startswith(parentdir_prefix): - return {"version": dirname[len(parentdir_prefix):], - "full-revisionid": None, - "dirty": False, "error": None, "date": None} + return { + "version": dirname[len(parentdir_prefix) :], + "full-revisionid": None, + "dirty": False, + "error": None, + "date": None, + } else: rootdirs.append(root) root = os.path.dirname(root) # up a level if verbose: - print("Tried directories %s but none started with prefix %s" % - (str(rootdirs), parentdir_prefix)) + print( + "Tried directories %s but none started with prefix %s" + % (str(rootdirs), parentdir_prefix) + ) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") @@ -183,7 +192,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " - tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + tags = set([r[len(TAG) :] for r in refs if r.startswith(TAG)]) if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %d @@ -192,7 +201,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". - tags = set([r for r in refs if re.search(r'\d', r)]) + tags = set([r for r in refs if re.search(r"\d", r)]) if verbose: print("discarding '%s', no digits" % ",".join(refs - tags)) if verbose: @@ -200,19 +209,26 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): for ref in sorted(tags): # sorting will prefer e.g. 
"2.0" over "2.0rc1" if ref.startswith(tag_prefix): - r = ref[len(tag_prefix):] + r = ref[len(tag_prefix) :] if verbose: print("picking %s" % r) - return {"version": r, - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": None, - "date": date} + return { + "version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": None, + "date": date, + } # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") - return {"version": "0+unknown", - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": "no suitable tags", "date": None} + return { + "version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": "no suitable tags", + "date": None, + } @register_vcs_handler("git", "pieces_from_vcs") @@ -227,8 +243,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] - out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, - hide_stderr=True) + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) if rc != 0: if verbose: print("Directory %s not under git control" % root) @@ -236,10 +251,19 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", - "--always", "--long", - "--match", "%s*" % tag_prefix], - cwd=root) + describe_out, rc = run_command( + GITS, + [ + "describe", + "--tags", + "--dirty", + "--always", + "--long", + "--match", + "%s*" % tag_prefix, + ], + cwd=root, + ) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") @@ -262,17 +286,16 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: - git_describe = git_describe[:git_describe.rindex("-dirty")] + git_describe = git_describe[: git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX - mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) if not mo: # unparseable. Maybe git-describe is misbehaving? 
- pieces["error"] = ("unable to parse git-describe output: '%s'" - % describe_out) + pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out return pieces # tag @@ -281,10 +304,12 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if verbose: fmt = "tag '%s' doesn't start with prefix '%s'" print(fmt % (full_tag, tag_prefix)) - pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" - % (full_tag, tag_prefix)) + pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( + full_tag, + tag_prefix, + ) return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix):] + pieces["closest-tag"] = full_tag[len(tag_prefix) :] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) @@ -295,13 +320,13 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): else: # HEX: no tags pieces["closest-tag"] = None - count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], - cwd=root) + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) pieces["distance"] = int(count_out) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() - date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], - cwd=root)[0].strip() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[ + 0 + ].strip() # Use only the last line. Previous lines may contain GPG signature # information. date = date.splitlines()[-1] @@ -335,8 +360,7 @@ def render_pep440(pieces): rendered += ".dirty" else: # exception #1 - rendered = "0+untagged.%d.g%s" % (pieces["distance"], - pieces["short"]) + rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered @@ -450,11 +474,13 @@ def render_git_describe_long(pieces): def render(pieces, style): """Render the given version pieces into the requested style.""" if pieces["error"]: - return {"version": "unknown", - "full-revisionid": pieces.get("long"), - "dirty": None, - "error": pieces["error"], - "date": None} + return { + "version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None, + } if not style or style == "default": style = "pep440" # the default @@ -474,9 +500,13 @@ def render(pieces, style): else: raise ValueError("unknown style '%s'" % style) - return {"version": rendered, "full-revisionid": pieces["long"], - "dirty": pieces["dirty"], "error": None, - "date": pieces.get("date")} + return { + "version": rendered, + "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], + "error": None, + "date": pieces.get("date"), + } def get_versions(): @@ -490,8 +520,7 @@ def get_versions(): verbose = cfg.verbose try: - return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, - verbose) + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose) except NotThisMethod: pass @@ -500,13 +529,16 @@ def get_versions(): # versionfile_source is the relative path from the top of the source # tree (where the .git directory might live) to this file. Invert # this to find the root from __file__. 
- for i in cfg.versionfile_source.split('/'): + for i in cfg.versionfile_source.split("/"): root = os.path.dirname(root) except NameError: - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, - "error": "unable to find root of source tree", - "date": None} + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None, + } try: pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) @@ -520,6 +552,10 @@ def get_versions(): except NotThisMethod: pass - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, - "error": "unable to compute version", "date": None} + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", + "date": None, + } diff --git a/numba_dppy/codegen.py b/numba_dppy/codegen.py index 4e278d7ebc..ef78551a2f 100644 --- a/numba_dppy/codegen.py +++ b/numba_dppy/codegen.py @@ -5,14 +5,17 @@ from numba.core import utils -SPIR_TRIPLE = {32: ' spir-unknown-unknown', - 64: 'spir64-unknown-unknown'} +SPIR_TRIPLE = {32: " spir-unknown-unknown", 64: "spir64-unknown-unknown"} SPIR_DATA_LAYOUT = { - 32 : ('e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:' - '256-v512:512-v1024:1024'), - 64 : ('e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-' - 'v512:512-v1024:1024') + 32: ( + "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:" + "256-v512:512-v1024:1024" + ), + 64: ( + "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-" + "v512:512-v1024:1024" + ), } @@ -36,8 +39,8 @@ def _optimize_final_module(self): def _finalize_specific(self): # Fix global naming for gv in self._final_module.global_variables: - if '.' in gv.name: - gv.name = gv.name.replace('.', '_') + if "." in gv.name: + gv.name = gv.name.replace(".", "_") def get_asm_str(self): # Return nothing: we can only dump assembler code when it is later diff --git a/numba_dppy/compiler.py b/numba_dppy/compiler.py index 37b9e25e9f..31e9d57278 100644 --- a/numba_dppy/compiler.py +++ b/numba_dppy/compiler.py @@ -22,22 +22,26 @@ from numba_dppy.dppy_parfor_diagnostics import ExtendedParforDiagnostics -DEBUG = os.environ.get('NUMBA_DPPY_DEBUG', None) -_NUMBA_DPPY_READ_ONLY = "read_only" +DEBUG = os.environ.get("NUMBA_DPPY_DEBUG", None) +_NUMBA_DPPY_READ_ONLY = "read_only" _NUMBA_DPPY_WRITE_ONLY = "write_only" _NUMBA_DPPY_READ_WRITE = "read_write" def _raise_no_device_found_error(): - error_message = ("No OpenCL device specified. " - "Usage : jit_fn[device, globalsize, localsize](...)") + error_message = ( + "No OpenCL device specified. " + "Usage : jit_fn[device, globalsize, localsize](...)" + ) raise ValueError(error_message) def _raise_invalid_kernel_enqueue_args(): - error_message = ("Incorrect number of arguments for enquing dppy.kernel. " - "Usage: device_env, global size, local size. " - "The local size argument is optional.") + error_message = ( + "Incorrect number of arguments for enquing dppy.kernel. " + "Usage: device_env, global size, local size. " + "The local size argument is optional." 
+ ) raise ValueError(error_message) @@ -63,18 +67,14 @@ def define_pipelines(self): # this maintains the objmode fallback behaviour pms = [] self.state.parfor_diagnostics = ExtendedParforDiagnostics() - self.state.metadata['parfor_diagnostics'] = self.state.parfor_diagnostics + self.state.metadata["parfor_diagnostics"] = self.state.parfor_diagnostics if not self.state.flags.force_pyobject: - #print("Numba-DPPY [INFO]: Using Numba-DPPY pipeline") + # print("Numba-DPPY [INFO]: Using Numba-DPPY pipeline") pms.append(DPPYPassBuilder.define_nopython_pipeline(self.state)) if self.state.status.can_fallback or self.state.flags.force_pyobject: - pms.append( - DefaultPassBuilder.define_objectmode_pipeline(self.state) - ) + pms.append(DefaultPassBuilder.define_objectmode_pipeline(self.state)) if self.state.status.can_giveup: - pms.append( - DefaultPassBuilder.define_interpreted_pipeline(self.state) - ) + pms.append(DefaultPassBuilder.define_interpreted_pipeline(self.state)) return pms @@ -88,32 +88,36 @@ def compile_with_dppy(pyfunc, return_type, args, debug): flags = compiler.Flags() # Do not compile (generate native code), just lower (to LLVM) if debug: - flags.set('debuginfo') - flags.set('no_compile') - flags.set('no_cpython_wrapper') - flags.unset('nrt') + flags.set("debuginfo") + flags.set("no_compile") + flags.set("no_cpython_wrapper") + flags.unset("nrt") # Run compilation pipeline if isinstance(pyfunc, FunctionType): - cres = compiler.compile_extra(typingctx=typingctx, - targetctx=targetctx, - func=pyfunc, - args=args, - return_type=return_type, - flags=flags, - locals={}, - pipeline_class=DPPYCompiler) + cres = compiler.compile_extra( + typingctx=typingctx, + targetctx=targetctx, + func=pyfunc, + args=args, + return_type=return_type, + flags=flags, + locals={}, + pipeline_class=DPPYCompiler, + ) elif isinstance(pyfunc, ir.FunctionIR): - cres = compiler.compile_ir(typingctx=typingctx, - targetctx=targetctx, - func_ir=pyfunc, - args=args, - return_type=return_type, - flags=flags, - locals={}, - pipeline_class=DPPYCompiler) + cres = compiler.compile_ir( + typingctx=typingctx, + targetctx=targetctx, + func_ir=pyfunc, + args=args, + return_type=return_type, + flags=flags, + locals={}, + pipeline_class=DPPYCompiler, + ) else: - assert(0) + assert 0 # Linking depending libraries # targetctx.link_dependencies(cres.llvm_module, cres.target_context.linking) library = cres.library @@ -138,17 +142,18 @@ def compile_kernel(sycl_queue, pyfunc, args, access_types, debug=False): # depending on the target context. For example, we want to link our kernel object # with implementation containing atomic operations only when atomic operations # are being used in the kernel. 
- oclkern = DPPYKernel(context=cres.target_context, - sycl_queue=sycl_queue, - llvm_module=kernel.module, - name=kernel.name, - argtypes=cres.signature.args, - ordered_arg_access_types=access_types) + oclkern = DPPYKernel( + context=cres.target_context, + sycl_queue=sycl_queue, + llvm_module=kernel.module, + name=kernel.name, + argtypes=cres.signature.args, + ordered_arg_access_types=access_types, + ) return oclkern -def compile_kernel_parfor(sycl_queue, func_ir, args, args_with_addrspaces, - debug=False): +def compile_kernel_parfor(sycl_queue, func_ir, args, args_with_addrspaces, debug=False): if DEBUG: print("compile_kernel_parfor", args) for a in args: @@ -156,25 +161,26 @@ def compile_kernel_parfor(sycl_queue, func_ir, args, args_with_addrspaces, if isinstance(a, types.npytypes.Array): print("addrspace:", a.addrspace) - cres = compile_with_dppy(func_ir, None, args_with_addrspaces, - debug=debug) + cres = compile_with_dppy(func_ir, None, args_with_addrspaces, debug=debug) func = cres.library.get_function(cres.fndesc.llvm_func_name) if DEBUG: print("compile_kernel_parfor signature", cres.signature.args) for a in cres.signature.args: print(a, type(a)) -# if isinstance(a, types.npytypes.Array): -# print("addrspace:", a.addrspace) + # if isinstance(a, types.npytypes.Array): + # print("addrspace:", a.addrspace) kernel = cres.target_context.prepare_ocl_kernel(func, cres.signature.args) - #kernel = cres.target_context.prepare_ocl_kernel(func, args_with_addrspaces) - oclkern = DPPYKernel(context=cres.target_context, - sycl_queue=sycl_queue, - llvm_module=kernel.module, - name=kernel.name, - argtypes=args_with_addrspaces) - #argtypes=cres.signature.args) + # kernel = cres.target_context.prepare_ocl_kernel(func, args_with_addrspaces) + oclkern = DPPYKernel( + context=cres.target_context, + sycl_queue=sycl_queue, + llvm_module=kernel.module, + name=kernel.name, + argtypes=args_with_addrspaces, + ) + # argtypes=cres.signature.args) return oclkern @@ -196,8 +202,7 @@ class dppy_function_template(ConcreteTemplate): # Compile dppy function template def compile_dppy_func_template(pyfunc): - """Compile a DPPYFunctionTemplate - """ + """Compile a DPPYFunctionTemplate""" from .descriptor import dppy_target dft = DPPYFunctionTemplate(pyfunc) @@ -215,8 +220,8 @@ def generic(self, args, kws): class DPPYFunctionTemplate(object): - """Unmaterialized dppy function - """ + """Unmaterialized dppy function""" + def __init__(self, pyfunc, debug=False): self.py_func = pyfunc self.debug = debug @@ -239,8 +244,7 @@ def compile(self, args): if first_definition: # First definition - cres.target_context.insert_user_function(self, cres.fndesc, - libs) + cres.target_context.insert_user_function(self, cres.fndesc, libs) else: cres.target_context.add_user_function(self, cres.fndesc, libs) @@ -258,61 +262,65 @@ def __init__(self, cres): def _ensure_valid_work_item_grid(val, sycl_queue): if not isinstance(val, (tuple, list, int)): - error_message = ("Cannot create work item dimension from " - "provided argument") + error_message = "Cannot create work item dimension from " "provided argument" raise ValueError(error_message) if isinstance(val, int): val = [val] # TODO: we need some way to check the max dimensions - ''' + """ if len(val) > device_env.get_max_work_item_dims(): error_message = ("Unsupported number of work item dimensions ") raise ValueError(error_message) - ''' + """ + + return list( + val[::-1] + ) # reversing due to sycl and opencl interop kernel range mismatch semantic - return list(val[::-1]) # reversing due to 
sycl and opencl interop kernel range mismatch semantic def _ensure_valid_work_group_size(val, work_item_grid): if not isinstance(val, (tuple, list, int)): - error_message = ("Cannot create work item dimension from " - "provided argument") + error_message = "Cannot create work item dimension from " "provided argument" raise ValueError(error_message) if isinstance(val, int): val = [val] if len(val) != len(work_item_grid): - error_message = ("Unsupported number of work item dimensions, " + - "dimensions of global and local work items has to be the same ") + error_message = ( + "Unsupported number of work item dimensions, " + + "dimensions of global and local work items has to be the same " + ) raise ValueError(error_message) - return list(val[::-1]) # reversing due to sycl and opencl interop kernel range mismatch semantic + return list( + val[::-1] + ) # reversing due to sycl and opencl interop kernel range mismatch semantic class DPPYKernelBase(object): - """Define interface for configurable kernels - """ + """Define interface for configurable kernels""" def __init__(self): self.global_size = [] - self.local_size = [] - self.sycl_queue = None + self.local_size = [] + self.sycl_queue = None # list of supported access types, stored in dict for fast lookup self.valid_access_types = { - _NUMBA_DPPY_READ_ONLY: _NUMBA_DPPY_READ_ONLY, - _NUMBA_DPPY_WRITE_ONLY: _NUMBA_DPPY_WRITE_ONLY, - _NUMBA_DPPY_READ_WRITE: _NUMBA_DPPY_READ_WRITE} + _NUMBA_DPPY_READ_ONLY: _NUMBA_DPPY_READ_ONLY, + _NUMBA_DPPY_WRITE_ONLY: _NUMBA_DPPY_WRITE_ONLY, + _NUMBA_DPPY_READ_WRITE: _NUMBA_DPPY_READ_WRITE, + } def copy(self): return copy.copy(self) def configure(self, sycl_queue, global_size, local_size=None): - """Configure the OpenCL kernel. The local_size can be None - """ + """Configure the OpenCL kernel. 
The local_size can be None""" clone = self.copy() clone.global_size = global_size clone.local_size = local_size @@ -346,8 +354,15 @@ class DPPYKernel(DPPYKernelBase): A OCL kernel object """ - def __init__(self, context, sycl_queue, llvm_module, name, argtypes, - ordered_arg_access_types=None): + def __init__( + self, + context, + sycl_queue, + llvm_module, + name, + argtypes, + ordered_arg_access_types=None, + ): super(DPPYKernel, self).__init__() self._llvm_module = llvm_module self.assembly = self.binary = llvm_module.__str__() @@ -365,7 +380,9 @@ def __init__(self, context, sycl_queue, llvm_module, name, argtypes, self.spirv_bc = spirv_generator.llvm_to_spirv(self.context, self.binary) # create a program - self.program = dpctl_prog.create_program_from_spirv(self.sycl_queue, self.spirv_bc) + self.program = dpctl_prog.create_program_from_spirv( + self.sycl_queue, self.spirv_bc + ) # create a kernel self.kernel = self.program.get_sycl_kernel(self.entry_name) @@ -376,33 +393,47 @@ def __call__(self, *args): retr = [] # hold functors for writeback kernelargs = [] internal_device_arrs = [] - for ty, val, access_type in zip(self.argument_types, args, - self.ordered_arg_access_types): - self._unpack_argument(ty, val, self.sycl_queue, retr, - kernelargs, internal_device_arrs, access_type) + for ty, val, access_type in zip( + self.argument_types, args, self.ordered_arg_access_types + ): + self._unpack_argument( + ty, + val, + self.sycl_queue, + retr, + kernelargs, + internal_device_arrs, + access_type, + ) - self.sycl_queue.submit(self.kernel, kernelargs, self.global_size, self.local_size) + self.sycl_queue.submit( + self.kernel, kernelargs, self.global_size, self.local_size + ) self.sycl_queue.wait() - for ty, val, i_dev_arr, access_type in zip(self.argument_types, args, - internal_device_arrs, self.ordered_arg_access_types): - self._pack_argument(ty, val, self.sycl_queue, i_dev_arr, - access_type) + for ty, val, i_dev_arr, access_type in zip( + self.argument_types, + args, + internal_device_arrs, + self.ordered_arg_access_types, + ): + self._pack_argument(ty, val, self.sycl_queue, i_dev_arr, access_type) def _pack_argument(self, ty, val, sycl_queue, device_arr, access_type): """ Copy device data back to host """ - if (device_arr and (access_type not in self.valid_access_types or - access_type in self.valid_access_types and - self.valid_access_types[access_type] != _NUMBA_DPPY_READ_ONLY)): + if device_arr and ( + access_type not in self.valid_access_types + or access_type in self.valid_access_types + and self.valid_access_types[access_type] != _NUMBA_DPPY_READ_ONLY + ): # we get the date back to host if have created a # device_array or if access_type of this device_array # is not of type read_only and read_write usm_buf, usm_ndarr, orig_ndarray = device_arr np.copyto(orig_ndarray, usm_ndarr) - def _unpack_device_array_argument(self, val, kernelargs): # this function only takes ndarrays created using USM allocated buffer void_ptr_arg = True @@ -422,9 +453,9 @@ def _unpack_device_array_argument(self, val, kernelargs): for ax in range(val.ndim): kernelargs.append(ctypes.c_longlong(val.strides[ax])) - - def _unpack_argument(self, ty, val, sycl_queue, retr, kernelargs, - device_arrs, access_type): + def _unpack_argument( + self, ty, val, sycl_queue, retr, kernelargs, device_arrs, access_type + ): """ Convert arguments to ctypes and append to kernelargs """ @@ -440,9 +471,11 @@ def _unpack_argument(self, ty, val, sycl_queue, retr, kernelargs, usm_buf = dpctl_mem.MemoryUSMShared(val.size * 
val.dtype.itemsize) usm_ndarr = np.ndarray(val.shape, buffer=usm_buf, dtype=val.dtype) - if (default_behavior or - self.valid_access_types[access_type] == _NUMBA_DPPY_READ_ONLY or - self.valid_access_types[access_type] == _NUMBA_DPPY_READ_WRITE): + if ( + default_behavior + or self.valid_access_types[access_type] == _NUMBA_DPPY_READ_ONLY + or self.valid_access_types[access_type] == _NUMBA_DPPY_READ_WRITE + ): np.copyto(usm_ndarr, val) device_arrs[-1] = (usm_buf, usm_ndarr, val) @@ -470,13 +503,13 @@ def _unpack_argument(self, ty, val, sycl_queue, retr, kernelargs, cval = ctypes.c_uint8(int(val)) kernelargs.append(cval) elif ty == types.complex64: - #kernelargs.append(ctypes.c_float(val.real)) - #kernelargs.append(ctypes.c_float(val.imag)) + # kernelargs.append(ctypes.c_float(val.real)) + # kernelargs.append(ctypes.c_float(val.imag)) raise NotImplementedError(ty, val) elif ty == types.complex128: - #kernelargs.append(ctypes.c_double(val.real)) - #kernelargs.append(ctypes.c_double(val.imag)) + # kernelargs.append(ctypes.c_double(val.real)) + # kernelargs.append(ctypes.c_double(val.imag)) raise NotImplementedError(ty, val) else: @@ -484,13 +517,16 @@ def _unpack_argument(self, ty, val, sycl_queue, retr, kernelargs, def check_for_invalid_access_type(self, access_type): if access_type not in self.valid_access_types: - msg = ("[!] %s is not a valid access type. " - "Supported access types are [" % (access_type)) + msg = ( + "[!] %s is not a valid access type. " + "Supported access types are [" % (access_type) + ) for key in self.valid_access_types: msg += " %s |" % (key) msg = msg[:-1] + "]" - if access_type != None: print(msg) + if access_type != None: + print(msg) return True else: return False @@ -518,28 +554,27 @@ def __call__(self, *args, **kwargs): _raise_no_device_found_error() kernel = self.specialize(*args) - cfg = kernel.configure(self.sycl_queue, self.global_size, - self.local_size) + cfg = kernel.configure(self.sycl_queue, self.global_size, self.local_size) cfg(*args) def specialize(self, *args): - argtypes = tuple([self.typingctx.resolve_argument_type(a) - for a in args]) + argtypes = tuple([self.typingctx.resolve_argument_type(a) for a in args]) q = None kernel = None # we were previously using the _env_ptr of the device_env, the sycl_queue # should be sufficient to cache the compiled kernel for now, but we should # use the device type to cache such kernels - #key_definitions = (self.sycl_queue, argtypes) - key_definitions = (argtypes) + # key_definitions = (self.sycl_queue, argtypes) + key_definitions = argtypes result = self.definitions.get(key_definitions) if result: q, kernel = result if q and self.sycl_queue.equals(q): - return kernel + return kernel else: - kernel = compile_kernel(self.sycl_queue, self.py_func, argtypes, - self.access_types) + kernel = compile_kernel( + self.sycl_queue, self.py_func, argtypes, self.access_types + ) self.definitions[key_definitions] = (self.sycl_queue, kernel) return kernel diff --git a/numba_dppy/decorators.py b/numba_dppy/decorators.py index 641d924134..7576e20aae 100644 --- a/numba_dppy/decorators.py +++ b/numba_dppy/decorators.py @@ -1,7 +1,12 @@ from __future__ import print_function, absolute_import, division from numba.core import sigutils, types -from .compiler import (compile_kernel, JitDPPYKernel, compile_dppy_func_template, - compile_dppy_func, get_ordered_arg_access_types) +from .compiler import ( + compile_kernel, + JitDPPYKernel, + compile_dppy_func_template, + compile_dppy_func, + get_ordered_arg_access_types, +) def 
kernel(signature=None, access_types=None, debug=False): @@ -23,13 +28,14 @@ def autojit(debug=False, access_types=None): def _kernel_autojit(pyfunc): ordered_arg_access_types = get_ordered_arg_access_types(pyfunc, access_types) return JitDPPYKernel(pyfunc, ordered_arg_access_types) + return _kernel_autojit def _kernel_jit(signature, debug, access_types): argtypes, restype = sigutils.normalize_signature(signature) if restype is not None and restype != types.void: - msg = ("DPPY kernel must have void return type but got {restype}") + msg = "DPPY kernel must have void return type but got {restype}" raise TypeError(msg.format(restype=restype)) def _wrapped(pyfunc): @@ -39,7 +45,6 @@ def _wrapped(pyfunc): return _wrapped - def func(signature=None): if signature is None: return _func_autojit @@ -58,5 +63,6 @@ def _wrapped(pyfunc): return _wrapped + def _func_autojit(pyfunc): return compile_dppy_func_template(pyfunc) diff --git a/numba_dppy/descriptor.py b/numba_dppy/descriptor.py index c8e6a58ec7..40e1ef0628 100644 --- a/numba_dppy/descriptor.py +++ b/numba_dppy/descriptor.py @@ -10,8 +10,8 @@ class DPPYTarget(TargetDescriptor): options = CPUTargetOptions - #typingctx = DPPYTypingContext() - #targetctx = DPPYTargetContext(typingctx) + # typingctx = DPPYTypingContext() + # targetctx = DPPYTargetContext(typingctx) @utils.cached_property def _toplevel_target_context(self): @@ -38,6 +38,5 @@ def typing_context(self): return self._toplevel_typing_context - # The global DPPY target dppy_target = DPPYTarget() diff --git a/numba_dppy/device_init.py b/numba_dppy/device_init.py index efec55ba83..57c0d9c6e7 100644 --- a/numba_dppy/device_init.py +++ b/numba_dppy/device_init.py @@ -22,9 +22,7 @@ We are importing dpnp stub module to make Numba recognize the module when we rename Numpy functions. """ -from .dpnp_glue.stubs import ( - dpnp -) +from .dpnp_glue.stubs import dpnp DEFAULT_LOCAL_SIZE = [] diff --git a/numba_dppy/dispatcher.py b/numba_dppy/dispatcher.py index d00a597875..e4a9ddecd3 100644 --- a/numba_dppy/dispatcher.py +++ b/numba_dppy/dispatcher.py @@ -2,21 +2,22 @@ import numpy as np -#from numba.targets.descriptors import TargetDescriptor -#from numba.targets.options import TargetOptions -#import numba_dppy, numba_dppy as dppy +# from numba.targets.descriptors import TargetDescriptor +# from numba.targets.options import TargetOptions +# import numba_dppy, numba_dppy as dppy from numba_dppy import kernel, autojit from .descriptor import dppy_target -#from numba.npyufunc.deviceufunc import (UFuncMechanism, GenerializedUFunc, - # GUFuncCallSteps) + +# from numba.npyufunc.deviceufunc import (UFuncMechanism, GenerializedUFunc, +# GUFuncCallSteps) from .. import dispatcher, utils, typing from .compiler import DPPYCompiler + class DPPYDispatcher(dispatcher.Dispatcher): targetdescr = dppy_target - def __init__(self, py_func, locals={}, targetoptions={}): assert not locals self.py_func = py_func @@ -44,8 +45,7 @@ def __call__(self, *args, **kws): return self.compiled(*args, **kws) def disable_compile(self, val=True): - """Disable the compilation of new signatures at call time. - """ + """Disable the compilation of new signatures at call time.""" # Do nothing pass @@ -58,6 +58,7 @@ def __getitem__(self, *args): def __getattr__(self, key): return getattr(self.compiled, key) + class DPPYUFuncDispatcher(object): """ Invoke the OpenCL ufunc specialization for the given inputs. 
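As a reviewer aid (not itself part of the diff), a minimal usage sketch of the kernel decorator and the access_types mapping handled by decorators.py and compiler.py above. The access-type keys mirror the read_only/write_only/read_write strings checked in DPPYKernel; the device string, array sizes, and launch syntax below are illustrative assumptions rather than anything introduced by this patch.

import numpy as np
import dpctl
import numba_dppy as dppy

# access_types maps each access-type string validated by DPPYKernel to the
# kernel arguments it applies to; unlisted arguments fall back to the default
# copy-in/copy-out behavior shown in _unpack_argument/_pack_argument.
@dppy.kernel(access_types={"read_only": ["a", "b"], "write_only": ["c"]})
def vecadd(a, b, c):
    i = dppy.get_global_id(0)
    c[i] = a[i] + b[i]

a = np.arange(1024, dtype=np.float32)
b = np.ones(1024, dtype=np.float32)
c = np.zeros(1024, dtype=np.float32)

# The kernel is configured with (global size, local size) and is submitted to
# whatever SYCL queue dpctl makes current.
with dpctl.device_context("opencl:gpu"):
    vecadd[a.size, dppy.DEFAULT_LOCAL_SIZE](a, b, c)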
@@ -89,8 +90,7 @@ def __call__(self, *args, **kws): return DPPYUFuncMechanism.call(self.functions, args, kws) def reduce(self, arg, stream=0): - assert len(list(self.functions.keys())[0]) == 2, "must be a binary " \ - "ufunc" + assert len(list(self.functions.keys())[0]) == 2, "must be a binary " "ufunc" assert arg.ndim == 1, "must use 1d array" n = arg.shape[0] @@ -112,7 +112,12 @@ def reduce(self, arg, stream=0): # reduce by recursively spliting and operating out = self.__reduce(mem, gpu_mems, stream) # store the resultong scalar in a [1,] buffer - buf = np.empty([out.size,], dtype=out.dtype) + buf = np.empty( + [ + out.size, + ], + dtype=out.dtype, + ) # copy the result back to host out.copy_to_host(buf, stream=stream) @@ -144,7 +149,7 @@ def __reduce(self, mem, gpu_mems, stream): class _DPPYGUFuncCallSteps(GUFuncCallSteps): __slots__ = [ - '_stream', + "_stream", ] def is_device_array(self, obj): @@ -161,7 +166,7 @@ def device_array(self, shape, dtype): return ocl.device_array(shape=shape, dtype=dtype, stream=self._stream) def prepare_inputs(self): - self._stream = self.kwargs.get('stream', 0) + self._stream = self.kwargs.get("stream", 0) def launch_kernel(self, kernel, nelem, args): kernel.forall(nelem, queue=self._stream)(*args) @@ -173,27 +178,26 @@ def _call_steps(self): return _DPPYGUFuncCallSteps def _broadcast_scalar_input(self, ary, shape): - return devicearray.DeviceNDArray(shape=shape, - strides=(0,), - dtype=ary.dtype, - gpu_data=ary.gpu_data) + return devicearray.DeviceNDArray( + shape=shape, strides=(0,), dtype=ary.dtype, gpu_data=ary.gpu_data + ) def _broadcast_add_axis(self, ary, newshape): newax = len(newshape) - len(ary.shape) # Add 0 strides for missing dimension newstrides = (0,) * newax + ary.strides - return devicearray.DeviceNDArray(shape=newshape, - strides=newstrides, - dtype=ary.dtype, - gpu_data=ary.gpu_data) + return devicearray.DeviceNDArray( + shape=newshape, strides=newstrides, dtype=ary.dtype, gpu_data=ary.gpu_data + ) class DPPYUFuncMechanism(UFuncMechanism): """ Provide OpenCL specialization """ + DEFAULT_STREAM = 0 - ARRAY_ORDER = 'A' + ARRAY_ORDER = "A" def launch(self, func, count, stream, args): func.forall(count, queue=stream)(*args) @@ -211,9 +215,11 @@ def device_array(self, shape, dtype, stream): return ocl.device_array(shape=shape, dtype=dtype, stream=stream) def broadcast_device(self, ary, shape): - ax_differs = [ax for ax in range(len(shape)) - if ax >= ary.ndim - or ary.shape[ax] != shape[ax]] + ax_differs = [ + ax + for ax in range(len(shape)) + if ax >= ary.ndim or ary.shape[ax] != shape[ax] + ] missingdim = len(shape) - len(ary.shape) strides = [0] * missingdim + list(ary.strides) @@ -221,7 +227,6 @@ def broadcast_device(self, ary, shape): for ax in ax_differs: strides[ax] = 0 - return devicearray.DeviceNDArray(shape=shape, - strides=strides, - dtype=ary.dtype, - gpu_data=ary.gpu_data) + return devicearray.DeviceNDArray( + shape=shape, strides=strides, dtype=ary.dtype, gpu_data=ary.gpu_data + ) diff --git a/numba_dppy/dpnp_glue/__init__.py b/numba_dppy/dpnp_glue/__init__.py index 17d6b5ad6a..9e76480680 100644 --- a/numba_dppy/dpnp_glue/__init__.py +++ b/numba_dppy/dpnp_glue/__init__.py @@ -1,6 +1,6 @@ def ensure_dpnp(name): try: - # import dpnp + # import dpnp from . 
import dpnp_fptr_interface as dpnp_glue except ImportError: raise ImportError("dpNP is needed to call np.%s" % name) diff --git a/numba_dppy/dpnp_glue/dpnp_linalgimpl.py b/numba_dppy/dpnp_glue/dpnp_linalgimpl.py index 9146299b05..4b47382c57 100644 --- a/numba_dppy/dpnp_glue/dpnp_linalgimpl.py +++ b/numba_dppy/dpnp_glue/dpnp_linalgimpl.py @@ -7,6 +7,7 @@ import numpy as np from numba_dppy.dpctl_functions import _DPCTL_FUNCTIONS + @overload(stubs.dpnp.eig) def dpnp_eig_impl(a): name = "eig" @@ -22,10 +23,8 @@ def dpnp_eig_impl(a): void dpnp_eig_c(const void* array_in, void* result1, void* result2, size_t size) """ - sig = signature( - ret_type, types.voidptr, types.voidptr, types.voidptr, types.intp - ) - dpnp_eig = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig) + sig = signature(ret_type, types.voidptr, types.voidptr, types.voidptr, types.intp) + dpnp_eig = dpnp_ext.dpnp_func("dpnp_" + name, [a.dtype.name, "NONE"], sig) get_sycl_queue = dpctl_functions.dpctl_get_current_queue() allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() @@ -92,8 +91,14 @@ def dpnp_dot_impl(a, b): """ sig = signature( - ret_type, types.voidptr, types.voidptr, types.voidptr, - types.intp, types.intp, types.intp) + ret_type, + types.voidptr, + types.voidptr, + types.voidptr, + types.intp, + types.intp, + types.intp, + ) get_sycl_queue = dpctl_functions.dpctl_get_current_queue() allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() @@ -137,6 +142,7 @@ def dpnp_dot_impl(a, b): ndims = [a.ndim, b.ndim] if ndims == [2, 2]: dpnp_func = dpnp_ext.dpnp_func("dpnp_matmul", [a.dtype.name, "NONE"], sig) + def dot_2_mm(a, b): sycl_queue = get_sycl_queue() @@ -170,11 +176,12 @@ def dot_2_mm(a, b): return dot_2_mm elif ndims == [2, 1]: dpnp_func = dpnp_ext.dpnp_func("dpnp_matmul", [a.dtype.name, "NONE"], sig) + def dot_2_mv(a, b): sycl_queue = get_sycl_queue() m, k = a.shape - _n, = b.shape + (_n,) = b.shape n = 1 if _n != k: @@ -186,7 +193,7 @@ def dot_2_mv(a, b): b_usm = allocate_usm_shared(b.size * b.itemsize, sycl_queue) copy_usm(sycl_queue, b_usm, b.ctypes, b.size * b.itemsize) - out = np.empty((m, ), dtype=res_dtype) + out = np.empty((m,), dtype=res_dtype) out_usm = allocate_usm_shared(out.size * out.itemsize, sycl_queue) dpnp_func(a_usm, b_usm, out_usm, m, n, k) @@ -204,10 +211,11 @@ def dot_2_mv(a, b): return dot_2_mv elif ndims == [1, 2]: dpnp_func = dpnp_ext.dpnp_func("dpnp_matmul", [a.dtype.name, "NONE"], sig) + def dot_2_vm(a, b): sycl_queue = get_sycl_queue() - m, = a.shape + (m,) = a.shape k, n = b.shape if m != k: @@ -219,7 +227,7 @@ def dot_2_vm(a, b): b_usm = allocate_usm_shared(b.size * b.itemsize, sycl_queue) copy_usm(sycl_queue, b_usm, b.ctypes, b.size * b.itemsize) - out = np.empty((n, ), dtype=res_dtype) + out = np.empty((n,), dtype=res_dtype) out_usm = allocate_usm_shared(out.size * out.itemsize, sycl_queue) dpnp_func(a_usm, b_usm, out_usm, m, n, k) @@ -236,14 +244,16 @@ def dot_2_vm(a, b): return dot_2_vm elif ndims == [1, 1]: - sig = signature(ret_type, types.voidptr, types.voidptr, types.voidptr, - types.intp) + sig = signature( + ret_type, types.voidptr, types.voidptr, types.voidptr, types.intp + ) dpnp_func = dpnp_ext.dpnp_func("dpnp_dot", [a.dtype.name, "NONE"], sig) + def dot_2_vv(a, b): sycl_queue = get_sycl_queue() - m, = a.shape - n, = b.shape + (m,) = a.shape + (n,) = b.shape if m != n: raise ValueError("Incompatible array sizes for np.dot(a, b)") diff --git a/numba_dppy/dpnp_glue/dpnp_sort_search_countimpl.py b/numba_dppy/dpnp_glue/dpnp_sort_search_countimpl.py index 
8ec200059b..073b83e900 100644 --- a/numba_dppy/dpnp_glue/dpnp_sort_search_countimpl.py +++ b/numba_dppy/dpnp_glue/dpnp_sort_search_countimpl.py @@ -22,7 +22,9 @@ def dpnp_argmax_impl(a): void custom_argmax_c(void* array1_in, void* result1, size_t size) """ sig = signature(ret_type, types.voidptr, types.voidptr, types.intp) - dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, np.dtype(np.int64).name], sig) + dpnp_func = dpnp_ext.dpnp_func( + "dpnp_" + name, [a.dtype.name, np.dtype(np.int64).name], sig + ) get_sycl_queue = dpctl_functions.dpctl_get_current_queue() allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() @@ -72,7 +74,9 @@ def dpnp_argmin_impl(a): void custom_argmin_c(void* array1_in, void* result1, size_t size) """ sig = signature(ret_type, types.voidptr, types.voidptr, types.intp) - dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, np.dtype(np.int64).name], sig) + dpnp_func = dpnp_ext.dpnp_func( + "dpnp_" + name, [a.dtype.name, np.dtype(np.int64).name], sig + ) get_sycl_queue = dpctl_functions.dpctl_get_current_queue() allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() @@ -122,7 +126,7 @@ def dpnp_argsort_impl(a): void custom_argmin_c(void* array1_in, void* result1, size_t size) """ sig = signature(ret_type, types.voidptr, types.voidptr, types.intp) - dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig) + dpnp_func = dpnp_ext.dpnp_func("dpnp_" + name, [a.dtype.name, "NONE"], sig) get_sycl_queue = dpctl_functions.dpctl_get_current_queue() allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() diff --git a/numba_dppy/dpnp_glue/dpnp_statisticsimpl.py b/numba_dppy/dpnp_glue/dpnp_statisticsimpl.py index cae9507902..a3883dc860 100644 --- a/numba_dppy/dpnp_glue/dpnp_statisticsimpl.py +++ b/numba_dppy/dpnp_glue/dpnp_statisticsimpl.py @@ -28,10 +28,16 @@ def dpnp_amax_impl(a): if the compiler allows there should not be any mismatch in the size of the container to hold different types of pointer. """ - sig = signature(ret_type, types.voidptr, types.voidptr, - types.voidptr, types.intp, - types.voidptr, types.intp) - dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig) + sig = signature( + ret_type, + types.voidptr, + types.voidptr, + types.voidptr, + types.intp, + types.voidptr, + types.intp, + ) + dpnp_func = dpnp_ext.dpnp_func("dpnp_" + name, [a.dtype.name, "NONE"], sig) get_sycl_queue = dpctl_functions.dpctl_get_current_queue() allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() @@ -85,10 +91,16 @@ def dpnp_amin_impl(a): if the compiler allows there should not be any mismatch in the size of the container to hold different types of pointer. """ - sig = signature(ret_type, types.voidptr, types.voidptr, - types.voidptr, types.intp, - types.voidptr, types.intp) - dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig) + sig = signature( + ret_type, + types.voidptr, + types.voidptr, + types.voidptr, + types.intp, + types.voidptr, + types.intp, + ) + dpnp_func = dpnp_ext.dpnp_func("dpnp_" + name, [a.dtype.name, "NONE"], sig) get_sycl_queue = dpctl_functions.dpctl_get_current_queue() allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() @@ -141,10 +153,16 @@ def dpnp_mean_impl(a): if the compiler allows there should not be any mismatch in the size of the container to hold different types of pointer. 
""" - sig = signature(ret_type, types.voidptr, types.voidptr, - types.voidptr, types.intp, - types.voidptr, types.intp) - dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig) + sig = signature( + ret_type, + types.voidptr, + types.voidptr, + types.voidptr, + types.intp, + types.voidptr, + types.intp, + ) + dpnp_func = dpnp_ext.dpnp_func("dpnp_" + name, [a.dtype.name, "NONE"], sig) get_sycl_queue = dpctl_functions.dpctl_get_current_queue() allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() @@ -200,10 +218,16 @@ def dpnp_median_impl(a): if the compiler allows there should not be any mismatch in the size of the container to hold different types of pointer. """ - sig = signature(ret_type, types.voidptr, types.voidptr, - types.voidptr, types.intp, - types.voidptr, types.intp) - dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig) + sig = signature( + ret_type, + types.voidptr, + types.voidptr, + types.voidptr, + types.intp, + types.voidptr, + types.intp, + ) + dpnp_func = dpnp_ext.dpnp_func("dpnp_" + name, [a.dtype.name, "NONE"], sig) get_sycl_queue = dpctl_functions.dpctl_get_current_queue() allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() @@ -254,9 +278,8 @@ def dpnp_cov_impl(a): Function declaration: void custom_cov_c(void* array1_in, void* result1, size_t nrows, size_t ncols) """ - sig = signature(ret_type, types.voidptr, types.voidptr, - types.intp, types.intp) - dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig) + sig = signature(ret_type, types.voidptr, types.voidptr, types.intp, types.intp) + dpnp_func = dpnp_ext.dpnp_func("dpnp_" + name, [a.dtype.name, "NONE"], sig) get_sycl_queue = dpctl_functions.dpctl_get_current_queue() allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() @@ -268,7 +291,6 @@ def dpnp_cov_impl(a): if a.dtype == types.float64: copy_input_to_double = False - def dpnp_impl(a): if a.size == 0: raise ValueError("Passed Empty array") @@ -280,9 +302,15 @@ def dpnp_impl(a): a_copy_in_double = a.astype(np.float64) else: a_copy_in_double = a - a_usm = allocate_usm_shared(a_copy_in_double.size * a_copy_in_double.itemsize, sycl_queue) - copy_usm(sycl_queue, a_usm, a_copy_in_double.ctypes, - a_copy_in_double.size * a_copy_in_double.itemsize) + a_usm = allocate_usm_shared( + a_copy_in_double.size * a_copy_in_double.itemsize, sycl_queue + ) + copy_usm( + sycl_queue, + a_usm, + a_copy_in_double.ctypes, + a_copy_in_double.size * a_copy_in_double.itemsize, + ) if a.ndim == 2: rows = a.shape[0] diff --git a/numba_dppy/dpnp_glue/dpnp_transcendentalsimpl.py b/numba_dppy/dpnp_glue/dpnp_transcendentalsimpl.py index f7ba425206..af79ad2c1c 100644 --- a/numba_dppy/dpnp_glue/dpnp_transcendentalsimpl.py +++ b/numba_dppy/dpnp_glue/dpnp_transcendentalsimpl.py @@ -23,7 +23,7 @@ def dpnp_sum_impl(a): """ sig = signature(ret_type, types.voidptr, types.voidptr, types.intp) - dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig) + dpnp_func = dpnp_ext.dpnp_func("dpnp_" + name, [a.dtype.name, "NONE"], sig) get_sycl_queue = dpctl_functions.dpctl_get_current_queue() allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() @@ -70,7 +70,7 @@ def dpnp_prod_impl(a): void custom_prod_c(void* array1_in, void* result1, size_t size) """ sig = signature(ret_type, types.voidptr, types.voidptr, types.intp) - dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig) + dpnp_func = dpnp_ext.dpnp_func("dpnp_" + name, [a.dtype.name, "NONE"], sig) get_sycl_queue = 
dpctl_functions.dpctl_get_current_queue() allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() diff --git a/numba_dppy/dpnp_glue/dpnpdecl.py b/numba_dppy/dpnp_glue/dpnpdecl.py index ce1f7d3583..373018c8db 100644 --- a/numba_dppy/dpnp_glue/dpnpdecl.py +++ b/numba_dppy/dpnp_glue/dpnpdecl.py @@ -1,8 +1,9 @@ -from numba.core.typing.templates import (AttributeTemplate, infer_getattr) +from numba.core.typing.templates import AttributeTemplate, infer_getattr import numba_dppy from numba import types from numba.core.types.misc import RawPointer + @infer_getattr class DppyDpnpTemplate(AttributeTemplate): key = types.Module(numba_dppy) @@ -10,11 +11,14 @@ class DppyDpnpTemplate(AttributeTemplate): def resolve_dpnp(self, mod): return types.Module(numba_dppy.dpnp) + """ This adds a shapeptr attribute to Numba type representing np.ndarray. This allows us to get the raw pointer to the structure where the shape of an ndarray is stored from an overloaded implementation """ + + @infer_getattr class ArrayAttribute(AttributeTemplate): key = types.Array diff --git a/numba_dppy/dpnp_glue/dpnpimpl.py b/numba_dppy/dpnp_glue/dpnpimpl.py index fa429f923f..a3dc5ce195 100644 --- a/numba_dppy/dpnp_glue/dpnpimpl.py +++ b/numba_dppy/dpnp_glue/dpnpimpl.py @@ -8,23 +8,27 @@ ll_void_p = ir.IntType(8).as_pointer() + def get_dpnp_fptr(fn_name, type_names): from . import dpnp_fptr_interface as dpnp_glue f_ptr = dpnp_glue.get_dpnp_fn_ptr(fn_name, type_names) return f_ptr + @register_jitable def _check_finite_matrix(a): for v in np.nditer(a): if not np.isfinite(v.item()): raise np.linalg.LinAlgError("Array must not contain infs or NaNs.") + @register_jitable def _dummy_liveness_func(a): """pass a list of variables to be preserved through dead code elimination""" return a[0] + def dpnp_func(fn_name, type_names, sig): f_ptr = get_dpnp_fptr(fn_name, type_names) @@ -33,15 +37,19 @@ def get_pointer(obj): return types.ExternalFunctionPointer(sig, get_pointer=get_pointer) + """ This function retrieves the pointer to the structure where the shape of an ndarray is stored. We cast it to void * to make it easier to pass around. 
""" + + @lower_getattr(types.Array, "shapeptr") def array_shape(context, builder, typ, value): - shape_ptr = builder.gep(value.operands[0], - [context.get_constant(types.int32, 0), - context.get_constant(types.int32, 5)]) + shape_ptr = builder.gep( + value.operands[0], + [context.get_constant(types.int32, 0), context.get_constant(types.int32, 5)], + ) return builder.bitcast(shape_ptr, ll_void_p) diff --git a/numba_dppy/dpnp_glue/stubs.py b/numba_dppy/dpnp_glue/stubs.py index 2fdd6ecbe3..fa7e06ea48 100644 --- a/numba_dppy/dpnp_glue/stubs.py +++ b/numba_dppy/dpnp_glue/stubs.py @@ -1,9 +1,10 @@ from numba_dppy.ocl.stubs import Stub + class dpnp(Stub): - """dpnp namespace - """ - _description_ = '' + """dpnp namespace""" + + _description_ = "" class sum(Stub): pass diff --git a/numba_dppy/dppy_host_fn_call_gen.py b/numba_dppy/dppy_host_fn_call_gen.py index 2808ddf90d..585d461127 100644 --- a/numba_dppy/dppy_host_fn_call_gen.py +++ b/numba_dppy/dppy_host_fn_call_gen.py @@ -9,6 +9,7 @@ from numba.core.ir_utils import legalize_names + class DPPYHostFunctionCallsGenerator(object): def __init__(self, lowerer, cres, num_inputs): self.lowerer = lowerer @@ -27,8 +28,8 @@ def __init__(self, lowerer, cres, num_inputs): self.null_ptr = self._create_null_ptr() self.total_kernel_args = 0 - self.cur_arg = 0 - self.num_inputs = num_inputs + self.cur_arg = 0 + self.num_inputs = num_inputs # list of buffer that needs to comeback to host self.write_buffs = [] @@ -36,65 +37,91 @@ def __init__(self, lowerer, cres, num_inputs): # list of buffer that does not need to comeback to host self.read_only_buffs = [] - def _create_null_ptr(self): - null_ptr = cgutils.alloca_once(self.builder, self.void_ptr_t, - size=self.context.get_constant(types.uintp, 1), name="null_ptr") + null_ptr = cgutils.alloca_once( + self.builder, + self.void_ptr_t, + size=self.context.get_constant(types.uintp, 1), + name="null_ptr", + ) self.builder.store( self.builder.inttoptr( - self.context.get_constant(types.uintp, 0), self.void_ptr_t), - null_ptr) + self.context.get_constant(types.uintp, 0), self.void_ptr_t + ), + null_ptr, + ) return null_ptr - def _init_llvm_types_and_constants(self): - self.byte_t = lc.Type.int(8) - self.byte_ptr_t = lc.Type.pointer(self.byte_t) - self.byte_ptr_ptr_t = lc.Type.pointer(self.byte_ptr_t) - self.intp_t = self.context.get_value_type(types.intp) - self.int64_t = self.context.get_value_type(types.int64) - self.int32_t = self.context.get_value_type(types.int32) - self.int32_ptr_t = lc.Type.pointer(self.int32_t) - self.uintp_t = self.context.get_value_type(types.uintp) - self.intp_ptr_t = lc.Type.pointer(self.intp_t) - self.uintp_ptr_t = lc.Type.pointer(self.uintp_t) - self.zero = self.context.get_constant(types.uintp, 0) - self.one = self.context.get_constant(types.uintp, 1) - self.one_type = self.one.type - self.sizeof_intp = self.context.get_abi_sizeof(self.intp_t) - self.void_ptr_t = self.context.get_value_type(types.voidptr) - self.void_ptr_ptr_t = lc.Type.pointer(self.void_ptr_t) + self.byte_t = lc.Type.int(8) + self.byte_ptr_t = lc.Type.pointer(self.byte_t) + self.byte_ptr_ptr_t = lc.Type.pointer(self.byte_ptr_t) + self.intp_t = self.context.get_value_type(types.intp) + self.int64_t = self.context.get_value_type(types.int64) + self.int32_t = self.context.get_value_type(types.int32) + self.int32_ptr_t = lc.Type.pointer(self.int32_t) + self.uintp_t = self.context.get_value_type(types.uintp) + self.intp_ptr_t = lc.Type.pointer(self.intp_t) + self.uintp_ptr_t = lc.Type.pointer(self.uintp_t) + self.zero = 
self.context.get_constant(types.uintp, 0) + self.one = self.context.get_constant(types.uintp, 1) + self.one_type = self.one.type + self.sizeof_intp = self.context.get_abi_sizeof(self.intp_t) + self.void_ptr_t = self.context.get_value_type(types.voidptr) + self.void_ptr_ptr_t = lc.Type.pointer(self.void_ptr_t) self.sizeof_void_ptr = self.context.get_abi_sizeof(self.intp_t) self.sycl_queue_val = None def _declare_functions(self): get_queue_fnty = lc.Type.function(self.void_ptr_t, ()) - self.get_queue = self.builder.module.get_or_insert_function(get_queue_fnty, - name="DPCTLQueueMgr_GetCurrentQueue") - - submit_range_fnty = lc.Type.function(self.void_ptr_t, - [self.void_ptr_t, self.void_ptr_t, self.void_ptr_ptr_t, - self.int32_ptr_t, self.intp_t, self.intp_ptr_t, - self.intp_t, self.void_ptr_t, self.intp_t]) - self.submit_range = self.builder.module.get_or_insert_function(submit_range_fnty, - name="DPCTLQueue_SubmitRange") - - - queue_memcpy_fnty = lc.Type.function(lir.VoidType(), [self.void_ptr_t, self.void_ptr_t, self.void_ptr_t, self.intp_t]) - self.queue_memcpy = self.builder.module.get_or_insert_function(queue_memcpy_fnty, - name="DPCTLQueue_Memcpy") - - queue_wait_fnty = lc.Type.function(lir.VoidType(), [self.void_ptr_t]) - self.queue_wait = self.builder.module.get_or_insert_function(queue_wait_fnty, - name="DPCTLQueue_Wait") - - usm_shared_fnty = lc.Type.function(self.void_ptr_t, [self.intp_t, self.void_ptr_t]) - self.usm_shared = self.builder.module.get_or_insert_function(usm_shared_fnty, - name="DPCTLmalloc_shared") - - usm_free_fnty = lc.Type.function(lir.VoidType(), [self.void_ptr_t, self.void_ptr_t]) - self.usm_free = self.builder.module.get_or_insert_function(usm_free_fnty, - name="DPCTLfree_with_queue") + self.get_queue = self.builder.module.get_or_insert_function( + get_queue_fnty, name="DPCTLQueueMgr_GetCurrentQueue" + ) + + submit_range_fnty = lc.Type.function( + self.void_ptr_t, + [ + self.void_ptr_t, + self.void_ptr_t, + self.void_ptr_ptr_t, + self.int32_ptr_t, + self.intp_t, + self.intp_ptr_t, + self.intp_t, + self.void_ptr_t, + self.intp_t, + ], + ) + self.submit_range = self.builder.module.get_or_insert_function( + submit_range_fnty, name="DPCTLQueue_SubmitRange" + ) + + queue_memcpy_fnty = lc.Type.function( + lir.VoidType(), + [self.void_ptr_t, self.void_ptr_t, self.void_ptr_t, self.intp_t], + ) + self.queue_memcpy = self.builder.module.get_or_insert_function( + queue_memcpy_fnty, name="DPCTLQueue_Memcpy" + ) + + queue_wait_fnty = lc.Type.function(lir.VoidType(), [self.void_ptr_t]) + self.queue_wait = self.builder.module.get_or_insert_function( + queue_wait_fnty, name="DPCTLQueue_Wait" + ) + + usm_shared_fnty = lc.Type.function( + self.void_ptr_t, [self.intp_t, self.void_ptr_t] + ) + self.usm_shared = self.builder.module.get_or_insert_function( + usm_shared_fnty, name="DPCTLmalloc_shared" + ) + + usm_free_fnty = lc.Type.function( + lir.VoidType(), [self.void_ptr_t, self.void_ptr_t] + ) + self.usm_free = self.builder.module.get_or_insert_function( + usm_free_fnty, name="DPCTLfree_with_queue" + ) def allocate_kenrel_arg_array(self, num_kernel_args): self.sycl_queue_val = cgutils.alloca_once(self.builder, self.void_ptr_t) @@ -104,17 +131,21 @@ def allocate_kenrel_arg_array(self, num_kernel_args): # we need a kernel arg array to enqueue self.kernel_arg_array = cgutils.alloca_once( - self.builder, self.void_ptr_t, size=self.context.get_constant( - types.uintp, num_kernel_args), name="kernel_arg_array") + self.builder, + self.void_ptr_t, + 
size=self.context.get_constant(types.uintp, num_kernel_args), + name="kernel_arg_array", + ) self.kernel_arg_ty_array = cgutils.alloca_once( - self.builder, self.int32_t, size=self.context.get_constant( - types.uintp, num_kernel_args), name="kernel_arg_ty_array") - + self.builder, + self.int32_t, + size=self.context.get_constant(types.uintp, num_kernel_args), + name="kernel_arg_ty_array", + ) def resolve_and_return_dpctl_type(self, ty): - """This function looks up the dpctl defined enum values from DPCTLKernelArgType. - """ + """This function looks up the dpctl defined enum values from DPCTLKernelArgType.""" val = None if ty == types.int32 or isinstance(ty, types.scalars.IntegerLiteral): @@ -136,20 +167,26 @@ def resolve_and_return_dpctl_type(self, ty): else: raise NotImplementedError - assert(val != None) + assert val != None return val - def form_kernel_arg_and_arg_ty(self, val, ty): - kernel_arg_dst = self.builder.gep(self.kernel_arg_array, [self.context.get_constant(types.int32, self.cur_arg)]) - kernel_arg_ty_dst = self.builder.gep(self.kernel_arg_ty_array, [self.context.get_constant(types.int32, self.cur_arg)]) + kernel_arg_dst = self.builder.gep( + self.kernel_arg_array, + [self.context.get_constant(types.int32, self.cur_arg)], + ) + kernel_arg_ty_dst = self.builder.gep( + self.kernel_arg_ty_array, + [self.context.get_constant(types.int32, self.cur_arg)], + ) self.cur_arg += 1 self.builder.store(val, kernel_arg_dst) self.builder.store(ty, kernel_arg_ty_dst) - - def process_kernel_arg(self, var, llvm_arg, arg_type, gu_sig, val_type, index, modified_arrays): + def process_kernel_arg( + self, var, llvm_arg, arg_type, gu_sig, val_type, index, modified_arrays + ): if isinstance(arg_type, types.npytypes.Array): if llvm_arg is None: raise NotImplementedError(arg_type, var) @@ -157,50 +194,83 @@ def process_kernel_arg(self, var, llvm_arg, arg_type, gu_sig, val_type, index, m storage = cgutils.alloca_once(self.builder, self.int64_t) self.builder.store(self.context.get_constant(types.int64, 0), storage) ty = self.resolve_and_return_dpctl_type(types.int64) - self.form_kernel_arg_and_arg_ty(self.builder.bitcast(storage, self.void_ptr_t), ty) + self.form_kernel_arg_and_arg_ty( + self.builder.bitcast(storage, self.void_ptr_t), ty + ) storage = cgutils.alloca_once(self.builder, self.int64_t) self.builder.store(self.context.get_constant(types.int64, 0), storage) ty = self.resolve_and_return_dpctl_type(types.int64) - self.form_kernel_arg_and_arg_ty(self.builder.bitcast(storage, self.void_ptr_t), ty) - + self.form_kernel_arg_and_arg_ty( + self.builder.bitcast(storage, self.void_ptr_t), ty + ) # Handle array size - array_size_member = self.builder.gep(llvm_arg, - [self.context.get_constant(types.int32, 0), self.context.get_constant(types.int32, 2)]) - - ty = self.resolve_and_return_dpctl_type(types.int64) - self.form_kernel_arg_and_arg_ty(self.builder.bitcast(array_size_member, self.void_ptr_t), ty) + array_size_member = self.builder.gep( + llvm_arg, + [ + self.context.get_constant(types.int32, 0), + self.context.get_constant(types.int32, 2), + ], + ) + ty = self.resolve_and_return_dpctl_type(types.int64) + self.form_kernel_arg_and_arg_ty( + self.builder.bitcast(array_size_member, self.void_ptr_t), ty + ) # Handle itemsize - item_size_member = self.builder.gep(llvm_arg, - [self.context.get_constant(types.int32, 0), self.context.get_constant(types.int32, 3)]) - - ty = self.resolve_and_return_dpctl_type(types.int64) - self.form_kernel_arg_and_arg_ty(self.builder.bitcast(item_size_member, 
self.void_ptr_t), ty) + item_size_member = self.builder.gep( + llvm_arg, + [ + self.context.get_constant(types.int32, 0), + self.context.get_constant(types.int32, 3), + ], + ) + ty = self.resolve_and_return_dpctl_type(types.int64) + self.form_kernel_arg_and_arg_ty( + self.builder.bitcast(item_size_member, self.void_ptr_t), ty + ) # Calculate total buffer size - total_size = cgutils.alloca_once(self.builder, self.intp_t, - size=self.one, name="total_size" + str(self.cur_arg)) - self.builder.store(self.builder.sext(self.builder.mul(self.builder.load(array_size_member), - self.builder.load(item_size_member)), self.intp_t), total_size) + total_size = cgutils.alloca_once( + self.builder, + self.intp_t, + size=self.one, + name="total_size" + str(self.cur_arg), + ) + self.builder.store( + self.builder.sext( + self.builder.mul( + self.builder.load(array_size_member), + self.builder.load(item_size_member), + ), + self.intp_t, + ), + total_size, + ) # Handle data - data_member = self.builder.gep(llvm_arg, - [self.context.get_constant(types.int32, 0), self.context.get_constant(types.int32, 4)]) + data_member = self.builder.gep( + llvm_arg, + [ + self.context.get_constant(types.int32, 0), + self.context.get_constant(types.int32, 4), + ], + ) buffer_name = "buffer_ptr" + str(self.cur_arg) - buffer_ptr = cgutils.alloca_once(self.builder, self.void_ptr_t, - name=buffer_name) + buffer_ptr = cgutils.alloca_once( + self.builder, self.void_ptr_t, name=buffer_name + ) - - args = [self.builder.load(total_size), - self.builder.load(self.sycl_queue_val)] + args = [ + self.builder.load(total_size), + self.builder.load(self.sycl_queue_val), + ] self.builder.store(self.builder.call(self.usm_shared, args), buffer_ptr) - # names are replaces usig legalize names, we have to do the same for them to match legal_names = legalize_names([var]) @@ -211,46 +281,70 @@ def process_kernel_arg(self, var, llvm_arg, arg_type, gu_sig, val_type, index, m # We really need to detect when an array needs to be copied over if index < self.num_inputs: - args = [self.builder.load(self.sycl_queue_val), - self.builder.load(buffer_ptr), - self.builder.bitcast(self.builder.load(data_member), self.void_ptr_t), - self.builder.load(total_size)] + args = [ + self.builder.load(self.sycl_queue_val), + self.builder.load(buffer_ptr), + self.builder.bitcast( + self.builder.load(data_member), self.void_ptr_t + ), + self.builder.load(total_size), + ] self.builder.call(self.queue_memcpy, args) - - ty = self.resolve_and_return_dpctl_type(types.voidptr) + ty = self.resolve_and_return_dpctl_type(types.voidptr) self.form_kernel_arg_and_arg_ty(self.builder.load(buffer_ptr), ty) # Handle shape - shape_member = self.builder.gep(llvm_arg, - [self.context.get_constant(types.int32, 0), - self.context.get_constant(types.int32, 5)]) + shape_member = self.builder.gep( + llvm_arg, + [ + self.context.get_constant(types.int32, 0), + self.context.get_constant(types.int32, 5), + ], + ) for this_dim in range(arg_type.ndim): - shape_entry = self.builder.gep(shape_member, - [self.context.get_constant(types.int32, 0), - self.context.get_constant(types.int32, this_dim)]) - - ty = self.resolve_and_return_dpctl_type(types.int64) - self.form_kernel_arg_and_arg_ty(self.builder.bitcast(shape_entry, self.void_ptr_t), ty) - + shape_entry = self.builder.gep( + shape_member, + [ + self.context.get_constant(types.int32, 0), + self.context.get_constant(types.int32, this_dim), + ], + ) + + ty = self.resolve_and_return_dpctl_type(types.int64) + self.form_kernel_arg_and_arg_ty( + 
self.builder.bitcast(shape_entry, self.void_ptr_t), ty + ) # Handle strides - stride_member = self.builder.gep(llvm_arg, - [self.context.get_constant(types.int32, 0), - self.context.get_constant(types.int32, 6)]) + stride_member = self.builder.gep( + llvm_arg, + [ + self.context.get_constant(types.int32, 0), + self.context.get_constant(types.int32, 6), + ], + ) for this_stride in range(arg_type.ndim): - stride_entry = self.builder.gep(stride_member, - [self.context.get_constant(types.int32, 0), - self.context.get_constant(types.int32, this_stride)]) - - ty = self.resolve_and_return_dpctl_type(types.int64) - self.form_kernel_arg_and_arg_ty(self.builder.bitcast(stride_entry, self.void_ptr_t), ty) + stride_entry = self.builder.gep( + stride_member, + [ + self.context.get_constant(types.int32, 0), + self.context.get_constant(types.int32, this_stride), + ], + ) + + ty = self.resolve_and_return_dpctl_type(types.int64) + self.form_kernel_arg_and_arg_ty( + self.builder.bitcast(stride_entry, self.void_ptr_t), ty + ) else: - ty = self.resolve_and_return_dpctl_type(arg_type) - self.form_kernel_arg_and_arg_ty(self.builder.bitcast(llvm_arg, self.void_ptr_t), ty) + ty = self.resolve_and_return_dpctl_type(arg_type) + self.form_kernel_arg_and_arg_ty( + self.builder.bitcast(llvm_arg, self.void_ptr_t), ty + ) def enqueue_kernel_and_read_back(self, loop_ranges): # the assumption is loop_ranges will always be less than or equal to 3 dimensions @@ -258,8 +352,11 @@ def enqueue_kernel_and_read_back(self, loop_ranges): # form the global range global_range = cgutils.alloca_once( - self.builder, self.uintp_t, - size=self.context.get_constant(types.uintp, num_dim), name="global_range") + self.builder, + self.uintp_t, + size=self.context.get_constant(types.uintp, num_dim), + name="global_range", + ) for i in range(num_dim): start, stop, step = loop_ranges[i] @@ -267,20 +364,28 @@ def enqueue_kernel_and_read_back(self, loop_ranges): stop = self.builder.sext(stop, self.one_type) # we reverse the global range to account for how sycl and opencl range differs - self.builder.store(stop, - self.builder.gep(global_range, [self.context.get_constant(types.uintp, (num_dim-1)-i)])) - - - args = [self.builder.inttoptr(self.context.get_constant(types.uintp, self.kernel_addr), self.void_ptr_t), - self.builder.load(self.sycl_queue_val), - self.kernel_arg_array, - self.kernel_arg_ty_array, - self.context.get_constant(types.uintp, self.total_kernel_args), - self.builder.bitcast(global_range, self.intp_ptr_t), - self.context.get_constant(types.uintp, num_dim), - self.builder.bitcast(self.null_ptr, self.void_ptr_t), - self.context.get_constant(types.uintp, 0) - ] + self.builder.store( + stop, + self.builder.gep( + global_range, + [self.context.get_constant(types.uintp, (num_dim - 1) - i)], + ), + ) + + args = [ + self.builder.inttoptr( + self.context.get_constant(types.uintp, self.kernel_addr), + self.void_ptr_t, + ), + self.builder.load(self.sycl_queue_val), + self.kernel_arg_array, + self.kernel_arg_ty_array, + self.context.get_constant(types.uintp, self.total_kernel_args), + self.builder.bitcast(global_range, self.intp_ptr_t), + self.context.get_constant(types.uintp, num_dim), + self.builder.bitcast(self.null_ptr, self.void_ptr_t), + self.context.get_constant(types.uintp, 0), + ] self.builder.call(self.submit_range, args) self.builder.call(self.queue_wait, [self.builder.load(self.sycl_queue_val)]) @@ -288,14 +393,22 @@ def enqueue_kernel_and_read_back(self, loop_ranges): # read buffers back to host for write_buff in 
self.write_buffs: buffer_ptr, total_size, data_member = write_buff - args = [self.builder.load(self.sycl_queue_val), - self.builder.bitcast(self.builder.load(data_member), self.void_ptr_t), - self.builder.load(buffer_ptr), - self.builder.load(total_size)] + args = [ + self.builder.load(self.sycl_queue_val), + self.builder.bitcast(self.builder.load(data_member), self.void_ptr_t), + self.builder.load(buffer_ptr), + self.builder.load(total_size), + ] self.builder.call(self.queue_memcpy, args) - self.builder.call(self.usm_free, [self.builder.load(buffer_ptr), self.builder.load(self.sycl_queue_val)]) + self.builder.call( + self.usm_free, + [self.builder.load(buffer_ptr), self.builder.load(self.sycl_queue_val)], + ) for read_buff in self.read_only_buffs: buffer_ptr, total_size, data_member = read_buff - self.builder.call(self.usm_free, [self.builder.load(buffer_ptr), self.builder.load(self.sycl_queue_val)]) + self.builder.call( + self.usm_free, + [self.builder.load(buffer_ptr), self.builder.load(self.sycl_queue_val)], + ) diff --git a/numba_dppy/dppy_lowerer.py b/numba_dppy/dppy_lowerer.py index 3040362592..420414ec2f 100644 --- a/numba_dppy/dppy_lowerer.py +++ b/numba_dppy/dppy_lowerer.py @@ -9,28 +9,29 @@ import numpy as np import numba -from numba.core import (compiler, ir, types, sigutils, lowering, - funcdesc, config) +from numba.core import compiler, ir, types, sigutils, lowering, funcdesc, config from numba.parfors import parfor import numba_dppy, numba_dppy as dppy -from numba.core.ir_utils import (add_offset_to_labels, - replace_var_names, - remove_dels, - legalize_names, - mk_unique_var, - rename_labels, - get_name_var_table, - visit_vars_inner, - guard, - find_callname, - remove_dead, - get_call_table, - is_pure, - build_definitions, - get_np_ufunc_typ, - get_unused_var_name, - find_potential_aliases, - is_const_call) +from numba.core.ir_utils import ( + add_offset_to_labels, + replace_var_names, + remove_dels, + legalize_names, + mk_unique_var, + rename_labels, + get_name_var_table, + visit_vars_inner, + guard, + find_callname, + remove_dead, + get_call_table, + is_pure, + build_definitions, + get_np_ufunc_typ, + get_unused_var_name, + find_potential_aliases, + is_const_call, +) from numba.core.typing import signature @@ -47,9 +48,9 @@ def _print_block(block): for i, inst in enumerate(block.body): print(" ", i, inst) + def _print_body(body_dict): - '''Pretty-print a set of IR blocks. 
- ''' + """Pretty-print a set of IR blocks.""" for label, block in body_dict.items(): print("label: ", label) _print_block(block) @@ -61,9 +62,9 @@ def _print_body(body_dict): # through OpenCL and generate for loops for the remaining # dimensions def _schedule_loop(parfor_dim, legal_loop_indices, loop_ranges, param_dict): - gufunc_txt = "" + gufunc_txt = "" global_id_dim = 0 - for_loop_dim = parfor_dim + for_loop_dim = parfor_dim if parfor_dim > 3: global_id_dim = 3 @@ -71,9 +72,14 @@ def _schedule_loop(parfor_dim, legal_loop_indices, loop_ranges, param_dict): global_id_dim = parfor_dim for eachdim in range(global_id_dim): - gufunc_txt += (" " + legal_loop_indices[eachdim] + " = " - + "dppy.get_global_id(" + str(eachdim) + ")\n") - + gufunc_txt += ( + " " + + legal_loop_indices[eachdim] + + " = " + + "dppy.get_global_id(" + + str(eachdim) + + ")\n" + ) for eachdim in range(global_id_dim, for_loop_dim): for indent in range(1 + (eachdim - global_id_dim)): @@ -82,11 +88,15 @@ def _schedule_loop(parfor_dim, legal_loop_indices, loop_ranges, param_dict): start, stop, step = loop_ranges[eachdim] start = param_dict.get(str(start), start) stop = param_dict.get(str(stop), stop) - gufunc_txt += ("for " + - legal_loop_indices[eachdim] + - " in range(" + str(start) + - ", " + str(stop) + - " + 1):\n") + gufunc_txt += ( + "for " + + legal_loop_indices[eachdim] + + " in range(" + + str(start) + + ", " + + str(stop) + + " + 1):\n" + ) for eachdim in range(global_id_dim, for_loop_dim): for indent in range(1 + (eachdim - global_id_dim)): @@ -114,17 +124,18 @@ def _dbgprint_after_each_array_assignments(lowerer, loop_body, typemap): strconsttyp = types.StringLiteral(strval) lhs = ir.Var(scope, mk_unique_var("str_const"), loc) - assign_lhs = ir.Assign(value=ir.Const(value=strval, loc=loc), - target=lhs, loc=loc) + assign_lhs = ir.Assign( + value=ir.Const(value=strval, loc=loc), target=lhs, loc=loc + ) typemap[lhs.name] = strconsttyp new_block.append(assign_lhs) # Make print node - print_node = ir.Print(args=[lhs, inst.target], vararg=None, - loc=loc) + print_node = ir.Print(args=[lhs, inst.target], vararg=None, loc=loc) new_block.append(print_node) - sig = numba.typing.signature(types.none, typemap[lhs.name], - typemap[inst.target.name]) + sig = numba.typing.signature( + types.none, typemap[lhs.name], typemap[inst.target.name] + ) lowerer.fndesc.calltypes[print_node] = sig loop_body[label] = new_block @@ -134,33 +145,36 @@ def replace_var_with_array_in_block(vars, block, typemap, calltypes): for inst in block.body: if isinstance(inst, ir.Assign) and inst.target.name in vars: const_node = ir.Const(0, inst.loc) - const_var = ir.Var(inst.target.scope, mk_unique_var("$const_ind_0"), - inst.loc) + const_var = ir.Var( + inst.target.scope, mk_unique_var("$const_ind_0"), inst.loc + ) typemap[const_var.name] = types.uintp const_assign = ir.Assign(const_node, const_var, inst.loc) new_block.append(const_assign) - setitem_node = ir.SetItem(inst.target, const_var, inst.value, - inst.loc) + setitem_node = ir.SetItem(inst.target, const_var, inst.value, inst.loc) calltypes[setitem_node] = signature( - types.none, types.npytypes.Array(typemap[inst.target.name], 1, - "C"), types.intp, - typemap[inst.target.name]) + types.none, + types.npytypes.Array(typemap[inst.target.name], 1, "C"), + types.intp, + typemap[inst.target.name], + ) new_block.append(setitem_node) continue elif isinstance(inst, parfor.Parfor): - replace_var_with_array_internal(vars, {0: inst.init_block}, - typemap, calltypes) - 
replace_var_with_array_internal(vars, inst.loop_body, - typemap, calltypes) + replace_var_with_array_internal( + vars, {0: inst.init_block}, typemap, calltypes + ) + replace_var_with_array_internal(vars, inst.loop_body, typemap, calltypes) new_block.append(inst) return new_block + def replace_var_with_array_internal(vars, loop_body, typemap, calltypes): for label, block in loop_body.items(): - block.body = replace_var_with_array_in_block(vars, block, typemap, - calltypes) + block.body = replace_var_with_array_in_block(vars, block, typemap, calltypes) + def replace_var_with_array(vars, loop_body, typemap, calltypes): replace_var_with_array_internal(vars, loop_body, typemap, calltypes) @@ -178,17 +192,18 @@ def wrap_loop_body(loop_body): blocks[last_label].body.append(ir.Jump(first_label, loc)) return blocks + def unwrap_loop_body(loop_body): last_label = max(loop_body.keys()) loop_body[last_label].body = loop_body[last_label].body[:-1] def legalize_names_with_typemap(names, typemap): - """ We use ir_utils.legalize_names to replace internal IR variable names - containing illegal characters (e.g. period) with a legal character - (underscore) so as to create legal variable names. - The original variable names are in the typemap so we also - need to add the legalized name to the typemap as well. + """We use ir_utils.legalize_names to replace internal IR variable names + containing illegal characters (e.g. period) with a legal character + (underscore) so as to create legal variable names. + The original variable names are in the typemap so we also + need to add the legalized name to the typemap as well. """ outdict = legalize_names(names) # For each pair in the dict of legalized names... @@ -206,6 +221,7 @@ def to_scalar_from_0d(x): return x.dtype return x + def find_setitems_block(setitems, block, typemap): for inst in block.body: if isinstance(inst, ir.StaticSetItem) or isinstance(inst, ir.SetItem): @@ -214,15 +230,17 @@ def find_setitems_block(setitems, block, typemap): find_setitems_block(setitems, inst.init_block, typemap) find_setitems_body(setitems, inst.loop_body, typemap) + def find_setitems_body(setitems, loop_body, typemap): """ - Find the arrays that are written into (goes into setitems) + Find the arrays that are written into (goes into setitems) """ for label, block in loop_body.items(): find_setitems_block(setitems, block, typemap) + def _create_gufunc_for_regular_parfor(): - #TODO + # TODO pass @@ -231,18 +249,19 @@ def _create_gufunc_for_reduction_parfor(): def _create_gufunc_for_parfor_body( - lowerer, - parfor, - typemap, - typingctx, - targetctx, - flags, - loop_ranges, - locals, - has_aliases, - index_var_typ, - races): - ''' + lowerer, + parfor, + typemap, + typingctx, + targetctx, + flags, + loop_ranges, + locals, + has_aliases, + index_var_typ, + races, +): + """ Takes a parfor and creates a gufunc function for its body. There are two parts to this function: @@ -257,7 +276,7 @@ def _create_gufunc_for_parfor_body( IR retrieved with run_frontend. The IR is scanned for the sentinel assignment where that basic block is split and the IR for the parfor body inserted. 
- ''' + """ loc = parfor.init_block.loc @@ -288,25 +307,22 @@ def _create_gufunc_for_parfor_body( typemap = lowerer.fndesc.typemap parfor_redvars, parfor_reddict = numba.parfors.parfor.get_parfor_reductions( - lowerer.func_ir, - parfor, - parfor_params, - lowerer.fndesc.calltypes) + lowerer.func_ir, parfor, parfor_params, lowerer.fndesc.calltypes + ) has_reduction = False if len(parfor_redvars) == 0 else True if has_reduction: _create_gufunc_for_reduction_parfor() # Compute just the parfor inputs as a set difference. - parfor_inputs = sorted( - list( - set(parfor_params) - - set(parfor_outputs))) + parfor_inputs = sorted(list(set(parfor_params) - set(parfor_outputs))) for race in races: - msg = ("Variable %s used in parallel loop may be written " - "to simultaneously by multiple workers and may result " - "in non-deterministic or unintended results." % race) + msg = ( + "Variable %s used in parallel loop may be written " + "to simultaneously by multiple workers and may result " + "in non-deterministic or unintended results." % race + ) warnings.warn(NumbaParallelSafetyWarning(msg, loc)) replace_var_with_array(races, loop_body, typemap, lowerer.fndesc.calltypes) @@ -321,15 +337,13 @@ def _create_gufunc_for_parfor_body( def addrspace_from(params, def_addr): addrspaces = [] for p in params: - if isinstance(to_scalar_from_0d(typemap[p]), - types.npytypes.Array): + if isinstance(to_scalar_from_0d(typemap[p]), types.npytypes.Array): addrspaces.append(def_addr) else: addrspaces.append(None) return addrspaces - addrspaces = addrspace_from(parfor_params, - numba_dppy.target.SPIR_GLOBAL_ADDRSPACE) + addrspaces = addrspace_from(parfor_params, numba_dppy.target.SPIR_GLOBAL_ADDRSPACE) if config.DEBUG_ARRAY_OPT >= 1: print("parfor_params = ", parfor_params, type(parfor_params)) @@ -351,8 +365,7 @@ def addrspace_from(params, def_addr): if config.DEBUG_ARRAY_OPT >= 1: print("ind_dict = ", sorted(ind_dict.items()), type(ind_dict)) - print("legal_loop_indices = ",legal_loop_indices, - type(legal_loop_indices)) + print("legal_loop_indices = ", legal_loop_indices, type(legal_loop_indices)) for pd in parfor_params: print("pd = ", pd) @@ -365,14 +378,15 @@ def addrspace_from(params, def_addr): # Calculate types of args passed to gufunc. 
func_arg_types = [typemap[v] for v in (parfor_inputs + parfor_outputs)] - assert(len(param_types_addrspaces) == len(addrspaces)) + assert len(param_types_addrspaces) == len(addrspaces) for i in range(len(param_types_addrspaces)): if addrspaces[i] is not None: - #print("before:", id(param_types_addrspaces[i])) - assert(isinstance(param_types_addrspaces[i], types.npytypes.Array)) - param_types_addrspaces[i] = (param_types_addrspaces[i] - .copy(addrspace=addrspaces[i])) - #print("setting param type", i, param_types[i], id(param_types[i]), + # print("before:", id(param_types_addrspaces[i])) + assert isinstance(param_types_addrspaces[i], types.npytypes.Array) + param_types_addrspaces[i] = param_types_addrspaces[i].copy( + addrspace=addrspaces[i] + ) + # print("setting param type", i, param_types[i], id(param_types[i]), # "to addrspace", param_types_addrspaces[i].addrspace) def print_arg_with_addrspaces(args): @@ -396,10 +410,12 @@ def print_arg_with_addrspaces(args): parfor_params = [] ascontig = False for pindex in range(len(parfor_params_orig)): - if (ascontig and - pindex < len(parfor_inputs) and - isinstance(param_types[pindex], types.npytypes.Array)): - parfor_params.append(parfor_params_orig[pindex]+"param") + if ( + ascontig + and pindex < len(parfor_inputs) + and isinstance(param_types[pindex], types.npytypes.Array) + ): + parfor_params.append(parfor_params_orig[pindex] + "param") else: parfor_params.append(parfor_params_orig[pindex]) @@ -409,11 +425,7 @@ def print_arg_with_addrspaces(args): sentinel_name = get_unused_var_name("__sentinel__", loop_body_var_table) if config.DEBUG_ARRAY_OPT >= 1: - print( - "legal parfor_params = ", - parfor_params, - type(parfor_params)) - + print("legal parfor_params = ", parfor_params, type(parfor_params)) # Determine the unique names of the scheduling and gufunc functions. gufunc_name = "__numba_parfor_gufunc_%s" % (parfor.id) @@ -428,9 +440,9 @@ def print_arg_with_addrspaces(args): gufunc_txt += "def " + gufunc_name gufunc_txt += "(" + (", ".join(parfor_params)) + "):\n" - - gufunc_txt += _schedule_loop(parfor_dim, legal_loop_indices, loop_ranges, - param_dict) + gufunc_txt += _schedule_loop( + parfor_dim, legal_loop_indices, loop_ranges, param_dict + ) # Add the sentinel assignment so that we can find the loop body position # in the IR. 
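As a reviewer aid (not itself part of the diff): for a parfor of three or fewer dimensions, _schedule_loop above emits one dppy.get_global_id() assignment per dimension, and the sentinel assignment added next marks where the original parfor body blocks are spliced back into the generated IR. A rough sketch of the gufunc text this builds for a hypothetical 2-D parfor, with made-up parameter and index names, looks like:

# Illustrative reconstruction only; the real names come from legalize_names,
# mk_unique_var and get_unused_var_name in the surrounding code.
def __numba_parfor_gufunc_0(arr_in, arr_out):
    i = dppy.get_global_id(0)
    j = dppy.get_global_id(1)
    __sentinel__ = 0  # split point: the parfor body IR replaces this assignment

The generated text is then compiled to IR (run_frontend) and handed to numba_dppy.compiler.compile_kernel_parfor; the block containing the sentinel is split so the loop-body blocks can be inserted in its place.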
@@ -463,8 +475,7 @@ def print_arg_with_addrspaces(args): # rename all variables in gufunc_ir afresh var_table = get_name_var_table(gufunc_ir.blocks) new_var_dict = {} - reserved_names = [sentinel_name] + \ - list(param_dict.values()) + legal_loop_indices + reserved_names = [sentinel_name] + list(param_dict.values()) + legal_loop_indices for name, var in var_table.items(): if not (name in reserved_names): new_var_dict[name] = mk_unique_var(name) @@ -481,10 +492,8 @@ def print_arg_with_addrspaces(args): if config.DEBUG_ARRAY_OPT: print( - "gufunc_param_types = ", - type(gufunc_param_types), - "\n", - gufunc_param_types) + "gufunc_param_types = ", type(gufunc_param_types), "\n", gufunc_param_types + ) gufunc_stub_last_label = max(gufunc_ir.blocks.keys()) + 1 @@ -503,7 +512,7 @@ def print_arg_with_addrspaces(args): _print_body(loop_body) wrapped_blocks = wrap_loop_body(loop_body) - #hoisted, not_hoisted = hoist(parfor_params, loop_body, + # hoisted, not_hoisted = hoist(parfor_params, loop_body, # typemap, wrapped_blocks) setitems = set() find_setitems_body(setitems, loop_body, typemap) @@ -516,11 +525,12 @@ def print_arg_with_addrspaces(args): unwrap_loop_body(loop_body) # store hoisted into diagnostics - diagnostics = lowerer.metadata['parfor_diagnostics'] - diagnostics.hoist_info[parfor.id] = {'hoisted': hoisted, - 'not_hoisted': not_hoisted} + diagnostics = lowerer.metadata["parfor_diagnostics"] + diagnostics.hoist_info[parfor.id] = {"hoisted": hoisted, "not_hoisted": not_hoisted} - lowerer.metadata['parfor_diagnostics'].extra_info[str(parfor.id)] = str(dpctl.get_current_queue().get_sycl_device().get_device_name()) + lowerer.metadata["parfor_diagnostics"].extra_info[str(parfor.id)] = str( + dpctl.get_current_queue().get_sycl_device().get_device_name() + ) if config.DEBUG_ARRAY_OPT: print("After hoisting") @@ -529,8 +539,7 @@ def print_arg_with_addrspaces(args): # Search all the block in the gufunc outline for the sentinel assignment. for label, block in gufunc_ir.blocks.items(): for i, inst in enumerate(block.body): - if (isinstance(inst, ir.Assign) and - inst.target.name == sentinel_name): + if isinstance(inst, ir.Assign) and inst.target.name == sentinel_name: # We found the sentinel assignment. loc = inst.loc scope = block.scope @@ -541,7 +550,7 @@ def print_arg_with_addrspaces(args): prev_block.body = block.body[:i] # The current block is used for statements after the sentinel. - block.body = block.body[i + 1:] + block.body = block.body[i + 1 :] # But the current block gets a new label. body_first_label = min(loop_body.keys()) @@ -557,8 +566,7 @@ def print_arg_with_addrspaces(args): gufunc_ir.blocks[label] = prev_block # Add a jump from the last parfor body block to the block # containing statements after the sentinel. 
- gufunc_ir.blocks[body_last_label].append( - ir.Jump(new_label, loc)) + gufunc_ir.blocks[body_last_label].append(ir.Jump(new_label, loc)) break else: continue @@ -598,22 +606,21 @@ def print_arg_with_addrspaces(args): sys.stdout.flush() if config.DEBUG_ARRAY_OPT: - print('before DUFunc inlining'.center(80, '-')) + print("before DUFunc inlining".center(80, "-")) gufunc_ir.dump() # Inlining all DUFuncs - dufunc_inliner(gufunc_ir, lowerer.fndesc.calltypes, typemap, - lowerer.context.typing_context) + dufunc_inliner( + gufunc_ir, lowerer.fndesc.calltypes, typemap, lowerer.context.typing_context + ) if config.DEBUG_ARRAY_OPT: - print('after DUFunc inline'.center(80, '-')) + print("after DUFunc inline".center(80, "-")) gufunc_ir.dump() kernel_func = numba_dppy.compiler.compile_kernel_parfor( - dpctl.get_current_queue(), - gufunc_ir, - gufunc_param_types, - param_types_addrspaces) + dpctl.get_current_queue(), gufunc_ir, gufunc_param_types, param_types_addrspaces + ) flags.noalias = old_alias @@ -672,8 +679,9 @@ def _lower_parfor_gufunc(lowerer, parfor): alias_map = {} arg_aliases = {} - numba.parfors.parfor.find_potential_aliases_parfor(parfor, parfor.params, typemap, - lowerer.func_ir, alias_map, arg_aliases) + numba.parfors.parfor.find_potential_aliases_parfor( + parfor, parfor.params, typemap, lowerer.func_ir, alias_map, arg_aliases + ) if config.DEBUG_ARRAY_OPT: print("alias_map", alias_map) print("arg_aliases", arg_aliases) @@ -684,12 +692,12 @@ def _lower_parfor_gufunc(lowerer, parfor): assert parfor.params != None parfor_output_arrays = numba.parfors.parfor.get_parfor_outputs( - parfor, parfor.params) - + parfor, parfor.params + ) # compile parfor body as a separate function to be used with GUFuncWrapper flags = copy.copy(parfor.flags) - flags.set('error_model', 'numpy') + flags.set("error_model", "numpy") # Can't get here unless flags.set('auto_parallel', ParallelOptions(True)) index_var_typ = typemap[parfor.loop_nests[0].index_variable.name] @@ -702,8 +710,13 @@ def _lower_parfor_gufunc(lowerer, parfor): loop_ranges = [(l.start, l.stop, l.step) for l in parfor.loop_nests] try: - func, func_args, func_sig, func_arg_types, modified_arrays =( - _create_gufunc_for_parfor_body( + ( + func, + func_args, + func_sig, + func_arg_types, + modified_arrays, + ) = _create_gufunc_for_parfor_body( lowerer, parfor, typemap, @@ -714,7 +727,8 @@ def _lower_parfor_gufunc(lowerer, parfor): {}, bool(alias_map), index_var_typ, - parfor.races)) + parfor.races, + ) finally: numba.parfors.parfor.sequential_parfor_lowering = False @@ -735,12 +749,8 @@ def _lower_parfor_gufunc(lowerer, parfor): print("loop_ranges = ", loop_ranges) gu_signature = _create_shape_signature( - parfor.get_shape_classes, - num_inputs, - func_args, - func_sig, - parfor.races, - typemap) + parfor.get_shape_classes, num_inputs, func_args, func_sig, parfor.races, typemap + ) generate_dppy_host_wrapper( lowerer, @@ -754,7 +764,8 @@ def _lower_parfor_gufunc(lowerer, parfor): parfor.init_block, index_var_typ, parfor.races, - modified_arrays) + modified_arrays, + ) if config.DEBUG_ARRAY_OPT: sys.stdout.flush() @@ -765,50 +776,52 @@ def _lower_parfor_gufunc(lowerer, parfor): def _create_shape_signature( - get_shape_classes, - num_inputs, - #num_reductions, - args, - func_sig, - races, - typemap): - '''Create shape signature for GUFunc - ''' + get_shape_classes, + num_inputs, + # num_reductions, + args, + func_sig, + races, + typemap, +): + """Create shape signature for GUFunc""" if config.DEBUG_ARRAY_OPT: print("_create_shape_signature", 
num_inputs, args) arg_start_print = 0 for i in args[arg_start_print:]: print("argument", i, type(i), get_shape_classes(i, typemap=typemap)) - #num_inouts = len(args) - num_reductions + # num_inouts = len(args) - num_reductions num_inouts = len(args) # maximum class number for array shapes - classes = [get_shape_classes(var, typemap=typemap) - if var not in races else (-1,) for var in args[1:]] + classes = [ + get_shape_classes(var, typemap=typemap) if var not in races else (-1,) + for var in args[1:] + ] class_set = set() for _class in classes: if _class: for i in _class: class_set.add(i) max_class = max(class_set) + 1 if class_set else 0 - classes.insert(0, (max_class,)) # force set the class of 'sched' argument + classes.insert(0, (max_class,)) # force set the class of 'sched' argument class_set.add(max_class) class_map = {} # TODO: use prefix + class number instead of single char - alphabet = ord('a') + alphabet = ord("a") for n in class_set: - if n >= 0: - class_map[n] = chr(alphabet) - alphabet += 1 + if n >= 0: + class_map[n] = chr(alphabet) + alphabet += 1 - alpha_dict = {'latest_alpha' : alphabet} + alpha_dict = {"latest_alpha": alphabet} def bump_alpha(c, class_map): if c >= 0: return class_map[c] else: - alpha_dict['latest_alpha'] += 1 - return chr(alpha_dict['latest_alpha']) + alpha_dict["latest_alpha"] += 1 + return chr(alpha_dict["latest_alpha"]) gu_sin = [] gu_sout = [] @@ -833,21 +846,24 @@ def bump_alpha(c, class_map): # Keep all the dppy kernels and programs created alive indefinitely. keep_alive_kernels = [] -def generate_dppy_host_wrapper(lowerer, - cres, - gu_signature, - outer_sig, - expr_args, - num_inputs, - expr_arg_types, - loop_ranges, - init_block, - index_var_typ, - races, - modified_arrays): - ''' + +def generate_dppy_host_wrapper( + lowerer, + cres, + gu_signature, + outer_sig, + expr_args, + num_inputs, + expr_arg_types, + loop_ranges, + init_block, + index_var_typ, + races, + modified_arrays, +): + """ Adds the call to the gufunc function from the main function. 
- ''' + """ context = lowerer.context builder = lowerer.builder sin, sout = gu_signature @@ -856,8 +872,13 @@ def generate_dppy_host_wrapper(lowerer, if config.DEBUG_ARRAY_OPT: print("generate_dppy_host_wrapper") print("args = ", expr_args) - print("outer_sig = ", outer_sig.args, outer_sig.return_type, - outer_sig.recvr, outer_sig.pysig) + print( + "outer_sig = ", + outer_sig.args, + outer_sig.return_type, + outer_sig.recvr, + outer_sig.pysig, + ) print("loop_ranges = ", loop_ranges) print("expr_args", expr_args) print("expr_arg_types", expr_arg_types) @@ -866,13 +887,13 @@ def generate_dppy_host_wrapper(lowerer, print("sout", sout) print("cres", cres, type(cres)) print("modified_arrays", modified_arrays) -# print("cres.library", cres.library, type(cres.library)) -# print("cres.fndesc", cres.fndesc, type(cres.fndesc)) - + # print("cres.library", cres.library, type(cres.library)) + # print("cres.fndesc", cres.fndesc, type(cres.fndesc)) # get dppy_cpu_portion_lowerer object dppy_cpu_lowerer = dppy_call_gen.DPPYHostFunctionCallsGenerator( - lowerer, cres, num_inputs) + lowerer, cres, num_inputs + ) # Compute number of args ------------------------------------------------ num_expanded_args = 0 @@ -911,8 +932,7 @@ def val_type_or_none(context, lowerer, x): return None all_llvm_args = [getvar_or_none(lowerer, x) for x in expr_args[:ninouts]] - all_val_types = ([val_type_or_none(context, lowerer, x) - for x in expr_args[:ninouts]]) + all_val_types = [val_type_or_none(context, lowerer, x) for x in expr_args[:ninouts]] all_args = [loadvar_or_none(lowerer, x) for x in expr_args[:ninouts]] keep_alive_kernels.append(cres) @@ -922,19 +942,37 @@ def val_type_or_none(context, lowerer, x): # the enqueue function. Put each part of each argument into # kernel_arg_array. for var, llvm_arg, arg_type, gu_sig, val_type, index in zip( - expr_args, all_llvm_args, expr_arg_types, sin + sout, all_val_types, - range(len(expr_args))): + expr_args, + all_llvm_args, + expr_arg_types, + sin + sout, + all_val_types, + range(len(expr_args)), + ): if config.DEBUG_ARRAY_OPT: - print("var:", var, type(var), - "\n\tllvm_arg:", llvm_arg, type(llvm_arg), - "\n\targ_type:", arg_type, type(arg_type), - "\n\tgu_sig:", gu_sig, - "\n\tval_type:", val_type, type(val_type), - "\n\tindex:", index) - - dppy_cpu_lowerer.process_kernel_arg(var, llvm_arg, arg_type, gu_sig, - val_type, index, modified_arrays) + print( + "var:", + var, + type(var), + "\n\tllvm_arg:", + llvm_arg, + type(llvm_arg), + "\n\targ_type:", + arg_type, + type(arg_type), + "\n\tgu_sig:", + gu_sig, + "\n\tval_type:", + val_type, + type(val_type), + "\n\tindex:", + index, + ) + + dppy_cpu_lowerer.process_kernel_arg( + var, llvm_arg, arg_type, gu_sig, val_type, index, modified_arrays + ) # ----------------------------------------------------------------------- # loadvars for loop_ranges @@ -949,7 +987,7 @@ def load_range(v): start, stop, step = loop_ranges[i] start = load_range(start) stop = load_range(stop) - assert(step == 1) # We do not support loop steps other than 1 + assert step == 1 # We do not support loop steps other than 1 step = load_range(step) loop_ranges[i] = (start, stop, step) @@ -985,10 +1023,26 @@ def relatively_deep_copy(obj, memo): from numba.core.types.abstract import Type # objects which shouldn't or can't be copied and it's ok not to copy it. 
- if isinstance(obj, (FunctionIdentity, _DispatcherBase, Function, Type, - Dispatcher, ModuleType, Signature, - DPPYFunctionTemplate, CompileResult, DUFunc, _CFuncPtr, - type, str, bool, type(None))): + if isinstance( + obj, + ( + FunctionIdentity, + _DispatcherBase, + Function, + Type, + Dispatcher, + ModuleType, + Signature, + DPPYFunctionTemplate, + CompileResult, + DUFunc, + _CFuncPtr, + type, + str, + bool, + type(None), + ), + ): return obj from numba.core.ir import Global, FreeVar @@ -997,15 +1051,24 @@ def relatively_deep_copy(obj, memo): from numba.core.funcdesc import FunctionDescriptor if isinstance(obj, FunctionDescriptor): - cpy = FunctionDescriptor(native=obj.native, modname=obj.modname, qualname=obj.qualname, - unique_name=obj.unique_name, doc=obj.doc, - typemap=relatively_deep_copy(obj.typemap, memo), - restype=obj.restype, - calltypes=relatively_deep_copy(obj.calltypes, memo), - args=obj.args, kws=obj.kws, mangler=None, - argtypes=relatively_deep_copy(obj.argtypes, memo), - inline=obj.inline, noalias=obj.noalias, env_name=obj.env_name, - global_dict=obj.global_dict) + cpy = FunctionDescriptor( + native=obj.native, + modname=obj.modname, + qualname=obj.qualname, + unique_name=obj.unique_name, + doc=obj.doc, + typemap=relatively_deep_copy(obj.typemap, memo), + restype=obj.restype, + calltypes=relatively_deep_copy(obj.calltypes, memo), + args=obj.args, + kws=obj.kws, + mangler=None, + argtypes=relatively_deep_copy(obj.argtypes, memo), + inline=obj.inline, + noalias=obj.noalias, + env_name=obj.env_name, + global_dict=obj.global_dict, + ) # mangler parameter is not saved in FunctionDescriptor, but used to generated name. # So pass None as mangler parameter and then copy mangled_name by hands cpy.mangled_name = obj.mangled_name @@ -1025,13 +1088,15 @@ def relatively_deep_copy(obj, memo): # This means that copy of IR actually has a side effect on it. 
pp = PostProcessor(obj) pp.run() - cpy = FunctionIR(blocks=relatively_deep_copy(obj.blocks, memo), - is_generator=relatively_deep_copy(obj.is_generator, memo), - func_id=relatively_deep_copy(obj.func_id, memo), - loc=obj.loc, - definitions=relatively_deep_copy(obj._definitions, memo), - arg_count=obj.arg_count, - arg_names=relatively_deep_copy(obj.arg_names, memo)) + cpy = FunctionIR( + blocks=relatively_deep_copy(obj.blocks, memo), + is_generator=relatively_deep_copy(obj.is_generator, memo), + func_id=relatively_deep_copy(obj.func_id, memo), + loc=obj.loc, + definitions=relatively_deep_copy(obj._definitions, memo), + arg_count=obj.arg_count, + arg_names=relatively_deep_copy(obj.arg_names, memo), + ) pp = PostProcessor(cpy) pp.run() @@ -1142,8 +1207,9 @@ def __init__(self, context, library, fndesc, func_ir, metadata=None): fndesc_cpu = relatively_deep_copy(fndesc, memo) func_ir_cpu = relatively_deep_copy(func_ir, memo) - - cpu_context = context.cpu_context if isinstance(context, DPPYTargetContext) else context + cpu_context = ( + context.cpu_context if isinstance(context, DPPYTargetContext) else context + ) self.gpu_lower = Lower(context, library, fndesc, func_ir, metadata) self.cpu_lower = Lower(cpu_context, library, fndesc_cpu, func_ir_cpu, metadata) @@ -1168,17 +1234,23 @@ def lower(self): lowering.lower_extensions[parfor.Parfor].append(lower_parfor_rollback) self.gpu_lower.lower() # if lower dont crash, and parfor_diagnostics is empty then it is kernel - if not self.gpu_lower.metadata['parfor_diagnostics'].extra_info: - str_name = str(dpctl.get_current_queue().get_sycl_device().get_device_name()) - self.gpu_lower.metadata['parfor_diagnostics'].extra_info["kernel"] = str_name + if not self.gpu_lower.metadata["parfor_diagnostics"].extra_info: + str_name = str( + dpctl.get_current_queue().get_sycl_device().get_device_name() + ) + self.gpu_lower.metadata["parfor_diagnostics"].extra_info[ + "kernel" + ] = str_name self.base_lower = self.gpu_lower lowering.lower_extensions[parfor.Parfor].pop() except Exception as e: if numba_dppy.compiler.DEBUG: print("Failed to lower parfor on DPPY-device. 
Due to:\n", e) lowering.lower_extensions[parfor.Parfor].pop() - if ((lowering.lower_extensions[parfor.Parfor][-1] == numba.parfors.parfor_lowering._lower_parfor_parallel) and - numba_dppy.config.FALLBACK_ON_CPU == 1): + if ( + lowering.lower_extensions[parfor.Parfor][-1] + == numba.parfors.parfor_lowering._lower_parfor_parallel + ) and numba_dppy.config.FALLBACK_ON_CPU == 1: self.cpu_lower.lower() self.base_lower = self.cpu_lower else: diff --git a/numba_dppy/dppy_offload_dispatcher.py b/numba_dppy/dppy_offload_dispatcher.py index 0c5fe10f5e..58ca6d9729 100644 --- a/numba_dppy/dppy_offload_dispatcher.py +++ b/numba_dppy/dppy_offload_dispatcher.py @@ -6,18 +6,45 @@ class DppyOffloadDispatcher(dispatcher.Dispatcher): targetdescr = cpu_target - def __init__(self, py_func, locals={}, targetoptions={}, impl_kind='direct', pipeline_class=compiler.Compiler): + def __init__( + self, + py_func, + locals={}, + targetoptions={}, + impl_kind="direct", + pipeline_class=compiler.Compiler, + ): if dppy_config.dppy_present: from numba_dppy.compiler import DPPYCompiler - targetoptions['parallel'] = True - dispatcher.Dispatcher.__init__(self, py_func, locals=locals, - targetoptions=targetoptions, impl_kind=impl_kind, pipeline_class=DPPYCompiler) + + targetoptions["parallel"] = True + dispatcher.Dispatcher.__init__( + self, + py_func, + locals=locals, + targetoptions=targetoptions, + impl_kind=impl_kind, + pipeline_class=DPPYCompiler, + ) else: - print("---------------------------------------------------------------------") - print("WARNING : DPPY pipeline ignored. Ensure OpenCL drivers are installed.") - print("---------------------------------------------------------------------") - dispatcher.Dispatcher.__init__(self, py_func, locals=locals, - targetoptions=targetoptions, impl_kind=impl_kind, pipeline_class=pipeline_class) + print( + "---------------------------------------------------------------------" + ) + print( + "WARNING : DPPY pipeline ignored. Ensure OpenCL drivers are installed." 
+ ) + print( + "---------------------------------------------------------------------" + ) + dispatcher.Dispatcher.__init__( + self, + py_func, + locals=locals, + targetoptions=targetoptions, + impl_kind=impl_kind, + pipeline_class=pipeline_class, + ) + -dispatcher_registry['__dppy_offload_gpu__'] = DppyOffloadDispatcher -dispatcher_registry['__dppy_offload_cpu__'] = DppyOffloadDispatcher +dispatcher_registry["__dppy_offload_gpu__"] = DppyOffloadDispatcher +dispatcher_registry["__dppy_offload_cpu__"] = DppyOffloadDispatcher diff --git a/numba_dppy/dppy_parfor_diagnostics.py b/numba_dppy/dppy_parfor_diagnostics.py index 50e19a1cb1..51b3747cac 100644 --- a/numba_dppy/dppy_parfor_diagnostics.py +++ b/numba_dppy/dppy_parfor_diagnostics.py @@ -14,14 +14,14 @@ def dump(self, level=1): if self.extra_info: parfors_simple = self.get_parfors_simple(False) all_lines = self.get_all_lines(parfors_simple) - print(' Auto-offloading '.center(_termwidth,'-')) + print(" Auto-offloading ".center(_termwidth, "-")) self.print_auto_offloading(all_lines) - if 'kernel' in self.extra_info.keys(): - print_wrapped("Device - '%s'" % self.extra_info['kernel']) - print(_termwidth * '-') + if "kernel" in self.extra_info.keys(): + print_wrapped("Device - '%s'" % self.extra_info["kernel"]) + print(_termwidth * "-") def print_auto_offloading(self, lines): - sword = '+--' + sword = "+--" fac = len(sword) summary = dict() @@ -30,26 +30,26 @@ def print_auto_offloading(self, lines): def print_nest(fadj_, nadj_, theroot, reported, region_id): def print_g(fadj_, nadj_, nroot, depth): for k in nadj_[nroot]: - msg = fac * depth * ' ' + '%s%s %s' % (sword, k, '(serial') + msg = fac * depth * " " + "%s%s %s" % (sword, k, "(serial") if nadj_[k] == []: fused = [] if fadj_[k] != [] and k not in reported: fused = sorted(self.reachable_nodes(fadj_, k)) msg += ", fused with loop(s): " - msg += ', '.join([str(x) for x in fused]) - msg += ')' + msg += ", ".join([str(x) for x in fused]) + msg += ")" reported.append(k) print_wrapped(msg) - summary[region_id]['fused'] += len(fused) + summary[region_id]["fused"] += len(fused) else: - print_wrapped(msg + ')') + print_wrapped(msg + ")") print_g(fadj_, nadj_, k, depth + 1) - summary[region_id]['serialized'] += 1 + summary[region_id]["serialized"] += 1 if nadj_[theroot] != []: print_wrapped("Parallel region %s:" % region_id) - print_wrapped('%s%s %s' % (sword, theroot, '(parallel)')) - summary[region_id] = {'root': theroot, 'fused': 0, 'serialized': 0} + print_wrapped("%s%s %s" % (sword, theroot, "(parallel)")) + summary[region_id] = {"root": theroot, "fused": 0, "serialized": 0} print_g(fadj_, nadj_, theroot, 1) print("\n") region_id = region_id + 1 @@ -57,15 +57,15 @@ def print_g(fadj_, nadj_, nroot, depth): def print_fuse(ty, pf_id, adj, depth, region_id): print_wrapped("Parallel region %s:" % region_id) - msg = fac * depth * ' ' + '%s%s %s' % (sword, pf_id, '(parallel') + msg = fac * depth * " " + "%s%s %s" % (sword, pf_id, "(parallel") fused = [] if adj[pf_id] != []: fused = sorted(self.reachable_nodes(adj, pf_id)) msg += ", fused with loop(s): " - msg += ', '.join([str(x) for x in fused]) + msg += ", ".join([str(x) for x in fused]) - summary[region_id] = {'root': pf_id, 'fused': len(fused), 'serialized': 0} - msg += ')' + summary[region_id] = {"root": pf_id, "fused": len(fused), "serialized": 0} + msg += ")" print_wrapped(msg) extra_info = self.extra_info.get(str(region_id)) if extra_info: @@ -78,10 +78,10 @@ def print_fuse(ty, pf_id, adj, depth, region_id): reported = [] for line, info in 
sorted(lines.items()): opt_ty, pf_id, adj = info - if opt_ty == 'fuse': + if opt_ty == "fuse": if pf_id not in reported: - region_id = print_fuse('f', pf_id, adj, 0, region_id) - elif opt_ty == 'nest': + region_id = print_fuse("f", pf_id, adj, 0, region_id) + elif opt_ty == "nest": region_id = print_nest(fadj, nadj, pf_id, reported, region_id) else: assert 0 @@ -89,18 +89,19 @@ def print_fuse(ty, pf_id, adj, depth, region_id): # print the summary of the fuse/serialize rewrite if summary: for k, v in sorted(summary.items()): - msg = ('\n \nParallel region %s (loop #%s) had %s ' - 'loop(s) fused') - root = v['root'] - fused = v['fused'] - serialized = v['serialized'] + msg = "\n \nParallel region %s (loop #%s) had %s " "loop(s) fused" + root = v["root"] + fused = v["fused"] + serialized = v["serialized"] if serialized != 0: - msg += (' and %s loop(s) ' - 'serialized as part of the larger ' - 'parallel loop (#%s).') + msg += ( + " and %s loop(s) " + "serialized as part of the larger " + "parallel loop (#%s)." + ) print_wrapped(msg % (k, root, fused, serialized, root)) else: - msg += '.' + msg += "." print_wrapped(msg % (k, root, fused)) else: print_wrapped("Parallel structure is already optimal.") diff --git a/numba_dppy/dppy_passbuilder.py b/numba_dppy/dppy_passbuilder.py index 994351d509..c6b2534a62 100644 --- a/numba_dppy/dppy_passbuilder.py +++ b/numba_dppy/dppy_passbuilder.py @@ -2,32 +2,51 @@ from numba.core.compiler_machinery import PassManager -from numba.core.untyped_passes import (ExtractByteCode, TranslateByteCode, FixupArgs, - IRProcessing, DeadBranchPrune, - RewriteSemanticConstants, InlineClosureLikes, - GenericRewrites, WithLifting, - InlineInlinables, FindLiterallyCalls, - MakeFunctionToJitFunction, - CanonicalizeLoopExit, CanonicalizeLoopEntry, - ReconstructSSA, - LiteralUnroll) - -from numba.core.typed_passes import (NopythonTypeInference, AnnotateTypes, - NopythonRewrites, PreParforPass, ParforPass, - DumpParforDiagnostics, IRLegalization, - InlineOverloads, PreLowerStripPhis) +from numba.core.untyped_passes import ( + ExtractByteCode, + TranslateByteCode, + FixupArgs, + IRProcessing, + DeadBranchPrune, + RewriteSemanticConstants, + InlineClosureLikes, + GenericRewrites, + WithLifting, + InlineInlinables, + FindLiterallyCalls, + MakeFunctionToJitFunction, + CanonicalizeLoopExit, + CanonicalizeLoopEntry, + ReconstructSSA, + LiteralUnroll, +) + +from numba.core.typed_passes import ( + NopythonTypeInference, + AnnotateTypes, + NopythonRewrites, + PreParforPass, + ParforPass, + DumpParforDiagnostics, + IRLegalization, + InlineOverloads, + PreLowerStripPhis, +) from .dppy_passes import ( - DPPYConstantSizeStaticLocalMemoryPass, - DPPYPreParforPass, - DPPYParforPass, - SpirvFriendlyLowering, - DPPYNoPythonBackend, - DPPYDumpParforDiagnostics - ) + DPPYConstantSizeStaticLocalMemoryPass, + DPPYPreParforPass, + DPPYParforPass, + SpirvFriendlyLowering, + DPPYNoPythonBackend, + DPPYDumpParforDiagnostics, +) + +from .rename_numpy_functions_pass import ( + DPPYRewriteOverloadedNumPyFunctions, + DPPYRewriteNdarrayFunctions, +) -from .rename_numpy_functions_pass import (DPPYRewriteOverloadedNumPyFunctions, - DPPYRewriteNdarrayFunctions) class DPPYPassBuilder(object): """ @@ -38,8 +57,7 @@ class DPPYPassBuilder(object): @staticmethod def default_numba_nopython_pipeline(state, pm): - """Adds the default set of NUMBA passes to the pass manager - """ + """Adds the default set of NUMBA passes to the pass manager""" if state.func_ir is None: pm.add_pass(TranslateByteCode, "analyzing bytecode") 
pm.add_pass(FixupArgs, "fix up args") @@ -47,14 +65,18 @@ def default_numba_nopython_pipeline(state, pm): pm.add_pass(WithLifting, "Handle with contexts") # this pass rewrites name of NumPy functions we intend to overload - pm.add_pass(DPPYRewriteOverloadedNumPyFunctions, - "Rewrite name of Numpy functions to overload already overloaded function", + pm.add_pass( + DPPYRewriteOverloadedNumPyFunctions, + "Rewrite name of Numpy functions to overload already overloaded function", ) # Add pass to ensure when users are allocating static # constant memory the size is a constant and can not # come from a closure variable - pm.add_pass(DPPYConstantSizeStaticLocalMemoryPass, "dppy constant size for static local memory") + pm.add_pass( + DPPYConstantSizeStaticLocalMemoryPass, + "dppy constant size for static local memory", + ) # pre typing if not state.flags.no_rewrites: @@ -62,11 +84,11 @@ def default_numba_nopython_pipeline(state, pm): pm.add_pass(DeadBranchPrune, "dead branch pruning") pm.add_pass(GenericRewrites, "nopython rewrites") - pm.add_pass(InlineClosureLikes, - "inline calls to locally defined closures") + pm.add_pass(InlineClosureLikes, "inline calls to locally defined closures") # convert any remaining closures into functions - pm.add_pass(MakeFunctionToJitFunction, - "convert make_function into JIT functions") + pm.add_pass( + MakeFunctionToJitFunction, "convert make_function into JIT functions" + ) # inline functions that have been determined as inlinable and rerun # branch pruning, this needs to be run after closures are inlined as # the IR repr of a closure masks call sites if an inlinable is called @@ -84,8 +106,9 @@ def default_numba_nopython_pipeline(state, pm): pm.add_pass(NopythonTypeInference, "nopython frontend") pm.add_pass(AnnotateTypes, "annotate types") - pm.add_pass(DPPYRewriteNdarrayFunctions, - "Rewrite ndarray functions to dppy supported functions", + pm.add_pass( + DPPYRewriteNdarrayFunctions, + "Rewrite ndarray functions to dppy supported functions", ) # strip phis @@ -94,11 +117,9 @@ def default_numba_nopython_pipeline(state, pm): # optimisation pm.add_pass(InlineOverloads, "inline overloaded functions") - @staticmethod - def define_nopython_pipeline(state, name='dppy_nopython'): - """Returns an nopython mode pipeline based PassManager - """ + def define_nopython_pipeline(state, name="dppy_nopython"): + """Returns an nopython mode pipeline based PassManager""" pm = PassManager(name) DPPYPassBuilder.default_numba_nopython_pipeline(state, pm) diff --git a/numba_dppy/dppy_passes.py b/numba_dppy/dppy_passes.py index be9423230b..858420a098 100644 --- a/numba_dppy/dppy_passes.py +++ b/numba_dppy/dppy_passes.py @@ -16,19 +16,31 @@ utils, typing, types, - ) +) from numba.core.ir_utils import remove_dels -from numba.core.errors import (LoweringError, new_error_context, TypingError, - LiteralTypingError) +from numba.core.errors import ( + LoweringError, + new_error_context, + TypingError, + LiteralTypingError, +) -from numba.core.compiler_machinery import FunctionPass, LoweringPass, register_pass, AnalysisPass +from numba.core.compiler_machinery import ( + FunctionPass, + LoweringPass, + register_pass, + AnalysisPass, +) from .dppy_lowerer import DPPYLower from numba_dppy import config as dppy_config -from numba.parfors.parfor import PreParforPass as _parfor_PreParforPass, replace_functions_map +from numba.parfors.parfor import ( + PreParforPass as _parfor_PreParforPass, + replace_functions_map, +) from numba.parfors.parfor import ParforPass as _parfor_ParforPass from 
numba.parfors.parfor import Parfor @@ -52,7 +64,11 @@ def run_pass(self, state): _DEBUG = False if _DEBUG: - print('Checks if size of OpenCL local address space alloca is a compile-time constant.'.center(80, '-')) + print( + "Checks if size of OpenCL local address space alloca is a compile-time constant.".center( + 80, "-" + ) + ) print(func_ir.dump()) work_list = list(func_ir.blocks.items()) @@ -62,9 +78,14 @@ def run_pass(self, state): if isinstance(instr, ir.Assign): expr = instr.value if isinstance(expr, ir.Expr): - if expr.op == 'call': - call_node = block.find_variable_assignment(expr.func.name).value - if isinstance(call_node, ir.Expr) and call_node.attr == "static_alloc": + if expr.op == "call": + call_node = block.find_variable_assignment( + expr.func.name + ).value + if ( + isinstance(call_node, ir.Expr) + and call_node.attr == "static_alloc" + ): arg = None # at first look in keyword arguments to get the shape, which has to be # constant @@ -82,24 +103,28 @@ def run_pass(self, state): if isinstance(arg_type, ir.Expr): # we have a tuple for item in arg_type.items: - if not isinstance(func_ir.get_definition(item.name), ir.Const): + if not isinstance( + func_ir.get_definition(item.name), ir.Const + ): error = True break else: - if not isinstance(func_ir.get_definition(arg.name), ir.Const): + if not isinstance( + func_ir.get_definition(arg.name), ir.Const + ): error = True break if error: - warnings.warn_explicit("The size of the Local memory has to be constant", - errors.NumbaError, - state.func_id.filename, - state.func_id.firstlineno) + warnings.warn_explicit( + "The size of the Local memory has to be constant", + errors.NumbaError, + state.func_id.filename, + state.func_id.firstlineno, + ) raise - - if config.DEBUG or config.DUMP_IR: name = state.func_ir.func_id.func_qualname print(("IR DUMP: %s" % name).center(80, "-")) @@ -124,22 +149,23 @@ def run_pass(self, state): # Ensure we have an IR and type information. assert state.func_ir functions_map = replace_functions_map.copy() - functions_map.pop(('dot', 'numpy'), None) - functions_map.pop(('sum', 'numpy'), None) - functions_map.pop(('prod', 'numpy'), None) - functions_map.pop(('argmax', 'numpy'), None) - functions_map.pop(('max', 'numpy'), None) - functions_map.pop(('argmin', 'numpy'), None) - functions_map.pop(('min', 'numpy'), None) - functions_map.pop(('mean', 'numpy'), None) + functions_map.pop(("dot", "numpy"), None) + functions_map.pop(("sum", "numpy"), None) + functions_map.pop(("prod", "numpy"), None) + functions_map.pop(("argmax", "numpy"), None) + functions_map.pop(("max", "numpy"), None) + functions_map.pop(("argmin", "numpy"), None) + functions_map.pop(("min", "numpy"), None) + functions_map.pop(("mean", "numpy"), None) preparfor_pass = _parfor_PreParforPass( state.func_ir, state.type_annotation.typemap, - state.type_annotation.calltypes, state.typingctx, + state.type_annotation.calltypes, + state.typingctx, state.flags.auto_parallel, state.parfor_diagnostics.replaced_fns, - replace_functions_map=functions_map + replace_functions_map=functions_map, ) preparfor_pass.run() @@ -166,14 +192,16 @@ def run_pass(self, state): """ # Ensure we have an IR and type information. 
assert state.func_ir - parfor_pass = _parfor_ParforPass(state.func_ir, - state.type_annotation.typemap, - state.type_annotation.calltypes, - state.return_type, - state.typingctx, - state.flags.auto_parallel, - state.flags, - state.parfor_diagnostics) + parfor_pass = _parfor_ParforPass( + state.func_ir, + state.type_annotation.typemap, + state.type_annotation.calltypes, + state.return_type, + state.typingctx, + state.flags.auto_parallel, + state.flags, + state.parfor_diagnostics, + ) parfor_pass.run() @@ -203,14 +231,17 @@ def fallback_context(state, msg): e = e.with_traceback(None) # this emits a warning containing the error message body in the # case of fallback from npm to objmode - loop_lift = '' if state.flags.enable_looplift else 'OUT' - msg_rewrite = ("\nCompilation is falling back to object mode " - "WITH%s looplifting enabled because %s" - % (loop_lift, msg)) - warnings.warn_explicit('%s due to: %s' % (msg_rewrite, e), - errors.NumbaWarning, - state.func_id.filename, - state.func_id.firstlineno) + loop_lift = "" if state.flags.enable_looplift else "OUT" + msg_rewrite = ( + "\nCompilation is falling back to object mode " + "WITH%s looplifting enabled because %s" % (loop_lift, msg) + ) + warnings.warn_explicit( + "%s due to: %s" % (msg_rewrite, e), + errors.NumbaWarning, + state.func_id.filename, + state.func_id.firstlineno, + ) raise @@ -232,27 +263,31 @@ def run_pass(self, state): targetctx = state.targetctx - library = state.library - interp = state.func_ir # why is it called this?! - typemap = state.typemap - restype = state.return_type + library = state.library + interp = state.func_ir # why is it called this?! + typemap = state.typemap + restype = state.return_type calltypes = state.calltypes - flags = state.flags - metadata = state.metadata + flags = state.flags + metadata = state.metadata - msg = ("Function %s failed at nopython " - "mode lowering" % (state.func_id.func_name,)) + msg = "Function %s failed at nopython " "mode lowering" % ( + state.func_id.func_name, + ) with fallback_context(state, msg): # Lowering - fndesc = \ - funcdesc.PythonFunctionDescriptor.from_specialized_function( - interp, typemap, restype, calltypes, - mangler=targetctx.mangler, inline=flags.forceinline, - noalias=flags.noalias) + fndesc = funcdesc.PythonFunctionDescriptor.from_specialized_function( + interp, + typemap, + restype, + calltypes, + mangler=targetctx.mangler, + inline=flags.forceinline, + noalias=flags.noalias, + ) with targetctx.push_code_library(library): - lower = DPPYLower(targetctx, library, fndesc, interp, - metadata=metadata) + lower = DPPYLower(targetctx, library, fndesc, interp, metadata=metadata) lower.lower() if not flags.no_cpython_wrapper: lower.create_cpython_wrapper(flags.release_gil) @@ -262,17 +297,16 @@ def run_pass(self, state): del lower from numba.core.compiler import _LowerResult # TODO: move this + if flags.no_compile: - state['cr'] = _LowerResult(fndesc, call_helper, - cfunc=None, env=env) + state["cr"] = _LowerResult(fndesc, call_helper, cfunc=None, env=env) else: # Prepare for execution cfunc = targetctx.get_executable(library, fndesc, env) # Insert native function for use by other jitted-functions. # We also register its library to allow for inlining. 
targetctx.insert_user_function(cfunc, fndesc, [library]) - state['cr'] = _LowerResult(fndesc, call_helper, - cfunc=cfunc, env=env) + state["cr"] = _LowerResult(fndesc, call_helper, cfunc=cfunc, env=env) return True @@ -290,10 +324,11 @@ def run_pass(self, state): Back-end: Generate LLVM IR from Numba IR, compile to machine code """ - lowered = state['cr'] + lowered = state["cr"] signature = typing.signature(state.return_type, *state.args) from numba.core.compiler import compile_result + state.cr = compile_result( typing_context=state.typingctx, target_context=state.targetctx, diff --git a/numba_dppy/dufunc_inliner.py b/numba_dppy/dufunc_inliner.py index f42a9b1855..ca14cab531 100644 --- a/numba_dppy/dufunc_inliner.py +++ b/numba_dppy/dufunc_inliner.py @@ -4,25 +4,41 @@ from numba.core.ir_utils import dead_code_elimination, simplify_CFG -def _run_inliner(func_ir, sig, template, arg_typs, expr, i, py_func, block, - work_list, typemap, calltypes, typingctx): - from numba.core.inline_closurecall import (inline_closure_call, - callee_ir_validator) +def _run_inliner( + func_ir, + sig, + template, + arg_typs, + expr, + i, + py_func, + block, + work_list, + typemap, + calltypes, + typingctx, +): + from numba.core.inline_closurecall import inline_closure_call, callee_ir_validator # pass is typed so use the callee globals - inline_closure_call(func_ir, py_func.__globals__, - block, i, py_func, typingctx=typingctx, - arg_typs=arg_typs, - typemap=typemap, - calltypes=calltypes, - work_list=work_list, - replace_freevars=False, - callee_validator=callee_ir_validator) + inline_closure_call( + func_ir, + py_func.__globals__, + block, + i, + py_func, + typingctx=typingctx, + arg_typs=arg_typs, + typemap=typemap, + calltypes=calltypes, + work_list=work_list, + replace_freevars=False, + callee_validator=callee_ir_validator, + ) return True -def _inline(func_ir, work_list, block, i, expr, py_func, typemap, calltypes, - typingctx): +def _inline(func_ir, work_list, block, i, expr, py_func, typemap, calltypes, typingctx): # try and get a definition for the call, this isn't always possible as # it might be a eval(str)/part generated awaiting update etc. (parfors) to_inline = None @@ -32,7 +48,7 @@ def _inline(func_ir, work_list, block, i, expr, py_func, typemap, calltypes, return False # do not handle closure inlining here, another pass deals with that. - if getattr(to_inline, 'op', False) == 'make_function': + if getattr(to_inline, "op", False) == "make_function": return False # check this is a known and typed function @@ -40,33 +56,43 @@ def _inline(func_ir, work_list, block, i, expr, py_func, typemap, calltypes, func_ty = typemap[expr.func.name] except KeyError: return False - if not hasattr(func_ty, 'get_call_type'): + if not hasattr(func_ty, "get_call_type"): return False sig = calltypes[expr] is_method = False - templates = getattr(func_ty, 'templates', None) + templates = getattr(func_ty, "templates", None) arg_typs = sig.args if templates is None: return False - assert(len(templates) == 1) + assert len(templates) == 1 # at this point we know we maybe want to inline something and there's # definitely something that could be inlined. 
return _run_inliner( - func_ir, sig, templates[0], arg_typs, expr, i, py_func, block, - work_list, typemap, calltypes, typingctx + func_ir, + sig, + templates[0], + arg_typs, + expr, + i, + py_func, + block, + work_list, + typemap, + calltypes, + typingctx, ) def _is_dufunc_callsite(expr, block): - if expr.op == 'call': + if expr.op == "call": call_node = block.find_variable_assignment(expr.func.name).value # due to circular import we can not import DUFunc, TODO: Fix it - if(call_node.value.__class__.__name__ == "DUFunc"): + if call_node.value.__class__.__name__ == "DUFunc": return call_node return None @@ -76,7 +102,7 @@ def dufunc_inliner(func_ir, calltypes, typemap, typingctx): modified = False if _DEBUG: - print('GUFunc before inlining DUFunc'.center(80, '-')) + print("GUFunc before inlining DUFunc".center(80, "-")) print(func_ir.dump()) work_list = list(func_ir.blocks.items()) @@ -92,17 +118,26 @@ def dufunc_inliner(func_ir, calltypes, typemap, typingctx): call_node = _is_dufunc_callsite(expr, block) if call_node: py_func = call_node.value._dispatcher.py_func - workfn = _inline(func_ir, work_list, block, i, expr, - py_func, typemap, calltypes, typingctx) + workfn = _inline( + func_ir, + work_list, + block, + i, + expr, + py_func, + typemap, + calltypes, + typingctx, + ) if workfn: modified = True break # because block structure changed else: continue if _DEBUG: - print('GUFunc after inlining DUFunc'.center(80, '-')) + print("GUFunc after inlining DUFunc".center(80, "-")) print(func_ir.dump()) - print(''.center(80, '-')) + print("".center(80, "-")) if modified: # clean up leftover load instructions. This step is needed or else @@ -113,8 +148,8 @@ def dufunc_inliner(func_ir, calltypes, typemap, typingctx): func_ir.blocks = simplify_CFG(func_ir.blocks) if _DEBUG: - print('GUFunc after inlining DUFunc, DCE, SimplyCFG'.center(80, '-')) + print("GUFunc after inlining DUFunc, DCE, SimplyCFG".center(80, "-")) print(func_ir.dump()) - print(''.center(80, '-')) + print("".center(80, "-")) return True diff --git a/numba_dppy/examples/blacksholes_njit.py b/numba_dppy/examples/blacksholes_njit.py index 3654a90c66..0786fed9bc 100644 --- a/numba_dppy/examples/blacksholes_njit.py +++ b/numba_dppy/examples/blacksholes_njit.py @@ -9,12 +9,14 @@ import argparse import time + @numba.vectorize(nopython=True) def cndf2(inp): - out = 0.5 + 0.5 * math.erf((math.sqrt(2.0)/2.0) * inp) + out = 0.5 + 0.5 * math.erf((math.sqrt(2.0) / 2.0) * inp) return out -@numba.njit(parallel={'offload':True}, fastmath=True) + +@numba.njit(parallel={"offload": True}, fastmath=True) def blackscholes(sptprice, strike, rate, volatility, timev): logterm = np.log(sptprice / strike) powterm = 0.5 * volatility * volatility @@ -23,29 +25,30 @@ def blackscholes(sptprice, strike, rate, volatility, timev): d2 = d1 - den NofXd1 = cndf2(d1) NofXd2 = cndf2(d2) - futureValue = strike * np.exp(- rate * timev) + futureValue = strike * np.exp(-rate * timev) c1 = futureValue * NofXd2 call = sptprice * NofXd1 - c1 - put = call - futureValue + sptprice + put = call - futureValue + sptprice return put def run(iterations): - sptprice = np.full((iterations,), 42.0) + sptprice = np.full((iterations,), 42.0) initStrike = 40 + (np.arange(iterations) + 1.0) / iterations - rate = np.full((iterations,), 0.5) + rate = np.full((iterations,), 0.5) volatility = np.full((iterations,), 0.2) - timev = np.full((iterations,), 0.5) + timev = np.full((iterations,), 0.5) t1 = time.time() put = blackscholes(sptprice, initStrike, rate, volatility, timev) - t = 
time.time()-t1 + t = time.time() - t1 print("checksum: ", sum(put)) print("SELFTIMED ", t) + def main(): - parser = argparse.ArgumentParser(description='Black-Scholes') - parser.add_argument('--options', dest='options', type=int, default=10000000) + parser = argparse.ArgumentParser(description="Black-Scholes") + parser.add_argument("--options", dest="options", type=int, default=10000000) args = parser.parse_args() options = args.options @@ -53,5 +56,6 @@ def main(): print("options = ", options) run(options) -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/numba_dppy/examples/dppy_func.py b/numba_dppy/examples/dppy_func.py index 353ba48995..9230bf64fb 100644 --- a/numba_dppy/examples/dppy_func.py +++ b/numba_dppy/examples/dppy_func.py @@ -39,5 +39,5 @@ def main(): print("No device found") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/numba_dppy/examples/dppy_with_context.py b/numba_dppy/examples/dppy_with_context.py index 6df025f5ca..0520582a2a 100644 --- a/numba_dppy/examples/dppy_with_context.py +++ b/numba_dppy/examples/dppy_with_context.py @@ -3,6 +3,7 @@ import numba_dppy, numba_dppy as dppy import dpctl + @njit def add_two_arrays(b, c): a = np.empty_like(b) @@ -20,14 +21,14 @@ def main(): if dpctl.has_gpu_queues(): with dpctl.device_context("opencl:gpu"): gpu_result = add_two_arrays(b, c) - print('GPU device found. Result on GPU:', gpu_result) + print("GPU device found. Result on GPU:", gpu_result) elif dpctl.has_cpu_queues(): with dpctl.device_context("opencl:cpu"): cpu_result = add_two_arrays(b, c) - print('CPU device found. Result on CPU:', cpu_result) + print("CPU device found. Result on CPU:", cpu_result) else: print("No device found") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/numba_dppy/examples/matmul.py b/numba_dppy/examples/matmul.py index b97ac49ca1..184e65235e 100644 --- a/numba_dppy/examples/matmul.py +++ b/numba_dppy/examples/matmul.py @@ -14,7 +14,7 @@ def dppy_gemm(a, b, c): j = dppy.get_global_id(1) if i >= c.shape[0] or j >= c.shape[1]: return - c[i,j] = 0 + c[i, j] = 0 for k in range(c.shape[0]): c[i, j] += a[i, k] * b[k, j] @@ -22,7 +22,7 @@ def dppy_gemm(a, b, c): # Array dimesnions X = 1024 Y = 16 -global_size = X,X +global_size = X, X griddim = X, X blockdim = Y, Y @@ -30,13 +30,13 @@ def dppy_gemm(a, b, c): def driver(a, b, c): # Invoke the kernel - dppy_gemm[griddim,blockdim](a, b, c) + dppy_gemm[griddim, blockdim](a, b, c) def main(): - a = np.arange(X*X, dtype=np.float32).reshape(X,X) - b = np.array(np.random.random(X*X), dtype=np.float32).reshape(X,X) - c = np.ones_like(a).reshape(X,X) + a = np.arange(X * X, dtype=np.float32).reshape(X, X) + b = np.array(np.random.random(X * X), dtype=np.float32).reshape(X, X) + c = np.ones_like(a).reshape(X, X) if dpctl.has_gpu_queues(): with dpctl.device_context("opencl:gpu") as gpu_queue: @@ -58,5 +58,5 @@ def main(): print("Done...") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/numba_dppy/examples/pa_examples/test1.py b/numba_dppy/examples/pa_examples/test1.py index 01209b3309..ffe715549a 100644 --- a/numba_dppy/examples/pa_examples/test1.py +++ b/numba_dppy/examples/pa_examples/test1.py @@ -31,5 +31,5 @@ def main(): break -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/numba_dppy/examples/pairwise_distance.py b/numba_dppy/examples/pairwise_distance.py index b72c41ba9c..0a0705ee81 100644 --- a/numba_dppy/examples/pairwise_distance.py +++ 
b/numba_dppy/examples/pairwise_distance.py @@ -1,7 +1,7 @@ from time import time import numba from numba import int32, float32 -from math import ceil,sqrt +from math import ceil, sqrt import numpy as np import argparse import timeit @@ -10,12 +10,12 @@ import dpctl import dpctl._memory as dpctl_mem -parser = argparse.ArgumentParser(description='Program to compute pairwise distance') +parser = argparse.ArgumentParser(description="Program to compute pairwise distance") -parser.add_argument('-n', type=int, default=10, help='Number of points') -parser.add_argument('-d', type=int, default=3, help='Dimensions') -parser.add_argument('-r', type=int, default=1, help='repeat') -parser.add_argument('-l', type=int, default=1, help='local_work_size') +parser.add_argument("-n", type=int, default=10, help="Number of points") +parser.add_argument("-d", type=int, default=3, help="Dimensions") +parser.add_argument("-r", type=int, default=1, help="repeat") +parser.add_argument("-l", type=int, default=1, help="local_work_size") args = parser.parse_args() @@ -32,7 +32,7 @@ def pairwise_distance(X, D, xshape0, xshape1): idx = dppy.get_global_id(0) - #for i in range(xshape0): + # for i in range(xshape0): for j in range(X.shape[0]): d = 0.0 for k in range(X.shape[1]): @@ -42,7 +42,7 @@ def pairwise_distance(X, D, xshape0, xshape1): def driver(): - #measure running time + # measure running time times = list() xbuf = dpctl_mem.MemoryUSMShared(X.size * X.dtype.itemsize) @@ -55,7 +55,9 @@ def driver(): for repeat in range(args.r): start = time() - pairwise_distance[global_size, local_size](x_ndarray, d_ndarray, X.shape[0], X.shape[1]) + pairwise_distance[global_size, local_size]( + x_ndarray, d_ndarray, X.shape[0], X.shape[1] + ) end = time() total_time = end - start @@ -80,9 +82,9 @@ def main(): print("No device found") exit() - times = np.asarray(times, dtype=np.float32) + times = np.asarray(times, dtype=np.float32) print("Average time of %d runs is = %fs" % (args.r, times.mean())) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/numba_dppy/examples/sum-hybrid.py b/numba_dppy/examples/sum-hybrid.py index e66c51ae2c..38cb708fa3 100644 --- a/numba_dppy/examples/sum-hybrid.py +++ b/numba_dppy/examples/sum-hybrid.py @@ -14,8 +14,8 @@ def data_parallel_sum(a, b, c): c[i] = a[i] + b[i] -N = 50*32 -global_size = N, +N = 50 * 32 +global_size = (N,) def main(): @@ -49,5 +49,5 @@ def main(): print("Done...") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/numba_dppy/examples/sum.py b/numba_dppy/examples/sum.py index fdc1623fa7..1f2a0f6c31 100644 --- a/numba_dppy/examples/sum.py +++ b/numba_dppy/examples/sum.py @@ -44,5 +44,5 @@ def main(): print("Done...") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/numba_dppy/examples/sum2D.py b/numba_dppy/examples/sum2D.py index 90959c8bdf..4e279cfc01 100644 --- a/numba_dppy/examples/sum2D.py +++ b/numba_dppy/examples/sum2D.py @@ -12,7 +12,7 @@ def data_parallel_sum(a, b, c): i = dppy.get_global_id(0) j = dppy.get_global_id(1) - c[i,j] = a[i,j] + b[i,j] + c[i, j] = a[i, j] + b[i, j] def driver(a, b, c, global_size): @@ -26,11 +26,11 @@ def main(): # Array dimesnions X = 8 Y = 8 - global_size = X,Y + global_size = X, Y - a = np.arange(X*Y, dtype=np.float32).reshape(X,Y) - b = np.array(np.random.random(X*Y), dtype=np.float32).reshape(X,Y) - c = np.ones_like(a).reshape(X,Y) + a = np.arange(X * Y, dtype=np.float32).reshape(X, Y) + b = np.array(np.random.random(X * Y), dtype=np.float32).reshape(X, Y) + c = 
np.ones_like(a).reshape(X, Y) if dpctl.has_gpu_queues(): with dpctl.device_context("opencl:gpu") as gpu_queue: @@ -45,5 +45,5 @@ def main(): print("Done...") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/numba_dppy/examples/sum_ndarray.py b/numba_dppy/examples/sum_ndarray.py index 2aea8e080a..fedf722000 100644 --- a/numba_dppy/examples/sum_ndarray.py +++ b/numba_dppy/examples/sum_ndarray.py @@ -8,7 +8,9 @@ import dpctl -@dppy.kernel(access_types={"read_only": ['a', 'b'], "write_only": ['c'], "read_write": []}) +@dppy.kernel( + access_types={"read_only": ["a", "b"], "write_only": ["c"], "read_write": []} +) def data_parallel_sum(a, b, c): i = dppy.get_global_id(0) c[i] = a[i] + b[i] @@ -45,5 +47,5 @@ def main(): print("Done...") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/numba_dppy/examples/sum_reduction.py b/numba_dppy/examples/sum_reduction.py index 367fa37952..c4d7a171e4 100644 --- a/numba_dppy/examples/sum_reduction.py +++ b/numba_dppy/examples/sum_reduction.py @@ -12,7 +12,7 @@ def reduction_kernel(A, R, stride): i = dppy.get_global_id(0) # sum two element - R[i] = A[i] + A[i+stride] + R[i] = A[i] + A[i + stride] # store the sum to be used in nex iteration A[i] = R[i] @@ -20,21 +20,23 @@ def reduction_kernel(A, R, stride): def test_sum_reduction(): # This test will only work for size = power of two N = 2048 - assert(N%2 == 0) + assert N % 2 == 0 A = np.array(np.random.random(N), dtype=np.float32) A_copy = A.copy() # at max we will require half the size of A to store sum - R = np.array(np.random.random(math.ceil(N/2)), dtype=np.float32) + R = np.array(np.random.random(math.ceil(N / 2)), dtype=np.float32) if dpctl.has_gpu_queues(): with dpctl.device_context("opencl:gpu") as gpu_queue: total = N - while (total > 1): + while total > 1: # call kernel global_size = total // 2 - reduction_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, R, global_size) + reduction_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE]( + A, R, global_size + ) total = total // 2 else: @@ -43,7 +45,8 @@ def test_sum_reduction(): result = A_copy.sum() max_abs_err = result - R[0] - assert(max_abs_err < 1e-2) + assert max_abs_err < 1e-2 -if __name__ == '__main__': + +if __name__ == "__main__": test_sum_reduction() diff --git a/numba_dppy/examples/sum_reduction_ocl.py b/numba_dppy/examples/sum_reduction_ocl.py index 8d8e0411aa..c3903cc8f2 100644 --- a/numba_dppy/examples/sum_reduction_ocl.py +++ b/numba_dppy/examples/sum_reduction_ocl.py @@ -6,13 +6,14 @@ import dpctl + def sum_reduction_device_plus_host(): @dppy.kernel def sum_reduction_kernel(inp, partial_sums): - local_id = dppy.get_local_id(0) - global_id = dppy.get_global_id(0) + local_id = dppy.get_local_id(0) + global_id = dppy.get_global_id(0) group_size = dppy.get_local_size(0) - group_id = dppy.get_group_id(0) + group_id = dppy.get_group_id(0) local_sums = dppy.local.static_alloc(64, int32) @@ -21,12 +22,12 @@ def sum_reduction_kernel(inp, partial_sums): # Loop for computing local_sums : divide workgroup into 2 parts stride = group_size // 2 - while (stride > 0): + while stride > 0: # Waiting for each 2x2 addition into given workgroup dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE) # Add elements 2 by 2 between local_id and local_id + stride - if (local_id < stride): + if local_id < stride: local_sums[local_id] += local_sums[local_id + stride] stride >>= 1 @@ -55,9 +56,9 @@ def sum_reduction_kernel(inp, partial_sums): for i in range(nb_work_groups): final_sum += partial_sums[i] - assert(final_sum == global_size) 
+ assert final_sum == global_size print("Expected:", global_size, "--- GOT:", final_sum) -if __name__ == '__main__': +if __name__ == "__main__": sum_reduction_device_plus_host() diff --git a/numba_dppy/examples/sum_reduction_recursive_ocl.py b/numba_dppy/examples/sum_reduction_recursive_ocl.py index c5dd6daa47..eb06cb2054 100644 --- a/numba_dppy/examples/sum_reduction_recursive_ocl.py +++ b/numba_dppy/examples/sum_reduction_recursive_ocl.py @@ -8,16 +8,13 @@ import dpctl._memory as dpctl_mem -def recursive_reduction(size, group_size, - Dinp, Dpartial_sums): - +def recursive_reduction(size, group_size, Dinp, Dpartial_sums): @dppy.kernel - def sum_reduction_kernel(inp, input_size, - partial_sums): - local_id = dppy.get_local_id(0) - global_id = dppy.get_global_id(0) + def sum_reduction_kernel(inp, input_size, partial_sums): + local_id = dppy.get_local_id(0) + global_id = dppy.get_global_id(0) group_size = dppy.get_local_size(0) - group_id = dppy.get_group_id(0) + group_id = dppy.get_group_id(0) local_sums = dppy.local.static_alloc(64, int32) @@ -28,12 +25,12 @@ def sum_reduction_kernel(inp, input_size, # Loop for computing local_sums : divide workgroup into 2 parts stride = group_size // 2 - while (stride > 0): + while stride > 0: # Waiting for each 2x2 addition into given workgroup dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE) # Add elements 2 by 2 between local_id and local_id + stride - if (local_id < stride): + if local_id < stride: local_sums[local_id] += local_sums[local_id + stride] stride >>= 1 @@ -41,27 +38,27 @@ def sum_reduction_kernel(inp, input_size, if local_id == 0: partial_sums[group_id] = local_sums[0] - result = 0 nb_work_groups = 0 passed_size = size - if (size <= group_size): + if size <= group_size: nb_work_groups = 1 else: - nb_work_groups = size // group_size; - if (size % group_size != 0): + nb_work_groups = size // group_size + if size % group_size != 0: nb_work_groups += 1 passed_size = nb_work_groups * group_size sum_reduction_kernel[passed_size, group_size](Dinp, size, Dpartial_sums) if nb_work_groups <= group_size: - sum_reduction_kernel[group_size, group_size](Dpartial_sums, nb_work_groups, Dinp) + sum_reduction_kernel[group_size, group_size]( + Dpartial_sums, nb_work_groups, Dinp + ) result = Dinp[0] else: - result = recursive_reduction(nb_work_groups, group_size, - Dpartial_sums, Dinp) + result = recursive_reduction(nb_work_groups, group_size, Dpartial_sums, Dinp) return result @@ -76,27 +73,30 @@ def sum_reduction_recursive(): inp = np.ones(global_size).astype(np.int32) partial_sums = np.zeros(nb_work_groups).astype(np.int32) - if dpctl.has_gpu_queues(): with dpctl.device_context("opencl:gpu") as gpu_queue: inp_buf = dpctl_mem.MemoryUSMShared(inp.size * inp.dtype.itemsize) inp_ndarray = np.ndarray(inp.shape, buffer=inp_buf, dtype=inp.dtype) np.copyto(inp_ndarray, inp) - partial_sums_buf = dpctl_mem.MemoryUSMShared(partial_sums.size * partial_sums.dtype.itemsize) - partial_sums_ndarray = np.ndarray(partial_sums.shape, buffer=partial_sums_buf, dtype=partial_sums.dtype) + partial_sums_buf = dpctl_mem.MemoryUSMShared( + partial_sums.size * partial_sums.dtype.itemsize + ) + partial_sums_ndarray = np.ndarray( + partial_sums.shape, buffer=partial_sums_buf, dtype=partial_sums.dtype + ) np.copyto(partial_sums_ndarray, partial_sums) print("Running recursive reduction") - result = recursive_reduction(global_size, work_group_size, - inp_ndarray, partial_sums_ndarray) + result = recursive_reduction( + global_size, work_group_size, inp_ndarray, partial_sums_ndarray + ) else: 
print("No device found") exit() - print("Expected:", global_size, "--- GOT:", result) - assert(result == global_size) + assert result == global_size sum_reduction_recursive() diff --git a/numba_dppy/initialize.py b/numba_dppy/initialize.py index 2a2c70f796..3894335051 100644 --- a/numba_dppy/initialize.py +++ b/numba_dppy/initialize.py @@ -6,24 +6,32 @@ def init_jit(): from numba_dppy.dispatcher import DPPYDispatcher + return DPPYDispatcher + def initialize_all(): from numba.core.registry import dispatcher_registry - dispatcher_registry.ondemand['dppy'] = init_jit + + dispatcher_registry.ondemand["dppy"] = init_jit import dpctl import glob import platform as plt + platform = plt.system() - if platform == 'Windows': - paths = glob.glob(os.path.join(os.path.dirname(dpctl.__file__), '*DPCTLSyclInterface.dll')) + if platform == "Windows": + paths = glob.glob( + os.path.join(os.path.dirname(dpctl.__file__), "*DPCTLSyclInterface.dll") + ) else: - paths = glob.glob(os.path.join(os.path.dirname(dpctl.__file__), '*DPCTLSyclInterface*')) + paths = glob.glob( + os.path.join(os.path.dirname(dpctl.__file__), "*DPCTLSyclInterface*") + ) if len(paths) == 1: ll.load_library_permanently(paths[0]) else: raise ImportError - ll.load_library_permanently(find_library('OpenCL')) + ll.load_library_permanently(find_library("OpenCL")) diff --git a/numba_dppy/numpy_usm_shared.py b/numba_dppy/numpy_usm_shared.py index 83358ac7e9..62ecc7435b 100644 --- a/numba_dppy/numpy_usm_shared.py +++ b/numba_dppy/numpy_usm_shared.py @@ -20,8 +20,12 @@ import importlib import functools import inspect -from numba.core.typing.templates import (CallableTemplate, AttributeTemplate, - signature, bound_function) +from numba.core.typing.templates import ( + CallableTemplate, + AttributeTemplate, + signature, + bound_function, +) from numba.np.arrayobj import _array_copy import dpctl.dptensor.numpy_usm_shared as nus @@ -30,11 +34,13 @@ debug = config.DEBUG + def dprint(*args): if debug: print(*args) sys.stdout.flush() + # # This code makes it so that Numba can contain calls into the DPPLSyclInterface library. 
# sycl_mem_lib = find_library('DPCTLSyclInterface') # dprint("sycl_mem_lib:", sycl_mem_lib) @@ -154,6 +160,7 @@ def allocator_UsmArray(context, builder, size, align): registered = False + def is_usm_callback(obj): if isinstance(obj, numba.core.runtime._nrt_python._MemInfo): mobj = obj @@ -168,6 +175,7 @@ def is_usm_callback(obj): mobj = mobj.base return False + def numba_register(): global registered if not registered: @@ -271,7 +279,7 @@ def numba_register_typing(): todo.append(ig) elif isinstance(typ, numba.core.types.functions.NumberClass): pass - #todo_classes.append(ig) + # todo_classes.append(ig) for tgetattr in templates_registry.attributes: dprint("Numpy getattr:", tgetattr, type(tgetattr), tgetattr.key) @@ -287,7 +295,9 @@ def numba_register_typing(): dprint("failed to eval", val.__name__) continue - typing_registry.register_global(dptype, numba.core.types.NumberClass(typ.instance_type)) + typing_registry.register_global( + dptype, numba.core.types.NumberClass(typ.instance_type) + ) for val, typ in todo: assert len(typ.templates) == 1 @@ -415,6 +425,7 @@ def wrapper(*args, **kwargs): templates_registry.register_attr(new_usmarray_template) """ + class UsmArrayAttribute(AttributeTemplate): key = UsmSharedArrayType @@ -447,7 +458,9 @@ def resolve_flags(self, ary): def convert_array_to_usmarray(self, retty): if isinstance(retty, types.Array): - return UsmSharedArrayType(dtype=retty.dtype, ndim=retty.ndim, layout=retty.layout) + return UsmSharedArrayType( + dtype=retty.dtype, ndim=retty.ndim, layout=retty.layout + ) else: return retty @@ -460,23 +473,24 @@ def resolve_T(self, ary): return self.convert_array_to_usmarray(retty) def resolve_real(self, ary): - return self._resolve_real_imag(ary, attr='real') + return self._resolve_real_imag(ary, attr="real") def resolve_imag(self, ary): - return self._resolve_real_imag(ary, attr='imag') + return self._resolve_real_imag(ary, attr="imag") def _resolve_real_imag(self, ary, attr): if ary.dtype in types.complex_domain: - return ary.copy(dtype=ary.dtype.underlying_float, layout='A') + return ary.copy(dtype=ary.dtype.underlying_float, layout="A") elif ary.dtype in types.number_domain: res = ary.copy(dtype=ary.dtype) - if attr == 'imag': + if attr == "imag": res = res.copy(readonly=True) return self.convert_array_to_usmarray(res) else: msg = "cannot access .{} of array of {}" raise TypingError(msg.format(attr, ary.dtype)) + """ @bound_function("array.transpose") def resolve_transpose(self, ary, args, kws): diff --git a/numba_dppy/ocl/atomics/__init__.py b/numba_dppy/ocl/atomics/__init__.py index aa6fdf5dfd..d3ac430da0 100644 --- a/numba_dppy/ocl/atomics/__init__.py +++ b/numba_dppy/ocl/atomics/__init__.py @@ -1,22 +1,25 @@ import os import os.path + def atomic_support_present(): - if os.path.isfile(os.path.join(os.path.dirname(__file__), 'atomic_ops.spir')): + if os.path.isfile(os.path.join(os.path.dirname(__file__), "atomic_ops.spir")): return True else: return False + def get_atomic_spirv_path(): if atomic_support_present(): - return os.path.join(os.path.dirname(__file__), 'atomic_ops.spir') + return os.path.join(os.path.dirname(__file__), "atomic_ops.spir") else: return None + def read_atomic_spirv_file(): path = get_atomic_spirv_path() if path: - with open(path, 'rb') as fin: + with open(path, "rb") as fin: spirv = fin.read() return spirv else: diff --git a/numba_dppy/ocl/mathdecl.py b/numba_dppy/ocl/mathdecl.py index 442e269cb4..7ce5e15d95 100644 --- a/numba_dppy/ocl/mathdecl.py +++ b/numba_dppy/ocl/mathdecl.py @@ -1,8 +1,12 @@ from 
__future__ import print_function, absolute_import, division import math from numba.core import types, utils -from numba.core.typing.templates import (AttributeTemplate, ConcreteTemplate, - signature, Registry) +from numba.core.typing.templates import ( + AttributeTemplate, + ConcreteTemplate, + signature, + Registry, +) registry = Registry() builtin_attr = registry.register_attr @@ -253,12 +257,15 @@ class Math_degrees(Math_unary): class Math_erf(Math_unary): key = math.erf + class Math_erfc(Math_unary): key = math.erfc + class Math_gamma(Math_unary): key = math.gamma + class Math_lgamma(Math_unary): key = math.lgamma diff --git a/numba_dppy/ocl/mathimpl.py b/numba_dppy/ocl/mathimpl.py index 86c8195046..bb2f9cb7eb 100644 --- a/numba_dppy/ocl/mathimpl.py +++ b/numba_dppy/ocl/mathimpl.py @@ -25,58 +25,50 @@ _binary_d_dl = types.float64(types.float64, types.int64) sig_mapper = { - 'f->f' : _unary_f_f, - 'd->d' : _unary_d_d, - 'ff->f': _binary_f_ff, - 'dd->d': _binary_d_dd, - 'fi->f': _binary_f_fi, - 'fl->f': _binary_f_fl, - 'di->d': _binary_d_di, - 'dl->d': _binary_d_dl, - } + "f->f": _unary_f_f, + "d->d": _unary_d_d, + "ff->f": _binary_f_ff, + "dd->d": _binary_d_dd, + "fi->f": _binary_f_fi, + "fl->f": _binary_f_fl, + "di->d": _binary_d_di, + "dl->d": _binary_d_dl, +} function_descriptors = { - 'isnan': (_unary_b_f, _unary_b_d), - 'isinf': (_unary_b_f, _unary_b_d), - - 'ceil': (_unary_f_f, _unary_d_d), - 'floor': (_unary_f_f, _unary_d_d), - 'trunc': (_unary_f_f, _unary_d_d), - - 'fabs': (_unary_f_f, _unary_d_d), - - 'sqrt': (_unary_f_f, _unary_d_d), - 'exp': (_unary_f_f, _unary_d_d), - 'expm1': (_unary_f_f, _unary_d_d), - 'log': (_unary_f_f, _unary_d_d), - 'log10': (_unary_f_f, _unary_d_d), - 'log1p': (_unary_f_f, _unary_d_d), - - 'sin': (_unary_f_f, _unary_d_d), - 'cos': (_unary_f_f, _unary_d_d), - 'tan': (_unary_f_f, _unary_d_d), - 'asin': (_unary_f_f, _unary_d_d), - 'acos': (_unary_f_f, _unary_d_d), - 'atan': (_unary_f_f, _unary_d_d), - 'sinh': (_unary_f_f, _unary_d_d), - 'cosh': (_unary_f_f, _unary_d_d), - 'tanh': (_unary_f_f, _unary_d_d), - 'asinh': (_unary_f_f, _unary_d_d), - 'acosh': (_unary_f_f, _unary_d_d), - 'atanh': (_unary_f_f, _unary_d_d), - - 'copysign': (_binary_f_ff, _binary_d_dd), - 'atan2': (_binary_f_ff, _binary_d_dd), - 'pow': (_binary_f_ff, _binary_d_dd), - 'fmod': (_binary_f_ff, _binary_d_dd), - - 'erf': (_unary_f_f, _unary_d_d), - 'erfc': (_unary_f_f, _unary_d_d), - 'gamma': (_unary_f_f, _unary_d_d), - 'lgamma': (_unary_f_f, _unary_d_d), - - 'ldexp': (_binary_f_fi, _binary_f_fl, _binary_d_di, _binary_d_dl), - + "isnan": (_unary_b_f, _unary_b_d), + "isinf": (_unary_b_f, _unary_b_d), + "ceil": (_unary_f_f, _unary_d_d), + "floor": (_unary_f_f, _unary_d_d), + "trunc": (_unary_f_f, _unary_d_d), + "fabs": (_unary_f_f, _unary_d_d), + "sqrt": (_unary_f_f, _unary_d_d), + "exp": (_unary_f_f, _unary_d_d), + "expm1": (_unary_f_f, _unary_d_d), + "log": (_unary_f_f, _unary_d_d), + "log10": (_unary_f_f, _unary_d_d), + "log1p": (_unary_f_f, _unary_d_d), + "sin": (_unary_f_f, _unary_d_d), + "cos": (_unary_f_f, _unary_d_d), + "tan": (_unary_f_f, _unary_d_d), + "asin": (_unary_f_f, _unary_d_d), + "acos": (_unary_f_f, _unary_d_d), + "atan": (_unary_f_f, _unary_d_d), + "sinh": (_unary_f_f, _unary_d_d), + "cosh": (_unary_f_f, _unary_d_d), + "tanh": (_unary_f_f, _unary_d_d), + "asinh": (_unary_f_f, _unary_d_d), + "acosh": (_unary_f_f, _unary_d_d), + "atanh": (_unary_f_f, _unary_d_d), + "copysign": (_binary_f_ff, _binary_d_dd), + "atan2": (_binary_f_ff, _binary_d_dd), + "pow": (_binary_f_ff, 
_binary_d_dd), + "fmod": (_binary_f_ff, _binary_d_dd), + "erf": (_unary_f_f, _unary_d_d), + "erfc": (_unary_f_f, _unary_d_d), + "gamma": (_unary_f_f, _unary_d_d), + "lgamma": (_unary_f_f, _unary_d_d), + "ldexp": (_binary_f_fi, _binary_f_fl, _binary_d_di, _binary_d_dl), # unsupported functions listed in the math module documentation: # frexp, ldexp, trunc, modf, factorial, fsum } @@ -84,17 +76,16 @@ # some functions may be named differently by the underlying math # library as oposed to the Python name. -_lib_counterpart = { - 'gamma': 'tgamma' -} +_lib_counterpart = {"gamma": "tgamma"} def _mk_fn_decl(name, decl_sig): sym = _lib_counterpart.get(name, name) def core(context, builder, sig, args): - fn = _declare_function(context, builder, sym, decl_sig, decl_sig.args, - mangler=mangle) + fn = _declare_function( + context, builder, sym, decl_sig, decl_sig.args, mangler=mangle + ) res = builder.call(fn, args) return context.cast(builder, res, decl_sig.return_type, sig.return_type) @@ -102,16 +93,46 @@ def core(context, builder, sig, args): return core -_supported = ['sin', 'cos', 'tan', 'asin', 'acos', 'atan', 'atan2', 'sinh', - 'cosh', 'tanh', 'asinh', 'acosh', 'atanh', 'isnan', 'isinf', - 'ceil', 'floor', 'fabs', 'sqrt', 'exp', 'expm1', 'log', - 'log10', 'log1p', 'copysign', 'pow', 'fmod', 'erf', 'erfc', - 'gamma', 'lgamma', 'ldexp', 'trunc' - ] +_supported = [ + "sin", + "cos", + "tan", + "asin", + "acos", + "atan", + "atan2", + "sinh", + "cosh", + "tanh", + "asinh", + "acosh", + "atanh", + "isnan", + "isinf", + "ceil", + "floor", + "fabs", + "sqrt", + "exp", + "expm1", + "log", + "log10", + "log1p", + "copysign", + "pow", + "fmod", + "erf", + "erfc", + "gamma", + "lgamma", + "ldexp", + "trunc", +] lower_ocl_impl = dict() + def function_name_to_supported_decl(name, sig): try: # only symbols present in the math module @@ -120,7 +141,7 @@ def function_name_to_supported_decl(name, sig): return None fn = _mk_fn_decl(name, sig) - #lower(key, *sig.args)(fn) + # lower(key, *sig.args)(fn) lower_ocl_impl[(name, sig)] = lower(key, *sig.args)(fn) diff --git a/numba_dppy/ocl/ocldecl.py b/numba_dppy/ocl/ocldecl.py index adf14a1815..a5aa6545e5 100644 --- a/numba_dppy/ocl/ocldecl.py +++ b/numba_dppy/ocl/ocldecl.py @@ -1,17 +1,23 @@ from __future__ import print_function, division, absolute_import from numba import types from numba.core.typing.npydecl import register_number_classes -from numba.core.typing.templates import (AttributeTemplate, ConcreteTemplate, - AbstractTemplate, MacroTemplate, - signature, Registry) +from numba.core.typing.templates import ( + AttributeTemplate, + ConcreteTemplate, + AbstractTemplate, + MacroTemplate, + signature, + Registry, +) import numba_dppy, numba_dppy as dppy registry = Registry() intrinsic = registry.register intrinsic_attr = registry.register_attr -#intrinsic_global = registry.register_global +# intrinsic_global = registry.register_global + +# register_number_classes(intrinsic_global) -#register_number_classes(intrinsic_global) @intrinsic class Ocl_get_global_id(ConcreteTemplate): @@ -58,8 +64,7 @@ class Ocl_get_local_size(ConcreteTemplate): @intrinsic class Ocl_barrier(ConcreteTemplate): key = dppy.barrier - cases = [signature(types.void, types.uint32), - signature(types.void)] + cases = [signature(types.void, types.uint32), signature(types.void)] @intrinsic @@ -77,6 +82,7 @@ class Ocl_sub_group_barrier(ConcreteTemplate): # dppy.atomic submodule ------------------------------------------------------- + @intrinsic class Ocl_atomic_add(AbstractTemplate): key = 
dppy.atomic.add @@ -90,6 +96,7 @@ def generic(self, args, kws): elif ary.ndim > 1: return signature(ary.dtype, ary, idx, ary.dtype) + @intrinsic class Ocl_atomic_sub(AbstractTemplate): key = dppy.atomic.sub @@ -117,6 +124,7 @@ def resolve_sub(self, mod): # dppy.local submodule ------------------------------------------------------- + class Ocl_local_alloc(MacroTemplate): key = dppy.local.static_alloc @@ -131,6 +139,7 @@ def resolve_static_alloc(self, mod): # OpenCL module -------------------------------------------------------------- + @intrinsic_attr class OclModuleTemplate(AttributeTemplate): key = types.Module(dppy) @@ -171,6 +180,7 @@ def resolve_atomic(self, mod): def resolve_local(self, mod): return types.Module(dppy.local) + # intrinsic -#intrinsic_global(dppy, types.Module(dppy)) +# intrinsic_global(dppy, types.Module(dppy)) diff --git a/numba_dppy/ocl/oclimpl.py b/numba_dppy/ocl/oclimpl.py index 26f8482799..893a37c6a8 100644 --- a/numba_dppy/ocl/oclimpl.py +++ b/numba_dppy/ocl/oclimpl.py @@ -24,8 +24,7 @@ # ----------------------------------------------------------------------------- -def _declare_function(context, builder, name, sig, cargs, - mangler=mangle_c): +def _declare_function(context, builder, name, sig, cargs, mangler=mangle_c): """Insert declaration for a opencl builtin function. Uses the Itanium mangler. @@ -60,11 +59,13 @@ def _declare_function(context, builder, name, sig, cargs, fn.calling_convention = target.CC_SPIR_FUNC return fn + @lower(stubs.get_global_id, types.uint32) def get_global_id_impl(context, builder, sig, args): [dim] = args - get_global_id = _declare_function(context, builder, 'get_global_id', sig, - ['unsigned int']) + get_global_id = _declare_function( + context, builder, "get_global_id", sig, ["unsigned int"] + ) res = builder.call(get_global_id, [dim]) return context.cast(builder, res, types.uintp, types.intp) @@ -72,8 +73,9 @@ def get_global_id_impl(context, builder, sig, args): @lower(stubs.get_local_id, types.uint32) def get_local_id_impl(context, builder, sig, args): [dim] = args - get_local_id = _declare_function(context, builder, 'get_local_id', sig, - ['unsigned int']) + get_local_id = _declare_function( + context, builder, "get_local_id", sig, ["unsigned int"] + ) res = builder.call(get_local_id, [dim]) return context.cast(builder, res, types.uintp, types.intp) @@ -81,8 +83,9 @@ def get_local_id_impl(context, builder, sig, args): @lower(stubs.get_group_id, types.uint32) def get_group_id_impl(context, builder, sig, args): [dim] = args - get_group_id = _declare_function(context, builder, 'get_group_id', sig, - ['unsigned int']) + get_group_id = _declare_function( + context, builder, "get_group_id", sig, ["unsigned int"] + ) res = builder.call(get_group_id, [dim]) return context.cast(builder, res, types.uintp, types.intp) @@ -90,16 +93,16 @@ def get_group_id_impl(context, builder, sig, args): @lower(stubs.get_num_groups, types.uint32) def get_num_groups_impl(context, builder, sig, args): [dim] = args - get_num_groups = _declare_function(context, builder, 'get_num_groups', sig, - ['unsigned int']) + get_num_groups = _declare_function( + context, builder, "get_num_groups", sig, ["unsigned int"] + ) res = builder.call(get_num_groups, [dim]) return context.cast(builder, res, types.uintp, types.intp) @lower(stubs.get_work_dim) def get_work_dim_impl(context, builder, sig, args): - get_work_dim = _declare_function(context, builder, 'get_work_dim', sig, - ["void"]) + get_work_dim = _declare_function(context, builder, "get_work_dim", sig, ["void"]) 
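For reference (illustration, not part of the patch): the templates registered in ocldecl.py and the lowerers in oclimpl.py back the work-item and barrier intrinsics that user kernels call. A minimal kernel sketch mirroring the tests later in this series, assuming an OpenCL GPU queue is available and that numba_dppy re-exports the stubs (get_global_id, barrier, CLK_LOCAL_MEM_FENCE) the way those tests use them:

import numpy as np
import dpctl
import numba_dppy as dppy

@dppy.kernel
def twice(A):
    i = dppy.get_global_id(0)                # typed by Ocl_get_global_id, lowered via _declare_function
    d = A[i]
    dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE)   # one-argument barrier form declared above
    A[i] = d * 2

arr = np.arange(32, dtype=np.float32)
with dpctl.device_context("opencl:gpu"):
    twice[arr.size, dppy.DEFAULT_LOCAL_SIZE](arr)
np.testing.assert_allclose(arr, np.arange(32, dtype=np.float32) * 2)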
res = builder.call(get_work_dim, []) return res @@ -107,8 +110,9 @@ def get_work_dim_impl(context, builder, sig, args): @lower(stubs.get_global_size, types.uint32) def get_global_size_impl(context, builder, sig, args): [dim] = args - get_global_size = _declare_function(context, builder, 'get_global_size', - sig, ['unsigned int']) + get_global_size = _declare_function( + context, builder, "get_global_size", sig, ["unsigned int"] + ) res = builder.call(get_global_size, [dim]) return context.cast(builder, res, types.uintp, types.intp) @@ -116,8 +120,9 @@ def get_global_size_impl(context, builder, sig, args): @lower(stubs.get_local_size, types.uint32) def get_local_size_impl(context, builder, sig, args): [dim] = args - get_local_size = _declare_function(context, builder, 'get_local_size', - sig, ['unsigned int']) + get_local_size = _declare_function( + context, builder, "get_local_size", sig, ["unsigned int"] + ) res = builder.call(get_local_size, [dim]) return context.cast(builder, res, types.uintp, types.intp) @@ -125,17 +130,16 @@ def get_local_size_impl(context, builder, sig, args): @lower(stubs.barrier, types.uint32) def barrier_one_arg_impl(context, builder, sig, args): [flags] = args - barrier = _declare_function(context, builder, 'barrier', sig, - ['unsigned int']) + barrier = _declare_function(context, builder, "barrier", sig, ["unsigned int"]) builder.call(barrier, [flags]) return _void_value + @lower(stubs.barrier) def barrier_no_arg_impl(context, builder, sig, args): assert not args sig = types.void(types.uint32) - barrier = _declare_function(context, builder, 'barrier', sig, - ['unsigned int']) + barrier = _declare_function(context, builder, "barrier", sig, ["unsigned int"]) flags = context.get_constant(types.uint32, stubs.CLK_GLOBAL_MEM_FENCE) builder.call(barrier, [flags]) return _void_value @@ -144,8 +148,7 @@ def barrier_no_arg_impl(context, builder, sig, args): @lower(stubs.mem_fence, types.uint32) def mem_fence_impl(context, builder, sig, args): [flags] = args - mem_fence = _declare_function(context, builder, 'mem_fence', sig, - ['unsigned int']) + mem_fence = _declare_function(context, builder, "mem_fence", sig, ["unsigned int"]) builder.call(mem_fence, [flags]) return _void_value @@ -154,15 +157,15 @@ def mem_fence_impl(context, builder, sig, args): def sub_group_barrier_impl(context, builder, sig, args): assert not args sig = types.void(types.uint32) - barrier = _declare_function(context, builder, 'barrier', sig, - ['unsigned int']) + barrier = _declare_function(context, builder, "barrier", sig, ["unsigned int"]) flags = context.get_constant(types.uint32, stubs.CLK_LOCAL_MEM_FENCE) builder.call(barrier, [flags]) return _void_value -def insert_and_call_atomic_fn(context, builder, sig, fn_type, - dtype, ptr, val, addrspace): +def insert_and_call_atomic_fn( + context, builder, sig, fn_type, dtype, ptr, val, addrspace +): ll_p = None name = "" if dtype.name == "int32" or dtype.name == "uint32": @@ -173,11 +176,10 @@ def insert_and_call_atomic_fn(context, builder, sig, fn_type, elif fn_type == "sub": name = "numba_dppy_atomic_sub_i32" else: - raise TypeError("Operation type is not supported %s" % - (fn_type)) + raise TypeError("Operation type is not supported %s" % (fn_type)) elif dtype.name == "int64" or dtype.name == "uint64": # dpctl needs to expose same functions() - #if device_env.device_support_int64_atomics(): + # if device_env.device_support_int64_atomics(): if True: ll_val = ir.IntType(64) ll_p = ll_val.as_pointer() @@ -186,9 +188,8 @@ def 
insert_and_call_atomic_fn(context, builder, sig, fn_type, elif fn_type == "sub": name = "numba_dppy_atomic_sub_i64" else: - raise TypeError("Operation type is not supported %s" % - (fn_type)) - #else: + raise TypeError("Operation type is not supported %s" % (fn_type)) + # else: # raise TypeError("Current device does not support atomic " + # "operations on 64-bit Integer") elif dtype.name == "float32": @@ -199,10 +200,9 @@ def insert_and_call_atomic_fn(context, builder, sig, fn_type, elif fn_type == "sub": name = "numba_dppy_atomic_sub_f32" else: - raise TypeError("Operation type is not supported %s" % - (fn_type)) + raise TypeError("Operation type is not supported %s" % (fn_type)) elif dtype.name == "float64": - #if device_env.device_support_float64_atomics(): + # if device_env.device_support_float64_atomics(): # dpctl needs to expose same functions() if True: ll_val = ir.DoubleType() @@ -212,22 +212,20 @@ def insert_and_call_atomic_fn(context, builder, sig, fn_type, elif fn_type == "sub": name = "numba_dppy_atomic_sub_f64" else: - raise TypeError("Operation type is not supported %s" % - (fn_type)) - #else: + raise TypeError("Operation type is not supported %s" % (fn_type)) + # else: # raise TypeError("Current device does not support atomic " + # "operations on 64-bit Float") else: - raise TypeError("Atomic operation is not supported for type %s" % - (dtype.name)) + raise TypeError("Atomic operation is not supported for type %s" % (dtype.name)) if addrspace == target.SPIR_LOCAL_ADDRSPACE: name = name + "_local" else: name = name + "_global" - assert(ll_p != None) - assert(name != "") + assert ll_p != None + assert name != "" ll_p.addrspace = target.SPIR_GENERIC_ADDRSPACE mod = builder.module @@ -242,19 +240,17 @@ def insert_and_call_atomic_fn(context, builder, sig, fn_type, fn = mod.get_or_insert_function(fnty, name) fn.calling_convention = target.CC_SPIR_FUNC - generic_ptr = context.addrspacecast(builder, ptr, - target.SPIR_GENERIC_ADDRSPACE) + generic_ptr = context.addrspacecast(builder, ptr, target.SPIR_GENERIC_ADDRSPACE) return builder.call(fn, [generic_ptr, val]) @lower(stubs.atomic.add, types.Array, types.intp, types.Any) -@lower(stubs.atomic.add, types.Array, - types.UniTuple, types.Any) -@lower(stubs.atomic.add, types.Array, types.Tuple, - types.Any) +@lower(stubs.atomic.add, types.Array, types.UniTuple, types.Any) +@lower(stubs.atomic.add, types.Array, types.Tuple, types.Any) def atomic_add_tuple(context, builder, sig, args): from .atomics import atomic_support_present + if atomic_support_present(): context.link_binaries[target.LINK_ATOMIC] = True aryty, indty, valty = sig.args @@ -266,36 +262,53 @@ def atomic_add_tuple(context, builder, sig, args): indty = [indty] else: indices = cgutils.unpack_tuple(builder, inds, count=len(indty)) - indices = [context.cast(builder, i, t, types.intp) - for t, i in zip(indty, indices)] + indices = [ + context.cast(builder, i, t, types.intp) for t, i in zip(indty, indices) + ] if dtype != valty: raise TypeError("expecting %s but got %s" % (dtype, valty)) if aryty.ndim != len(indty): - raise TypeError("indexing %d-D array with %d-D index" % - (aryty.ndim, len(indty))) + raise TypeError( + "indexing %d-D array with %d-D index" % (aryty.ndim, len(indty)) + ) lary = context.make_array(aryty)(context, builder, ary) ptr = cgutils.get_item_pointer(context, builder, aryty, lary, indices) if aryty.addrspace == target.SPIR_LOCAL_ADDRSPACE: - return insert_and_call_atomic_fn(context, builder, sig, "add", dtype, - ptr, val, target.SPIR_LOCAL_ADDRSPACE) + 
return insert_and_call_atomic_fn( + context, + builder, + sig, + "add", + dtype, + ptr, + val, + target.SPIR_LOCAL_ADDRSPACE, + ) else: - return insert_and_call_atomic_fn(context, builder, sig, "add", dtype, - ptr, val, target.SPIR_GLOBAL_ADDRSPACE) + return insert_and_call_atomic_fn( + context, + builder, + sig, + "add", + dtype, + ptr, + val, + target.SPIR_GLOBAL_ADDRSPACE, + ) else: raise ImportError("Atomic support is not present, can not perform atomic_add") @lower(stubs.atomic.sub, types.Array, types.intp, types.Any) -@lower(stubs.atomic.sub, types.Array, - types.UniTuple, types.Any) -@lower(stubs.atomic.sub, types.Array, types.Tuple, - types.Any) +@lower(stubs.atomic.sub, types.Array, types.UniTuple, types.Any) +@lower(stubs.atomic.sub, types.Array, types.Tuple, types.Any) def atomic_sub_tuple(context, builder, sig, args): from .atomics import atomic_support_present + if atomic_support_present(): context.link_binaries[target.LINK_ATOMIC] = True aryty, indty, valty = sig.args @@ -307,36 +320,58 @@ def atomic_sub_tuple(context, builder, sig, args): indty = [indty] else: indices = cgutils.unpack_tuple(builder, inds, count=len(indty)) - indices = [context.cast(builder, i, t, types.intp) - for t, i in zip(indty, indices)] + indices = [ + context.cast(builder, i, t, types.intp) for t, i in zip(indty, indices) + ] if dtype != valty: raise TypeError("expecting %s but got %s" % (dtype, valty)) if aryty.ndim != len(indty): - raise TypeError("indexing %d-D array with %d-D index" % - (aryty.ndim, len(indty))) + raise TypeError( + "indexing %d-D array with %d-D index" % (aryty.ndim, len(indty)) + ) lary = context.make_array(aryty)(context, builder, ary) ptr = cgutils.get_item_pointer(context, builder, aryty, lary, indices) - if aryty.addrspace == target.SPIR_LOCAL_ADDRSPACE: - return insert_and_call_atomic_fn(context, builder, sig, "sub", dtype, - ptr, val, target.SPIR_LOCAL_ADDRSPACE) + return insert_and_call_atomic_fn( + context, + builder, + sig, + "sub", + dtype, + ptr, + val, + target.SPIR_LOCAL_ADDRSPACE, + ) else: - return insert_and_call_atomic_fn(context, builder, sig, "sub", dtype, - ptr, val, target.SPIR_GLOBAL_ADDRSPACE) + return insert_and_call_atomic_fn( + context, + builder, + sig, + "sub", + dtype, + ptr, + val, + target.SPIR_GLOBAL_ADDRSPACE, + ) else: raise ImportError("Atomic support is not present, can not perform atomic_add") -@lower('dppy.lmem.alloc', types.UniTuple, types.Any) +@lower("dppy.lmem.alloc", types.UniTuple, types.Any) def dppy_lmem_alloc_array(context, builder, sig, args): shape, dtype = args - return _generic_array(context, builder, shape=shape, dtype=dtype, - symbol_name='_dppy_lmem', - addrspace=target.SPIR_LOCAL_ADDRSPACE) + return _generic_array( + context, + builder, + shape=shape, + dtype=dtype, + symbol_name="_dppy_lmem", + addrspace=target.SPIR_LOCAL_ADDRSPACE, + ) def _generic_array(context, builder, shape, dtype, symbol_name, addrspace): @@ -374,10 +409,18 @@ def _generic_array(context, builder, shape, dtype, symbol_name, addrspace): return _make_array(context, builder, gvmem, dtype, shape, addrspace=addrspace) -def _make_array(context, builder, dataptr, dtype, shape, layout='C', addrspace=target.SPIR_GENERIC_ADDRSPACE): +def _make_array( + context, + builder, + dataptr, + dtype, + shape, + layout="C", + addrspace=target.SPIR_GENERIC_ADDRSPACE, +): ndim = len(shape) # Create array object - aryty = types.Array(dtype=dtype, ndim=ndim, layout='C', addrspace=addrspace) + aryty = types.Array(dtype=dtype, ndim=ndim, layout="C", addrspace=addrspace) ary = 
context.make_array(aryty)(context, builder) targetdata = _get_target_data(context) @@ -392,12 +435,14 @@ def _make_array(context, builder, dataptr, dtype, shape, layout='C', addrspace=t kshape = [context.get_constant(types.intp, s) for s in shape] kstrides = [context.get_constant(types.intp, s) for s in strides] - context.populate_array(ary, - data=builder.bitcast(dataptr, ary.data.type), - shape=cgutils.pack_array(builder, kshape), - strides=cgutils.pack_array(builder, kstrides), - itemsize=context.get_constant(types.intp, itemsize), - meminfo=None) + context.populate_array( + ary, + data=builder.bitcast(dataptr, ary.data.type), + shape=cgutils.pack_array(builder, kshape), + strides=cgutils.pack_array(builder, kstrides), + itemsize=context.get_constant(types.intp, itemsize), + meminfo=None, + ) return ary._getvalue() diff --git a/numba_dppy/ocl/stubs.py b/numba_dppy/ocl/stubs.py index 190b685955..bd4e03c21a 100644 --- a/numba_dppy/ocl/stubs.py +++ b/numba_dppy/ocl/stubs.py @@ -7,7 +7,7 @@ _stub_error = NotImplementedError("This is a stub.") # mem fence -CLK_LOCAL_MEM_FENCE = 0x1 +CLK_LOCAL_MEM_FENCE = 0x1 CLK_GLOBAL_MEM_FENCE = 0x2 @@ -85,7 +85,8 @@ class Stub(object): """A stub object to represent special objects which is meaningless outside the context of DPPY compilation context. """ - _description_ = '' + + _description_ = "" __slots__ = () # don't allocate __dict__ def __new__(cls): @@ -94,25 +95,28 @@ def __new__(cls): def __repr__(self): return self._description_ -#------------------------------------------------------------------------------- + +# ------------------------------------------------------------------------------- # local memory + def local_alloc(shape, dtype): shape = _legalize_shape(shape) ndim = len(shape) fname = "dppy.lmem.alloc" - restype = types.Array(dtype, ndim, 'C', addrspace=SPIR_LOCAL_ADDRSPACE) + restype = types.Array(dtype, ndim, "C", addrspace=SPIR_LOCAL_ADDRSPACE) sig = typing.signature(restype, types.UniTuple(types.intp, ndim), types.Any) return ir.Intrinsic(fname, sig, args=(shape, dtype)) class local(Stub): - """local namespace - """ - _description_ = '' + """local namespace""" - static_alloc = Macro('local.static_alloc', local_alloc, callable=True, - argnames=['shape', 'dtype']) + _description_ = "" + + static_alloc = Macro( + "local.static_alloc", local_alloc, callable=True, argnames=["shape", "dtype"] + ) def _legalize_shape(shape): @@ -124,13 +128,14 @@ def _legalize_shape(shape): raise TypeError("invalid type for shape; got {0}".format(type(shape))) -#------------------------------------------------------------------------------- +# ------------------------------------------------------------------------------- # atomic + class atomic(Stub): - """atomic namespace - """ - _description_ = '' + """atomic namespace""" + + _description_ = "" class add(Stub): """add(ary, idx, val) diff --git a/numba_dppy/printimpl.py b/numba_dppy/printimpl.py index e5c9d4f793..983e1f267c 100644 --- a/numba_dppy/printimpl.py +++ b/numba_dppy/printimpl.py @@ -27,8 +27,7 @@ def print_item(ty, context, builder, val): A (format string, [list of arguments]) is returned that will allow forming the final printf()-like call. 
""" - raise NotImplementedError("printing unimplemented for values of type %s" - % (ty,)) + raise NotImplementedError("printing unimplemented for values of type %s" % (ty,)) @print_item.register(types.Integer) @@ -44,11 +43,13 @@ def int_print_impl(ty, context, builder, val): lld = context.cast(builder, val, ty, dsttype) return rawfmt, [lld] + @print_item.register(types.Float) def real_print_impl(ty, context, builder, val): lld = context.cast(builder, val, ty, types.float64) return "%f", [lld] + @print_item.register(types.StringLiteral) def const_print_impl(ty, context, builder, sigval): pyval = ty.literal_value diff --git a/numba_dppy/rename_numpy_functions_pass.py b/numba_dppy/rename_numpy_functions_pass.py index c1d58ce036..b55c122d93 100644 --- a/numba_dppy/rename_numpy_functions_pass.py +++ b/numba_dppy/rename_numpy_functions_pass.py @@ -9,21 +9,23 @@ import numba_dppy from numba.core import types -rewrite_function_name_map = {"sum": (["np"], "sum"), - "eig": (["linalg"], "eig"), - "prod": (["np"], "prod"), - "max": (["np"], "max"), - "amax": (["np"], "amax"), - "min": (["np"], "min"), - "amin": (["np"], "amin"), - "mean": (["np"], "mean"), - "median": (["np"], "median"), - "argmax": (["np"], "argmax"), - "argmin": (["np"], "argmin"), - "argsort": (["np"], "argsort"), - "cov": (["np"], "cov"), - "dot": (["np"], "dot"), - "matmul": (["np"], "matmul")} +rewrite_function_name_map = { + "sum": (["np"], "sum"), + "eig": (["linalg"], "eig"), + "prod": (["np"], "prod"), + "max": (["np"], "max"), + "amax": (["np"], "amax"), + "min": (["np"], "min"), + "amin": (["np"], "amin"), + "mean": (["np"], "mean"), + "median": (["np"], "median"), + "argmax": (["np"], "argmax"), + "argmin": (["np"], "argmin"), + "argsort": (["np"], "argsort"), + "cov": (["np"], "cov"), + "dot": (["np"], "dot"), + "matmul": (["np"], "matmul"), +} class RewriteNumPyOverloadedFunctions(object): @@ -147,6 +149,7 @@ def run_pass(self, state): def get_dpnp_func_typ(func): from numba.core.typing.templates import builtin_registry + for (k, v) in builtin_registry.globals: if k == func: return v @@ -178,9 +181,13 @@ def run(self): lhs = stmt.target.name rhs = stmt.value # replace A.func with np.func, and save A in saved_arr_arg - if (rhs.op == 'getattr' and rhs.attr in self.function_name_map - and isinstance( - self.typemap[rhs.value.name], types.npytypes.Array)): + if ( + rhs.op == "getattr" + and rhs.attr in self.function_name_map + and isinstance( + self.typemap[rhs.value.name], types.npytypes.Array + ) + ): rhs = stmt.value arr = rhs.value saved_arr_arg[lhs] = arr @@ -211,17 +218,18 @@ def run(self): self.typemap.pop(lhs) self.typemap[lhs] = func_typ - if rhs.op == 'call' and rhs.func.name in saved_arr_arg: + if rhs.op == "call" and rhs.func.name in saved_arr_arg: # add array as first arg arr = saved_arr_arg[rhs.func.name] # update call type signature to include array arg old_sig = self.calltypes.pop(rhs) # argsort requires kws for typing so sig.args can't be used # reusing sig.args since some types become Const in sig - argtyps = old_sig.args[:len(rhs.args)] + argtyps = old_sig.args[: len(rhs.args)] kwtyps = {name: self.typemap[v.name] for name, v in rhs.kws} self.calltypes[rhs] = self.typemap[rhs.func.name].get_call_type( - typingctx, [self.typemap[arr.name]] + list(argtyps), kwtyps) + typingctx, [self.typemap[arr.name]] + list(argtyps), kwtyps + ) rhs.args = [arr] + rhs.args new_body.append(stmt) diff --git a/numba_dppy/spirv_generator.py b/numba_dppy/spirv_generator.py index 5bac98e014..3d31596c5a 100644 --- 
a/numba_dppy/spirv_generator.py +++ b/numba_dppy/spirv_generator.py @@ -17,60 +17,69 @@ def _raise_bad_env_path(msg, path, extra=None): error_message += extra raise ValueError(error_message) + _real_check_call = check_call + def check_call(*args, **kwargs): - #print("check_call:", *args, **kwargs) + # print("check_call:", *args, **kwargs) return _real_check_call(*args, **kwargs) -class CmdLine(object): +class CmdLine(object): def disassemble(self, ipath, opath): - check_call([ - "spirv-dis", - # "--no-indent", - # "--no-header", - # "--raw-id", - # "--offsets", - "-o", - opath, - ipath]) + check_call( + [ + "spirv-dis", + # "--no-indent", + # "--no-header", + # "--raw-id", + # "--offsets", + "-o", + opath, + ipath, + ] + ) def validate(self, ipath): - check_call(["spirv-val",ipath]) + check_call(["spirv-val", ipath]) def optimize(self, ipath, opath): - check_call([ - "spirv-opt", - # "--strip-debug", - # "--freeze-spec-const", - # "--eliminate-dead-const", - # "--fold-spec-const-op-composite", - # "--set-spec-const-default-value ': ...'", - # "--unify-const", - # "--inline-entry-points-exhaustive", - # "--flatten-decorations", - # "--compact-ids", - "-o", - opath, - ipath]) + check_call( + [ + "spirv-opt", + # "--strip-debug", + # "--freeze-spec-const", + # "--eliminate-dead-const", + # "--fold-spec-const-op-composite", + # "--set-spec-const-default-value ': ...'", + # "--unify-const", + # "--inline-entry-points-exhaustive", + # "--flatten-decorations", + # "--compact-ids", + "-o", + opath, + ipath, + ] + ) def generate(self, ipath, opath): # DRD : Temporary hack to get SPIR-V code generation to work. # The opt step is needed for: # a) generate a bitcode file from the text IR file # b) hoist all allocas to the enty block of the module - check_call(["opt","-O1","-o",ipath+'.bc',ipath]) - check_call(["llvm-spirv","-o",opath,ipath+'.bc']) + check_call(["opt", "-O1", "-o", ipath + ".bc", ipath]) + check_call(["llvm-spirv", "-o", opath, ipath + ".bc"]) if dppy_config.SAVE_IR_FILES == 0: - os.unlink(ipath + '.bc') + os.unlink(ipath + ".bc") def link(self, opath, binaries): - params = ["spirv-link","--allow-partial-linkage","-o", opath] + params = ["spirv-link", "--allow-partial-linkage", "-o", opath] params.extend(binaries) check_call(params) + class Module(object): def __init__(self, context): """ @@ -93,14 +102,13 @@ def __del__(self): if dppy_config.SAVE_IR_FILES == 0: os.rmdir(self._tmpdir) - def _create_temp_file(self, name, mode='wb'): + def _create_temp_file(self, name, mode="wb"): path = self._track_temp_file(name) fobj = open(path, mode=mode) return fobj, path def _track_temp_file(self, name): - path = os.path.join(self._tmpdir, - "{0}-{1}".format(len(self._tempfiles), name)) + path = os.path.join(self._tmpdir, "{0}-{1}".format(len(self._tempfiles), name)) self._tempfiles.append(path) return path @@ -130,6 +138,7 @@ def finalize(self): del self.context.link_binaries[key] if key == LINK_ATOMIC: from .ocl.atomics import get_atomic_spirv_path + binary_paths.append(get_atomic_spirv_path()) if len(binary_paths) > 1: @@ -152,13 +161,13 @@ def finalize(self): # Disassemble optimized SPIR-V code dis_path = self._track_temp_file("disassembled-spirv") self._cmd.disassemble(ipath=opt_path, opath=dis_path) - with open(dis_path, 'rb') as fin_opt: + with open(dis_path, "rb") as fin_opt: print("ASSEMBLY".center(80, "-")) print(fin_opt.read()) print("".center(80, "=")) # Read and return final SPIR-V (not optimized!) 
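For orientation (illustration, not part of the patch): the CmdLine helper above simply shells out to opt, llvm-spirv and the SPIRV-Tools binaries. Roughly the sequence Module.finalize() drives, written against the method signatures shown in this hunk; the file names are placeholders, the tools must be on PATH, and the link step only happens when extra binaries such as the atomics library are registered:

from numba_dppy.spirv_generator import CmdLine
from numba_dppy.ocl.atomics import get_atomic_spirv_path

cmd = CmdLine()
cmd.generate(ipath="kernel.ll", opath="kernel.spirv")            # opt -O1, then llvm-spirv
cmd.link(                                                        # spirv-link --allow-partial-linkage
    opath="linked.spirv",
    binaries=["kernel.spirv", get_atomic_spirv_path()],          # assumes atomic_ops.spir was built
)
cmd.optimize(ipath="linked.spirv", opath="kernel.opt.spirv")     # spirv-opt
cmd.validate(ipath="kernel.opt.spirv")                           # spirv-val
cmd.disassemble(ipath="kernel.opt.spirv", opath="kernel.spvasm") # spirv-dis (debug dump only)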
- with open(spirv_path, 'rb') as fin: + with open(spirv_path, "rb") as fin: spirv = fin.read() self._finalized = True @@ -168,6 +177,7 @@ def finalize(self): # Public llvm_to_spirv function ############################################### + def llvm_to_spirv(context, bitcode): mod = Module(context) mod.load_llvm(bitcode) diff --git a/numba_dppy/target.py b/numba_dppy/target.py index 147b229e77..87103a5045 100644 --- a/numba_dppy/target.py +++ b/numba_dppy/target.py @@ -39,15 +39,15 @@ def load_additional_registries(self): # ----------------------------------------------------------------------------- # Implementation -VALID_CHARS = re.compile(r'[^a-z0-9]', re.I) +VALID_CHARS = re.compile(r"[^a-z0-9]", re.I) # Address spaces -SPIR_PRIVATE_ADDRSPACE = 0 -SPIR_GLOBAL_ADDRSPACE = 1 +SPIR_PRIVATE_ADDRSPACE = 0 +SPIR_GLOBAL_ADDRSPACE = 1 SPIR_CONSTANT_ADDRSPACE = 2 -SPIR_LOCAL_ADDRSPACE = 3 -SPIR_GENERIC_ADDRSPACE = 4 +SPIR_LOCAL_ADDRSPACE = 3 +SPIR_GENERIC_ADDRSPACE = 4 SPIR_VERSION = (2, 0) @@ -57,9 +57,13 @@ def load_additional_registries(self): class GenericPointerModel(datamodel.PrimitiveModel): def __init__(self, dmm, fe_type): - #print("GenericPointerModel:", dmm, fe_type, fe_type.addrspace) - adrsp = fe_type.addrspace if fe_type.addrspace is not None else SPIR_GENERIC_ADDRSPACE - #adrsp = SPIR_GENERIC_ADDRSPACE + # print("GenericPointerModel:", dmm, fe_type, fe_type.addrspace) + adrsp = ( + fe_type.addrspace + if fe_type.addrspace is not None + else SPIR_GENERIC_ADDRSPACE + ) + # adrsp = SPIR_GENERIC_ADDRSPACE be_type = dmm.lookup(fe_type.dtype).get_data_type().as_pointer(adrsp) super(GenericPointerModel, self).__init__(dmm, fe_type, be_type) @@ -72,18 +76,37 @@ def _init_data_model_manager(): spirv_data_model_manager = _init_data_model_manager() + def _replace_numpy_ufunc_with_opencl_supported_functions(): from numba.np.ufunc_db import _ufunc_db as ufunc_db from numba_dppy.ocl.mathimpl import lower_ocl_impl, sig_mapper - ufuncs = [("fabs", np.fabs), ("exp", np.exp), ("log", np.log), - ("log10", np.log10), ("expm1", np.expm1), ("log1p", np.log1p), - ("sqrt", np.sqrt), ("sin", np.sin), ("cos", np.cos), - ("tan", np.tan), ("asin", np.arcsin), ("acos", np.arccos), - ("atan", np.arctan), ("atan2", np.arctan2), ("sinh", np.sinh), - ("cosh", np.cosh), ("tanh", np.tanh), ("asinh", np.arcsinh), - ("acosh", np.arccosh), ("atanh", np.arctanh), ("ldexp", np.ldexp), - ("floor", np.floor), ("ceil", np.ceil), ("trunc", np.trunc)] + ufuncs = [ + ("fabs", np.fabs), + ("exp", np.exp), + ("log", np.log), + ("log10", np.log10), + ("expm1", np.expm1), + ("log1p", np.log1p), + ("sqrt", np.sqrt), + ("sin", np.sin), + ("cos", np.cos), + ("tan", np.tan), + ("asin", np.arcsin), + ("acos", np.arccos), + ("atan", np.arctan), + ("atan2", np.arctan2), + ("sinh", np.sinh), + ("cosh", np.cosh), + ("tanh", np.tanh), + ("asinh", np.arcsinh), + ("acosh", np.arccosh), + ("atanh", np.arctanh), + ("ldexp", np.ldexp), + ("floor", np.floor), + ("ceil", np.ceil), + ("trunc", np.trunc), + ] for name, ufunc in ufuncs: for sig in ufunc_db[ufunc].keys(): @@ -97,16 +120,19 @@ class DPPYTargetContext(BaseContext): def init(self): self._internal_codegen = codegen.JITSPIRVCodegen("numba_dppy.jit") - self._target_data = (ll.create_target_data(codegen - .SPIR_DATA_LAYOUT[utils.MACHINE_BITS])) + self._target_data = ll.create_target_data( + codegen.SPIR_DATA_LAYOUT[utils.MACHINE_BITS] + ) # Override data model manager to SPIR model self.data_model_manager = spirv_data_model_manager self.link_binaries = dict() from numba.np.ufunc_db import 
_lazy_init_db import copy + _lazy_init_db() from numba.np.ufunc_db import _ufunc_db as ufunc_db + self.ufunc_db = copy.deepcopy(ufunc_db) from numba.core.cpu import CPUContext @@ -114,26 +140,41 @@ def init(self): self.cpu_context = cpu_target.target_context - - def replace_numpy_ufunc_with_opencl_supported_functions(self): from numba_dppy.ocl.mathimpl import lower_ocl_impl, sig_mapper - ufuncs = [("fabs", np.fabs), ("exp", np.exp), ("log", np.log), - ("log10", np.log10), ("expm1", np.expm1), ("log1p", np.log1p), - ("sqrt", np.sqrt), ("sin", np.sin), ("cos", np.cos), - ("tan", np.tan), ("asin", np.arcsin), ("acos", np.arccos), - ("atan", np.arctan), ("atan2", np.arctan2), ("sinh", np.sinh), - ("cosh", np.cosh), ("tanh", np.tanh), ("asinh", np.arcsinh), - ("acosh", np.arccosh), ("atanh", np.arctanh), ("ldexp", np.ldexp), - ("floor", np.floor), ("ceil", np.ceil), ("trunc", np.trunc)] + ufuncs = [ + ("fabs", np.fabs), + ("exp", np.exp), + ("log", np.log), + ("log10", np.log10), + ("expm1", np.expm1), + ("log1p", np.log1p), + ("sqrt", np.sqrt), + ("sin", np.sin), + ("cos", np.cos), + ("tan", np.tan), + ("asin", np.arcsin), + ("acos", np.arccos), + ("atan", np.arctan), + ("atan2", np.arctan2), + ("sinh", np.sinh), + ("cosh", np.cosh), + ("tanh", np.tanh), + ("asinh", np.arcsinh), + ("acosh", np.arccosh), + ("atanh", np.arctanh), + ("ldexp", np.ldexp), + ("floor", np.floor), + ("ceil", np.ceil), + ("trunc", np.trunc), + ] for name, ufunc in ufuncs: for sig in self.ufunc_db[ufunc].keys(): if sig in sig_mapper and (name, sig_mapper[sig]) in lower_ocl_impl: self.ufunc_db[ufunc][sig] = lower_ocl_impl[(name, sig_mapper[sig])] - def load_additional_registries(self): from .ocl import oclimpl, mathimpl from numba.np import npyimpl @@ -150,7 +191,6 @@ def load_additional_registries(self): """ self.replace_numpy_ufunc_with_opencl_supported_functions() - @cached_property def call_conv(self): return DPPYCallConv(self) @@ -167,13 +207,13 @@ def repl(m): ch = m.group(0) return "_%X_" % ord(ch) - qualified = name + '.' + '.'.join(str(a) for a in argtypes) + qualified = name + "." + ".".join(str(a) for a in argtypes) mangled = VALID_CHARS.sub(repl, qualified) - return 'dppy_py_devfn_' + mangled + return "dppy_py_devfn_" + mangled def prepare_ocl_kernel(self, func, argtypes): module = func.module - func.linkage = 'linkonce_odr' + func.linkage = "linkonce_odr" module.data_layout = codegen.SPIR_DATA_LAYOUT[self.address_size] wrapper = self.generate_kernel_wrapper(func, argtypes) @@ -184,7 +224,7 @@ def mark_ocl_device(self, func): # Adapt to SPIR # module = func.module func.calling_convention = CC_SPIR_FUNC - func.linkage = 'linkonce_odr' + func.linkage = "linkonce_odr" return func def generate_kernel_wrapper(self, func, argtypes): @@ -196,32 +236,33 @@ def sub_gen_with_global(lty): if lty.addrspace == SPIR_LOCAL_ADDRSPACE: return lty, None # DRD : Cast all pointer types to global address space. 
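For clarity (illustration, not part of the patch): DPPYTargetContext.mangler, reformatted in the hunk above, builds device-function symbol names by joining the function name and argument types with "." and hex-escaping every character outside [a-z0-9]. A tiny standalone re-implementation, only to show the name shape it produces:

import re

VALID_CHARS = re.compile(r"[^a-z0-9]", re.I)   # same pattern as in target.py

def mangle_devfn(name, argtypes):
    # Mirrors the mangler: qualify with argument types, escape punctuation
    # as _<HEX>_, and prefix with dppy_py_devfn_.
    qualified = name + "." + ".".join(str(a) for a in argtypes)
    mangled = VALID_CHARS.sub(lambda m: "_%X_" % ord(m.group(0)), qualified)
    return "dppy_py_devfn_" + mangled

print(mangle_devfn("cnd", ["float64"]))        # -> dppy_py_devfn_cnd_2E_float64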
- if lty.addrspace != SPIR_GLOBAL_ADDRSPACE: # jcaraban - return (lty.pointee.as_pointer(SPIR_GLOBAL_ADDRSPACE), - lty.addrspace) + if lty.addrspace != SPIR_GLOBAL_ADDRSPACE: # jcaraban + return ( + lty.pointee.as_pointer(SPIR_GLOBAL_ADDRSPACE), + lty.addrspace, + ) return lty, None if len(arginfo.argument_types) > 0: - llargtys, changed = zip(*map(sub_gen_with_global, - arginfo.argument_types)) + llargtys, changed = zip(*map(sub_gen_with_global, arginfo.argument_types)) else: llargtys = changed = () wrapperfnty = lc.Type.function(lc.Type.void(), llargtys) wrapper_module = self.create_module("dppy.kernel.wrapper") - wrappername = 'dppyPy_{name}'.format(name=func.name) + wrappername = "dppyPy_{name}".format(name=func.name) argtys = list(arginfo.argument_types) - fnty = lc.Type.function(lc.Type.int(), - [self.call_conv.get_return_type( - types.pyobject)] + argtys) + fnty = lc.Type.function( + lc.Type.int(), [self.call_conv.get_return_type(types.pyobject)] + argtys + ) func = wrapper_module.add_function(fnty, name=func.name) func.calling_convention = CC_SPIR_FUNC wrapper = wrapper_module.add_function(wrapperfnty, name=wrappername) - builder = lc.Builder(wrapper.append_basic_block('')) + builder = lc.Builder(wrapper.append_basic_block("")) # Adjust address space of each kernel argument fixed_args = [] @@ -235,20 +276,21 @@ def sub_gen_with_global(lty): callargs = arginfo.from_arguments(builder, fixed_args) # XXX handle error status - status, _ = self.call_conv.call_function(builder, func, types.void, - argtypes, callargs) + status, _ = self.call_conv.call_function( + builder, func, types.void, argtypes, callargs + ) builder.ret_void() set_dppy_kernel(wrapper) - #print(str(wrapper_module)) + # print(str(wrapper_module)) # Link module.link_in(ll.parse_assembly(str(wrapper_module))) # To enable inlining which is essential because addrspacecast 1->0 is # illegal. Inlining will optimize the addrspacecast out. - func.linkage = 'internal' + func.linkage = "internal" wrapper = module.get_function(wrapper.name) - module.get_function(func.name).linkage = 'internal' + module.get_function(func.name).linkage = "internal" return wrapper def declare_function(self, module, fndesc): @@ -256,12 +298,12 @@ def declare_function(self, module, fndesc): fn = module.get_or_insert_function(fnty, name=fndesc.mangled_name) if not self.enable_debuginfo: - fn.attributes.add('alwaysinline') + fn.attributes.add("alwaysinline") ret = super(DPPYTargetContext, self).declare_function(module, fndesc) # XXX: Refactor fndesc instead of this special case - if fndesc.llvm_func_name.startswith('dppy_py_devfn'): + if fndesc.llvm_func_name.startswith("dppy_py_devfn"): ret.calling_convention = CC_SPIR_FUNC return ret @@ -274,32 +316,29 @@ def make_constant_array(self, builder, typ, ary): # return a._getvalue() raise NotImplementedError - def insert_const_string(self, mod, string): """ This returns a a pointer in the spir generic addrspace. 
""" text = lc.Constant.stringz(string) - name = '$'.join(["__conststring__", - self.mangler(string, ["str"])]) + name = "$".join(["__conststring__", self.mangler(string, ["str"])]) # Try to reuse existing global try: gv = mod.get_global(name) except KeyError as e: # Not defined yet - gv = mod.add_global_variable(text.type, name=name, - addrspace=SPIR_GENERIC_ADDRSPACE) - gv.linkage = 'internal' + gv = mod.add_global_variable( + text.type, name=name, addrspace=SPIR_GENERIC_ADDRSPACE + ) + gv.linkage = "internal" gv.global_constant = True gv.initializer = text # Cast to a i8* pointer charty = gv.type.pointee.element - return lc.Constant.bitcast(gv, - charty.as_pointer(SPIR_GENERIC_ADDRSPACE)) - + return lc.Constant.bitcast(gv, charty.as_pointer(SPIR_GENERIC_ADDRSPACE)) def addrspacecast(self, builder, src, addrspace): """ @@ -325,12 +364,19 @@ def set_dppy_kernel(fn): # Mark kernels ocl_kernels = mod.get_or_insert_named_metadata("opencl.kernels") - ocl_kernels.add(lc.MetaData.get(mod, [fn, - gen_arg_addrspace_md(fn), - gen_arg_access_qual_md(fn), - gen_arg_type(fn), - gen_arg_type_qual(fn), - gen_arg_base_type(fn)])) + ocl_kernels.add( + lc.MetaData.get( + mod, + [ + fn, + gen_arg_addrspace_md(fn), + gen_arg_access_qual_md(fn), + gen_arg_type(fn), + gen_arg_type_qual(fn), + gen_arg_base_type(fn), + ], + ) + ) # SPIR version 2.0 make_constant = lambda x: lc.Constant.int(lc.Type.int(), x) @@ -346,14 +392,16 @@ def set_dppy_kernel(fn): # Other metadata empty_md = lc.MetaData.get(mod, ()) - others = ["opencl.used.extensions", - "opencl.used.optional.core.features", - "opencl.compiler.options"] + others = [ + "opencl.used.extensions", + "opencl.used.optional.core.features", + "opencl.compiler.options", + ] for name in others: nmd = mod.get_or_insert_named_metadata(name) if not nmd.operands: - nmd.add(empty_md) + nmd.add(empty_md) def gen_arg_addrspace_md(fn): diff --git a/numba_dppy/target_dispatcher.py b/numba_dppy/target_dispatcher.py index dde38eb75b..4a9dc16fc5 100644 --- a/numba_dppy/target_dispatcher.py +++ b/numba_dppy/target_dispatcher.py @@ -6,11 +6,11 @@ class TargetDispatcher(serialize.ReduceMixin, metaclass=dispatcher.DispatcherMeta): - __numba__ = 'py_func' + __numba__ = "py_func" - target_offload_gpu = '__dppy_offload_gpu__' - target_offload_cpu = '__dppy_offload_cpu__' - target_dppy = 'dppy' + target_offload_gpu = "__dppy_offload_gpu__" + target_offload_cpu = "__dppy_offload_cpu__" + target_dppy = "dppy" def __init__(self, py_func, wrapper, target, parallel_options, compiled=None): @@ -58,32 +58,46 @@ def __is_with_context_target(self, target): def get_current_disp(self): target = self.__target parallel = self.__parallel - offload = isinstance(parallel, dict) and parallel.get('offload') is True + offload = isinstance(parallel, dict) and parallel.get("offload") is True - if (dpctl.is_in_device_context() or offload): + if dpctl.is_in_device_context() or offload: if not self.__is_with_context_target(target): - raise UnsupportedError(f"Can't use 'with' context with explicitly specified target '{target}'") - if parallel is False or (isinstance(parallel, dict) and parallel.get('offload') is False): - raise UnsupportedError(f"Can't use 'with' context with parallel option '{parallel}'") + raise UnsupportedError( + f"Can't use 'with' context with explicitly specified target '{target}'" + ) + if parallel is False or ( + isinstance(parallel, dict) and parallel.get("offload") is False + ): + raise UnsupportedError( + f"Can't use 'with' context with parallel option '{parallel}'" + ) from 
numba_dppy import dppy_offload_dispatcher if target is None: if dpctl.get_current_device_type() == dpctl.device_type.gpu: - return registry.dispatcher_registry[TargetDispatcher.target_offload_gpu] + return registry.dispatcher_registry[ + TargetDispatcher.target_offload_gpu + ] elif dpctl.get_current_device_type() == dpctl.device_type.cpu: - return registry.dispatcher_registry[TargetDispatcher.target_offload_cpu] + return registry.dispatcher_registry[ + TargetDispatcher.target_offload_cpu + ] else: if dpctl.is_in_device_context(): - raise UnsupportedError('Unknown dppy device type') + raise UnsupportedError("Unknown dppy device type") if offload: if dpctl.has_gpu_queues(): - return registry.dispatcher_registry[TargetDispatcher.target_offload_gpu] + return registry.dispatcher_registry[ + TargetDispatcher.target_offload_gpu + ] elif dpctl.has_cpu_queues(): - return registry.dispatcher_registry[TargetDispatcher.target_offload_cpu] + return registry.dispatcher_registry[ + TargetDispatcher.target_offload_cpu + ] if target is None: - target = 'cpu' + target = "cpu" return registry.dispatcher_registry[target] @@ -93,5 +107,5 @@ def _reduce_states(self): wrapper=self.__wrapper, target=self.__target, parallel=self.__parallel, - compiled=self.__compiled + compiled=self.__compiled, ) diff --git a/numba_dppy/testing.py b/numba_dppy/testing.py index e6ff1e3ab3..7f6e539b38 100644 --- a/numba_dppy/testing.py +++ b/numba_dppy/testing.py @@ -18,6 +18,7 @@ def captured_dppy_stdout(): sys.stdout.flush() import numba_dppy, numba_dppy as dppy + with redirect_c_stdout() as stream: yield DPPYTextCapture(stream) @@ -38,6 +39,7 @@ def expectedFailureIf(condition): def ensure_dpnp(): try: from numba_dppy.dpnp_glue import dpnp_fptr_interface as dpnp_glue + return True except: return False diff --git a/numba_dppy/tests/__init__.py b/numba_dppy/tests/__init__.py index 939c95c567..09ff707488 100644 --- a/numba_dppy/tests/__init__.py +++ b/numba_dppy/tests/__init__.py @@ -7,6 +7,7 @@ # from numba_dppy.tests.dppy import * + def load_tests(loader, tests, pattern): suite = SerialSuite() diff --git a/numba_dppy/tests/skip_tests.py b/numba_dppy/tests/skip_tests.py index fa18d36181..6765b95cfe 100644 --- a/numba_dppy/tests/skip_tests.py +++ b/numba_dppy/tests/skip_tests.py @@ -1,5 +1,6 @@ import dpctl + def is_gen12(device_type): with dpctl.device_context(device_type): q = dpctl.get_current_queue() diff --git a/numba_dppy/tests/test_arg_accessor.py b/numba_dppy/tests/test_arg_accessor.py index 494f269c59..3a2f3d7f05 100644 --- a/numba_dppy/tests/test_arg_accessor.py +++ b/numba_dppy/tests/test_arg_accessor.py @@ -5,19 +5,22 @@ import dpctl -@dppy.kernel(access_types={"read_only": ['a', 'b'], "write_only": ['c'], "read_write": []}) +@dppy.kernel( + access_types={"read_only": ["a", "b"], "write_only": ["c"], "read_write": []} +) def sum_with_accessor(a, b, c): i = dppy.get_global_id(0) c[i] = a[i] + b[i] + @dppy.kernel def sum_without_accessor(a, b, c): i = dppy.get_global_id(0) c[i] = a[i] + b[i] -def call_kernel(global_size, local_size, - A, B, C, func): - func[global_size, dppy.DEFAULT_LOCAL_SIZE](A, B, C) + +def call_kernel(global_size, local_size, A, B, C, func): + func[global_size, dppy.DEFAULT_LOCAL_SIZE](A, B, C) global_size = 10 @@ -29,39 +32,35 @@ def call_kernel(global_size, local_size, D = A + B -@unittest.skipUnless(dpctl.has_cpu_queues(), 'test only on CPU system') +@unittest.skipUnless(dpctl.has_cpu_queues(), "test only on CPU system") class TestDPPYArgAccessorCPU(unittest.TestCase): def 
test_arg_with_accessor(self): C = np.ones_like(A) with dpctl.device_context("opencl:cpu") as cpu_queue: - call_kernel(global_size, local_size, - A, B, C, sum_with_accessor) + call_kernel(global_size, local_size, A, B, C, sum_with_accessor) self.assertTrue(np.all(D == C)) def test_arg_without_accessor(self): C = np.ones_like(A) with dpctl.device_context("opencl:cpu") as cpu_queue: - call_kernel(global_size, local_size, - A, B, C, sum_without_accessor) + call_kernel(global_size, local_size, A, B, C, sum_without_accessor) self.assertTrue(np.all(D == C)) -@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestDPPYArgAccessorGPU(unittest.TestCase): def test_arg_with_accessor(self): C = np.ones_like(A) with dpctl.device_context("opencl:gpu") as gpu_queue: - call_kernel(global_size, local_size, - A, B, C, sum_with_accessor) + call_kernel(global_size, local_size, A, B, C, sum_with_accessor) self.assertTrue(np.all(D == C)) def test_arg_without_accessor(self): C = np.ones_like(A) with dpctl.device_context("opencl:gpu") as gpu_queue: - call_kernel(global_size, local_size, - A, B, C, sum_without_accessor) + call_kernel(global_size, local_size, A, B, C, sum_without_accessor) self.assertTrue(np.all(D == C)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_arg_types.py b/numba_dppy/tests/test_arg_types.py index ed55e12e16..7e21748a2c 100644 --- a/numba_dppy/tests/test_arg_types.py +++ b/numba_dppy/tests/test_arg_types.py @@ -10,6 +10,7 @@ def mul_kernel(A, B, test): i = dppy.get_global_id(0) B[i] = A[i] * test + def call_mul_device_kernel(global_size, A, B, test): mul_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, B, test) @@ -20,7 +21,7 @@ def call_mul_device_kernel(global_size, A, B, test): B = np.array(np.random.random(N), dtype=np.float32) -@unittest.skipUnless(dpctl.has_cpu_queues(), 'test only on CPU system') +@unittest.skipUnless(dpctl.has_cpu_queues(), "test only on CPU system") class TestDPPYArrayArgCPU(unittest.TestCase): def test_integer_arg(self): x = np.int32(2) @@ -46,7 +47,7 @@ def check_bool_kernel(A, test): else: A[0] = 222 - A = np.array([0], dtype='float64') + A = np.array([0], dtype="float64") with dpctl.device_context("opencl:cpu") as cpu_queue: check_bool_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, True) @@ -55,7 +56,7 @@ def check_bool_kernel(A, test): self.assertTrue(A[0] == 222) -@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestDPPYArrayArgGPU(unittest.TestCase): def test_integer_arg(self): x = np.int32(2) @@ -81,7 +82,7 @@ def check_bool_kernel(A, test): else: A[0] = 222 - A = np.array([0], dtype='float64') + A = np.array([0], dtype="float64") with dpctl.device_context("opencl:gpu") as gpu_queue: check_bool_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, True) @@ -89,5 +90,6 @@ def check_bool_kernel(A, test): check_bool_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, False) self.assertTrue(A[0] == 222) -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_atomic_op.py b/numba_dppy/tests/test_atomic_op.py index 27a810ba08..ccfcae978c 100644 --- a/numba_dppy/tests/test_atomic_op.py +++ b/numba_dppy/tests/test_atomic_op.py @@ -5,6 +5,7 @@ import unittest import dpctl + def atomic_add_int32(ary): tid = dppy.get_local_id(0) lm = dppy.local.static_alloc(32, 
numba.uint32) @@ -111,17 +112,19 @@ def call_fn_for_datatypes(fn, result, input, global_size): with dpctl.device_context("opencl:gpu") as gpu_queue: # TODO: dpctl needs to expose this functions - #if dtype == np.double and not device_env.device_support_float64_atomics(): + # if dtype == np.double and not device_env.device_support_float64_atomics(): # continue - #if dtype == np.int64 and not device_env.device_support_int64_atomics(): + # if dtype == np.int64 and not device_env.device_support_int64_atomics(): # continue fn[global_size, dppy.DEFAULT_LOCAL_SIZE](a) - assert(a[0] == result) + assert a[0] == result -@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -@unittest.skipUnless(numba_dppy.ocl.atomic_support_present(), 'test only when atomic support is present') +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") +@unittest.skipUnless( + numba_dppy.ocl.atomic_support_present(), "test only when atomic support is present" +) class TestAtomicOp(unittest.TestCase): def test_atomic_add_global(self): @dppy.kernel @@ -133,7 +136,6 @@ def atomic_add(B): call_fn_for_datatypes(atomic_add, N, B, N) - def test_atomic_sub_global(self): @dppy.kernel def atomic_sub(B): @@ -144,12 +146,11 @@ def atomic_sub(B): call_fn_for_datatypes(atomic_sub, 0, B, N) - def test_atomic_add_local_int32(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32) orig = ary.copy() - #dppy_atomic_add = dppy.kernel('void(uint32[:])')(atomic_add_int32) + # dppy_atomic_add = dppy.kernel('void(uint32[:])')(atomic_add_int32) dppy_atomic_add = dppy.kernel(atomic_add_int32) with dpctl.device_context("opencl:gpu") as gpu_queue: dppy_atomic_add[32, dppy.DEFAULT_LOCAL_SIZE](ary) @@ -160,12 +161,11 @@ def test_atomic_add_local_int32(self): self.assertTrue(np.all(ary == gold)) - def test_atomic_sub_local_int32(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32) orig = ary.copy() - #dppy_atomic_sub = dppy.kernel('void(uint32[:])')(atomic_sub_int32) + # dppy_atomic_sub = dppy.kernel('void(uint32[:])')(atomic_sub_int32) dppy_atomic_sub = dppy.kernel(atomic_sub_int32) with dpctl.device_context("opencl:gpu") as gpu_queue: dppy_atomic_sub[32, dppy.DEFAULT_LOCAL_SIZE](ary) @@ -176,22 +176,20 @@ def test_atomic_sub_local_int32(self): self.assertTrue(np.all(ary == gold)) - def test_atomic_add_local_float32(self): ary = np.array([0], dtype=np.float32) - #dppy_atomic_add = dppy.kernel('void(float32[:])')(atomic_add_float32) + # dppy_atomic_add = dppy.kernel('void(float32[:])')(atomic_add_float32) dppy_atomic_add = dppy.kernel(atomic_add_float32) with dpctl.device_context("opencl:gpu") as gpu_queue: dppy_atomic_add[32, dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(ary[0] == 32) - def test_atomic_sub_local_float32(self): ary = np.array([32], dtype=np.float32) - #dppy_atomic_sub = dppy.kernel('void(float32[:])')(atomic_sub_float32) + # dppy_atomic_sub = dppy.kernel('void(float32[:])')(atomic_sub_float32) dppy_atomic_sub = dppy.kernel(atomic_sub_float32) with dpctl.device_context("opencl:gpu") as gpu_queue: @@ -199,77 +197,71 @@ def test_atomic_sub_local_float32(self): self.assertTrue(ary[0] == 0) - def test_atomic_add_local_int64(self): ary = np.array([0], dtype=np.int64) - #dppy_atomic_add = dppy.kernel('void(int64[:])')(atomic_add_int64) + # dppy_atomic_add = dppy.kernel('void(int64[:])')(atomic_add_int64) dppy_atomic_add = dppy.kernel(atomic_add_int64) with dpctl.device_context("opencl:gpu") as gpu_queue: # TODO: dpctl needs to expose this functions - #if 
device_env.device_support_int64_atomics(): + # if device_env.device_support_int64_atomics(): dppy_atomic_add[32, dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(ary[0] == 32) - #else: + # else: # return - def test_atomic_sub_local_int64(self): ary = np.array([32], dtype=np.int64) - #fn = dppy.kernel('void(int64[:])')(atomic_sub_int64) + # fn = dppy.kernel('void(int64[:])')(atomic_sub_int64) fn = dppy.kernel(atomic_sub_int64) with dpctl.device_context("opencl:gpu") as gpu_queue: # TODO: dpctl needs to expose this functions - #if device_env.device_support_int64_atomics(): + # if device_env.device_support_int64_atomics(): fn[32, dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(ary[0] == 0) - #else: + # else: # return - def test_atomic_add_local_float64(self): ary = np.array([0], dtype=np.double) - #fn = dppy.kernel('void(float64[:])')(atomic_add_float64) + # fn = dppy.kernel('void(float64[:])')(atomic_add_float64) fn = dppy.kernel(atomic_add_float64) with dpctl.device_context("opencl:gpu") as gpu_queue: # TODO: dpctl needs to expose this functions - #if device_env.device_support_float64_atomics(): + # if device_env.device_support_float64_atomics(): fn[32, dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(ary[0] == 32) - #else: + # else: # return - def test_atomic_sub_local_float64(self): ary = np.array([32], dtype=np.double) - #fn = dppy.kernel('void(float64[:])')(atomic_sub_int64) + # fn = dppy.kernel('void(float64[:])')(atomic_sub_int64) fn = dppy.kernel(atomic_sub_int64) with dpctl.device_context("opencl:gpu") as gpu_queue: # TODO: dpctl needs to expose this functions - #if device_env.device_support_float64_atomics(): + # if device_env.device_support_float64_atomics(): fn[32, dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(ary[0] == 0) - #else: + # else: # return - def test_atomic_add2(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() - #dppy_atomic_add2 = dppy.kernel('void(uint32[:,:])')(atomic_add2) + # dppy_atomic_add2 = dppy.kernel('void(uint32[:,:])')(atomic_add2) dppy_atomic_add2 = dppy.kernel(atomic_add2) with dpctl.device_context("opencl:gpu") as gpu_queue: dppy_atomic_add2[(4, 8), dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(np.all(ary == orig + 1)) - def test_atomic_add3(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() - #dppy_atomic_add3 = dppy.kernel('void(uint32[:,:])')(atomic_add3) + # dppy_atomic_add3 = dppy.kernel('void(uint32[:,:])')(atomic_add3) dppy_atomic_add3 = dppy.kernel(atomic_add3) with dpctl.device_context("opencl:gpu") as gpu_queue: dppy_atomic_add3[(4, 8), dppy.DEFAULT_LOCAL_SIZE](ary) @@ -277,5 +269,5 @@ def test_atomic_add3(self): self.assertTrue(np.all(ary == orig + 1)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_barrier.py b/numba_dppy/tests/test_barrier.py index 7cedc18f13..6f30e06f2b 100644 --- a/numba_dppy/tests/test_barrier.py +++ b/numba_dppy/tests/test_barrier.py @@ -6,10 +6,10 @@ import dpctl -@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestBarrier(unittest.TestCase): def test_proper_lowering(self): - #@dppy.kernel("void(float32[::1])") + # @dppy.kernel("void(float32[::1])") @dppy.kernel def twice(A): i = dppy.get_global_id(0) @@ -22,13 +22,13 @@ def twice(A): orig = arr.copy() with dpctl.device_context("opencl:gpu") as gpu_queue: - twice[N, N//2](arr) + twice[N, N // 2](arr) # The computation is 
correct? np.testing.assert_allclose(orig * 2, arr) def test_no_arg_barrier_support(self): - #@dppy.kernel("void(float32[::1])") + # @dppy.kernel("void(float32[::1])") @dppy.kernel def twice(A): i = dppy.get_global_id(0) @@ -47,11 +47,10 @@ def twice(A): # The computation is correct? np.testing.assert_allclose(orig * 2, arr) - def test_local_memory(self): blocksize = 10 - #@dppy.kernel("void(float32[::1])") + # @dppy.kernel("void(float32[::1])") @dppy.kernel def reverse_array(A): lm = dppy.local.static_alloc(shape=10, dtype=float32) @@ -74,5 +73,5 @@ def reverse_array(A): np.testing.assert_allclose(expected, arr) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_black_scholes.py b/numba_dppy/tests/test_black_scholes.py index 7baecbeda5..fca2fe69f3 100644 --- a/numba_dppy/tests/test_black_scholes.py +++ b/numba_dppy/tests/test_black_scholes.py @@ -20,12 +20,17 @@ def cnd(d): K = 1.0 / (1.0 + 0.2316419 * np.abs(d)) - ret_val = (RSQRT2PI * np.exp(-0.5 * d * d) * - (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))))) + ret_val = ( + RSQRT2PI + * np.exp(-0.5 * d * d) + * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))) + ) return np.where(d > 0, 1.0 - ret_val, ret_val) -def black_scholes(callResult, putResult, stockPrice, optionStrike, optionYears, - Riskfree, Volatility): + +def black_scholes( + callResult, putResult, stockPrice, optionStrike, optionYears, Riskfree, Volatility +): S = stockPrice X = optionStrike T = optionYears @@ -37,15 +42,16 @@ def black_scholes(callResult, putResult, stockPrice, optionStrike, optionYears, cndd1 = cnd(d1) cndd2 = cnd(d2) - expRT = np.exp(- R * T) - callResult[:] = (S * cndd1 - X * expRT * cndd2) - putResult[:] = (X * expRT * (1.0 - cndd2) - S * (1.0 - cndd1)) + expRT = np.exp(-R * T) + callResult[:] = S * cndd1 - X * expRT * cndd2 + putResult[:] = X * expRT * (1.0 - cndd2) - S * (1.0 - cndd1) + def randfloat(rand_var, low, high): return (1.0 - rand_var) * low + rand_var * high -@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestDPPYBlackScholes(unittest.TestCase): def test_black_scholes(self): OPT_N = 400 @@ -63,9 +69,15 @@ def test_black_scholes(self): # numpy for i in range(iterations): - black_scholes(callResultNumpy, putResultNumpy, stockPrice, - optionStrike, optionYears, RISKFREE, VOLATILITY) - + black_scholes( + callResultNumpy, + putResultNumpy, + stockPrice, + optionStrike, + optionYears, + RISKFREE, + VOLATILITY, + ) @dppy.kernel def black_scholes_dppy(callResult, putResult, S, X, T, R, V): @@ -77,20 +89,26 @@ def black_scholes_dppy(callResult, putResult, S, X, T, R, V): d2 = d1 - V * sqrtT K = 1.0 / (1.0 + 0.2316419 * math.fabs(d1)) - cndd1 = (RSQRT2PI * math.exp(-0.5 * d1 * d1) * - (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))))) + cndd1 = ( + RSQRT2PI + * math.exp(-0.5 * d1 * d1) + * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))) + ) if d1 > 0: cndd1 = 1.0 - cndd1 K = 1.0 / (1.0 + 0.2316419 * math.fabs(d2)) - cndd2 = (RSQRT2PI * math.exp(-0.5 * d2 * d2) * - (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))))) + cndd2 = ( + RSQRT2PI + * math.exp(-0.5 * d2 * d2) + * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))) + ) if d2 > 0: cndd2 = 1.0 - cndd2 - expRT = math.exp((-1. 
* R) * T[i]) - callResult[i] = (S[i] * cndd1 - X[i] * expRT * cndd2) - putResult[i] = (X[i] * expRT * (1.0 - cndd2) - S[i] * (1.0 - cndd1)) + expRT = math.exp((-1.0 * R) * T[i]) + callResult[i] = S[i] * cndd1 - X[i] * expRT * cndd2 + putResult[i] = X[i] * expRT * (1.0 - cndd2) - S[i] * (1.0 - cndd1) # numbapro time0 = time.time() @@ -101,11 +119,16 @@ def black_scholes_dppy(callResult, putResult, S, X, T, R, V): time1 = time.time() for i in range(iterations): black_scholes_dppy[blockdim, griddim]( - callResultNumbapro, putResultNumbapro, stockPrice, optionStrike, - optionYears, RISKFREE, VOLATILITY) + callResultNumbapro, + putResultNumbapro, + stockPrice, + optionStrike, + optionYears, + RISKFREE, + VOLATILITY, + ) - - dt = (time1 - time0) + dt = time1 - time0 delta = np.abs(callResultNumpy - callResultNumbapro) L1norm = delta.sum() / np.abs(callResultNumpy).sum() @@ -114,5 +137,6 @@ def black_scholes_dppy(callResult, putResult, S, X, T, R, V): self.assertTrue(L1norm < 1e-13) self.assertTrue(max_abs_err < 1e-13) -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_caching.py b/numba_dppy/tests/test_caching.py index 268401ce98..561c1dda87 100644 --- a/numba_dppy/tests/test_caching.py +++ b/numba_dppy/tests/test_caching.py @@ -19,15 +19,18 @@ def test_caching_kernel(self): b = np.array(np.random.random(N), dtype=np.float32) c = np.ones_like(a) - with dpctl.device_context("opencl:gpu") as gpu_queue: func = dppy.kernel(data_parallel_sum) - caching_kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize(a, b, c) + caching_kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize( + a, b, c + ) for i in range(10): - cached_kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize(a, b, c) + cached_kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize( + a, b, c + ) self.assertIs(caching_kernel, cached_kernel) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_controllable_fallback.py b/numba_dppy/tests/test_controllable_fallback.py index 357f0b5e20..6a722d2d72 100644 --- a/numba_dppy/tests/test_controllable_fallback.py +++ b/numba_dppy/tests/test_controllable_fallback.py @@ -7,7 +7,7 @@ import dpctl -@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestDPPYFallback(unittest.TestCase): def test_dppy_fallback_true(self): @numba.jit @@ -33,7 +33,9 @@ def inner_call_fallback(): numba_dppy.compiler.DEBUG = 0 np.testing.assert_array_equal(dppy_fallback_true, ref_result) - self.assertTrue('Failed to lower parfor on DPPY-device' in msg_fallback_true.getvalue()) + self.assertTrue( + "Failed to lower parfor on DPPY-device" in msg_fallback_true.getvalue() + ) @unittest.expectedFailure def test_dppy_fallback_false(self): @@ -52,7 +54,7 @@ def inner_call_fallback(): try: numba_dppy.compiler.DEBUG = 1 - numba_dppy.config.FALLBACK_ON_CPU = 0 + numba_dppy.config.FALLBACK_ON_CPU = 0 with captured_stderr() as msg_fallback_true: with dpctl.device_context("opencl:gpu") as gpu_queue: dppy = numba.njit(parallel=True)(inner_call_fallback) @@ -60,12 +62,14 @@ def inner_call_fallback(): finally: ref_result = inner_call_fallback() - numba_dppy.config.FALLBACK_ON_CPU = 1 + numba_dppy.config.FALLBACK_ON_CPU = 1 numba_dppy.compiler.DEBUG = 0 not np.testing.assert_array_equal(dppy_fallback_false, ref_result) - not self.assertTrue('Failed to lower parfor on DPPY-device' in 
msg_fallback_true.getvalue()) + not self.assertTrue( + "Failed to lower parfor on DPPY-device" in msg_fallback_true.getvalue() + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_device_array_args.py b/numba_dppy/tests/test_device_array_args.py index eb47cd28bc..06a49dd5ed 100644 --- a/numba_dppy/tests/test_device_array_args.py +++ b/numba_dppy/tests/test_device_array_args.py @@ -5,6 +5,7 @@ import dpctl import unittest + @dppy.kernel def data_parallel_sum(a, b, c): i = dppy.get_global_id(0) @@ -19,7 +20,7 @@ def data_parallel_sum(a, b, c): d = a + b -@unittest.skipUnless(dpctl.has_cpu_queues(), 'test only on CPU system') +@unittest.skipUnless(dpctl.has_cpu_queues(), "test only on CPU system") class TestDPPYDeviceArrayArgsGPU(unittest.TestCase): def test_device_array_args_cpu(self): c = np.ones_like(a) @@ -30,7 +31,7 @@ def test_device_array_args_cpu(self): self.assertTrue(np.all(c == d)) -@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestDPPYDeviceArrayArgsCPU(unittest.TestCase): def test_device_array_args_gpu(self): c = np.ones_like(a) @@ -41,5 +42,5 @@ def test_device_array_args_gpu(self): self.assertTrue(np.all(c == d)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_dpctl_api.py b/numba_dppy/tests/test_dpctl_api.py index 59ddd16f65..29d31bbb2b 100644 --- a/numba_dppy/tests/test_dpctl_api.py +++ b/numba_dppy/tests/test_dpctl_api.py @@ -2,7 +2,7 @@ import dpctl -@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestDPCTLAPI(unittest.TestCase): def test_dpctl_api(self): with dpctl.device_context("opencl:gpu") as gpu_queue: @@ -16,5 +16,5 @@ def test_dpctl_api(self): dpctl.is_in_device_context() -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_dpnp_functions.py b/numba_dppy/tests/test_dpnp_functions.py index 166937c275..75f22d1274 100644 --- a/numba_dppy/tests/test_dpnp_functions.py +++ b/numba_dppy/tests/test_dpnp_functions.py @@ -13,12 +13,16 @@ import dpctl -def test_for_different_datatypes(fn, test_fn, dims, arg_count, tys, np_all=False, matrix=None): + +def test_for_different_datatypes( + fn, test_fn, dims, arg_count, tys, np_all=False, matrix=None +): if arg_count == 1: for ty in tys: if matrix and matrix[0]: - a = np.array(np.random.random( - dims[0] * dims[1]), dtype=ty).reshape(dims[0], dims[1]) + a = np.array(np.random.random(dims[0] * dims[1]), dtype=ty).reshape( + dims[0], dims[1] + ) else: a = np.array(np.random.random(dims[0]), dtype=ty) @@ -36,13 +40,15 @@ def test_for_different_datatypes(fn, test_fn, dims, arg_count, tys, np_all=False elif arg_count == 2: for ty in tys: if matrix and matrix[0]: - a = np.array(np.random.random( - dims[0] * dims[1]), dtype=ty).reshape(dims[0], dims[1]) + a = np.array(np.random.random(dims[0] * dims[1]), dtype=ty).reshape( + dims[0], dims[1] + ) else: a = np.array(np.random.random(dims[0] * dims[1]), dtype=ty) if matrix and matrix[1]: - b = np.array(np.random.random( - dims[2] * dims[3]), dtype=ty).reshape(dims[2], dims[3]) + b = np.array(np.random.random(dims[2] * dims[3]), dtype=ty).reshape( + dims[2], dims[3] + ) else: b = np.array(np.random.random(dims[2] * dims[3]), dtype=ty) @@ -100,9 +106,10 @@ def vvsort(val, vec, size): vec[k, imax] = temp 
-@unittest.skipUnless(ensure_dpnp(), 'test only when dpNP is available') +@unittest.skipUnless(ensure_dpnp(), "test only when dpNP is available") class Testdpnp_linalg_functions(unittest.TestCase): tys = [np.int32, np.uint32, np.int64, np.uint64, np.float, np.double] + def test_eig(self): @njit def f(a): @@ -111,7 +118,11 @@ def f(a): size = 3 for ty in self.tys: a = np.arange(size * size, dtype=ty).reshape((size, size)) - symm_a = np.tril(a) + np.tril(a, -1).T + np.diag(np.full((size,), size * size, dtype=ty)) + symm_a = ( + np.tril(a) + + np.tril(a, -1).T + + np.diag(np.full((size,), size * size, dtype=ty)) + ) with dpctl.device_context("opencl:gpu"): got_val, got_vec = f(symm_a) @@ -122,8 +133,7 @@ def f(a): vvsort(got_val, got_vec, size) vvsort(np_val, np_vec, size) - - # NP change sign of vectors + # NP change sign of vectors for i in range(np_vec.shape[1]): if np_vec[0, i] * got_vec[0, i] < 0: np_vec[:, i] = -np_vec[:, i] @@ -132,9 +142,10 @@ def f(a): self.assertTrue(np.allclose(got_vec, np_vec)) -@unittest.skipUnless(ensure_dpnp(), 'test only when dpNP is available') +@unittest.skipUnless(ensure_dpnp(), "test only when dpNP is available") class Testdpnp_ndarray_functions(unittest.TestCase): tys = [np.int32, np.uint32, np.int64, np.uint64, np.float, np.double] + def test_ndarray_sum(self): @njit def f(a): @@ -225,7 +236,6 @@ def f(a): self.assertTrue(expected == got) - def test_ndarray_argmin(self): @njit def f(a): @@ -257,7 +267,9 @@ def f(a): self.assertTrue(np.array_equal(expected, got)) -@unittest.skipUnless(ensure_dpnp() and dpctl.has_gpu_queues(), 'test only when dpNP and GPU is available') +@unittest.skipUnless( + ensure_dpnp() and dpctl.has_gpu_queues(), "test only when dpNP and GPU is available" +) class Testdpnp_functions(unittest.TestCase): N = 10 @@ -271,8 +283,7 @@ def f(a): c = np.sum(a) return c - self.assertTrue(test_for_different_datatypes( - f, np.sum, [10], 1, self.tys)) + self.assertTrue(test_for_different_datatypes(f, np.sum, [10], 1, self.tys)) self.assertTrue(test_for_dimensions(f, np.sum, [10, 2], self.tys)) self.assertTrue(test_for_dimensions(f, np.sum, [10, 2, 3], self.tys)) @@ -282,8 +293,7 @@ def f(a): c = np.prod(a) return c - self.assertTrue(test_for_different_datatypes( - f, np.prod, [10], 1, self.tys)) + self.assertTrue(test_for_different_datatypes(f, np.prod, [10], 1, self.tys)) self.assertTrue(test_for_dimensions(f, np.prod, [10, 2], self.tys)) self.assertTrue(test_for_dimensions(f, np.prod, [10, 2, 3], self.tys)) @@ -293,11 +303,9 @@ def f(a): c = np.argmax(a) return c - self.assertTrue(test_for_different_datatypes( - f, np.argmax, [10], 1, self.tys)) + self.assertTrue(test_for_different_datatypes(f, np.argmax, [10], 1, self.tys)) self.assertTrue(test_for_dimensions(f, np.argmax, [10, 2], self.tys)) - self.assertTrue(test_for_dimensions( - f, np.argmax, [10, 2, 3], self.tys)) + self.assertTrue(test_for_dimensions(f, np.argmax, [10, 2, 3], self.tys)) def test_max(self): @njit @@ -305,8 +313,7 @@ def f(a): c = np.max(a) return c - self.assertTrue(test_for_different_datatypes( - f, np.max, [10], 1, self.tys)) + self.assertTrue(test_for_different_datatypes(f, np.max, [10], 1, self.tys)) self.assertTrue(test_for_dimensions(f, np.max, [10, 2], self.tys)) self.assertTrue(test_for_dimensions(f, np.max, [10, 2, 3], self.tys)) @@ -316,23 +323,19 @@ def f(a): c = np.amax(a) return c - self.assertTrue(test_for_different_datatypes( - f, np.amax, [10], 1, self.tys)) + self.assertTrue(test_for_different_datatypes(f, np.amax, [10], 1, self.tys)) 
self.assertTrue(test_for_dimensions(f, np.amax, [10, 2], self.tys)) self.assertTrue(test_for_dimensions(f, np.amax, [10, 2, 3], self.tys)) - def test_argmin(self): @njit def f(a): c = np.argmin(a) return c - self.assertTrue(test_for_different_datatypes( - f, np.argmin, [10], 1, self.tys)) + self.assertTrue(test_for_different_datatypes(f, np.argmin, [10], 1, self.tys)) self.assertTrue(test_for_dimensions(f, np.argmin, [10, 2], self.tys)) - self.assertTrue(test_for_dimensions( - f, np.argmin, [10, 2, 3], self.tys)) + self.assertTrue(test_for_dimensions(f, np.argmin, [10, 2, 3], self.tys)) def test_min(self): @njit @@ -340,8 +343,7 @@ def f(a): c = np.min(a) return c - self.assertTrue(test_for_different_datatypes( - f, np.min, [10], 1, self.tys)) + self.assertTrue(test_for_different_datatypes(f, np.min, [10], 1, self.tys)) self.assertTrue(test_for_dimensions(f, np.min, [10, 2], self.tys)) self.assertTrue(test_for_dimensions(f, np.min, [10, 2, 3], self.tys)) @@ -351,8 +353,7 @@ def f(a): c = np.amin(a) return c - self.assertTrue(test_for_different_datatypes( - f, np.min, [10], 1, self.tys)) + self.assertTrue(test_for_different_datatypes(f, np.min, [10], 1, self.tys)) self.assertTrue(test_for_dimensions(f, np.min, [10, 2], self.tys)) self.assertTrue(test_for_dimensions(f, np.min, [10, 2, 3], self.tys)) @@ -362,8 +363,9 @@ def f(a): c = np.argsort(a) return c - self.assertTrue(test_for_different_datatypes( - f, np.argmin, [10], 1, self.tys, np_all=True)) + self.assertTrue( + test_for_different_datatypes(f, np.argmin, [10], 1, self.tys, np_all=True) + ) def test_median(self): @njit @@ -371,11 +373,9 @@ def f(a): c = np.median(a) return c - self.assertTrue(test_for_different_datatypes( - f, np.median, [10], 1, self.tys)) + self.assertTrue(test_for_different_datatypes(f, np.median, [10], 1, self.tys)) self.assertTrue(test_for_dimensions(f, np.median, [10, 2], self.tys)) - self.assertTrue(test_for_dimensions( - f, np.median, [10, 2, 3], self.tys)) + self.assertTrue(test_for_dimensions(f, np.median, [10, 2, 3], self.tys)) def test_mean(self): @njit @@ -383,8 +383,7 @@ def f(a): c = np.mean(a) return c - self.assertTrue(test_for_different_datatypes( - f, np.mean, [10], 1, self.tys)) + self.assertTrue(test_for_different_datatypes(f, np.mean, [10], 1, self.tys)) self.assertTrue(test_for_dimensions(f, np.mean, [10, 2], self.tys)) self.assertTrue(test_for_dimensions(f, np.mean, [10, 2, 3], self.tys)) @@ -394,8 +393,17 @@ def f(a, b): c = np.matmul(a, b) return c - self.assertTrue(test_for_different_datatypes(f, np.matmul, [10, 5, 5, 10], 2, [ - np.float, np.double], np_all=True, matrix=[True, True])) + self.assertTrue( + test_for_different_datatypes( + f, + np.matmul, + [10, 5, 5, 10], + 2, + [np.float, np.double], + np_all=True, + matrix=[True, True], + ) + ) def test_dot(self): @njit @@ -403,14 +411,44 @@ def f(a, b): c = np.dot(a, b) return c - self.assertTrue(test_for_different_datatypes( - f, np.dot, [10, 1, 10, 1], 2, [np.float, np.double])) - self.assertTrue(test_for_different_datatypes(f, np.dot, [10, 1, 10, 2], 2, [ - np.float, np.double], matrix=[False, True], np_all=True)) - self.assertTrue(test_for_different_datatypes(f, np.dot, [2, 10, 10, 1], 2, [ - np.float, np.double], matrix=[True, False], np_all=True)) - self.assertTrue(test_for_different_datatypes(f, np.dot, [10, 2, 2, 10], 2, [ - np.float, np.double], matrix=[True, True], np_all=True)) + self.assertTrue( + test_for_different_datatypes( + f, np.dot, [10, 1, 10, 1], 2, [np.float, np.double] + ) + ) + self.assertTrue( + 
test_for_different_datatypes( + f, + np.dot, + [10, 1, 10, 2], + 2, + [np.float, np.double], + matrix=[False, True], + np_all=True, + ) + ) + self.assertTrue( + test_for_different_datatypes( + f, + np.dot, + [2, 10, 10, 1], + 2, + [np.float, np.double], + matrix=[True, False], + np_all=True, + ) + ) + self.assertTrue( + test_for_different_datatypes( + f, + np.dot, + [10, 2, 2, 10], + 2, + [np.float, np.double], + matrix=[True, True], + np_all=True, + ) + ) def test_cov(self): @njit @@ -418,23 +456,26 @@ def f(a): c = np.cov(a) return c - self.assertTrue(test_for_different_datatypes( - f, np.cov, [10, 7], 1, self.tys, matrix=[True], np_all=True)) + self.assertTrue( + test_for_different_datatypes( + f, np.cov, [10, 7], 1, self.tys, matrix=[True], np_all=True + ) + ) def test_dpnp_interacting_with_parfor(self): @njit def f(a, b): c = np.sum(a) e = np.add(b, a) - #d = a + 1 + # d = a + 1 return 0 result = f(self.a, self.b) - #np_result = np.add((self.a + np.sum(self.a)), self.b) + # np_result = np.add((self.a + np.sum(self.a)), self.b) - #max_abs_err = result.sum() - np_result.sum() - #self.assertTrue(max_abs_err < 1e-4) + # max_abs_err = result.sum() - np_result.sum() + # self.assertTrue(max_abs_err < 1e-4) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_dppy_fallback.py b/numba_dppy/tests/test_dppy_fallback.py index dd05bbdc84..3ebad7aed9 100644 --- a/numba_dppy/tests/test_dppy_fallback.py +++ b/numba_dppy/tests/test_dppy_fallback.py @@ -6,7 +6,7 @@ import dpctl -@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestDPPYFallback(unittest.TestCase): def test_dppy_fallback_inner_call(self): @numba.jit @@ -29,8 +29,7 @@ def inner_call_fallback(): ref_result = inner_call_fallback() np.testing.assert_array_equal(dppy_result, ref_result) - self.assertTrue( - 'Failed to lower parfor on DPPY-device' in msg.getvalue()) + self.assertTrue("Failed to lower parfor on DPPY-device" in msg.getvalue()) def test_dppy_fallback_reductions(self): def reduction(a): @@ -47,9 +46,8 @@ def reduction(a): ref_result = reduction(a) np.testing.assert_array_equal(dppy_result, ref_result) - self.assertTrue( - 'Failed to lower parfor on DPPY-device' in msg.getvalue()) + self.assertTrue("Failed to lower parfor on DPPY-device" in msg.getvalue()) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_dppy_func.py b/numba_dppy/tests/test_dppy_func.py index 729030e153..69ff82b38a 100644 --- a/numba_dppy/tests/test_dppy_func.py +++ b/numba_dppy/tests/test_dppy_func.py @@ -5,7 +5,7 @@ import dpctl -@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestDPPYFunc(unittest.TestCase): N = 257 @@ -25,7 +25,6 @@ def f(a, b): with dpctl.device_context("opencl:gpu") as gpu_queue: f[self.N, dppy.DEFAULT_LOCAL_SIZE](a, b) - self.assertTrue(np.all(b == 2)) def test_dppy_func_ndarray(self): @@ -56,5 +55,5 @@ def h(a, b): self.assertTrue(np.all(b == 3)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_math_functions.py b/numba_dppy/tests/test_math_functions.py index 6336c63759..e09202c6a3 100644 --- a/numba_dppy/tests/test_math_functions.py +++ b/numba_dppy/tests/test_math_functions.py @@ -5,46 +5,55 @@ import unittest import math + @dppy.kernel -def dppy_fabs(a,b): +def 
dppy_fabs(a, b): i = dppy.get_global_id(0) b[i] = math.fabs(a[i]) + @dppy.kernel -def dppy_exp(a,b): +def dppy_exp(a, b): i = dppy.get_global_id(0) b[i] = math.exp(a[i]) + @dppy.kernel -def dppy_log(a,b): +def dppy_log(a, b): i = dppy.get_global_id(0) b[i] = math.log(a[i]) + @dppy.kernel -def dppy_sqrt(a,b): +def dppy_sqrt(a, b): i = dppy.get_global_id(0) b[i] = math.sqrt(a[i]) + @dppy.kernel -def dppy_sin(a,b): +def dppy_sin(a, b): i = dppy.get_global_id(0) b[i] = math.sin(a[i]) + @dppy.kernel -def dppy_cos(a,b): +def dppy_cos(a, b): i = dppy.get_global_id(0) b[i] = math.cos(a[i]) + @dppy.kernel -def dppy_tan(a,b): +def dppy_tan(a, b): i = dppy.get_global_id(0) b[i] = math.tan(a[i]) + global_size = 10 N = global_size a = np.array(np.random.random(N), dtype=np.float32) + def driver(a, jitfunc): b = np.ones_like(a) # Device buffers @@ -67,7 +76,7 @@ def test_driver(input_arr, device_ty, jitfunc): return out_actual -@unittest.skipUnless(dpctl.has_cpu_queues(), 'test only on CPU system') +@unittest.skipUnless(dpctl.has_cpu_queues(), "test only on CPU system") class TestDPPYMathFunctionsCPU(unittest.TestCase): def test_fabs_cpu(self): b_actual = test_driver(a, "CPU", dppy_fabs) @@ -77,30 +86,30 @@ def test_fabs_cpu(self): def test_sin_cpu(self): b_actual = test_driver(a, "CPU", dppy_sin) b_expected = np.sin(a) - self.assertTrue(np.allclose(b_actual,b_expected)) + self.assertTrue(np.allclose(b_actual, b_expected)) def test_cos_cpu(self): b_actual = test_driver(a, "CPU", dppy_cos) b_expected = np.cos(a) - self.assertTrue(np.allclose(b_actual,b_expected)) + self.assertTrue(np.allclose(b_actual, b_expected)) def test_exp_cpu(self): b_actual = test_driver(a, "CPU", dppy_exp) b_expected = np.exp(a) - self.assertTrue(np.allclose(b_actual,b_expected)) + self.assertTrue(np.allclose(b_actual, b_expected)) def test_sqrt_cpu(self): b_actual = test_driver(a, "CPU", dppy_sqrt) b_expected = np.sqrt(a) - self.assertTrue(np.allclose(b_actual,b_expected)) + self.assertTrue(np.allclose(b_actual, b_expected)) def test_log_cpu(self): b_actual = test_driver(a, "CPU", dppy_log) b_expected = np.log(a) - self.assertTrue(np.allclose(b_actual,b_expected)) + self.assertTrue(np.allclose(b_actual, b_expected)) -@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestDPPYMathFunctionsGPU(unittest.TestCase): def test_fabs_gpu(self): b_actual = test_driver(a, "GPU", dppy_fabs) @@ -110,28 +119,28 @@ def test_fabs_gpu(self): def test_sin_gpu(self): b_actual = test_driver(a, "GPU", dppy_sin) b_expected = np.sin(a) - self.assertTrue(np.allclose(b_actual,b_expected)) + self.assertTrue(np.allclose(b_actual, b_expected)) def test_cos_gpu(self): b_actual = test_driver(a, "GPU", dppy_cos) b_expected = np.cos(a) - self.assertTrue(np.allclose(b_actual,b_expected)) + self.assertTrue(np.allclose(b_actual, b_expected)) def test_exp_gpu(self): b_actual = test_driver(a, "GPU", dppy_exp) b_expected = np.exp(a) - self.assertTrue(np.allclose(b_actual,b_expected)) + self.assertTrue(np.allclose(b_actual, b_expected)) def test_sqrt_gpu(self): b_actual = test_driver(a, "GPU", dppy_sqrt) b_expected = np.sqrt(a) - self.assertTrue(np.allclose(b_actual,b_expected)) + self.assertTrue(np.allclose(b_actual, b_expected)) def test_log_gpu(self): b_actual = test_driver(a, "GPU", dppy_log) b_expected = np.log(a) - self.assertTrue(np.allclose(b_actual,b_expected)) + self.assertTrue(np.allclose(b_actual, b_expected)) -if __name__ == '__main__': +if __name__ == 
"__main__": unittest.main() diff --git a/numba_dppy/tests/test_numpy_bit_twiddling_functions.py b/numba_dppy/tests/test_numpy_bit_twiddling_functions.py index 21a8fc8444..c6ffd9433f 100644 --- a/numba_dppy/tests/test_numpy_bit_twiddling_functions.py +++ b/numba_dppy/tests/test_numpy_bit_twiddling_functions.py @@ -5,7 +5,7 @@ import unittest -@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestNumpy_bit_twiddling_functions(unittest.TestCase): def test_bitwise_and(self): @njit @@ -111,5 +111,5 @@ def f(a, b): self.assertTrue(np.all(c == d)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_numpy_comparison_functions.py b/numba_dppy/tests/test_numpy_comparison_functions.py index 9d56e94374..b4f47d96b6 100644 --- a/numba_dppy/tests/test_numpy_comparison_functions.py +++ b/numba_dppy/tests/test_numpy_comparison_functions.py @@ -5,7 +5,7 @@ import unittest -@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestNumpy_comparison_functions(unittest.TestCase): a = np.array([4, 5, 6]) b = np.array([2, 6, 6]) @@ -202,5 +202,5 @@ def f(a, b): np.testing.assert_equal(c, d) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_numpy_floating_functions.py b/numba_dppy/tests/test_numpy_floating_functions.py index 8df7e2b5d4..6e746c4ed7 100644 --- a/numba_dppy/tests/test_numpy_floating_functions.py +++ b/numba_dppy/tests/test_numpy_floating_functions.py @@ -4,7 +4,7 @@ import unittest -@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestNumpy_floating_functions(unittest.TestCase): def test_isfinite(self): @njit @@ -12,7 +12,7 @@ def f(a): c = np.isfinite(a) return c - test_arr = [np.log(-1.), 1., np.log(0)] + test_arr = [np.log(-1.0), 1.0, np.log(0)] input_arr = np.asarray(test_arr, dtype=np.float32) with dpctl.device_context("opencl:gpu"): @@ -27,7 +27,7 @@ def f(a): c = np.isinf(a) return c - test_arr = [np.log(-1.), 1., np.log(0)] + test_arr = [np.log(-1.0), 1.0, np.log(0)] input_arr = np.asarray(test_arr, dtype=np.float32) with dpctl.device_context("opencl:gpu"): @@ -42,7 +42,7 @@ def f(a): c = np.isnan(a) return c - test_arr = [np.log(-1.), 1., np.log(0)] + test_arr = [np.log(-1.0), 1.0, np.log(0)] input_arr = np.asarray(test_arr, dtype=np.float32) with dpctl.device_context("opencl:gpu"): @@ -94,5 +94,5 @@ def f(a): self.assertTrue(np.all(c == d)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_numpy_math_functions.py b/numba_dppy/tests/test_numpy_math_functions.py index ef5dc235b8..4a701495f9 100644 --- a/numba_dppy/tests/test_numpy_math_functions.py +++ b/numba_dppy/tests/test_numpy_math_functions.py @@ -5,7 +5,8 @@ import unittest from . 
import skip_tests -@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') + +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestNumpy_math_functions(unittest.TestCase): N = 10 a = np.array(np.random.random(N), dtype=np.float32) @@ -138,7 +139,7 @@ def f(a, b): with dpctl.device_context("opencl:gpu"): c = f(input_arr, divisor) - self.assertTrue(np.all(c == 1.)) + self.assertTrue(np.all(c == 1.0)) def test_abs(self): @njit @@ -191,7 +192,7 @@ def f(a): with dpctl.device_context("opencl:gpu"): c = f(input_arr) - self.assertTrue(np.all(c == -1.)) + self.assertTrue(np.all(c == -1.0)) def test_conj(self): @njit @@ -322,7 +323,7 @@ def f(a): with dpctl.device_context("opencl:gpu"): c = f(input_arr) - self.assertTrue(np.all(c == 1/input_arr)) + self.assertTrue(np.all(c == 1 / input_arr)) def test_conjugate(self): @njit @@ -339,5 +340,5 @@ def f(a): self.assertTrue(np.all(c == d)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_numpy_trigonomteric_functions.py b/numba_dppy/tests/test_numpy_trigonomteric_functions.py index 812f3d060c..a67862032b 100644 --- a/numba_dppy/tests/test_numpy_trigonomteric_functions.py +++ b/numba_dppy/tests/test_numpy_trigonomteric_functions.py @@ -6,7 +6,7 @@ from . import skip_tests -@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestNumpy_math_functions(unittest.TestCase): N = 10 @@ -238,5 +238,5 @@ def f(a): self.assertTrue(max_abs_err < 1e-5) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_offload_diagnostics.py b/numba_dppy/tests/test_offload_diagnostics.py index 6b41252fc6..9faebed088 100644 --- a/numba_dppy/tests/test_offload_diagnostics.py +++ b/numba_dppy/tests/test_offload_diagnostics.py @@ -16,7 +16,7 @@ def prange_func(): a = np.ones((n), dtype=np.float64) b = np.ones((n), dtype=np.float64) c = np.ones((n), dtype=np.float64) - for i in prange(n//2): + for i in prange(n // 2): a[i] = b[i] + c[i] return a @@ -56,5 +56,5 @@ def parallel_sum(a, b, c): self.assertTrue("Device -" in got.getvalue()) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_parfor_lower_message.py b/numba_dppy/tests/test_parfor_lower_message.py index 17f1456bb6..0e578ce154 100644 --- a/numba_dppy/tests/test_parfor_lower_message.py +++ b/numba_dppy/tests/test_parfor_lower_message.py @@ -13,7 +13,7 @@ def prange_example(): a = np.ones((n), dtype=np.float64) b = np.ones((n), dtype=np.float64) c = np.ones((n), dtype=np.float64) - for i in prange(n//2): + for i in prange(n // 2): a[i] = b[i] + c[i] return a @@ -33,5 +33,5 @@ def test_parfor_message(self): self.assertTrue("Parfor lowered on DPPY-device" in got.getvalue()) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_prange.py b/numba_dppy/tests/test_prange.py index eda9ccebbc..8f5305198a 100644 --- a/numba_dppy/tests/test_prange.py +++ b/numba_dppy/tests/test_prange.py @@ -96,14 +96,14 @@ def f(a, b): self.assertTrue(np.all(b == 12)) - @unittest.skip('numba-dppy issue 110') + @unittest.skip("numba-dppy issue 110") def test_two_consequent_prange(self): def prange_example(): n = 10 a = np.ones((n), dtype=np.float64) b = np.ones((n), dtype=np.float64) c = np.ones((n), dtype=np.float64) - for i in prange(n//2): + for i in prange(n // 2): a[i] = b[i] + c[i] return a @@ 
-120,20 +120,26 @@ def prange_example(): numba_dppy.compiler.DEBUG = old_debug - self.assertEqual(stdout.getvalue().count( - 'Parfor lowered on DPPY-device'), 2, stdout.getvalue()) - self.assertEqual(stdout.getvalue().count( - 'Failed to lower parfor on DPPY-device'), 0, stdout.getvalue()) + self.assertEqual( + stdout.getvalue().count("Parfor lowered on DPPY-device"), + 2, + stdout.getvalue(), + ) + self.assertEqual( + stdout.getvalue().count("Failed to lower parfor on DPPY-device"), + 0, + stdout.getvalue(), + ) np.testing.assert_equal(res, jitted_res) - @unittest.skip('NRT required but not enabled') + @unittest.skip("NRT required but not enabled") def test_2d_arrays(self): def prange_example(): n = 10 a = np.ones((n, n), dtype=np.float64) b = np.ones((n, n), dtype=np.float64) c = np.ones((n, n), dtype=np.float64) - for i in prange(n//2): + for i in prange(n // 2): a[i] = b[i] + c[i] return a @@ -150,12 +156,18 @@ def prange_example(): numba_dppy.compiler.DEBUG = old_debug - self.assertEqual(stdout.getvalue().count( - 'Parfor lowered on DPPY-device'), 2, stdout.getvalue()) - self.assertEqual(stdout.getvalue().count( - 'Failed to lower parfor on DPPY-device'), 0, stdout.getvalue()) + self.assertEqual( + stdout.getvalue().count("Parfor lowered on DPPY-device"), + 2, + stdout.getvalue(), + ) + self.assertEqual( + stdout.getvalue().count("Failed to lower parfor on DPPY-device"), + 0, + stdout.getvalue(), + ) np.testing.assert_equal(res, jitted_res) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_print.py b/numba_dppy/tests/test_print.py index af19658048..8beca0a83f 100644 --- a/numba_dppy/tests/test_print.py +++ b/numba_dppy/tests/test_print.py @@ -7,7 +7,7 @@ import dpctl -@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestPrint(unittest.TestCase): def test_print_dppy_kernel(self): @dppy.func @@ -30,5 +30,5 @@ def f(a, b): f[N, dppy.DEFAULT_LOCAL_SIZE](a, b) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_rename_numpy_function_pass.py b/numba_dppy/tests/test_rename_numpy_function_pass.py index cfeff09b8d..7ef237fcd0 100644 --- a/numba_dppy/tests/test_rename_numpy_function_pass.py +++ b/numba_dppy/tests/test_rename_numpy_function_pass.py @@ -7,10 +7,12 @@ from numba_dppy.testing import ensure_dpnp -from numba.core import (compiler, typing, cpu) -from numba_dppy.rename_numpy_functions_pass import (DPPYRewriteOverloadedNumPyFunctions, - DPPYRewriteNdarrayFunctions) -from numba.core.typed_passes import (NopythonTypeInference, AnnotateTypes) +from numba.core import compiler, typing, cpu +from numba_dppy.rename_numpy_functions_pass import ( + DPPYRewriteOverloadedNumPyFunctions, + DPPYRewriteNdarrayFunctions, +) +from numba.core.typed_passes import NopythonTypeInference, AnnotateTypes class MyPipeline(object): @@ -46,8 +48,10 @@ def check_equivalent(expected_ir, got_ir): else: if isinstance(expected_stmt, numba.core.ir.Assign): if isinstance(expected_stmt.value, numba.core.ir.Global): - if (expected_stmt.value.name != got_stmt.value.name and - expected_stmt.value.name != "numba_dppy"): + if ( + expected_stmt.value.name != got_stmt.value.name + and expected_stmt.value.name != "numba_dppy" + ): return False elif isinstance(expected_stmt.value, numba.core.ir.Expr): # should get "dpnp" and "sum" as attr @@ -76,7 +80,7 @@ def got(a): self.assertTrue(check_equivalent(expected_ir, 
pipeline.state.func_ir)) -@unittest.skipUnless(ensure_dpnp(), 'test only when dpNP is available') +@unittest.skipUnless(ensure_dpnp(), "test only when dpNP is available") class TestRenameNdarrayFunctionsPass(unittest.TestCase): def test_rename_ndarray(self): def expected(a): diff --git a/numba_dppy/tests/test_sum_reduction.py b/numba_dppy/tests/test_sum_reduction.py index 37ca38a12a..c2001e13a6 100644 --- a/numba_dppy/tests/test_sum_reduction.py +++ b/numba_dppy/tests/test_sum_reduction.py @@ -4,34 +4,37 @@ import unittest import dpctl + @dppy.kernel def reduction_kernel(A, R, stride): i = dppy.get_global_id(0) # sum two element - R[i] = A[i] + A[i+stride] + R[i] = A[i] + A[i + stride] # store the sum to be used in nex iteration A[i] = R[i] -@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestDPPYSumReduction(unittest.TestCase): def test_sum_reduction(self): # This test will only work for even case N = 1024 - self.assertTrue(N%2 == 0) + self.assertTrue(N % 2 == 0) A = np.array(np.random.random(N), dtype=np.float32) A_copy = A.copy() # at max we will require half the size of A to store sum - R = np.array(np.random.random(math.ceil(N/2)), dtype=np.float32) + R = np.array(np.random.random(math.ceil(N / 2)), dtype=np.float32) with dpctl.device_context("opencl:gpu") as gpu_queue: total = N - while (total > 1): + while total > 1: # call kernel global_size = total // 2 - reduction_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, R, global_size) + reduction_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE]( + A, R, global_size + ) total = total // 2 result = A_copy.sum() @@ -39,5 +42,5 @@ def test_sum_reduction(self): self.assertTrue(max_abs_err < 1e-4) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_vectorize.py b/numba_dppy/tests/test_vectorize.py index 5b3a41629c..6f93c232f6 100644 --- a/numba_dppy/tests/test_vectorize.py +++ b/numba_dppy/tests/test_vectorize.py @@ -8,20 +8,19 @@ @unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestVectorize(unittest.TestCase): def test_vectorize(self): - @vectorize(nopython=True) def axy(a, x, y): return a * x + y @njit def f(a0, a1): - return np.cos(axy(a0, np.sin(a1) - 1., 1.)) + return np.cos(axy(a0, np.sin(a1) - 1.0, 1.0)) def f_np(a0, a1): sin_res = np.sin(a1) res = [] for i in range(len(a0)): - res.append(axy(a0[i], sin_res[i] - 1., 1.)) + res.append(axy(a0[i], sin_res[i] - 1.0, 1.0)) return np.cos(np.array(res)) A = np.random.random(10) @@ -36,5 +35,5 @@ def f_np(a0, a1): self.assertTrue(max_abs_err < 1e-5) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_with_context.py b/numba_dppy/tests/test_with_context.py index 693c155ab2..58f14952d9 100644 --- a/numba_dppy/tests/test_with_context.py +++ b/numba_dppy/tests/test_with_context.py @@ -8,10 +8,8 @@ class TestWithDPPYContext(unittest.TestCase): - @unittest.skipIf(not dpctl.has_gpu_queues(), "No GPU platforms available") def test_with_dppy_context_gpu(self): - @njit def nested_func(a, b): np.sin(a, b) @@ -33,11 +31,10 @@ def func(b): func(expected) np.testing.assert_array_equal(expected, got_gpu) - self.assertTrue('Parfor lowered on DPPY-device' in got_gpu_message.getvalue()) + self.assertTrue("Parfor lowered on DPPY-device" in got_gpu_message.getvalue()) @unittest.skipIf(not dpctl.has_cpu_queues(), "No CPU platforms available") def test_with_dppy_context_cpu(self): 
- @njit def nested_func(a, b): np.sin(a, b) @@ -59,17 +56,15 @@ def func(b): func(expected) np.testing.assert_array_equal(expected, got_cpu) - self.assertTrue('Parfor lowered on DPPY-device' in got_cpu_message.getvalue()) - + self.assertTrue("Parfor lowered on DPPY-device" in got_cpu_message.getvalue()) @unittest.skipIf(not dpctl.has_gpu_queues(), "No GPU platforms available") def test_with_dppy_context_target(self): - - @njit(target='cpu') + @njit(target="cpu") def nested_func_target(a, b): np.sin(a, b) - @njit(target='gpu') + @njit(target="gpu") def func_target(b): a = np.ones((64), dtype=np.float64) nested_func_target(a, b) @@ -84,7 +79,6 @@ def func_no_parallel(b): a = np.ones((64), dtype=np.float64) return a - a = np.ones((64), dtype=np.float64) b = np.ones((64), dtype=np.float64) @@ -112,5 +106,5 @@ def func_no_parallel(b): self.assertTrue(msg_2 in str(raises_4.exception)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/setup.py b/setup.py index a6dcfd4d32..5ce0234bb8 100644 --- a/setup.py +++ b/setup.py @@ -10,6 +10,7 @@ def get_ext_modules(): ext_modules = [] import numba + ext_dppy = Extension( name="numba_dppy._dppy_rt", sources=["numba_dppy/dppy_rt.c"], @@ -28,13 +29,15 @@ def get_ext_modules(): if dpnp_present: dpnp_lib_path = [] dpnp_lib_path += [os.path.dirname(dpnp.__file__)] - ext_dpnp_glue = Extension(name='numba_dppy.dpnp_glue.dpnp_fptr_interface', - sources=['numba_dppy/dpnp_glue/dpnp_fptr_interface.pyx'], - include_dirs=[dpnp.get_include()], - libraries=['dpnp_backend_c'], - library_dirs=dpnp_lib_path, - runtime_library_dirs=dpnp_lib_path, - language="c++") + ext_dpnp_glue = Extension( + name="numba_dppy.dpnp_glue.dpnp_fptr_interface", + sources=["numba_dppy/dpnp_glue/dpnp_fptr_interface.pyx"], + include_dirs=[dpnp.get_include()], + libraries=["dpnp_backend_c"], + library_dirs=dpnp_lib_path, + runtime_library_dirs=dpnp_lib_path, + language="c++", + ) ext_modules += [ext_dpnp_glue] if dpnp_present: @@ -76,7 +79,8 @@ def get_ext_modules(): entry_points={ "numba_extensions": [ "init = numba_dppy.numpy_usm_shared:numba_register", - ]}, + ] + }, ) setup(**metadata) From 88cac155f5acc552b317486789b718a1612636bf Mon Sep 17 00:00:00 2001 From: "Todd A. Anderson" Date: Wed, 6 Jan 2021 17:44:40 -0600 Subject: [PATCH 33/40] Add ndarray.reshape test that is at the moment an expected failure. --- numba_dppy/tests/test_usmarray.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/numba_dppy/tests/test_usmarray.py b/numba_dppy/tests/test_usmarray.py index b79f13d650..b6a58b3e59 100644 --- a/numba_dppy/tests/test_usmarray.py +++ b/numba_dppy/tests/test_usmarray.py @@ -68,6 +68,11 @@ def numba_T(x): return x.T +@numba.njit +def numba_reshape(x): + return x.reshape((4,3)) + + class TestUsmArray(unittest.TestCase): def ndarray(self): """Create NumPy array""" @@ -196,3 +201,11 @@ def test_numba_usmarray_T(self): dp4 = numba_T(numba_usmarray_empty()) self.assertIsInstance(dp4, usmarray.ndarray, type(dp4)) self.assertTrue(usmarray.has_array_interface(dp4)) + + @unittest.expectedFailure + def test_numba_usmarray_reshape(self): + """Testing Numba usmarray.reshape()""" + a = usmarray.ones(12) + s1 = numba_reshape(a) + self.assertIsInstance(s1, usmarray.ndarray, type(s1)) + self.assertEqual(s1.shape, (4, 3)) From 3781e173572f08bb6645908489ce1ebf4a915e06 Mon Sep 17 00:00:00 2001 From: "Todd A. Anderson" Date: Wed, 6 Jan 2021 17:45:59 -0600 Subject: [PATCH 34/40] Support for typing and lowering of member function of usmarray.ndarray. 
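In test terms, this lets njit-compiled code call ndarray members on USM-shared arrays directly. A minimal sketch, assuming the same imports the tests use (numba, and dpctl.dptensor.numpy_usm_shared as usmarray); the set of members that actually resolves is whatever the usmarray.* bound functions registered below cover:

    import numba
    import dpctl.dptensor.numpy_usm_shared as usmarray

    @numba.njit
    def transpose(x):
        # x.T resolves through the usmarray attribute template, so the
        # result keeps the USM-shared array type instead of decaying to
        # a plain NumPy array type.
        return x.T

    a = usmarray.ones(12)   # USM-shared allocation, as in test_usmarray.py
    t = transpose(a)        # t is still recognized as a usmarray.ndarray
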
--- numba_dppy/numpy_usm_shared.py | 70 ++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 24 deletions(-) diff --git a/numba_dppy/numpy_usm_shared.py b/numba_dppy/numpy_usm_shared.py index 62ecc7435b..a34dd834c3 100644 --- a/numba_dppy/numpy_usm_shared.py +++ b/numba_dppy/numpy_usm_shared.py @@ -26,6 +26,7 @@ signature, bound_function, ) +from numba.core.typing.arraydecl import normalize_shape from numba.np.arrayobj import _array_copy import dpctl.dptensor.numpy_usm_shared as nus @@ -83,6 +84,13 @@ def __init__( addrspace=addrspace, ) + def copy(self, *args, **kwargs): + retty = super(UsmSharedArrayType, self).copy(*args, **kwargs) + if isinstance(retty, types.Array): + return UsmSharedArrayType(dtype=retty.dtype, ndim=retty.ndim, layout=retty.layout) + else: + return retty + # Tell Numba typing how to combine UsmSharedArrayType with other ndarray types. def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): if method == "__call__": @@ -162,12 +170,13 @@ def allocator_UsmArray(context, builder, size, align): def is_usm_callback(obj): + dprint("is_usm_callback:", obj, type(obj)) if isinstance(obj, numba.core.runtime._nrt_python._MemInfo): mobj = obj while isinstance(mobj, numba.core.runtime._nrt_python._MemInfo): ea = mobj.external_allocator - d = mobj.data dppl_rt_allocator = numba_dppy._dppy_rt.get_external_allocator() + dprint("Checking MemInfo:", ea) if ea == dppl_rt_allocator: return True mobj = mobj.parent @@ -210,25 +219,34 @@ def numba_register_lower_builtin(): todo = [] todo_builtin = [] todo_getattr = [] + todo_array_member_func = [] # For all Numpy identifiers that have been registered for typing in Numba... # this registry contains functions, getattrs, setattrs, casts and constants...need to do them all? FIX FIX FIX for ig in lower_registry.functions: impl, func, types = ig + dprint("Numpy lowered registry functions:", impl, func, type(func), types) # If it is a Numpy function... if isinstance(func, ftype): + dprint("is ftype") if func.__module__ == np.__name__: + dprint("is Numpy module") # If we have overloaded that function in the usmarray module (always True right now)... if func.__name__ in functions_list: todo.append(ig) if isinstance(func, bftype): + dprint("is bftype") if func.__module__ == np.__name__: + dprint("is Numpy module") # If we have overloaded that function in the usmarray module (always True right now)... 
if func.__name__ in functions_list: todo.append(ig) + if isinstance(func, str) and func.startswith("array."): + todo_array_member_func.append(ig) for lg in lower_registry.getattrs: func, attr, types = lg + dprint("Numpy lowered registry getattrs:", func, attr, types) types_with_usmarray = types_replace_array(types) if UsmSharedArrayType in types_with_usmarray: dprint( @@ -251,6 +269,13 @@ def numba_register_lower_builtin(): new_impl = copy_func_for_usmarray(impl, nus) lower_registry.functions.append((new_impl, usmarray_func, types)) + for impl, func, types in todo_array_member_func: + types_with_usmarray = types_replace_array(types) + usmarray_func = "usm" + func + dprint("Registering lowerer for", impl, usmarray_func, types_with_usmarray) + new_impl = copy_func_for_usmarray(impl, nus) + lower_registry.functions.append((new_impl, usmarray_func, types_with_usmarray)) + def argspec_to_string(argspec): first_default_arg = len(argspec.args) - len(argspec.defaults) @@ -470,7 +495,7 @@ def resolve_T(self, ary): else: layout = {"C": "F", "F": "C"}.get(ary.layout, "A") retty = ary.copy(layout=layout) - return self.convert_array_to_usmarray(retty) + return retty def resolve_real(self, ary): return self._resolve_real_imag(ary, attr="real") @@ -485,14 +510,19 @@ def _resolve_real_imag(self, ary, attr): res = ary.copy(dtype=ary.dtype) if attr == "imag": res = res.copy(readonly=True) - return self.convert_array_to_usmarray(res) + return res else: msg = "cannot access .{} of array of {}" raise TypingError(msg.format(attr, ary.dtype)) + @bound_function("usmarray.copy") + def resolve_copy(self, ary, args, kws): + assert not args + assert not kws + retty = ary.copy(layout="C", readonly=False) + return signature(retty) -""" - @bound_function("array.transpose") + @bound_function("usmarray.transpose") def resolve_transpose(self, ary, args, kws): def sentry_shape_scalar(ty): if ty in types.number_domain: @@ -531,14 +561,7 @@ def sentry_shape_scalar(ty): assert ary.ndim == len(args) return signature(self.resolve_T(ary).copy(layout="A"), *args) - @bound_function("array.copy") - def resolve_copy(self, ary, args, kws): - assert not args - assert not kws - retty = ary.copy(layout="C", readonly=False) - return signature(retty) - - @bound_function("array.item") + @bound_function("usmarray.item") def resolve_item(self, ary, args, kws): assert not kws # We don't support explicit arguments as that's exactly equivalent @@ -547,7 +570,7 @@ def resolve_item(self, ary, args, kws): if not args: return signature(ary.dtype) - @bound_function("array.itemset") + @bound_function("usmarray.itemset") def resolve_itemset(self, ary, args, kws): assert not kws # We don't support explicit arguments as that's exactly equivalent @@ -556,7 +579,7 @@ def resolve_itemset(self, ary, args, kws): if len(args) == 1: return signature(types.none, ary.dtype) - @bound_function("array.nonzero") + @bound_function("usmarray.nonzero") def resolve_nonzero(self, ary, args, kws): assert not args assert not kws @@ -565,7 +588,7 @@ def resolve_nonzero(self, ary, args, kws): retty = types.UniTuple(UsmSharedArrayType(types.intp, 1, 'C'), ndim) return signature(retty) - @bound_function("array.reshape") + @bound_function("usmarray.reshape") def resolve_reshape(self, ary, args, kws): def sentry_shape_scalar(ty): if ty in types.number_domain: @@ -608,14 +631,14 @@ def sentry_shape_scalar(ty): retty = ary.copy(ndim=len(args)) return signature(retty, *args) - @bound_function("array.sort") + @bound_function("usmarray.sort") def resolve_sort(self, ary, args, kws): 
assert not args assert not kws if ary.ndim == 1: return signature(types.none) - @bound_function("array.argsort") + @bound_function("usmarray.argsort") def resolve_argsort(self, ary, args, kws): assert not args kwargs = dict(kws) @@ -632,7 +655,7 @@ def argsort_stub(kind='quicksort'): sig = signature(UsmSharedArrayType(types.intp, 1, 'C'), kind).replace(pysig=pysig) return sig - @bound_function("array.view") + @bound_function("usmarray.view") def resolve_view(self, ary, args, kws): from .npydecl import parse_dtype assert not kws @@ -643,7 +666,7 @@ def resolve_view(self, ary, args, kws): retty = ary.copy(dtype=dtype) return signature(retty, *args) - @bound_function("array.astype") + @bound_function("usmarray.astype") def resolve_astype(self, ary, args, kws): from .npydecl import parse_dtype assert not kws @@ -661,21 +684,21 @@ def resolve_astype(self, ary, args, kws): retty = ary.copy(dtype=dtype, layout=layout, readonly=False) return signature(retty, *args) - @bound_function("array.ravel") + @bound_function("usmarray.ravel") def resolve_ravel(self, ary, args, kws): # Only support no argument version (default order='C') assert not kws assert not args return signature(ary.copy(ndim=1, layout='C')) - @bound_function("array.flatten") + @bound_function("usmarray.flatten") def resolve_flatten(self, ary, args, kws): # Only support no argument version (default order='C') assert not kws assert not args return signature(ary.copy(ndim=1, layout='C')) - @bound_function("array.take") + @bound_function("usmarray.take") def resolve_take(self, ary, args, kws): assert not kws argty, = args @@ -696,7 +719,6 @@ def generic_resolve(self, ary, attr): if isinstance(ary.dtype, types.Record): if attr in ary.dtype.fields: return ary.copy(dtype=ary.dtype.typeof(attr), layout='A') -""" @typing_registry.register_global(nus.as_ndarray) From dfb35601057f367d907ff33fdc3f6dedfd787d8b Mon Sep 17 00:00:00 2001 From: "Todd A. Anderson" Date: Wed, 6 Jan 2021 18:24:30 -0600 Subject: [PATCH 35/40] black formatter changes. 
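This is formatting only; the code it reflows is the typing layer added in the previous patch. A minimal type-level sketch of the copy() override that layer relies on, assuming UsmSharedArrayType is importable from numba_dppy.numpy_usm_shared and built from Numba's types module:

    from numba.core import types
    from numba_dppy.numpy_usm_shared import UsmSharedArrayType

    ty = UsmSharedArrayType(types.float64, 2, "C")
    # copy() keeps the USM-shared flavor even when ndim/layout/dtype
    # change, which is what lets the usmarray.* bound functions
    # (reshape, ravel, astype, ...) produce USM-typed signatures.
    flat = ty.copy(ndim=1, layout="C")
    assert isinstance(flat, UsmSharedArrayType)
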
--- numba_dppy/numpy_usm_shared.py | 63 ++++++++++++++++++------------- numba_dppy/tests/test_usmarray.py | 2 +- 2 files changed, 38 insertions(+), 27 deletions(-) diff --git a/numba_dppy/numpy_usm_shared.py b/numba_dppy/numpy_usm_shared.py index a34dd834c3..1102a59c9e 100644 --- a/numba_dppy/numpy_usm_shared.py +++ b/numba_dppy/numpy_usm_shared.py @@ -87,7 +87,9 @@ def __init__( def copy(self, *args, **kwargs): retty = super(UsmSharedArrayType, self).copy(*args, **kwargs) if isinstance(retty, types.Array): - return UsmSharedArrayType(dtype=retty.dtype, ndim=retty.ndim, layout=retty.layout) + return UsmSharedArrayType( + dtype=retty.dtype, ndim=retty.ndim, layout=retty.layout + ) else: return retty @@ -538,7 +540,7 @@ def sentry_shape_scalar(ty): return signature(self.resolve_T(ary)) if len(args) == 1: - shape, = args + (shape,) = args if sentry_shape_scalar(shape): assert ary.ndim == 1 @@ -556,8 +558,9 @@ def sentry_shape_scalar(ty): else: if any(not sentry_shape_scalar(a) for a in args): - raise TypeError("transpose({0}) is not supported".format( - ', '.join(args))) + raise TypeError( + "transpose({0}) is not supported".format(", ".join(args)) + ) assert ary.ndim == len(args) return signature(self.resolve_T(ary).copy(layout="A"), *args) @@ -585,7 +588,7 @@ def resolve_nonzero(self, ary, args, kws): assert not kws # 0-dim arrays return one result array ndim = max(ary.ndim, 1) - retty = types.UniTuple(UsmSharedArrayType(types.intp, 1, 'C'), ndim) + retty = types.UniTuple(UsmSharedArrayType(types.intp, 1, "C"), ndim) return signature(retty) @bound_function("usmarray.reshape") @@ -600,13 +603,13 @@ def sentry_shape_scalar(ty): return False assert not kws - if ary.layout not in 'CF': + if ary.layout not in "CF": # only work for contiguous array raise TypeError("reshape() supports contiguous array only") if len(args) == 1: # single arg - shape, = args + (shape,) = args if sentry_shape_scalar(shape): ndim = 1 @@ -625,8 +628,9 @@ def sentry_shape_scalar(ty): else: # vararg case if any(not sentry_shape_scalar(a) for a in args): - raise TypeError("reshape({0}) is not supported".format( - ', '.join(map(str, args)))) + raise TypeError( + "reshape({0}) is not supported".format(", ".join(map(str, args))) + ) retty = ary.copy(ndim=len(args)) return signature(retty, *args) @@ -642,24 +646,29 @@ def resolve_sort(self, ary, args, kws): def resolve_argsort(self, ary, args, kws): assert not args kwargs = dict(kws) - kind = kwargs.pop('kind', types.StringLiteral('quicksort')) + kind = kwargs.pop("kind", types.StringLiteral("quicksort")) if not isinstance(kind, types.StringLiteral): raise errors.TypingError('"kind" must be a string literal') if kwargs: msg = "Unsupported keywords: {!r}" raise TypingError(msg.format([k for k in kwargs.keys()])) if ary.ndim == 1: - def argsort_stub(kind='quicksort'): + + def argsort_stub(kind="quicksort"): pass + pysig = utils.pysignature(argsort_stub) - sig = signature(UsmSharedArrayType(types.intp, 1, 'C'), kind).replace(pysig=pysig) + sig = signature(UsmSharedArrayType(types.intp, 1, "C"), kind).replace( + pysig=pysig + ) return sig @bound_function("usmarray.view") def resolve_view(self, ary, args, kws): from .npydecl import parse_dtype + assert not kws - dtype, = args + (dtype,) = args dtype = parse_dtype(dtype) if dtype is None: return @@ -669,16 +678,18 @@ def resolve_view(self, ary, args, kws): @bound_function("usmarray.astype") def resolve_astype(self, ary, args, kws): from .npydecl import parse_dtype + assert not kws - dtype, = args + (dtype,) = args dtype = 
parse_dtype(dtype) if dtype is None: return if not self.context.can_convert(ary.dtype, dtype): - raise TypeError("astype(%s) not supported on %s: " - "cannot convert from %s to %s" - % (dtype, ary, ary.dtype, dtype)) - layout = ary.layout if ary.layout in 'CF' else 'C' + raise TypeError( + "astype(%s) not supported on %s: " + "cannot convert from %s to %s" % (dtype, ary, ary.dtype, dtype) + ) + layout = ary.layout if ary.layout in "CF" else "C" # reset the write bit irrespective of whether the cast type is the same # as the current dtype, this replicates numpy retty = ary.copy(dtype=dtype, layout=layout, readonly=False) @@ -689,27 +700,27 @@ def resolve_ravel(self, ary, args, kws): # Only support no argument version (default order='C') assert not kws assert not args - return signature(ary.copy(ndim=1, layout='C')) + return signature(ary.copy(ndim=1, layout="C")) @bound_function("usmarray.flatten") def resolve_flatten(self, ary, args, kws): # Only support no argument version (default order='C') assert not kws assert not args - return signature(ary.copy(ndim=1, layout='C')) + return signature(ary.copy(ndim=1, layout="C")) @bound_function("usmarray.take") def resolve_take(self, ary, args, kws): assert not kws - argty, = args + (argty,) = args if isinstance(argty, types.Integer): sig = signature(ary.dtype, *args) elif isinstance(argty, UsmSharedArrayType): - sig = signature(argty.copy(layout='C', dtype=ary.dtype), *args) - elif isinstance(argty, types.List): # 1d lists only - sig = signature(UsmSharedArrayType(ary.dtype, 1, 'C'), *args) + sig = signature(argty.copy(layout="C", dtype=ary.dtype), *args) + elif isinstance(argty, types.List): # 1d lists only + sig = signature(UsmSharedArrayType(ary.dtype, 1, "C"), *args) elif isinstance(argty, types.BaseTuple): - sig = signature(UsmSharedArrayType(ary.dtype, np.ndim(argty), 'C'), *args) + sig = signature(UsmSharedArrayType(ary.dtype, np.ndim(argty), "C"), *args) else: raise TypeError("take(%s) not supported for %s" % argty) return sig @@ -718,7 +729,7 @@ def generic_resolve(self, ary, attr): # Resolution of other attributes, for record arrays if isinstance(ary.dtype, types.Record): if attr in ary.dtype.fields: - return ary.copy(dtype=ary.dtype.typeof(attr), layout='A') + return ary.copy(dtype=ary.dtype.typeof(attr), layout="A") @typing_registry.register_global(nus.as_ndarray) diff --git a/numba_dppy/tests/test_usmarray.py b/numba_dppy/tests/test_usmarray.py index b6a58b3e59..36deed916e 100644 --- a/numba_dppy/tests/test_usmarray.py +++ b/numba_dppy/tests/test_usmarray.py @@ -70,7 +70,7 @@ def numba_T(x): @numba.njit def numba_reshape(x): - return x.reshape((4,3)) + return x.reshape((4, 3)) class TestUsmArray(unittest.TestCase): From 384ebd48003b1b405a22751be1e611c2aca2200b Mon Sep 17 00:00:00 2001 From: "Todd A. Anderson" Date: Wed, 6 Jan 2021 18:38:39 -0600 Subject: [PATCH 36/40] Remove unused test. 
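The removed block was a commented-out reminder that implicit conversion from a usmarray to a plain NumPy array inside njit is not supported. The supported pattern, mirroring numba_mul_usmarray_asarray in the same test file, is an explicit conversion (sketch only, not part of the test suite):

    import numpy as np
    import numba
    import dpctl.dptensor.numpy_usm_shared as usmarray

    @numba.njit
    def mul_mixed(a, b):  # a is a usmarray, b is plain NumPy
        # Converting b explicitly keeps the multiply inside the
        # USM-typed world; the removed comment noted that relying on
        # implicit conversion fails.
        return a * usmarray.asarray(b)

    res = mul_mixed(usmarray.ones(10), np.ones(10))
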
--- numba_dppy/tests/test_usmarray.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/numba_dppy/tests/test_usmarray.py b/numba_dppy/tests/test_usmarray.py index 36deed916e..b86f9476d7 100644 --- a/numba_dppy/tests/test_usmarray.py +++ b/numba_dppy/tests/test_usmarray.py @@ -25,14 +25,6 @@ def numba_mul_usmarray_asarray(a, b): # a is usmarray, b is numpy return a * usmarray.asarray(b) -# @numba.njit() -# def f7(a): # a is usmarray -# # implicit conversion of a to numpy.ndarray -# b = numpy.ones(10) -# c = a * b -# d = a.argsort() # with no implicit conversion this fails - - @numba.njit def numba_usmarray_as_ndarray(a): return usmarray.as_ndarray(a) From 738a14fdd513d9cf0c6416a93a3ee914c0e162fa Mon Sep 17 00:00:00 2001 From: "Todd A. Anderson" Date: Wed, 6 Jan 2021 18:38:56 -0600 Subject: [PATCH 37/40] Code review changes. --- numba_dppy/numpy_usm_shared.py | 64 +++++++++------------------------- 1 file changed, 17 insertions(+), 47 deletions(-) diff --git a/numba_dppy/numpy_usm_shared.py b/numba_dppy/numpy_usm_shared.py index 1102a59c9e..6a64799495 100644 --- a/numba_dppy/numpy_usm_shared.py +++ b/numba_dppy/numpy_usm_shared.py @@ -52,16 +52,20 @@ def dprint(*args): from dpctl.memory import MemoryUSMShared import numba_dppy._dppy_rt -# functions_list = [o[0] for o in getmembers(np) if isfunction(o[1]) or isbuiltin(o[1])] -# class_list = [o for o in getmembers(np) if isclass(o[1])] - # Register the helper function in dppl_rt so that we can insert calls to them via llvmlite. for py_name, c_address in numba_dppy._dppy_rt.c_helpers.items(): llb.add_symbol(py_name, c_address) -# This class creates a type in Numba. class UsmSharedArrayType(types.Array): + """Creates a Numba type for Numpy arrays that are stored in USM shared + memory. We inherit from Numba's existing Numpy array type but overload + how this type is printed during dumping of typing information and we + implement the special __array_ufunc__ function to determine who this + type gets combined with scalars and regular Numpy types. 
+ We re-use Numpy functions as well but those are going to return Numpy + arrays allocated in USM and we use the overloaded copy function to + convert such USM-backed Numpy arrays into typed USM arrays.""" def __init__( self, dtype, @@ -168,7 +172,7 @@ def allocator_UsmArray(context, builder, size, align): return builder.call(fn, [size, align, ext_allocator]) -registered = False +_registered = False def is_usm_callback(obj): @@ -188,9 +192,9 @@ def is_usm_callback(obj): def numba_register(): - global registered - if not registered: - registered = True + global _registered + if not _registered: + _registered = True ndarray.add_external_usm_checker(is_usm_callback) numba_register_typing() numba_register_lower_builtin() @@ -306,7 +310,6 @@ def numba_register_typing(): todo.append(ig) elif isinstance(typ, numba.core.types.functions.NumberClass): pass - # todo_classes.append(ig) for tgetattr in templates_registry.attributes: dprint("Numpy getattr:", tgetattr, type(tgetattr), tgetattr.key) @@ -366,8 +369,11 @@ def generic_impl(self): typer_func = """def typer({}): original_res = original_typer({}) if isinstance(original_res, types.Array): - return UsmSharedArrayType(dtype=original_res.dtype, ndim=original_res.ndim, layout=original_res.layout) - + return UsmSharedArrayType( + dtype=original_res.dtype, + ndim=original_res.ndim, + layout=original_res.layout + ) return original_res""".format( astr, ",".join(ot_argspec.args) ) @@ -415,42 +421,6 @@ def generic_impl(self): # after the registration callback that gets us here so we would miss the # attribute registrations we need. typing_registry.register_attr(UsmArrayAttribute) - """ - for tgetattr in todo_getattr: - class_name = tgetattr.__name__ + "_usmarray" - dprint("tgetattr:", tgetattr, type(tgetattr), class_name) - - @classmethod - def set_key(cls, key): - cls.key = key - - def getattr_impl(self, attr): - dprint("getattr_impl:", class_name, attr) - if attr.startswith("resolve_"): - def wrapper(*args, **kwargs): - attr_res = tgetattr.__getattribute__(self, attr)(*args, **kwargs) - if isinstance(attr_res, types.Array): - return UsmSharedArrayType( - dtype=attr_res.dtype, - ndim=attr_res.ndim, - layout=attr_res.layout, - ) - else: - return attr_res - - return wrapper - else: - return tgetattr.__getattribute__(self, attr) - - new_usmarray_template = type( - class_name, - (tgetattr,), - {"set_class_vars": set_key, "__getattribute__": getattr_impl}, - ) - - new_usmarray_template.set_class_vars(UsmSharedArrayType) - templates_registry.register_attr(new_usmarray_template) - """ class UsmArrayAttribute(AttributeTemplate): From 3534b6e350e12325568661f9cd63a977dbaa7361 Mon Sep 17 00:00:00 2001 From: "Todd A. Anderson" Date: Thu, 7 Jan 2021 00:29:32 -0600 Subject: [PATCH 38/40] black changes. --- numba_dppy/numpy_usm_shared.py | 39 ++++++++++++++++------------------ 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/numba_dppy/numpy_usm_shared.py b/numba_dppy/numpy_usm_shared.py index 6a64799495..1330f29dfb 100644 --- a/numba_dppy/numpy_usm_shared.py +++ b/numba_dppy/numpy_usm_shared.py @@ -59,13 +59,13 @@ def dprint(*args): class UsmSharedArrayType(types.Array): """Creates a Numba type for Numpy arrays that are stored in USM shared - memory. We inherit from Numba's existing Numpy array type but overload - how this type is printed during dumping of typing information and we - implement the special __array_ufunc__ function to determine who this - type gets combined with scalars and regular Numpy types. 
- We re-use Numpy functions as well but those are going to return Numpy - arrays allocated in USM and we use the overloaded copy function to - convert such USM-backed Numpy arrays into typed USM arrays.""" + memory. We inherit from Numba's existing Numpy array type but overload + how this type is printed during dumping of typing information and we + implement the special __array_ufunc__ function to determine who this + type gets combined with scalars and regular Numpy types. + We re-use Numpy functions as well but those are going to return Numpy + arrays allocated in USM and we use the overloaded copy function to + convert such USM-backed Numpy arrays into typed USM arrays.""" def __init__( self, dtype, @@ -339,20 +339,17 @@ def numba_register_typing(): except: dprint("failed to eval", val.__name__) continue - """ - if debug: - print("--------------------------------------------------------------") - print("need to re-register for usmarray", val, typ, typ.typing_key) - print("val:", val, type(val), "dir val", dir(val)) - print("typ:", typ, type(typ), "dir typ", dir(typ)) - print("typing key:", typ.typing_key) - print("name:", typ.name) - print("key:", typ.key) - print("templates:", typ.templates) - print("template:", template, type(template)) - print("dpval:", dpval, type(dpval)) - print("--------------------------------------------------------------") - """ + dprint("--------------------------------------------------------------") + dprint("need to re-register for usmarray", val, typ, typ.typing_key) + dprint("val:", val, type(val), "dir val", dir(val)) + dprint("typ:", typ, type(typ), "dir typ", dir(typ)) + dprint("typing key:", typ.typing_key) + dprint("name:", typ.name) + dprint("key:", typ.key) + dprint("templates:", typ.templates) + dprint("template:", template, type(template)) + dprint("dpval:", dpval, type(dpval)) + dprint("--------------------------------------------------------------") class_name = "DparrayTemplate_" + val.__name__ From 68733f44f53924fd3433097cc848df457e520f8d Mon Sep 17 00:00:00 2001 From: "Todd A. Anderson" Date: Thu, 7 Jan 2021 02:31:06 -0600 Subject: [PATCH 39/40] black changes. --- numba_dppy/numpy_usm_shared.py | 1 + 1 file changed, 1 insertion(+) diff --git a/numba_dppy/numpy_usm_shared.py b/numba_dppy/numpy_usm_shared.py index 1330f29dfb..1f63f1cad4 100644 --- a/numba_dppy/numpy_usm_shared.py +++ b/numba_dppy/numpy_usm_shared.py @@ -66,6 +66,7 @@ class UsmSharedArrayType(types.Array): We re-use Numpy functions as well but those are going to return Numpy arrays allocated in USM and we use the overloaded copy function to convert such USM-backed Numpy arrays into typed USM arrays.""" + def __init__( self, dtype, From d56a463b87f43038d40f32d27e256f96487a3d9b Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Tue, 12 Jan 2021 16:02:55 +0300 Subject: [PATCH 40/40] Update numba_dppy/numpy_usm_shared.py --- numba_dppy/numpy_usm_shared.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba_dppy/numpy_usm_shared.py b/numba_dppy/numpy_usm_shared.py index 1f63f1cad4..150ab0d3b3 100644 --- a/numba_dppy/numpy_usm_shared.py +++ b/numba_dppy/numpy_usm_shared.py @@ -229,7 +229,7 @@ def numba_register_lower_builtin(): todo_array_member_func = [] # For all Numpy identifiers that have been registered for typing in Numba... - # this registry contains functions, getattrs, setattrs, casts and constants...need to do them all? FIX FIX FIX + # this registry contains functions, getattrs, setattrs, casts and constants... 
for ig in lower_registry.functions: impl, func, types = ig dprint("Numpy lowered registry functions:", impl, func, type(func), types)
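
The typing side of this registration (the typer_func string reformatted in patch
37 above) reduces to one pattern: call the original NumPy typer and, when it
returns a plain types.Array, re-type the result as UsmSharedArrayType so the
result is treated as a USM-shared array rather than an ordinary ndarray.  The
patch builds that wrapper from generated source text so it preserves the
original typer's exact argument list; the closure below is only a simplified
sketch of the same retyping logic, and the UsmSharedArrayType import path is an
assumption.

    import functools

    from numba.core import types

    # Assumed import path; UsmSharedArrayType is defined in numpy_usm_shared.py.
    from numba_dppy.numpy_usm_shared import UsmSharedArrayType


    def retype_as_usm(original_typer):
        # Wrap an existing Numba typer so that any plain ndarray result is
        # re-typed as a USM-shared array, mirroring the generated typer_func.
        @functools.wraps(original_typer)
        def typer(*args, **kwargs):
            original_res = original_typer(*args, **kwargs)
            if isinstance(original_res, types.Array):
                return UsmSharedArrayType(
                    dtype=original_res.dtype,
                    ndim=original_res.ndim,
                    layout=original_res.layout,
                )
            return original_res

        return typer

A hypothetical use would be wrapping the typer of an existing Numpy typing
template before re-registering it for the usmarray overloads, which is what the
generated typer_func does for each entry collected in numba_register_typing().
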