From fea8446477a183fd9dbda659d56d2bfa608944b8 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Tue, 17 Nov 2020 11:53:10 +0300 Subject: [PATCH 01/19] Update README.rst about patched for numba-dppy We have to mention that IntelPython/numba differs from original Numba. --- README.rst | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index aa698904f47..7826858f3c2 100644 --- a/README.rst +++ b/README.rst @@ -1,5 +1,5 @@ ***** -Numba +Numba with patches for numba-dppy ***** .. image:: https://badges.gitter.im/numba/numba.svg @@ -10,6 +10,14 @@ Numba :target: https://numba.discourse.group/ :alt: Discourse +Patches for numba-dppy +###################### + +See https://github.com/IntelPython/numba-dppy. +If `numba-dppy` package is installed this version of Numba provides +additional features. +Without `numba-dppy` package this version of Numba works like original Numba. + A Just-In-Time Compiler for Numerical Functions in Python ######################################################### From 168b8dd763154b3ed22350e193fea4d5b3b6f3fe Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Thu, 12 Nov 2020 13:04:24 +0300 Subject: [PATCH 02/19] Remove the misspelling of finalize_dynamic_globals This commit fixes misspelling of _finalize_dynamic_globals(). This function is used only in the same file. 
--- numba/core/codegen.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/numba/core/codegen.py b/numba/core/codegen.py index 1e05c12bacd..e22fa89012e 100644 --- a/numba/core/codegen.py +++ b/numba/core/codegen.py @@ -247,7 +247,7 @@ def finalize(self): self._final_module.verify() self._finalize_final_module() - def _finalize_dyanmic_globals(self): + def _finalize_dynamic_globals(self): # Scan for dynamic globals for gv in self._final_module.global_variables: if gv.name.startswith('numba.dynamic.globals'): @@ -265,7 +265,7 @@ def _finalize_final_module(self): """ Make the underlying LLVM module ready to use. """ - self._finalize_dyanmic_globals() + self._finalize_dynamic_globals() self._verify_declare_only_symbols() # Remember this on the module, for the object cache hooks From 7e503d265c7a419c5dba49c6ffa616781fa63adc Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Fri, 13 Nov 2020 04:56:53 -0600 Subject: [PATCH 03/19] Patch for addrspace This commit adds modification introduced about addrspace from ce96c20 by @DrTodd13 and test fixed in 947b407 by @reazulhoque. This commit could be proposed to upstream. 
--- numba/core/datamodel/models.py | 2 +- numba/core/types/common.py | 3 ++- numba/core/types/misc.py | 5 +++-- numba/core/types/npytypes.py | 12 +++++++----- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/numba/core/datamodel/models.py b/numba/core/datamodel/models.py index 77b2fc90a21..4049d7c0e60 100644 --- a/numba/core/datamodel/models.py +++ b/numba/core/datamodel/models.py @@ -865,7 +865,7 @@ def __init__(self, dmm, fe_type): ('parent', types.pyobject), ('nitems', types.intp), ('itemsize', types.intp), - ('data', types.CPointer(fe_type.dtype)), + ('data', types.CPointer(fe_type.dtype, addrspace=fe_type.addrspace)), ('shape', types.UniTuple(types.intp, ndim)), ('strides', types.UniTuple(types.intp, ndim)), diff --git a/numba/core/types/common.py b/numba/core/types/common.py index f1806540c60..f054d62652e 100644 --- a/numba/core/types/common.py +++ b/numba/core/types/common.py @@ -45,7 +45,7 @@ class Buffer(IterableType, ArrayCompatible): # CS and FS are not reserved for inner contig but strided LAYOUTS = frozenset(['C', 'F', 'CS', 'FS', 'A']) - def __init__(self, dtype, ndim, layout, readonly=False, name=None): + def __init__(self, dtype, ndim, layout, readonly=False, name=None, addrspace=None): from .misc import unliteral if isinstance(dtype, Buffer): @@ -55,6 +55,7 @@ def __init__(self, dtype, ndim, layout, readonly=False, name=None): self.dtype = unliteral(dtype) self.ndim = ndim self.layout = layout + self.addrspace = addrspace if readonly: self.mutable = False if name is None: diff --git a/numba/core/types/misc.py b/numba/core/types/misc.py index 34d8f3a51bf..a2445d967d3 100644 --- a/numba/core/types/misc.py +++ b/numba/core/types/misc.py @@ -155,14 +155,15 @@ class CPointer(Type): """ mutable = True - def __init__(self, dtype): + def __init__(self, dtype, addrspace=None): self.dtype = dtype + self.addrspace = addrspace name = "%s*" % dtype super(CPointer, self).__init__(name) @property def key(self): - return self.dtype + return 
self.dtype, self.addrspace class EphemeralPointer(CPointer): diff --git a/numba/core/types/npytypes.py b/numba/core/types/npytypes.py index c7c312adce3..6f6307c5526 100644 --- a/numba/core/types/npytypes.py +++ b/numba/core/types/npytypes.py @@ -395,7 +395,7 @@ class Array(Buffer): """ def __init__(self, dtype, ndim, layout, readonly=False, name=None, - aligned=True): + aligned=True, addrspace=None): if readonly: self.mutable = False if (not aligned or @@ -408,7 +408,7 @@ def __init__(self, dtype, ndim, layout, readonly=False, name=None, if not self.aligned: type_name = "unaligned " + type_name name = "%s(%s, %sd, %s)" % (type_name, dtype, ndim, layout) - super(Array, self).__init__(dtype, ndim, layout, name=name) + super(Array, self).__init__(dtype, ndim, layout, name=name, addrspace=addrspace) @property def mangling_args(self): @@ -417,7 +417,7 @@ def mangling_args(self): 'aligned' if self.aligned else 'unaligned'] return self.__class__.__name__, args - def copy(self, dtype=None, ndim=None, layout=None, readonly=None): + def copy(self, dtype=None, ndim=None, layout=None, readonly=None, addrspace=None): if dtype is None: dtype = self.dtype if ndim is None: @@ -426,12 +426,14 @@ def copy(self, dtype=None, ndim=None, layout=None, readonly=None): layout = self.layout if readonly is None: readonly = not self.mutable + if addrspace is None: + addrspace = self.addrspace return Array(dtype=dtype, ndim=ndim, layout=layout, readonly=readonly, - aligned=self.aligned) + aligned=self.aligned, addrspace=addrspace) @property def key(self): - return self.dtype, self.ndim, self.layout, self.mutable, self.aligned + return self.dtype, self.ndim, self.layout, self.mutable, self.aligned, self.addrspace def unify(self, typingctx, other): """ From feb86235fd178ebabfa638cc86f91b0d032eecd1 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Fri, 13 Nov 2020 04:56:54 -0600 Subject: [PATCH 04/19] Patch for lowering This patch introduces global initialization of lower_extensions with one 
functions and replaces with a list of functions. This change first made in commit 947b407 by @reazulhoque. --- numba/core/lowering.py | 9 ++++++++- numba/parfors/parfor_lowering.py | 2 +- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/numba/core/lowering.py b/numba/core/lowering.py index 1c9c19cd3b1..bcbc4e8b980 100644 --- a/numba/core/lowering.py +++ b/numba/core/lowering.py @@ -274,6 +274,13 @@ def debug_print(self, msg): class Lower(BaseLower): GeneratorLower = generators.GeneratorLower + def __init__(self, context, library, fndesc, func_ir, metadata=None): + BaseLower.__init__(self, context, library, fndesc, func_ir, metadata) + from numba.parfors.parfor_lowering import _lower_parfor_parallel + from numba.parfors import parfor + if parfor.Parfor not in lower_extensions: + lower_extensions[parfor.Parfor] = [_lower_parfor_parallel] + def pre_block(self, block): from numba.core.unsafe import eh @@ -440,7 +447,7 @@ def lower_inst(self, inst): else: for _class, func in lower_extensions.items(): if isinstance(inst, _class): - func(self, inst) + func[-1](self, inst) return raise NotImplementedError(type(inst)) diff --git a/numba/parfors/parfor_lowering.py b/numba/parfors/parfor_lowering.py index dc499498326..e559e8c017c 100644 --- a/numba/parfors/parfor_lowering.py +++ b/numba/parfors/parfor_lowering.py @@ -480,7 +480,7 @@ def _lower_parfor_parallel(lowerer, parfor): print("_lower_parfor_parallel done") # A work-around to prevent circular imports -lowering.lower_extensions[parfor.Parfor] = _lower_parfor_parallel +#lowering.lower_extensions[parfor.Parfor] = _lower_parfor_parallel def _create_shape_signature( From 10303682c13999a85402a6293548a4456be1b4bf Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Fri, 13 Nov 2020 04:56:55 -0600 Subject: [PATCH 05/19] Patch for change numpy functions mapping Changed in 8ccfd36 and a77eab6 by @reazulhoque. Possibly this 2 changes are not related to each other. 
--- numba/np/npyimpl.py | 7 ++++++- numba/parfors/parfor.py | 5 +++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/numba/np/npyimpl.py b/numba/np/npyimpl.py index d1e65e72e80..1763b6a3f42 100644 --- a/numba/np/npyimpl.py +++ b/numba/np/npyimpl.py @@ -448,7 +448,12 @@ def __init__(self, context, builder, outer_sig): super(_KernelImpl, self).__init__(context, builder, outer_sig) loop = ufunc_find_matching_loop( ufunc, outer_sig.args + tuple(_unpack_output_types(ufunc, outer_sig))) - self.fn = ufunc_db.get_ufunc_info(ufunc).get(loop.ufunc_sig) + + if hasattr(context, 'ufunc_db'): + self.fn = context.ufunc_db[ufunc].get(loop.ufunc_sig) + else: + self.fn = ufunc_db.get_ufunc_info(ufunc).get(loop.ufunc_sig) + self.inner_sig = _ufunc_loop_sig(loop.outputs, loop.inputs) if self.fn is None: diff --git a/numba/parfors/parfor.py b/numba/parfors/parfor.py index 0f23e69843a..c73d99905dd 100644 --- a/numba/parfors/parfor.py +++ b/numba/parfors/parfor.py @@ -1350,7 +1350,7 @@ class PreParforPass(object): implementations of numpy functions if available. 
""" def __init__(self, func_ir, typemap, calltypes, typingctx, options, - swapped={}): + swapped={}, replace_functions_map=replace_functions_map): self.func_ir = func_ir self.typemap = typemap self.calltypes = calltypes @@ -1358,6 +1358,7 @@ def __init__(self, func_ir, typemap, calltypes, typingctx, options, self.options = options # diagnostics self.swapped = swapped + self.replace_functions_map = replace_functions_map self.stats = { 'replaced_func': 0, 'replaced_dtype': 0, @@ -1394,7 +1395,7 @@ def _replace_parallel_functions(self, blocks): def replace_func(): func_def = get_definition(self.func_ir, expr.func) callname = find_callname(self.func_ir, expr) - repl_func = replace_functions_map.get(callname, None) + repl_func = self.replace_functions_map.get(callname, None) # Handle method on array type if (repl_func is None and len(callname) == 2 and From f29f1a45b6134e9f95ebe5fe8d8a54fe1ba8a7fe Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Mon, 16 Nov 2020 05:41:03 -0600 Subject: [PATCH 06/19] Uncomment require_global_compiler_lock()" This line was commented when introduced codegen debugging by @DrTodd13. Uncommenting this line requires modifications in numba-dppy. It seems that numba-dppy compilation does not work correctly with global compiler lock. --- numba/core/codegen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba/core/codegen.py b/numba/core/codegen.py index e22fa89012e..324a999fbe3 100644 --- a/numba/core/codegen.py +++ b/numba/core/codegen.py @@ -220,7 +220,7 @@ def finalize(self): Finalization involves various stages of code optimization and linking. 
""" - require_global_compiler_lock() + #require_global_compiler_lock() # Report any LLVM-related problems to the user self._codegen._check_llvm_bugs() From 1afe61415e8a67501f3b1d26a4040b4bd1535359 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Wed, 18 Nov 2020 04:52:29 -0600 Subject: [PATCH 07/19] Patch for with context This modifications make jit() decorator use TargetDispatcher from dppl. Changes made in #57 by @AlexanderKalistratov and @1e-to. --- numba/core/decorators.py | 58 ++++++++++++++++++++++---------- numba/core/dispatcher.py | 12 ++++++- numba/core/registry.py | 6 ++++ numba/tests/test_dispatcher.py | 2 ++ numba/tests/test_nrt.py | 2 ++ numba/tests/test_record_dtype.py | 4 +-- numba/tests/test_serialize.py | 6 ++-- 7 files changed, 67 insertions(+), 23 deletions(-) diff --git a/numba/core/decorators.py b/numba/core/decorators.py index cfe91168969..3ecc188a9a7 100644 --- a/numba/core/decorators.py +++ b/numba/core/decorators.py @@ -149,7 +149,7 @@ def bar(x, y): target = options.pop('target') warnings.warn("The 'target' keyword argument is deprecated.", NumbaDeprecationWarning) else: - target = options.pop('_target', 'cpu') + target = options.pop('_target', None) options['boundscheck'] = boundscheck @@ -183,27 +183,16 @@ def bar(x, y): def _jit(sigs, locals, target, cache, targetoptions, **dispatcher_args): - dispatcher = registry.dispatcher_registry[target] - - def wrapper(func): - if extending.is_jitted(func): - raise TypeError( - "A jit decorator was called on an already jitted function " - f"{func}. If trying to access the original python " - f"function, use the {func}.py_func attribute." - ) - - if not inspect.isfunction(func): - raise TypeError( - "The decorated object is not a function (got type " - f"{type(func)})." - ) + def wrapper(func, dispatcher): if config.ENABLE_CUDASIM and target == 'cuda': from numba import cuda return cuda.jit(func) if config.DISABLE_JIT and not target == 'npyufunc': return func + if target == 'dppl': + from . 
import dppl + return dppl.jit(func) disp = dispatcher(py_func=func, locals=locals, targetoptions=targetoptions, **dispatcher_args) @@ -219,7 +208,42 @@ def wrapper(func): disp.disable_compile() return disp - return wrapper + def __wrapper(func): + if extending.is_jitted(func): + raise TypeError( + "A jit decorator was called on an already jitted function " + f"{func}. If trying to access the original python " + f"function, use the {func}.py_func attribute." + ) + + if not inspect.isfunction(func): + raise TypeError( + "The decorated object is not a function (got type " + f"{type(func)})." + ) + + is_numba_dppy_present = False + try: + import numba_dppy.config as dppy_config + + is_numba_dppy_present = dppy_config.dppy_present + except ImportError: + pass + + if (not is_numba_dppy_present + or target == 'npyufunc' or targetoptions.get('no_cpython_wrapper') + or sigs or config.DISABLE_JIT or not targetoptions.get('nopython')): + target_ = target + if target_ is None: + target_ = 'cpu' + disp = registry.dispatcher_registry[target_] + return wrapper(func, disp) + + from numba_dppy.target_dispatcher import TargetDispatcher + disp = TargetDispatcher(func, wrapper, target, targetoptions.get('parallel')) + return disp + + return __wrapper def generated_jit(function=None, target='cpu', cache=False, diff --git a/numba/core/dispatcher.py b/numba/core/dispatcher.py index 18d9426cd4d..42418fe5783 100644 --- a/numba/core/dispatcher.py +++ b/numba/core/dispatcher.py @@ -673,7 +673,14 @@ def _set_uuid(self, u): self._recent.append(self) -class Dispatcher(serialize.ReduceMixin, _MemoMixin, _DispatcherBase): +import abc + +class DispatcherMeta(abc.ABCMeta): + def __instancecheck__(self, other): + return type(type(other)) == DispatcherMeta + + +class Dispatcher(serialize.ReduceMixin, _MemoMixin, _DispatcherBase, metaclass=DispatcherMeta): """ Implementation of user-facing dispatcher objects (i.e. created using the @jit decorator). 
@@ -899,6 +906,9 @@ def get_function_type(self): cres = tuple(self.overloads.values())[0] return types.FunctionType(cres.signature) + def get_compiled(self): + return self + class LiftedCode(serialize.ReduceMixin, _MemoMixin, _DispatcherBase): """ diff --git a/numba/core/registry.py b/numba/core/registry.py index 2bd47ebe879..01e492f91f4 100644 --- a/numba/core/registry.py +++ b/numba/core/registry.py @@ -2,6 +2,7 @@ from numba.core.descriptors import TargetDescriptor from numba.core import utils, typing, dispatcher, cpu +from numba.core.compiler_lock import global_compiler_lock # ----------------------------------------------------------------------------- # Default CPU target descriptors @@ -26,16 +27,19 @@ class CPUTarget(TargetDescriptor): _nested = _NestedContext() @utils.cached_property + @global_compiler_lock def _toplevel_target_context(self): # Lazily-initialized top-level target context, for all threads return cpu.CPUContext(self.typing_context) @utils.cached_property + @global_compiler_lock def _toplevel_typing_context(self): # Lazily-initialized top-level typing context, for all threads return typing.Context() @property + @global_compiler_lock def target_context(self): """ The target context for CPU targets. @@ -47,6 +51,7 @@ def target_context(self): return self._toplevel_target_context @property + @global_compiler_lock def typing_context(self): """ The typing context for CPU targets. 
@@ -57,6 +62,7 @@ def typing_context(self): else: return self._toplevel_typing_context + @global_compiler_lock def nested_context(self, typing_context, target_context): """ A context manager temporarily replacing the contexts with the diff --git a/numba/tests/test_dispatcher.py b/numba/tests/test_dispatcher.py index 30a8e081485..b90d42ede26 100644 --- a/numba/tests/test_dispatcher.py +++ b/numba/tests/test_dispatcher.py @@ -398,6 +398,8 @@ def test_serialization(self): def foo(x): return x + 1 + foo = foo.get_compiled() + self.assertEqual(foo(1), 2) # get serialization memo diff --git a/numba/tests/test_nrt.py b/numba/tests/test_nrt.py index e0c94605671..602132258e8 100644 --- a/numba/tests/test_nrt.py +++ b/numba/tests/test_nrt.py @@ -249,6 +249,8 @@ def alloc_nrt_memory(): """ return np.empty(N, dtype) + alloc_nrt_memory = alloc_nrt_memory.get_compiled() + def keep_memory(): return alloc_nrt_memory() diff --git a/numba/tests/test_record_dtype.py b/numba/tests/test_record_dtype.py index 6d479c413fa..e674bacc957 100644 --- a/numba/tests/test_record_dtype.py +++ b/numba/tests/test_record_dtype.py @@ -803,8 +803,8 @@ def test_record_arg_transform(self): self.assertIn('Array', transformed) self.assertNotIn('first', transformed) self.assertNotIn('second', transformed) - # Length is usually 50 - 5 chars tolerance as above. - self.assertLess(len(transformed), 50) + # Length is usually 60 - 5 chars tolerance as above. + self.assertLess(len(transformed), 60) def test_record_two_arrays(self): """ diff --git a/numba/tests/test_serialize.py b/numba/tests/test_serialize.py index 2bcf843458a..90c3db44a16 100644 --- a/numba/tests/test_serialize.py +++ b/numba/tests/test_serialize.py @@ -135,9 +135,9 @@ def test_reuse(self): Note that "same function" is intentionally under-specified. 
""" - func = closure(5) + func = closure(5).get_compiled() pickled = pickle.dumps(func) - func2 = closure(6) + func2 = closure(6).get_compiled() pickled2 = pickle.dumps(func2) f = pickle.loads(pickled) @@ -152,7 +152,7 @@ def test_reuse(self): self.assertEqual(h(2, 3), 11) # Now make sure the original object doesn't exist when deserializing - func = closure(7) + func = closure(7).get_compiled() func(42, 43) pickled = pickle.dumps(func) del func From 0bf83efd8703948d1ab4fc8161548ffeb2e3ba28 Mon Sep 17 00:00:00 2001 From: Elena Totmenina Date: Fri, 4 Dec 2020 14:26:59 +0300 Subject: [PATCH 08/19] Refactoring for ParforDiagnostics dump function (#135) Co-authored-by: etotmeni --- numba/parfors/parfor.py | 625 ++++++++++++++++++++-------------------- 1 file changed, 320 insertions(+), 305 deletions(-) diff --git a/numba/parfors/parfor.py b/numba/parfors/parfor.py index c73d99905dd..2632db89adb 100644 --- a/numba/parfors/parfor.py +++ b/numba/parfors/parfor.py @@ -862,6 +862,318 @@ def sort_pf_by_line(self, pf_id, parfors_simple): pass return line + def get_parfors_simple(self, print_loop_search): + parfors_simple = dict() + + # print in line order, parfors loop id is based on discovery order + for pf in sorted(self.initial_parfors, key=lambda x: x.loc.line): + # use 0 here, the parfors are mutated by the time this routine + # is called, however, fusion appends the patterns so we can just + # pull in the first as a "before fusion" emulation + r_pattern = pf.patterns[0] + pattern = pf.patterns[0] + loc = pf.loc + if isinstance(pattern, tuple): + if pattern[0] == 'prange': + if pattern[1] == 'internal': + replfn = '.'.join(reversed(list(pattern[2][0]))) + loc = pattern[2][1] + r_pattern = '%s %s' % (replfn, '(internal parallel version)') + elif pattern[1] == 'user': + r_pattern = "user defined prange" + elif pattern[1] == 'pndindex': + r_pattern = "internal pndindex" #FIXME: trace this! 
+ else: + assert 0 + fmt = 'Parallel for-loop #%s: is produced from %s:\n %s\n \n' + if print_loop_search: + print_wrapped(fmt % (pf.id, loc, r_pattern)) + parfors_simple[pf.id] = (pf, loc, r_pattern) + return parfors_simple + + def get_all_lines(self, parfors_simple): + # ensure adjacency lists are the same size for both sets of info + # (nests and fusion may not traverse the same space, for + # convenience [] is used as a condition to halt recursion) + fadj, froots = self.compute_graph_info(self.fusion_info) + nadj, _nroots = self.compute_graph_info(self.nested_fusion_info) + + if len(fadj) > len(nadj): + lim = len(fadj) + tmp = nadj + else: + lim = len(nadj) + tmp = fadj + for x in range(len(tmp), lim): + tmp.append([]) + + # This computes the roots of true loop nests (i.e. loops containing + # loops opposed to just a loop that's a root). + nroots = set() + if _nroots: + for r in _nroots: + if nadj[r] != []: + nroots.add(r) + all_roots = froots ^ nroots + + # This computes all the parfors at the top level that are either: + # - roots of loop fusion + # - roots of true loop nests + # it then combines these based on source line number for ease of + # producing output ordered in a manner similar to the code structure + froots_lines = {} + for x in froots: + line = self.sort_pf_by_line(x, parfors_simple) + froots_lines[line] = 'fuse', x, fadj + + nroots_lines = {} + for x in nroots: + line = self.sort_pf_by_line(x, parfors_simple) + nroots_lines[line] = 'nest', x, nadj + + all_lines = froots_lines.copy() + all_lines.update(nroots_lines) + return all_lines + + def source_listing(self, parfors_simple, purpose_str): + filename = self.func_ir.loc.filename + count = self.count_parfors() + func_name = self.func_ir.func_id.func + try: + lines = inspect.getsource(func_name).splitlines() + except OSError: # generated function + lines = None + if lines and parfors_simple: + src_width = max([len(x) for x in lines]) + map_line_to_pf = defaultdict(list) # parfors can alias lines 
+ for k, v in parfors_simple.items(): + # TODO: do a better job of tracking parfors that are not in + # this file but are referred to, e.g. np.arange() + if parfors_simple[k][1].filename == filename: + match_line = self.sort_pf_by_line(k, parfors_simple) + map_line_to_pf[match_line].append(str(k)) + + max_pf_per_line = max([1] + [len(x) for x in map_line_to_pf.values()]) + width = src_width + (1 + max_pf_per_line * (len(str(count)) + 2)) + newlines = [] + newlines.append('\n') + newlines.append('Parallel loop listing for %s' % purpose_str) + newlines.append(width * '-' + '|loop #ID') + fmt = '{0:{1}}| {2}' + # why are these off by 1? + lstart = max(0, self.func_ir.loc.line - 1) + for no, line in enumerate(lines, lstart): + pf_ids = map_line_to_pf.get(no, None) + if pf_ids is not None: + pfstr = '#' + ', '.join(pf_ids) + else: + pfstr = '' + stripped = line.strip('\n') + srclen = len(stripped) + if pf_ids: + l = fmt.format(width * '-', width, pfstr) + else: + l = fmt.format(width * ' ', width, pfstr) + newlines.append(stripped + l[srclen:]) + print('\n'.join(newlines)) + else: + print("No source available") + + def print_unoptimised(self, lines): + # This prints the unoptimised parfors state + sword = '+--' + fac = len(sword) + + def print_nest(fadj_, nadj_, theroot, reported, region_id): + def print_g(fadj_, nadj_, nroot, depth): + print_wrapped(fac * depth * ' ' + '%s%s %s' % (sword, nroot, '(parallel)')) + for k in nadj_[nroot]: + if nadj_[k] == []: + msg = [] + msg.append(fac * (depth + 1) * ' ' + '%s%s %s' % (sword, k, '(parallel)')) + if fadj_[k] != [] and k not in reported: + fused = self.reachable_nodes(fadj_, k) + for i in fused: + msg.append(fac * (depth + 1) * ' ' + '%s%s %s' % (sword, i, '(parallel)')) + reported.append(k) + print_wrapped('\n'.join(msg)) + else: + print_g(fadj_, nadj_, k, depth + 1) + + if nadj_[theroot] != []: + print_wrapped("Parallel region %s:" % region_id) + print_g(fadj_, nadj_, theroot, 0) + print("\n") + region_id = region_id + 1 
+ return region_id + + def print_fuse(ty, pf_id, adj, depth, region_id): + msg = [] + print_wrapped("Parallel region %s:" % region_id) + msg.append(fac * depth * ' ' + '%s%s %s' % (sword, pf_id, '(parallel)')) + if adj[pf_id] != []: + fused = sorted(self.reachable_nodes(adj, pf_id)) + for k in fused: + msg.append(fac * depth * ' ' + '%s%s %s' % (sword, k, '(parallel)')) + region_id = region_id + 1 + print_wrapped('\n'.join(msg)) + print("\n") + return region_id + + # Walk the parfors by src line and print optimised structure + region_id = 0 + reported = [] + for line, info in sorted(lines.items()): + opt_ty, pf_id, adj = info + if opt_ty == 'fuse': + if pf_id not in reported: + region_id = print_fuse('f', pf_id, adj, 0, region_id) + elif opt_ty == 'nest': + region_id = print_nest(fadj, nadj, pf_id, reported, region_id) + else: + assert 0 + + def print_optimised(self, lines): + # This prints the optimised output based on the transforms that + # occurred during loop fusion and rewriting of loop nests + sword = '+--' + fac = len(sword) + + summary = dict() + # region : {fused, serialized} + + def print_nest(fadj_, nadj_, theroot, reported, region_id): + def print_g(fadj_, nadj_, nroot, depth): + for k in nadj_[nroot]: + msg = fac * depth * ' ' + '%s%s %s' % (sword, k, '(serial') + if nadj_[k] == []: + fused = [] + if fadj_[k] != [] and k not in reported: + fused = sorted(self.reachable_nodes(fadj_, k)) + msg += ", fused with loop(s): " + msg += ', '.join([str(x) for x in fused]) + msg += ')' + reported.append(k) + print_wrapped(msg) + summary[region_id]['fused'] += len(fused) + else: + print_wrapped(msg + ')') + print_g(fadj_, nadj_, k, depth + 1) + summary[region_id]['serialized'] += 1 + + if nadj_[theroot] != []: + print_wrapped("Parallel region %s:" % region_id) + print_wrapped('%s%s %s' % (sword, theroot, '(parallel)')) + summary[region_id] = {'root': theroot, 'fused': 0, 'serialized': 0} + print_g(fadj_, nadj_, theroot, 1) + print("\n") + region_id = region_id + 
1 + return region_id + + def print_fuse(ty, pf_id, adj, depth, region_id): + print_wrapped("Parallel region %s:" % region_id) + msg = fac * depth * ' ' + '%s%s %s' % (sword, pf_id, '(parallel') + fused = [] + if adj[pf_id] != []: + fused = sorted(self.reachable_nodes(adj, pf_id)) + msg += ", fused with loop(s): " + msg += ', '.join([str(x) for x in fused]) + + summary[region_id] = {'root': pf_id, 'fused': len(fused), 'serialized': 0} + msg += ')' + print_wrapped(msg) + print("\n") + region_id = region_id + 1 + return region_id + + # Walk the parfors by src line and print optimised structure + region_id = 0 + reported = [] + for line, info in sorted(lines.items()): + opt_ty, pf_id, adj = info + if opt_ty == 'fuse': + if pf_id not in reported: + region_id = print_fuse('f', pf_id, adj, 0, region_id) + elif opt_ty == 'nest': + region_id = print_nest(fadj, nadj, pf_id, reported, region_id) + else: + assert 0 + + # print the summary of the fuse/serialize rewrite + if summary: + for k, v in sorted(summary.items()): + msg = ('\n \nParallel region %s (loop #%s) had %s ' + 'loop(s) fused') + root = v['root'] + fused = v['fused'] + serialized = v['serialized'] + if serialized != 0: + msg += (' and %s loop(s) ' + 'serialized as part of the larger ' + 'parallel loop (#%s).') + print_wrapped(msg % (k, root, fused, serialized, root)) + else: + msg += '.' 
+ print_wrapped(msg % (k, root, fused)) + else: + print_wrapped("Parallel structure is already optimal.") + + def allocation_hoist(self): + found = False + print('Allocation hoisting:') + for pf_id, data in self.hoist_info.items(): + stmt = data.get('hoisted', []) + for inst in stmt: + if isinstance(inst.value, ir.Expr): + try: + attr = inst.value.attr + if attr == 'empty': + msg = ("The memory allocation derived from the " + "instruction at %s is hoisted out of the " + "parallel loop labelled #%s (it will be " + "performed before the loop is executed and " + "reused inside the loop):") + loc = inst.loc + print_wrapped(msg % (loc, pf_id)) + try: + path = os.path.relpath(loc.filename) + except ValueError: + path = os.path.abspath(loc.filename) + lines = linecache.getlines(path) + if lines and loc.line: + print_wrapped(" Allocation:: " + lines[0 if loc.line < 2 else loc.line - 1].strip()) + print_wrapped(" - numpy.empty() is used for the allocation.\n") + found = True + except (KeyError, AttributeError): + pass + if not found: + print_wrapped('No allocation hoisting found') + + def instruction_hoist(self): + print("") + print('Instruction hoisting:') + hoist_info_printed = False + if self.hoist_info: + for pf_id, data in self.hoist_info.items(): + hoisted = data.get('hoisted', None) + not_hoisted = data.get('not_hoisted', None) + if not hoisted and not not_hoisted: + print("loop #%s has nothing to hoist." 
% pf_id) + continue + + print("loop #%s:" % pf_id) + if hoisted: + print(" Has the following hoisted:") + [print(" %s" % y) for y in hoisted] + hoist_info_printed = True + if not_hoisted: + print(" Failed to hoist the following:") + [print(" %s: %s" % (y, x)) for x, y in not_hoisted] + hoist_info_printed = True + if not hoist_info_printed: + print_wrapped('No instruction hoisting found') + print_wrapped(80 * '-') + def dump(self, level=1): if not self.has_setup: raise RuntimeError("self.setup has not been called") @@ -919,33 +1231,7 @@ def dump(self, level=1): #----------- search section if print_loop_search: print_wrapped('Looking for parallel loops'.center(_termwidth, '-')) - - parfors_simple = dict() - - # print in line order, parfors loop id is based on discovery order - for pf in sorted(self.initial_parfors, key=lambda x: x.loc.line): - # use 0 here, the parfors are mutated by the time this routine - # is called, however, fusion appends the patterns so we can just - # pull in the first as a "before fusion" emulation - r_pattern = pf.patterns[0] - pattern = pf.patterns[0] - loc = pf.loc - if isinstance(pattern, tuple): - if pattern[0] == 'prange': - if pattern[1] == 'internal': - replfn = '.'.join(reversed(list(pattern[2][0]))) - loc = pattern[2][1] - r_pattern = '%s %s' % (replfn, '(internal parallel version)') - elif pattern[1] == 'user': - r_pattern = "user defined prange" - elif pattern[1] == 'pndindex': - r_pattern = "internal pndindex" #FIXME: trace this! 
- else: - assert 0 - fmt = 'Parallel for-loop #%s: is produced from %s:\n %s\n \n' - if print_loop_search: - print_wrapped(fmt % (pf.id, loc, r_pattern)) - parfors_simple[pf.id] = (pf, loc, r_pattern) + parfors_simple = self.get_parfors_simple(print_loop_search) count = self.count_parfors() if print_loop_search: @@ -965,46 +1251,7 @@ def dump(self, level=1): path = os.path.abspath(filename) if print_source_listing: - func_name = self.func_ir.func_id.func - try: - lines = inspect.getsource(func_name).splitlines() - except OSError: # generated function - lines = None - if lines: - src_width = max([len(x) for x in lines]) - map_line_to_pf = defaultdict(list) # parfors can alias lines - for k, v in parfors_simple.items(): - # TODO: do a better job of tracking parfors that are not in - # this file but are referred to, e.g. np.arange() - if parfors_simple[k][1].filename == filename: - match_line = self.sort_pf_by_line(k, parfors_simple) - map_line_to_pf[match_line].append(str(k)) - - max_pf_per_line = max([1] + [len(x) for x in map_line_to_pf.values()]) - width = src_width + (1 + max_pf_per_line * (len(str(count)) + 2)) - newlines = [] - newlines.append('\n') - newlines.append('Parallel loop listing for %s' % purpose_str) - newlines.append(width * '-' + '|loop #ID') - fmt = '{0:{1}}| {2}' - # why are these off by 1? 
- lstart = max(0, self.func_ir.loc.line - 1) - for no, line in enumerate(lines, lstart): - pf_ids = map_line_to_pf.get(no, None) - if pf_ids is not None: - pfstr = '#' + ', '.join(pf_ids) - else: - pfstr = '' - stripped = line.strip('\n') - srclen = len(stripped) - if pf_ids: - l = fmt.format(width * '-', width, pfstr) - else: - l = fmt.format(width * ' ', width, pfstr) - newlines.append(stripped + l[srclen:]) - print('\n'.join(newlines)) - else: - print("No source available") + self.source_listing(parfors_simple, purpose_str) #---------- these are used a lot here on in sword = '+--' @@ -1075,198 +1322,16 @@ def print_g(adj, root, depth): print_wrapped("") #---------- compute various properties and orderings in the data for subsequent use - - # ensure adjacency lists are the same size for both sets of info - # (nests and fusion may not traverse the same space, for - # convenience [] is used as a condition to halt recursion) - fadj, froots = self.compute_graph_info(self.fusion_info) - nadj, _nroots = self.compute_graph_info(self.nested_fusion_info) - - if len(fadj) > len(nadj): - lim = len(fadj) - tmp = nadj - else: - lim = len(nadj) - tmp = fadj - for x in range(len(tmp), lim): - tmp.append([]) - - # This computes the roots of true loop nests (i.e. loops containing - # loops opposed to just a loop that's a root). 
- nroots = set() - if _nroots: - for r in _nroots: - if nadj[r] != []: - nroots.add(r) - all_roots = froots ^ nroots - - # This computes all the parfors at the top level that are either: - # - roots of loop fusion - # - roots of true loop nests - # it then combines these based on source line number for ease of - # producing output ordered in a manner similar to the code structure - froots_lines = {} - for x in froots: - line = self.sort_pf_by_line(x, parfors_simple) - froots_lines[line] = 'fuse', x, fadj - - nroots_lines = {} - for x in nroots: - line = self.sort_pf_by_line(x, parfors_simple) - nroots_lines[line] = 'nest', x, nadj - - all_lines = froots_lines.copy() - all_lines.update(nroots_lines) - - # nroots, froots, nadj and fadj are all set up correctly - # define some print functions - - def print_unoptimised(lines): - # This prints the unoptimised parfors state - - fac = len(sword) - - def print_nest(fadj_, nadj_, theroot, reported, region_id): - def print_g(fadj_, nadj_, nroot, depth): - print_wrapped(fac * depth * ' ' + '%s%s %s' % (sword, nroot, '(parallel)')) - for k in nadj_[nroot]: - if nadj_[k] == []: - msg = [] - msg.append(fac * (depth + 1) * ' ' + '%s%s %s' % (sword, k, '(parallel)')) - if fadj_[k] != [] and k not in reported: - fused = self.reachable_nodes(fadj_, k) - for i in fused: - msg.append(fac * (depth + 1) * ' ' + '%s%s %s' % (sword, i, '(parallel)')) - reported.append(k) - print_wrapped('\n'.join(msg)) - else: - print_g(fadj_, nadj_, k, depth + 1) - - if nadj_[theroot] != []: - print_wrapped("Parallel region %s:" % region_id) - print_g(fadj_, nadj_, theroot, 0) - print("\n") - region_id = region_id + 1 - return region_id - - def print_fuse(ty, pf_id, adj, depth, region_id): - msg = [] - print_wrapped("Parallel region %s:" % region_id) - msg.append(fac * depth * ' ' + '%s%s %s' % (sword, pf_id, '(parallel)')) - if adj[pf_id] != []: - fused = sorted(self.reachable_nodes(adj, pf_id)) - for k in fused: - msg.append(fac * depth * ' ' + '%s%s 
%s' % (sword, k, '(parallel)')) - region_id = region_id + 1 - print_wrapped('\n'.join(msg)) - print("\n") - return region_id - - # Walk the parfors by src line and print optimised structure - region_id = 0 - reported = [] - for line, info in sorted(lines.items()): - opt_ty, pf_id, adj = info - if opt_ty == 'fuse': - if pf_id not in reported: - region_id = print_fuse('f', pf_id, adj, 0, region_id) - elif opt_ty == 'nest': - region_id = print_nest(fadj, nadj, pf_id, reported, region_id) - else: - assert 0 - - def print_optimised(lines): - # This prints the optimised output based on the transforms that - # occurred during loop fusion and rewriting of loop nests - fac = len(sword) - - summary = dict() - # region : {fused, serialized} - - def print_nest(fadj_, nadj_, theroot, reported, region_id): - def print_g(fadj_, nadj_, nroot, depth): - for k in nadj_[nroot]: - msg = fac * depth * ' ' + '%s%s %s' % (sword, k, '(serial') - if nadj_[k] == []: - fused = [] - if fadj_[k] != [] and k not in reported: - fused = sorted(self.reachable_nodes(fadj_, k)) - msg += ", fused with loop(s): " - msg += ', '.join([str(x) for x in fused]) - msg += ')' - reported.append(k) - print_wrapped(msg) - summary[region_id]['fused'] += len(fused) - else: - print_wrapped(msg + ')') - print_g(fadj_, nadj_, k, depth + 1) - summary[region_id]['serialized'] += 1 - - if nadj_[theroot] != []: - print_wrapped("Parallel region %s:" % region_id) - print_wrapped('%s%s %s' % (sword, theroot, '(parallel)')) - summary[region_id] = {'root': theroot, 'fused': 0, 'serialized': 0} - print_g(fadj_, nadj_, theroot, 1) - print("\n") - region_id = region_id + 1 - return region_id - - def print_fuse(ty, pf_id, adj, depth, region_id): - print_wrapped("Parallel region %s:" % region_id) - msg = fac * depth * ' ' + '%s%s %s' % (sword, pf_id, '(parallel') - fused = [] - if adj[pf_id] != []: - fused = sorted(self.reachable_nodes(adj, pf_id)) - msg += ", fused with loop(s): " - msg += ', '.join([str(x) for x in fused]) - - 
summary[region_id] = {'root': pf_id, 'fused': len(fused), 'serialized': 0} - msg += ')' - print_wrapped(msg) - print("\n") - region_id = region_id + 1 - return region_id - - # Walk the parfors by src line and print optimised structure - region_id = 0 - reported = [] - for line, info in sorted(lines.items()): - opt_ty, pf_id, adj = info - if opt_ty == 'fuse': - if pf_id not in reported: - region_id = print_fuse('f', pf_id, adj, 0, region_id) - elif opt_ty == 'nest': - region_id = print_nest(fadj, nadj, pf_id, reported, region_id) - else: - assert 0 - - # print the summary of the fuse/serialize rewrite - if summary: - for k, v in sorted(summary.items()): - msg = ('\n \nParallel region %s (loop #%s) had %s ' - 'loop(s) fused') - root = v['root'] - fused = v['fused'] - serialized = v['serialized'] - if serialized != 0: - msg += (' and %s loop(s) ' - 'serialized as part of the larger ' - 'parallel loop (#%s).') - print_wrapped(msg % (k, root, fused, serialized, root)) - else: - msg += '.' - print_wrapped(msg % (k, root, fused)) - else: - print_wrapped("Parallel structure is already optimal.") + all_lines = self.get_all_lines(parfors_simple) if print_pre_optimised: print(' Before Optimisation '.center(_termwidth,'-')) - print_unoptimised(all_lines) + self.print_unoptimised(all_lines) print(_termwidth * '-') if print_post_optimised: print(' After Optimisation '.center(_termwidth,'-')) - print_optimised(all_lines) + self.print_optimised(all_lines) print(_termwidth * '-') print_wrapped("") print_wrapped(_termwidth * '-') @@ -1277,60 +1342,10 @@ def print_fuse(ty, pf_id, adj, depth, region_id): print_wrapped('Loop invariant code motion'.center(80, '-')) if print_allocation_hoist: - found = False - print('Allocation hoisting:') - for pf_id, data in self.hoist_info.items(): - stmt = data.get('hoisted', []) - for inst in stmt: - if isinstance(inst.value, ir.Expr): - try: - attr = inst.value.attr - if attr == 'empty': - msg = ("The memory allocation derived from the " - 
"instruction at %s is hoisted out of the " - "parallel loop labelled #%s (it will be " - "performed before the loop is executed and " - "reused inside the loop):") - loc = inst.loc - print_wrapped(msg % (loc, pf_id)) - try: - path = os.path.relpath(loc.filename) - except ValueError: - path = os.path.abspath(loc.filename) - lines = linecache.getlines(path) - if lines and loc.line: - print_wrapped(" Allocation:: " + lines[0 if loc.line < 2 else loc.line - 1].strip()) - print_wrapped(" - numpy.empty() is used for the allocation.\n") - found = True - except (KeyError, AttributeError): - pass - if not found: - print_wrapped('No allocation hoisting found') - if print_instruction_hoist: - print("") - print('Instruction hoisting:') - hoist_info_printed = False - if self.hoist_info: - for pf_id, data in self.hoist_info.items(): - hoisted = data.get('hoisted', None) - not_hoisted = data.get('not_hoisted', None) - if not hoisted and not not_hoisted: - print("loop #%s has nothing to hoist." % pf_id) - continue - - print("loop #%s:" % pf_id) - if hoisted: - print(" Has the following hoisted:") - [print(" %s" % y) for y in hoisted] - hoist_info_printed = True - if not_hoisted: - print(" Failed to hoist the following:") - [print(" %s: %s" % (y, x)) for x, y in not_hoisted] - hoist_info_printed = True - if not hoist_info_printed: - print_wrapped('No instruction hoisting found') - print_wrapped(80 * '-') + self.allocation_hoist() + if print_instruction_hoist: + self.instruction_hoist() else: # there are no parfors print_wrapped('Function %s, %s, has no parallel for-loops.'.format(name, line)) From 27db7a1994ff8ecbe9dcfc43f68bbc5b9b0fab19 Mon Sep 17 00:00:00 2001 From: Elena Totmenina Date: Tue, 8 Dec 2020 15:52:23 +0300 Subject: [PATCH 09/19] Fix ParforDiagnostic errors (#137) Co-authored-by: etotmeni --- numba/parfors/parfor.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/numba/parfors/parfor.py b/numba/parfors/parfor.py index 
2632db89adb..b845b43bf2a 100644 --- a/numba/parfors/parfor.py +++ b/numba/parfors/parfor.py @@ -889,7 +889,7 @@ def get_parfors_simple(self, print_loop_search): if print_loop_search: print_wrapped(fmt % (pf.id, loc, r_pattern)) parfors_simple[pf.id] = (pf, loc, r_pattern) - return parfors_simple + return parfors_simple def get_all_lines(self, parfors_simple): # ensure adjacency lists are the same size for both sets of info @@ -983,6 +983,17 @@ def print_unoptimised(self, lines): # This prints the unoptimised parfors state sword = '+--' fac = len(sword) + fadj, froots = self.compute_graph_info(self.fusion_info) + nadj, _nroots = self.compute_graph_info(self.nested_fusion_info) + + if len(fadj) > len(nadj): + lim = len(fadj) + tmp = nadj + else: + lim = len(nadj) + tmp = fadj + for x in range(len(tmp), lim): + tmp.append([]) def print_nest(fadj_, nadj_, theroot, reported, region_id): def print_g(fadj_, nadj_, nroot, depth): @@ -1038,6 +1049,17 @@ def print_optimised(self, lines): # occurred during loop fusion and rewriting of loop nests sword = '+--' fac = len(sword) + fadj, froots = self.compute_graph_info(self.fusion_info) + nadj, _nroots = self.compute_graph_info(self.nested_fusion_info) + + if len(fadj) > len(nadj): + lim = len(fadj) + tmp = nadj + else: + lim = len(nadj) + tmp = fadj + for x in range(len(tmp), lim): + tmp.append([]) summary = dict() # region : {fused, serialized} From 2dabcd0ead3b8036115291fa156eda0265d1715f Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Wed, 9 Dec 2020 13:06:39 +0300 Subject: [PATCH 10/19] Initial support for Numpy subclasses Co-authored-by: Todd A. 
Anderson --- numba/_typeof.c | 13 ++++- numba/core/extending.py | 2 +- numba/core/ir_utils.py | 12 ++-- numba/core/pythonapi.py | 9 ++- numba/core/runtime/_nrt_python.c | 37 +++++++++++- numba/core/runtime/_nrt_pythonmod.c | 1 + numba/core/runtime/nrt.c | 90 +++++++++++++++++++++++------ numba/core/runtime/nrt.h | 24 +++++++- numba/core/runtime/nrt_external.h | 16 +++++ numba/core/types/npytypes.py | 4 +- numba/core/typing/npydecl.py | 21 +++++-- numba/np/arrayobj.py | 11 +++- 12 files changed, 203 insertions(+), 37 deletions(-) diff --git a/numba/_typeof.c b/numba/_typeof.c index ffe0e3a3c58..9b259164800 100644 --- a/numba/_typeof.c +++ b/numba/_typeof.c @@ -768,6 +768,7 @@ int typeof_typecode(PyObject *dispatcher, PyObject *val) { PyTypeObject *tyobj = Py_TYPE(val); + int no_subtype_attr; /* This needs to be kept in sync with Dispatcher.typeof_pyval(), * otherwise funny things may happen. */ @@ -794,9 +795,19 @@ typeof_typecode(PyObject *dispatcher, PyObject *val) return typecode_arrayscalar(dispatcher, val); } /* Array handling */ - else if (PyType_IsSubtype(tyobj, &PyArray_Type)) { + else if (tyobj == &PyArray_Type) { return typecode_ndarray(dispatcher, (PyArrayObject*)val); } + /* Subtypes of Array handling */ + else if (PyType_IsSubtype(tyobj, &PyArray_Type)) { + /* If the class has an attribute named __numba_no_subtype_ndarray then + don't treat it as a normal variant of a Numpy ndarray but as it's own + separate type. 
*/ + no_subtype_attr = PyObject_HasAttrString(val, "__numba_no_subtype_ndarray__"); + if (!no_subtype_attr) { + return typecode_ndarray(dispatcher, (PyArrayObject*)val); + } + } return typecode_using_fingerprint(dispatcher, val); } diff --git a/numba/core/extending.py b/numba/core/extending.py index 8d8d8525e21..09373708b48 100644 --- a/numba/core/extending.py +++ b/numba/core/extending.py @@ -14,7 +14,7 @@ lower_setattr, lower_setattr_generic, lower_cast) # noqa: F401 from numba.core.datamodel import models # noqa: F401 from numba.core.datamodel import register_default as register_model # noqa: F401, E501 -from numba.core.pythonapi import box, unbox, reflect, NativeValue # noqa: F401 +from numba.core.pythonapi import box, unbox, reflect, NativeValue, allocator # noqa: F401 from numba._helperlib import _import_cython_function # noqa: F401 from numba.core.serialize import ReduceMixin diff --git a/numba/core/ir_utils.py b/numba/core/ir_utils.py index 1d58c5c8b5b..9ffdfb16b07 100644 --- a/numba/core/ir_utils.py +++ b/numba/core/ir_utils.py @@ -64,6 +64,8 @@ def mk_alloc(typemap, calltypes, lhs, size_var, dtype, scope, loc): out = [] ndims = 1 size_typ = types.intp + # Get the type of the array being allocated. + arr_typ = typemap[lhs.name] if isinstance(size_var, tuple): if len(size_var) == 1: size_var = size_var[0] @@ -108,11 +110,13 @@ def mk_alloc(typemap, calltypes, lhs, size_var, dtype, scope, loc): typ_var_assign = ir.Assign(np_typ_getattr, typ_var, loc) alloc_call = ir.Expr.call(attr_var, [size_var, typ_var], (), loc) if calltypes: - calltypes[alloc_call] = typemap[attr_var.name].get_call_type( + cac = typemap[attr_var.name].get_call_type( typing.Context(), [size_typ, types.functions.NumberClass(dtype)], {}) - # signature( - # types.npytypes.Array(dtype, ndims, 'C'), size_typ, - # types.functions.NumberClass(dtype)) + # By default, all calls to "empty" are typed as returning a standard + # Numpy ndarray. 
If we are allocating a ndarray subclass here then + # just change the return type to be that of the subclass. + cac._return_type = arr_typ + calltypes[alloc_call] = cac alloc_assign = ir.Assign(alloc_call, lhs, loc) out.extend([g_np_assign, attr_assign, typ_var_assign, alloc_assign]) diff --git a/numba/core/pythonapi.py b/numba/core/pythonapi.py index 7901e761d9f..f84ad7b2ce1 100644 --- a/numba/core/pythonapi.py +++ b/numba/core/pythonapi.py @@ -45,10 +45,13 @@ def lookup(self, typeclass, default=None): _boxers = _Registry() _unboxers = _Registry() _reflectors = _Registry() +# Registry of special allocators for types. +_allocators = _Registry() box = _boxers.register unbox = _unboxers.register reflect = _reflectors.register +allocator = _allocators.register class _BoxContext(namedtuple("_BoxContext", ("context", "builder", "pyapi", "env_manager"))): @@ -1186,8 +1189,11 @@ def nrt_adapt_ndarray_to_python(self, aryty, ary, dtypeptr): assert self.context.enable_nrt, "NRT required" intty = ir.IntType(32) + # Embed the Python type of the array (maybe subclass) in the LLVM. 
+ serial_aryty_pytype = self.unserialize(self.serialize_object(aryty.py_type)) + fnty = Type.function(self.pyobj, - [self.voidptr, intty, intty, self.pyobj]) + [self.voidptr, self.pyobj, intty, intty, self.pyobj]) fn = self._get_function(fnty, name="NRT_adapt_ndarray_to_python") fn.args[0].add_attribute(lc.ATTR_NO_CAPTURE) @@ -1197,6 +1203,7 @@ def nrt_adapt_ndarray_to_python(self, aryty, ary, dtypeptr): aryptr = cgutils.alloca_once_value(self.builder, ary) return self.builder.call(fn, [self.builder.bitcast(aryptr, self.voidptr), + serial_aryty_pytype, ndim, writable, dtypeptr]) def nrt_meminfo_new_from_pyobject(self, data, pyobj): diff --git a/numba/core/runtime/_nrt_python.c b/numba/core/runtime/_nrt_python.c index 33620fd4f1a..efe4467df70 100644 --- a/numba/core/runtime/_nrt_python.c +++ b/numba/core/runtime/_nrt_python.c @@ -55,6 +55,8 @@ int MemInfo_init(MemInfoObject *self, PyObject *args, PyObject *kwds) { return -1; } raw_ptr = PyLong_AsVoidPtr(raw_ptr_obj); + NRT_Debug(nrt_debug_print("MemInfo_init self=%p raw_ptr=%p\n", self, raw_ptr)); + if(PyErr_Occurred()) return -1; self->meminfo = (NRT_MemInfo *)raw_ptr; assert (NRT_MemInfo_refcount(self->meminfo) > 0 && "0 refcount"); @@ -109,6 +111,27 @@ MemInfo_get_refcount(MemInfoObject *self, void *closure) { return PyLong_FromSize_t(refct); } +static +PyObject* +MemInfo_get_external_allocator(MemInfoObject *self, void *closure) { + void *p = NRT_MemInfo_external_allocator(self->meminfo); + printf("MemInfo_get_external_allocator %p\n", p); + return PyLong_FromVoidPtr(p); +} + +static +PyObject* +MemInfo_get_parent(MemInfoObject *self, void *closure) { + void *p = NRT_MemInfo_parent(self->meminfo); + if (p) { + Py_INCREF(p); + return (PyObject*)p; + } else { + Py_INCREF(Py_None); + return Py_None; + } +} + static void MemInfo_dealloc(MemInfoObject *self) { @@ -136,6 +159,13 @@ static PyGetSetDef MemInfo_getsets[] = { (getter)MemInfo_get_refcount, NULL, "Get the refcount", NULL}, + {"external_allocator", + 
(getter)MemInfo_get_external_allocator, NULL, + "Get the external allocator", + NULL}, + {"parent", + (getter)MemInfo_get_parent, NULL, + NULL}, {NULL} /* Sentinel */ }; @@ -286,7 +316,7 @@ PyObject* try_to_return_parent(arystruct_t *arystruct, int ndim, } NUMBA_EXPORT_FUNC(PyObject *) -NRT_adapt_ndarray_to_python(arystruct_t* arystruct, int ndim, +NRT_adapt_ndarray_to_python(arystruct_t* arystruct, PyTypeObject *retty, int ndim, int writeable, PyArray_Descr *descr) { PyArrayObject *array; @@ -324,10 +354,13 @@ NRT_adapt_ndarray_to_python(arystruct_t* arystruct, int ndim, args = PyTuple_New(1); /* SETITEM steals reference */ PyTuple_SET_ITEM(args, 0, PyLong_FromVoidPtr(arystruct->meminfo)); + NRT_Debug(nrt_debug_print("NRT_adapt_ndarray_to_python arystruct->meminfo=%p\n", arystruct->meminfo)); /* Note: MemInfo_init() does not incref. This function steals the * NRT reference. */ + NRT_Debug(nrt_debug_print("NRT_adapt_ndarray_to_python created MemInfo=%p\n", miobj)); if (MemInfo_init(miobj, args, NULL)) { + NRT_Debug(nrt_debug_print("MemInfo_init returned 0.\n")); return NULL; } Py_DECREF(args); @@ -336,7 +369,7 @@ NRT_adapt_ndarray_to_python(arystruct_t* arystruct, int ndim, shape = arystruct->shape_and_strides; strides = shape + ndim; Py_INCREF((PyObject *) descr); - array = (PyArrayObject *) PyArray_NewFromDescr(&PyArray_Type, descr, ndim, + array = (PyArrayObject *) PyArray_NewFromDescr(retty, descr, ndim, shape, strides, arystruct->data, flags, (PyObject *) miobj); diff --git a/numba/core/runtime/_nrt_pythonmod.c b/numba/core/runtime/_nrt_pythonmod.c index 31e1155fd9f..d1300ee8e9a 100644 --- a/numba/core/runtime/_nrt_pythonmod.c +++ b/numba/core/runtime/_nrt_pythonmod.c @@ -163,6 +163,7 @@ declmethod(MemInfo_alloc); declmethod(MemInfo_alloc_safe); declmethod(MemInfo_alloc_aligned); declmethod(MemInfo_alloc_safe_aligned); +declmethod(MemInfo_alloc_safe_aligned_external); declmethod(MemInfo_alloc_dtor_safe); declmethod(MemInfo_call_dtor); 
declmethod(MemInfo_new_varsize); diff --git a/numba/core/runtime/nrt.c b/numba/core/runtime/nrt.c index 534681d5417..fe63a691537 100644 --- a/numba/core/runtime/nrt.c +++ b/numba/core/runtime/nrt.c @@ -19,6 +19,7 @@ struct MemInfo { void *dtor_info; void *data; size_t size; /* only used for NRT allocated memory */ + NRT_ExternalAllocator *external_allocator; }; @@ -170,13 +171,16 @@ void NRT_MemSys_set_atomic_cas_stub(void) { */ void NRT_MemInfo_init(NRT_MemInfo *mi,void *data, size_t size, - NRT_dtor_function dtor, void *dtor_info) + NRT_dtor_function dtor, void *dtor_info, + NRT_ExternalAllocator *external_allocator) { mi->refct = 1; /* starts with 1 refct */ mi->dtor = dtor; mi->dtor_info = dtor_info; mi->data = data; mi->size = size; + mi->external_allocator = external_allocator; + NRT_Debug(nrt_debug_print("NRT_MemInfo_init mi=%p external_allocator=%p\n", mi, external_allocator)); /* Update stats */ TheMSys.atomic_inc(&TheMSys.stats_mi_alloc); } @@ -185,7 +189,8 @@ NRT_MemInfo *NRT_MemInfo_new(void *data, size_t size, NRT_dtor_function dtor, void *dtor_info) { NRT_MemInfo *mi = NRT_Allocate(sizeof(NRT_MemInfo)); - NRT_MemInfo_init(mi, data, size, dtor, dtor_info); + NRT_Debug(nrt_debug_print("NRT_MemInfo_new mi=%p\n", mi)); + NRT_MemInfo_init(mi, data, size, dtor, dtor_info, NULL); return mi; } @@ -206,9 +211,10 @@ void nrt_internal_dtor_safe(void *ptr, size_t size, void *info) { } static -void *nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo **mi_out) { +void *nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo **mi_out, NRT_ExternalAllocator *allocator) { NRT_MemInfo *mi; - char *base = NRT_Allocate(sizeof(NRT_MemInfo) + size); + NRT_Debug(nrt_debug_print("nrt_allocate_meminfo_and_data %p\n", allocator)); + char *base = NRT_Allocate_External(sizeof(NRT_MemInfo) + size, allocator); mi = (NRT_MemInfo *) base; *mi_out = mi; return base + sizeof(NRT_MemInfo); @@ -230,9 +236,17 @@ void nrt_internal_custom_dtor_safe(void *ptr, size_t size, void *info) { 
NRT_MemInfo *NRT_MemInfo_alloc(size_t size) { NRT_MemInfo *mi; - void *data = nrt_allocate_meminfo_and_data(size, &mi); + void *data = nrt_allocate_meminfo_and_data(size, &mi, NULL); NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc %p\n", data)); - NRT_MemInfo_init(mi, data, size, NULL, NULL); + NRT_MemInfo_init(mi, data, size, NULL, NULL, NULL); + return mi; +} + +NRT_MemInfo *NRT_MemInfo_alloc_external(size_t size, NRT_ExternalAllocator *allocator) { + NRT_MemInfo *mi; + void *data = nrt_allocate_meminfo_and_data(size, &mi, allocator); + NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc %p\n", data)); + NRT_MemInfo_init(mi, data, size, NULL, NULL, allocator); return mi; } @@ -242,22 +256,23 @@ NRT_MemInfo *NRT_MemInfo_alloc_safe(size_t size) { NRT_MemInfo* NRT_MemInfo_alloc_dtor_safe(size_t size, NRT_dtor_function dtor) { NRT_MemInfo *mi; - void *data = nrt_allocate_meminfo_and_data(size, &mi); + void *data = nrt_allocate_meminfo_and_data(size, &mi, NULL); /* Only fill up a couple cachelines with debug markers, to minimize overhead. 
*/ memset(data, 0xCB, MIN(size, 256)); NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc_dtor_safe %p %zu\n", data, size)); - NRT_MemInfo_init(mi, data, size, nrt_internal_custom_dtor_safe, dtor); + NRT_MemInfo_init(mi, data, size, nrt_internal_custom_dtor_safe, dtor, NULL); return mi; } static void *nrt_allocate_meminfo_and_data_align(size_t size, unsigned align, - NRT_MemInfo **mi) + NRT_MemInfo **mi, NRT_ExternalAllocator *allocator) { size_t offset, intptr, remainder; - char *base = nrt_allocate_meminfo_and_data(size + 2 * align, mi); + NRT_Debug(nrt_debug_print("nrt_allocate_meminfo_and_data_align %p\n", allocator)); + char *base = nrt_allocate_meminfo_and_data(size + 2 * align, mi, allocator); intptr = (size_t) base; /* See if we are aligned */ remainder = intptr % align; @@ -271,26 +286,48 @@ void *nrt_allocate_meminfo_and_data_align(size_t size, unsigned align, NRT_MemInfo *NRT_MemInfo_alloc_aligned(size_t size, unsigned align) { NRT_MemInfo *mi; - void *data = nrt_allocate_meminfo_and_data_align(size, align, &mi); + void *data = nrt_allocate_meminfo_and_data_align(size, align, &mi, NULL); NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc_aligned %p\n", data)); - NRT_MemInfo_init(mi, data, size, NULL, NULL); + NRT_MemInfo_init(mi, data, size, NULL, NULL, NULL); return mi; } NRT_MemInfo *NRT_MemInfo_alloc_safe_aligned(size_t size, unsigned align) { NRT_MemInfo *mi; - void *data = nrt_allocate_meminfo_and_data_align(size, align, &mi); + void *data = nrt_allocate_meminfo_and_data_align(size, align, &mi, NULL); /* Only fill up a couple cachelines with debug markers, to minimize overhead. 
*/ memset(data, 0xCB, MIN(size, 256)); NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc_safe_aligned %p %zu\n", data, size)); - NRT_MemInfo_init(mi, data, size, nrt_internal_dtor_safe, (void*)size); + NRT_MemInfo_init(mi, data, size, nrt_internal_dtor_safe, (void*)size, NULL); return mi; } +NRT_MemInfo *NRT_MemInfo_alloc_safe_aligned_external(size_t size, unsigned align, NRT_ExternalAllocator *allocator) { + NRT_MemInfo *mi; + NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc_safe_aligned_external %p\n", allocator)); + void *data = nrt_allocate_meminfo_and_data_align(size, align, &mi, allocator); + /* Only fill up a couple cachelines with debug markers, to minimize + overhead. */ + memset(data, 0xCB, MIN(size, 256)); + NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc_safe_aligned %p %zu\n", + data, size)); + NRT_MemInfo_init(mi, data, size, nrt_internal_dtor_safe, (void*)size, allocator); + return mi; +} + +void NRT_dealloc(NRT_MemInfo *mi) { + NRT_Debug(nrt_debug_print("NRT_dealloc meminfo: %p external_allocator: %p\n", mi, mi->external_allocator)); + if (mi->external_allocator) { + mi->external_allocator->free(mi, mi->external_allocator->opaque_data); + } else { + NRT_Free(mi); + } +} + void NRT_MemInfo_destroy(NRT_MemInfo *mi) { - NRT_Free(mi); + NRT_dealloc(mi); TheMSys.atomic_inc(&TheMSys.stats_mi_free); } @@ -328,6 +365,14 @@ size_t NRT_MemInfo_size(NRT_MemInfo* mi) { return mi->size; } +void * NRT_MemInfo_external_allocator(NRT_MemInfo *mi) { + NRT_Debug(nrt_debug_print("NRT_MemInfo_external_allocator meminfo: %p external_allocator: %p\n", mi, mi->external_allocator)); + return mi->external_allocator; +} + +void *NRT_MemInfo_parent(NRT_MemInfo *mi) { + return mi->dtor_info; +} void NRT_MemInfo_dump(NRT_MemInfo *mi, FILE *out) { fprintf(out, "MemInfo %p refcount %zu\n", mi, mi->refct); @@ -414,8 +459,18 @@ void NRT_MemInfo_varsize_free(NRT_MemInfo *mi, void *ptr) */ void* NRT_Allocate(size_t size) { - void *ptr = TheMSys.allocator.malloc(size); - 
NRT_Debug(nrt_debug_print("NRT_Allocate bytes=%zu ptr=%p\n", size, ptr)); + return NRT_Allocate_External(size, NULL); +} + +void* NRT_Allocate_External(size_t size, NRT_ExternalAllocator *allocator) { + void *ptr; + if (allocator) { + ptr = allocator->malloc(size, allocator->opaque_data); + NRT_Debug(nrt_debug_print("NRT_Allocate custom bytes=%zu ptr=%p\n", size, ptr)); + } else { + ptr = TheMSys.allocator.malloc(size); + NRT_Debug(nrt_debug_print("NRT_Allocate bytes=%zu ptr=%p\n", size, ptr)); + } TheMSys.atomic_inc(&TheMSys.stats_alloc); return ptr; } @@ -460,6 +515,7 @@ NRT_MemInfo* nrt_manage_memory(void *data, NRT_managed_dtor dtor) { static const NRT_api_functions nrt_functions_table = { NRT_MemInfo_alloc, + NRT_MemInfo_alloc_external, nrt_manage_memory, NRT_MemInfo_acquire, NRT_MemInfo_release, diff --git a/numba/core/runtime/nrt.h b/numba/core/runtime/nrt.h index 3c74dc58f58..9fb23532964 100644 --- a/numba/core/runtime/nrt.h +++ b/numba/core/runtime/nrt.h @@ -15,13 +15,14 @@ All functions described here are threadsafe. 
/* Debugging facilities - enabled at compile-time */ /* #undef NDEBUG */ #if 0 -# define NRT_Debug(X) X +# define NRT_Debug(X) {X; fflush(stdout); } #else # define NRT_Debug(X) if (0) { X; } #endif /* TypeDefs */ typedef void (*NRT_dtor_function)(void *ptr, size_t size, void *info); +typedef void (*NRT_dealloc_func)(void *ptr, void *dealloc_info); typedef size_t (*NRT_atomic_inc_dec_func)(size_t *ptr); typedef int (*NRT_atomic_cas_func)(void * volatile *ptr, void *cmp, void *repl, void **oldptr); @@ -32,7 +33,6 @@ typedef void *(*NRT_malloc_func)(size_t size); typedef void *(*NRT_realloc_func)(void *ptr, size_t new_size); typedef void (*NRT_free_func)(void *ptr); - /* Memory System API */ /* Initialize the memory system */ @@ -101,7 +101,8 @@ NRT_MemInfo* NRT_MemInfo_new(void *data, size_t size, VISIBILITY_HIDDEN void NRT_MemInfo_init(NRT_MemInfo *mi, void *data, size_t size, - NRT_dtor_function dtor, void *dtor_info); + NRT_dtor_function dtor, void *dtor_info, + NRT_ExternalAllocator *external_allocator); /* * Returns the refcount of a MemInfo or (size_t)-1 if error. @@ -116,6 +117,8 @@ size_t NRT_MemInfo_refcount(NRT_MemInfo *mi); VISIBILITY_HIDDEN NRT_MemInfo *NRT_MemInfo_alloc(size_t size); +NRT_MemInfo *NRT_MemInfo_alloc_external(size_t size, NRT_ExternalAllocator *allocator); + /* * The "safe" NRT_MemInfo_alloc performs additional steps to help debug * memory errors. @@ -141,6 +144,8 @@ NRT_MemInfo *NRT_MemInfo_alloc_aligned(size_t size, unsigned align); VISIBILITY_HIDDEN NRT_MemInfo *NRT_MemInfo_alloc_safe_aligned(size_t size, unsigned align); +NRT_MemInfo *NRT_MemInfo_alloc_safe_aligned_external(size_t size, unsigned align, NRT_ExternalAllocator *allocator); + /* * Internal API. * Release a MemInfo. Calls NRT_MemSys_insert_meminfo. 
@@ -179,6 +184,18 @@ void* NRT_MemInfo_data(NRT_MemInfo* mi); VISIBILITY_HIDDEN size_t NRT_MemInfo_size(NRT_MemInfo* mi); +/* + * Returns the external allocator + */ +VISIBILITY_HIDDEN +void* NRT_MemInfo_external_allocator(NRT_MemInfo* mi); + +/* + * Returns the parent MemInfo + */ +VISIBILITY_HIDDEN +void* NRT_MemInfo_parent(NRT_MemInfo* mi); + /* * NRT API for resizable buffers. @@ -207,6 +224,7 @@ void NRT_MemInfo_dump(NRT_MemInfo *mi, FILE *out); * Allocate memory of `size` bytes. */ VISIBILITY_HIDDEN void* NRT_Allocate(size_t size); +VISIBILITY_HIDDEN void* NRT_Allocate_External(size_t size, NRT_ExternalAllocator *allocator); /* * Deallocate memory pointed by `ptr`. diff --git a/numba/core/runtime/nrt_external.h b/numba/core/runtime/nrt_external.h index 391b6fa1b0e..a4835c36f67 100644 --- a/numba/core/runtime/nrt_external.h +++ b/numba/core/runtime/nrt_external.h @@ -7,6 +7,18 @@ typedef struct MemInfo NRT_MemInfo; typedef void NRT_managed_dtor(void *data); +typedef void *(*NRT_external_malloc_func)(size_t size, void *opaque_data); +typedef void *(*NRT_external_realloc_func)(void *ptr, size_t new_size, void *opaque_data); +typedef void (*NRT_external_free_func)(void *ptr, void *opaque_data); + +struct ExternalMemAllocator { + NRT_external_malloc_func malloc; + NRT_external_realloc_func realloc; + NRT_external_free_func free; + void *opaque_data; +}; + +typedef struct ExternalMemAllocator NRT_ExternalAllocator; typedef struct { /* Methods to create MemInfos. @@ -21,6 +33,10 @@ typedef struct { Returning a new reference. */ NRT_MemInfo* (*allocate)(size_t nbytes); + /* Allocator memory using an external allocator but still using Numba's MemInfo. + + */ + NRT_MemInfo* (*allocate_external)(size_t nbytes, NRT_ExternalAllocator *allocator); /* Convert externally allocated memory into a MemInfo. 
diff --git a/numba/core/types/npytypes.py b/numba/core/types/npytypes.py index 6f6307c5526..3c2191ca23e 100644 --- a/numba/core/types/npytypes.py +++ b/numba/core/types/npytypes.py @@ -8,6 +8,7 @@ from numba.core import utils from .misc import UnicodeType from .containers import Bytes +import numpy as np class CharSeq(Type): """ @@ -394,8 +395,9 @@ class Array(Buffer): Type class for Numpy arrays. """ - def __init__(self, dtype, ndim, layout, readonly=False, name=None, + def __init__(self, dtype, ndim, layout, py_type=np.ndarray, readonly=False, name=None, aligned=True, addrspace=None): + self.py_type = py_type if readonly: self.mutable = False if (not aligned or diff --git a/numba/core/typing/npydecl.py b/numba/core/typing/npydecl.py index 2dbbed39be9..e7ecf452fe9 100644 --- a/numba/core/typing/npydecl.py +++ b/numba/core/typing/npydecl.py @@ -126,7 +126,21 @@ def generic(self, args, kws): ret_tys = ufunc_loop.outputs[-implicit_output_count:] if ndims > 0: assert layout is not None - ret_tys = [types.Array(dtype=ret_ty, ndim=ndims, layout=layout) + # If either of the types involved in the ufunc operation have a + # __array_ufunc__ method then invoke the first such one to + # determine the output type of the ufunc. + array_ufunc_type = None + for a in args: + if hasattr(a, "__array_ufunc__"): + array_ufunc_type = a + break + output_type = types.Array + if array_ufunc_type is not None: + output_type = array_ufunc_type.__array_ufunc__(ufunc, "__call__", *args, **kws) + # Eventually better error handling! FIX ME! + assert(output_type is not None) + + ret_tys = [output_type(dtype=ret_ty, ndim=ndims, layout=layout) for ret_ty in ret_tys] ret_tys = [resolve_output_type(self.context, args, ret_ty) for ret_ty in ret_tys] @@ -517,6 +531,7 @@ def typer(shape, dtype=None): @infer_global(np.empty_like) @infer_global(np.zeros_like) +@infer_global(np.ones_like) class NdConstructorLike(CallableTemplate): """ Typing template for np.empty_like(), .zeros_like(), .ones_like(). 
@@ -544,9 +559,6 @@ def typer(arg, dtype=None): return typer -infer_global(np.ones_like)(NdConstructorLike) - - @infer_global(np.full) class NdFull(CallableTemplate): @@ -563,6 +575,7 @@ def typer(shape, fill_value, dtype=None): return typer + @infer_global(np.full_like) class NdFullLike(CallableTemplate): diff --git a/numba/np/arrayobj.py b/numba/np/arrayobj.py index 933b1c6565e..5749e7d9b5b 100644 --- a/numba/np/arrayobj.py +++ b/numba/np/arrayobj.py @@ -32,7 +32,7 @@ from numba.misc import quicksort, mergesort from numba.cpython import slicing from numba.cpython.unsafe.tuple import tuple_setitem - +from numba.core.pythonapi import _allocators def set_range_metadata(builder, load, lower_bound, upper_bound): """ @@ -3399,8 +3399,13 @@ def _empty_nd_impl(context, builder, arrtype, shapes): ) align = context.get_preferred_array_alignment(arrtype.dtype) - meminfo = context.nrt.meminfo_alloc_aligned(builder, size=allocsize, - align=align) + def alloc_unsupported(context, builder, size, align): + return context.nrt.meminfo_alloc_aligned(builder, size, align) + + # See if the type has a special allocator, if not use the default + # alloc_unsuppported allocator above. + allocator_impl = _allocators.lookup(arrtype.__class__, alloc_unsupported) + meminfo = allocator_impl(context, builder, size=allocsize, align=align) data = context.nrt.meminfo_data(builder, meminfo) From 864790061aa7548ac2b7ebcf448354def849e657 Mon Sep 17 00:00:00 2001 From: "Todd A. Anderson" Date: Fri, 11 Dec 2020 14:47:26 -0600 Subject: [PATCH 11/19] Remove printf. 
--- numba/core/runtime/_nrt_python.c | 1 - 1 file changed, 1 deletion(-) diff --git a/numba/core/runtime/_nrt_python.c b/numba/core/runtime/_nrt_python.c index efe4467df70..9012498292a 100644 --- a/numba/core/runtime/_nrt_python.c +++ b/numba/core/runtime/_nrt_python.c @@ -115,7 +115,6 @@ static PyObject* MemInfo_get_external_allocator(MemInfoObject *self, void *closure) { void *p = NRT_MemInfo_external_allocator(self->meminfo); - printf("MemInfo_get_external_allocator %p\n", p); return PyLong_FromVoidPtr(p); } From 2d5f18c17d60b5281b0669bb61fa39682f000481 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Thu, 12 Nov 2020 13:04:24 +0300 Subject: [PATCH 12/19] Remove the misspelling of finalize_dynamic_globals (numba#6466) This commit fixes misspelling of _finalize_dynamic_globals(). This function is used only in the same file. --- numba/core/codegen.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/numba/core/codegen.py b/numba/core/codegen.py index 1e05c12bacd..e22fa89012e 100644 --- a/numba/core/codegen.py +++ b/numba/core/codegen.py @@ -247,7 +247,7 @@ def finalize(self): self._final_module.verify() self._finalize_final_module() - def _finalize_dyanmic_globals(self): + def _finalize_dynamic_globals(self): # Scan for dynamic globals for gv in self._final_module.global_variables: if gv.name.startswith('numba.dynamic.globals'): @@ -265,7 +265,7 @@ def _finalize_final_module(self): """ Make the underlying LLVM module ready to use. """ - self._finalize_dyanmic_globals() + self._finalize_dynamic_globals() self._verify_declare_only_symbols() # Remember this on the module, for the object cache hooks From 42c24777598b7efb4a635929897fdf2708dab7e0 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Fri, 13 Nov 2020 04:56:53 -0600 Subject: [PATCH 13/19] Patch for addrspace (numba#6469) This commit adds modification introduced about addrspace from ce96c20 by @DrTodd13 and test fixed in 947b407 by @reazulhoque. 
This commit could be proposed to upstream. --- numba/core/datamodel/models.py | 2 +- numba/core/types/common.py | 3 ++- numba/core/types/misc.py | 5 +++-- numba/core/types/npytypes.py | 12 +++++++----- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/numba/core/datamodel/models.py b/numba/core/datamodel/models.py index 77b2fc90a21..4049d7c0e60 100644 --- a/numba/core/datamodel/models.py +++ b/numba/core/datamodel/models.py @@ -865,7 +865,7 @@ def __init__(self, dmm, fe_type): ('parent', types.pyobject), ('nitems', types.intp), ('itemsize', types.intp), - ('data', types.CPointer(fe_type.dtype)), + ('data', types.CPointer(fe_type.dtype, addrspace=fe_type.addrspace)), ('shape', types.UniTuple(types.intp, ndim)), ('strides', types.UniTuple(types.intp, ndim)), diff --git a/numba/core/types/common.py b/numba/core/types/common.py index f1806540c60..f054d62652e 100644 --- a/numba/core/types/common.py +++ b/numba/core/types/common.py @@ -45,7 +45,7 @@ class Buffer(IterableType, ArrayCompatible): # CS and FS are not reserved for inner contig but strided LAYOUTS = frozenset(['C', 'F', 'CS', 'FS', 'A']) - def __init__(self, dtype, ndim, layout, readonly=False, name=None): + def __init__(self, dtype, ndim, layout, readonly=False, name=None, addrspace=None): from .misc import unliteral if isinstance(dtype, Buffer): @@ -55,6 +55,7 @@ def __init__(self, dtype, ndim, layout, readonly=False, name=None): self.dtype = unliteral(dtype) self.ndim = ndim self.layout = layout + self.addrspace = addrspace if readonly: self.mutable = False if name is None: diff --git a/numba/core/types/misc.py b/numba/core/types/misc.py index 34d8f3a51bf..a2445d967d3 100644 --- a/numba/core/types/misc.py +++ b/numba/core/types/misc.py @@ -155,14 +155,15 @@ class CPointer(Type): """ mutable = True - def __init__(self, dtype): + def __init__(self, dtype, addrspace=None): self.dtype = dtype + self.addrspace = addrspace name = "%s*" % dtype super(CPointer, self).__init__(name) @property def 
key(self): - return self.dtype + return self.dtype, self.addrspace class EphemeralPointer(CPointer): diff --git a/numba/core/types/npytypes.py b/numba/core/types/npytypes.py index c7c312adce3..6f6307c5526 100644 --- a/numba/core/types/npytypes.py +++ b/numba/core/types/npytypes.py @@ -395,7 +395,7 @@ class Array(Buffer): """ def __init__(self, dtype, ndim, layout, readonly=False, name=None, - aligned=True): + aligned=True, addrspace=None): if readonly: self.mutable = False if (not aligned or @@ -408,7 +408,7 @@ def __init__(self, dtype, ndim, layout, readonly=False, name=None, if not self.aligned: type_name = "unaligned " + type_name name = "%s(%s, %sd, %s)" % (type_name, dtype, ndim, layout) - super(Array, self).__init__(dtype, ndim, layout, name=name) + super(Array, self).__init__(dtype, ndim, layout, name=name, addrspace=addrspace) @property def mangling_args(self): @@ -417,7 +417,7 @@ def mangling_args(self): 'aligned' if self.aligned else 'unaligned'] return self.__class__.__name__, args - def copy(self, dtype=None, ndim=None, layout=None, readonly=None): + def copy(self, dtype=None, ndim=None, layout=None, readonly=None, addrspace=None): if dtype is None: dtype = self.dtype if ndim is None: @@ -426,12 +426,14 @@ def copy(self, dtype=None, ndim=None, layout=None, readonly=None): layout = self.layout if readonly is None: readonly = not self.mutable + if addrspace is None: + addrspace = self.addrspace return Array(dtype=dtype, ndim=ndim, layout=layout, readonly=readonly, - aligned=self.aligned) + aligned=self.aligned, addrspace=addrspace) @property def key(self): - return self.dtype, self.ndim, self.layout, self.mutable, self.aligned + return self.dtype, self.ndim, self.layout, self.mutable, self.aligned, self.addrspace def unify(self, typingctx, other): """ From b095ac2c2ea2a307700ec3b4f4c1a55c509b0d0b Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Fri, 13 Nov 2020 04:56:55 -0600 Subject: [PATCH 14/19] Patch for change numpy functions mapping 
(numba#6467, numba#6468) Changed in 8ccfd36 and a77eab6 by @reazulhoque. Possibly this 2 changes are not related to each other. --- numba/np/npyimpl.py | 7 ++++++- numba/parfors/parfor.py | 5 +++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/numba/np/npyimpl.py b/numba/np/npyimpl.py index d1e65e72e80..1763b6a3f42 100644 --- a/numba/np/npyimpl.py +++ b/numba/np/npyimpl.py @@ -448,7 +448,12 @@ def __init__(self, context, builder, outer_sig): super(_KernelImpl, self).__init__(context, builder, outer_sig) loop = ufunc_find_matching_loop( ufunc, outer_sig.args + tuple(_unpack_output_types(ufunc, outer_sig))) - self.fn = ufunc_db.get_ufunc_info(ufunc).get(loop.ufunc_sig) + + if hasattr(context, 'ufunc_db'): + self.fn = context.ufunc_db[ufunc].get(loop.ufunc_sig) + else: + self.fn = ufunc_db.get_ufunc_info(ufunc).get(loop.ufunc_sig) + self.inner_sig = _ufunc_loop_sig(loop.outputs, loop.inputs) if self.fn is None: diff --git a/numba/parfors/parfor.py b/numba/parfors/parfor.py index 0f23e69843a..c73d99905dd 100644 --- a/numba/parfors/parfor.py +++ b/numba/parfors/parfor.py @@ -1350,7 +1350,7 @@ class PreParforPass(object): implementations of numpy functions if available. 
""" def __init__(self, func_ir, typemap, calltypes, typingctx, options, - swapped={}): + swapped={}, replace_functions_map=replace_functions_map): self.func_ir = func_ir self.typemap = typemap self.calltypes = calltypes @@ -1358,6 +1358,7 @@ def __init__(self, func_ir, typemap, calltypes, typingctx, options, self.options = options # diagnostics self.swapped = swapped + self.replace_functions_map = replace_functions_map self.stats = { 'replaced_func': 0, 'replaced_dtype': 0, @@ -1394,7 +1395,7 @@ def _replace_parallel_functions(self, blocks): def replace_func(): func_def = get_definition(self.func_ir, expr.func) callname = find_callname(self.func_ir, expr) - repl_func = replace_functions_map.get(callname, None) + repl_func = self.replace_functions_map.get(callname, None) # Handle method on array type if (repl_func is None and len(callname) == 2 and From 28d8d06dbb20e2d41e21274779d1e0812a246496 Mon Sep 17 00:00:00 2001 From: Elena Totmenina Date: Fri, 4 Dec 2020 14:26:59 +0300 Subject: [PATCH 15/19] Refactoring for ParforDiagnostics dump function (numba#6545) Co-authored-by: etotmeni --- numba/parfors/parfor.py | 647 +++++++++++++++++++++------------------- 1 file changed, 342 insertions(+), 305 deletions(-) diff --git a/numba/parfors/parfor.py b/numba/parfors/parfor.py index c73d99905dd..b845b43bf2a 100644 --- a/numba/parfors/parfor.py +++ b/numba/parfors/parfor.py @@ -862,6 +862,340 @@ def sort_pf_by_line(self, pf_id, parfors_simple): pass return line + def get_parfors_simple(self, print_loop_search): + parfors_simple = dict() + + # print in line order, parfors loop id is based on discovery order + for pf in sorted(self.initial_parfors, key=lambda x: x.loc.line): + # use 0 here, the parfors are mutated by the time this routine + # is called, however, fusion appends the patterns so we can just + # pull in the first as a "before fusion" emulation + r_pattern = pf.patterns[0] + pattern = pf.patterns[0] + loc = pf.loc + if isinstance(pattern, tuple): + if pattern[0] == 
'prange': + if pattern[1] == 'internal': + replfn = '.'.join(reversed(list(pattern[2][0]))) + loc = pattern[2][1] + r_pattern = '%s %s' % (replfn, '(internal parallel version)') + elif pattern[1] == 'user': + r_pattern = "user defined prange" + elif pattern[1] == 'pndindex': + r_pattern = "internal pndindex" #FIXME: trace this! + else: + assert 0 + fmt = 'Parallel for-loop #%s: is produced from %s:\n %s\n \n' + if print_loop_search: + print_wrapped(fmt % (pf.id, loc, r_pattern)) + parfors_simple[pf.id] = (pf, loc, r_pattern) + return parfors_simple + + def get_all_lines(self, parfors_simple): + # ensure adjacency lists are the same size for both sets of info + # (nests and fusion may not traverse the same space, for + # convenience [] is used as a condition to halt recursion) + fadj, froots = self.compute_graph_info(self.fusion_info) + nadj, _nroots = self.compute_graph_info(self.nested_fusion_info) + + if len(fadj) > len(nadj): + lim = len(fadj) + tmp = nadj + else: + lim = len(nadj) + tmp = fadj + for x in range(len(tmp), lim): + tmp.append([]) + + # This computes the roots of true loop nests (i.e. loops containing + # loops opposed to just a loop that's a root). 
+ nroots = set() + if _nroots: + for r in _nroots: + if nadj[r] != []: + nroots.add(r) + all_roots = froots ^ nroots + + # This computes all the parfors at the top level that are either: + # - roots of loop fusion + # - roots of true loop nests + # it then combines these based on source line number for ease of + # producing output ordered in a manner similar to the code structure + froots_lines = {} + for x in froots: + line = self.sort_pf_by_line(x, parfors_simple) + froots_lines[line] = 'fuse', x, fadj + + nroots_lines = {} + for x in nroots: + line = self.sort_pf_by_line(x, parfors_simple) + nroots_lines[line] = 'nest', x, nadj + + all_lines = froots_lines.copy() + all_lines.update(nroots_lines) + return all_lines + + def source_listing(self, parfors_simple, purpose_str): + filename = self.func_ir.loc.filename + count = self.count_parfors() + func_name = self.func_ir.func_id.func + try: + lines = inspect.getsource(func_name).splitlines() + except OSError: # generated function + lines = None + if lines and parfors_simple: + src_width = max([len(x) for x in lines]) + map_line_to_pf = defaultdict(list) # parfors can alias lines + for k, v in parfors_simple.items(): + # TODO: do a better job of tracking parfors that are not in + # this file but are referred to, e.g. np.arange() + if parfors_simple[k][1].filename == filename: + match_line = self.sort_pf_by_line(k, parfors_simple) + map_line_to_pf[match_line].append(str(k)) + + max_pf_per_line = max([1] + [len(x) for x in map_line_to_pf.values()]) + width = src_width + (1 + max_pf_per_line * (len(str(count)) + 2)) + newlines = [] + newlines.append('\n') + newlines.append('Parallel loop listing for %s' % purpose_str) + newlines.append(width * '-' + '|loop #ID') + fmt = '{0:{1}}| {2}' + # why are these off by 1? 
+ lstart = max(0, self.func_ir.loc.line - 1) + for no, line in enumerate(lines, lstart): + pf_ids = map_line_to_pf.get(no, None) + if pf_ids is not None: + pfstr = '#' + ', '.join(pf_ids) + else: + pfstr = '' + stripped = line.strip('\n') + srclen = len(stripped) + if pf_ids: + l = fmt.format(width * '-', width, pfstr) + else: + l = fmt.format(width * ' ', width, pfstr) + newlines.append(stripped + l[srclen:]) + print('\n'.join(newlines)) + else: + print("No source available") + + def print_unoptimised(self, lines): + # This prints the unoptimised parfors state + sword = '+--' + fac = len(sword) + fadj, froots = self.compute_graph_info(self.fusion_info) + nadj, _nroots = self.compute_graph_info(self.nested_fusion_info) + + if len(fadj) > len(nadj): + lim = len(fadj) + tmp = nadj + else: + lim = len(nadj) + tmp = fadj + for x in range(len(tmp), lim): + tmp.append([]) + + def print_nest(fadj_, nadj_, theroot, reported, region_id): + def print_g(fadj_, nadj_, nroot, depth): + print_wrapped(fac * depth * ' ' + '%s%s %s' % (sword, nroot, '(parallel)')) + for k in nadj_[nroot]: + if nadj_[k] == []: + msg = [] + msg.append(fac * (depth + 1) * ' ' + '%s%s %s' % (sword, k, '(parallel)')) + if fadj_[k] != [] and k not in reported: + fused = self.reachable_nodes(fadj_, k) + for i in fused: + msg.append(fac * (depth + 1) * ' ' + '%s%s %s' % (sword, i, '(parallel)')) + reported.append(k) + print_wrapped('\n'.join(msg)) + else: + print_g(fadj_, nadj_, k, depth + 1) + + if nadj_[theroot] != []: + print_wrapped("Parallel region %s:" % region_id) + print_g(fadj_, nadj_, theroot, 0) + print("\n") + region_id = region_id + 1 + return region_id + + def print_fuse(ty, pf_id, adj, depth, region_id): + msg = [] + print_wrapped("Parallel region %s:" % region_id) + msg.append(fac * depth * ' ' + '%s%s %s' % (sword, pf_id, '(parallel)')) + if adj[pf_id] != []: + fused = sorted(self.reachable_nodes(adj, pf_id)) + for k in fused: + msg.append(fac * depth * ' ' + '%s%s %s' % (sword, k, 
'(parallel)')) + region_id = region_id + 1 + print_wrapped('\n'.join(msg)) + print("\n") + return region_id + + # Walk the parfors by src line and print optimised structure + region_id = 0 + reported = [] + for line, info in sorted(lines.items()): + opt_ty, pf_id, adj = info + if opt_ty == 'fuse': + if pf_id not in reported: + region_id = print_fuse('f', pf_id, adj, 0, region_id) + elif opt_ty == 'nest': + region_id = print_nest(fadj, nadj, pf_id, reported, region_id) + else: + assert 0 + + def print_optimised(self, lines): + # This prints the optimised output based on the transforms that + # occurred during loop fusion and rewriting of loop nests + sword = '+--' + fac = len(sword) + fadj, froots = self.compute_graph_info(self.fusion_info) + nadj, _nroots = self.compute_graph_info(self.nested_fusion_info) + + if len(fadj) > len(nadj): + lim = len(fadj) + tmp = nadj + else: + lim = len(nadj) + tmp = fadj + for x in range(len(tmp), lim): + tmp.append([]) + + summary = dict() + # region : {fused, serialized} + + def print_nest(fadj_, nadj_, theroot, reported, region_id): + def print_g(fadj_, nadj_, nroot, depth): + for k in nadj_[nroot]: + msg = fac * depth * ' ' + '%s%s %s' % (sword, k, '(serial') + if nadj_[k] == []: + fused = [] + if fadj_[k] != [] and k not in reported: + fused = sorted(self.reachable_nodes(fadj_, k)) + msg += ", fused with loop(s): " + msg += ', '.join([str(x) for x in fused]) + msg += ')' + reported.append(k) + print_wrapped(msg) + summary[region_id]['fused'] += len(fused) + else: + print_wrapped(msg + ')') + print_g(fadj_, nadj_, k, depth + 1) + summary[region_id]['serialized'] += 1 + + if nadj_[theroot] != []: + print_wrapped("Parallel region %s:" % region_id) + print_wrapped('%s%s %s' % (sword, theroot, '(parallel)')) + summary[region_id] = {'root': theroot, 'fused': 0, 'serialized': 0} + print_g(fadj_, nadj_, theroot, 1) + print("\n") + region_id = region_id + 1 + return region_id + + def print_fuse(ty, pf_id, adj, depth, region_id): + 
print_wrapped("Parallel region %s:" % region_id) + msg = fac * depth * ' ' + '%s%s %s' % (sword, pf_id, '(parallel') + fused = [] + if adj[pf_id] != []: + fused = sorted(self.reachable_nodes(adj, pf_id)) + msg += ", fused with loop(s): " + msg += ', '.join([str(x) for x in fused]) + + summary[region_id] = {'root': pf_id, 'fused': len(fused), 'serialized': 0} + msg += ')' + print_wrapped(msg) + print("\n") + region_id = region_id + 1 + return region_id + + # Walk the parfors by src line and print optimised structure + region_id = 0 + reported = [] + for line, info in sorted(lines.items()): + opt_ty, pf_id, adj = info + if opt_ty == 'fuse': + if pf_id not in reported: + region_id = print_fuse('f', pf_id, adj, 0, region_id) + elif opt_ty == 'nest': + region_id = print_nest(fadj, nadj, pf_id, reported, region_id) + else: + assert 0 + + # print the summary of the fuse/serialize rewrite + if summary: + for k, v in sorted(summary.items()): + msg = ('\n \nParallel region %s (loop #%s) had %s ' + 'loop(s) fused') + root = v['root'] + fused = v['fused'] + serialized = v['serialized'] + if serialized != 0: + msg += (' and %s loop(s) ' + 'serialized as part of the larger ' + 'parallel loop (#%s).') + print_wrapped(msg % (k, root, fused, serialized, root)) + else: + msg += '.' 
+ print_wrapped(msg % (k, root, fused)) + else: + print_wrapped("Parallel structure is already optimal.") + + def allocation_hoist(self): + found = False + print('Allocation hoisting:') + for pf_id, data in self.hoist_info.items(): + stmt = data.get('hoisted', []) + for inst in stmt: + if isinstance(inst.value, ir.Expr): + try: + attr = inst.value.attr + if attr == 'empty': + msg = ("The memory allocation derived from the " + "instruction at %s is hoisted out of the " + "parallel loop labelled #%s (it will be " + "performed before the loop is executed and " + "reused inside the loop):") + loc = inst.loc + print_wrapped(msg % (loc, pf_id)) + try: + path = os.path.relpath(loc.filename) + except ValueError: + path = os.path.abspath(loc.filename) + lines = linecache.getlines(path) + if lines and loc.line: + print_wrapped(" Allocation:: " + lines[0 if loc.line < 2 else loc.line - 1].strip()) + print_wrapped(" - numpy.empty() is used for the allocation.\n") + found = True + except (KeyError, AttributeError): + pass + if not found: + print_wrapped('No allocation hoisting found') + + def instruction_hoist(self): + print("") + print('Instruction hoisting:') + hoist_info_printed = False + if self.hoist_info: + for pf_id, data in self.hoist_info.items(): + hoisted = data.get('hoisted', None) + not_hoisted = data.get('not_hoisted', None) + if not hoisted and not not_hoisted: + print("loop #%s has nothing to hoist." 
% pf_id) + continue + + print("loop #%s:" % pf_id) + if hoisted: + print(" Has the following hoisted:") + [print(" %s" % y) for y in hoisted] + hoist_info_printed = True + if not_hoisted: + print(" Failed to hoist the following:") + [print(" %s: %s" % (y, x)) for x, y in not_hoisted] + hoist_info_printed = True + if not hoist_info_printed: + print_wrapped('No instruction hoisting found') + print_wrapped(80 * '-') + def dump(self, level=1): if not self.has_setup: raise RuntimeError("self.setup has not been called") @@ -919,33 +1253,7 @@ def dump(self, level=1): #----------- search section if print_loop_search: print_wrapped('Looking for parallel loops'.center(_termwidth, '-')) - - parfors_simple = dict() - - # print in line order, parfors loop id is based on discovery order - for pf in sorted(self.initial_parfors, key=lambda x: x.loc.line): - # use 0 here, the parfors are mutated by the time this routine - # is called, however, fusion appends the patterns so we can just - # pull in the first as a "before fusion" emulation - r_pattern = pf.patterns[0] - pattern = pf.patterns[0] - loc = pf.loc - if isinstance(pattern, tuple): - if pattern[0] == 'prange': - if pattern[1] == 'internal': - replfn = '.'.join(reversed(list(pattern[2][0]))) - loc = pattern[2][1] - r_pattern = '%s %s' % (replfn, '(internal parallel version)') - elif pattern[1] == 'user': - r_pattern = "user defined prange" - elif pattern[1] == 'pndindex': - r_pattern = "internal pndindex" #FIXME: trace this! 
- else: - assert 0 - fmt = 'Parallel for-loop #%s: is produced from %s:\n %s\n \n' - if print_loop_search: - print_wrapped(fmt % (pf.id, loc, r_pattern)) - parfors_simple[pf.id] = (pf, loc, r_pattern) + parfors_simple = self.get_parfors_simple(print_loop_search) count = self.count_parfors() if print_loop_search: @@ -965,46 +1273,7 @@ def dump(self, level=1): path = os.path.abspath(filename) if print_source_listing: - func_name = self.func_ir.func_id.func - try: - lines = inspect.getsource(func_name).splitlines() - except OSError: # generated function - lines = None - if lines: - src_width = max([len(x) for x in lines]) - map_line_to_pf = defaultdict(list) # parfors can alias lines - for k, v in parfors_simple.items(): - # TODO: do a better job of tracking parfors that are not in - # this file but are referred to, e.g. np.arange() - if parfors_simple[k][1].filename == filename: - match_line = self.sort_pf_by_line(k, parfors_simple) - map_line_to_pf[match_line].append(str(k)) - - max_pf_per_line = max([1] + [len(x) for x in map_line_to_pf.values()]) - width = src_width + (1 + max_pf_per_line * (len(str(count)) + 2)) - newlines = [] - newlines.append('\n') - newlines.append('Parallel loop listing for %s' % purpose_str) - newlines.append(width * '-' + '|loop #ID') - fmt = '{0:{1}}| {2}' - # why are these off by 1? 
- lstart = max(0, self.func_ir.loc.line - 1) - for no, line in enumerate(lines, lstart): - pf_ids = map_line_to_pf.get(no, None) - if pf_ids is not None: - pfstr = '#' + ', '.join(pf_ids) - else: - pfstr = '' - stripped = line.strip('\n') - srclen = len(stripped) - if pf_ids: - l = fmt.format(width * '-', width, pfstr) - else: - l = fmt.format(width * ' ', width, pfstr) - newlines.append(stripped + l[srclen:]) - print('\n'.join(newlines)) - else: - print("No source available") + self.source_listing(parfors_simple, purpose_str) #---------- these are used a lot here on in sword = '+--' @@ -1075,198 +1344,16 @@ def print_g(adj, root, depth): print_wrapped("") #---------- compute various properties and orderings in the data for subsequent use - - # ensure adjacency lists are the same size for both sets of info - # (nests and fusion may not traverse the same space, for - # convenience [] is used as a condition to halt recursion) - fadj, froots = self.compute_graph_info(self.fusion_info) - nadj, _nroots = self.compute_graph_info(self.nested_fusion_info) - - if len(fadj) > len(nadj): - lim = len(fadj) - tmp = nadj - else: - lim = len(nadj) - tmp = fadj - for x in range(len(tmp), lim): - tmp.append([]) - - # This computes the roots of true loop nests (i.e. loops containing - # loops opposed to just a loop that's a root). 
- nroots = set() - if _nroots: - for r in _nroots: - if nadj[r] != []: - nroots.add(r) - all_roots = froots ^ nroots - - # This computes all the parfors at the top level that are either: - # - roots of loop fusion - # - roots of true loop nests - # it then combines these based on source line number for ease of - # producing output ordered in a manner similar to the code structure - froots_lines = {} - for x in froots: - line = self.sort_pf_by_line(x, parfors_simple) - froots_lines[line] = 'fuse', x, fadj - - nroots_lines = {} - for x in nroots: - line = self.sort_pf_by_line(x, parfors_simple) - nroots_lines[line] = 'nest', x, nadj - - all_lines = froots_lines.copy() - all_lines.update(nroots_lines) - - # nroots, froots, nadj and fadj are all set up correctly - # define some print functions - - def print_unoptimised(lines): - # This prints the unoptimised parfors state - - fac = len(sword) - - def print_nest(fadj_, nadj_, theroot, reported, region_id): - def print_g(fadj_, nadj_, nroot, depth): - print_wrapped(fac * depth * ' ' + '%s%s %s' % (sword, nroot, '(parallel)')) - for k in nadj_[nroot]: - if nadj_[k] == []: - msg = [] - msg.append(fac * (depth + 1) * ' ' + '%s%s %s' % (sword, k, '(parallel)')) - if fadj_[k] != [] and k not in reported: - fused = self.reachable_nodes(fadj_, k) - for i in fused: - msg.append(fac * (depth + 1) * ' ' + '%s%s %s' % (sword, i, '(parallel)')) - reported.append(k) - print_wrapped('\n'.join(msg)) - else: - print_g(fadj_, nadj_, k, depth + 1) - - if nadj_[theroot] != []: - print_wrapped("Parallel region %s:" % region_id) - print_g(fadj_, nadj_, theroot, 0) - print("\n") - region_id = region_id + 1 - return region_id - - def print_fuse(ty, pf_id, adj, depth, region_id): - msg = [] - print_wrapped("Parallel region %s:" % region_id) - msg.append(fac * depth * ' ' + '%s%s %s' % (sword, pf_id, '(parallel)')) - if adj[pf_id] != []: - fused = sorted(self.reachable_nodes(adj, pf_id)) - for k in fused: - msg.append(fac * depth * ' ' + '%s%s 
%s' % (sword, k, '(parallel)')) - region_id = region_id + 1 - print_wrapped('\n'.join(msg)) - print("\n") - return region_id - - # Walk the parfors by src line and print optimised structure - region_id = 0 - reported = [] - for line, info in sorted(lines.items()): - opt_ty, pf_id, adj = info - if opt_ty == 'fuse': - if pf_id not in reported: - region_id = print_fuse('f', pf_id, adj, 0, region_id) - elif opt_ty == 'nest': - region_id = print_nest(fadj, nadj, pf_id, reported, region_id) - else: - assert 0 - - def print_optimised(lines): - # This prints the optimised output based on the transforms that - # occurred during loop fusion and rewriting of loop nests - fac = len(sword) - - summary = dict() - # region : {fused, serialized} - - def print_nest(fadj_, nadj_, theroot, reported, region_id): - def print_g(fadj_, nadj_, nroot, depth): - for k in nadj_[nroot]: - msg = fac * depth * ' ' + '%s%s %s' % (sword, k, '(serial') - if nadj_[k] == []: - fused = [] - if fadj_[k] != [] and k not in reported: - fused = sorted(self.reachable_nodes(fadj_, k)) - msg += ", fused with loop(s): " - msg += ', '.join([str(x) for x in fused]) - msg += ')' - reported.append(k) - print_wrapped(msg) - summary[region_id]['fused'] += len(fused) - else: - print_wrapped(msg + ')') - print_g(fadj_, nadj_, k, depth + 1) - summary[region_id]['serialized'] += 1 - - if nadj_[theroot] != []: - print_wrapped("Parallel region %s:" % region_id) - print_wrapped('%s%s %s' % (sword, theroot, '(parallel)')) - summary[region_id] = {'root': theroot, 'fused': 0, 'serialized': 0} - print_g(fadj_, nadj_, theroot, 1) - print("\n") - region_id = region_id + 1 - return region_id - - def print_fuse(ty, pf_id, adj, depth, region_id): - print_wrapped("Parallel region %s:" % region_id) - msg = fac * depth * ' ' + '%s%s %s' % (sword, pf_id, '(parallel') - fused = [] - if adj[pf_id] != []: - fused = sorted(self.reachable_nodes(adj, pf_id)) - msg += ", fused with loop(s): " - msg += ', '.join([str(x) for x in fused]) - - 
summary[region_id] = {'root': pf_id, 'fused': len(fused), 'serialized': 0} - msg += ')' - print_wrapped(msg) - print("\n") - region_id = region_id + 1 - return region_id - - # Walk the parfors by src line and print optimised structure - region_id = 0 - reported = [] - for line, info in sorted(lines.items()): - opt_ty, pf_id, adj = info - if opt_ty == 'fuse': - if pf_id not in reported: - region_id = print_fuse('f', pf_id, adj, 0, region_id) - elif opt_ty == 'nest': - region_id = print_nest(fadj, nadj, pf_id, reported, region_id) - else: - assert 0 - - # print the summary of the fuse/serialize rewrite - if summary: - for k, v in sorted(summary.items()): - msg = ('\n \nParallel region %s (loop #%s) had %s ' - 'loop(s) fused') - root = v['root'] - fused = v['fused'] - serialized = v['serialized'] - if serialized != 0: - msg += (' and %s loop(s) ' - 'serialized as part of the larger ' - 'parallel loop (#%s).') - print_wrapped(msg % (k, root, fused, serialized, root)) - else: - msg += '.' - print_wrapped(msg % (k, root, fused)) - else: - print_wrapped("Parallel structure is already optimal.") + all_lines = self.get_all_lines(parfors_simple) if print_pre_optimised: print(' Before Optimisation '.center(_termwidth,'-')) - print_unoptimised(all_lines) + self.print_unoptimised(all_lines) print(_termwidth * '-') if print_post_optimised: print(' After Optimisation '.center(_termwidth,'-')) - print_optimised(all_lines) + self.print_optimised(all_lines) print(_termwidth * '-') print_wrapped("") print_wrapped(_termwidth * '-') @@ -1277,60 +1364,10 @@ def print_fuse(ty, pf_id, adj, depth, region_id): print_wrapped('Loop invariant code motion'.center(80, '-')) if print_allocation_hoist: - found = False - print('Allocation hoisting:') - for pf_id, data in self.hoist_info.items(): - stmt = data.get('hoisted', []) - for inst in stmt: - if isinstance(inst.value, ir.Expr): - try: - attr = inst.value.attr - if attr == 'empty': - msg = ("The memory allocation derived from the " - 
"instruction at %s is hoisted out of the " - "parallel loop labelled #%s (it will be " - "performed before the loop is executed and " - "reused inside the loop):") - loc = inst.loc - print_wrapped(msg % (loc, pf_id)) - try: - path = os.path.relpath(loc.filename) - except ValueError: - path = os.path.abspath(loc.filename) - lines = linecache.getlines(path) - if lines and loc.line: - print_wrapped(" Allocation:: " + lines[0 if loc.line < 2 else loc.line - 1].strip()) - print_wrapped(" - numpy.empty() is used for the allocation.\n") - found = True - except (KeyError, AttributeError): - pass - if not found: - print_wrapped('No allocation hoisting found') - if print_instruction_hoist: - print("") - print('Instruction hoisting:') - hoist_info_printed = False - if self.hoist_info: - for pf_id, data in self.hoist_info.items(): - hoisted = data.get('hoisted', None) - not_hoisted = data.get('not_hoisted', None) - if not hoisted and not not_hoisted: - print("loop #%s has nothing to hoist." % pf_id) - continue - - print("loop #%s:" % pf_id) - if hoisted: - print(" Has the following hoisted:") - [print(" %s" % y) for y in hoisted] - hoist_info_printed = True - if not_hoisted: - print(" Failed to hoist the following:") - [print(" %s: %s" % (y, x)) for x, y in not_hoisted] - hoist_info_printed = True - if not hoist_info_printed: - print_wrapped('No instruction hoisting found') - print_wrapped(80 * '-') + self.allocation_hoist() + if print_instruction_hoist: + self.instruction_hoist() else: # there are no parfors print_wrapped('Function %s, %s, has no parallel for-loops.'.format(name, line)) From cd2896c64c9e81c28225953cc0c2901875b478b3 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Wed, 9 Dec 2020 13:06:39 +0300 Subject: [PATCH 16/19] Initial support for Numpy subclasses (numba#6148) Co-authored-by: Todd A. 
Anderson --- numba/_typeof.c | 13 ++++- numba/core/extending.py | 2 +- numba/core/ir_utils.py | 12 ++-- numba/core/pythonapi.py | 9 ++- numba/core/runtime/_nrt_python.c | 36 +++++++++++- numba/core/runtime/_nrt_pythonmod.c | 1 + numba/core/runtime/nrt.c | 90 +++++++++++++++++++++++------ numba/core/runtime/nrt.h | 24 +++++++- numba/core/runtime/nrt_external.h | 16 +++++ numba/core/types/npytypes.py | 4 +- numba/core/typing/npydecl.py | 21 +++++-- numba/np/arrayobj.py | 11 +++- 12 files changed, 202 insertions(+), 37 deletions(-) diff --git a/numba/_typeof.c b/numba/_typeof.c index ffe0e3a3c58..9b259164800 100644 --- a/numba/_typeof.c +++ b/numba/_typeof.c @@ -768,6 +768,7 @@ int typeof_typecode(PyObject *dispatcher, PyObject *val) { PyTypeObject *tyobj = Py_TYPE(val); + int no_subtype_attr; /* This needs to be kept in sync with Dispatcher.typeof_pyval(), * otherwise funny things may happen. */ @@ -794,9 +795,19 @@ typeof_typecode(PyObject *dispatcher, PyObject *val) return typecode_arrayscalar(dispatcher, val); } /* Array handling */ - else if (PyType_IsSubtype(tyobj, &PyArray_Type)) { + else if (tyobj == &PyArray_Type) { return typecode_ndarray(dispatcher, (PyArrayObject*)val); } + /* Subtypes of Array handling */ + else if (PyType_IsSubtype(tyobj, &PyArray_Type)) { + /* If the class has an attribute named __numba_no_subtype_ndarray then + don't treat it as a normal variant of a Numpy ndarray but as its own + separate type. 
*/ + no_subtype_attr = PyObject_HasAttrString(val, "__numba_no_subtype_ndarray__"); + if (!no_subtype_attr) { + return typecode_ndarray(dispatcher, (PyArrayObject*)val); + } + } return typecode_using_fingerprint(dispatcher, val); } diff --git a/numba/core/extending.py b/numba/core/extending.py index 8d8d8525e21..09373708b48 100644 --- a/numba/core/extending.py +++ b/numba/core/extending.py @@ -14,7 +14,7 @@ lower_setattr, lower_setattr_generic, lower_cast) # noqa: F401 from numba.core.datamodel import models # noqa: F401 from numba.core.datamodel import register_default as register_model # noqa: F401, E501 -from numba.core.pythonapi import box, unbox, reflect, NativeValue # noqa: F401 +from numba.core.pythonapi import box, unbox, reflect, NativeValue, allocator # noqa: F401 from numba._helperlib import _import_cython_function # noqa: F401 from numba.core.serialize import ReduceMixin diff --git a/numba/core/ir_utils.py b/numba/core/ir_utils.py index 1d58c5c8b5b..9ffdfb16b07 100644 --- a/numba/core/ir_utils.py +++ b/numba/core/ir_utils.py @@ -64,6 +64,8 @@ def mk_alloc(typemap, calltypes, lhs, size_var, dtype, scope, loc): out = [] ndims = 1 size_typ = types.intp + # Get the type of the array being allocated. + arr_typ = typemap[lhs.name] if isinstance(size_var, tuple): if len(size_var) == 1: size_var = size_var[0] @@ -108,11 +110,13 @@ def mk_alloc(typemap, calltypes, lhs, size_var, dtype, scope, loc): typ_var_assign = ir.Assign(np_typ_getattr, typ_var, loc) alloc_call = ir.Expr.call(attr_var, [size_var, typ_var], (), loc) if calltypes: - calltypes[alloc_call] = typemap[attr_var.name].get_call_type( + cac = typemap[attr_var.name].get_call_type( typing.Context(), [size_typ, types.functions.NumberClass(dtype)], {}) - # signature( - # types.npytypes.Array(dtype, ndims, 'C'), size_typ, - # types.functions.NumberClass(dtype)) + # By default, all calls to "empty" are typed as returning a standard + # Numpy ndarray. 
If we are allocating a ndarray subclass here then + # just change the return type to be that of the subclass. + cac._return_type = arr_typ + calltypes[alloc_call] = cac alloc_assign = ir.Assign(alloc_call, lhs, loc) out.extend([g_np_assign, attr_assign, typ_var_assign, alloc_assign]) diff --git a/numba/core/pythonapi.py b/numba/core/pythonapi.py index 7901e761d9f..f84ad7b2ce1 100644 --- a/numba/core/pythonapi.py +++ b/numba/core/pythonapi.py @@ -45,10 +45,13 @@ def lookup(self, typeclass, default=None): _boxers = _Registry() _unboxers = _Registry() _reflectors = _Registry() +# Registry of special allocators for types. +_allocators = _Registry() box = _boxers.register unbox = _unboxers.register reflect = _reflectors.register +allocator = _allocators.register class _BoxContext(namedtuple("_BoxContext", ("context", "builder", "pyapi", "env_manager"))): @@ -1186,8 +1189,11 @@ def nrt_adapt_ndarray_to_python(self, aryty, ary, dtypeptr): assert self.context.enable_nrt, "NRT required" intty = ir.IntType(32) + # Embed the Python type of the array (maybe subclass) in the LLVM. 
+ serial_aryty_pytype = self.unserialize(self.serialize_object(aryty.py_type)) + fnty = Type.function(self.pyobj, - [self.voidptr, intty, intty, self.pyobj]) + [self.voidptr, self.pyobj, intty, intty, self.pyobj]) fn = self._get_function(fnty, name="NRT_adapt_ndarray_to_python") fn.args[0].add_attribute(lc.ATTR_NO_CAPTURE) @@ -1197,6 +1203,7 @@ def nrt_adapt_ndarray_to_python(self, aryty, ary, dtypeptr): aryptr = cgutils.alloca_once_value(self.builder, ary) return self.builder.call(fn, [self.builder.bitcast(aryptr, self.voidptr), + serial_aryty_pytype, ndim, writable, dtypeptr]) def nrt_meminfo_new_from_pyobject(self, data, pyobj): diff --git a/numba/core/runtime/_nrt_python.c b/numba/core/runtime/_nrt_python.c index 33620fd4f1a..9012498292a 100644 --- a/numba/core/runtime/_nrt_python.c +++ b/numba/core/runtime/_nrt_python.c @@ -55,6 +55,8 @@ int MemInfo_init(MemInfoObject *self, PyObject *args, PyObject *kwds) { return -1; } raw_ptr = PyLong_AsVoidPtr(raw_ptr_obj); + NRT_Debug(nrt_debug_print("MemInfo_init self=%p raw_ptr=%p\n", self, raw_ptr)); + if(PyErr_Occurred()) return -1; self->meminfo = (NRT_MemInfo *)raw_ptr; assert (NRT_MemInfo_refcount(self->meminfo) > 0 && "0 refcount"); @@ -109,6 +111,26 @@ MemInfo_get_refcount(MemInfoObject *self, void *closure) { return PyLong_FromSize_t(refct); } +static +PyObject* +MemInfo_get_external_allocator(MemInfoObject *self, void *closure) { + void *p = NRT_MemInfo_external_allocator(self->meminfo); + return PyLong_FromVoidPtr(p); +} + +static +PyObject* +MemInfo_get_parent(MemInfoObject *self, void *closure) { + void *p = NRT_MemInfo_parent(self->meminfo); + if (p) { + Py_INCREF(p); + return (PyObject*)p; + } else { + Py_INCREF(Py_None); + return Py_None; + } +} + static void MemInfo_dealloc(MemInfoObject *self) { @@ -136,6 +158,13 @@ static PyGetSetDef MemInfo_getsets[] = { (getter)MemInfo_get_refcount, NULL, "Get the refcount", NULL}, + {"external_allocator", + (getter)MemInfo_get_external_allocator, NULL, + "Get the 
external allocator", + NULL}, + {"parent", + (getter)MemInfo_get_parent, NULL, + NULL}, {NULL} /* Sentinel */ }; @@ -286,7 +315,7 @@ PyObject* try_to_return_parent(arystruct_t *arystruct, int ndim, } NUMBA_EXPORT_FUNC(PyObject *) -NRT_adapt_ndarray_to_python(arystruct_t* arystruct, int ndim, +NRT_adapt_ndarray_to_python(arystruct_t* arystruct, PyTypeObject *retty, int ndim, int writeable, PyArray_Descr *descr) { PyArrayObject *array; @@ -324,10 +353,13 @@ NRT_adapt_ndarray_to_python(arystruct_t* arystruct, int ndim, args = PyTuple_New(1); /* SETITEM steals reference */ PyTuple_SET_ITEM(args, 0, PyLong_FromVoidPtr(arystruct->meminfo)); + NRT_Debug(nrt_debug_print("NRT_adapt_ndarray_to_python arystruct->meminfo=%p\n", arystruct->meminfo)); /* Note: MemInfo_init() does not incref. This function steals the * NRT reference. */ + NRT_Debug(nrt_debug_print("NRT_adapt_ndarray_to_python created MemInfo=%p\n", miobj)); if (MemInfo_init(miobj, args, NULL)) { + NRT_Debug(nrt_debug_print("MemInfo_init returned 0.\n")); return NULL; } Py_DECREF(args); @@ -336,7 +368,7 @@ NRT_adapt_ndarray_to_python(arystruct_t* arystruct, int ndim, shape = arystruct->shape_and_strides; strides = shape + ndim; Py_INCREF((PyObject *) descr); - array = (PyArrayObject *) PyArray_NewFromDescr(&PyArray_Type, descr, ndim, + array = (PyArrayObject *) PyArray_NewFromDescr(retty, descr, ndim, shape, strides, arystruct->data, flags, (PyObject *) miobj); diff --git a/numba/core/runtime/_nrt_pythonmod.c b/numba/core/runtime/_nrt_pythonmod.c index 31e1155fd9f..d1300ee8e9a 100644 --- a/numba/core/runtime/_nrt_pythonmod.c +++ b/numba/core/runtime/_nrt_pythonmod.c @@ -163,6 +163,7 @@ declmethod(MemInfo_alloc); declmethod(MemInfo_alloc_safe); declmethod(MemInfo_alloc_aligned); declmethod(MemInfo_alloc_safe_aligned); +declmethod(MemInfo_alloc_safe_aligned_external); declmethod(MemInfo_alloc_dtor_safe); declmethod(MemInfo_call_dtor); declmethod(MemInfo_new_varsize); diff --git a/numba/core/runtime/nrt.c 
b/numba/core/runtime/nrt.c index 534681d5417..fe63a691537 100644 --- a/numba/core/runtime/nrt.c +++ b/numba/core/runtime/nrt.c @@ -19,6 +19,7 @@ struct MemInfo { void *dtor_info; void *data; size_t size; /* only used for NRT allocated memory */ + NRT_ExternalAllocator *external_allocator; }; @@ -170,13 +171,16 @@ void NRT_MemSys_set_atomic_cas_stub(void) { */ void NRT_MemInfo_init(NRT_MemInfo *mi,void *data, size_t size, - NRT_dtor_function dtor, void *dtor_info) + NRT_dtor_function dtor, void *dtor_info, + NRT_ExternalAllocator *external_allocator) { mi->refct = 1; /* starts with 1 refct */ mi->dtor = dtor; mi->dtor_info = dtor_info; mi->data = data; mi->size = size; + mi->external_allocator = external_allocator; + NRT_Debug(nrt_debug_print("NRT_MemInfo_init mi=%p external_allocator=%p\n", mi, external_allocator)); /* Update stats */ TheMSys.atomic_inc(&TheMSys.stats_mi_alloc); } @@ -185,7 +189,8 @@ NRT_MemInfo *NRT_MemInfo_new(void *data, size_t size, NRT_dtor_function dtor, void *dtor_info) { NRT_MemInfo *mi = NRT_Allocate(sizeof(NRT_MemInfo)); - NRT_MemInfo_init(mi, data, size, dtor, dtor_info); + NRT_Debug(nrt_debug_print("NRT_MemInfo_new mi=%p\n", mi)); + NRT_MemInfo_init(mi, data, size, dtor, dtor_info, NULL); return mi; } @@ -206,9 +211,10 @@ void nrt_internal_dtor_safe(void *ptr, size_t size, void *info) { } static -void *nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo **mi_out) { +void *nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo **mi_out, NRT_ExternalAllocator *allocator) { NRT_MemInfo *mi; - char *base = NRT_Allocate(sizeof(NRT_MemInfo) + size); + NRT_Debug(nrt_debug_print("nrt_allocate_meminfo_and_data %p\n", allocator)); + char *base = NRT_Allocate_External(sizeof(NRT_MemInfo) + size, allocator); mi = (NRT_MemInfo *) base; *mi_out = mi; return base + sizeof(NRT_MemInfo); @@ -230,9 +236,17 @@ void nrt_internal_custom_dtor_safe(void *ptr, size_t size, void *info) { NRT_MemInfo *NRT_MemInfo_alloc(size_t size) { NRT_MemInfo *mi; - void 
*data = nrt_allocate_meminfo_and_data(size, &mi); + void *data = nrt_allocate_meminfo_and_data(size, &mi, NULL); NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc %p\n", data)); - NRT_MemInfo_init(mi, data, size, NULL, NULL); + NRT_MemInfo_init(mi, data, size, NULL, NULL, NULL); + return mi; +} + +NRT_MemInfo *NRT_MemInfo_alloc_external(size_t size, NRT_ExternalAllocator *allocator) { + NRT_MemInfo *mi; + void *data = nrt_allocate_meminfo_and_data(size, &mi, allocator); + NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc %p\n", data)); + NRT_MemInfo_init(mi, data, size, NULL, NULL, allocator); return mi; } @@ -242,22 +256,23 @@ NRT_MemInfo *NRT_MemInfo_alloc_safe(size_t size) { NRT_MemInfo* NRT_MemInfo_alloc_dtor_safe(size_t size, NRT_dtor_function dtor) { NRT_MemInfo *mi; - void *data = nrt_allocate_meminfo_and_data(size, &mi); + void *data = nrt_allocate_meminfo_and_data(size, &mi, NULL); /* Only fill up a couple cachelines with debug markers, to minimize overhead. */ memset(data, 0xCB, MIN(size, 256)); NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc_dtor_safe %p %zu\n", data, size)); - NRT_MemInfo_init(mi, data, size, nrt_internal_custom_dtor_safe, dtor); + NRT_MemInfo_init(mi, data, size, nrt_internal_custom_dtor_safe, dtor, NULL); return mi; } static void *nrt_allocate_meminfo_and_data_align(size_t size, unsigned align, - NRT_MemInfo **mi) + NRT_MemInfo **mi, NRT_ExternalAllocator *allocator) { size_t offset, intptr, remainder; - char *base = nrt_allocate_meminfo_and_data(size + 2 * align, mi); + NRT_Debug(nrt_debug_print("nrt_allocate_meminfo_and_data_align %p\n", allocator)); + char *base = nrt_allocate_meminfo_and_data(size + 2 * align, mi, allocator); intptr = (size_t) base; /* See if we are aligned */ remainder = intptr % align; @@ -271,26 +286,48 @@ void *nrt_allocate_meminfo_and_data_align(size_t size, unsigned align, NRT_MemInfo *NRT_MemInfo_alloc_aligned(size_t size, unsigned align) { NRT_MemInfo *mi; - void *data = nrt_allocate_meminfo_and_data_align(size, align, 
&mi); + void *data = nrt_allocate_meminfo_and_data_align(size, align, &mi, NULL); NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc_aligned %p\n", data)); - NRT_MemInfo_init(mi, data, size, NULL, NULL); + NRT_MemInfo_init(mi, data, size, NULL, NULL, NULL); return mi; } NRT_MemInfo *NRT_MemInfo_alloc_safe_aligned(size_t size, unsigned align) { NRT_MemInfo *mi; - void *data = nrt_allocate_meminfo_and_data_align(size, align, &mi); + void *data = nrt_allocate_meminfo_and_data_align(size, align, &mi, NULL); /* Only fill up a couple cachelines with debug markers, to minimize overhead. */ memset(data, 0xCB, MIN(size, 256)); NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc_safe_aligned %p %zu\n", data, size)); - NRT_MemInfo_init(mi, data, size, nrt_internal_dtor_safe, (void*)size); + NRT_MemInfo_init(mi, data, size, nrt_internal_dtor_safe, (void*)size, NULL); return mi; } +NRT_MemInfo *NRT_MemInfo_alloc_safe_aligned_external(size_t size, unsigned align, NRT_ExternalAllocator *allocator) { + NRT_MemInfo *mi; + NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc_safe_aligned_external %p\n", allocator)); + void *data = nrt_allocate_meminfo_and_data_align(size, align, &mi, allocator); + /* Only fill up a couple cachelines with debug markers, to minimize + overhead. 
*/ + memset(data, 0xCB, MIN(size, 256)); + NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc_safe_aligned %p %zu\n", + data, size)); + NRT_MemInfo_init(mi, data, size, nrt_internal_dtor_safe, (void*)size, allocator); + return mi; +} + +void NRT_dealloc(NRT_MemInfo *mi) { + NRT_Debug(nrt_debug_print("NRT_dealloc meminfo: %p external_allocator: %p\n", mi, mi->external_allocator)); + if (mi->external_allocator) { + mi->external_allocator->free(mi, mi->external_allocator->opaque_data); + } else { + NRT_Free(mi); + } +} + void NRT_MemInfo_destroy(NRT_MemInfo *mi) { - NRT_Free(mi); + NRT_dealloc(mi); TheMSys.atomic_inc(&TheMSys.stats_mi_free); } @@ -328,6 +365,14 @@ size_t NRT_MemInfo_size(NRT_MemInfo* mi) { return mi->size; } +void * NRT_MemInfo_external_allocator(NRT_MemInfo *mi) { + NRT_Debug(nrt_debug_print("NRT_MemInfo_external_allocator meminfo: %p external_allocator: %p\n", mi, mi->external_allocator)); + return mi->external_allocator; +} + +void *NRT_MemInfo_parent(NRT_MemInfo *mi) { + return mi->dtor_info; +} void NRT_MemInfo_dump(NRT_MemInfo *mi, FILE *out) { fprintf(out, "MemInfo %p refcount %zu\n", mi, mi->refct); @@ -414,8 +459,18 @@ void NRT_MemInfo_varsize_free(NRT_MemInfo *mi, void *ptr) */ void* NRT_Allocate(size_t size) { - void *ptr = TheMSys.allocator.malloc(size); - NRT_Debug(nrt_debug_print("NRT_Allocate bytes=%zu ptr=%p\n", size, ptr)); + return NRT_Allocate_External(size, NULL); +} + +void* NRT_Allocate_External(size_t size, NRT_ExternalAllocator *allocator) { + void *ptr; + if (allocator) { + ptr = allocator->malloc(size, allocator->opaque_data); + NRT_Debug(nrt_debug_print("NRT_Allocate custom bytes=%zu ptr=%p\n", size, ptr)); + } else { + ptr = TheMSys.allocator.malloc(size); + NRT_Debug(nrt_debug_print("NRT_Allocate bytes=%zu ptr=%p\n", size, ptr)); + } TheMSys.atomic_inc(&TheMSys.stats_alloc); return ptr; } @@ -460,6 +515,7 @@ NRT_MemInfo* nrt_manage_memory(void *data, NRT_managed_dtor dtor) { static const NRT_api_functions nrt_functions_table = { 
NRT_MemInfo_alloc, + NRT_MemInfo_alloc_external, nrt_manage_memory, NRT_MemInfo_acquire, NRT_MemInfo_release, diff --git a/numba/core/runtime/nrt.h b/numba/core/runtime/nrt.h index 3c74dc58f58..9fb23532964 100644 --- a/numba/core/runtime/nrt.h +++ b/numba/core/runtime/nrt.h @@ -15,13 +15,14 @@ All functions described here are threadsafe. /* Debugging facilities - enabled at compile-time */ /* #undef NDEBUG */ #if 0 -# define NRT_Debug(X) X +# define NRT_Debug(X) {X; fflush(stdout); } #else # define NRT_Debug(X) if (0) { X; } #endif /* TypeDefs */ typedef void (*NRT_dtor_function)(void *ptr, size_t size, void *info); +typedef void (*NRT_dealloc_func)(void *ptr, void *dealloc_info); typedef size_t (*NRT_atomic_inc_dec_func)(size_t *ptr); typedef int (*NRT_atomic_cas_func)(void * volatile *ptr, void *cmp, void *repl, void **oldptr); @@ -32,7 +33,6 @@ typedef void *(*NRT_malloc_func)(size_t size); typedef void *(*NRT_realloc_func)(void *ptr, size_t new_size); typedef void (*NRT_free_func)(void *ptr); - /* Memory System API */ /* Initialize the memory system */ @@ -101,7 +101,8 @@ NRT_MemInfo* NRT_MemInfo_new(void *data, size_t size, VISIBILITY_HIDDEN void NRT_MemInfo_init(NRT_MemInfo *mi, void *data, size_t size, - NRT_dtor_function dtor, void *dtor_info); + NRT_dtor_function dtor, void *dtor_info, + NRT_ExternalAllocator *external_allocator); /* * Returns the refcount of a MemInfo or (size_t)-1 if error. @@ -116,6 +117,8 @@ size_t NRT_MemInfo_refcount(NRT_MemInfo *mi); VISIBILITY_HIDDEN NRT_MemInfo *NRT_MemInfo_alloc(size_t size); +NRT_MemInfo *NRT_MemInfo_alloc_external(size_t size, NRT_ExternalAllocator *allocator); + /* * The "safe" NRT_MemInfo_alloc performs additional steps to help debug * memory errors. 
@@ -141,6 +144,8 @@ NRT_MemInfo *NRT_MemInfo_alloc_aligned(size_t size, unsigned align); VISIBILITY_HIDDEN NRT_MemInfo *NRT_MemInfo_alloc_safe_aligned(size_t size, unsigned align); +NRT_MemInfo *NRT_MemInfo_alloc_safe_aligned_external(size_t size, unsigned align, NRT_ExternalAllocator *allocator); + /* * Internal API. * Release a MemInfo. Calls NRT_MemSys_insert_meminfo. @@ -179,6 +184,18 @@ void* NRT_MemInfo_data(NRT_MemInfo* mi); VISIBILITY_HIDDEN size_t NRT_MemInfo_size(NRT_MemInfo* mi); +/* + * Returns the external allocator + */ +VISIBILITY_HIDDEN +void* NRT_MemInfo_external_allocator(NRT_MemInfo* mi); + +/* + * Returns the parent MemInfo + */ +VISIBILITY_HIDDEN +void* NRT_MemInfo_parent(NRT_MemInfo* mi); + /* * NRT API for resizable buffers. @@ -207,6 +224,7 @@ void NRT_MemInfo_dump(NRT_MemInfo *mi, FILE *out); * Allocate memory of `size` bytes. */ VISIBILITY_HIDDEN void* NRT_Allocate(size_t size); +VISIBILITY_HIDDEN void* NRT_Allocate_External(size_t size, NRT_ExternalAllocator *allocator); /* * Deallocate memory pointed by `ptr`. diff --git a/numba/core/runtime/nrt_external.h b/numba/core/runtime/nrt_external.h index 391b6fa1b0e..a4835c36f67 100644 --- a/numba/core/runtime/nrt_external.h +++ b/numba/core/runtime/nrt_external.h @@ -7,6 +7,18 @@ typedef struct MemInfo NRT_MemInfo; typedef void NRT_managed_dtor(void *data); +typedef void *(*NRT_external_malloc_func)(size_t size, void *opaque_data); +typedef void *(*NRT_external_realloc_func)(void *ptr, size_t new_size, void *opaque_data); +typedef void (*NRT_external_free_func)(void *ptr, void *opaque_data); + +struct ExternalMemAllocator { + NRT_external_malloc_func malloc; + NRT_external_realloc_func realloc; + NRT_external_free_func free; + void *opaque_data; +}; + +typedef struct ExternalMemAllocator NRT_ExternalAllocator; typedef struct { /* Methods to create MemInfos. @@ -21,6 +33,10 @@ typedef struct { Returning a new reference. 
*/ NRT_MemInfo* (*allocate)(size_t nbytes); + /* Allocate memory using an external allocator but still using Numba's MemInfo. + + */ + NRT_MemInfo* (*allocate_external)(size_t nbytes, NRT_ExternalAllocator *allocator); /* Convert externally allocated memory into a MemInfo. diff --git a/numba/core/types/npytypes.py b/numba/core/types/npytypes.py index 6f6307c5526..3c2191ca23e 100644 --- a/numba/core/types/npytypes.py +++ b/numba/core/types/npytypes.py @@ -8,6 +8,7 @@ from numba.core import utils from .misc import UnicodeType from .containers import Bytes +import numpy as np class CharSeq(Type): """ @@ -394,8 +395,9 @@ class Array(Buffer): Type class for Numpy arrays. """ - def __init__(self, dtype, ndim, layout, readonly=False, name=None, + def __init__(self, dtype, ndim, layout, py_type=np.ndarray, readonly=False, name=None, aligned=True, addrspace=None): + self.py_type = py_type if readonly: self.mutable = False if (not aligned or diff --git a/numba/core/typing/npydecl.py b/numba/core/typing/npydecl.py index 2dbbed39be9..e7ecf452fe9 100644 --- a/numba/core/typing/npydecl.py +++ b/numba/core/typing/npydecl.py @@ -126,7 +126,21 @@ def generic(self, args, kws): ret_tys = ufunc_loop.outputs[-implicit_output_count:] if ndims > 0: assert layout is not None - ret_tys = [types.Array(dtype=ret_ty, ndim=ndims, layout=layout) + # If either of the types involved in the ufunc operation has a + # __array_ufunc__ method then invoke the first such one to + # determine the output type of the ufunc. + array_ufunc_type = None + for a in args: + if hasattr(a, "__array_ufunc__"): + array_ufunc_type = a + break + output_type = types.Array + if array_ufunc_type is not None: + output_type = array_ufunc_type.__array_ufunc__(ufunc, "__call__", *args, **kws) + # Eventually better error handling! FIX ME! 
+ assert(output_type is not None) + + ret_tys = [output_type(dtype=ret_ty, ndim=ndims, layout=layout) for ret_ty in ret_tys] ret_tys = [resolve_output_type(self.context, args, ret_ty) for ret_ty in ret_tys] @@ -517,6 +531,7 @@ def typer(shape, dtype=None): @infer_global(np.empty_like) @infer_global(np.zeros_like) +@infer_global(np.ones_like) class NdConstructorLike(CallableTemplate): """ Typing template for np.empty_like(), .zeros_like(), .ones_like(). @@ -544,9 +559,6 @@ def typer(arg, dtype=None): return typer -infer_global(np.ones_like)(NdConstructorLike) - - @infer_global(np.full) class NdFull(CallableTemplate): @@ -563,6 +575,7 @@ def typer(shape, fill_value, dtype=None): return typer + @infer_global(np.full_like) class NdFullLike(CallableTemplate): diff --git a/numba/np/arrayobj.py b/numba/np/arrayobj.py index 933b1c6565e..5749e7d9b5b 100644 --- a/numba/np/arrayobj.py +++ b/numba/np/arrayobj.py @@ -32,7 +32,7 @@ from numba.misc import quicksort, mergesort from numba.cpython import slicing from numba.cpython.unsafe.tuple import tuple_setitem - +from numba.core.pythonapi import _allocators def set_range_metadata(builder, load, lower_bound, upper_bound): """ @@ -3399,8 +3399,13 @@ def _empty_nd_impl(context, builder, arrtype, shapes): ) align = context.get_preferred_array_alignment(arrtype.dtype) - meminfo = context.nrt.meminfo_alloc_aligned(builder, size=allocsize, - align=align) + def alloc_unsupported(context, builder, size, align): + return context.nrt.meminfo_alloc_aligned(builder, size, align) + + # See if the type has a special allocator, if not use the default + # alloc_unsupported allocator above. 
+ allocator_impl = _allocators.lookup(arrtype.__class__, alloc_unsupported) + meminfo = allocator_impl(context, builder, size=allocsize, align=align) data = context.nrt.meminfo_data(builder, meminfo) From d07f9af0ba8ad131f5283171c49b2ca00cdf7b68 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Wed, 18 Nov 2020 04:52:29 -0600 Subject: [PATCH 17/19] Patch for with context (#96) This modifications make jit() decorator use TargetDispatcher from dppl. Changes made in #57 by @AlexanderKalistratov and @1e-to. --- numba/core/decorators.py | 58 ++++++++++++++++++++++---------- numba/core/dispatcher.py | 12 ++++++- numba/core/registry.py | 6 ++++ numba/tests/test_dispatcher.py | 2 ++ numba/tests/test_nrt.py | 2 ++ numba/tests/test_record_dtype.py | 4 +-- numba/tests/test_serialize.py | 6 ++-- 7 files changed, 67 insertions(+), 23 deletions(-) diff --git a/numba/core/decorators.py b/numba/core/decorators.py index cfe91168969..3ecc188a9a7 100644 --- a/numba/core/decorators.py +++ b/numba/core/decorators.py @@ -149,7 +149,7 @@ def bar(x, y): target = options.pop('target') warnings.warn("The 'target' keyword argument is deprecated.", NumbaDeprecationWarning) else: - target = options.pop('_target', 'cpu') + target = options.pop('_target', None) options['boundscheck'] = boundscheck @@ -183,27 +183,16 @@ def bar(x, y): def _jit(sigs, locals, target, cache, targetoptions, **dispatcher_args): - dispatcher = registry.dispatcher_registry[target] - - def wrapper(func): - if extending.is_jitted(func): - raise TypeError( - "A jit decorator was called on an already jitted function " - f"{func}. If trying to access the original python " - f"function, use the {func}.py_func attribute." - ) - - if not inspect.isfunction(func): - raise TypeError( - "The decorated object is not a function (got type " - f"{type(func)})." 
- ) + def wrapper(func, dispatcher): if config.ENABLE_CUDASIM and target == 'cuda': from numba import cuda return cuda.jit(func) if config.DISABLE_JIT and not target == 'npyufunc': return func + if target == 'dppl': + from . import dppl + return dppl.jit(func) disp = dispatcher(py_func=func, locals=locals, targetoptions=targetoptions, **dispatcher_args) @@ -219,7 +208,42 @@ def wrapper(func): disp.disable_compile() return disp - return wrapper + def __wrapper(func): + if extending.is_jitted(func): + raise TypeError( + "A jit decorator was called on an already jitted function " + f"{func}. If trying to access the original python " + f"function, use the {func}.py_func attribute." + ) + + if not inspect.isfunction(func): + raise TypeError( + "The decorated object is not a function (got type " + f"{type(func)})." + ) + + is_numba_dppy_present = False + try: + import numba_dppy.config as dppy_config + + is_numba_dppy_present = dppy_config.dppy_present + except ImportError: + pass + + if (not is_numba_dppy_present + or target == 'npyufunc' or targetoptions.get('no_cpython_wrapper') + or sigs or config.DISABLE_JIT or not targetoptions.get('nopython')): + target_ = target + if target_ is None: + target_ = 'cpu' + disp = registry.dispatcher_registry[target_] + return wrapper(func, disp) + + from numba_dppy.target_dispatcher import TargetDispatcher + disp = TargetDispatcher(func, wrapper, target, targetoptions.get('parallel')) + return disp + + return __wrapper def generated_jit(function=None, target='cpu', cache=False, diff --git a/numba/core/dispatcher.py b/numba/core/dispatcher.py index 18d9426cd4d..42418fe5783 100644 --- a/numba/core/dispatcher.py +++ b/numba/core/dispatcher.py @@ -673,7 +673,14 @@ def _set_uuid(self, u): self._recent.append(self) -class Dispatcher(serialize.ReduceMixin, _MemoMixin, _DispatcherBase): +import abc + +class DispatcherMeta(abc.ABCMeta): + def __instancecheck__(self, other): + return type(type(other)) == DispatcherMeta + + +class 
Dispatcher(serialize.ReduceMixin, _MemoMixin, _DispatcherBase, metaclass=DispatcherMeta): """ Implementation of user-facing dispatcher objects (i.e. created using the @jit decorator). @@ -899,6 +906,9 @@ def get_function_type(self): cres = tuple(self.overloads.values())[0] return types.FunctionType(cres.signature) + def get_compiled(self): + return self + class LiftedCode(serialize.ReduceMixin, _MemoMixin, _DispatcherBase): """ diff --git a/numba/core/registry.py b/numba/core/registry.py index 2bd47ebe879..01e492f91f4 100644 --- a/numba/core/registry.py +++ b/numba/core/registry.py @@ -2,6 +2,7 @@ from numba.core.descriptors import TargetDescriptor from numba.core import utils, typing, dispatcher, cpu +from numba.core.compiler_lock import global_compiler_lock # ----------------------------------------------------------------------------- # Default CPU target descriptors @@ -26,16 +27,19 @@ class CPUTarget(TargetDescriptor): _nested = _NestedContext() @utils.cached_property + @global_compiler_lock def _toplevel_target_context(self): # Lazily-initialized top-level target context, for all threads return cpu.CPUContext(self.typing_context) @utils.cached_property + @global_compiler_lock def _toplevel_typing_context(self): # Lazily-initialized top-level typing context, for all threads return typing.Context() @property + @global_compiler_lock def target_context(self): """ The target context for CPU targets. @@ -47,6 +51,7 @@ def target_context(self): return self._toplevel_target_context @property + @global_compiler_lock def typing_context(self): """ The typing context for CPU targets. 
@@ -57,6 +62,7 @@ def typing_context(self): else: return self._toplevel_typing_context + @global_compiler_lock def nested_context(self, typing_context, target_context): """ A context manager temporarily replacing the contexts with the diff --git a/numba/tests/test_dispatcher.py b/numba/tests/test_dispatcher.py index 30a8e081485..b90d42ede26 100644 --- a/numba/tests/test_dispatcher.py +++ b/numba/tests/test_dispatcher.py @@ -398,6 +398,8 @@ def test_serialization(self): def foo(x): return x + 1 + foo = foo.get_compiled() + self.assertEqual(foo(1), 2) # get serialization memo diff --git a/numba/tests/test_nrt.py b/numba/tests/test_nrt.py index e0c94605671..602132258e8 100644 --- a/numba/tests/test_nrt.py +++ b/numba/tests/test_nrt.py @@ -249,6 +249,8 @@ def alloc_nrt_memory(): """ return np.empty(N, dtype) + alloc_nrt_memory = alloc_nrt_memory.get_compiled() + def keep_memory(): return alloc_nrt_memory() diff --git a/numba/tests/test_record_dtype.py b/numba/tests/test_record_dtype.py index 6d479c413fa..e674bacc957 100644 --- a/numba/tests/test_record_dtype.py +++ b/numba/tests/test_record_dtype.py @@ -803,8 +803,8 @@ def test_record_arg_transform(self): self.assertIn('Array', transformed) self.assertNotIn('first', transformed) self.assertNotIn('second', transformed) - # Length is usually 50 - 5 chars tolerance as above. - self.assertLess(len(transformed), 50) + # Length is usually 60 - 5 chars tolerance as above. + self.assertLess(len(transformed), 60) def test_record_two_arrays(self): """ diff --git a/numba/tests/test_serialize.py b/numba/tests/test_serialize.py index 2bcf843458a..90c3db44a16 100644 --- a/numba/tests/test_serialize.py +++ b/numba/tests/test_serialize.py @@ -135,9 +135,9 @@ def test_reuse(self): Note that "same function" is intentionally under-specified. 
"""
-        func = closure(5)
+        func = closure(5).get_compiled()
         pickled = pickle.dumps(func)
-        func2 = closure(6)
+        func2 = closure(6).get_compiled()
         pickled2 = pickle.dumps(func2)

         f = pickle.loads(pickled)
@@ -152,7 +152,7 @@ def test_reuse(self):
         self.assertEqual(h(2, 3), 11)

         # Now make sure the original object doesn't exist when deserializing
-        func = closure(7)
+        func = closure(7).get_compiled()
         func(42, 43)
         pickled = pickle.dumps(func)
         del func

From 00e056eb0c3f499b704f7efc70c8b4a8e21c72bd Mon Sep 17 00:00:00 2001
From: Sergey Pokhodenko
Date: Fri, 13 Nov 2020 04:56:54 -0600
Subject: [PATCH 18/19] Patch for lowering (#98)

This patch replaces the global initialization of lower_extensions with one
function by a list of functions.
This change was first made in commit 947b407 by @reazulhoque.
---
 numba/core/lowering.py           | 9 ++++++++-
 numba/parfors/parfor_lowering.py | 2 +-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/numba/core/lowering.py b/numba/core/lowering.py
index 1c9c19cd3b1..bcbc4e8b980 100644
--- a/numba/core/lowering.py
+++ b/numba/core/lowering.py
@@ -274,6 +274,13 @@ def debug_print(self, msg):
 class Lower(BaseLower):
     GeneratorLower = generators.GeneratorLower

+    def __init__(self, context, library, fndesc, func_ir, metadata=None):
+        BaseLower.__init__(self, context, library, fndesc, func_ir, metadata)
+        from numba.parfors.parfor_lowering import _lower_parfor_parallel
+        from numba.parfors import parfor
+        if parfor.Parfor not in lower_extensions:
+            lower_extensions[parfor.Parfor] = [_lower_parfor_parallel]
+
     def pre_block(self, block):
         from numba.core.unsafe import eh

@@ -440,7 +447,7 @@ def lower_inst(self, inst):
         else:
             for _class, func in lower_extensions.items():
                 if isinstance(inst, _class):
-                    func(self, inst)
+                    func[-1](self, inst)
                     return
             raise NotImplementedError(type(inst))

diff --git a/numba/parfors/parfor_lowering.py b/numba/parfors/parfor_lowering.py
index dc499498326..e559e8c017c 100644
---
a/numba/parfors/parfor_lowering.py
+++ b/numba/parfors/parfor_lowering.py
@@ -480,7 +480,7 @@ def _lower_parfor_parallel(lowerer, parfor):
         print("_lower_parfor_parallel done")

 # A work-around to prevent circular imports
-lowering.lower_extensions[parfor.Parfor] = _lower_parfor_parallel
+#lowering.lower_extensions[parfor.Parfor] = _lower_parfor_parallel


 def _create_shape_signature(

From 5a1b372956ce881ebaf98c4ddd65681ece90ba0e Mon Sep 17 00:00:00 2001
From: Sergey Pokhodenko
Date: Mon, 16 Nov 2020 05:41:03 -0600
Subject: [PATCH 19/19] Comment out require_global_compiler_lock() (#120)

This line was commented out when codegen debugging was introduced by
@DrTodd13. Uncommenting this line would require modifications in
numba-dppy. It seems that numba-dppy compilation does not work correctly
with the global compiler lock.
---
 numba/core/codegen.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/numba/core/codegen.py b/numba/core/codegen.py
index e22fa89012e..324a999fbe3 100644
--- a/numba/core/codegen.py
+++ b/numba/core/codegen.py
@@ -220,7 +220,7 @@ def finalize(self):
         Finalization involves various stages of code optimization and
         linking.
         """
-        require_global_compiler_lock()
+        #require_global_compiler_lock()

         # Report any LLVM-related problems to the user
         self._codegen._check_llvm_bugs()