From fea8446477a183fd9dbda659d56d2bfa608944b8 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Tue, 17 Nov 2020 11:53:10 +0300 Subject: [PATCH 01/19] Update README.rst about patched for numba-dppy We have to mention that IntelPython/numba differs from original Numba. --- README.rst | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index aa698904f47..7826858f3c2 100644 --- a/README.rst +++ b/README.rst @@ -1,5 +1,5 @@ ***** -Numba +Numba with patches for numba-dppy ***** .. image:: https://badges.gitter.im/numba/numba.svg @@ -10,6 +10,14 @@ Numba :target: https://numba.discourse.group/ :alt: Discourse +Patches for numba-dppy +###################### + +See https://github.com/IntelPython/numba-dppy. +If `numba-dppy` package is installed this version of Numba provides +additional features. +Without `numba-dppy` package this version of Numba works like original Numba. + A Just-In-Time Compiler for Numerical Functions in Python ######################################################### From 168b8dd763154b3ed22350e193fea4d5b3b6f3fe Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Thu, 12 Nov 2020 13:04:24 +0300 Subject: [PATCH 02/19] Remove the misspelling of finalize_dynamic_globals This commit fixes misspelling of _finalize_dynamic_globals(). This function is used only in the same file. 
--- numba/core/codegen.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/numba/core/codegen.py b/numba/core/codegen.py index 1e05c12bacd..e22fa89012e 100644 --- a/numba/core/codegen.py +++ b/numba/core/codegen.py @@ -247,7 +247,7 @@ def finalize(self): self._final_module.verify() self._finalize_final_module() - def _finalize_dyanmic_globals(self): + def _finalize_dynamic_globals(self): # Scan for dynamic globals for gv in self._final_module.global_variables: if gv.name.startswith('numba.dynamic.globals'): @@ -265,7 +265,7 @@ def _finalize_final_module(self): """ Make the underlying LLVM module ready to use. """ - self._finalize_dyanmic_globals() + self._finalize_dynamic_globals() self._verify_declare_only_symbols() # Remember this on the module, for the object cache hooks From 7e503d265c7a419c5dba49c6ffa616781fa63adc Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Fri, 13 Nov 2020 04:56:53 -0600 Subject: [PATCH 03/19] Patch for addrspace This commit adds modification introduced about addrspace from ce96c20 by @DrTodd13 and test fixed in 947b407 by @reazulhoque. This commit could be proposed to upstream. 
--- numba/core/datamodel/models.py | 2 +- numba/core/types/common.py | 3 ++- numba/core/types/misc.py | 5 +++-- numba/core/types/npytypes.py | 12 +++++++----- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/numba/core/datamodel/models.py b/numba/core/datamodel/models.py index 77b2fc90a21..4049d7c0e60 100644 --- a/numba/core/datamodel/models.py +++ b/numba/core/datamodel/models.py @@ -865,7 +865,7 @@ def __init__(self, dmm, fe_type): ('parent', types.pyobject), ('nitems', types.intp), ('itemsize', types.intp), - ('data', types.CPointer(fe_type.dtype)), + ('data', types.CPointer(fe_type.dtype, addrspace=fe_type.addrspace)), ('shape', types.UniTuple(types.intp, ndim)), ('strides', types.UniTuple(types.intp, ndim)), diff --git a/numba/core/types/common.py b/numba/core/types/common.py index f1806540c60..f054d62652e 100644 --- a/numba/core/types/common.py +++ b/numba/core/types/common.py @@ -45,7 +45,7 @@ class Buffer(IterableType, ArrayCompatible): # CS and FS are not reserved for inner contig but strided LAYOUTS = frozenset(['C', 'F', 'CS', 'FS', 'A']) - def __init__(self, dtype, ndim, layout, readonly=False, name=None): + def __init__(self, dtype, ndim, layout, readonly=False, name=None, addrspace=None): from .misc import unliteral if isinstance(dtype, Buffer): @@ -55,6 +55,7 @@ def __init__(self, dtype, ndim, layout, readonly=False, name=None): self.dtype = unliteral(dtype) self.ndim = ndim self.layout = layout + self.addrspace = addrspace if readonly: self.mutable = False if name is None: diff --git a/numba/core/types/misc.py b/numba/core/types/misc.py index 34d8f3a51bf..a2445d967d3 100644 --- a/numba/core/types/misc.py +++ b/numba/core/types/misc.py @@ -155,14 +155,15 @@ class CPointer(Type): """ mutable = True - def __init__(self, dtype): + def __init__(self, dtype, addrspace=None): self.dtype = dtype + self.addrspace = addrspace name = "%s*" % dtype super(CPointer, self).__init__(name) @property def key(self): - return self.dtype + return 
self.dtype, self.addrspace class EphemeralPointer(CPointer): diff --git a/numba/core/types/npytypes.py b/numba/core/types/npytypes.py index c7c312adce3..6f6307c5526 100644 --- a/numba/core/types/npytypes.py +++ b/numba/core/types/npytypes.py @@ -395,7 +395,7 @@ class Array(Buffer): """ def __init__(self, dtype, ndim, layout, readonly=False, name=None, - aligned=True): + aligned=True, addrspace=None): if readonly: self.mutable = False if (not aligned or @@ -408,7 +408,7 @@ def __init__(self, dtype, ndim, layout, readonly=False, name=None, if not self.aligned: type_name = "unaligned " + type_name name = "%s(%s, %sd, %s)" % (type_name, dtype, ndim, layout) - super(Array, self).__init__(dtype, ndim, layout, name=name) + super(Array, self).__init__(dtype, ndim, layout, name=name, addrspace=addrspace) @property def mangling_args(self): @@ -417,7 +417,7 @@ def mangling_args(self): 'aligned' if self.aligned else 'unaligned'] return self.__class__.__name__, args - def copy(self, dtype=None, ndim=None, layout=None, readonly=None): + def copy(self, dtype=None, ndim=None, layout=None, readonly=None, addrspace=None): if dtype is None: dtype = self.dtype if ndim is None: @@ -426,12 +426,14 @@ def copy(self, dtype=None, ndim=None, layout=None, readonly=None): layout = self.layout if readonly is None: readonly = not self.mutable + if addrspace is None: + addrspace = self.addrspace return Array(dtype=dtype, ndim=ndim, layout=layout, readonly=readonly, - aligned=self.aligned) + aligned=self.aligned, addrspace=addrspace) @property def key(self): - return self.dtype, self.ndim, self.layout, self.mutable, self.aligned + return self.dtype, self.ndim, self.layout, self.mutable, self.aligned, self.addrspace def unify(self, typingctx, other): """ From feb86235fd178ebabfa638cc86f91b0d032eecd1 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Fri, 13 Nov 2020 04:56:54 -0600 Subject: [PATCH 04/19] Patch for lowering This patch introduces global initialization of lower_extensions with one 
functions and replaces with a list of functions. This change first made in commit 947b407 by @reazulhoque. --- numba/core/lowering.py | 9 ++++++++- numba/parfors/parfor_lowering.py | 2 +- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/numba/core/lowering.py b/numba/core/lowering.py index 1c9c19cd3b1..bcbc4e8b980 100644 --- a/numba/core/lowering.py +++ b/numba/core/lowering.py @@ -274,6 +274,13 @@ def debug_print(self, msg): class Lower(BaseLower): GeneratorLower = generators.GeneratorLower + def __init__(self, context, library, fndesc, func_ir, metadata=None): + BaseLower.__init__(self, context, library, fndesc, func_ir, metadata) + from numba.parfors.parfor_lowering import _lower_parfor_parallel + from numba.parfors import parfor + if parfor.Parfor not in lower_extensions: + lower_extensions[parfor.Parfor] = [_lower_parfor_parallel] + def pre_block(self, block): from numba.core.unsafe import eh @@ -440,7 +447,7 @@ def lower_inst(self, inst): else: for _class, func in lower_extensions.items(): if isinstance(inst, _class): - func(self, inst) + func[-1](self, inst) return raise NotImplementedError(type(inst)) diff --git a/numba/parfors/parfor_lowering.py b/numba/parfors/parfor_lowering.py index dc499498326..e559e8c017c 100644 --- a/numba/parfors/parfor_lowering.py +++ b/numba/parfors/parfor_lowering.py @@ -480,7 +480,7 @@ def _lower_parfor_parallel(lowerer, parfor): print("_lower_parfor_parallel done") # A work-around to prevent circular imports -lowering.lower_extensions[parfor.Parfor] = _lower_parfor_parallel +#lowering.lower_extensions[parfor.Parfor] = _lower_parfor_parallel def _create_shape_signature( From 10303682c13999a85402a6293548a4456be1b4bf Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Fri, 13 Nov 2020 04:56:55 -0600 Subject: [PATCH 05/19] Patch for change numpy functions mapping Changed in 8ccfd36 and a77eab6 by @reazulhoque. Possibly this 2 changes are not related to each other. 
--- numba/np/npyimpl.py | 7 ++++++- numba/parfors/parfor.py | 5 +++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/numba/np/npyimpl.py b/numba/np/npyimpl.py index d1e65e72e80..1763b6a3f42 100644 --- a/numba/np/npyimpl.py +++ b/numba/np/npyimpl.py @@ -448,7 +448,12 @@ def __init__(self, context, builder, outer_sig): super(_KernelImpl, self).__init__(context, builder, outer_sig) loop = ufunc_find_matching_loop( ufunc, outer_sig.args + tuple(_unpack_output_types(ufunc, outer_sig))) - self.fn = ufunc_db.get_ufunc_info(ufunc).get(loop.ufunc_sig) + + if hasattr(context, 'ufunc_db'): + self.fn = context.ufunc_db[ufunc].get(loop.ufunc_sig) + else: + self.fn = ufunc_db.get_ufunc_info(ufunc).get(loop.ufunc_sig) + self.inner_sig = _ufunc_loop_sig(loop.outputs, loop.inputs) if self.fn is None: diff --git a/numba/parfors/parfor.py b/numba/parfors/parfor.py index 0f23e69843a..c73d99905dd 100644 --- a/numba/parfors/parfor.py +++ b/numba/parfors/parfor.py @@ -1350,7 +1350,7 @@ class PreParforPass(object): implementations of numpy functions if available. 
""" def __init__(self, func_ir, typemap, calltypes, typingctx, options, - swapped={}): + swapped={}, replace_functions_map=replace_functions_map): self.func_ir = func_ir self.typemap = typemap self.calltypes = calltypes @@ -1358,6 +1358,7 @@ def __init__(self, func_ir, typemap, calltypes, typingctx, options, self.options = options # diagnostics self.swapped = swapped + self.replace_functions_map = replace_functions_map self.stats = { 'replaced_func': 0, 'replaced_dtype': 0, @@ -1394,7 +1395,7 @@ def _replace_parallel_functions(self, blocks): def replace_func(): func_def = get_definition(self.func_ir, expr.func) callname = find_callname(self.func_ir, expr) - repl_func = replace_functions_map.get(callname, None) + repl_func = self.replace_functions_map.get(callname, None) # Handle method on array type if (repl_func is None and len(callname) == 2 and From f29f1a45b6134e9f95ebe5fe8d8a54fe1ba8a7fe Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Mon, 16 Nov 2020 05:41:03 -0600 Subject: [PATCH 06/19] Uncomment require_global_compiler_lock()" This line was commented when introduced codegen debugging by @DrTodd13. Uncommenting this line requires modifications in numba-dppy. It seems that numba-dppy compilation does not work correctly with global compiler lock. --- numba/core/codegen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba/core/codegen.py b/numba/core/codegen.py index e22fa89012e..324a999fbe3 100644 --- a/numba/core/codegen.py +++ b/numba/core/codegen.py @@ -220,7 +220,7 @@ def finalize(self): Finalization involves various stages of code optimization and linking. 
""" - require_global_compiler_lock() + #require_global_compiler_lock() # Report any LLVM-related problems to the user self._codegen._check_llvm_bugs() From 1afe61415e8a67501f3b1d26a4040b4bd1535359 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Wed, 18 Nov 2020 04:52:29 -0600 Subject: [PATCH 07/19] Patch for with context This modifications make jit() decorator use TargetDispatcher from dppl. Changes made in #57 by @AlexanderKalistratov and @1e-to. --- numba/core/decorators.py | 58 ++++++++++++++++++++++---------- numba/core/dispatcher.py | 12 ++++++- numba/core/registry.py | 6 ++++ numba/tests/test_dispatcher.py | 2 ++ numba/tests/test_nrt.py | 2 ++ numba/tests/test_record_dtype.py | 4 +-- numba/tests/test_serialize.py | 6 ++-- 7 files changed, 67 insertions(+), 23 deletions(-) diff --git a/numba/core/decorators.py b/numba/core/decorators.py index cfe91168969..3ecc188a9a7 100644 --- a/numba/core/decorators.py +++ b/numba/core/decorators.py @@ -149,7 +149,7 @@ def bar(x, y): target = options.pop('target') warnings.warn("The 'target' keyword argument is deprecated.", NumbaDeprecationWarning) else: - target = options.pop('_target', 'cpu') + target = options.pop('_target', None) options['boundscheck'] = boundscheck @@ -183,27 +183,16 @@ def bar(x, y): def _jit(sigs, locals, target, cache, targetoptions, **dispatcher_args): - dispatcher = registry.dispatcher_registry[target] - - def wrapper(func): - if extending.is_jitted(func): - raise TypeError( - "A jit decorator was called on an already jitted function " - f"{func}. If trying to access the original python " - f"function, use the {func}.py_func attribute." - ) - - if not inspect.isfunction(func): - raise TypeError( - "The decorated object is not a function (got type " - f"{type(func)})." - ) + def wrapper(func, dispatcher): if config.ENABLE_CUDASIM and target == 'cuda': from numba import cuda return cuda.jit(func) if config.DISABLE_JIT and not target == 'npyufunc': return func + if target == 'dppl': + from . 
import dppl + return dppl.jit(func) disp = dispatcher(py_func=func, locals=locals, targetoptions=targetoptions, **dispatcher_args) @@ -219,7 +208,42 @@ def wrapper(func): disp.disable_compile() return disp - return wrapper + def __wrapper(func): + if extending.is_jitted(func): + raise TypeError( + "A jit decorator was called on an already jitted function " + f"{func}. If trying to access the original python " + f"function, use the {func}.py_func attribute." + ) + + if not inspect.isfunction(func): + raise TypeError( + "The decorated object is not a function (got type " + f"{type(func)})." + ) + + is_numba_dppy_present = False + try: + import numba_dppy.config as dppy_config + + is_numba_dppy_present = dppy_config.dppy_present + except ImportError: + pass + + if (not is_numba_dppy_present + or target == 'npyufunc' or targetoptions.get('no_cpython_wrapper') + or sigs or config.DISABLE_JIT or not targetoptions.get('nopython')): + target_ = target + if target_ is None: + target_ = 'cpu' + disp = registry.dispatcher_registry[target_] + return wrapper(func, disp) + + from numba_dppy.target_dispatcher import TargetDispatcher + disp = TargetDispatcher(func, wrapper, target, targetoptions.get('parallel')) + return disp + + return __wrapper def generated_jit(function=None, target='cpu', cache=False, diff --git a/numba/core/dispatcher.py b/numba/core/dispatcher.py index 18d9426cd4d..42418fe5783 100644 --- a/numba/core/dispatcher.py +++ b/numba/core/dispatcher.py @@ -673,7 +673,14 @@ def _set_uuid(self, u): self._recent.append(self) -class Dispatcher(serialize.ReduceMixin, _MemoMixin, _DispatcherBase): +import abc + +class DispatcherMeta(abc.ABCMeta): + def __instancecheck__(self, other): + return type(type(other)) == DispatcherMeta + + +class Dispatcher(serialize.ReduceMixin, _MemoMixin, _DispatcherBase, metaclass=DispatcherMeta): """ Implementation of user-facing dispatcher objects (i.e. created using the @jit decorator). 
@@ -899,6 +906,9 @@ def get_function_type(self): cres = tuple(self.overloads.values())[0] return types.FunctionType(cres.signature) + def get_compiled(self): + return self + class LiftedCode(serialize.ReduceMixin, _MemoMixin, _DispatcherBase): """ diff --git a/numba/core/registry.py b/numba/core/registry.py index 2bd47ebe879..01e492f91f4 100644 --- a/numba/core/registry.py +++ b/numba/core/registry.py @@ -2,6 +2,7 @@ from numba.core.descriptors import TargetDescriptor from numba.core import utils, typing, dispatcher, cpu +from numba.core.compiler_lock import global_compiler_lock # ----------------------------------------------------------------------------- # Default CPU target descriptors @@ -26,16 +27,19 @@ class CPUTarget(TargetDescriptor): _nested = _NestedContext() @utils.cached_property + @global_compiler_lock def _toplevel_target_context(self): # Lazily-initialized top-level target context, for all threads return cpu.CPUContext(self.typing_context) @utils.cached_property + @global_compiler_lock def _toplevel_typing_context(self): # Lazily-initialized top-level typing context, for all threads return typing.Context() @property + @global_compiler_lock def target_context(self): """ The target context for CPU targets. @@ -47,6 +51,7 @@ def target_context(self): return self._toplevel_target_context @property + @global_compiler_lock def typing_context(self): """ The typing context for CPU targets. 
@@ -57,6 +62,7 @@ def typing_context(self): else: return self._toplevel_typing_context + @global_compiler_lock def nested_context(self, typing_context, target_context): """ A context manager temporarily replacing the contexts with the diff --git a/numba/tests/test_dispatcher.py b/numba/tests/test_dispatcher.py index 30a8e081485..b90d42ede26 100644 --- a/numba/tests/test_dispatcher.py +++ b/numba/tests/test_dispatcher.py @@ -398,6 +398,8 @@ def test_serialization(self): def foo(x): return x + 1 + foo = foo.get_compiled() + self.assertEqual(foo(1), 2) # get serialization memo diff --git a/numba/tests/test_nrt.py b/numba/tests/test_nrt.py index e0c94605671..602132258e8 100644 --- a/numba/tests/test_nrt.py +++ b/numba/tests/test_nrt.py @@ -249,6 +249,8 @@ def alloc_nrt_memory(): """ return np.empty(N, dtype) + alloc_nrt_memory = alloc_nrt_memory.get_compiled() + def keep_memory(): return alloc_nrt_memory() diff --git a/numba/tests/test_record_dtype.py b/numba/tests/test_record_dtype.py index 6d479c413fa..e674bacc957 100644 --- a/numba/tests/test_record_dtype.py +++ b/numba/tests/test_record_dtype.py @@ -803,8 +803,8 @@ def test_record_arg_transform(self): self.assertIn('Array', transformed) self.assertNotIn('first', transformed) self.assertNotIn('second', transformed) - # Length is usually 50 - 5 chars tolerance as above. - self.assertLess(len(transformed), 50) + # Length is usually 60 - 5 chars tolerance as above. + self.assertLess(len(transformed), 60) def test_record_two_arrays(self): """ diff --git a/numba/tests/test_serialize.py b/numba/tests/test_serialize.py index 2bcf843458a..90c3db44a16 100644 --- a/numba/tests/test_serialize.py +++ b/numba/tests/test_serialize.py @@ -135,9 +135,9 @@ def test_reuse(self): Note that "same function" is intentionally under-specified. 
""" - func = closure(5) + func = closure(5).get_compiled() pickled = pickle.dumps(func) - func2 = closure(6) + func2 = closure(6).get_compiled() pickled2 = pickle.dumps(func2) f = pickle.loads(pickled) @@ -152,7 +152,7 @@ def test_reuse(self): self.assertEqual(h(2, 3), 11) # Now make sure the original object doesn't exist when deserializing - func = closure(7) + func = closure(7).get_compiled() func(42, 43) pickled = pickle.dumps(func) del func From 0bf83efd8703948d1ab4fc8161548ffeb2e3ba28 Mon Sep 17 00:00:00 2001 From: Elena Totmenina Date: Fri, 4 Dec 2020 14:26:59 +0300 Subject: [PATCH 08/19] Refactoring for ParforDiagnostics dump function (#135) Co-authored-by: etotmeni --- numba/parfors/parfor.py | 625 ++++++++++++++++++++-------------------- 1 file changed, 320 insertions(+), 305 deletions(-) diff --git a/numba/parfors/parfor.py b/numba/parfors/parfor.py index c73d99905dd..2632db89adb 100644 --- a/numba/parfors/parfor.py +++ b/numba/parfors/parfor.py @@ -862,6 +862,318 @@ def sort_pf_by_line(self, pf_id, parfors_simple): pass return line + def get_parfors_simple(self, print_loop_search): + parfors_simple = dict() + + # print in line order, parfors loop id is based on discovery order + for pf in sorted(self.initial_parfors, key=lambda x: x.loc.line): + # use 0 here, the parfors are mutated by the time this routine + # is called, however, fusion appends the patterns so we can just + # pull in the first as a "before fusion" emulation + r_pattern = pf.patterns[0] + pattern = pf.patterns[0] + loc = pf.loc + if isinstance(pattern, tuple): + if pattern[0] == 'prange': + if pattern[1] == 'internal': + replfn = '.'.join(reversed(list(pattern[2][0]))) + loc = pattern[2][1] + r_pattern = '%s %s' % (replfn, '(internal parallel version)') + elif pattern[1] == 'user': + r_pattern = "user defined prange" + elif pattern[1] == 'pndindex': + r_pattern = "internal pndindex" #FIXME: trace this! 
+ else: + assert 0 + fmt = 'Parallel for-loop #%s: is produced from %s:\n %s\n \n' + if print_loop_search: + print_wrapped(fmt % (pf.id, loc, r_pattern)) + parfors_simple[pf.id] = (pf, loc, r_pattern) + return parfors_simple + + def get_all_lines(self, parfors_simple): + # ensure adjacency lists are the same size for both sets of info + # (nests and fusion may not traverse the same space, for + # convenience [] is used as a condition to halt recursion) + fadj, froots = self.compute_graph_info(self.fusion_info) + nadj, _nroots = self.compute_graph_info(self.nested_fusion_info) + + if len(fadj) > len(nadj): + lim = len(fadj) + tmp = nadj + else: + lim = len(nadj) + tmp = fadj + for x in range(len(tmp), lim): + tmp.append([]) + + # This computes the roots of true loop nests (i.e. loops containing + # loops opposed to just a loop that's a root). + nroots = set() + if _nroots: + for r in _nroots: + if nadj[r] != []: + nroots.add(r) + all_roots = froots ^ nroots + + # This computes all the parfors at the top level that are either: + # - roots of loop fusion + # - roots of true loop nests + # it then combines these based on source line number for ease of + # producing output ordered in a manner similar to the code structure + froots_lines = {} + for x in froots: + line = self.sort_pf_by_line(x, parfors_simple) + froots_lines[line] = 'fuse', x, fadj + + nroots_lines = {} + for x in nroots: + line = self.sort_pf_by_line(x, parfors_simple) + nroots_lines[line] = 'nest', x, nadj + + all_lines = froots_lines.copy() + all_lines.update(nroots_lines) + return all_lines + + def source_listing(self, parfors_simple, purpose_str): + filename = self.func_ir.loc.filename + count = self.count_parfors() + func_name = self.func_ir.func_id.func + try: + lines = inspect.getsource(func_name).splitlines() + except OSError: # generated function + lines = None + if lines and parfors_simple: + src_width = max([len(x) for x in lines]) + map_line_to_pf = defaultdict(list) # parfors can alias lines 
+ for k, v in parfors_simple.items(): + # TODO: do a better job of tracking parfors that are not in + # this file but are referred to, e.g. np.arange() + if parfors_simple[k][1].filename == filename: + match_line = self.sort_pf_by_line(k, parfors_simple) + map_line_to_pf[match_line].append(str(k)) + + max_pf_per_line = max([1] + [len(x) for x in map_line_to_pf.values()]) + width = src_width + (1 + max_pf_per_line * (len(str(count)) + 2)) + newlines = [] + newlines.append('\n') + newlines.append('Parallel loop listing for %s' % purpose_str) + newlines.append(width * '-' + '|loop #ID') + fmt = '{0:{1}}| {2}' + # why are these off by 1? + lstart = max(0, self.func_ir.loc.line - 1) + for no, line in enumerate(lines, lstart): + pf_ids = map_line_to_pf.get(no, None) + if pf_ids is not None: + pfstr = '#' + ', '.join(pf_ids) + else: + pfstr = '' + stripped = line.strip('\n') + srclen = len(stripped) + if pf_ids: + l = fmt.format(width * '-', width, pfstr) + else: + l = fmt.format(width * ' ', width, pfstr) + newlines.append(stripped + l[srclen:]) + print('\n'.join(newlines)) + else: + print("No source available") + + def print_unoptimised(self, lines): + # This prints the unoptimised parfors state + sword = '+--' + fac = len(sword) + + def print_nest(fadj_, nadj_, theroot, reported, region_id): + def print_g(fadj_, nadj_, nroot, depth): + print_wrapped(fac * depth * ' ' + '%s%s %s' % (sword, nroot, '(parallel)')) + for k in nadj_[nroot]: + if nadj_[k] == []: + msg = [] + msg.append(fac * (depth + 1) * ' ' + '%s%s %s' % (sword, k, '(parallel)')) + if fadj_[k] != [] and k not in reported: + fused = self.reachable_nodes(fadj_, k) + for i in fused: + msg.append(fac * (depth + 1) * ' ' + '%s%s %s' % (sword, i, '(parallel)')) + reported.append(k) + print_wrapped('\n'.join(msg)) + else: + print_g(fadj_, nadj_, k, depth + 1) + + if nadj_[theroot] != []: + print_wrapped("Parallel region %s:" % region_id) + print_g(fadj_, nadj_, theroot, 0) + print("\n") + region_id = region_id + 1 
+ return region_id + + def print_fuse(ty, pf_id, adj, depth, region_id): + msg = [] + print_wrapped("Parallel region %s:" % region_id) + msg.append(fac * depth * ' ' + '%s%s %s' % (sword, pf_id, '(parallel)')) + if adj[pf_id] != []: + fused = sorted(self.reachable_nodes(adj, pf_id)) + for k in fused: + msg.append(fac * depth * ' ' + '%s%s %s' % (sword, k, '(parallel)')) + region_id = region_id + 1 + print_wrapped('\n'.join(msg)) + print("\n") + return region_id + + # Walk the parfors by src line and print optimised structure + region_id = 0 + reported = [] + for line, info in sorted(lines.items()): + opt_ty, pf_id, adj = info + if opt_ty == 'fuse': + if pf_id not in reported: + region_id = print_fuse('f', pf_id, adj, 0, region_id) + elif opt_ty == 'nest': + region_id = print_nest(fadj, nadj, pf_id, reported, region_id) + else: + assert 0 + + def print_optimised(self, lines): + # This prints the optimised output based on the transforms that + # occurred during loop fusion and rewriting of loop nests + sword = '+--' + fac = len(sword) + + summary = dict() + # region : {fused, serialized} + + def print_nest(fadj_, nadj_, theroot, reported, region_id): + def print_g(fadj_, nadj_, nroot, depth): + for k in nadj_[nroot]: + msg = fac * depth * ' ' + '%s%s %s' % (sword, k, '(serial') + if nadj_[k] == []: + fused = [] + if fadj_[k] != [] and k not in reported: + fused = sorted(self.reachable_nodes(fadj_, k)) + msg += ", fused with loop(s): " + msg += ', '.join([str(x) for x in fused]) + msg += ')' + reported.append(k) + print_wrapped(msg) + summary[region_id]['fused'] += len(fused) + else: + print_wrapped(msg + ')') + print_g(fadj_, nadj_, k, depth + 1) + summary[region_id]['serialized'] += 1 + + if nadj_[theroot] != []: + print_wrapped("Parallel region %s:" % region_id) + print_wrapped('%s%s %s' % (sword, theroot, '(parallel)')) + summary[region_id] = {'root': theroot, 'fused': 0, 'serialized': 0} + print_g(fadj_, nadj_, theroot, 1) + print("\n") + region_id = region_id + 
1 + return region_id + + def print_fuse(ty, pf_id, adj, depth, region_id): + print_wrapped("Parallel region %s:" % region_id) + msg = fac * depth * ' ' + '%s%s %s' % (sword, pf_id, '(parallel') + fused = [] + if adj[pf_id] != []: + fused = sorted(self.reachable_nodes(adj, pf_id)) + msg += ", fused with loop(s): " + msg += ', '.join([str(x) for x in fused]) + + summary[region_id] = {'root': pf_id, 'fused': len(fused), 'serialized': 0} + msg += ')' + print_wrapped(msg) + print("\n") + region_id = region_id + 1 + return region_id + + # Walk the parfors by src line and print optimised structure + region_id = 0 + reported = [] + for line, info in sorted(lines.items()): + opt_ty, pf_id, adj = info + if opt_ty == 'fuse': + if pf_id not in reported: + region_id = print_fuse('f', pf_id, adj, 0, region_id) + elif opt_ty == 'nest': + region_id = print_nest(fadj, nadj, pf_id, reported, region_id) + else: + assert 0 + + # print the summary of the fuse/serialize rewrite + if summary: + for k, v in sorted(summary.items()): + msg = ('\n \nParallel region %s (loop #%s) had %s ' + 'loop(s) fused') + root = v['root'] + fused = v['fused'] + serialized = v['serialized'] + if serialized != 0: + msg += (' and %s loop(s) ' + 'serialized as part of the larger ' + 'parallel loop (#%s).') + print_wrapped(msg % (k, root, fused, serialized, root)) + else: + msg += '.' 
+ print_wrapped(msg % (k, root, fused)) + else: + print_wrapped("Parallel structure is already optimal.") + + def allocation_hoist(self): + found = False + print('Allocation hoisting:') + for pf_id, data in self.hoist_info.items(): + stmt = data.get('hoisted', []) + for inst in stmt: + if isinstance(inst.value, ir.Expr): + try: + attr = inst.value.attr + if attr == 'empty': + msg = ("The memory allocation derived from the " + "instruction at %s is hoisted out of the " + "parallel loop labelled #%s (it will be " + "performed before the loop is executed and " + "reused inside the loop):") + loc = inst.loc + print_wrapped(msg % (loc, pf_id)) + try: + path = os.path.relpath(loc.filename) + except ValueError: + path = os.path.abspath(loc.filename) + lines = linecache.getlines(path) + if lines and loc.line: + print_wrapped(" Allocation:: " + lines[0 if loc.line < 2 else loc.line - 1].strip()) + print_wrapped(" - numpy.empty() is used for the allocation.\n") + found = True + except (KeyError, AttributeError): + pass + if not found: + print_wrapped('No allocation hoisting found') + + def instruction_hoist(self): + print("") + print('Instruction hoisting:') + hoist_info_printed = False + if self.hoist_info: + for pf_id, data in self.hoist_info.items(): + hoisted = data.get('hoisted', None) + not_hoisted = data.get('not_hoisted', None) + if not hoisted and not not_hoisted: + print("loop #%s has nothing to hoist." 
% pf_id) + continue + + print("loop #%s:" % pf_id) + if hoisted: + print(" Has the following hoisted:") + [print(" %s" % y) for y in hoisted] + hoist_info_printed = True + if not_hoisted: + print(" Failed to hoist the following:") + [print(" %s: %s" % (y, x)) for x, y in not_hoisted] + hoist_info_printed = True + if not hoist_info_printed: + print_wrapped('No instruction hoisting found') + print_wrapped(80 * '-') + def dump(self, level=1): if not self.has_setup: raise RuntimeError("self.setup has not been called") @@ -919,33 +1231,7 @@ def dump(self, level=1): #----------- search section if print_loop_search: print_wrapped('Looking for parallel loops'.center(_termwidth, '-')) - - parfors_simple = dict() - - # print in line order, parfors loop id is based on discovery order - for pf in sorted(self.initial_parfors, key=lambda x: x.loc.line): - # use 0 here, the parfors are mutated by the time this routine - # is called, however, fusion appends the patterns so we can just - # pull in the first as a "before fusion" emulation - r_pattern = pf.patterns[0] - pattern = pf.patterns[0] - loc = pf.loc - if isinstance(pattern, tuple): - if pattern[0] == 'prange': - if pattern[1] == 'internal': - replfn = '.'.join(reversed(list(pattern[2][0]))) - loc = pattern[2][1] - r_pattern = '%s %s' % (replfn, '(internal parallel version)') - elif pattern[1] == 'user': - r_pattern = "user defined prange" - elif pattern[1] == 'pndindex': - r_pattern = "internal pndindex" #FIXME: trace this! 
- else: - assert 0 - fmt = 'Parallel for-loop #%s: is produced from %s:\n %s\n \n' - if print_loop_search: - print_wrapped(fmt % (pf.id, loc, r_pattern)) - parfors_simple[pf.id] = (pf, loc, r_pattern) + parfors_simple = self.get_parfors_simple(print_loop_search) count = self.count_parfors() if print_loop_search: @@ -965,46 +1251,7 @@ def dump(self, level=1): path = os.path.abspath(filename) if print_source_listing: - func_name = self.func_ir.func_id.func - try: - lines = inspect.getsource(func_name).splitlines() - except OSError: # generated function - lines = None - if lines: - src_width = max([len(x) for x in lines]) - map_line_to_pf = defaultdict(list) # parfors can alias lines - for k, v in parfors_simple.items(): - # TODO: do a better job of tracking parfors that are not in - # this file but are referred to, e.g. np.arange() - if parfors_simple[k][1].filename == filename: - match_line = self.sort_pf_by_line(k, parfors_simple) - map_line_to_pf[match_line].append(str(k)) - - max_pf_per_line = max([1] + [len(x) for x in map_line_to_pf.values()]) - width = src_width + (1 + max_pf_per_line * (len(str(count)) + 2)) - newlines = [] - newlines.append('\n') - newlines.append('Parallel loop listing for %s' % purpose_str) - newlines.append(width * '-' + '|loop #ID') - fmt = '{0:{1}}| {2}' - # why are these off by 1? 
- lstart = max(0, self.func_ir.loc.line - 1) - for no, line in enumerate(lines, lstart): - pf_ids = map_line_to_pf.get(no, None) - if pf_ids is not None: - pfstr = '#' + ', '.join(pf_ids) - else: - pfstr = '' - stripped = line.strip('\n') - srclen = len(stripped) - if pf_ids: - l = fmt.format(width * '-', width, pfstr) - else: - l = fmt.format(width * ' ', width, pfstr) - newlines.append(stripped + l[srclen:]) - print('\n'.join(newlines)) - else: - print("No source available") + self.source_listing(parfors_simple, purpose_str) #---------- these are used a lot here on in sword = '+--' @@ -1075,198 +1322,16 @@ def print_g(adj, root, depth): print_wrapped("") #---------- compute various properties and orderings in the data for subsequent use - - # ensure adjacency lists are the same size for both sets of info - # (nests and fusion may not traverse the same space, for - # convenience [] is used as a condition to halt recursion) - fadj, froots = self.compute_graph_info(self.fusion_info) - nadj, _nroots = self.compute_graph_info(self.nested_fusion_info) - - if len(fadj) > len(nadj): - lim = len(fadj) - tmp = nadj - else: - lim = len(nadj) - tmp = fadj - for x in range(len(tmp), lim): - tmp.append([]) - - # This computes the roots of true loop nests (i.e. loops containing - # loops opposed to just a loop that's a root). 
- nroots = set() - if _nroots: - for r in _nroots: - if nadj[r] != []: - nroots.add(r) - all_roots = froots ^ nroots - - # This computes all the parfors at the top level that are either: - # - roots of loop fusion - # - roots of true loop nests - # it then combines these based on source line number for ease of - # producing output ordered in a manner similar to the code structure - froots_lines = {} - for x in froots: - line = self.sort_pf_by_line(x, parfors_simple) - froots_lines[line] = 'fuse', x, fadj - - nroots_lines = {} - for x in nroots: - line = self.sort_pf_by_line(x, parfors_simple) - nroots_lines[line] = 'nest', x, nadj - - all_lines = froots_lines.copy() - all_lines.update(nroots_lines) - - # nroots, froots, nadj and fadj are all set up correctly - # define some print functions - - def print_unoptimised(lines): - # This prints the unoptimised parfors state - - fac = len(sword) - - def print_nest(fadj_, nadj_, theroot, reported, region_id): - def print_g(fadj_, nadj_, nroot, depth): - print_wrapped(fac * depth * ' ' + '%s%s %s' % (sword, nroot, '(parallel)')) - for k in nadj_[nroot]: - if nadj_[k] == []: - msg = [] - msg.append(fac * (depth + 1) * ' ' + '%s%s %s' % (sword, k, '(parallel)')) - if fadj_[k] != [] and k not in reported: - fused = self.reachable_nodes(fadj_, k) - for i in fused: - msg.append(fac * (depth + 1) * ' ' + '%s%s %s' % (sword, i, '(parallel)')) - reported.append(k) - print_wrapped('\n'.join(msg)) - else: - print_g(fadj_, nadj_, k, depth + 1) - - if nadj_[theroot] != []: - print_wrapped("Parallel region %s:" % region_id) - print_g(fadj_, nadj_, theroot, 0) - print("\n") - region_id = region_id + 1 - return region_id - - def print_fuse(ty, pf_id, adj, depth, region_id): - msg = [] - print_wrapped("Parallel region %s:" % region_id) - msg.append(fac * depth * ' ' + '%s%s %s' % (sword, pf_id, '(parallel)')) - if adj[pf_id] != []: - fused = sorted(self.reachable_nodes(adj, pf_id)) - for k in fused: - msg.append(fac * depth * ' ' + '%s%s 
%s' % (sword, k, '(parallel)')) - region_id = region_id + 1 - print_wrapped('\n'.join(msg)) - print("\n") - return region_id - - # Walk the parfors by src line and print optimised structure - region_id = 0 - reported = [] - for line, info in sorted(lines.items()): - opt_ty, pf_id, adj = info - if opt_ty == 'fuse': - if pf_id not in reported: - region_id = print_fuse('f', pf_id, adj, 0, region_id) - elif opt_ty == 'nest': - region_id = print_nest(fadj, nadj, pf_id, reported, region_id) - else: - assert 0 - - def print_optimised(lines): - # This prints the optimised output based on the transforms that - # occurred during loop fusion and rewriting of loop nests - fac = len(sword) - - summary = dict() - # region : {fused, serialized} - - def print_nest(fadj_, nadj_, theroot, reported, region_id): - def print_g(fadj_, nadj_, nroot, depth): - for k in nadj_[nroot]: - msg = fac * depth * ' ' + '%s%s %s' % (sword, k, '(serial') - if nadj_[k] == []: - fused = [] - if fadj_[k] != [] and k not in reported: - fused = sorted(self.reachable_nodes(fadj_, k)) - msg += ", fused with loop(s): " - msg += ', '.join([str(x) for x in fused]) - msg += ')' - reported.append(k) - print_wrapped(msg) - summary[region_id]['fused'] += len(fused) - else: - print_wrapped(msg + ')') - print_g(fadj_, nadj_, k, depth + 1) - summary[region_id]['serialized'] += 1 - - if nadj_[theroot] != []: - print_wrapped("Parallel region %s:" % region_id) - print_wrapped('%s%s %s' % (sword, theroot, '(parallel)')) - summary[region_id] = {'root': theroot, 'fused': 0, 'serialized': 0} - print_g(fadj_, nadj_, theroot, 1) - print("\n") - region_id = region_id + 1 - return region_id - - def print_fuse(ty, pf_id, adj, depth, region_id): - print_wrapped("Parallel region %s:" % region_id) - msg = fac * depth * ' ' + '%s%s %s' % (sword, pf_id, '(parallel') - fused = [] - if adj[pf_id] != []: - fused = sorted(self.reachable_nodes(adj, pf_id)) - msg += ", fused with loop(s): " - msg += ', '.join([str(x) for x in fused]) - - 
summary[region_id] = {'root': pf_id, 'fused': len(fused), 'serialized': 0} - msg += ')' - print_wrapped(msg) - print("\n") - region_id = region_id + 1 - return region_id - - # Walk the parfors by src line and print optimised structure - region_id = 0 - reported = [] - for line, info in sorted(lines.items()): - opt_ty, pf_id, adj = info - if opt_ty == 'fuse': - if pf_id not in reported: - region_id = print_fuse('f', pf_id, adj, 0, region_id) - elif opt_ty == 'nest': - region_id = print_nest(fadj, nadj, pf_id, reported, region_id) - else: - assert 0 - - # print the summary of the fuse/serialize rewrite - if summary: - for k, v in sorted(summary.items()): - msg = ('\n \nParallel region %s (loop #%s) had %s ' - 'loop(s) fused') - root = v['root'] - fused = v['fused'] - serialized = v['serialized'] - if serialized != 0: - msg += (' and %s loop(s) ' - 'serialized as part of the larger ' - 'parallel loop (#%s).') - print_wrapped(msg % (k, root, fused, serialized, root)) - else: - msg += '.' - print_wrapped(msg % (k, root, fused)) - else: - print_wrapped("Parallel structure is already optimal.") + all_lines = self.get_all_lines(parfors_simple) if print_pre_optimised: print(' Before Optimisation '.center(_termwidth,'-')) - print_unoptimised(all_lines) + self.print_unoptimised(all_lines) print(_termwidth * '-') if print_post_optimised: print(' After Optimisation '.center(_termwidth,'-')) - print_optimised(all_lines) + self.print_optimised(all_lines) print(_termwidth * '-') print_wrapped("") print_wrapped(_termwidth * '-') @@ -1277,60 +1342,10 @@ def print_fuse(ty, pf_id, adj, depth, region_id): print_wrapped('Loop invariant code motion'.center(80, '-')) if print_allocation_hoist: - found = False - print('Allocation hoisting:') - for pf_id, data in self.hoist_info.items(): - stmt = data.get('hoisted', []) - for inst in stmt: - if isinstance(inst.value, ir.Expr): - try: - attr = inst.value.attr - if attr == 'empty': - msg = ("The memory allocation derived from the " - 
"instruction at %s is hoisted out of the " - "parallel loop labelled #%s (it will be " - "performed before the loop is executed and " - "reused inside the loop):") - loc = inst.loc - print_wrapped(msg % (loc, pf_id)) - try: - path = os.path.relpath(loc.filename) - except ValueError: - path = os.path.abspath(loc.filename) - lines = linecache.getlines(path) - if lines and loc.line: - print_wrapped(" Allocation:: " + lines[0 if loc.line < 2 else loc.line - 1].strip()) - print_wrapped(" - numpy.empty() is used for the allocation.\n") - found = True - except (KeyError, AttributeError): - pass - if not found: - print_wrapped('No allocation hoisting found') - if print_instruction_hoist: - print("") - print('Instruction hoisting:') - hoist_info_printed = False - if self.hoist_info: - for pf_id, data in self.hoist_info.items(): - hoisted = data.get('hoisted', None) - not_hoisted = data.get('not_hoisted', None) - if not hoisted and not not_hoisted: - print("loop #%s has nothing to hoist." % pf_id) - continue - - print("loop #%s:" % pf_id) - if hoisted: - print(" Has the following hoisted:") - [print(" %s" % y) for y in hoisted] - hoist_info_printed = True - if not_hoisted: - print(" Failed to hoist the following:") - [print(" %s: %s" % (y, x)) for x, y in not_hoisted] - hoist_info_printed = True - if not hoist_info_printed: - print_wrapped('No instruction hoisting found') - print_wrapped(80 * '-') + self.allocation_hoist() + if print_instruction_hoist: + self.instruction_hoist() else: # there are no parfors print_wrapped('Function %s, %s, has no parallel for-loops.'.format(name, line)) From 27db7a1994ff8ecbe9dcfc43f68bbc5b9b0fab19 Mon Sep 17 00:00:00 2001 From: Elena Totmenina Date: Tue, 8 Dec 2020 15:52:23 +0300 Subject: [PATCH 09/19] Fix ParforDiagnostic errors (#137) Co-authored-by: etotmeni --- numba/parfors/parfor.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/numba/parfors/parfor.py b/numba/parfors/parfor.py index 
2632db89adb..b845b43bf2a 100644 --- a/numba/parfors/parfor.py +++ b/numba/parfors/parfor.py @@ -889,7 +889,7 @@ def get_parfors_simple(self, print_loop_search): if print_loop_search: print_wrapped(fmt % (pf.id, loc, r_pattern)) parfors_simple[pf.id] = (pf, loc, r_pattern) - return parfors_simple + return parfors_simple def get_all_lines(self, parfors_simple): # ensure adjacency lists are the same size for both sets of info @@ -983,6 +983,17 @@ def print_unoptimised(self, lines): # This prints the unoptimised parfors state sword = '+--' fac = len(sword) + fadj, froots = self.compute_graph_info(self.fusion_info) + nadj, _nroots = self.compute_graph_info(self.nested_fusion_info) + + if len(fadj) > len(nadj): + lim = len(fadj) + tmp = nadj + else: + lim = len(nadj) + tmp = fadj + for x in range(len(tmp), lim): + tmp.append([]) def print_nest(fadj_, nadj_, theroot, reported, region_id): def print_g(fadj_, nadj_, nroot, depth): @@ -1038,6 +1049,17 @@ def print_optimised(self, lines): # occurred during loop fusion and rewriting of loop nests sword = '+--' fac = len(sword) + fadj, froots = self.compute_graph_info(self.fusion_info) + nadj, _nroots = self.compute_graph_info(self.nested_fusion_info) + + if len(fadj) > len(nadj): + lim = len(fadj) + tmp = nadj + else: + lim = len(nadj) + tmp = fadj + for x in range(len(tmp), lim): + tmp.append([]) summary = dict() # region : {fused, serialized} From 2dabcd0ead3b8036115291fa156eda0265d1715f Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Wed, 9 Dec 2020 13:06:39 +0300 Subject: [PATCH 10/19] Initial support for Numpy subclasses Co-authored-by: Todd A. 
Anderson --- numba/_typeof.c | 13 ++++- numba/core/extending.py | 2 +- numba/core/ir_utils.py | 12 ++-- numba/core/pythonapi.py | 9 ++- numba/core/runtime/_nrt_python.c | 37 +++++++++++- numba/core/runtime/_nrt_pythonmod.c | 1 + numba/core/runtime/nrt.c | 90 +++++++++++++++++++++++------ numba/core/runtime/nrt.h | 24 +++++++- numba/core/runtime/nrt_external.h | 16 +++++ numba/core/types/npytypes.py | 4 +- numba/core/typing/npydecl.py | 21 +++++-- numba/np/arrayobj.py | 11 +++- 12 files changed, 203 insertions(+), 37 deletions(-) diff --git a/numba/_typeof.c b/numba/_typeof.c index ffe0e3a3c58..9b259164800 100644 --- a/numba/_typeof.c +++ b/numba/_typeof.c @@ -768,6 +768,7 @@ int typeof_typecode(PyObject *dispatcher, PyObject *val) { PyTypeObject *tyobj = Py_TYPE(val); + int no_subtype_attr; /* This needs to be kept in sync with Dispatcher.typeof_pyval(), * otherwise funny things may happen. */ @@ -794,9 +795,19 @@ typeof_typecode(PyObject *dispatcher, PyObject *val) return typecode_arrayscalar(dispatcher, val); } /* Array handling */ - else if (PyType_IsSubtype(tyobj, &PyArray_Type)) { + else if (tyobj == &PyArray_Type) { return typecode_ndarray(dispatcher, (PyArrayObject*)val); } + /* Subtypes of Array handling */ + else if (PyType_IsSubtype(tyobj, &PyArray_Type)) { + /* If the class has an attribute named __numba_no_subtype_ndarray then + don't treat it as a normal variant of a Numpy ndarray but as it's own + separate type. 
*/ + no_subtype_attr = PyObject_HasAttrString(val, "__numba_no_subtype_ndarray__"); + if (!no_subtype_attr) { + return typecode_ndarray(dispatcher, (PyArrayObject*)val); + } + } return typecode_using_fingerprint(dispatcher, val); } diff --git a/numba/core/extending.py b/numba/core/extending.py index 8d8d8525e21..09373708b48 100644 --- a/numba/core/extending.py +++ b/numba/core/extending.py @@ -14,7 +14,7 @@ lower_setattr, lower_setattr_generic, lower_cast) # noqa: F401 from numba.core.datamodel import models # noqa: F401 from numba.core.datamodel import register_default as register_model # noqa: F401, E501 -from numba.core.pythonapi import box, unbox, reflect, NativeValue # noqa: F401 +from numba.core.pythonapi import box, unbox, reflect, NativeValue, allocator # noqa: F401 from numba._helperlib import _import_cython_function # noqa: F401 from numba.core.serialize import ReduceMixin diff --git a/numba/core/ir_utils.py b/numba/core/ir_utils.py index 1d58c5c8b5b..9ffdfb16b07 100644 --- a/numba/core/ir_utils.py +++ b/numba/core/ir_utils.py @@ -64,6 +64,8 @@ def mk_alloc(typemap, calltypes, lhs, size_var, dtype, scope, loc): out = [] ndims = 1 size_typ = types.intp + # Get the type of the array being allocated. + arr_typ = typemap[lhs.name] if isinstance(size_var, tuple): if len(size_var) == 1: size_var = size_var[0] @@ -108,11 +110,13 @@ def mk_alloc(typemap, calltypes, lhs, size_var, dtype, scope, loc): typ_var_assign = ir.Assign(np_typ_getattr, typ_var, loc) alloc_call = ir.Expr.call(attr_var, [size_var, typ_var], (), loc) if calltypes: - calltypes[alloc_call] = typemap[attr_var.name].get_call_type( + cac = typemap[attr_var.name].get_call_type( typing.Context(), [size_typ, types.functions.NumberClass(dtype)], {}) - # signature( - # types.npytypes.Array(dtype, ndims, 'C'), size_typ, - # types.functions.NumberClass(dtype)) + # By default, all calls to "empty" are typed as returning a standard + # Numpy ndarray. 
If we are allocating a ndarray subclass here then + # just change the return type to be that of the subclass. + cac._return_type = arr_typ + calltypes[alloc_call] = cac alloc_assign = ir.Assign(alloc_call, lhs, loc) out.extend([g_np_assign, attr_assign, typ_var_assign, alloc_assign]) diff --git a/numba/core/pythonapi.py b/numba/core/pythonapi.py index 7901e761d9f..f84ad7b2ce1 100644 --- a/numba/core/pythonapi.py +++ b/numba/core/pythonapi.py @@ -45,10 +45,13 @@ def lookup(self, typeclass, default=None): _boxers = _Registry() _unboxers = _Registry() _reflectors = _Registry() +# Registry of special allocators for types. +_allocators = _Registry() box = _boxers.register unbox = _unboxers.register reflect = _reflectors.register +allocator = _allocators.register class _BoxContext(namedtuple("_BoxContext", ("context", "builder", "pyapi", "env_manager"))): @@ -1186,8 +1189,11 @@ def nrt_adapt_ndarray_to_python(self, aryty, ary, dtypeptr): assert self.context.enable_nrt, "NRT required" intty = ir.IntType(32) + # Embed the Python type of the array (maybe subclass) in the LLVM. 
+ serial_aryty_pytype = self.unserialize(self.serialize_object(aryty.py_type)) + fnty = Type.function(self.pyobj, - [self.voidptr, intty, intty, self.pyobj]) + [self.voidptr, self.pyobj, intty, intty, self.pyobj]) fn = self._get_function(fnty, name="NRT_adapt_ndarray_to_python") fn.args[0].add_attribute(lc.ATTR_NO_CAPTURE) @@ -1197,6 +1203,7 @@ def nrt_adapt_ndarray_to_python(self, aryty, ary, dtypeptr): aryptr = cgutils.alloca_once_value(self.builder, ary) return self.builder.call(fn, [self.builder.bitcast(aryptr, self.voidptr), + serial_aryty_pytype, ndim, writable, dtypeptr]) def nrt_meminfo_new_from_pyobject(self, data, pyobj): diff --git a/numba/core/runtime/_nrt_python.c b/numba/core/runtime/_nrt_python.c index 33620fd4f1a..efe4467df70 100644 --- a/numba/core/runtime/_nrt_python.c +++ b/numba/core/runtime/_nrt_python.c @@ -55,6 +55,8 @@ int MemInfo_init(MemInfoObject *self, PyObject *args, PyObject *kwds) { return -1; } raw_ptr = PyLong_AsVoidPtr(raw_ptr_obj); + NRT_Debug(nrt_debug_print("MemInfo_init self=%p raw_ptr=%p\n", self, raw_ptr)); + if(PyErr_Occurred()) return -1; self->meminfo = (NRT_MemInfo *)raw_ptr; assert (NRT_MemInfo_refcount(self->meminfo) > 0 && "0 refcount"); @@ -109,6 +111,27 @@ MemInfo_get_refcount(MemInfoObject *self, void *closure) { return PyLong_FromSize_t(refct); } +static +PyObject* +MemInfo_get_external_allocator(MemInfoObject *self, void *closure) { + void *p = NRT_MemInfo_external_allocator(self->meminfo); + printf("MemInfo_get_external_allocator %p\n", p); + return PyLong_FromVoidPtr(p); +} + +static +PyObject* +MemInfo_get_parent(MemInfoObject *self, void *closure) { + void *p = NRT_MemInfo_parent(self->meminfo); + if (p) { + Py_INCREF(p); + return (PyObject*)p; + } else { + Py_INCREF(Py_None); + return Py_None; + } +} + static void MemInfo_dealloc(MemInfoObject *self) { @@ -136,6 +159,13 @@ static PyGetSetDef MemInfo_getsets[] = { (getter)MemInfo_get_refcount, NULL, "Get the refcount", NULL}, + {"external_allocator", + 
(getter)MemInfo_get_external_allocator, NULL, + "Get the external allocator", + NULL}, + {"parent", + (getter)MemInfo_get_parent, NULL, + NULL}, {NULL} /* Sentinel */ }; @@ -286,7 +316,7 @@ PyObject* try_to_return_parent(arystruct_t *arystruct, int ndim, } NUMBA_EXPORT_FUNC(PyObject *) -NRT_adapt_ndarray_to_python(arystruct_t* arystruct, int ndim, +NRT_adapt_ndarray_to_python(arystruct_t* arystruct, PyTypeObject *retty, int ndim, int writeable, PyArray_Descr *descr) { PyArrayObject *array; @@ -324,10 +354,13 @@ NRT_adapt_ndarray_to_python(arystruct_t* arystruct, int ndim, args = PyTuple_New(1); /* SETITEM steals reference */ PyTuple_SET_ITEM(args, 0, PyLong_FromVoidPtr(arystruct->meminfo)); + NRT_Debug(nrt_debug_print("NRT_adapt_ndarray_to_python arystruct->meminfo=%p\n", arystruct->meminfo)); /* Note: MemInfo_init() does not incref. This function steals the * NRT reference. */ + NRT_Debug(nrt_debug_print("NRT_adapt_ndarray_to_python created MemInfo=%p\n", miobj)); if (MemInfo_init(miobj, args, NULL)) { + NRT_Debug(nrt_debug_print("MemInfo_init returned 0.\n")); return NULL; } Py_DECREF(args); @@ -336,7 +369,7 @@ NRT_adapt_ndarray_to_python(arystruct_t* arystruct, int ndim, shape = arystruct->shape_and_strides; strides = shape + ndim; Py_INCREF((PyObject *) descr); - array = (PyArrayObject *) PyArray_NewFromDescr(&PyArray_Type, descr, ndim, + array = (PyArrayObject *) PyArray_NewFromDescr(retty, descr, ndim, shape, strides, arystruct->data, flags, (PyObject *) miobj); diff --git a/numba/core/runtime/_nrt_pythonmod.c b/numba/core/runtime/_nrt_pythonmod.c index 31e1155fd9f..d1300ee8e9a 100644 --- a/numba/core/runtime/_nrt_pythonmod.c +++ b/numba/core/runtime/_nrt_pythonmod.c @@ -163,6 +163,7 @@ declmethod(MemInfo_alloc); declmethod(MemInfo_alloc_safe); declmethod(MemInfo_alloc_aligned); declmethod(MemInfo_alloc_safe_aligned); +declmethod(MemInfo_alloc_safe_aligned_external); declmethod(MemInfo_alloc_dtor_safe); declmethod(MemInfo_call_dtor); 
declmethod(MemInfo_new_varsize); diff --git a/numba/core/runtime/nrt.c b/numba/core/runtime/nrt.c index 534681d5417..fe63a691537 100644 --- a/numba/core/runtime/nrt.c +++ b/numba/core/runtime/nrt.c @@ -19,6 +19,7 @@ struct MemInfo { void *dtor_info; void *data; size_t size; /* only used for NRT allocated memory */ + NRT_ExternalAllocator *external_allocator; }; @@ -170,13 +171,16 @@ void NRT_MemSys_set_atomic_cas_stub(void) { */ void NRT_MemInfo_init(NRT_MemInfo *mi,void *data, size_t size, - NRT_dtor_function dtor, void *dtor_info) + NRT_dtor_function dtor, void *dtor_info, + NRT_ExternalAllocator *external_allocator) { mi->refct = 1; /* starts with 1 refct */ mi->dtor = dtor; mi->dtor_info = dtor_info; mi->data = data; mi->size = size; + mi->external_allocator = external_allocator; + NRT_Debug(nrt_debug_print("NRT_MemInfo_init mi=%p external_allocator=%p\n", mi, external_allocator)); /* Update stats */ TheMSys.atomic_inc(&TheMSys.stats_mi_alloc); } @@ -185,7 +189,8 @@ NRT_MemInfo *NRT_MemInfo_new(void *data, size_t size, NRT_dtor_function dtor, void *dtor_info) { NRT_MemInfo *mi = NRT_Allocate(sizeof(NRT_MemInfo)); - NRT_MemInfo_init(mi, data, size, dtor, dtor_info); + NRT_Debug(nrt_debug_print("NRT_MemInfo_new mi=%p\n", mi)); + NRT_MemInfo_init(mi, data, size, dtor, dtor_info, NULL); return mi; } @@ -206,9 +211,10 @@ void nrt_internal_dtor_safe(void *ptr, size_t size, void *info) { } static -void *nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo **mi_out) { +void *nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo **mi_out, NRT_ExternalAllocator *allocator) { NRT_MemInfo *mi; - char *base = NRT_Allocate(sizeof(NRT_MemInfo) + size); + NRT_Debug(nrt_debug_print("nrt_allocate_meminfo_and_data %p\n", allocator)); + char *base = NRT_Allocate_External(sizeof(NRT_MemInfo) + size, allocator); mi = (NRT_MemInfo *) base; *mi_out = mi; return base + sizeof(NRT_MemInfo); @@ -230,9 +236,17 @@ void nrt_internal_custom_dtor_safe(void *ptr, size_t size, void *info) { 
NRT_MemInfo *NRT_MemInfo_alloc(size_t size) { NRT_MemInfo *mi; - void *data = nrt_allocate_meminfo_and_data(size, &mi); + void *data = nrt_allocate_meminfo_and_data(size, &mi, NULL); NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc %p\n", data)); - NRT_MemInfo_init(mi, data, size, NULL, NULL); + NRT_MemInfo_init(mi, data, size, NULL, NULL, NULL); + return mi; +} + +NRT_MemInfo *NRT_MemInfo_alloc_external(size_t size, NRT_ExternalAllocator *allocator) { + NRT_MemInfo *mi; + void *data = nrt_allocate_meminfo_and_data(size, &mi, allocator); + NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc %p\n", data)); + NRT_MemInfo_init(mi, data, size, NULL, NULL, allocator); return mi; } @@ -242,22 +256,23 @@ NRT_MemInfo *NRT_MemInfo_alloc_safe(size_t size) { NRT_MemInfo* NRT_MemInfo_alloc_dtor_safe(size_t size, NRT_dtor_function dtor) { NRT_MemInfo *mi; - void *data = nrt_allocate_meminfo_and_data(size, &mi); + void *data = nrt_allocate_meminfo_and_data(size, &mi, NULL); /* Only fill up a couple cachelines with debug markers, to minimize overhead. 
*/ memset(data, 0xCB, MIN(size, 256)); NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc_dtor_safe %p %zu\n", data, size)); - NRT_MemInfo_init(mi, data, size, nrt_internal_custom_dtor_safe, dtor); + NRT_MemInfo_init(mi, data, size, nrt_internal_custom_dtor_safe, dtor, NULL); return mi; } static void *nrt_allocate_meminfo_and_data_align(size_t size, unsigned align, - NRT_MemInfo **mi) + NRT_MemInfo **mi, NRT_ExternalAllocator *allocator) { size_t offset, intptr, remainder; - char *base = nrt_allocate_meminfo_and_data(size + 2 * align, mi); + NRT_Debug(nrt_debug_print("nrt_allocate_meminfo_and_data_align %p\n", allocator)); + char *base = nrt_allocate_meminfo_and_data(size + 2 * align, mi, allocator); intptr = (size_t) base; /* See if we are aligned */ remainder = intptr % align; @@ -271,26 +286,48 @@ void *nrt_allocate_meminfo_and_data_align(size_t size, unsigned align, NRT_MemInfo *NRT_MemInfo_alloc_aligned(size_t size, unsigned align) { NRT_MemInfo *mi; - void *data = nrt_allocate_meminfo_and_data_align(size, align, &mi); + void *data = nrt_allocate_meminfo_and_data_align(size, align, &mi, NULL); NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc_aligned %p\n", data)); - NRT_MemInfo_init(mi, data, size, NULL, NULL); + NRT_MemInfo_init(mi, data, size, NULL, NULL, NULL); return mi; } NRT_MemInfo *NRT_MemInfo_alloc_safe_aligned(size_t size, unsigned align) { NRT_MemInfo *mi; - void *data = nrt_allocate_meminfo_and_data_align(size, align, &mi); + void *data = nrt_allocate_meminfo_and_data_align(size, align, &mi, NULL); /* Only fill up a couple cachelines with debug markers, to minimize overhead. 
*/ memset(data, 0xCB, MIN(size, 256)); NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc_safe_aligned %p %zu\n", data, size)); - NRT_MemInfo_init(mi, data, size, nrt_internal_dtor_safe, (void*)size); + NRT_MemInfo_init(mi, data, size, nrt_internal_dtor_safe, (void*)size, NULL); return mi; } +NRT_MemInfo *NRT_MemInfo_alloc_safe_aligned_external(size_t size, unsigned align, NRT_ExternalAllocator *allocator) { + NRT_MemInfo *mi; + NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc_safe_aligned_external %p\n", allocator)); + void *data = nrt_allocate_meminfo_and_data_align(size, align, &mi, allocator); + /* Only fill up a couple cachelines with debug markers, to minimize + overhead. */ + memset(data, 0xCB, MIN(size, 256)); + NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc_safe_aligned %p %zu\n", + data, size)); + NRT_MemInfo_init(mi, data, size, nrt_internal_dtor_safe, (void*)size, allocator); + return mi; +} + +void NRT_dealloc(NRT_MemInfo *mi) { + NRT_Debug(nrt_debug_print("NRT_dealloc meminfo: %p external_allocator: %p\n", mi, mi->external_allocator)); + if (mi->external_allocator) { + mi->external_allocator->free(mi, mi->external_allocator->opaque_data); + } else { + NRT_Free(mi); + } +} + void NRT_MemInfo_destroy(NRT_MemInfo *mi) { - NRT_Free(mi); + NRT_dealloc(mi); TheMSys.atomic_inc(&TheMSys.stats_mi_free); } @@ -328,6 +365,14 @@ size_t NRT_MemInfo_size(NRT_MemInfo* mi) { return mi->size; } +void * NRT_MemInfo_external_allocator(NRT_MemInfo *mi) { + NRT_Debug(nrt_debug_print("NRT_MemInfo_external_allocator meminfo: %p external_allocator: %p\n", mi, mi->external_allocator)); + return mi->external_allocator; +} + +void *NRT_MemInfo_parent(NRT_MemInfo *mi) { + return mi->dtor_info; +} void NRT_MemInfo_dump(NRT_MemInfo *mi, FILE *out) { fprintf(out, "MemInfo %p refcount %zu\n", mi, mi->refct); @@ -414,8 +459,18 @@ void NRT_MemInfo_varsize_free(NRT_MemInfo *mi, void *ptr) */ void* NRT_Allocate(size_t size) { - void *ptr = TheMSys.allocator.malloc(size); - 
NRT_Debug(nrt_debug_print("NRT_Allocate bytes=%zu ptr=%p\n", size, ptr)); + return NRT_Allocate_External(size, NULL); +} + +void* NRT_Allocate_External(size_t size, NRT_ExternalAllocator *allocator) { + void *ptr; + if (allocator) { + ptr = allocator->malloc(size, allocator->opaque_data); + NRT_Debug(nrt_debug_print("NRT_Allocate custom bytes=%zu ptr=%p\n", size, ptr)); + } else { + ptr = TheMSys.allocator.malloc(size); + NRT_Debug(nrt_debug_print("NRT_Allocate bytes=%zu ptr=%p\n", size, ptr)); + } TheMSys.atomic_inc(&TheMSys.stats_alloc); return ptr; } @@ -460,6 +515,7 @@ NRT_MemInfo* nrt_manage_memory(void *data, NRT_managed_dtor dtor) { static const NRT_api_functions nrt_functions_table = { NRT_MemInfo_alloc, + NRT_MemInfo_alloc_external, nrt_manage_memory, NRT_MemInfo_acquire, NRT_MemInfo_release, diff --git a/numba/core/runtime/nrt.h b/numba/core/runtime/nrt.h index 3c74dc58f58..9fb23532964 100644 --- a/numba/core/runtime/nrt.h +++ b/numba/core/runtime/nrt.h @@ -15,13 +15,14 @@ All functions described here are threadsafe. 
/* Debugging facilities - enabled at compile-time */ /* #undef NDEBUG */ #if 0 -# define NRT_Debug(X) X +# define NRT_Debug(X) {X; fflush(stdout); } #else # define NRT_Debug(X) if (0) { X; } #endif /* TypeDefs */ typedef void (*NRT_dtor_function)(void *ptr, size_t size, void *info); +typedef void (*NRT_dealloc_func)(void *ptr, void *dealloc_info); typedef size_t (*NRT_atomic_inc_dec_func)(size_t *ptr); typedef int (*NRT_atomic_cas_func)(void * volatile *ptr, void *cmp, void *repl, void **oldptr); @@ -32,7 +33,6 @@ typedef void *(*NRT_malloc_func)(size_t size); typedef void *(*NRT_realloc_func)(void *ptr, size_t new_size); typedef void (*NRT_free_func)(void *ptr); - /* Memory System API */ /* Initialize the memory system */ @@ -101,7 +101,8 @@ NRT_MemInfo* NRT_MemInfo_new(void *data, size_t size, VISIBILITY_HIDDEN void NRT_MemInfo_init(NRT_MemInfo *mi, void *data, size_t size, - NRT_dtor_function dtor, void *dtor_info); + NRT_dtor_function dtor, void *dtor_info, + NRT_ExternalAllocator *external_allocator); /* * Returns the refcount of a MemInfo or (size_t)-1 if error. @@ -116,6 +117,8 @@ size_t NRT_MemInfo_refcount(NRT_MemInfo *mi); VISIBILITY_HIDDEN NRT_MemInfo *NRT_MemInfo_alloc(size_t size); +NRT_MemInfo *NRT_MemInfo_alloc_external(size_t size, NRT_ExternalAllocator *allocator); + /* * The "safe" NRT_MemInfo_alloc performs additional steps to help debug * memory errors. @@ -141,6 +144,8 @@ NRT_MemInfo *NRT_MemInfo_alloc_aligned(size_t size, unsigned align); VISIBILITY_HIDDEN NRT_MemInfo *NRT_MemInfo_alloc_safe_aligned(size_t size, unsigned align); +NRT_MemInfo *NRT_MemInfo_alloc_safe_aligned_external(size_t size, unsigned align, NRT_ExternalAllocator *allocator); + /* * Internal API. * Release a MemInfo. Calls NRT_MemSys_insert_meminfo. 
@@ -179,6 +184,18 @@ void* NRT_MemInfo_data(NRT_MemInfo* mi); VISIBILITY_HIDDEN size_t NRT_MemInfo_size(NRT_MemInfo* mi); +/* + * Returns the external allocator + */ +VISIBILITY_HIDDEN +void* NRT_MemInfo_external_allocator(NRT_MemInfo* mi); + +/* + * Returns the parent MemInfo + */ +VISIBILITY_HIDDEN +void* NRT_MemInfo_parent(NRT_MemInfo* mi); + /* * NRT API for resizable buffers. @@ -207,6 +224,7 @@ void NRT_MemInfo_dump(NRT_MemInfo *mi, FILE *out); * Allocate memory of `size` bytes. */ VISIBILITY_HIDDEN void* NRT_Allocate(size_t size); +VISIBILITY_HIDDEN void* NRT_Allocate_External(size_t size, NRT_ExternalAllocator *allocator); /* * Deallocate memory pointed by `ptr`. diff --git a/numba/core/runtime/nrt_external.h b/numba/core/runtime/nrt_external.h index 391b6fa1b0e..a4835c36f67 100644 --- a/numba/core/runtime/nrt_external.h +++ b/numba/core/runtime/nrt_external.h @@ -7,6 +7,18 @@ typedef struct MemInfo NRT_MemInfo; typedef void NRT_managed_dtor(void *data); +typedef void *(*NRT_external_malloc_func)(size_t size, void *opaque_data); +typedef void *(*NRT_external_realloc_func)(void *ptr, size_t new_size, void *opaque_data); +typedef void (*NRT_external_free_func)(void *ptr, void *opaque_data); + +struct ExternalMemAllocator { + NRT_external_malloc_func malloc; + NRT_external_realloc_func realloc; + NRT_external_free_func free; + void *opaque_data; +}; + +typedef struct ExternalMemAllocator NRT_ExternalAllocator; typedef struct { /* Methods to create MemInfos. @@ -21,6 +33,10 @@ typedef struct { Returning a new reference. */ NRT_MemInfo* (*allocate)(size_t nbytes); + /* Allocator memory using an external allocator but still using Numba's MemInfo. + + */ + NRT_MemInfo* (*allocate_external)(size_t nbytes, NRT_ExternalAllocator *allocator); /* Convert externally allocated memory into a MemInfo. 
diff --git a/numba/core/types/npytypes.py b/numba/core/types/npytypes.py index 6f6307c5526..3c2191ca23e 100644 --- a/numba/core/types/npytypes.py +++ b/numba/core/types/npytypes.py @@ -8,6 +8,7 @@ from numba.core import utils from .misc import UnicodeType from .containers import Bytes +import numpy as np class CharSeq(Type): """ @@ -394,8 +395,9 @@ class Array(Buffer): Type class for Numpy arrays. """ - def __init__(self, dtype, ndim, layout, readonly=False, name=None, + def __init__(self, dtype, ndim, layout, py_type=np.ndarray, readonly=False, name=None, aligned=True, addrspace=None): + self.py_type = py_type if readonly: self.mutable = False if (not aligned or diff --git a/numba/core/typing/npydecl.py b/numba/core/typing/npydecl.py index 2dbbed39be9..e7ecf452fe9 100644 --- a/numba/core/typing/npydecl.py +++ b/numba/core/typing/npydecl.py @@ -126,7 +126,21 @@ def generic(self, args, kws): ret_tys = ufunc_loop.outputs[-implicit_output_count:] if ndims > 0: assert layout is not None - ret_tys = [types.Array(dtype=ret_ty, ndim=ndims, layout=layout) + # If either of the types involved in the ufunc operation have a + # __array_ufunc__ method then invoke the first such one to + # determine the output type of the ufunc. + array_ufunc_type = None + for a in args: + if hasattr(a, "__array_ufunc__"): + array_ufunc_type = a + break + output_type = types.Array + if array_ufunc_type is not None: + output_type = array_ufunc_type.__array_ufunc__(ufunc, "__call__", *args, **kws) + # Eventually better error handling! FIX ME! + assert(output_type is not None) + + ret_tys = [output_type(dtype=ret_ty, ndim=ndims, layout=layout) for ret_ty in ret_tys] ret_tys = [resolve_output_type(self.context, args, ret_ty) for ret_ty in ret_tys] @@ -517,6 +531,7 @@ def typer(shape, dtype=None): @infer_global(np.empty_like) @infer_global(np.zeros_like) +@infer_global(np.ones_like) class NdConstructorLike(CallableTemplate): """ Typing template for np.empty_like(), .zeros_like(), .ones_like(). 
@@ -544,9 +559,6 @@ def typer(arg, dtype=None): return typer -infer_global(np.ones_like)(NdConstructorLike) - - @infer_global(np.full) class NdFull(CallableTemplate): @@ -563,6 +575,7 @@ def typer(shape, fill_value, dtype=None): return typer + @infer_global(np.full_like) class NdFullLike(CallableTemplate): diff --git a/numba/np/arrayobj.py b/numba/np/arrayobj.py index 933b1c6565e..5749e7d9b5b 100644 --- a/numba/np/arrayobj.py +++ b/numba/np/arrayobj.py @@ -32,7 +32,7 @@ from numba.misc import quicksort, mergesort from numba.cpython import slicing from numba.cpython.unsafe.tuple import tuple_setitem - +from numba.core.pythonapi import _allocators def set_range_metadata(builder, load, lower_bound, upper_bound): """ @@ -3399,8 +3399,13 @@ def _empty_nd_impl(context, builder, arrtype, shapes): ) align = context.get_preferred_array_alignment(arrtype.dtype) - meminfo = context.nrt.meminfo_alloc_aligned(builder, size=allocsize, - align=align) + def alloc_unsupported(context, builder, size, align): + return context.nrt.meminfo_alloc_aligned(builder, size, align) + + # See if the type has a special allocator, if not use the default + # alloc_unsuppported allocator above. + allocator_impl = _allocators.lookup(arrtype.__class__, alloc_unsupported) + meminfo = allocator_impl(context, builder, size=allocsize, align=align) data = context.nrt.meminfo_data(builder, meminfo) From 864790061aa7548ac2b7ebcf448354def849e657 Mon Sep 17 00:00:00 2001 From: "Todd A. Anderson" Date: Fri, 11 Dec 2020 14:47:26 -0600 Subject: [PATCH 11/19] Remove printf. 
--- numba/core/runtime/_nrt_python.c | 1 - 1 file changed, 1 deletion(-) diff --git a/numba/core/runtime/_nrt_python.c b/numba/core/runtime/_nrt_python.c index efe4467df70..9012498292a 100644 --- a/numba/core/runtime/_nrt_python.c +++ b/numba/core/runtime/_nrt_python.c @@ -115,7 +115,6 @@ static PyObject* MemInfo_get_external_allocator(MemInfoObject *self, void *closure) { void *p = NRT_MemInfo_external_allocator(self->meminfo); - printf("MemInfo_get_external_allocator %p\n", p); return PyLong_FromVoidPtr(p); } From 2d5f18c17d60b5281b0669bb61fa39682f000481 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Thu, 12 Nov 2020 13:04:24 +0300 Subject: [PATCH 12/19] Remove the misspelling of finalize_dynamic_globals (numba#6466) This commit fixes misspelling of _finalize_dynamic_globals(). This function is used only in the same file. --- numba/core/codegen.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/numba/core/codegen.py b/numba/core/codegen.py index 1e05c12bacd..e22fa89012e 100644 --- a/numba/core/codegen.py +++ b/numba/core/codegen.py @@ -247,7 +247,7 @@ def finalize(self): self._final_module.verify() self._finalize_final_module() - def _finalize_dyanmic_globals(self): + def _finalize_dynamic_globals(self): # Scan for dynamic globals for gv in self._final_module.global_variables: if gv.name.startswith('numba.dynamic.globals'): @@ -265,7 +265,7 @@ def _finalize_final_module(self): """ Make the underlying LLVM module ready to use. """ - self._finalize_dyanmic_globals() + self._finalize_dynamic_globals() self._verify_declare_only_symbols() # Remember this on the module, for the object cache hooks From 42c24777598b7efb4a635929897fdf2708dab7e0 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Fri, 13 Nov 2020 04:56:53 -0600 Subject: [PATCH 13/19] Patch for addrspace (numba#6469) This commit adds modification introduced about addrspace from ce96c20 by @DrTodd13 and test fixed in 947b407 by @reazulhoque. 
This commit could be proposed to upstream. --- numba/core/datamodel/models.py | 2 +- numba/core/types/common.py | 3 ++- numba/core/types/misc.py | 5 +++-- numba/core/types/npytypes.py | 12 +++++++----- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/numba/core/datamodel/models.py b/numba/core/datamodel/models.py index 77b2fc90a21..4049d7c0e60 100644 --- a/numba/core/datamodel/models.py +++ b/numba/core/datamodel/models.py @@ -865,7 +865,7 @@ def __init__(self, dmm, fe_type): ('parent', types.pyobject), ('nitems', types.intp), ('itemsize', types.intp), - ('data', types.CPointer(fe_type.dtype)), + ('data', types.CPointer(fe_type.dtype, addrspace=fe_type.addrspace)), ('shape', types.UniTuple(types.intp, ndim)), ('strides', types.UniTuple(types.intp, ndim)), diff --git a/numba/core/types/common.py b/numba/core/types/common.py index f1806540c60..f054d62652e 100644 --- a/numba/core/types/common.py +++ b/numba/core/types/common.py @@ -45,7 +45,7 @@ class Buffer(IterableType, ArrayCompatible): # CS and FS are not reserved for inner contig but strided LAYOUTS = frozenset(['C', 'F', 'CS', 'FS', 'A']) - def __init__(self, dtype, ndim, layout, readonly=False, name=None): + def __init__(self, dtype, ndim, layout, readonly=False, name=None, addrspace=None): from .misc import unliteral if isinstance(dtype, Buffer): @@ -55,6 +55,7 @@ def __init__(self, dtype, ndim, layout, readonly=False, name=None): self.dtype = unliteral(dtype) self.ndim = ndim self.layout = layout + self.addrspace = addrspace if readonly: self.mutable = False if name is None: diff --git a/numba/core/types/misc.py b/numba/core/types/misc.py index 34d8f3a51bf..a2445d967d3 100644 --- a/numba/core/types/misc.py +++ b/numba/core/types/misc.py @@ -155,14 +155,15 @@ class CPointer(Type): """ mutable = True - def __init__(self, dtype): + def __init__(self, dtype, addrspace=None): self.dtype = dtype + self.addrspace = addrspace name = "%s*" % dtype super(CPointer, self).__init__(name) @property def 
key(self): - return self.dtype + return self.dtype, self.addrspace class EphemeralPointer(CPointer): diff --git a/numba/core/types/npytypes.py b/numba/core/types/npytypes.py index c7c312adce3..6f6307c5526 100644 --- a/numba/core/types/npytypes.py +++ b/numba/core/types/npytypes.py @@ -395,7 +395,7 @@ class Array(Buffer): """ def __init__(self, dtype, ndim, layout, readonly=False, name=None, - aligned=True): + aligned=True, addrspace=None): if readonly: self.mutable = False if (not aligned or @@ -408,7 +408,7 @@ def __init__(self, dtype, ndim, layout, readonly=False, name=None, if not self.aligned: type_name = "unaligned " + type_name name = "%s(%s, %sd, %s)" % (type_name, dtype, ndim, layout) - super(Array, self).__init__(dtype, ndim, layout, name=name) + super(Array, self).__init__(dtype, ndim, layout, name=name, addrspace=addrspace) @property def mangling_args(self): @@ -417,7 +417,7 @@ def mangling_args(self): 'aligned' if self.aligned else 'unaligned'] return self.__class__.__name__, args - def copy(self, dtype=None, ndim=None, layout=None, readonly=None): + def copy(self, dtype=None, ndim=None, layout=None, readonly=None, addrspace=None): if dtype is None: dtype = self.dtype if ndim is None: @@ -426,12 +426,14 @@ def copy(self, dtype=None, ndim=None, layout=None, readonly=None): layout = self.layout if readonly is None: readonly = not self.mutable + if addrspace is None: + addrspace = self.addrspace return Array(dtype=dtype, ndim=ndim, layout=layout, readonly=readonly, - aligned=self.aligned) + aligned=self.aligned, addrspace=addrspace) @property def key(self): - return self.dtype, self.ndim, self.layout, self.mutable, self.aligned + return self.dtype, self.ndim, self.layout, self.mutable, self.aligned, self.addrspace def unify(self, typingctx, other): """ From b095ac2c2ea2a307700ec3b4f4c1a55c509b0d0b Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Fri, 13 Nov 2020 04:56:55 -0600 Subject: [PATCH 14/19] Patch for change numpy functions mapping 
(numba#6467, numba#6468) Changed in 8ccfd36 and a77eab6 by @reazulhoque. Possibly this 2 changes are not related to each other. --- numba/np/npyimpl.py | 7 ++++++- numba/parfors/parfor.py | 5 +++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/numba/np/npyimpl.py b/numba/np/npyimpl.py index d1e65e72e80..1763b6a3f42 100644 --- a/numba/np/npyimpl.py +++ b/numba/np/npyimpl.py @@ -448,7 +448,12 @@ def __init__(self, context, builder, outer_sig): super(_KernelImpl, self).__init__(context, builder, outer_sig) loop = ufunc_find_matching_loop( ufunc, outer_sig.args + tuple(_unpack_output_types(ufunc, outer_sig))) - self.fn = ufunc_db.get_ufunc_info(ufunc).get(loop.ufunc_sig) + + if hasattr(context, 'ufunc_db'): + self.fn = context.ufunc_db[ufunc].get(loop.ufunc_sig) + else: + self.fn = ufunc_db.get_ufunc_info(ufunc).get(loop.ufunc_sig) + self.inner_sig = _ufunc_loop_sig(loop.outputs, loop.inputs) if self.fn is None: diff --git a/numba/parfors/parfor.py b/numba/parfors/parfor.py index 0f23e69843a..c73d99905dd 100644 --- a/numba/parfors/parfor.py +++ b/numba/parfors/parfor.py @@ -1350,7 +1350,7 @@ class PreParforPass(object): implementations of numpy functions if available. 
""" def __init__(self, func_ir, typemap, calltypes, typingctx, options, - swapped={}): + swapped={}, replace_functions_map=replace_functions_map): self.func_ir = func_ir self.typemap = typemap self.calltypes = calltypes @@ -1358,6 +1358,7 @@ def __init__(self, func_ir, typemap, calltypes, typingctx, options, self.options = options # diagnostics self.swapped = swapped + self.replace_functions_map = replace_functions_map self.stats = { 'replaced_func': 0, 'replaced_dtype': 0, @@ -1394,7 +1395,7 @@ def _replace_parallel_functions(self, blocks): def replace_func(): func_def = get_definition(self.func_ir, expr.func) callname = find_callname(self.func_ir, expr) - repl_func = replace_functions_map.get(callname, None) + repl_func = self.replace_functions_map.get(callname, None) # Handle method on array type if (repl_func is None and len(callname) == 2 and From 28d8d06dbb20e2d41e21274779d1e0812a246496 Mon Sep 17 00:00:00 2001 From: Elena Totmenina Date: Fri, 4 Dec 2020 14:26:59 +0300 Subject: [PATCH 15/19] Refactoring for ParforDiagnostics dump function (numba#6545) Co-authored-by: etotmeni --- numba/parfors/parfor.py | 647 +++++++++++++++++++++------------------- 1 file changed, 342 insertions(+), 305 deletions(-) diff --git a/numba/parfors/parfor.py b/numba/parfors/parfor.py index c73d99905dd..b845b43bf2a 100644 --- a/numba/parfors/parfor.py +++ b/numba/parfors/parfor.py @@ -862,6 +862,340 @@ def sort_pf_by_line(self, pf_id, parfors_simple): pass return line + def get_parfors_simple(self, print_loop_search): + parfors_simple = dict() + + # print in line order, parfors loop id is based on discovery order + for pf in sorted(self.initial_parfors, key=lambda x: x.loc.line): + # use 0 here, the parfors are mutated by the time this routine + # is called, however, fusion appends the patterns so we can just + # pull in the first as a "before fusion" emulation + r_pattern = pf.patterns[0] + pattern = pf.patterns[0] + loc = pf.loc + if isinstance(pattern, tuple): + if pattern[0] == 
'prange': + if pattern[1] == 'internal': + replfn = '.'.join(reversed(list(pattern[2][0]))) + loc = pattern[2][1] + r_pattern = '%s %s' % (replfn, '(internal parallel version)') + elif pattern[1] == 'user': + r_pattern = "user defined prange" + elif pattern[1] == 'pndindex': + r_pattern = "internal pndindex" #FIXME: trace this! + else: + assert 0 + fmt = 'Parallel for-loop #%s: is produced from %s:\n %s\n \n' + if print_loop_search: + print_wrapped(fmt % (pf.id, loc, r_pattern)) + parfors_simple[pf.id] = (pf, loc, r_pattern) + return parfors_simple + + def get_all_lines(self, parfors_simple): + # ensure adjacency lists are the same size for both sets of info + # (nests and fusion may not traverse the same space, for + # convenience [] is used as a condition to halt recursion) + fadj, froots = self.compute_graph_info(self.fusion_info) + nadj, _nroots = self.compute_graph_info(self.nested_fusion_info) + + if len(fadj) > len(nadj): + lim = len(fadj) + tmp = nadj + else: + lim = len(nadj) + tmp = fadj + for x in range(len(tmp), lim): + tmp.append([]) + + # This computes the roots of true loop nests (i.e. loops containing + # loops opposed to just a loop that's a root). 
+ nroots = set() + if _nroots: + for r in _nroots: + if nadj[r] != []: + nroots.add(r) + all_roots = froots ^ nroots + + # This computes all the parfors at the top level that are either: + # - roots of loop fusion + # - roots of true loop nests + # it then combines these based on source line number for ease of + # producing output ordered in a manner similar to the code structure + froots_lines = {} + for x in froots: + line = self.sort_pf_by_line(x, parfors_simple) + froots_lines[line] = 'fuse', x, fadj + + nroots_lines = {} + for x in nroots: + line = self.sort_pf_by_line(x, parfors_simple) + nroots_lines[line] = 'nest', x, nadj + + all_lines = froots_lines.copy() + all_lines.update(nroots_lines) + return all_lines + + def source_listing(self, parfors_simple, purpose_str): + filename = self.func_ir.loc.filename + count = self.count_parfors() + func_name = self.func_ir.func_id.func + try: + lines = inspect.getsource(func_name).splitlines() + except OSError: # generated function + lines = None + if lines and parfors_simple: + src_width = max([len(x) for x in lines]) + map_line_to_pf = defaultdict(list) # parfors can alias lines + for k, v in parfors_simple.items(): + # TODO: do a better job of tracking parfors that are not in + # this file but are referred to, e.g. np.arange() + if parfors_simple[k][1].filename == filename: + match_line = self.sort_pf_by_line(k, parfors_simple) + map_line_to_pf[match_line].append(str(k)) + + max_pf_per_line = max([1] + [len(x) for x in map_line_to_pf.values()]) + width = src_width + (1 + max_pf_per_line * (len(str(count)) + 2)) + newlines = [] + newlines.append('\n') + newlines.append('Parallel loop listing for %s' % purpose_str) + newlines.append(width * '-' + '|loop #ID') + fmt = '{0:{1}}| {2}' + # why are these off by 1? 
+ lstart = max(0, self.func_ir.loc.line - 1) + for no, line in enumerate(lines, lstart): + pf_ids = map_line_to_pf.get(no, None) + if pf_ids is not None: + pfstr = '#' + ', '.join(pf_ids) + else: + pfstr = '' + stripped = line.strip('\n') + srclen = len(stripped) + if pf_ids: + l = fmt.format(width * '-', width, pfstr) + else: + l = fmt.format(width * ' ', width, pfstr) + newlines.append(stripped + l[srclen:]) + print('\n'.join(newlines)) + else: + print("No source available") + + def print_unoptimised(self, lines): + # This prints the unoptimised parfors state + sword = '+--' + fac = len(sword) + fadj, froots = self.compute_graph_info(self.fusion_info) + nadj, _nroots = self.compute_graph_info(self.nested_fusion_info) + + if len(fadj) > len(nadj): + lim = len(fadj) + tmp = nadj + else: + lim = len(nadj) + tmp = fadj + for x in range(len(tmp), lim): + tmp.append([]) + + def print_nest(fadj_, nadj_, theroot, reported, region_id): + def print_g(fadj_, nadj_, nroot, depth): + print_wrapped(fac * depth * ' ' + '%s%s %s' % (sword, nroot, '(parallel)')) + for k in nadj_[nroot]: + if nadj_[k] == []: + msg = [] + msg.append(fac * (depth + 1) * ' ' + '%s%s %s' % (sword, k, '(parallel)')) + if fadj_[k] != [] and k not in reported: + fused = self.reachable_nodes(fadj_, k) + for i in fused: + msg.append(fac * (depth + 1) * ' ' + '%s%s %s' % (sword, i, '(parallel)')) + reported.append(k) + print_wrapped('\n'.join(msg)) + else: + print_g(fadj_, nadj_, k, depth + 1) + + if nadj_[theroot] != []: + print_wrapped("Parallel region %s:" % region_id) + print_g(fadj_, nadj_, theroot, 0) + print("\n") + region_id = region_id + 1 + return region_id + + def print_fuse(ty, pf_id, adj, depth, region_id): + msg = [] + print_wrapped("Parallel region %s:" % region_id) + msg.append(fac * depth * ' ' + '%s%s %s' % (sword, pf_id, '(parallel)')) + if adj[pf_id] != []: + fused = sorted(self.reachable_nodes(adj, pf_id)) + for k in fused: + msg.append(fac * depth * ' ' + '%s%s %s' % (sword, k, 
'(parallel)')) + region_id = region_id + 1 + print_wrapped('\n'.join(msg)) + print("\n") + return region_id + + # Walk the parfors by src line and print optimised structure + region_id = 0 + reported = [] + for line, info in sorted(lines.items()): + opt_ty, pf_id, adj = info + if opt_ty == 'fuse': + if pf_id not in reported: + region_id = print_fuse('f', pf_id, adj, 0, region_id) + elif opt_ty == 'nest': + region_id = print_nest(fadj, nadj, pf_id, reported, region_id) + else: + assert 0 + + def print_optimised(self, lines): + # This prints the optimised output based on the transforms that + # occurred during loop fusion and rewriting of loop nests + sword = '+--' + fac = len(sword) + fadj, froots = self.compute_graph_info(self.fusion_info) + nadj, _nroots = self.compute_graph_info(self.nested_fusion_info) + + if len(fadj) > len(nadj): + lim = len(fadj) + tmp = nadj + else: + lim = len(nadj) + tmp = fadj + for x in range(len(tmp), lim): + tmp.append([]) + + summary = dict() + # region : {fused, serialized} + + def print_nest(fadj_, nadj_, theroot, reported, region_id): + def print_g(fadj_, nadj_, nroot, depth): + for k in nadj_[nroot]: + msg = fac * depth * ' ' + '%s%s %s' % (sword, k, '(serial') + if nadj_[k] == []: + fused = [] + if fadj_[k] != [] and k not in reported: + fused = sorted(self.reachable_nodes(fadj_, k)) + msg += ", fused with loop(s): " + msg += ', '.join([str(x) for x in fused]) + msg += ')' + reported.append(k) + print_wrapped(msg) + summary[region_id]['fused'] += len(fused) + else: + print_wrapped(msg + ')') + print_g(fadj_, nadj_, k, depth + 1) + summary[region_id]['serialized'] += 1 + + if nadj_[theroot] != []: + print_wrapped("Parallel region %s:" % region_id) + print_wrapped('%s%s %s' % (sword, theroot, '(parallel)')) + summary[region_id] = {'root': theroot, 'fused': 0, 'serialized': 0} + print_g(fadj_, nadj_, theroot, 1) + print("\n") + region_id = region_id + 1 + return region_id + + def print_fuse(ty, pf_id, adj, depth, region_id): + 
print_wrapped("Parallel region %s:" % region_id) + msg = fac * depth * ' ' + '%s%s %s' % (sword, pf_id, '(parallel') + fused = [] + if adj[pf_id] != []: + fused = sorted(self.reachable_nodes(adj, pf_id)) + msg += ", fused with loop(s): " + msg += ', '.join([str(x) for x in fused]) + + summary[region_id] = {'root': pf_id, 'fused': len(fused), 'serialized': 0} + msg += ')' + print_wrapped(msg) + print("\n") + region_id = region_id + 1 + return region_id + + # Walk the parfors by src line and print optimised structure + region_id = 0 + reported = [] + for line, info in sorted(lines.items()): + opt_ty, pf_id, adj = info + if opt_ty == 'fuse': + if pf_id not in reported: + region_id = print_fuse('f', pf_id, adj, 0, region_id) + elif opt_ty == 'nest': + region_id = print_nest(fadj, nadj, pf_id, reported, region_id) + else: + assert 0 + + # print the summary of the fuse/serialize rewrite + if summary: + for k, v in sorted(summary.items()): + msg = ('\n \nParallel region %s (loop #%s) had %s ' + 'loop(s) fused') + root = v['root'] + fused = v['fused'] + serialized = v['serialized'] + if serialized != 0: + msg += (' and %s loop(s) ' + 'serialized as part of the larger ' + 'parallel loop (#%s).') + print_wrapped(msg % (k, root, fused, serialized, root)) + else: + msg += '.' 
+ print_wrapped(msg % (k, root, fused)) + else: + print_wrapped("Parallel structure is already optimal.") + + def allocation_hoist(self): + found = False + print('Allocation hoisting:') + for pf_id, data in self.hoist_info.items(): + stmt = data.get('hoisted', []) + for inst in stmt: + if isinstance(inst.value, ir.Expr): + try: + attr = inst.value.attr + if attr == 'empty': + msg = ("The memory allocation derived from the " + "instruction at %s is hoisted out of the " + "parallel loop labelled #%s (it will be " + "performed before the loop is executed and " + "reused inside the loop):") + loc = inst.loc + print_wrapped(msg % (loc, pf_id)) + try: + path = os.path.relpath(loc.filename) + except ValueError: + path = os.path.abspath(loc.filename) + lines = linecache.getlines(path) + if lines and loc.line: + print_wrapped(" Allocation:: " + lines[0 if loc.line < 2 else loc.line - 1].strip()) + print_wrapped(" - numpy.empty() is used for the allocation.\n") + found = True + except (KeyError, AttributeError): + pass + if not found: + print_wrapped('No allocation hoisting found') + + def instruction_hoist(self): + print("") + print('Instruction hoisting:') + hoist_info_printed = False + if self.hoist_info: + for pf_id, data in self.hoist_info.items(): + hoisted = data.get('hoisted', None) + not_hoisted = data.get('not_hoisted', None) + if not hoisted and not not_hoisted: + print("loop #%s has nothing to hoist." 
% pf_id) + continue + + print("loop #%s:" % pf_id) + if hoisted: + print(" Has the following hoisted:") + [print(" %s" % y) for y in hoisted] + hoist_info_printed = True + if not_hoisted: + print(" Failed to hoist the following:") + [print(" %s: %s" % (y, x)) for x, y in not_hoisted] + hoist_info_printed = True + if not hoist_info_printed: + print_wrapped('No instruction hoisting found') + print_wrapped(80 * '-') + def dump(self, level=1): if not self.has_setup: raise RuntimeError("self.setup has not been called") @@ -919,33 +1253,7 @@ def dump(self, level=1): #----------- search section if print_loop_search: print_wrapped('Looking for parallel loops'.center(_termwidth, '-')) - - parfors_simple = dict() - - # print in line order, parfors loop id is based on discovery order - for pf in sorted(self.initial_parfors, key=lambda x: x.loc.line): - # use 0 here, the parfors are mutated by the time this routine - # is called, however, fusion appends the patterns so we can just - # pull in the first as a "before fusion" emulation - r_pattern = pf.patterns[0] - pattern = pf.patterns[0] - loc = pf.loc - if isinstance(pattern, tuple): - if pattern[0] == 'prange': - if pattern[1] == 'internal': - replfn = '.'.join(reversed(list(pattern[2][0]))) - loc = pattern[2][1] - r_pattern = '%s %s' % (replfn, '(internal parallel version)') - elif pattern[1] == 'user': - r_pattern = "user defined prange" - elif pattern[1] == 'pndindex': - r_pattern = "internal pndindex" #FIXME: trace this! 
- else: - assert 0 - fmt = 'Parallel for-loop #%s: is produced from %s:\n %s\n \n' - if print_loop_search: - print_wrapped(fmt % (pf.id, loc, r_pattern)) - parfors_simple[pf.id] = (pf, loc, r_pattern) + parfors_simple = self.get_parfors_simple(print_loop_search) count = self.count_parfors() if print_loop_search: @@ -965,46 +1273,7 @@ def dump(self, level=1): path = os.path.abspath(filename) if print_source_listing: - func_name = self.func_ir.func_id.func - try: - lines = inspect.getsource(func_name).splitlines() - except OSError: # generated function - lines = None - if lines: - src_width = max([len(x) for x in lines]) - map_line_to_pf = defaultdict(list) # parfors can alias lines - for k, v in parfors_simple.items(): - # TODO: do a better job of tracking parfors that are not in - # this file but are referred to, e.g. np.arange() - if parfors_simple[k][1].filename == filename: - match_line = self.sort_pf_by_line(k, parfors_simple) - map_line_to_pf[match_line].append(str(k)) - - max_pf_per_line = max([1] + [len(x) for x in map_line_to_pf.values()]) - width = src_width + (1 + max_pf_per_line * (len(str(count)) + 2)) - newlines = [] - newlines.append('\n') - newlines.append('Parallel loop listing for %s' % purpose_str) - newlines.append(width * '-' + '|loop #ID') - fmt = '{0:{1}}| {2}' - # why are these off by 1? 
- lstart = max(0, self.func_ir.loc.line - 1) - for no, line in enumerate(lines, lstart): - pf_ids = map_line_to_pf.get(no, None) - if pf_ids is not None: - pfstr = '#' + ', '.join(pf_ids) - else: - pfstr = '' - stripped = line.strip('\n') - srclen = len(stripped) - if pf_ids: - l = fmt.format(width * '-', width, pfstr) - else: - l = fmt.format(width * ' ', width, pfstr) - newlines.append(stripped + l[srclen:]) - print('\n'.join(newlines)) - else: - print("No source available") + self.source_listing(parfors_simple, purpose_str) #---------- these are used a lot here on in sword = '+--' @@ -1075,198 +1344,16 @@ def print_g(adj, root, depth): print_wrapped("") #---------- compute various properties and orderings in the data for subsequent use - - # ensure adjacency lists are the same size for both sets of info - # (nests and fusion may not traverse the same space, for - # convenience [] is used as a condition to halt recursion) - fadj, froots = self.compute_graph_info(self.fusion_info) - nadj, _nroots = self.compute_graph_info(self.nested_fusion_info) - - if len(fadj) > len(nadj): - lim = len(fadj) - tmp = nadj - else: - lim = len(nadj) - tmp = fadj - for x in range(len(tmp), lim): - tmp.append([]) - - # This computes the roots of true loop nests (i.e. loops containing - # loops opposed to just a loop that's a root). 
- nroots = set() - if _nroots: - for r in _nroots: - if nadj[r] != []: - nroots.add(r) - all_roots = froots ^ nroots - - # This computes all the parfors at the top level that are either: - # - roots of loop fusion - # - roots of true loop nests - # it then combines these based on source line number for ease of - # producing output ordered in a manner similar to the code structure - froots_lines = {} - for x in froots: - line = self.sort_pf_by_line(x, parfors_simple) - froots_lines[line] = 'fuse', x, fadj - - nroots_lines = {} - for x in nroots: - line = self.sort_pf_by_line(x, parfors_simple) - nroots_lines[line] = 'nest', x, nadj - - all_lines = froots_lines.copy() - all_lines.update(nroots_lines) - - # nroots, froots, nadj and fadj are all set up correctly - # define some print functions - - def print_unoptimised(lines): - # This prints the unoptimised parfors state - - fac = len(sword) - - def print_nest(fadj_, nadj_, theroot, reported, region_id): - def print_g(fadj_, nadj_, nroot, depth): - print_wrapped(fac * depth * ' ' + '%s%s %s' % (sword, nroot, '(parallel)')) - for k in nadj_[nroot]: - if nadj_[k] == []: - msg = [] - msg.append(fac * (depth + 1) * ' ' + '%s%s %s' % (sword, k, '(parallel)')) - if fadj_[k] != [] and k not in reported: - fused = self.reachable_nodes(fadj_, k) - for i in fused: - msg.append(fac * (depth + 1) * ' ' + '%s%s %s' % (sword, i, '(parallel)')) - reported.append(k) - print_wrapped('\n'.join(msg)) - else: - print_g(fadj_, nadj_, k, depth + 1) - - if nadj_[theroot] != []: - print_wrapped("Parallel region %s:" % region_id) - print_g(fadj_, nadj_, theroot, 0) - print("\n") - region_id = region_id + 1 - return region_id - - def print_fuse(ty, pf_id, adj, depth, region_id): - msg = [] - print_wrapped("Parallel region %s:" % region_id) - msg.append(fac * depth * ' ' + '%s%s %s' % (sword, pf_id, '(parallel)')) - if adj[pf_id] != []: - fused = sorted(self.reachable_nodes(adj, pf_id)) - for k in fused: - msg.append(fac * depth * ' ' + '%s%s 
%s' % (sword, k, '(parallel)')) - region_id = region_id + 1 - print_wrapped('\n'.join(msg)) - print("\n") - return region_id - - # Walk the parfors by src line and print optimised structure - region_id = 0 - reported = [] - for line, info in sorted(lines.items()): - opt_ty, pf_id, adj = info - if opt_ty == 'fuse': - if pf_id not in reported: - region_id = print_fuse('f', pf_id, adj, 0, region_id) - elif opt_ty == 'nest': - region_id = print_nest(fadj, nadj, pf_id, reported, region_id) - else: - assert 0 - - def print_optimised(lines): - # This prints the optimised output based on the transforms that - # occurred during loop fusion and rewriting of loop nests - fac = len(sword) - - summary = dict() - # region : {fused, serialized} - - def print_nest(fadj_, nadj_, theroot, reported, region_id): - def print_g(fadj_, nadj_, nroot, depth): - for k in nadj_[nroot]: - msg = fac * depth * ' ' + '%s%s %s' % (sword, k, '(serial') - if nadj_[k] == []: - fused = [] - if fadj_[k] != [] and k not in reported: - fused = sorted(self.reachable_nodes(fadj_, k)) - msg += ", fused with loop(s): " - msg += ', '.join([str(x) for x in fused]) - msg += ')' - reported.append(k) - print_wrapped(msg) - summary[region_id]['fused'] += len(fused) - else: - print_wrapped(msg + ')') - print_g(fadj_, nadj_, k, depth + 1) - summary[region_id]['serialized'] += 1 - - if nadj_[theroot] != []: - print_wrapped("Parallel region %s:" % region_id) - print_wrapped('%s%s %s' % (sword, theroot, '(parallel)')) - summary[region_id] = {'root': theroot, 'fused': 0, 'serialized': 0} - print_g(fadj_, nadj_, theroot, 1) - print("\n") - region_id = region_id + 1 - return region_id - - def print_fuse(ty, pf_id, adj, depth, region_id): - print_wrapped("Parallel region %s:" % region_id) - msg = fac * depth * ' ' + '%s%s %s' % (sword, pf_id, '(parallel') - fused = [] - if adj[pf_id] != []: - fused = sorted(self.reachable_nodes(adj, pf_id)) - msg += ", fused with loop(s): " - msg += ', '.join([str(x) for x in fused]) - - 
summary[region_id] = {'root': pf_id, 'fused': len(fused), 'serialized': 0} - msg += ')' - print_wrapped(msg) - print("\n") - region_id = region_id + 1 - return region_id - - # Walk the parfors by src line and print optimised structure - region_id = 0 - reported = [] - for line, info in sorted(lines.items()): - opt_ty, pf_id, adj = info - if opt_ty == 'fuse': - if pf_id not in reported: - region_id = print_fuse('f', pf_id, adj, 0, region_id) - elif opt_ty == 'nest': - region_id = print_nest(fadj, nadj, pf_id, reported, region_id) - else: - assert 0 - - # print the summary of the fuse/serialize rewrite - if summary: - for k, v in sorted(summary.items()): - msg = ('\n \nParallel region %s (loop #%s) had %s ' - 'loop(s) fused') - root = v['root'] - fused = v['fused'] - serialized = v['serialized'] - if serialized != 0: - msg += (' and %s loop(s) ' - 'serialized as part of the larger ' - 'parallel loop (#%s).') - print_wrapped(msg % (k, root, fused, serialized, root)) - else: - msg += '.' - print_wrapped(msg % (k, root, fused)) - else: - print_wrapped("Parallel structure is already optimal.") + all_lines = self.get_all_lines(parfors_simple) if print_pre_optimised: print(' Before Optimisation '.center(_termwidth,'-')) - print_unoptimised(all_lines) + self.print_unoptimised(all_lines) print(_termwidth * '-') if print_post_optimised: print(' After Optimisation '.center(_termwidth,'-')) - print_optimised(all_lines) + self.print_optimised(all_lines) print(_termwidth * '-') print_wrapped("") print_wrapped(_termwidth * '-') @@ -1277,60 +1364,10 @@ def print_fuse(ty, pf_id, adj, depth, region_id): print_wrapped('Loop invariant code motion'.center(80, '-')) if print_allocation_hoist: - found = False - print('Allocation hoisting:') - for pf_id, data in self.hoist_info.items(): - stmt = data.get('hoisted', []) - for inst in stmt: - if isinstance(inst.value, ir.Expr): - try: - attr = inst.value.attr - if attr == 'empty': - msg = ("The memory allocation derived from the " - 
"instruction at %s is hoisted out of the " - "parallel loop labelled #%s (it will be " - "performed before the loop is executed and " - "reused inside the loop):") - loc = inst.loc - print_wrapped(msg % (loc, pf_id)) - try: - path = os.path.relpath(loc.filename) - except ValueError: - path = os.path.abspath(loc.filename) - lines = linecache.getlines(path) - if lines and loc.line: - print_wrapped(" Allocation:: " + lines[0 if loc.line < 2 else loc.line - 1].strip()) - print_wrapped(" - numpy.empty() is used for the allocation.\n") - found = True - except (KeyError, AttributeError): - pass - if not found: - print_wrapped('No allocation hoisting found') - if print_instruction_hoist: - print("") - print('Instruction hoisting:') - hoist_info_printed = False - if self.hoist_info: - for pf_id, data in self.hoist_info.items(): - hoisted = data.get('hoisted', None) - not_hoisted = data.get('not_hoisted', None) - if not hoisted and not not_hoisted: - print("loop #%s has nothing to hoist." % pf_id) - continue - - print("loop #%s:" % pf_id) - if hoisted: - print(" Has the following hoisted:") - [print(" %s" % y) for y in hoisted] - hoist_info_printed = True - if not_hoisted: - print(" Failed to hoist the following:") - [print(" %s: %s" % (y, x)) for x, y in not_hoisted] - hoist_info_printed = True - if not hoist_info_printed: - print_wrapped('No instruction hoisting found') - print_wrapped(80 * '-') + self.allocation_hoist() + if print_instruction_hoist: + self.instruction_hoist() else: # there are no parfors print_wrapped('Function %s, %s, has no parallel for-loops.'.format(name, line)) From cd2896c64c9e81c28225953cc0c2901875b478b3 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Wed, 9 Dec 2020 13:06:39 +0300 Subject: [PATCH 16/19] Initial support for Numpy subclasses (numba#6148) Co-authored-by: Todd A. 
Anderson --- numba/_typeof.c | 13 ++++- numba/core/extending.py | 2 +- numba/core/ir_utils.py | 12 ++-- numba/core/pythonapi.py | 9 ++- numba/core/runtime/_nrt_python.c | 36 +++++++++++- numba/core/runtime/_nrt_pythonmod.c | 1 + numba/core/runtime/nrt.c | 90 +++++++++++++++++++++++------ numba/core/runtime/nrt.h | 24 +++++++- numba/core/runtime/nrt_external.h | 16 +++++ numba/core/types/npytypes.py | 4 +- numba/core/typing/npydecl.py | 21 +++++-- numba/np/arrayobj.py | 11 +++- 12 files changed, 202 insertions(+), 37 deletions(-) diff --git a/numba/_typeof.c b/numba/_typeof.c index ffe0e3a3c58..9b259164800 100644 --- a/numba/_typeof.c +++ b/numba/_typeof.c @@ -768,6 +768,7 @@ int typeof_typecode(PyObject *dispatcher, PyObject *val) { PyTypeObject *tyobj = Py_TYPE(val); + int no_subtype_attr; /* This needs to be kept in sync with Dispatcher.typeof_pyval(), * otherwise funny things may happen. */ @@ -794,9 +795,19 @@ typeof_typecode(PyObject *dispatcher, PyObject *val) return typecode_arrayscalar(dispatcher, val); } /* Array handling */ - else if (PyType_IsSubtype(tyobj, &PyArray_Type)) { + else if (tyobj == &PyArray_Type) { return typecode_ndarray(dispatcher, (PyArrayObject*)val); } + /* Subtypes of Array handling */ + else if (PyType_IsSubtype(tyobj, &PyArray_Type)) { + /* If the class has an attribute named __numba_no_subtype_ndarray then + don't treat it as a normal variant of a Numpy ndarray but as its own + separate type. 
*/ + no_subtype_attr = PyObject_HasAttrString(val, "__numba_no_subtype_ndarray__"); + if (!no_subtype_attr) { + return typecode_ndarray(dispatcher, (PyArrayObject*)val); + } + } return typecode_using_fingerprint(dispatcher, val); } diff --git a/numba/core/extending.py b/numba/core/extending.py index 8d8d8525e21..09373708b48 100644 --- a/numba/core/extending.py +++ b/numba/core/extending.py @@ -14,7 +14,7 @@ lower_setattr, lower_setattr_generic, lower_cast) # noqa: F401 from numba.core.datamodel import models # noqa: F401 from numba.core.datamodel import register_default as register_model # noqa: F401, E501 -from numba.core.pythonapi import box, unbox, reflect, NativeValue # noqa: F401 +from numba.core.pythonapi import box, unbox, reflect, NativeValue, allocator # noqa: F401 from numba._helperlib import _import_cython_function # noqa: F401 from numba.core.serialize import ReduceMixin diff --git a/numba/core/ir_utils.py b/numba/core/ir_utils.py index 1d58c5c8b5b..9ffdfb16b07 100644 --- a/numba/core/ir_utils.py +++ b/numba/core/ir_utils.py @@ -64,6 +64,8 @@ def mk_alloc(typemap, calltypes, lhs, size_var, dtype, scope, loc): out = [] ndims = 1 size_typ = types.intp + # Get the type of the array being allocated. + arr_typ = typemap[lhs.name] if isinstance(size_var, tuple): if len(size_var) == 1: size_var = size_var[0] @@ -108,11 +110,13 @@ def mk_alloc(typemap, calltypes, lhs, size_var, dtype, scope, loc): typ_var_assign = ir.Assign(np_typ_getattr, typ_var, loc) alloc_call = ir.Expr.call(attr_var, [size_var, typ_var], (), loc) if calltypes: - calltypes[alloc_call] = typemap[attr_var.name].get_call_type( + cac = typemap[attr_var.name].get_call_type( typing.Context(), [size_typ, types.functions.NumberClass(dtype)], {}) - # signature( - # types.npytypes.Array(dtype, ndims, 'C'), size_typ, - # types.functions.NumberClass(dtype)) + # By default, all calls to "empty" are typed as returning a standard + # Numpy ndarray. 
If we are allocating a ndarray subclass here then + # just change the return type to be that of the subclass. + cac._return_type = arr_typ + calltypes[alloc_call] = cac alloc_assign = ir.Assign(alloc_call, lhs, loc) out.extend([g_np_assign, attr_assign, typ_var_assign, alloc_assign]) diff --git a/numba/core/pythonapi.py b/numba/core/pythonapi.py index 7901e761d9f..f84ad7b2ce1 100644 --- a/numba/core/pythonapi.py +++ b/numba/core/pythonapi.py @@ -45,10 +45,13 @@ def lookup(self, typeclass, default=None): _boxers = _Registry() _unboxers = _Registry() _reflectors = _Registry() +# Registry of special allocators for types. +_allocators = _Registry() box = _boxers.register unbox = _unboxers.register reflect = _reflectors.register +allocator = _allocators.register class _BoxContext(namedtuple("_BoxContext", ("context", "builder", "pyapi", "env_manager"))): @@ -1186,8 +1189,11 @@ def nrt_adapt_ndarray_to_python(self, aryty, ary, dtypeptr): assert self.context.enable_nrt, "NRT required" intty = ir.IntType(32) + # Embed the Python type of the array (maybe subclass) in the LLVM. 
+ serial_aryty_pytype = self.unserialize(self.serialize_object(aryty.py_type)) + fnty = Type.function(self.pyobj, - [self.voidptr, intty, intty, self.pyobj]) + [self.voidptr, self.pyobj, intty, intty, self.pyobj]) fn = self._get_function(fnty, name="NRT_adapt_ndarray_to_python") fn.args[0].add_attribute(lc.ATTR_NO_CAPTURE) @@ -1197,6 +1203,7 @@ def nrt_adapt_ndarray_to_python(self, aryty, ary, dtypeptr): aryptr = cgutils.alloca_once_value(self.builder, ary) return self.builder.call(fn, [self.builder.bitcast(aryptr, self.voidptr), + serial_aryty_pytype, ndim, writable, dtypeptr]) def nrt_meminfo_new_from_pyobject(self, data, pyobj): diff --git a/numba/core/runtime/_nrt_python.c b/numba/core/runtime/_nrt_python.c index 33620fd4f1a..9012498292a 100644 --- a/numba/core/runtime/_nrt_python.c +++ b/numba/core/runtime/_nrt_python.c @@ -55,6 +55,8 @@ int MemInfo_init(MemInfoObject *self, PyObject *args, PyObject *kwds) { return -1; } raw_ptr = PyLong_AsVoidPtr(raw_ptr_obj); + NRT_Debug(nrt_debug_print("MemInfo_init self=%p raw_ptr=%p\n", self, raw_ptr)); + if(PyErr_Occurred()) return -1; self->meminfo = (NRT_MemInfo *)raw_ptr; assert (NRT_MemInfo_refcount(self->meminfo) > 0 && "0 refcount"); @@ -109,6 +111,26 @@ MemInfo_get_refcount(MemInfoObject *self, void *closure) { return PyLong_FromSize_t(refct); } +static +PyObject* +MemInfo_get_external_allocator(MemInfoObject *self, void *closure) { + void *p = NRT_MemInfo_external_allocator(self->meminfo); + return PyLong_FromVoidPtr(p); +} + +static +PyObject* +MemInfo_get_parent(MemInfoObject *self, void *closure) { + void *p = NRT_MemInfo_parent(self->meminfo); + if (p) { + Py_INCREF(p); + return (PyObject*)p; + } else { + Py_INCREF(Py_None); + return Py_None; + } +} + static void MemInfo_dealloc(MemInfoObject *self) { @@ -136,6 +158,13 @@ static PyGetSetDef MemInfo_getsets[] = { (getter)MemInfo_get_refcount, NULL, "Get the refcount", NULL}, + {"external_allocator", + (getter)MemInfo_get_external_allocator, NULL, + "Get the 
external allocator", + NULL}, + {"parent", + (getter)MemInfo_get_parent, NULL, + NULL}, {NULL} /* Sentinel */ }; @@ -286,7 +315,7 @@ PyObject* try_to_return_parent(arystruct_t *arystruct, int ndim, } NUMBA_EXPORT_FUNC(PyObject *) -NRT_adapt_ndarray_to_python(arystruct_t* arystruct, int ndim, +NRT_adapt_ndarray_to_python(arystruct_t* arystruct, PyTypeObject *retty, int ndim, int writeable, PyArray_Descr *descr) { PyArrayObject *array; @@ -324,10 +353,13 @@ NRT_adapt_ndarray_to_python(arystruct_t* arystruct, int ndim, args = PyTuple_New(1); /* SETITEM steals reference */ PyTuple_SET_ITEM(args, 0, PyLong_FromVoidPtr(arystruct->meminfo)); + NRT_Debug(nrt_debug_print("NRT_adapt_ndarray_to_python arystruct->meminfo=%p\n", arystruct->meminfo)); /* Note: MemInfo_init() does not incref. This function steals the * NRT reference. */ + NRT_Debug(nrt_debug_print("NRT_adapt_ndarray_to_python created MemInfo=%p\n", miobj)); if (MemInfo_init(miobj, args, NULL)) { + NRT_Debug(nrt_debug_print("MemInfo_init returned 0.\n")); return NULL; } Py_DECREF(args); @@ -336,7 +368,7 @@ NRT_adapt_ndarray_to_python(arystruct_t* arystruct, int ndim, shape = arystruct->shape_and_strides; strides = shape + ndim; Py_INCREF((PyObject *) descr); - array = (PyArrayObject *) PyArray_NewFromDescr(&PyArray_Type, descr, ndim, + array = (PyArrayObject *) PyArray_NewFromDescr(retty, descr, ndim, shape, strides, arystruct->data, flags, (PyObject *) miobj); diff --git a/numba/core/runtime/_nrt_pythonmod.c b/numba/core/runtime/_nrt_pythonmod.c index 31e1155fd9f..d1300ee8e9a 100644 --- a/numba/core/runtime/_nrt_pythonmod.c +++ b/numba/core/runtime/_nrt_pythonmod.c @@ -163,6 +163,7 @@ declmethod(MemInfo_alloc); declmethod(MemInfo_alloc_safe); declmethod(MemInfo_alloc_aligned); declmethod(MemInfo_alloc_safe_aligned); +declmethod(MemInfo_alloc_safe_aligned_external); declmethod(MemInfo_alloc_dtor_safe); declmethod(MemInfo_call_dtor); declmethod(MemInfo_new_varsize); diff --git a/numba/core/runtime/nrt.c 
b/numba/core/runtime/nrt.c index 534681d5417..fe63a691537 100644 --- a/numba/core/runtime/nrt.c +++ b/numba/core/runtime/nrt.c @@ -19,6 +19,7 @@ struct MemInfo { void *dtor_info; void *data; size_t size; /* only used for NRT allocated memory */ + NRT_ExternalAllocator *external_allocator; }; @@ -170,13 +171,16 @@ void NRT_MemSys_set_atomic_cas_stub(void) { */ void NRT_MemInfo_init(NRT_MemInfo *mi,void *data, size_t size, - NRT_dtor_function dtor, void *dtor_info) + NRT_dtor_function dtor, void *dtor_info, + NRT_ExternalAllocator *external_allocator) { mi->refct = 1; /* starts with 1 refct */ mi->dtor = dtor; mi->dtor_info = dtor_info; mi->data = data; mi->size = size; + mi->external_allocator = external_allocator; + NRT_Debug(nrt_debug_print("NRT_MemInfo_init mi=%p external_allocator=%p\n", mi, external_allocator)); /* Update stats */ TheMSys.atomic_inc(&TheMSys.stats_mi_alloc); } @@ -185,7 +189,8 @@ NRT_MemInfo *NRT_MemInfo_new(void *data, size_t size, NRT_dtor_function dtor, void *dtor_info) { NRT_MemInfo *mi = NRT_Allocate(sizeof(NRT_MemInfo)); - NRT_MemInfo_init(mi, data, size, dtor, dtor_info); + NRT_Debug(nrt_debug_print("NRT_MemInfo_new mi=%p\n", mi)); + NRT_MemInfo_init(mi, data, size, dtor, dtor_info, NULL); return mi; } @@ -206,9 +211,10 @@ void nrt_internal_dtor_safe(void *ptr, size_t size, void *info) { } static -void *nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo **mi_out) { +void *nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo **mi_out, NRT_ExternalAllocator *allocator) { NRT_MemInfo *mi; - char *base = NRT_Allocate(sizeof(NRT_MemInfo) + size); + NRT_Debug(nrt_debug_print("nrt_allocate_meminfo_and_data %p\n", allocator)); + char *base = NRT_Allocate_External(sizeof(NRT_MemInfo) + size, allocator); mi = (NRT_MemInfo *) base; *mi_out = mi; return base + sizeof(NRT_MemInfo); @@ -230,9 +236,17 @@ void nrt_internal_custom_dtor_safe(void *ptr, size_t size, void *info) { NRT_MemInfo *NRT_MemInfo_alloc(size_t size) { NRT_MemInfo *mi; - void 
*data = nrt_allocate_meminfo_and_data(size, &mi); + void *data = nrt_allocate_meminfo_and_data(size, &mi, NULL); NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc %p\n", data)); - NRT_MemInfo_init(mi, data, size, NULL, NULL); + NRT_MemInfo_init(mi, data, size, NULL, NULL, NULL); + return mi; +} + +NRT_MemInfo *NRT_MemInfo_alloc_external(size_t size, NRT_ExternalAllocator *allocator) { + NRT_MemInfo *mi; + void *data = nrt_allocate_meminfo_and_data(size, &mi, allocator); + NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc %p\n", data)); + NRT_MemInfo_init(mi, data, size, NULL, NULL, allocator); return mi; } @@ -242,22 +256,23 @@ NRT_MemInfo *NRT_MemInfo_alloc_safe(size_t size) { NRT_MemInfo* NRT_MemInfo_alloc_dtor_safe(size_t size, NRT_dtor_function dtor) { NRT_MemInfo *mi; - void *data = nrt_allocate_meminfo_and_data(size, &mi); + void *data = nrt_allocate_meminfo_and_data(size, &mi, NULL); /* Only fill up a couple cachelines with debug markers, to minimize overhead. */ memset(data, 0xCB, MIN(size, 256)); NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc_dtor_safe %p %zu\n", data, size)); - NRT_MemInfo_init(mi, data, size, nrt_internal_custom_dtor_safe, dtor); + NRT_MemInfo_init(mi, data, size, nrt_internal_custom_dtor_safe, dtor, NULL); return mi; } static void *nrt_allocate_meminfo_and_data_align(size_t size, unsigned align, - NRT_MemInfo **mi) + NRT_MemInfo **mi, NRT_ExternalAllocator *allocator) { size_t offset, intptr, remainder; - char *base = nrt_allocate_meminfo_and_data(size + 2 * align, mi); + NRT_Debug(nrt_debug_print("nrt_allocate_meminfo_and_data_align %p\n", allocator)); + char *base = nrt_allocate_meminfo_and_data(size + 2 * align, mi, allocator); intptr = (size_t) base; /* See if we are aligned */ remainder = intptr % align; @@ -271,26 +286,48 @@ void *nrt_allocate_meminfo_and_data_align(size_t size, unsigned align, NRT_MemInfo *NRT_MemInfo_alloc_aligned(size_t size, unsigned align) { NRT_MemInfo *mi; - void *data = nrt_allocate_meminfo_and_data_align(size, align, 
&mi); + void *data = nrt_allocate_meminfo_and_data_align(size, align, &mi, NULL); NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc_aligned %p\n", data)); - NRT_MemInfo_init(mi, data, size, NULL, NULL); + NRT_MemInfo_init(mi, data, size, NULL, NULL, NULL); return mi; } NRT_MemInfo *NRT_MemInfo_alloc_safe_aligned(size_t size, unsigned align) { NRT_MemInfo *mi; - void *data = nrt_allocate_meminfo_and_data_align(size, align, &mi); + void *data = nrt_allocate_meminfo_and_data_align(size, align, &mi, NULL); /* Only fill up a couple cachelines with debug markers, to minimize overhead. */ memset(data, 0xCB, MIN(size, 256)); NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc_safe_aligned %p %zu\n", data, size)); - NRT_MemInfo_init(mi, data, size, nrt_internal_dtor_safe, (void*)size); + NRT_MemInfo_init(mi, data, size, nrt_internal_dtor_safe, (void*)size, NULL); return mi; } +NRT_MemInfo *NRT_MemInfo_alloc_safe_aligned_external(size_t size, unsigned align, NRT_ExternalAllocator *allocator) { + NRT_MemInfo *mi; + NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc_safe_aligned_external %p\n", allocator)); + void *data = nrt_allocate_meminfo_and_data_align(size, align, &mi, allocator); + /* Only fill up a couple cachelines with debug markers, to minimize + overhead. 
*/ + memset(data, 0xCB, MIN(size, 256)); + NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc_safe_aligned %p %zu\n", + data, size)); + NRT_MemInfo_init(mi, data, size, nrt_internal_dtor_safe, (void*)size, allocator); + return mi; +} + +void NRT_dealloc(NRT_MemInfo *mi) { + NRT_Debug(nrt_debug_print("NRT_dealloc meminfo: %p external_allocator: %p\n", mi, mi->external_allocator)); + if (mi->external_allocator) { + mi->external_allocator->free(mi, mi->external_allocator->opaque_data); + } else { + NRT_Free(mi); + } +} + void NRT_MemInfo_destroy(NRT_MemInfo *mi) { - NRT_Free(mi); + NRT_dealloc(mi); TheMSys.atomic_inc(&TheMSys.stats_mi_free); } @@ -328,6 +365,14 @@ size_t NRT_MemInfo_size(NRT_MemInfo* mi) { return mi->size; } +void * NRT_MemInfo_external_allocator(NRT_MemInfo *mi) { + NRT_Debug(nrt_debug_print("NRT_MemInfo_external_allocator meminfo: %p external_allocator: %p\n", mi, mi->external_allocator)); + return mi->external_allocator; +} + +void *NRT_MemInfo_parent(NRT_MemInfo *mi) { + return mi->dtor_info; +} void NRT_MemInfo_dump(NRT_MemInfo *mi, FILE *out) { fprintf(out, "MemInfo %p refcount %zu\n", mi, mi->refct); @@ -414,8 +459,18 @@ void NRT_MemInfo_varsize_free(NRT_MemInfo *mi, void *ptr) */ void* NRT_Allocate(size_t size) { - void *ptr = TheMSys.allocator.malloc(size); - NRT_Debug(nrt_debug_print("NRT_Allocate bytes=%zu ptr=%p\n", size, ptr)); + return NRT_Allocate_External(size, NULL); +} + +void* NRT_Allocate_External(size_t size, NRT_ExternalAllocator *allocator) { + void *ptr; + if (allocator) { + ptr = allocator->malloc(size, allocator->opaque_data); + NRT_Debug(nrt_debug_print("NRT_Allocate custom bytes=%zu ptr=%p\n", size, ptr)); + } else { + ptr = TheMSys.allocator.malloc(size); + NRT_Debug(nrt_debug_print("NRT_Allocate bytes=%zu ptr=%p\n", size, ptr)); + } TheMSys.atomic_inc(&TheMSys.stats_alloc); return ptr; } @@ -460,6 +515,7 @@ NRT_MemInfo* nrt_manage_memory(void *data, NRT_managed_dtor dtor) { static const NRT_api_functions nrt_functions_table = { 
NRT_MemInfo_alloc, + NRT_MemInfo_alloc_external, nrt_manage_memory, NRT_MemInfo_acquire, NRT_MemInfo_release, diff --git a/numba/core/runtime/nrt.h b/numba/core/runtime/nrt.h index 3c74dc58f58..9fb23532964 100644 --- a/numba/core/runtime/nrt.h +++ b/numba/core/runtime/nrt.h @@ -15,13 +15,14 @@ All functions described here are threadsafe. /* Debugging facilities - enabled at compile-time */ /* #undef NDEBUG */ #if 0 -# define NRT_Debug(X) X +# define NRT_Debug(X) {X; fflush(stdout); } #else # define NRT_Debug(X) if (0) { X; } #endif /* TypeDefs */ typedef void (*NRT_dtor_function)(void *ptr, size_t size, void *info); +typedef void (*NRT_dealloc_func)(void *ptr, void *dealloc_info); typedef size_t (*NRT_atomic_inc_dec_func)(size_t *ptr); typedef int (*NRT_atomic_cas_func)(void * volatile *ptr, void *cmp, void *repl, void **oldptr); @@ -32,7 +33,6 @@ typedef void *(*NRT_malloc_func)(size_t size); typedef void *(*NRT_realloc_func)(void *ptr, size_t new_size); typedef void (*NRT_free_func)(void *ptr); - /* Memory System API */ /* Initialize the memory system */ @@ -101,7 +101,8 @@ NRT_MemInfo* NRT_MemInfo_new(void *data, size_t size, VISIBILITY_HIDDEN void NRT_MemInfo_init(NRT_MemInfo *mi, void *data, size_t size, - NRT_dtor_function dtor, void *dtor_info); + NRT_dtor_function dtor, void *dtor_info, + NRT_ExternalAllocator *external_allocator); /* * Returns the refcount of a MemInfo or (size_t)-1 if error. @@ -116,6 +117,8 @@ size_t NRT_MemInfo_refcount(NRT_MemInfo *mi); VISIBILITY_HIDDEN NRT_MemInfo *NRT_MemInfo_alloc(size_t size); +NRT_MemInfo *NRT_MemInfo_alloc_external(size_t size, NRT_ExternalAllocator *allocator); + /* * The "safe" NRT_MemInfo_alloc performs additional steps to help debug * memory errors. 
@@ -141,6 +144,8 @@ NRT_MemInfo *NRT_MemInfo_alloc_aligned(size_t size, unsigned align); VISIBILITY_HIDDEN NRT_MemInfo *NRT_MemInfo_alloc_safe_aligned(size_t size, unsigned align); +NRT_MemInfo *NRT_MemInfo_alloc_safe_aligned_external(size_t size, unsigned align, NRT_ExternalAllocator *allocator); + /* * Internal API. * Release a MemInfo. Calls NRT_MemSys_insert_meminfo. @@ -179,6 +184,18 @@ void* NRT_MemInfo_data(NRT_MemInfo* mi); VISIBILITY_HIDDEN size_t NRT_MemInfo_size(NRT_MemInfo* mi); +/* + * Returns the external allocator + */ +VISIBILITY_HIDDEN +void* NRT_MemInfo_external_allocator(NRT_MemInfo* mi); + +/* + * Returns the parent MemInfo + */ +VISIBILITY_HIDDEN +void* NRT_MemInfo_parent(NRT_MemInfo* mi); + /* * NRT API for resizable buffers. @@ -207,6 +224,7 @@ void NRT_MemInfo_dump(NRT_MemInfo *mi, FILE *out); * Allocate memory of `size` bytes. */ VISIBILITY_HIDDEN void* NRT_Allocate(size_t size); +VISIBILITY_HIDDEN void* NRT_Allocate_External(size_t size, NRT_ExternalAllocator *allocator); /* * Deallocate memory pointed by `ptr`. diff --git a/numba/core/runtime/nrt_external.h b/numba/core/runtime/nrt_external.h index 391b6fa1b0e..a4835c36f67 100644 --- a/numba/core/runtime/nrt_external.h +++ b/numba/core/runtime/nrt_external.h @@ -7,6 +7,18 @@ typedef struct MemInfo NRT_MemInfo; typedef void NRT_managed_dtor(void *data); +typedef void *(*NRT_external_malloc_func)(size_t size, void *opaque_data); +typedef void *(*NRT_external_realloc_func)(void *ptr, size_t new_size, void *opaque_data); +typedef void (*NRT_external_free_func)(void *ptr, void *opaque_data); + +struct ExternalMemAllocator { + NRT_external_malloc_func malloc; + NRT_external_realloc_func realloc; + NRT_external_free_func free; + void *opaque_data; +}; + +typedef struct ExternalMemAllocator NRT_ExternalAllocator; typedef struct { /* Methods to create MemInfos. @@ -21,6 +33,10 @@ typedef struct { Returning a new reference. 
*/ NRT_MemInfo* (*allocate)(size_t nbytes); + /* Allocate memory using an external allocator but still using Numba's MemInfo. + + */ + NRT_MemInfo* (*allocate_external)(size_t nbytes, NRT_ExternalAllocator *allocator); /* Convert externally allocated memory into a MemInfo. diff --git a/numba/core/types/npytypes.py b/numba/core/types/npytypes.py index 6f6307c5526..3c2191ca23e 100644 --- a/numba/core/types/npytypes.py +++ b/numba/core/types/npytypes.py @@ -8,6 +8,7 @@ from numba.core import utils from .misc import UnicodeType from .containers import Bytes +import numpy as np class CharSeq(Type): """ @@ -394,8 +395,9 @@ class Array(Buffer): Type class for Numpy arrays. """ - def __init__(self, dtype, ndim, layout, readonly=False, name=None, + def __init__(self, dtype, ndim, layout, py_type=np.ndarray, readonly=False, name=None, aligned=True, addrspace=None): + self.py_type = py_type if readonly: self.mutable = False if (not aligned or diff --git a/numba/core/typing/npydecl.py b/numba/core/typing/npydecl.py index 2dbbed39be9..e7ecf452fe9 100644 --- a/numba/core/typing/npydecl.py +++ b/numba/core/typing/npydecl.py @@ -126,7 +126,21 @@ def generic(self, args, kws): ret_tys = ufunc_loop.outputs[-implicit_output_count:] if ndims > 0: assert layout is not None - ret_tys = [types.Array(dtype=ret_ty, ndim=ndims, layout=layout) + # If either of the types involved in the ufunc operation has a + # __array_ufunc__ method then invoke the first such one to + # determine the output type of the ufunc. + array_ufunc_type = None + for a in args: + if hasattr(a, "__array_ufunc__"): + array_ufunc_type = a + break + output_type = types.Array + if array_ufunc_type is not None: + output_type = array_ufunc_type.__array_ufunc__(ufunc, "__call__", *args, **kws) + # Eventually better error handling! FIX ME! 
+ assert(output_type is not None) + + ret_tys = [output_type(dtype=ret_ty, ndim=ndims, layout=layout) for ret_ty in ret_tys] ret_tys = [resolve_output_type(self.context, args, ret_ty) for ret_ty in ret_tys] @@ -517,6 +531,7 @@ def typer(shape, dtype=None): @infer_global(np.empty_like) @infer_global(np.zeros_like) +@infer_global(np.ones_like) class NdConstructorLike(CallableTemplate): """ Typing template for np.empty_like(), .zeros_like(), .ones_like(). @@ -544,9 +559,6 @@ def typer(arg, dtype=None): return typer -infer_global(np.ones_like)(NdConstructorLike) - - @infer_global(np.full) class NdFull(CallableTemplate): @@ -563,6 +575,7 @@ def typer(shape, fill_value, dtype=None): return typer + @infer_global(np.full_like) class NdFullLike(CallableTemplate): diff --git a/numba/np/arrayobj.py b/numba/np/arrayobj.py index 933b1c6565e..5749e7d9b5b 100644 --- a/numba/np/arrayobj.py +++ b/numba/np/arrayobj.py @@ -32,7 +32,7 @@ from numba.misc import quicksort, mergesort from numba.cpython import slicing from numba.cpython.unsafe.tuple import tuple_setitem - +from numba.core.pythonapi import _allocators def set_range_metadata(builder, load, lower_bound, upper_bound): """ @@ -3399,8 +3399,13 @@ def _empty_nd_impl(context, builder, arrtype, shapes): ) align = context.get_preferred_array_alignment(arrtype.dtype) - meminfo = context.nrt.meminfo_alloc_aligned(builder, size=allocsize, - align=align) + def alloc_unsupported(context, builder, size, align): + return context.nrt.meminfo_alloc_aligned(builder, size, align) + + # See if the type has a special allocator, if not use the default + # alloc_unsupported allocator above. 
+ allocator_impl = _allocators.lookup(arrtype.__class__, alloc_unsupported) + meminfo = allocator_impl(context, builder, size=allocsize, align=align) data = context.nrt.meminfo_data(builder, meminfo) From d07f9af0ba8ad131f5283171c49b2ca00cdf7b68 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Wed, 18 Nov 2020 04:52:29 -0600 Subject: [PATCH 17/19] Patch for with context (#96) This modifications make jit() decorator use TargetDispatcher from dppl. Changes made in #57 by @AlexanderKalistratov and @1e-to. --- numba/core/decorators.py | 58 ++++++++++++++++++++++---------- numba/core/dispatcher.py | 12 ++++++- numba/core/registry.py | 6 ++++ numba/tests/test_dispatcher.py | 2 ++ numba/tests/test_nrt.py | 2 ++ numba/tests/test_record_dtype.py | 4 +-- numba/tests/test_serialize.py | 6 ++-- 7 files changed, 67 insertions(+), 23 deletions(-) diff --git a/numba/core/decorators.py b/numba/core/decorators.py index cfe91168969..3ecc188a9a7 100644 --- a/numba/core/decorators.py +++ b/numba/core/decorators.py @@ -149,7 +149,7 @@ def bar(x, y): target = options.pop('target') warnings.warn("The 'target' keyword argument is deprecated.", NumbaDeprecationWarning) else: - target = options.pop('_target', 'cpu') + target = options.pop('_target', None) options['boundscheck'] = boundscheck @@ -183,27 +183,16 @@ def bar(x, y): def _jit(sigs, locals, target, cache, targetoptions, **dispatcher_args): - dispatcher = registry.dispatcher_registry[target] - - def wrapper(func): - if extending.is_jitted(func): - raise TypeError( - "A jit decorator was called on an already jitted function " - f"{func}. If trying to access the original python " - f"function, use the {func}.py_func attribute." - ) - - if not inspect.isfunction(func): - raise TypeError( - "The decorated object is not a function (got type " - f"{type(func)})." 
- ) + def wrapper(func, dispatcher): if config.ENABLE_CUDASIM and target == 'cuda': from numba import cuda return cuda.jit(func) if config.DISABLE_JIT and not target == 'npyufunc': return func + if target == 'dppl': + from . import dppl + return dppl.jit(func) disp = dispatcher(py_func=func, locals=locals, targetoptions=targetoptions, **dispatcher_args) @@ -219,7 +208,42 @@ def wrapper(func): disp.disable_compile() return disp - return wrapper + def __wrapper(func): + if extending.is_jitted(func): + raise TypeError( + "A jit decorator was called on an already jitted function " + f"{func}. If trying to access the original python " + f"function, use the {func}.py_func attribute." + ) + + if not inspect.isfunction(func): + raise TypeError( + "The decorated object is not a function (got type " + f"{type(func)})." + ) + + is_numba_dppy_present = False + try: + import numba_dppy.config as dppy_config + + is_numba_dppy_present = dppy_config.dppy_present + except ImportError: + pass + + if (not is_numba_dppy_present + or target == 'npyufunc' or targetoptions.get('no_cpython_wrapper') + or sigs or config.DISABLE_JIT or not targetoptions.get('nopython')): + target_ = target + if target_ is None: + target_ = 'cpu' + disp = registry.dispatcher_registry[target_] + return wrapper(func, disp) + + from numba_dppy.target_dispatcher import TargetDispatcher + disp = TargetDispatcher(func, wrapper, target, targetoptions.get('parallel')) + return disp + + return __wrapper def generated_jit(function=None, target='cpu', cache=False, diff --git a/numba/core/dispatcher.py b/numba/core/dispatcher.py index 18d9426cd4d..42418fe5783 100644 --- a/numba/core/dispatcher.py +++ b/numba/core/dispatcher.py @@ -673,7 +673,14 @@ def _set_uuid(self, u): self._recent.append(self) -class Dispatcher(serialize.ReduceMixin, _MemoMixin, _DispatcherBase): +import abc + +class DispatcherMeta(abc.ABCMeta): + def __instancecheck__(self, other): + return type(type(other)) == DispatcherMeta + + +class 
Dispatcher(serialize.ReduceMixin, _MemoMixin, _DispatcherBase, metaclass=DispatcherMeta): """ Implementation of user-facing dispatcher objects (i.e. created using the @jit decorator). @@ -899,6 +906,9 @@ def get_function_type(self): cres = tuple(self.overloads.values())[0] return types.FunctionType(cres.signature) + def get_compiled(self): + return self + class LiftedCode(serialize.ReduceMixin, _MemoMixin, _DispatcherBase): """ diff --git a/numba/core/registry.py b/numba/core/registry.py index 2bd47ebe879..01e492f91f4 100644 --- a/numba/core/registry.py +++ b/numba/core/registry.py @@ -2,6 +2,7 @@ from numba.core.descriptors import TargetDescriptor from numba.core import utils, typing, dispatcher, cpu +from numba.core.compiler_lock import global_compiler_lock # ----------------------------------------------------------------------------- # Default CPU target descriptors @@ -26,16 +27,19 @@ class CPUTarget(TargetDescriptor): _nested = _NestedContext() @utils.cached_property + @global_compiler_lock def _toplevel_target_context(self): # Lazily-initialized top-level target context, for all threads return cpu.CPUContext(self.typing_context) @utils.cached_property + @global_compiler_lock def _toplevel_typing_context(self): # Lazily-initialized top-level typing context, for all threads return typing.Context() @property + @global_compiler_lock def target_context(self): """ The target context for CPU targets. @@ -47,6 +51,7 @@ def target_context(self): return self._toplevel_target_context @property + @global_compiler_lock def typing_context(self): """ The typing context for CPU targets. 
@@ -57,6 +62,7 @@ def typing_context(self): else: return self._toplevel_typing_context + @global_compiler_lock def nested_context(self, typing_context, target_context): """ A context manager temporarily replacing the contexts with the diff --git a/numba/tests/test_dispatcher.py b/numba/tests/test_dispatcher.py index 30a8e081485..b90d42ede26 100644 --- a/numba/tests/test_dispatcher.py +++ b/numba/tests/test_dispatcher.py @@ -398,6 +398,8 @@ def test_serialization(self): def foo(x): return x + 1 + foo = foo.get_compiled() + self.assertEqual(foo(1), 2) # get serialization memo diff --git a/numba/tests/test_nrt.py b/numba/tests/test_nrt.py index e0c94605671..602132258e8 100644 --- a/numba/tests/test_nrt.py +++ b/numba/tests/test_nrt.py @@ -249,6 +249,8 @@ def alloc_nrt_memory(): """ return np.empty(N, dtype) + alloc_nrt_memory = alloc_nrt_memory.get_compiled() + def keep_memory(): return alloc_nrt_memory() diff --git a/numba/tests/test_record_dtype.py b/numba/tests/test_record_dtype.py index 6d479c413fa..e674bacc957 100644 --- a/numba/tests/test_record_dtype.py +++ b/numba/tests/test_record_dtype.py @@ -803,8 +803,8 @@ def test_record_arg_transform(self): self.assertIn('Array', transformed) self.assertNotIn('first', transformed) self.assertNotIn('second', transformed) - # Length is usually 50 - 5 chars tolerance as above. - self.assertLess(len(transformed), 50) + # Length is usually 60 - 5 chars tolerance as above. + self.assertLess(len(transformed), 60) def test_record_two_arrays(self): """ diff --git a/numba/tests/test_serialize.py b/numba/tests/test_serialize.py index 2bcf843458a..90c3db44a16 100644 --- a/numba/tests/test_serialize.py +++ b/numba/tests/test_serialize.py @@ -135,9 +135,9 @@ def test_reuse(self): Note that "same function" is intentionally under-specified. 
"""
-        func = closure(5)
+        func = closure(5).get_compiled()
         pickled = pickle.dumps(func)
-        func2 = closure(6)
+        func2 = closure(6).get_compiled()
         pickled2 = pickle.dumps(func2)

         f = pickle.loads(pickled)
@@ -152,7 +152,7 @@ def test_reuse(self):
         self.assertEqual(h(2, 3), 11)

         # Now make sure the original object doesn't exist when deserializing
-        func = closure(7)
+        func = closure(7).get_compiled()
         func(42, 43)
         pickled = pickle.dumps(func)
         del func

From 00e056eb0c3f499b704f7efc70c8b4a8e21c72bd Mon Sep 17 00:00:00 2001
From: Sergey Pokhodenko
Date: Fri, 13 Nov 2020 04:56:54 -0600
Subject: [PATCH 18/19] Patch for lowering (#98)

This patch replaces the global initialization of lower_extensions with one
function by a list of functions.
This change was first made in commit 947b407 by @reazulhoque.
---
 numba/core/lowering.py           | 9 ++++++++-
 numba/parfors/parfor_lowering.py | 2 +-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/numba/core/lowering.py b/numba/core/lowering.py
index 1c9c19cd3b1..bcbc4e8b980 100644
--- a/numba/core/lowering.py
+++ b/numba/core/lowering.py
@@ -274,6 +274,13 @@ def debug_print(self, msg):
 class Lower(BaseLower):
     GeneratorLower = generators.GeneratorLower

+    def __init__(self, context, library, fndesc, func_ir, metadata=None):
+        BaseLower.__init__(self, context, library, fndesc, func_ir, metadata)
+        from numba.parfors.parfor_lowering import _lower_parfor_parallel
+        from numba.parfors import parfor
+        if parfor.Parfor not in lower_extensions:
+            lower_extensions[parfor.Parfor] = [_lower_parfor_parallel]
+
     def pre_block(self, block):
         from numba.core.unsafe import eh

@@ -440,7 +447,7 @@ def lower_inst(self, inst):
         else:
             for _class, func in lower_extensions.items():
                 if isinstance(inst, _class):
-                    func(self, inst)
+                    func[-1](self, inst)
                     return
             raise NotImplementedError(type(inst))

diff --git a/numba/parfors/parfor_lowering.py b/numba/parfors/parfor_lowering.py
index dc499498326..e559e8c017c 100644
---
a/numba/parfors/parfor_lowering.py
+++ b/numba/parfors/parfor_lowering.py
@@ -480,7 +480,7 @@ def _lower_parfor_parallel(lowerer, parfor):
         print("_lower_parfor_parallel done")

 # A work-around to prevent circular imports
-lowering.lower_extensions[parfor.Parfor] = _lower_parfor_parallel
+#lowering.lower_extensions[parfor.Parfor] = _lower_parfor_parallel


 def _create_shape_signature(

From 5a1b372956ce881ebaf98c4ddd65681ece90ba0e Mon Sep 17 00:00:00 2001
From: Sergey Pokhodenko
Date: Mon, 16 Nov 2020 05:41:03 -0600
Subject: [PATCH 19/19] Comment out require_global_compiler_lock() (#120)

This line was commented out when codegen debugging was introduced by
@DrTodd13. Uncommenting this line would require modifications in
numba-dppy. It seems that numba-dppy compilation does not work correctly
with the global compiler lock.
---
 numba/core/codegen.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/numba/core/codegen.py b/numba/core/codegen.py
index e22fa89012e..324a999fbe3 100644
--- a/numba/core/codegen.py
+++ b/numba/core/codegen.py
@@ -220,7 +220,7 @@ def finalize(self):
         Finalization involves various stages of code optimization and
         linking.
         """
-        require_global_compiler_lock()
+        #require_global_compiler_lock()

         # Report any LLVM-related problems to the user
         self._codegen._check_llvm_bugs()