diff --git a/CHANGE_LOG b/CHANGE_LOG
index d266d552a55..68b0e38e4ad 100644
--- a/CHANGE_LOG
+++ b/CHANGE_LOG
@@ -1,7 +1,242 @@
-Version 0.51.0
---------------
+Version 0.51.0 (August 12, 2020)
+--------------------------------
+
+This release continues to add new features to Numba and also contains a
+significant number of bug fixes and stability improvements.
+
+Highlights of core feature changes include:
+
+* The compilation chain is now based on LLVM 10 (Valentin Haenel).
+* Numba has internally switched to prefer non-literal types over literal ones so
+  as to reduce function over-specialisation, with a view to speeding up
+  compile times (Siu Kwan Lam).
+* On the CUDA target: Support for CUDA Toolkit 11, Ampere, and Compute
+  Capability 8.0; Printing of ``SASS`` code for kernels; Callbacks to Python
+  functions can be inserted into CUDA streams, and streams are async awaitable;
+  Atomic ``nanmin`` and ``nanmax`` functions are added; Fixes for various
+  miscompilations and segfaults. (mostly Graham Markall; callbacks on
+  streams by Peter Würtz).
+
+Intel also kindly sponsored research and development that led to some exciting
+new features:
+
+* Support for heterogeneous immutable lists and heterogeneous immutable string
+  key dictionaries. Also optional initial/construction value capturing for all
+  lists and dictionaries containing literal values (Stuart Archibald).
+* A new pass-by-reference mutable structure extension type ``StructRef`` (Siu
+  Kwan Lam).
+* Object mode blocks are now cacheable, with the side effect of numerous bug
+  fixes and performance improvements in caching. This also permits caching of
+  functions defined in closures (Siu Kwan Lam).
+
+Deprecations to note:
+
+To align with other targets, the ``argtypes`` and ``restypes`` kwargs to
+``@cuda.jit`` are now deprecated, as is the ``bind`` kwarg.
+Further, the ``target`` kwarg to the ``numba.jit`` decorator family is
+deprecated.
+
+General Enhancements:
+
+* PR #5463: Add str(int) impl
+* PR #5526: Impl. np.asarray(literal)
+* PR #5619: Add support for multi-output ufuncs
+* PR #5711: Division with timedelta input
+* PR #5763: Support minlength argument to np.bincount
+* PR #5779: Return zero array from np.dot when the arguments are empty.
+* PR #5796: Add implementation for np.positive
+* PR #5849: Setitem for records when index is StringLiteral, including literal
+  unroll
+* PR #5856: Add support for conversion of inplace_binop to parfor.
+* PR #5893: Allocate 1D iteration space one at a time for more even
+  distribution.
+* PR #5922: Reduce objmode and unpickling overhead
+* PR #5944: re-enable OpenMP in wheels
+* PR #5946: Implement literal dictionaries and lists.
+* PR #5956: Update numba_sysinfo.py
+* PR #5978: Add structref as a mutable struct that is pass-by-ref
+* PR #5980: Deprecate target kwarg for numba.jit.
+* PR #6058: Add prefer_literal option to overload API
+
+Fixes:
+
+* PR #5674: Fix #3955. 
Allow `with objmode` to be cached +* PR #5724: Initialize process lock lazily to prevent multiprocessing issue +* PR #5783: Make np.divide and np.remainder code more similar +* PR #5808: Fix 5665 Block jit(nopython=True, forceobj=True) and suppress + njit(forceobj=True) +* PR #5834: Fix the is operator on Ellipsis +* PR #5838: Ensure ``Dispatcher.__eq__`` always returns a bool +* PR #5841: cleanup: Use PythonAPI.bool_from_bool in more places +* PR #5862: Do not leak loop iteration variables into the numba.np.npyimpl + namespace +* PR #5869: Update repomap +* PR #5879: Fix erroneous input mutation in linalg routines +* PR #5882: Type check function in jit decorator +* PR #5925: Use np.inf and -np.inf for max and min float values respectively. +* PR #5935: Fix default arguments with multiprocessing +* PR #5952: Fix "Internal error ... local variable 'errstr' referenced before + assignment during BoundFunction(...)" +* PR #5962: Fix SVML tests with LLVM 10 and AVX512 +* PR #5972: fix flake8 for numba/runtests.py +* PR #5995: Update setup.py with new llvmlite versions +* PR #5996: Set lower bound for llvmlite to 0.33 +* PR #6004: Fix problem in branch pruning with LiteralStrKeyDict +* PR #6017: Fixing up numba_do_raise +* PR #6028: Fix #6023 +* PR #6031: Continue 5821 +* PR #6035: Fix overspecialize of literal +* PR #6046: Fixes statement reordering bug in maximize fusion step. +* PR #6056: Fix issue on invalid inlining of non-empty build_list by + inline_arraycall +* PR #6057: fix aarch64/python_3.8 failure on master +* PR #6070: Fix overspecialized containers +* PR #6071: Remove f-strings in setup.py +* PR #6072: Fix for #6005 +* PR #6073: Fixes invalid C prototype in helper function. +* PR #6078: Duplicate NumPy's PyArray_DescrCheck macro +* PR #6081: Fix issue with cross drive use and relpath. +* PR #6083: Fix bug in initial value unify. + +CUDA Enhancements/Fixes: + +* PR #5359: Remove special-casing of 0d arrays +* PR #5709: CUDA: Refactoring of cuda.jit and kernel / dispatcher abstractions +* PR #5732: CUDA Docs: document ``forall`` method of kernels +* PR #5745: CUDA stream callbacks and async awaitable streams +* PR #5761: Add implmentation for int types for isnan and isinf for CUDA +* PR #5819: Add support for CUDA 11 and Ampere / CC 8.0 +* PR #5826: CUDA: Add function to get SASS for kernels +* PR #5846: CUDA: Allow disabling NVVM optimizations, and fix debug issues +* PR #5851: CUDA EMM enhancements - add default get_ipc_handle implementation, + skip a test conditionally +* PR #5852: CUDA: Fix ``cuda.test()`` +* PR #5857: CUDA docs: Add notes on resetting the EMM plugin +* PR #5859: CUDA: Fix reduce docs and style improvements +* PR #6016: Fixes change of list spelling in a cuda test. 
+* PR #6020: CUDA: Fix #5820, adding atomic nanmin / nanmax +* PR #6030: CUDA: Don't optimize IR before sending it to NVVM +* PR #6052: Fix dtype for atomic_add_double testsuite +* PR #6080: CUDA: Prevent auto-upgrade of atomic intrinsics + +Documentation Updates: + +* PR #5782: Host docs on Read the Docs +* PR #5830: doc: Mention that caching uses pickle +* PR #5963: Fix broken link to numpy ufunc signature docs +* PR #5975: restructure communication section +* PR #5981: Document bounds-checking behavior in python deviations page +* PR #5993: Docs for structref +* PR #6008: Small fix so bullet points are rendered by sphinx +* PR #6013: emphasize cuda kernel functions are asynchronous +* PR #6036: Update deprecation doc from numba.errors to numba.core.errors +* PR #6062: Change references to numba.pydata.org to https + +CI updates: + +* PR #5850: Updates the "New Issue" behaviour to better redirect users. +* PR #5940: Add discourse badge +* PR #5960: Setting mypy on CI -In development +Enhancements from user contributed PRs (with thanks!): + +* Aisha Tammy added the ability to switch off TBB support at compile time in + #5821 (continued in #6031 by Stuart Archibald). +* Alexander Stiebing fixed a reference before assignment bug in #5952. +* Alexey Kozlov fixed a bug in tuple getitem for literals in #6028. +* Andrew Eckart updated the repomap in #5869, added support for Read the Docs + in #5782, fixed a bug in the ``np.dot`` implementation to correctly handle + empty arrays in #5779 and added support for ``minlength`` to ``np.bincount`` + in #5763. +* ``@bitsisbits`` updated ``numba_sysinfo.py`` to handle HSA agents correctly in + #5956. +* Daichi Suzuo Fixed a bug in the threading backend initialisation sequence such + that it is now correctly a lazy lock in #5724. +* Eric Wieser contributed a number of patches, particularly in enhancing and + improving the ``ufunc`` capabilities: + + * #5359: Remove special-casing of 0d arrays + * #5834: Fix the is operator on Ellipsis + * #5619: Add support for multi-output ufuncs + * #5841: cleanup: Use PythonAPI.bool_from_bool in more places + * #5862: Do not leak loop iteration variables into the numba.np.npyimpl + namespace + * #5838: Ensure ``Dispatcher.__eq__`` always returns a bool + * #5830: doc: Mention that caching uses pickle + * #5783: Make np.divide and np.remainder code more similar + +* Ethan Pronovost added a guard to prevent the common mistake of applying a jit + decorator to the same function twice in #5881. +* Graham Markall contributed many patches to the CUDA target, as follows: + + * #6052: Fix dtype for atomic_add_double tests + * #6030: CUDA: Don't optimize IR before sending it to NVVM + * #5846: CUDA: Allow disabling NVVM optimizations, and fix debug issues + * #5826: CUDA: Add function to get SASS for kernels + * #5851: CUDA EMM enhancements - add default get_ipc_handle implementation, + skip a test conditionally + * #5709: CUDA: Refactoring of cuda.jit and kernel / dispatcher abstractions + * #5819: Add support for CUDA 11 and Ampere / CC 8.0 + * #6020: CUDA: Fix #5820, adding atomic nanmin / nanmax + * #5857: CUDA docs: Add notes on resetting the EMM plugin + * #5859: CUDA: Fix reduce docs and style improvements + * #5852: CUDA: Fix ``cuda.test()`` + * #5732: CUDA Docs: document ``forall`` method of kernels + +* Guilherme Leobas added support for ``str(int)`` in #5463 and + ``np.asarray(literal value)``` in #5526. +* Hameer Abbasi deprecated the ``target`` kwarg for ``numba.jit`` in #5980. 
+* Hannes Pahl added a badge to the Numba github page linking to the new + discourse forum in #5940 and also fixed a bug that permitted illegal + combinations of flags to be passed into ``@jit`` in #5808. +* Kayran Schmidt emphasized that CUDA kernel functions are asynchronous in the + documentation in #6013. +* Leonardo Uieda fixed a broken link to the NumPy ufunc signature docs in #5963. +* Lucio Fernandez-Arjona added mypy to CI and started adding type annotations to + the code base in #5960, also fixed a (de)serialization problem on the + dispatcher in #5935, improved the undefined variable error message in #5876, + added support for division with timedelta input in #5711 and implemented + ``setitem`` for records when the index is a ``StringLiteral`` in #5849. +* Ludovic Tiako documented Numba's bounds-checking behavior in the python + deviations page in #5981. +* Matt Roeschke changed all ``http`` references ``https`` in #6062. +* ``@niteya-shah`` implemented ``isnan`` and ``isinf`` for integer types on the + CUDA target in #5761 and implemented ``np.positive`` in #5796. +* Peter Würtz added CUDA stream callbacks and async awaitable streams in #5745. +* ``@rht`` fixed an invalid import referred to in the deprecation documentation + in #6036. +* Sergey Pokhodenko updated the SVML tests for LLVM 10 in #5962. +* Shyam Saladi fixed a Sphinx rendering bug in #6008. + +Authors: + +* Aisha Tammy +* Alexander Stiebing +* Alexey Kozlov +* Andrew Eckart +* ``@bitsisbits`` +* Daichi Suzuo +* Eric Wieser +* Ethan Pronovost +* Graham Markall +* Guilherme Leobas +* Hameer Abbasi +* Hannes Pahl +* Kayran Schmidt +* Kozlov, Alexey +* Leonardo Uieda +* Lucio Fernandez-Arjona +* Ludovic Tiako +* Matt Roeschke +* ``@niteya-shah`` +* Peter Würtz +* Sergey Pokhodenko +* Shyam Saladi +* ``@rht`` +* Siu Kwan Lam (core dev) +* Stuart Archibald (core dev) +* Todd A. Anderson (core dev) +* Valentin Haenel (core dev) Version 0.50.1 (Jun 24, 2020) diff --git a/docs/source/conf.py b/docs/source/conf.py index 3db68875355..e1402297ae2 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -122,7 +122,7 @@ # All sphinx_rtd_theme options. Default values commented out; uncomment to # change. html_theme_options = { - 'canonical_url': 'http://numba.pydata.org/numba-doc/latest/index.html', + 'canonical_url': 'https://numba.pydata.org/numba-doc/latest/index.html', # 'logo_only': False, # 'display_version': True, # 'prev_next_buttons_location': 'bottom', diff --git a/docs/source/cuda-reference/host.rst b/docs/source/cuda-reference/host.rst index 317d1c0ab48..d69b5de4e48 100644 --- a/docs/source/cuda-reference/host.rst +++ b/docs/source/cuda-reference/host.rst @@ -178,7 +178,7 @@ Programming Guide Streams section Streams are instances of :class:`numba.cuda.cudadrv.driver.Stream`: .. autoclass:: numba.cuda.cudadrv.driver.Stream - :members: synchronize, auto_synchronize + :members: synchronize, auto_synchronize, add_callback, async_done To create a new stream: diff --git a/docs/source/cuda-reference/kernel.rst b/docs/source/cuda-reference/kernel.rst index 8ab9c06b2ca..cd6e4fb3370 100644 --- a/docs/source/cuda-reference/kernel.rst +++ b/docs/source/cuda-reference/kernel.rst @@ -57,8 +57,8 @@ Dispatcher objects also provide several utility methods for inspection and creating a specialized instance: .. 
autoclass:: numba.cuda.compiler.Dispatcher - :members: inspect_asm, inspect_llvm, inspect_types, specialize, specialized, - extensions + :members: inspect_asm, inspect_llvm, inspect_sass, inspect_types, + specialize, specialized, extensions Intrinsic Attributes and Functions diff --git a/docs/source/reference/deprecation.rst b/docs/source/reference/deprecation.rst index ecdd99688f7..1f43b11c305 100644 --- a/docs/source/reference/deprecation.rst +++ b/docs/source/reference/deprecation.rst @@ -254,3 +254,22 @@ This feature will be moved with respect to this schedule: * Deprecation warnings will be issued in version 0.49.0 * Support for importing from ``numba.jitclass`` will be removed in version 0.52.0. + +Deprecation of the target kwarg +=============================== +There have been a number of users attempting to use the ``target`` keyword +argument that's meant for internal use only. We are deprecating this argument, +as alternative solutions are available to achieve the same behaviour. + +Recommendations +--------------- +Update the ``jit`` decorator as follows: + +* Change ``@numba.jit(..., target='cuda')`` to ``numba.cuda.jit(...)``. + +Schedule +-------- +This feature will be moved with respect to this schedule: + +* Deprecation warnings will be issued in 0.51.0. +* The target kwarg will be removed in version 0.53.0. diff --git a/docs/source/reference/envvars.rst b/docs/source/reference/envvars.rst index 3e9f888a814..f9946b3b61d 100644 --- a/docs/source/reference/envvars.rst +++ b/docs/source/reference/envvars.rst @@ -3,6 +3,10 @@ Environment variables ===================== +.. note:: This section relates to environment variables that impact Numba's + runtime, for compile time environment variables see + :ref:`numba-source-install-env_vars`. + Numba allows its behaviour to be changed through the use of environment variables. Unless otherwise mentioned, those variables have integer values and default to zero. diff --git a/docs/source/user/installing.rst b/docs/source/user/installing.rst index aa5c71df27c..37a4ffc47ac 100644 --- a/docs/source/user/installing.rst +++ b/docs/source/user/installing.rst @@ -170,6 +170,37 @@ Then you can build and install Numba from the top level of the source tree:: $ python setup.py install +.. _numba-source-install-env_vars: + +Build time environment variables and configuration of optional components +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Below are environment variables that are applicable to altering how Numba would +otherwise build by default along with information on configuration options. + +.. envvar:: NUMBA_DISABLE_OPENMP (default: not set) + + To disable compilation of the OpenMP threading backend set this environment + variable to a non-empty string when building. If not set (default): + + * For Linux and Windows it is necessary to provide OpenMP C headers and + runtime libraries compatible with the compiler tool chain mentioned above, + and for these to be accessible to the compiler via standard flags. + * For OSX the conda packages ``llvm-openmp`` and ``intel-openmp`` provide + suitable C headers and libraries. If the compilation requirements are not + met the OpenMP threading backend will not be compiled + +.. envvar:: NUMBA_DISABLE_TBB (default: not set) + + To disable the compilation of the TBB threading backend set this environment + variable to a non-empty string when building. If not set (default) the TBB C + headers and libraries must be available at compile time. 
If building with + ``conda build`` this requirement can be met by installing the ``tbb-devel`` + package. If not building with ``conda build`` the requirement can be met via a + system installation of TBB or through the use of the ``TBBROOT`` environment + variable to provide the location of the TBB installation. For more + information about setting ``TBBROOT`` see the `Intel documentation `_. + .. _numba-source-install-check: Dependency List @@ -177,7 +208,7 @@ Dependency List Numba has numerous required and optional dependencies which additionally may vary with target operating system and hardware. The following lists them all -(as of September 2019). +(as of July 2020). * Required build time: @@ -185,26 +216,24 @@ vary with target operating system and hardware. The following lists them all * ``numpy`` * ``llvmlite`` * Compiler toolchain mentioned above - * OpenMP C headers and runtime libraries compatible with the compiler - toolchain mentioned above and accessible to the compiler via standard flags - (Linux, Windows). + +* Required run time: + + * ``setuptools`` + * ``numpy`` + * ``llvmlite`` * Optional build time: + See :ref:`numba-source-install-env_vars` for more details about additional + options for the configuration and specification of these optional components. + * ``llvm-openmp`` (OSX) - provides headers for compiling OpenMP support into Numba's threading backend * ``intel-openmp`` (OSX) - provides OpenMP library support for Numba's threading backend. * ``tbb-devel`` - provides TBB headers/libraries for compiling TBB support into Numba's threading backend - * ``pickle5`` - provides Python 3.8 pickling features for faster pickling in - Python 3.6 and 3.7. - -* Required run time: - - * ``setuptools`` - * ``numpy`` - * ``llvmlite`` * Optional runtime are: @@ -234,6 +263,8 @@ vary with target operating system and hardware. The following lists them all inspection. `See here `_ for information on obtaining and installing. * ``graphviz`` - for some CFG inspection functionality. + * ``pickle5`` - provides Python 3.8 pickling features for faster pickling in + Python 3.6 and 3.7. * To build the documentation: @@ -299,4 +330,3 @@ further information. pci bus id: 1 (output truncated due to length) - diff --git a/numba/_helperlib.c b/numba/_helperlib.c index ad698cdf5b2..399c6c10415 100644 --- a/numba/_helperlib.c +++ b/numba/_helperlib.c @@ -825,143 +825,17 @@ static void traceback_add(const char *funcname, const char *filename, int lineno _PyErr_ChainExceptions(exc, val, tb); } -/* Logic for raising an arbitrary object. Adapted from CPython's ceval.c. - This *consumes* a reference count to its argument. */ -NUMBA_EXPORT_FUNC(int) -numba_do_raise(PyObject *exc_packed) -{ - PyObject *exc = NULL, *type = NULL, *value = NULL, *loc = NULL; + +/* + * Add traceback information to *loc* to the active exception. + * loc can be NULL, which causes this function to become a no-op. + */ +static +void traceback_add_loc(PyObject *loc) { const char *function_name_str = NULL, *filename_str = NULL; PyObject *function_name = NULL, *filename = NULL, *lineno = NULL; Py_ssize_t pos; - /* We support the following forms of raise: - raise - raise - raise */ - - /* could be a tuple from npm (some exc like thing, args, location) */ - if (PyTuple_CheckExact(exc_packed)) { - /* Unpack a (class/inst/tuple, arguments, location) tuple. 
*/ - if (!PyArg_ParseTuple(exc_packed, "OOO", &exc, &value, &loc)) { - Py_DECREF(exc_packed); - goto raise_error_w_loc; - } - - if (exc == Py_None) { - /* Reraise */ - PyThreadState *tstate = PyThreadState_GET(); - PyObject *tb; -#if (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION >= 7) - _PyErr_StackItem *tstate_exc = tstate->exc_info; -#else - PyThreadState *tstate_exc = tstate; -#endif - Py_DECREF(exc_packed); - type = tstate_exc->exc_type; - value = tstate_exc->exc_value; - tb = tstate_exc->exc_traceback; - if (type == Py_None) { - PyErr_SetString(PyExc_RuntimeError, - "No active exception to reraise"); - return 0; - } - Py_XINCREF(type); - Py_XINCREF(value); - Py_XINCREF(tb); - PyErr_Restore(type, value, tb); - return 1; - } - - /* the unpacked exc should be a class, value and loc are set from above - */ - Py_XINCREF(value); - Py_XINCREF(loc); - if (PyExceptionClass_Check(exc)) { - /* It is a class, type used here just as a tmp var */ - type = PyObject_CallObject(exc, value); - if (type == NULL) - goto raise_error_w_loc; - if (!PyExceptionInstance_Check(type)) { - PyErr_SetString(PyExc_TypeError, - "exceptions must derive from BaseException"); - goto raise_error_w_loc; - } - /* all ok, set type to the exc */ - Py_DECREF(type); - type = exc; - } else { - /* this should be unreachable as typing should catch it */ - /* Not something you can raise. You get an exception - anyway, just not what you specified :-) */ - Py_DECREF(exc_packed); - PyErr_SetString(PyExc_TypeError, - "exceptions must derive from BaseException"); - goto raise_error_w_loc; - } - - /* as this branch is exited: - * - type should be an exception class - * - value should be the args for the exception class instantiation - * - loc should be the location information (or None) - */ - } else { /* could be a reraise or an exception from objmode */ - exc = exc_packed; - if (exc == Py_None) { - /* Reraise */ - PyThreadState *tstate = PyThreadState_GET(); - PyObject *tb; -#if (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION >= 7) - _PyErr_StackItem *tstate_exc = tstate->exc_info; -#else - PyThreadState *tstate_exc = tstate; -#endif - Py_DECREF(exc); - type = tstate_exc->exc_type; - value = tstate_exc->exc_value; - tb = tstate_exc->exc_traceback; - if (type == Py_None) { - PyErr_SetString(PyExc_RuntimeError, - "No active exception to reraise"); - return 0; - } - Py_XINCREF(type); - Py_XINCREF(value); - Py_XINCREF(tb); - PyErr_Restore(type, value, tb); - return 1; - } - - /* exc should be an exception class or an instance of an exception */ - if (PyExceptionClass_Check(exc)) { - type = exc; - value = PyObject_CallObject(exc, value); - if (value == NULL) - goto raise_error; - if (!PyExceptionInstance_Check(value)) { - PyErr_SetString(PyExc_TypeError, - "exceptions must derive from BaseException"); - goto raise_error; - } - } - else if (PyExceptionInstance_Check(exc)) { - value = exc; - type = PyExceptionInstance_Class(exc); - Py_INCREF(type); - } - else { - /* Not something you can raise. 
You get an exception - anyway, just not what you specified :-) */ - Py_DECREF(exc); // exc points to exc_packed - PyErr_SetString(PyExc_TypeError, - "exceptions must derive from BaseException"); - goto raise_error; - } - } - - PyErr_SetObject(type, value); - -raise_error_w_loc: /* instance is instantiated/internal exception is raised, if loc is present * add a frame for it into the traceback */ if(loc && loc != Py_None && PyTuple_Check(loc)) @@ -977,31 +851,141 @@ numba_do_raise(PyObject *exc_packed) traceback_add(function_name_str, filename_str, \ (int)PyLong_AsLong(lineno)); } +} - /* PyErr_SetObject incref's its arguments */ - Py_XDECREF(value); - Py_XDECREF(type); - return 0; +/** + * Re-raise the current active exception. + * Called internal by process_raise() when *exc* is None. + */ +static +int reraise_exc_is_none(void) { + /* Reraise */ + PyThreadState *tstate = PyThreadState_GET(); + PyObject *tb, *type, *value; +#if (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION >= 7) + _PyErr_StackItem *tstate_exc = tstate->exc_info; +#else + PyThreadState *tstate_exc = tstate; +#endif + type = tstate_exc->exc_type; + value = tstate_exc->exc_value; + tb = tstate_exc->exc_traceback; + if (type == Py_None) { + PyErr_SetString(PyExc_RuntimeError, + "No active exception to reraise"); + return 0; + } + /* incref needed because PyErr_Restore DOES NOT */ + Py_XINCREF(type); + Py_XINCREF(value); + Py_XINCREF(tb); + PyErr_Restore(type, value, tb); + return 1; +} -raise_error: - Py_XDECREF(value); - Py_XDECREF(type); - return 0; +/* + * Set exception given the Exception type and the constructor argument. + * Equivalent to ``raise exc(value)``. + * PyExceptionClass_Check(exc) must be True. + * value can be NULL. + */ +static +int process_exception_class(PyObject *exc, PyObject *value) { + PyObject *type; + /* It is a class, type used here just as a tmp var */ + type = PyObject_CallObject(exc, value); + if (type == NULL){ + return 0; + } + if (!PyExceptionInstance_Check(type)) { + PyErr_SetString(PyExc_TypeError, + "exceptions must derive from BaseException"); + Py_DECREF(type); + return 0; + } + /* all ok, set type to the exc */ + Py_DECREF(type); + type = exc; + PyErr_SetObject(type, value); + return 1; } +/* + * Internal routine to process exceptions. + * exc cannot be NULL. It can be a None, Exception type, or Exception instance. + * value can be NULL for absent, or any PyObject valid for the exception. + */ +static +int process_raise(PyObject *exc, PyObject *value) { + /* exc is None */ + if (exc == Py_None) { + return reraise_exc_is_none(); + } + /* exc should be an exception class */ + else if (PyExceptionClass_Check(exc)) { + return process_exception_class(exc, value); + } + /* exc is an instance of an Exception */ + else if (PyExceptionInstance_Check(exc)) { + PyObject *type = PyExceptionInstance_Class(exc); + PyErr_SetObject(type, exc); + return 0; + } + else { + /* Not something you can raise. You get an exception + anyway, just not what you specified :-) */ + PyErr_SetString(PyExc_TypeError, + "exceptions must derive from BaseException"); + return 0; + } +} + +/* Logic for raising an arbitrary object. Adapted from CPython's ceval.c. + This *consumes* a reference count to its argument. 
*/ +NUMBA_EXPORT_FUNC(int) +numba_do_raise(PyObject *exc_packed) +{ + int status; + PyObject *exc = NULL, *value = NULL, *loc = NULL; + + /* We support the following forms of raise: + raise + raise + raise */ + + /* could be a tuple from npm (some exc like thing, args, location) */ + if (PyTuple_CheckExact(exc_packed)) { + /* Unpack a (class/inst/tuple, arguments, location) tuple. */ + if (!PyArg_ParseTuple(exc_packed, "OOO", &exc, &value, &loc)) { + traceback_add_loc(loc); + return 0; + } + } else { + /* could be a reraise or an exception from objmode */ + exc = exc_packed; + /* branch exit with value = NULL and loc = NULL */ + } + /* value is either NULL or borrowed */ + status = process_raise(exc, value); + traceback_add_loc(loc); + Py_DECREF(exc_packed); + return status; +} + + NUMBA_EXPORT_FUNC(PyObject *) -numba_unpickle(const char *data, int n) +numba_unpickle(const char *data, int n, const char *hashed) { - PyObject *buf, *obj; - static PyObject *loads; + PyObject *buf=NULL, *obj=NULL, *addr=NULL, *hashedbuf=NULL; + static PyObject *loads=NULL; /* Caching the pickle.loads function shaves a couple µs here. */ if (loads == NULL) { PyObject *picklemod; - picklemod = PyImport_ImportModule("pickle"); + picklemod = PyImport_ImportModule("numba.core.serialize"); if (picklemod == NULL) return NULL; - loads = PyObject_GetAttrString(picklemod, "loads"); + loads = PyObject_GetAttrString(picklemod, "_numba_unpickle"); Py_DECREF(picklemod); if (loads == NULL) return NULL; @@ -1010,7 +994,17 @@ numba_unpickle(const char *data, int n) buf = PyBytes_FromStringAndSize(data, n); if (buf == NULL) return NULL; - obj = PyObject_CallFunctionObjArgs(loads, buf, NULL); + /* SHA1 produces 160 bit or 20 bytes */ + hashedbuf = PyBytes_FromStringAndSize(hashed, 20); + if (hashedbuf == NULL) + goto error; + addr = PyLong_FromVoidPtr((void*)data); + if (addr == NULL) + goto error; + obj = PyObject_CallFunctionObjArgs(loads, addr, buf, hashedbuf, NULL); +error: + Py_XDECREF(addr); + Py_XDECREF(hashedbuf); Py_DECREF(buf); return obj; } diff --git a/numba/_numba_common.h b/numba/_numba_common.h index f406db58317..c5e67d9c6a3 100644 --- a/numba/_numba_common.h +++ b/numba/_numba_common.h @@ -17,4 +17,23 @@ #define VISIBILITY_HIDDEN #endif +/* + * Numba's version of the PyArray_DescrCheck macro from NumPy, use it as a + * direct replacement of NumPy's PyArray_DescrCheck to ensure binary + * compatibility. + * + * Details of why this is needed: + * NumPy 1.18 changed the definition of the PyArray_DescrCheck macro here: + * https://github.com/numpy/numpy/commit/6108b5d1e138d07e3c9f2a4e3b1933749ad0e698 + * the result of this being that building against NumPy <1.18 would prevent + * Numba running against NumPy >= 1.20 as noted here: + * https://github.com/numba/numba/issues/6041#issuecomment-665132199 + * + * This macro definition is copied from: + * https://github.com/numpy/numpy/commit/6108b5d1e138d07e3c9f2a4e3b1933749ad0e698#diff-ad2213da23136c5fc5883d9eb2d88666R26 + * + * NOTE: This is the NumPy 1.18 and above version of the macro. 
+ */ +#define NUMBA_PyArray_DescrCheck(op) PyObject_TypeCheck(op, &PyArrayDescr_Type) + #endif /* NUMBA_COMMON_H_ */ diff --git a/numba/_typeof.c b/numba/_typeof.c index 242150cccd0..ffe0e3a3c58 100644 --- a/numba/_typeof.c +++ b/numba/_typeof.c @@ -4,6 +4,7 @@ #include #include +#include "_numba_common.h" #include "_typeof.h" #include "_hashtable.h" @@ -372,7 +373,7 @@ compute_fingerprint(string_writer_t *w, PyObject *val) PyBuffer_Release(&buf); return 0; } - if (PyArray_DescrCheck(val)) { + if (NUMBA_PyArray_DescrCheck(val)) { TRY(string_writer_put_char, w, OP_NP_DTYPE); return compute_dtype_fingerprint(w, (PyArray_Descr *) val); } diff --git a/numba/core/analysis.py b/numba/core/analysis.py index ebf4c0c87c9..96cb2014cd9 100644 --- a/numba/core/analysis.py +++ b/numba/core/analysis.py @@ -608,6 +608,13 @@ def find_literally_calls(func_ir, argtypes): first_loc.setdefault(argindex, assign.loc) # Signal the dispatcher to force literal typing for pos in marked_args: - if not isinstance(argtypes[pos], (types.Literal, types.InitialValue)): + query_arg = argtypes[pos] + do_raise = (isinstance(query_arg, types.InitialValue) and + query_arg.initial_value is None) + if do_raise: + loc = first_loc[pos] + raise errors.ForceLiteralArg(marked_args, loc=loc) + + if not isinstance(query_arg, (types.Literal, types.InitialValue)): loc = first_loc[pos] raise errors.ForceLiteralArg(marked_args, loc=loc) diff --git a/numba/core/callconv.py b/numba/core/callconv.py index ec154128887..7a7bb85feb3 100644 --- a/numba/core/callconv.py +++ b/numba/core/callconv.py @@ -316,8 +316,9 @@ def get_exception(self, exc_id): msg = "unknown error %d in native function" % exc_id return SystemError, (msg,) - -excinfo_t = ir.LiteralStructType([GENERIC_POINTER, int32_t]) +# The structure type constructed by PythonAPI.serialize_uncached() +# i.e a {i8* pickle_buf, i32 pickle_bufsz, i8* hash_buf} +excinfo_t = ir.LiteralStructType([GENERIC_POINTER, int32_t, GENERIC_POINTER]) excinfo_ptr_t = ir.PointerType(excinfo_t) diff --git a/numba/core/codegen.py b/numba/core/codegen.py index 05a68e10d99..1e05c12bacd 100644 --- a/numba/core/codegen.py +++ b/numba/core/codegen.py @@ -742,7 +742,7 @@ def _check_llvm_bugs(self): raise RuntimeError( "LLVM will produce incorrect floating-point code " "in the current locale %s.\nPlease read " - "http://numba.pydata.org/numba-doc/latest/user/faq.html#llvm-locale-bug " + "https://numba.pydata.org/numba-doc/latest/user/faq.html#llvm-locale-bug " "for more information." 
% (loc,)) raise AssertionError("Unexpected IR:\n%s\n" % (ir_out,)) diff --git a/numba/core/cpu_options.py b/numba/core/cpu_options.py index 758f95b8479..c8019949a64 100644 --- a/numba/core/cpu_options.py +++ b/numba/core/cpu_options.py @@ -50,6 +50,7 @@ def __init__(self, value): self.enabled = value self.comprehension = value self.reduction = value + self.inplace_binop = value self.setitem = value self.numpy = value self.stencil = value @@ -59,6 +60,7 @@ def __init__(self, value): self.enabled = True self.comprehension = value.pop('comprehension', True) self.reduction = value.pop('reduction', True) + self.inplace_binop = value.pop('inplace_binop', True) self.setitem = value.pop('setitem', True) self.numpy = value.pop('numpy', True) self.stencil = value.pop('stencil', True) diff --git a/numba/core/debuginfo.py b/numba/core/debuginfo.py index 23f49d2b655..b26c7f30219 100644 --- a/numba/core/debuginfo.py +++ b/numba/core/debuginfo.py @@ -31,6 +31,12 @@ def mark_subprogram(self, function, name, loc): """ pass + @abc.abstractmethod + def initialize(self): + """Initialize the debug info. An opportunity for the debuginfo to + prepare any necessary data structures. + """ + @abc.abstractmethod def finalize(self): """Finalize the debuginfo by emitting all necessary metadata. @@ -52,6 +58,9 @@ def mark_location(self, builder, loc): def mark_subprogram(self, function, name, loc): pass + def initialize(self): + pass + def finalize(self): pass @@ -66,6 +75,11 @@ def __init__(self, module, filepath): self.filepath = os.path.abspath(filepath) self.difile = self._di_file() self.subprograms = [] + self.initialize() + + def initialize(self): + # Create the compile unit now because it is referenced when + # constructing subprograms self.dicompileunit = self._di_compile_unit() def _var_type(self, lltype, size): @@ -302,6 +316,7 @@ def _di_file(self): def _di_compile_unit(self): filepair = self._filepair() empty = self.module.add_metadata([self._const_int(0)]) + sp_metadata = self.module.add_metadata(self.subprograms) return self.module.add_metadata([ self._const_int(self.DI_Compile_unit), # tag filepair, # source directory and file pair @@ -374,3 +389,11 @@ def _di_location(self, line): None, # original scope ]) + def initialize(self): + pass + + def finalize(self): + # We create the compile unit at this point because subprograms is + # populated and can be referred to by the compile unit. + self.dicompileunit = self._di_compile_unit() + super().finalize() diff --git a/numba/core/decorators.py b/numba/core/decorators.py index e1870091ba6..cfe91168969 100644 --- a/numba/core/decorators.py +++ b/numba/core/decorators.py @@ -23,7 +23,7 @@ "Signatures should be passed as the first " "positional argument.") -def jit(signature_or_function=None, locals={}, target='cpu', cache=False, +def jit(signature_or_function=None, locals={}, cache=False, pipeline_class=None, boundscheck=False, **options): """ This decorator is used to compile a Python function into native code. @@ -41,7 +41,7 @@ def jit(signature_or_function=None, locals={}, target='cpu', cache=False, Mapping of local variable names to Numba types. Used to override the types deduced by Numba's type inference engine. - target: str + target (deprecated): str Specifies the target platform to compile for. Valid targets are cpu, gpu, npyufunc, and cuda. Defaults to cpu. 
@@ -145,6 +145,11 @@ def bar(x, y): raise DeprecationError(_msg_deprecated_signature_arg.format('restype')) if options.get('nopython', False) and options.get('forceobj', False): raise ValueError("Only one of 'nopython' or 'forceobj' can be True.") + if 'target' in options: + target = options.pop('target') + warnings.warn("The 'target' keyword argument is deprecated.", NumbaDeprecationWarning) + else: + target = options.pop('_target', 'cpu') options['boundscheck'] = boundscheck diff --git a/numba/core/errors.py b/numba/core/errors.py index 46426a9e002..3417a86777b 100644 --- a/numba/core/errors.py +++ b/numba/core/errors.py @@ -336,9 +336,9 @@ def termcolor(): please file a feature request at: https://github.com/numba/numba/issues/new To see Python/NumPy features supported by the latest release of Numba visit: -http://numba.pydata.org/numba-doc/latest/reference/pysupported.html +https://numba.pydata.org/numba-doc/latest/reference/pysupported.html and -http://numba.pydata.org/numba-doc/latest/reference/numpysupported.html +https://numba.pydata.org/numba-doc/latest/reference/numpysupported.html """ constant_inference_info = """ @@ -347,7 +347,7 @@ def termcolor(): however please first check that your code is valid for compilation, particularly with respect to string interpolation (not supported!) and the requirement of compile time constants as arguments to exceptions: -http://numba.pydata.org/numba-doc/latest/reference/pysupported.html?highlight=exceptions#constructs +https://numba.pydata.org/numba-doc/latest/reference/pysupported.html?highlight=exceptions#constructs If the code is valid and the unsupported functionality is important to you please file a feature request at: https://github.com/numba/numba/issues/new @@ -360,12 +360,12 @@ def termcolor(): the use of unsupported features or an issue in resolving types. To see Python/NumPy features supported by the latest release of Numba visit: -http://numba.pydata.org/numba-doc/latest/reference/pysupported.html +https://numba.pydata.org/numba-doc/latest/reference/pysupported.html and -http://numba.pydata.org/numba-doc/latest/reference/numpysupported.html +https://numba.pydata.org/numba-doc/latest/reference/numpysupported.html For more information about typing errors and how to debug them visit: -http://numba.pydata.org/numba-doc/latest/user/troubleshoot.html#my-code-doesn-t-compile +https://numba.pydata.org/numba-doc/latest/user/troubleshoot.html#my-code-doesn-t-compile If you think your code should work with Numba, please report the error message and traceback, along with a minimal reproducer at: diff --git a/numba/core/extending.py b/numba/core/extending.py index 96f90a6df33..8d8d8525e21 100644 --- a/numba/core/extending.py +++ b/numba/core/extending.py @@ -55,7 +55,8 @@ def generic(self): _overload_default_jit_options = {'no_cpython_wrapper': True} -def overload(func, jit_options={}, strict=True, inline='never'): +def overload(func, jit_options={}, strict=True, inline='never', + prefer_literal=False): """ A decorator marking the decorated function as typing and implementing *func* in nopython mode. @@ -101,6 +102,12 @@ def len_impl(seq): holds the information from the callee. The function should return Truthy to determine whether to inline, this essentially permitting custom inlining rules (typical use might be cost models). + + The *prefer_literal* option allows users to control if literal types should + be tried first or last. The default (`False`) is to use non-literal types. 
+ Implementations that can specialize based on literal values should set the + option to `True`. Note, this option maybe expanded in the near future to + allow for more control (e.g. disabling non-literal types). """ from numba.core.typing.templates import make_overload_template, infer_global @@ -110,7 +117,7 @@ def len_impl(seq): def decorate(overload_func): template = make_overload_template(func, overload_func, opts, strict, - inline) + inline, prefer_literal) infer(template) if callable(func): infer_global(func, types.Function(template)) @@ -207,6 +214,7 @@ def decorate(overload_func): template = make_overload_method_template( typ, attr, overload_func, inline=kwargs.get('inline', 'never'), + prefer_literal=kwargs.get('prefer_literal', False) ) infer_getattr(template) overload(overload_func, **kwargs)(overload_func) diff --git a/numba/core/inline_closurecall.py b/numba/core/inline_closurecall.py index 30cb64c4d51..be56fe05162 100644 --- a/numba/core/inline_closurecall.py +++ b/numba/core/inline_closurecall.py @@ -939,6 +939,8 @@ def _inline_arraycall(func_ir, cfg, visited, loop, swapped, enable_prange=False, list_var_def = get_definition(func_ir, list_var_def.value) # Check if the definition is a build_list require(isinstance(list_var_def, ir.Expr) and list_var_def.op == 'build_list') + # The build_list must be empty + require(len(list_var_def.items) == 0) # Look for list_append in "last" block in loop body, which should be a block that is # a post-dominator of the loop header. diff --git a/numba/core/ir_utils.py b/numba/core/ir_utils.py index 96a74095398..1d58c5c8b5b 100644 --- a/numba/core/ir_utils.py +++ b/numba/core/ir_utils.py @@ -2003,7 +2003,7 @@ def raise_on_unsupported_feature(func_ir, typemap): "in a function is unsupported (strange things happen!), use " "numba.gdb_breakpoint() to create additional breakpoints " "instead.\n\nRelevant documentation is available here:\n" - "http://numba.pydata.org/numba-doc/latest/user/troubleshoot.html" + "https://numba.pydata.org/numba-doc/latest/user/troubleshoot.html" "/troubleshoot.html#using-numba-s-direct-gdb-bindings-in-" "nopython-mode\n\nConflicting calls found at:\n %s") buf = '\n'.join([x.strformat() for x in gdb_calls]) @@ -2021,7 +2021,7 @@ def warn_deprecated(func_ir, typemap): arg = name.split('.')[1] fname = func_ir.func_id.func_qualname tyname = 'list' if isinstance(ty, types.List) else 'set' - url = ("http://numba.pydata.org/numba-doc/latest/reference/" + url = ("https://numba.pydata.org/numba-doc/latest/reference/" "deprecation.html#deprecation-of-reflection-for-list-and" "-set-types") msg = ("\nEncountered the use of a type that is scheduled for " diff --git a/numba/core/lowering.py b/numba/core/lowering.py index 1cd0bb5d831..1c9c19cd3b1 100644 --- a/numba/core/lowering.py +++ b/numba/core/lowering.py @@ -465,7 +465,8 @@ def lower_setitem(self, target_var, index_var, value_var, signature): target = self.context.cast(self.builder, target, targetty, targetty.type) else: - assert targetty == signature.args[0] + ul = types.unliteral + assert ul(targetty) == ul(signature.args[0]) index = self.context.cast(self.builder, index, indexty, signature.args[1]) diff --git a/numba/core/object_mode_passes.py b/numba/core/object_mode_passes.py index 99748e8b5a6..ce302a4b5a7 100644 --- a/numba/core/object_mode_passes.py +++ b/numba/core/object_mode_passes.py @@ -177,7 +177,7 @@ def backend_object_mode(): warnings.warn(errors.NumbaWarning(warn_msg, state.func_ir.loc)) - url = ("http://numba.pydata.org/numba-doc/latest/reference/" + url = 
("https://numba.pydata.org/numba-doc/latest/reference/" "deprecation.html#deprecation-of-object-mode-fall-" "back-behaviour-when-using-jit") msg = ("\nFall-back from the nopython compilation path to the " diff --git a/numba/core/pylowering.py b/numba/core/pylowering.py index b5783ceb748..2b8aa8c6002 100644 --- a/numba/core/pylowering.py +++ b/numba/core/pylowering.py @@ -5,6 +5,7 @@ import builtins import operator +import inspect from llvmlite.llvmpy.core import Type, Constant import llvmlite.llvmpy.core as lc @@ -75,10 +76,6 @@ def init(self): def pre_lower(self): super(PyLower, self).pre_lower() self.init_pyapi() - # Pre-computed for later use - from numba.core.dispatcher import OmittedArg - self.omitted_typobj = self.pyapi.unserialize( - self.pyapi.serialize_object(OmittedArg)) def post_lower(self): pass @@ -170,6 +167,15 @@ def lower_inst(self, inst): else: raise NotImplementedError(type(inst), inst) + @utils.cached_property + def _omitted_typobj(self): + """Return a `OmittedArg` type instance as a LLVM value suitable for + testing at runtime. + """ + from numba.core.dispatcher import OmittedArg + return self.pyapi.unserialize( + self.pyapi.serialize_object(OmittedArg)) + def lower_assign(self, inst): """ The returned object must have a new reference @@ -188,21 +194,28 @@ def lower_assign(self, inst): elif isinstance(value, ir.Yield): return self.lower_yield(value) elif isinstance(value, ir.Arg): + param = self.func_ir.func_id.pysig.parameters.get(value.name) + obj = self.fnargs[value.index] - # When an argument is omitted, the dispatcher hands it as - # _OmittedArg() - typobj = self.pyapi.get_type(obj) slot = cgutils.alloca_once_value(self.builder, obj) - is_omitted = self.builder.icmp_unsigned('==', typobj, - self.omitted_typobj) - with self.builder.if_else(is_omitted, likely=False) as (omitted, present): - with present: - self.incref(obj) - self.builder.store(obj, slot) - with omitted: - # The argument is omitted => get the default value - obj = self.pyapi.object_getattr_string(obj, 'value') - self.builder.store(obj, slot) + # Don't check for OmittedArg unless the argument has a default + if param is not None and param.default is inspect.Parameter.empty: + self.incref(obj) + self.builder.store(obj, slot) + else: + # When an argument is omitted, the dispatcher hands it as + # _OmittedArg() + typobj = self.pyapi.get_type(obj) + is_omitted = self.builder.icmp_unsigned('==', typobj, + self._omitted_typobj) + with self.builder.if_else(is_omitted, likely=False) as (omitted, present): + with present: + self.incref(obj) + self.builder.store(obj, slot) + with omitted: + # The argument is omitted => get the default value + obj = self.pyapi.object_getattr_string(obj, 'value') + self.builder.store(obj, slot) return self.builder.load(slot) else: diff --git a/numba/core/pythonapi.py b/numba/core/pythonapi.py index 0d85e0e17e6..93a960849b5 100644 --- a/numba/core/pythonapi.py +++ b/numba/core/pythonapi.py @@ -1,6 +1,7 @@ from collections import namedtuple import contextlib import pickle +import hashlib from llvmlite import ir from llvmlite.llvmpy.core import Type, Constant @@ -1298,27 +1299,37 @@ def unserialize(self, structptr): Unserialize some data. *structptr* should be a pointer to a {i8* data, i32 length} structure. 
""" - fnty = Type.function(self.pyobj, (self.voidptr, ir.IntType(32))) + fnty = Type.function(self.pyobj, + (self.voidptr, ir.IntType(32), self.voidptr)) fn = self._get_function(fnty, name="numba_unpickle") ptr = self.builder.extract_value(self.builder.load(structptr), 0) n = self.builder.extract_value(self.builder.load(structptr), 1) - return self.builder.call(fn, (ptr, n)) + hashed = self.builder.extract_value(self.builder.load(structptr), 2) + return self.builder.call(fn, (ptr, n, hashed)) def serialize_uncached(self, obj): """ Same as serialize_object(), but don't create a global variable, - simply return a literal {i8* data, i32 length} structure. + simply return a literal {i8* data, i32 length, i8* hashbuf} structure. """ # First make the array constant data = serialize.dumps(obj) assert len(data) < 2**31 name = ".const.pickledata.%s" % (id(obj) if config.DIFF_IR == 0 else "DIFF_IR") bdata = cgutils.make_bytearray(data) + # Make SHA1 hash on the pickled content + # NOTE: update buffer size in numba_unpickle() when changing the + # hash algorithm. + hashed = cgutils.make_bytearray(hashlib.sha1(data).digest()) arr = self.context.insert_unique_const(self.module, name, bdata) + hasharr = self.context.insert_unique_const( + self.module, f"{name}.sha1", hashed, + ) # Then populate the structure constant struct = ir.Constant.literal_struct([ arr.bitcast(self.voidptr), ir.Constant(ir.IntType(32), arr.type.pointee.count), + hasharr.bitcast(self.voidptr), ]) return struct diff --git a/numba/core/runtime/_nrt_python.c b/numba/core/runtime/_nrt_python.c index 97f7f19efb3..c3053bda134 100644 --- a/numba/core/runtime/_nrt_python.c +++ b/numba/core/runtime/_nrt_python.c @@ -10,6 +10,7 @@ #include #include "../../_arraystruct.h" +#include "../../_numba_common.h" #include "nrt.h" @@ -294,7 +295,7 @@ NRT_adapt_ndarray_to_python(arystruct_t* arystruct, int ndim, npy_intp *shape, *strides; int flags = 0; - if (!PyArray_DescrCheck(descr)) { + if (!NUMBA_PyArray_DescrCheck(descr)) { PyErr_Format(PyExc_TypeError, "expected dtype object, got '%.200s'", Py_TYPE(descr)->tp_name); diff --git a/numba/core/serialize.py b/numba/core/serialize.py index bac37ef7dca..ae5e7ac32be 100644 --- a/numba/core/serialize.py +++ b/numba/core/serialize.py @@ -133,6 +133,32 @@ def _rebuild_code(marshal_version, bytecode_magic, marshalled): return marshal.loads(marshalled) +# Keep unpickled object via `numba_unpickle` alive. +_unpickled_memo = {} + + +def _numba_unpickle(address, bytedata, hashed): + """Used by `numba_unpickle` from _helperlib.c + + Parameters + ---------- + address : int + bytedata : bytes + hashed : bytes + + Returns + ------- + obj : object + unpickled object + """ + key = (address, hashed) + try: + obj = _unpickled_memo[key] + except KeyError: + _unpickled_memo[key] = obj = pickle.loads(bytedata) + return obj + + def dumps(obj): """Similar to `pickle.dumps()`. Returns the serialized object in bytes. 
""" diff --git a/numba/core/typeconv/typeconv.py b/numba/core/typeconv/typeconv.py index 6f05e276a7f..08c05649caf 100644 --- a/numba/core/typeconv/typeconv.py +++ b/numba/core/typeconv/typeconv.py @@ -5,7 +5,7 @@ except ImportError as e: from numba.core.errors import feedback_details as reportme import sys - url = "http://numba.pydata.org/numba-doc/latest/developer/contributing.html" + url = "https://numba.pydata.org/numba-doc/latest/developer/contributing.html" dashes = '-' * 80 msg = ("Numba could not be imported.\nIf you are seeing this message and " "are undertaking Numba development work, you may need to re-run:\n\n" diff --git a/numba/core/typed_passes.py b/numba/core/typed_passes.py index 25d8f7a91a0..3756a3e7c22 100644 --- a/numba/core/typed_passes.py +++ b/numba/core/typed_passes.py @@ -304,7 +304,7 @@ def run_pass(self, state): # parfor calls the compiler chain again with a string if not (config.DISABLE_PERFORMANCE_WARNINGS or state.func_ir.loc.filename == ''): - url = ("http://numba.pydata.org/numba-doc/latest/user/" + url = ("https://numba.pydata.org/numba-doc/latest/user/" "parallel.html#diagnostics") msg = ("\nThe keyword argument 'parallel=True' was specified " "but no transformation for parallel execution was " diff --git a/numba/core/typeinfer.py b/numba/core/typeinfer.py index 89825d34244..e1a4eed79cd 100644 --- a/numba/core/typeinfer.py +++ b/numba/core/typeinfer.py @@ -1138,7 +1138,7 @@ def diagnose_imprecision(offender): the type of empty lists can be inferred, this is not always the case, see this documentation for help: -http://numba.pydata.org/numba-doc/latest/user/troubleshoot.html#my-code-has-an-untyped-list-problem +https://numba.pydata.org/numba-doc/latest/user/troubleshoot.html#my-code-has-an-untyped-list-problem """ if offender is not None: # This block deals with imprecise lists diff --git a/numba/core/types/containers.py b/numba/core/types/containers.py index 37a2573ac5b..87f4258c9dc 100644 --- a/numba/core/types/containers.py +++ b/numba/core/types/containers.py @@ -200,7 +200,7 @@ def can_convert_to(self, typingctx, other): return max(kinds) def __unliteral__(self): - return BaseTuple.from_types([unliteral(t) for t in self]) + return type(self).from_types([unliteral(t) for t in self]) class _HomogeneousTuple(Sequence, BaseTuple): @@ -253,6 +253,9 @@ def unify(self, typingctx, other): if dtype is not None: return UniTuple(dtype=dtype, count=self.count) + def __unliteral__(self): + return type(self)(dtype=unliteral(self.dtype), count=self.count) + class UniTupleIter(BaseContainerIterator): """ @@ -426,14 +429,22 @@ def copy(self, dtype=None, reflected=None): dtype = self.dtype if reflected is None: reflected = self.reflected - return List(dtype, reflected) + return List(dtype, reflected, self.initial_value) def unify(self, typingctx, other): if isinstance(other, List): dtype = typingctx.unify_pairs(self.dtype, other.dtype) reflected = self.reflected or other.reflected if dtype is not None: - return List(dtype, reflected) + siv = self.initial_value + oiv = other.initial_value + if siv is not None and oiv is not None: + use = siv + if siv is None: + use = oiv + return List(dtype, reflected, use) + else: + return List(dtype, reflected) @property def key(self): @@ -452,6 +463,10 @@ def __getitem__(self, args): """ return self.dtype + def __unliteral__(self): + return List(self.dtype, reflected=self.reflected, + initial_value=None) + class LiteralList(Literal, _HeterogeneousTuple): """A heterogeneous immutable list (basically a tuple with list semantics). 
@@ -695,11 +710,27 @@ def unify(self, typingctx, other): if isinstance(other, DictType): if not other.is_precise(): return self + else: + ukey_type = self.key_type == other.key_type + uvalue_type = self.value_type == other.value_type + if ukey_type and uvalue_type: + siv = self.initial_value + oiv = other.initial_value + siv_none = siv is None + oiv_none = oiv is None + if not siv_none and not oiv_none: + if siv == oiv: + return DictType(self.key_type, other.value_type, + siv) + return DictType(self.key_type, other.value_type) @property def key(self): return self.key_type, self.value_type, str(self.initial_value) + def __unliteral__(self): + return DictType(self.key_type, self.value_type) + class LiteralStrKeyDict(Literal, NamedTuple): """A Dictionary of string keys to heterogeneous values (basically a diff --git a/numba/core/types/functions.py b/numba/core/types/functions.py index 54d4c159e2c..7acff943c92 100644 --- a/numba/core/types/functions.py +++ b/numba/core/types/functions.py @@ -224,6 +224,16 @@ def raise_error(self): raise errors.TypingError(self.format()) +def _unlit_non_poison(ty): + """Apply unliteral(ty) and raise a TypingError if type is Poison. + """ + out = unliteral(ty) + if isinstance(out, types.Poison): + m = f"Poison type used in arguments; got {out}" + raise TypingError(m) + return out + + class BaseFunction(Callable): """ Base type class for some function types. @@ -266,18 +276,23 @@ def get_impl_key(self, sig): return self._impl_keys[sig.args] def get_call_type(self, context, args, kws): + prefer_lit = [True, False] # old behavior preferring literal + prefer_not = [False, True] # new behavior preferring non-literal failures = _ResolutionFailures(context, self, args, kws, depth=self._depth) self._depth += 1 for temp_cls in self.templates: temp = temp_cls(context) - for uselit in [True, False]: + # The template can override the default and prefer literal args + choice = prefer_lit if temp.prefer_literal else prefer_not + for uselit in choice: try: if uselit: sig = temp.apply(args, kws) else: - nolitargs = tuple([unliteral(a) for a in args]) - nolitkws = {k: unliteral(v) for k, v in kws.items()} + nolitargs = tuple([_unlit_non_poison(a) for a in args]) + nolitkws = {k: _unlit_non_poison(v) + for k, v in kws.items()} sig = temp.apply(nolitargs, nolitkws) except Exception as e: sig = None @@ -360,33 +375,45 @@ def get_call_type(self, context, args, kws): template = self.template(context) literal_e = None nonliteral_e = None + out = None - - # Try with Literal - try: - out = template.apply(args, kws) - except Exception as exc: - if isinstance(exc, errors.ForceLiteralArg): - raise exc - literal_e = exc - out = None - - # if the unliteral_args and unliteral_kws are the same as the literal - # ones, set up to not bother retrying - unliteral_args = tuple([unliteral(a) for a in args]) - unliteral_kws = {k: unliteral(v) for k, v in kws.items()} - skip = unliteral_args == args and kws == unliteral_kws - - # If the above template application failed and the non-literal args are - # different to the literal ones, try again with literals rewritten as - # non-literals - if not skip and out is None: - try: - out = template.apply(unliteral_args, unliteral_kws) - except Exception as exc: - if isinstance(exc, errors.ForceLiteralArg): - raise exc - nonliteral_e = exc + choice = [True, False] if template.prefer_literal else [False, True] + for uselit in choice: + if uselit: + # Try with Literal + try: + out = template.apply(args, kws) + except Exception as exc: + if isinstance(exc, 
errors.ForceLiteralArg): + raise exc + literal_e = exc + out = None + else: + break + else: + # if the unliteral_args and unliteral_kws are the same as the literal + # ones, set up to not bother retrying + unliteral_args = tuple([_unlit_non_poison(a) for a in args]) + unliteral_kws = {k: _unlit_non_poison(v) + for k, v in kws.items()} + skip = unliteral_args == args and kws == unliteral_kws + + # If the above template application failed and the non-literal args are + # different to the literal ones, try again with literals rewritten as + # non-literals + if not skip and out is None: + try: + out = template.apply(unliteral_args, unliteral_kws) + except Exception as exc: + if isinstance(exc, errors.ForceLiteralArg): + if template.prefer_literal: + # For template that prefers literal types, + # reaching here means that the literal types + # have failed typing as well. + raise exc + nonliteral_e = exc + else: + break if out is None and (nonliteral_e is not None or literal_e is not None): header = "- Resolution failure for {} arguments:\n{}\n" diff --git a/numba/core/typing/arraydecl.py b/numba/core/typing/arraydecl.py index 58001e47ed4..c75ed8ee5ec 100644 --- a/numba/core/typing/arraydecl.py +++ b/numba/core/typing/arraydecl.py @@ -416,6 +416,8 @@ def resolve_argsort(self, ary, args, kws): assert not args kwargs = dict(kws) kind = kwargs.pop('kind', types.StringLiteral('quicksort')) + if not isinstance(kind, types.StringLiteral): + raise errors.TypingError('"kind" must be a string literal') if kwargs: msg = "Unsupported keywords: {!r}" raise TypingError(msg.format([k for k in kwargs.keys()])) @@ -742,8 +744,9 @@ def generic_index(self, args, kws): assert not kws return signature(types.intp, recvr=self.this) -def install_array_method(name, generic): - my_attr = {"key": "array." + name, "generic": generic} +def install_array_method(name, generic, prefer_literal=True): + my_attr = {"key": "array." + name, "generic": generic, + "prefer_literal": prefer_literal} temp_class = type("Array_" + name, (AbstractTemplate,), my_attr) def array_attribute_attachment(self, ary): return types.BoundFunction(temp_class, ary) @@ -756,7 +759,7 @@ def array_attribute_attachment(self, ary): # Functions that return a machine-width type, to avoid overflows install_array_method("prod", generic_expand) -install_array_method("sum", sum_expand) +install_array_method("sum", sum_expand, prefer_literal=True) # Functions that return a machine-width type, to avoid overflows for fname in ["cumsum", "cumprod"]: diff --git a/numba/core/typing/npydecl.py b/numba/core/typing/npydecl.py index 878829ef9e8..2dbbed39be9 100644 --- a/numba/core/typing/npydecl.py +++ b/numba/core/typing/npydecl.py @@ -341,6 +341,10 @@ class Numpy_method_redirection(AbstractTemplate): array method of the same name (e.g. ndarray.sum). """ + # Arguments like *axis* can specialize on literals but also support + # non-literals + prefer_literal = True + def generic(self, args, kws): pysig = None if kws: diff --git a/numba/core/typing/templates.py b/numba/core/typing/templates.py index 71e4ed7dbf8..f77e3557ffd 100644 --- a/numba/core/typing/templates.py +++ b/numba/core/typing/templates.py @@ -251,7 +251,14 @@ class FunctionTemplate(ABC): # Set to true to disable unsafe cast. # subclass overide-able unsafe_casting = True + # Set to true to require exact match without casting. + # subclass overide-able exact_match_required = False + # Set to true to prefer literal arguments. + # Useful for definitions that specialize on literal but also support + # non-literals. 
+ # subclass overide-able + prefer_literal = False def __init__(self, context): self.context = context @@ -278,6 +285,28 @@ def get_impl_key(self, sig): key = key.im_func return key + @classmethod + def get_source_code_info(cls, impl): + """ + Gets the source information about function impl. + Returns: + + code - str: source code as a string + firstlineno - int: the first line number of the function impl + path - str: the path to file containing impl + + if any of the above are not available something generic is returned + """ + try: + code, firstlineno = inspect.getsourcelines(impl) + except OSError: # missing source, probably a string + code = "None available (built from string?)" + firstlineno = 0 + path = inspect.getsourcefile(impl) + if path is None: + path = " (built from string?)" + return code, firstlineno, path + @abstractmethod def get_template_info(self): """ @@ -296,6 +325,13 @@ def get_template_info(self): """ pass + def __str__(self): + info = self.get_template_info() + srcinfo = f"{info['filename']}:{info['lines'][0]}" + return f"<{self.__class__.__name__} {srcinfo}>" + + __repr__ = __str__ + class AbstractTemplate(FunctionTemplate): """ @@ -332,14 +368,14 @@ def unpack_opt(x): def get_template_info(self): impl = getattr(self, "generic") basepath = os.path.dirname(os.path.dirname(numba.__file__)) - code, firstlineno = inspect.getsourcelines(impl) - path = inspect.getsourcefile(impl) + + code, firstlineno, path = self.get_source_code_info(impl) sig = str(utils.pysignature(impl)) info = { 'kind': "overload", 'name': getattr(impl, '__qualname__', impl.__name__), 'sig': sig, - 'filename': os.path.relpath(path, start=basepath), + 'filename': utils.safe_relpath(path, start=basepath), 'lines': (firstlineno, firstlineno + len(code) - 1), 'docstring': impl.__doc__ } @@ -407,15 +443,14 @@ def unpack_opt(x): def get_template_info(self): impl = getattr(self, "generic") basepath = os.path.dirname(os.path.dirname(numba.__file__)) - code, firstlineno = inspect.getsourcelines(impl) - path = inspect.getsourcefile(impl) + code, firstlineno, path = self.get_source_code_info(impl) sig = str(utils.pysignature(impl)) info = { 'kind': "overload", 'name': getattr(self.key, '__name__', getattr(impl, '__qualname__', impl.__name__),), 'sig': sig, - 'filename': os.path.relpath(path, start=basepath), + 'filename': utils.safe_relpath(path, start=basepath), 'lines': (firstlineno, firstlineno + len(code) - 1), 'docstring': impl.__doc__ } @@ -682,6 +717,7 @@ def _build_impl(self, cache_key, args, kws): # should be using. sig, pyfunc = ovf_result args = sig.args + kws = {} cache_key = None # don't cache else: # Regular case @@ -699,6 +735,9 @@ def _build_impl(self, cache_key, args, kws): # Make dispatcher jitdecor = jit(nopython=True, **self._jit_options) disp = jitdecor(pyfunc) + # Make sure that the implementation can be fully compiled + disp_type = types.Dispatcher(disp) + disp_type.get_call_type(self.context, args, kws) if cache_key is not None: self._impl_cache[cache_key] = disp, args return disp, args @@ -712,7 +751,7 @@ def get_impl_key(self, sig): @classmethod def get_source_info(cls): - """Return a dictionary with information about the source code of the + """Return a dictionary with information about the source code of the implementation. 
Returns @@ -733,14 +772,13 @@ def get_source_info(cls): """ basepath = os.path.dirname(os.path.dirname(numba.__file__)) impl = cls._overload_func - code, firstlineno = inspect.getsourcelines(impl) - path = inspect.getsourcefile(impl) + code, firstlineno, path = cls.get_source_code_info(impl) sig = str(utils.pysignature(impl)) info = { 'kind': "overload", 'name': getattr(impl, '__qualname__', impl.__name__), 'sig': sig, - 'filename': os.path.relpath(path, start=basepath), + 'filename': utils.safe_relpath(path, start=basepath), 'lines': (firstlineno, firstlineno + len(code) - 1), 'docstring': impl.__doc__ } @@ -749,14 +787,13 @@ def get_source_info(cls): def get_template_info(self): basepath = os.path.dirname(os.path.dirname(numba.__file__)) impl = self._overload_func - code, firstlineno = inspect.getsourcelines(impl) - path = inspect.getsourcefile(impl) + code, firstlineno, path = self.get_source_code_info(impl) sig = str(utils.pysignature(impl)) info = { 'kind': "overload", 'name': getattr(impl, '__qualname__', impl.__name__), 'sig': sig, - 'filename': os.path.relpath(path, start=basepath), + 'filename': utils.safe_relpath(path, start=basepath), 'lines': (firstlineno, firstlineno + len(code) - 1), 'docstring': impl.__doc__ } @@ -764,7 +801,7 @@ def get_template_info(self): def make_overload_template(func, overload_func, jit_options, strict, - inline): + inline, prefer_literal=False): """ Make a template class for function *func* overloaded by *overload_func*. Compiler options are passed as a dictionary to *jit_options*. @@ -775,7 +812,7 @@ def make_overload_template(func, overload_func, jit_options, strict, dct = dict(key=func, _overload_func=staticmethod(overload_func), _impl_cache={}, _compiled_overloads={}, _jit_options=jit_options, _strict=strict, _inline=staticmethod(InlineOptions(inline)), - _inline_overloads={}) + _inline_overloads={}, prefer_literal=prefer_literal) return type(base)(name, (base,), dct) @@ -818,14 +855,13 @@ def get_impl_key(self, sig): def get_template_info(self): basepath = os.path.dirname(os.path.dirname(numba.__file__)) impl = self._definition_func - code, firstlineno = inspect.getsourcelines(impl) - path = inspect.getsourcefile(impl) + code, firstlineno, path = self.get_source_code_info(impl) sig = str(utils.pysignature(impl)) info = { 'kind': "intrinsic", 'name': getattr(impl, '__qualname__', impl.__name__), 'sig': sig, - 'filename': os.path.relpath(path, start=basepath), + 'filename': utils.safe_relpath(path, start=basepath), 'lines': (firstlineno, firstlineno + len(code) - 1), 'docstring': impl.__doc__ } @@ -967,6 +1003,7 @@ class MethodTemplate(AbstractTemplate): _inline = self._inline _overload_func = staticmethod(self._overload_func) _inline_overloads = self._inline_overloads + prefer_literal = self.prefer_literal def generic(_, args, kws): args = (typ,) + tuple(args) @@ -982,6 +1019,7 @@ def generic(_, args, kws): def make_overload_attribute_template(typ, attr, overload_func, inline, + prefer_literal=False, base=_OverloadAttributeTemplate): """ Make a template class for attribute *attr* of *typ* overloaded by @@ -994,18 +1032,21 @@ def make_overload_attribute_template(typ, attr, overload_func, inline, _inline=staticmethod(InlineOptions(inline)), _inline_overloads={}, _overload_func=staticmethod(overload_func), + prefer_literal=prefer_literal, ) - return type(base)(name, (base,), dct) + obj = type(base)(name, (base,), dct) + return obj -def make_overload_method_template(typ, attr, overload_func, inline): +def make_overload_method_template(typ, attr, 
overload_func, inline, + prefer_literal=False): """ Make a template class for method *attr* of *typ* overloaded by *overload_func*. """ return make_overload_attribute_template( typ, attr, overload_func, inline=inline, - base=_OverloadMethodTemplate, + base=_OverloadMethodTemplate, prefer_literal=prefer_literal, ) diff --git a/numba/core/utils.py b/numba/core/utils.py index 2dd3fdb21c0..a74e3ef9283 100644 --- a/numba/core/utils.py +++ b/numba/core/utils.py @@ -104,6 +104,23 @@ def erase_traceback(exc_value): return exc_value.with_traceback(None) +def safe_relpath(path, start=os.curdir): + """ + Produces a "safe" relative path, on windows relpath doesn't work across + drives as technically they don't share the same root. + See: https://bugs.python.org/issue7195 for details. + """ + # find the drive letters for path and start and if they are not the same + # then don't use relpath! + drive_letter = lambda x: os.path.splitdrive(os.path.abspath(x))[0] + drive_path = drive_letter(path) + drive_start = drive_letter(start) + if drive_path != drive_start: + return os.path.abspath(path) + else: + return os.path.relpath(path, start=start) + + # Mapping between operator module functions and the corresponding built-in # operators. diff --git a/numba/cpython/tupleobj.py b/numba/cpython/tupleobj.py index 7a547577f8a..34739623fff 100644 --- a/numba/cpython/tupleobj.py +++ b/numba/cpython/tupleobj.py @@ -187,6 +187,24 @@ def iternext_unituple(context, builder, sig, args, result): builder.store(nidx, iterval.index) +@overload(operator.getitem) +def getitem_literal_idx(tup, idx): + """ + Overloads BaseTuple getitem to cover cases where constant + inference and RewriteConstGetitems cannot replace it + with a static_getitem. + """ + if not (isinstance(tup, types.BaseTuple) + and isinstance(idx, types.IntegerLiteral)): + return None + + idx_val = idx.literal_value + def getitem_literal_idx_impl(tup, idx): + return tup[idx_val] + + return getitem_literal_idx_impl + + @lower_builtin('typed_getitem', types.BaseTuple, types.Any) def getitem_typed(context, builder, sig, args): tupty, _ = sig.args diff --git a/numba/cuda/codegen.py b/numba/cuda/codegen.py index e201a2101e4..bfd58c27ca7 100644 --- a/numba/cuda/codegen.py +++ b/numba/cuda/codegen.py @@ -12,22 +12,14 @@ class CUDACodeLibrary(CodeLibrary): + # We don't optimize the IR at the function or module level because it is + # optimized by NVVM after we've passed it on. + def _optimize_functions(self, ll_module): pass def _optimize_final_module(self): - # Run some lightweight optimization to simplify the module. 
- # This seems to workaround a libnvvm compilation bug (see #1341) - pmb = ll.PassManagerBuilder() - pmb.opt_level = 1 - pmb.disable_unit_at_a_time = False - pmb.disable_unroll_loops = True - pmb.loop_vectorize = False - pmb.slp_vectorize = False - - pm = ll.ModulePassManager() - pmb.populate(pm) - pm.run(self._final_module) + pass def _finalize_specific(self): # Fix global naming diff --git a/numba/cuda/compiler.py b/numba/cuda/compiler.py index b3324cf5b56..dd57817e438 100644 --- a/numba/cuda/compiler.py +++ b/numba/cuda/compiler.py @@ -1,7 +1,9 @@ import ctypes import inspect import os +import subprocess import sys +import tempfile import numpy as np @@ -51,7 +53,7 @@ def compile_cuda(pyfunc, return_type, args, debug=False, inline=False): @global_compiler_lock def compile_kernel(pyfunc, args, link, debug=False, inline=False, - fastmath=False, extensions=[], max_registers=None): + fastmath=False, extensions=[], max_registers=None, opt=True): cres = compile_cuda(pyfunc, types.void, args, debug=debug, inline=inline) fname = cres.fndesc.llvm_func_name lib, kernel = cres.target_context.prepare_cuda_kernel(cres.library, fname, @@ -65,6 +67,7 @@ def compile_kernel(pyfunc, args, link, debug=False, inline=False, type_annotation=cres.type_annotation, link=link, debug=debug, + opt=opt, call_helper=cres.call_helper, fastmath=fastmath, extensions=extensions, @@ -130,13 +133,43 @@ def compile_ptx_for_current_device(pyfunc, args, debug=False, device=False, fastmath=fastmath, cc=cc, opt=True) +def disassemble_cubin(cubin): + # nvdisasm only accepts input from a file, so we need to write out to a + # temp file and clean up afterwards. + fd = None + fname = None + try: + fd, fname = tempfile.mkstemp() + with open(fname, 'wb') as f: + f.write(cubin) + + try: + cp = subprocess.run(['nvdisasm', fname], check=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + except FileNotFoundError as e: + if e.filename == 'nvdisasm': + msg = ("nvdisasm is required for SASS inspection, and has not " + "been found.\n\nYou may need to install the CUDA " + "toolkit and ensure that it is available on your " + "PATH.\n") + raise RuntimeError(msg) + return cp.stdout.decode('utf-8') + finally: + if fd is not None: + os.close(fd) + if fname is not None: + os.unlink(fname) + + class DeviceFunctionTemplate(serialize.ReduceMixin): """Unmaterialized device function """ - def __init__(self, pyfunc, debug, inline): + def __init__(self, pyfunc, debug, inline, opt): self.py_func = pyfunc self.debug = debug self.inline = inline + self.opt = opt self._compileinfos = {} name = getattr(pyfunc, '__name__', 'unknown') self.__name__ = f"{name} ".format(name) @@ -187,6 +220,10 @@ def inspect_llvm(self, args): ------- llvmir : str """ + # Force a compilation to occur if none has yet - this can be needed if + # the user attempts to inspect LLVM IR or PTX before the function has + # been called for the given arguments from a jitted kernel. 
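+        # A sketch of when this matters (``add`` and the argument types are
+        # purely illustrative):
+        #
+        #     @cuda.jit(device=True)
+        #     def add(a, b):
+        #         return a + b
+        #
+        #     add.inspect_llvm((float32, float32))  # compiles first if needed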
+ self.compile(args) cres = self._compileinfos[args] mod = cres.library._final_module return str(mod) @@ -211,17 +248,18 @@ def inspect_ptx(self, args, nvvm_options={}): device = cuctx.device cc = device.compute_capability arch = nvvm.get_arch_option(*cc) - ptx = nvvm.llvm_to_ptx(llvmir, opt=3, arch=arch, **nvvm_options) + opt = 3 if self.opt else 0 + ptx = nvvm.llvm_to_ptx(llvmir, opt=opt, arch=arch, **nvvm_options) return ptx -def compile_device_template(pyfunc, debug=False, inline=False): +def compile_device_template(pyfunc, debug=False, inline=False, opt=True): """Create a DeviceFunctionTemplate object and register the object to the CUDA typing context. """ from .descriptor import CUDATargetDesc - dft = DeviceFunctionTemplate(pyfunc, debug=debug, inline=inline) + dft = DeviceFunctionTemplate(pyfunc, debug=debug, inline=inline, opt=opt) class device_function_template(AbstractTemplate): key = dft @@ -239,7 +277,7 @@ def get_template_info(cls): 'kind': "overload", 'name': getattr(cls.key, '__name__', "unknown"), 'sig': sig, - 'filename': os.path.relpath(path, start=basepath), + 'filename': utils.safe_relpath(path, start=basepath), 'lines': (firstlineno, firstlineno + len(code) - 1), 'docstring': pyfunc.__doc__ } @@ -371,7 +409,7 @@ def get(self): ptx = self.cache.get(cc) if ptx is None: arch = nvvm.get_arch_option(*cc) - ptx = nvvm.llvm_to_ptx(self.llvmir, opt=3, arch=arch, + ptx = nvvm.llvm_to_ptx(self.llvmir, arch=arch, **self._extra_options) self.cache[cc] = ptx if config.DUMP_ASSEMBLY: @@ -394,6 +432,7 @@ def __init__(self, entry_name, ptx, linking, max_registers): self.linking = linking self.cache = {} self.ccinfos = {} + self.cubins = {} self.max_registers = max_registers def get(self): @@ -408,16 +447,27 @@ def get(self): linker.add_ptx(ptx) for path in self.linking: linker.add_file_guess_ext(path) - cubin, _size = linker.complete() + cubin, size = linker.complete() compile_info = linker.info_log module = cuctx.create_module_image(cubin) # Load cufunc = module.get_function(self.entry_name) + + # Populate caches self.cache[device.id] = cufunc self.ccinfos[device.id] = compile_info + # We take a copy of the cubin because it's owned by the linker + cubin_ptr = ctypes.cast(cubin, ctypes.POINTER(ctypes.c_char)) + cubin_data = np.ctypeslib.as_array(cubin_ptr, shape=(size,)).copy() + self.cubins[device.id] = cubin_data return cufunc + def get_sass(self): + self.get() # trigger compilation + device = get_context().device + return disassemble_cubin(self.cubins[device.id]) + def get_info(self): self.get() # trigger compilation cuctx = get_context() @@ -453,12 +503,13 @@ class _Kernel(serialize.ReduceMixin): ''' def __init__(self, llvm_module, name, pretty_name, argtypes, call_helper, link=(), debug=False, fastmath=False, type_annotation=None, - extensions=[], max_registers=None): + extensions=[], max_registers=None, opt=True): super().__init__() # initialize CUfunction options = { 'debug': debug, - 'fastmath': fastmath + 'fastmath': fastmath, + 'opt': 3 if opt else 0 } ptx = CachedPTX(pretty_name, str(llvm_module), options=options) @@ -547,6 +598,14 @@ def inspect_asm(self): ''' return self._func.ptx.get().decode('ascii') + def inspect_sass(self): + ''' + Returns the SASS code for this kernel. + + Requires nvdisasm to be available on the PATH. 
+ ''' + return self._func.get_sass() + def inspect_types(self, file=None): ''' Produce a dump of the Python source of this function annotated with the @@ -881,10 +940,10 @@ def inspect_llvm(self, signature=None, compute_capability=None): def inspect_asm(self, signature=None, compute_capability=None): ''' - Return the generated assembly code for all signatures encountered thus - far, or the LLVM IR for a specific signature and compute_capability - if given. If the dispatcher is specialized, the assembly code for the - single specialization is returned. + Return the generated PTX assembly code for all signatures encountered + thus far, or the PTX assembly code for a specific signature and + compute_capability if given. If the dispatcher is specialized, the + assembly code for the single specialization is returned. ''' cc = compute_capability or get_current_device().compute_capability if signature is not None: @@ -895,6 +954,23 @@ def inspect_asm(self, signature=None, compute_capability=None): return dict((sig, defn.inspect_asm()) for sig, defn in self.definitions.items()) + def inspect_sass(self, signature=None, compute_capability=None): + ''' + Return the generated SASS code for all signatures encountered thus + far, or the SASS code for a specific signature and compute_capability + if given. + + Requires nvdisasm to be available on the PATH. + ''' + cc = compute_capability or get_current_device().compute_capability + if signature is not None: + return self.definitions[(cc, signature)].inspect_sass() + elif self.specialized: + return self.definition.inspect_sass() + else: + return dict((sig, defn.inspect_sass()) + for sig, defn in self.definitions.items()) + def inspect_types(self, file=None): ''' Produce a dump of the Python source of this function annotated with the diff --git a/numba/cuda/cudadrv/driver.py b/numba/cuda/cudadrv/driver.py index 0b0ee044e2b..414377229ff 100644 --- a/numba/cuda/cudadrv/driver.py +++ b/numba/cuda/cudadrv/driver.py @@ -20,6 +20,7 @@ import warnings import logging import threading +import asyncio from itertools import product from abc import ABCMeta, abstractmethod from ctypes import (c_int, byref, c_size_t, c_char, c_char_p, addressof, @@ -33,7 +34,7 @@ from numba.core import utils, errors, serialize, config from .error import CudaSupportError, CudaDriverError from .drvapi import API_PROTOTYPES -from .drvapi import cu_occupancy_b2d_size +from .drvapi import cu_occupancy_b2d_size, cu_stream_callback_pyobj from numba.cuda.cudadrv import enums, drvapi, _extras from numba.core.utils import longint as long from numba.cuda.envvars import get_numba_envvar @@ -44,6 +45,12 @@ SUPPORTS_IPC = sys.platform.startswith('linux') +_py_decref = ctypes.pythonapi.Py_DecRef +_py_incref = ctypes.pythonapi.Py_IncRef +_py_decref.argtypes = [ctypes.py_object] +_py_incref.argtypes = [ctypes.py_object] + + def make_logger(): logger = logging.getLogger(__name__) # is logging configured? @@ -1791,6 +1798,64 @@ def auto_synchronize(self): yield self self.synchronize() + def add_callback(self, callback, arg): + """ + Add a callback to a compute stream. + The user provided function is called from a driver thread once all + preceding stream operations are complete. + + Callback functions are called from a CUDA driver thread, not from + the thread that invoked `add_callback`. No CUDA API functions may + be called from within the callback function. 
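+
+        A minimal usage sketch::
+
+            def callback(stream, status, arg):
+                # Runs on a driver thread once prior work on the stream is done
+                print(status, arg)
+
+            stream.add_callback(callback, "some user data")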
+ + The duration of a callback function should be kept short, as the + callback will block later work in the stream and may block other + callbacks from being executed. + + Note: This function is marked as deprecated and may be replaced in a + future CUDA release. + + :param callback: Callback function with arguments (stream, status, arg). + :param arg: User data to be passed to the callback function. + """ + data = (self, callback, arg) + _py_incref(data) + driver.cuStreamAddCallback(self.handle, self._stream_callback, data, 0) + + @staticmethod + @cu_stream_callback_pyobj + def _stream_callback(handle, status, data): + try: + stream, callback, arg = data + callback(stream, status, arg) + except Exception as e: + warnings.warn(f"Exception in stream callback: {e}") + finally: + _py_decref(data) + + def async_done(self) -> asyncio.futures.Future: + """ + Return an awaitable that resolves once all preceding stream operations + are complete. + """ + loop = asyncio.get_running_loop() if utils.PYVERSION >= (3, 7) \ + else asyncio.get_event_loop() + future = loop.create_future() + + def resolver(future, status): + if future.done(): + return + elif status == 0: + future.set_result(None) + else: + future.set_exception(Exception(f"Stream error {status}")) + + def callback(stream, status, future): + loop.call_soon_threadsafe(resolver, future, status) + + self.add_callback(callback, future) + return future + class Event(object): def __init__(self, context, handle, finalizer=None): diff --git a/numba/cuda/cudadrv/drvapi.py b/numba/cuda/cudadrv/drvapi.py index 9fb6c95db65..8a78edb1e05 100644 --- a/numba/cuda/cudadrv/drvapi.py +++ b/numba/cuda/cudadrv/drvapi.py @@ -17,6 +17,8 @@ cu_function_attribute = c_int cu_ipc_mem_handle = (c_byte * _extras.CUDA_IPC_HANDLE_SIZE) # 64 bytes wide +cu_stream_callback_pyobj = CFUNCTYPE(None, cu_stream, c_int, py_object) + cu_occupancy_b2d_size = CFUNCTYPE(c_size_t, c_int) # See https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html @@ -159,6 +161,9 @@ # CUresult cuStreamSynchronize(CUstream hStream); 'cuStreamSynchronize': (c_int, cu_stream), +# CUresult cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void* userData, unsigned int flags) +'cuStreamAddCallback': (c_int, cu_stream, cu_stream_callback_pyobj, py_object, c_uint), + # CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, # unsigned int gridDimY, # unsigned int gridDimZ, diff --git a/numba/cuda/cudadrv/nvvm.py b/numba/cuda/cudadrv/nvvm.py index bf5acba9893..24569f99b43 100644 --- a/numba/cuda/cudadrv/nvvm.py +++ b/numba/cuda/cudadrv/nvvm.py @@ -202,7 +202,7 @@ def compile(self, **options): if options.pop('debug'): opts.append('-g') - if options.get('opt'): + if 'opt' in options: opts.append('-opt=%d' % options.pop('opt')) if options.get('arch'): @@ -500,6 +500,12 @@ def llvm_to_ptx(llvmir, **opts): for decl, fn in replacements: llvmir = llvmir.replace(decl, fn) + # llvm.numba_nvvm.atomic is used to prevent LLVM 9 onwards auto-upgrading + # these intrinsics into atomicrmw instructions, which are not recognized by + # NVVM. We can now replace them with the real intrinsic names, ready to + # pass to NVVM. + llvmir = llvmir.replace('llvm.numba_nvvm.atomic', 'llvm.nvvm.atomic') + llvmir = llvm39_to_34_ir(llvmir) cu.add_module(llvmir.encode('utf8')) cu.add_module(libdevice.get()) @@ -666,7 +672,11 @@ def _replace_llvm_memset_usage(m): Used as functor for `re.sub. 
""" params = list(m.group(1).split(',')) - align = re.search(r'align (\d+)', params[0]).group(1) + align_attr = re.search(r'align (\d+)', params[0]) + if not align_attr: + raise ValueError("No alignment attribute found on memset dest") + else: + align = align_attr.group(1) params.insert(-1, 'i32 {}'.format(align)) out = ', '.join(params) return '({})'.format(out) diff --git a/numba/cuda/cudaimpl.py b/numba/cuda/cudaimpl.py index 1ef9401fe7e..621f662b7c4 100644 --- a/numba/cuda/cudaimpl.py +++ b/numba/cuda/cudaimpl.py @@ -468,6 +468,12 @@ def ptx_round(context, builder, sig, args): ]) +@lower(math.isinf, types.Integer) +@lower(math.isnan, types.Integer) +def math_isinf_isnan_int(context, builder, sig, args): + return lc.Constant.int(lc.Type.int(1), 0) + + def gen_deg_rad(const): def impl(context, builder, sig, args): argty, = sig.args diff --git a/numba/cuda/decorators.py b/numba/cuda/decorators.py index 3abcafd9eaa..2e87a5b3f80 100644 --- a/numba/cuda/decorators.py +++ b/numba/cuda/decorators.py @@ -11,17 +11,17 @@ "positional argument.") -def jitdevice(func, link=[], debug=None, inline=False): +def jitdevice(func, link=[], debug=None, inline=False, opt=True): """Wrapper for device-jit. """ debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug if link: raise ValueError("link keyword invalid for device function") - return compile_device_template(func, debug=debug, inline=inline) + return compile_device_template(func, debug=debug, inline=inline, opt=opt) def jit(func_or_sig=None, argtypes=None, device=False, inline=False, - link=[], debug=None, **kws): + link=[], debug=None, opt=True, **kws): """ JIT compile a python function conforming to the CUDA Python specification. If a signature is supplied, then a function is returned that takes a @@ -51,6 +51,10 @@ def jit(func_or_sig=None, argtypes=None, device=False, inline=False, from which they are called. :param max_registers: Limit the kernel to using at most this number of registers per thread. Useful for increasing occupancy. + :param opt: Whether to compile from LLVM IR to PTX with optimization + enabled. When ``True``, ``-opt=3`` is passed to NVVM. When + ``False``, ``-opt=0`` is passed to NVVM. Defaults to ``True``. 
+ :type opt: bool """ debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug @@ -79,7 +83,7 @@ def autojitwrapper(func): debug=debug) else: def autojitwrapper(func): - return jit(func, device=device, debug=debug, **kws) + return jit(func, device=device, debug=debug, opt=opt, **kws) return autojitwrapper # func_or_sig is a function @@ -88,10 +92,11 @@ def autojitwrapper(func): return FakeCUDAKernel(func_or_sig, device=device, fastmath=fastmath, debug=debug) elif device: - return jitdevice(func_or_sig, debug=debug, **kws) + return jitdevice(func_or_sig, debug=debug, opt=opt, **kws) else: targetoptions = kws.copy() targetoptions['debug'] = debug + targetoptions['opt'] = opt targetoptions['link'] = link sigs = None return Dispatcher(func_or_sig, sigs, bind=bind, @@ -127,6 +132,7 @@ def kernel_jit(func): targetoptions = kws.copy() targetoptions['debug'] = debug targetoptions['link'] = link + targetoptions['opt'] = opt return Dispatcher(func, sigs, bind=bind, targetoptions=targetoptions) def device_jit(func): diff --git a/numba/cuda/envvars.py b/numba/cuda/envvars.py index c79edae38da..dfe29759ea9 100644 --- a/numba/cuda/envvars.py +++ b/numba/cuda/envvars.py @@ -8,7 +8,7 @@ def get_numbapro_envvar(envvar, default=None): # use vanilla get here so as to use `None` as a signal for not-set value = os.environ.get(envvar) if value is not None: - url = ("http://numba.pydata.org/numba-doc/latest/cuda/overview.html", + url = ("https://numba.pydata.org/numba-doc/latest/cuda/overview.html", "#cudatoolkit-lookup") msg = ("\nEnvironment variables with the 'NUMBAPRO' prefix are " "deprecated and consequently ignored, found use of %s=%s.\n\n" diff --git a/numba/cuda/nvvmutils.py b/numba/cuda/nvvmutils.py index 543a48539d2..c273dcd2bf5 100644 --- a/numba/cuda/nvvmutils.py +++ b/numba/cuda/nvvmutils.py @@ -12,8 +12,13 @@ def declare_atomic_cas_int32(lmod): return lmod.get_or_insert_function(fnty, fname) +# For atomic intrinsics, "numba_nvvm" prevents LLVM 9 onwards auto-upgrading +# them into atomicrmw instructions that are not recognized by NVVM. It is +# replaced with "nvvm" in llvm_to_ptx later, after the module has been parsed +# and dumped by LLVM. + def declare_atomic_add_float32(lmod): - fname = 'llvm.nvvm.atomic.load.add.f32.p0f32' + fname = 'llvm.numba_nvvm.atomic.load.add.f32.p0f32' fnty = lc.Type.function(lc.Type.float(), (lc.Type.pointer(lc.Type.float(), 0), lc.Type.float())) return lmod.get_or_insert_function(fnty, name=fname) @@ -21,7 +26,7 @@ def declare_atomic_add_float32(lmod): def declare_atomic_add_float64(lmod): if current_context().device.compute_capability >= (6, 0): - fname = 'llvm.nvvm.atomic.load.add.f64.p0f64' + fname = 'llvm.numba_nvvm.atomic.load.add.f64.p0f64' else: fname = '___numba_atomic_double_add' fnty = lc.Type.function(lc.Type.double(), diff --git a/numba/cuda/target.py b/numba/cuda/target.py index 6ac492efa39..d8465bf1912 100644 --- a/numba/cuda/target.py +++ b/numba/cuda/target.py @@ -38,7 +38,9 @@ def resolve_value_type(self, val): if not val._can_compile: raise ValueError('using cpu function on device ' 'but its compilation is disabled') - jd = jitdevice(val, debug=val.targetoptions.get('debug')) + opt = val.targetoptions.get('opt', True) + jd = jitdevice(val, debug=val.targetoptions.get('debug'), + opt=opt) # cache the device function for future use and to avoid # duplicated copy of the same function. 
val.__cudajitdevice = jd diff --git a/numba/cuda/testing.py b/numba/cuda/testing.py index 5a3c1562644..43146685153 100644 --- a/numba/cuda/testing.py +++ b/numba/cuda/testing.py @@ -1,5 +1,6 @@ import contextlib import os +import shutil import sys from numba.tests.support import ( @@ -59,6 +60,16 @@ def skip_under_cuda_memcheck(reason): return unittest.skipIf(os.environ.get('CUDA_MEMCHECK') is not None, reason) +def skip_without_nvdisasm(reason): + nvdisasm_path = shutil.which('nvdisasm') + return unittest.skipIf(nvdisasm_path is None, reason) + + +def skip_with_nvdisasm(reason): + nvdisasm_path = shutil.which('nvdisasm') + return unittest.skipIf(nvdisasm_path is not None, reason) + + class CUDATextCapture(object): def __init__(self, stream): diff --git a/numba/cuda/tests/cudadrv/test_streams.py b/numba/cuda/tests/cudadrv/test_streams.py new file mode 100644 index 00000000000..485498b6077 --- /dev/null +++ b/numba/cuda/tests/cudadrv/test_streams.py @@ -0,0 +1,104 @@ +import asyncio +import functools +import threading +import numpy as np +from numba import cuda +from numba.core import utils +from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim + + +def with_asyncio_loop(f): + @functools.wraps(f) + def runner(*args, **kwds): + loop = asyncio.new_event_loop() + loop.set_debug(True) + try: + return loop.run_until_complete(f(*args, **kwds)) + finally: + loop.close() + return runner + + +asyncio_create_task = asyncio.create_task if utils.PYVERSION >= (3, 7) \ + else asyncio.ensure_future + + +@skip_on_cudasim('CUDA Driver API unsupported in the simulator') +class TestCudaStream(CUDATestCase): + def test_add_callback(self): + def callback(stream, status, event): + event.set() + + stream = cuda.stream() + callback_event = threading.Event() + stream.add_callback(callback, callback_event) + self.assertTrue(callback_event.wait(1.0)) + + @with_asyncio_loop + async def test_async_done(self): + stream = cuda.stream() + await stream.async_done() + + @with_asyncio_loop + async def test_parallel_tasks(self): + async def async_cuda_fn(value_in: float) -> float: + stream = cuda.stream() + h_src, h_dst = cuda.pinned_array(8), cuda.pinned_array(8) + h_src[:] = value_in + d_ary = cuda.to_device(h_src, stream=stream) + d_ary.copy_to_host(h_dst, stream=stream) + await stream.async_done() + return h_dst.mean() + + values_in = [1, 2, 3, 4] + tasks = [asyncio_create_task(async_cuda_fn(v)) for v in values_in] + values_out = await asyncio.gather(*tasks) + self.assertTrue(np.allclose(values_in, values_out)) + + @with_asyncio_loop + async def test_multiple_async_done(self): + stream = cuda.stream() + done_aws = [stream.async_done() for _ in range(4)] + await asyncio.gather(*done_aws) + + @with_asyncio_loop + async def test_cancelled_future(self): + stream = cuda.stream() + done1, done2 = stream.async_done(), stream.async_done() + done1.cancel() + await done2 + self.assertTrue(done1.cancelled()) + self.assertTrue(done2.done()) + + +@skip_on_cudasim('CUDA Driver API unsupported in the simulator') +class TestFailingStream(CUDATestCase): + # This test can only be run in isolation because it corrupts the CUDA + # context, which cannot be recovered from within the same process. It is + # left here so that it can be run manually for debugging / testing purposes + # - or may be re-enabled if in future there is infrastructure added for + # running tests in a separate process (a subprocess cannot be used because + # CUDA will have been initialized before the fork, so it cannot be used in + # the child process). 
+ @unittest.skip + @with_asyncio_loop + async def test_failed_stream(self): + ctx = cuda.current_context() + module = ctx.create_module_ptx(""" + .version 6.5 + .target sm_30 + .address_size 64 + .visible .entry failing_kernel() { trap; } + """) + failing_kernel = module.get_function("failing_kernel") + + stream = cuda.stream() + failing_kernel.configure((1,), (1,), stream=stream).__call__() + done = stream.async_done() + with self.assertRaises(Exception): + await done + self.assertIsNotNone(done.exception()) + + +if __name__ == '__main__': + unittest.main() diff --git a/numba/cuda/tests/cudapy/test_atomics.py b/numba/cuda/tests/cudapy/test_atomics.py index 2c5b7491b46..95cdc702627 100644 --- a/numba/cuda/tests/cudapy/test_atomics.py +++ b/numba/cuda/tests/cudapy/test_atomics.py @@ -264,7 +264,7 @@ def assertCorrectFloat64Atomics(self, kernel, shared=True): @skip_unless_cc_50 def test_atomic_add_double(self): - idx = np.random.randint(0, 32, size=32) + idx = np.random.randint(0, 32, size=32, dtype=np.int64) ary = np.zeros(32, np.float64) cuda_func = cuda.jit('void(int64[:], float64[:])')(atomic_add_double) cuda_func[1, 32](idx, ary) @@ -295,7 +295,7 @@ def test_atomic_add_double_3(self): @skip_unless_cc_50 def test_atomic_add_double_global(self): - idx = np.random.randint(0, 32, size=32) + idx = np.random.randint(0, 32, size=32, dtype=np.int64) ary = np.zeros(32, np.float64) cuda_func = cuda.jit('void(int64[:], float64[:])')(atomic_add_double_global) cuda_func[1, 32](idx, ary) diff --git a/numba/cuda/tests/cudapy/test_constmem.py b/numba/cuda/tests/cudapy/test_constmem.py index 43d52809e42..11ad2a5431a 100644 --- a/numba/cuda/tests/cudapy/test_constmem.py +++ b/numba/cuda/tests/cudapy/test_constmem.py @@ -1,7 +1,8 @@ import numpy as np +import sys from numba import cuda -from numba.cuda.testing import unittest, CUDATestCase +from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim from numba.core.config import ENABLE_CUDASIM CONST_EMPTY = np.array([]) @@ -133,10 +134,14 @@ def test_const_array_3d(self): self.assertTrue(np.all(A == CONST3D)) if not ENABLE_CUDASIM: - self.assertIn( - 'ld.const.v2.u32', - jcuconst3d.ptx, - "load the two halves of the complex as u32s") + if cuda.runtime.get_version() in ((8, 0), (9, 0), (9, 1)): + complex_load = 'ld.const.v2.f32' + description = 'Load the complex as a vector of 2x f32' + else: + complex_load = 'ld.const.f32' + description = 'load each half of the complex as f32' + + self.assertIn(complex_load, jcuconst3d.ptx, description) def test_const_record_empty(self): jcuconstRecEmpty = cuda.jit('void(float64[:])')(cuconstRecEmpty) @@ -149,27 +154,50 @@ def test_const_record(self): B = np.zeros(2, dtype=int) jcuconst = cuda.jit(cuconstRec).specialize(A, B) - if not ENABLE_CUDASIM: - if not any(c in jcuconst.ptx for c in [ - # a vector load: the compiler fuses the load - # of the x and y fields into a single instruction! - 'ld.const.v2.u64', - - # for some reason Win64 / Py3 / CUDA 9.1 decides - # to do two u32 loads, and shifts and ors the - # values to get the float `x` field, then uses - # another ld.const.u32 to load the int `y` as - # a 32-bit value! 
- 'ld.const.u32', - ]): - raise AssertionError( - "the compiler should realise it doesn't " \ - "need to interpret the bytes as float!") - jcuconst[2, 1](A, B) np.testing.assert_allclose(A, CONST_RECORD['x']) np.testing.assert_allclose(B, CONST_RECORD['y']) + @skip_on_cudasim('PTX inspection not supported on the simulator') + def test_const_record_optimization(self): + A = np.zeros(2, dtype=float) + B = np.zeros(2, dtype=int) + jcuconst = cuda.jit(cuconstRec).specialize(A, B) + + old_runtime = cuda.runtime.get_version() in ((8, 0), (9, 0), (9, 1)) + windows = sys.platform.startswith('win') + + if old_runtime: + if windows: + # for some reason Win64 / Py3 / CUDA 9.1 decides to do two u32 + # loads, and shifts and ors the values to get the float `x` + # field, then uses another ld.const.u32 to load the int `y` as + # a 32-bit value! + self.assertIn('ld.const.u32', jcuconst.ptx, + 'load record fields as u32') + else: + # Load of the x and y fields fused into a single instruction + self.assertIn('ld.const.v2.f64', jcuconst.ptx, + 'load record fields as vector of 2x f64') + else: + # In newer toolkits, constant values are all loaded 8 bits at a + # time. Check that there are enough 8-bit loads for everything to + # have been loaded. This is possibly less than optimal, but is the + # observed behaviour with current toolkit versions when IR is not + # optimized before sending to NVVM. + u8_load_count = len([s for s in jcuconst.ptx.split() + if 'ld.const.u8' in s]) + + if windows: + # NumPy ints are 32-bit on Windows by default, so only 4 bytes + # for loading the int (and 8 for the float) + expected_load_count = 12 + else: + # int is 64-bit elsewhere + expected_load_count = 16 + self.assertGreaterEqual(u8_load_count, expected_load_count, + 'load record values as individual bytes') + def test_const_record_align(self): A = np.zeros(2, dtype=np.float64) B = np.zeros(2, dtype=np.float64) @@ -178,37 +206,57 @@ def test_const_record_align(self): E = np.zeros(2, dtype=np.float64) jcuconst = cuda.jit(cuconstRecAlign).specialize(A, B, C, D, E) - if not ENABLE_CUDASIM: - # Code generation differs slightly in 10.2 onwards - if cuda.runtime.get_version() >= (10, 2): - first_bytes = 'ld.const.v2.u8' - first_bytes_msg = 'load the first two bytes as a vector' - else: - first_bytes = 'ld.const.v4.u8' - first_bytes_msg = 'load the first three bytes as a vector' + jcuconst[2, 1](A, B, C, D, E) + np.testing.assert_allclose(A, CONST_RECORD_ALIGN['a']) + np.testing.assert_allclose(B, CONST_RECORD_ALIGN['b']) + np.testing.assert_allclose(C, CONST_RECORD_ALIGN['x']) + np.testing.assert_allclose(D, CONST_RECORD_ALIGN['y']) + np.testing.assert_allclose(E, CONST_RECORD_ALIGN['z']) + + @skip_on_cudasim('PTX inspection not supported on the simulator') + def test_const_record_align_optimization(self): + rtver = cuda.runtime.get_version() + + A = np.zeros(2, dtype=np.float64) + B = np.zeros(2, dtype=np.float64) + C = np.zeros(2, dtype=np.float64) + D = np.zeros(2, dtype=np.float64) + E = np.zeros(2, dtype=np.float64) + jcuconst = cuda.jit(cuconstRecAlign).specialize(A, B, C, D, E) + + if rtver >= (10, 2): + # Code generation differs slightly in 10.2 onwards - the first + # bytes are loaded as individual bytes, so we'll check that + # ld.const.u8 occurs at least four times (the first three bytes, + # then the last byte by itself) + msg = 'load first three bytes and last byte individually' + u8_load_count = len([s for s in jcuconst.ptx.split() + if 'ld.const.u8' in s]) + self.assertGreaterEqual(u8_load_count, 4, msg) + else: + 
# On earlier versions, a vector of 4 bytes is used to load the + # first three bytes. + first_bytes = 'ld.const.v4.u8' + first_bytes_msg = 'load the first three bytes as a vector' self.assertIn( first_bytes, jcuconst.ptx, first_bytes_msg) - self.assertIn( - 'ld.const.u32', - jcuconst.ptx, - 'load the uint32 natively') + self.assertIn( + 'ld.const.u32', + jcuconst.ptx, + 'load the uint32 natively') + # On 10.2 and above, we already checked for loading the last byte by + # itself - no need to repeat the check. + if rtver < (10, 2): self.assertIn( 'ld.const.u8', jcuconst.ptx, 'load the last byte by itself') - jcuconst[2, 1](A, B, C, D, E) - np.testing.assert_allclose(A, CONST_RECORD_ALIGN['a']) - np.testing.assert_allclose(B, CONST_RECORD_ALIGN['b']) - np.testing.assert_allclose(C, CONST_RECORD_ALIGN['x']) - np.testing.assert_allclose(D, CONST_RECORD_ALIGN['y']) - np.testing.assert_allclose(E, CONST_RECORD_ALIGN['z']) - if __name__ == '__main__': unittest.main() diff --git a/numba/cuda/tests/cudapy/test_debuginfo.py b/numba/cuda/tests/cudapy/test_debuginfo.py index f17051eb960..4300a29ce7f 100644 --- a/numba/cuda/tests/cudapy/test_debuginfo.py +++ b/numba/cuda/tests/cudapy/test_debuginfo.py @@ -50,6 +50,14 @@ def bar(x): self._check(bar, sig=(types.int32[:],), expect=False) + def test_issue_5835(self): + # Invalid debug metadata would segfault NVVM when any function was + # compiled with debug turned on and optimization off. This eager + # compilation should not crash anything. + @cuda.jit((types.int32[::1],), debug=True, opt=False) + def f(x): + x[0] = 0 + if __name__ == '__main__': unittest.main() diff --git a/numba/cuda/tests/cudapy/test_inspect.py b/numba/cuda/tests/cudapy/test_inspect.py index 0cbd97bfcb5..2f72e0a0994 100644 --- a/numba/cuda/tests/cudapy/test_inspect.py +++ b/numba/cuda/tests/cudapy/test_inspect.py @@ -1,7 +1,10 @@ +import numpy as np + from io import StringIO -from numba import cuda, float64, intp +from numba import cuda, float32, float64, int32, intp from numba.cuda.testing import unittest, CUDATestCase -from numba.cuda.testing import skip_on_cudasim +from numba.cuda.testing import (skip_on_cudasim, skip_with_nvdisasm, + skip_without_nvdisasm) @skip_on_cudasim('Simulator does not generate code to be inspected') @@ -70,6 +73,57 @@ def foo(x, y): self.assertIn("foo", asmdict[self.cc, (intp, intp)]) self.assertIn("foo", asmdict[self.cc, (float64, float64)]) + def _test_inspect_sass(self, kernel, name, sass): + # Ensure function appears in output + seen_function = False + for line in sass.split(): + if '.text' in line and name in line: + seen_function = True + self.assertTrue(seen_function) + + # Some instructions common to all supported architectures that should + # appear in the output + self.assertIn('S2R', sass) # Special register to register + self.assertIn('BRA', sass) # Branch + self.assertIn('EXIT', sass) # Exit program + + @skip_without_nvdisasm('nvdisasm needed for inspect_sass()') + def test_inspect_sass_eager(self): + @cuda.jit((float32[::1], int32[::1])) + def add(x, y): + i = cuda.grid(1) + if i < len(x): + x[i] += y[i] + + self._test_inspect_sass(add, 'add', add.inspect_sass()) + + @skip_without_nvdisasm('nvdisasm needed for inspect_sass()') + def test_inspect_sass_lazy(self): + @cuda.jit + def add(x, y): + i = cuda.grid(1) + if i < len(x): + x[i] += y[i] + + x = np.arange(10).astype(np.int32) + y = np.arange(10).astype(np.float32) + add[1, 10](x, y) + + signature = (int32[::1], float32[::1]) + self._test_inspect_sass(add, 'add', add.inspect_sass(signature)) 
+ + @skip_with_nvdisasm('Missing nvdisasm exception only generated when it is ' + 'not present') + def test_inspect_sass_nvdisasm_missing(self): + @cuda.jit((float32[::1],)) + def f(x): + x[0] = 0 + + with self.assertRaises(RuntimeError) as raises: + f.inspect_sass() + + self.assertIn('nvdisasm is required', str(raises.exception)) + if __name__ == '__main__': unittest.main() diff --git a/numba/cuda/tests/cudapy/test_math.py b/numba/cuda/tests/cudapy/test_math.py index cdb128a90c2..cf64c0fd7c6 100644 --- a/numba/cuda/tests/cudapy/test_math.py +++ b/numba/cuda/tests/cudapy/test_math.py @@ -1,7 +1,7 @@ import sys import numpy as np from numba.cuda.testing import unittest, CUDATestCase -from numba import cuda, float32, float64, int32 +from numba import cuda, float32, float64, int32, int64 import math @@ -198,6 +198,7 @@ def unary_template(self, func, npfunc, npdtype, npmtype, start, stop): cfunc[1, nelem](A, B) self.assertTrue(np.allclose(npfunc(A), B)) + def unary_bool_template_float32(self, func, npfunc, start=0, stop=1): self.unary_template(func, npfunc, np.float32, float32, start, stop) @@ -205,6 +206,15 @@ def unary_bool_template_float32(self, func, npfunc, start=0, stop=1): def unary_bool_template_float64(self, func, npfunc, start=0, stop=1): self.unary_template(func, npfunc, np.float64, float64, start, stop) + + def unary_bool_template_int32(self, func, npfunc, start=0, stop=49): + self.unary_template(func, npfunc, np.int32, int32, start, stop) + + + def unary_bool_template_int64(self, func, npfunc, start=0, stop=49): + self.unary_template(func, npfunc, np.int64, int64, start, stop) + + def unary_bool_template(self, func, npfunc, npdtype, npmtype, start, stop): nelem = 50 A = np.linspace(start, stop, nelem).astype(npdtype) @@ -555,6 +565,8 @@ def test_math_mod_binop(self): def test_math_isnan(self): self.unary_bool_template_float32(math_isnan, np.isnan) self.unary_bool_template_float64(math_isnan, np.isnan) + self.unary_bool_template_int32(math_isnan, np.isnan) + self.unary_bool_template_int64(math_isnan, np.isnan) #------------------------------------------------------------------------------ # test_math_isinf @@ -563,6 +575,8 @@ def test_math_isnan(self): def test_math_isinf(self): self.unary_bool_template_float32(math_isinf, np.isinf) self.unary_bool_template_float64(math_isinf, np.isinf) + self.unary_bool_template_int32(math_isinf, np.isnan) + self.unary_bool_template_int64(math_isinf, np.isnan) #------------------------------------------------------------------------------ # test_math_degrees diff --git a/numba/cuda/tests/cudapy/test_optimization.py b/numba/cuda/tests/cudapy/test_optimization.py new file mode 100644 index 00000000000..a33c80bac5b --- /dev/null +++ b/numba/cuda/tests/cudapy/test_optimization.py @@ -0,0 +1,82 @@ +import numpy as np + +from numba.cuda.testing import skip_on_cudasim, CUDATestCase +from numba import cuda, float64 +import unittest + + +def kernel_func(x): + x[0] = 1 + + +def device_func(x, y, z): + return x * y + z + + +# Fragments of code that are removed from kernel_func's PTX when optimization +# is on +removed_by_opt = ( '__local_depot0', 'call.uni', 'st.param.b64') + + +@skip_on_cudasim('Simulator does not optimize code') +class TestOptimization(CUDATestCase): + def test_eager_opt(self): + # Optimization should occur by default + kernel = cuda.jit((float64[::1],))(kernel_func) + ptx = kernel.inspect_asm() + + for fragment in removed_by_opt: + with self.subTest(fragment=fragment): + self.assertNotIn(fragment, ptx) + + def test_eager_noopt(self): + # 
Optimization disabled + kernel = cuda.jit((float64[::1],), opt=False)(kernel_func) + ptx = kernel.inspect_asm() + + for fragment in removed_by_opt: + with self.subTest(fragment=fragment): + self.assertIn(fragment, ptx) + + def test_lazy_opt(self): + # Optimization should occur by default + kernel = cuda.jit(kernel_func) + x = np.zeros(1, dtype=np.float64) + kernel[1, 1](x) + + # Grab the PTX for the one definition that has just been jitted + ptx = next(iter(kernel.inspect_asm()))[1] + + for fragment in removed_by_opt: + with self.subTest(fragment=fragment): + self.assertNotIn(fragment, ptx) + + def test_lazy_noopt(self): + # Optimization disabled + kernel = cuda.jit(opt=False)(kernel_func) + x = np.zeros(1, dtype=np.float64) + kernel[1, 1](x) + + # Grab the PTX for the one definition that has just been jitted + ptx = next(iter(kernel.inspect_asm().items()))[1] + + for fragment in removed_by_opt: + with self.subTest(fragment=fragment): + self.assertIn(fragment, ptx) + + def test_device_opt(self): + # Optimization should occur by default + device = cuda.jit(device=True)(device_func) + ptx = device.inspect_ptx((float64, float64, float64)).decode('utf-8') + self.assertIn('fma.rn.f64', ptx) + + def test_device_noopt(self): + # Optimization disabled + device = cuda.jit(device=True, opt=False)(device_func) + ptx = device.inspect_ptx((float64, float64, float64)).decode('utf-8') + # Fused-multiply adds should be disabled when not optimizing + self.assertNotIn('fma.rn.f64', ptx) + + +if __name__ == '__main__': + unittest.main() diff --git a/numba/cuda/tests/nocuda/test_nvvm.py b/numba/cuda/tests/nocuda/test_nvvm.py index 6bcac98c6f6..aca1587c7ea 100644 --- a/numba/cuda/tests/nocuda/test_nvvm.py +++ b/numba/cuda/tests/nocuda/test_nvvm.py @@ -5,6 +5,13 @@ import unittest +original = "call void @llvm.memset.p0i8.i64(" \ + "i8* align 4 %arg.x.41, i8 0, i64 %0, i1 false)" + +missing_align = "call void @llvm.memset.p0i8.i64(" \ + "i8* %arg.x.41, i8 0, i64 %0, i1 false)" + + @skip_on_cudasim('libNVVM not supported in simulator') @unittest.skipIf(utils.MACHINE_BITS == 32, "CUDA not support for 32-bit") @unittest.skipIf(not nvvm.is_available(), "No libNVVM") @@ -29,25 +36,9 @@ def test_nvvm_memset_fixup(self): In LLVM7 the alignment parameter can be implicitly provided as an attribute to pointer in the first argument. """ - def foo(x): - # Triggers a generation of llvm.memset - for i in range(x.size): - x[i] = 0 - - cukern = compile_kernel(foo, args=(types.int32[::1],), link=()) - original = cukern._func.ptx.llvmir - self.assertIn("call void @llvm.memset", original) fixed = nvvm.llvm39_to_34_ir(original) self.assertIn("call void @llvm.memset", fixed) - # Check original IR - for ln in original.splitlines(): - if 'call void @llvm.memset' in ln: - # Missing i32 4 in the 2nd last argument - self.assertRegexpMatches( - ln, - r'i64 %\d+, i1 false\)'.replace(' ', r'\s+'), - ) - # Check fixed IR + for ln in fixed.splitlines(): if 'call void @llvm.memset' in ln: # The i32 4 is the alignment @@ -56,6 +47,17 @@ def foo(x): r'i32 4, i1 false\)'.replace(' ', r'\s+'), ) + def test_nvvm_memset_fixup_missing_align(self): + """ + We require alignment to be specified as a parameter attribute to the + dest argument of a memset. 
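+        For example, ``i8* align 4 %dest`` carries the required attribute,
+        whereas a plain ``i8* %dest`` should raise ``ValueError``.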
+ """ + with self.assertRaises(ValueError) as e: + nvvm.llvm39_to_34_ir(missing_align) + + self.assertIn(str(e.exception), + "No alignment attribute found on memset dest") + if __name__ == '__main__': unittest.main() diff --git a/numba/experimental/jitclass/decorators.py b/numba/experimental/jitclass/decorators.py index aacd14c5366..eec874a47fe 100644 --- a/numba/experimental/jitclass/decorators.py +++ b/numba/experimental/jitclass/decorators.py @@ -36,7 +36,7 @@ def _warning_jitclass(spec): To be used in numba/__init__.py. This endpoint is deprecated. """ - url = ("http://numba.pydata.org/numba-doc/latest/reference/" + url = ("https://numba.pydata.org/numba-doc/latest/reference/" "deprecation.html#change-of-jitclass-location") msg = ("The 'numba.jitclass' decorator has moved to " diff --git a/numba/misc/gdb_hook.py b/numba/misc/gdb_hook.py index 8613e957d50..34b3548c157 100644 --- a/numba/misc/gdb_hook.py +++ b/numba/misc/gdb_hook.py @@ -22,7 +22,7 @@ def _confirm_gdb(): if not (os.path.exists(gdbloc) and os.path.isfile(gdbloc)): msg = ('Is gdb present? Location specified (%s) does not exist. The gdb' ' binary location can be set using Numba configuration, see: ' - 'http://numba.pydata.org/numba-doc/latest/reference/envvars.html' + 'https://numba.pydata.org/numba-doc/latest/reference/envvars.html' # noqa: E501 ) raise RuntimeError(msg % config.GDB_BINARY) # Is Yama being used as a kernel security module and if so is ptrace_scope diff --git a/numba/misc/literal.py b/numba/misc/literal.py index 855a4155e1e..2bc1225b7e4 100644 --- a/numba/misc/literal.py +++ b/numba/misc/literal.py @@ -15,6 +15,9 @@ def _ov_literally(obj): @overload(literal_unroll) def literal_unroll_impl(container): + if isinstance(container, types.Poison): + m = f"Invalid use of non-Literal type in literal_unroll({container})" + raise TypingError(m) def impl(container): return container diff --git a/numba/np/ufunc/dufunc.py b/numba/np/ufunc/dufunc.py index 1c01c2af132..093e02ec6d9 100644 --- a/numba/np/ufunc/dufunc.py +++ b/numba/np/ufunc/dufunc.py @@ -78,7 +78,7 @@ class DUFunc(serialize.ReduceMixin, _internal._DUFunc): def __init__(self, py_func, identity=None, cache=False, targetoptions={}): if isinstance(py_func, Dispatcher): py_func = py_func.py_func - dispatcher = jit(target='npyufunc', + dispatcher = jit(_target='npyufunc', cache=cache, **targetoptions)(py_func) self._initialize(dispatcher, identity) diff --git a/numba/np/ufunc/parallel.py b/numba/np/ufunc/parallel.py index b5387c03722..1b4a3b53ddf 100644 --- a/numba/np/ufunc/parallel.py +++ b/numba/np/ufunc/parallel.py @@ -287,28 +287,36 @@ def __exit__(self, *args): pass -try: - # Force the use of an RLock in the case a fork was used to start the - # process and thereby the init sequence, some of the threading backend - # init sequences are not fork safe. Also, windows global mp locks seem - # to be fine. - if "fork" in multiprocessing.get_start_method() or _windows: - _backend_init_process_lock = multiprocessing.get_context().RLock() - else: - _backend_init_process_lock = _nop() +_backend_init_process_lock = None -except OSError as e: - # probably lack of /dev/shm for semaphore writes, warn the user - msg = ("Could not obtain multiprocessing lock due to OS level error: %s\n" - "A likely cause of this problem is '/dev/shm' is missing or" - "read-only such that necessary semaphores cannot be written.\n" - "*** The responsibility of ensuring multiprocessing safe access to " - "this initialization sequence/module import is deferred to the " - "user! 
***\n") - warnings.warn(msg % str(e)) +def _set_init_process_lock(): + global _backend_init_process_lock + try: + # Force the use of an RLock in the case a fork was used to start the + # process and thereby the init sequence, some of the threading backend + # init sequences are not fork safe. Also, windows global mp locks seem + # to be fine. + if "fork" in multiprocessing.get_start_method() or _windows: + _backend_init_process_lock = multiprocessing.get_context().RLock() + else: + _backend_init_process_lock = _nop() + + except OSError as e: + + # probably lack of /dev/shm for semaphore writes, warn the user + msg = ( + "Could not obtain multiprocessing lock due to OS level error: %s\n" + "A likely cause of this problem is '/dev/shm' is missing or" + "read-only such that necessary semaphores cannot be written.\n" + "*** The responsibility of ensuring multiprocessing safe access to " + "this initialization sequence/module import is deferred to the " + "user! ***\n" + ) + warnings.warn(msg % str(e)) + + _backend_init_process_lock = _nop() - _backend_init_process_lock = _nop() _is_initialized = False @@ -361,6 +369,9 @@ def _check_tbb_version_compatible(): def _launch_threads(): + if not _backend_init_process_lock: + _set_init_process_lock() + with _backend_init_process_lock: with _backend_init_thread_lock: global _is_initialized diff --git a/numba/np/ufunc/ufuncbuilder.py b/numba/np/ufunc/ufuncbuilder.py index d878606e26c..8339c48c065 100644 --- a/numba/np/ufunc/ufuncbuilder.py +++ b/numba/np/ufunc/ufuncbuilder.py @@ -228,7 +228,7 @@ class UFuncBuilder(_BaseUFuncBuilder): def __init__(self, py_func, identity=None, cache=False, targetoptions={}): self.py_func = py_func self.identity = parse_identity(identity) - self.nb_func = jit(target='npyufunc', + self.nb_func = jit(_target='npyufunc', cache=cache, **targetoptions)(py_func) self._sigs = [] @@ -293,7 +293,7 @@ def __init__(self, py_func, signature, identity=None, cache=False, targetoptions={}): self.py_func = py_func self.identity = parse_identity(identity) - self.nb_func = jit(target='npyufunc', cache=cache)(py_func) + self.nb_func = jit(_target='npyufunc', cache=cache)(py_func) self.signature = signature self.sin, self.sout = parse_signature(signature) self.targetoptions = targetoptions diff --git a/numba/parfors/array_analysis.py b/numba/parfors/array_analysis.py index 61d329f439c..378faba4c3f 100644 --- a/numba/parfors/array_analysis.py +++ b/numba/parfors/array_analysis.py @@ -143,6 +143,8 @@ def assert_equiv(typingctx, *val): or isinstance(a, types.Integer) for a in val[0][1:] ) + if not isinstance(val[0][0], types.StringLiteral): + raise errors.TypingError('first argument must be a StringLiteral') def codegen(context, builder, sig, args): assert len(args) == 1 # it is a vararg tuple diff --git a/numba/parfors/parfor.py b/numba/parfors/parfor.py index 61810c411f1..988ce254716 100644 --- a/numba/parfors/parfor.py +++ b/numba/parfors/parfor.py @@ -1531,6 +1531,124 @@ def __init__(self, func_ir, typemap, calltypes, return_type, typingctx, self.flags = flags +class ConvertInplaceBinop: + """Parfor subpass to convert setitem on Arrays + """ + def __init__(self, pass_states): + """ + Parameters + ---------- + pass_states : ParforPassStates + """ + self.pass_states = pass_states + self.rewritten = [] + + def run(self, blocks): + pass_states = self.pass_states + # convert expressions like A += ... where A is an array. 
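+        # For illustration: under ``@njit(parallel=True)``, an update such as
+        #
+        #     A += B        # A and B arrays over the same iteration space
+        #
+        # is rewritten here into an element-wise parfor rather than being
+        # lowered as a serial in-place binop.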
+ topo_order = find_topo_order(blocks) + # variables available in the program so far (used for finding map + # functions in array_expr lowering) + for label in topo_order: + block = blocks[label] + new_body = [] + equiv_set = pass_states.array_analysis.get_equiv_set(label) + for instr in block.body: + if isinstance(instr, ir.Assign): + lhs = instr.target + expr = instr.value + if isinstance(expr, ir.Expr) and expr.op == 'inplace_binop': + loc = expr.loc + target = expr.lhs + value = expr.rhs + target_typ = pass_states.typemap[target.name] + value_typ = pass_states.typemap[value.name] + # Handle A op= ... + if isinstance(target_typ, types.npytypes.Array): + # RHS is an array + if isinstance(value_typ, types.npytypes.Array): + new_instr = self._inplace_binop_to_parfor(equiv_set, + loc, expr.immutable_fn, target, value) + self.rewritten.append( + dict(old=instr, new=new_instr, + reason='inplace_binop'), + ) + instr = [new_instr, ir.Assign(target, lhs, loc)] + if isinstance(instr, list): + new_body.extend(instr) + else: + new_body.append(instr) + block.body = new_body + + def _inplace_binop_to_parfor(self, equiv_set, loc, op, target, value): + """generate parfor from setitem node with a boolean or slice array indices. + The value can be either a scalar or an array variable, and if a boolean index + is used for the latter case, the same index must be used for the value too. + """ + pass_states = self.pass_states + scope = target.scope + arr_typ = pass_states.typemap[target.name] + el_typ = arr_typ.dtype + init_block = ir.Block(scope, loc) + value_typ = pass_states.typemap[value.name] + + size_vars = equiv_set.get_shape(target) + + # generate loopnests and size variables from target correlations + index_vars, loopnests = _mk_parfor_loops(pass_states.typemap, size_vars, scope, loc) + + # generate body + body_label = next_label() + body_block = ir.Block(scope, loc) + index_var, index_var_typ = _make_index_var( + pass_states.typemap, scope, index_vars, body_block) + + # Read value. + value_var = ir.Var(scope, mk_unique_var("$value_var"), loc) + pass_states.typemap[value_var.name] = value_typ.dtype + getitem_call = ir.Expr.getitem(value, index_var, loc) + pass_states.calltypes[getitem_call] = signature( + value_typ.dtype, value_typ, index_var_typ) + body_block.body.append(ir.Assign(getitem_call, value_var, loc)) + + # Read target + target_var = ir.Var(scope, mk_unique_var("$target_var"), loc) + pass_states.typemap[target_var.name] = el_typ + getitem_call = ir.Expr.getitem(target, index_var, loc) + pass_states.calltypes[getitem_call] = signature( + el_typ, arr_typ, index_var_typ) + body_block.body.append(ir.Assign(getitem_call, target_var, loc)) + + # Create temp to hold result. + expr_out_var = ir.Var(scope, mk_unique_var("$expr_out_var"), loc) + pass_states.typemap[expr_out_var.name] = el_typ + + # Create binop and assign result to temporary. 
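+        # i.e. the parfor body computes ``target[index] op value[index]`` and
+        # the setitem below stores the result back into ``target[index]``.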
+ binop_expr = ir.Expr.binop(op, target_var, value_var, loc) + body_block.body.append(ir.Assign(binop_expr, expr_out_var, loc)) + unified_type = self.pass_states.typingctx.unify_pairs(el_typ, value_typ.dtype) + pass_states.calltypes[binop_expr] = signature( + unified_type, unified_type, unified_type) + + # Write to target + setitem_node = ir.SetItem(target, index_var, expr_out_var, loc) + pass_states.calltypes[setitem_node] = signature( + types.none, arr_typ, index_var_typ, el_typ) + body_block.body.append(setitem_node) + + parfor = Parfor(loopnests, init_block, {}, loc, index_var, equiv_set, + ('inplace_binop', ''), pass_states.flags) + parfor.loop_body = {body_label: body_block} + if config.DEBUG_ARRAY_OPT >= 1: + print("parfor from inplace_binop") + parfor.dump() + return parfor + + def _type_getitem(self, args): + fnty = operator.getitem + return self.pass_states.typingctx.resolve_function_type(fnty, tuple(args), {}) + + class ConvertSetItemPass: """Parfor subpass to convert setitem on Arrays """ @@ -1550,7 +1668,6 @@ def run(self, blocks): topo_order = find_topo_order(blocks) # variables available in the program so far (used for finding map # functions in array_expr lowering) - avail_vars = [] for label in topo_order: block = blocks[label] new_body = [] @@ -1916,14 +2033,14 @@ def _numpy_map_to_parfor(self, equiv_set, call_name, lhs, args, kws, expr): value_assign = ir.Assign(value, expr_out_var, loc) body_block.body.append(value_assign) - parfor = Parfor(loopnests, init_block, {}, loc, index_var, equiv_set, - ('{} function'.format(call_name,), 'NumPy mapping'), - pass_states.flags) - setitem_node = ir.SetItem(lhs, index_var, expr_out_var, loc) pass_states.calltypes[setitem_node] = signature( types.none, pass_states.typemap[lhs.name], index_var_typ, el_typ) body_block.body.append(setitem_node) + + parfor = Parfor(loopnests, init_block, {}, loc, index_var, equiv_set, + ('{} function'.format(call_name,), 'NumPy mapping'), + pass_states.flags) parfor.loop_body = {body_label: body_block} if config.DEBUG_ARRAY_OPT >= 1: print("generated parfor for numpy map:") @@ -2601,6 +2718,8 @@ def run(self): ConvertReducePass(self).run(self.func_ir.blocks) if self.options.prange: ConvertLoopPass(self).run(self.func_ir.blocks) + if self.options.inplace_binop: + ConvertInplaceBinop(self).run(self.func_ir.blocks) # setup diagnostics now parfors are found self.diagnostics.setup(self.func_ir, self.options.fusion) @@ -2627,6 +2746,7 @@ def run(self): up_direction=False) dprint_func_ir(self.func_ir, "after maximize fusion down") self.fuse_parfors(self.array_analysis, self.func_ir.blocks) + dprint_func_ir(self.func_ir, "after first fuse") # push non-parfors up maximize_fusion(self.func_ir, self.func_ir.blocks, self.typemap) dprint_func_ir(self.func_ir, "after maximize fusion up") @@ -2763,6 +2883,7 @@ def fuse_parfors(self, array_analysis, blocks): def fuse_recursive_parfor(self, parfor, equiv_set): blocks = wrap_parfor_blocks(parfor) maximize_fusion(self.func_ir, blocks, self.typemap) + dprint_func_ir(self.func_ir, "after recursive maximize fusion down", blocks) arr_analysis = array_analysis.ArrayAnalysis(self.typingctx, self.func_ir, self.typemap, self.calltypes) arr_analysis.run(blocks, equiv_set) @@ -3587,14 +3708,26 @@ def maximize_fusion(func_ir, blocks, typemap, up_direction=True): so they are adjacent. 
""" call_table, _ = get_call_table(blocks) - alias_map, arg_aliases = find_potential_aliases(blocks, func_ir.arg_names, typemap, func_ir) + alias_map, arg_aliases = find_potential_aliases( + blocks, + func_ir.arg_names, + typemap, + func_ir + ) for block in blocks.values(): order_changed = True while order_changed: - order_changed = maximize_fusion_inner(func_ir, block, call_table, - alias_map, up_direction) + order_changed = maximize_fusion_inner( + func_ir, + block, + call_table, + alias_map, + arg_aliases, + up_direction + ) -def maximize_fusion_inner(func_ir, block, call_table, alias_map, up_direction=True): +def maximize_fusion_inner(func_ir, block, call_table, alias_map, + arg_aliases, up_direction=True): order_changed = False i = 0 # i goes to body[-3] (i+1 to body[-2]) since body[-1] is terminator and @@ -3602,9 +3735,10 @@ def maximize_fusion_inner(func_ir, block, call_table, alias_map, up_direction=Tr while i < len(block.body) - 2: stmt = block.body[i] next_stmt = block.body[i+1] - can_reorder = (_can_reorder_stmts(stmt, next_stmt, func_ir, call_table, alias_map) + can_reorder = (_can_reorder_stmts(stmt, next_stmt, func_ir, + call_table, alias_map, arg_aliases) if up_direction else _can_reorder_stmts(next_stmt, stmt, - func_ir, call_table, alias_map)) + func_ir, call_table, alias_map, arg_aliases)) if can_reorder: block.body[i] = next_stmt block.body[i+1] = stmt @@ -3612,16 +3746,18 @@ def maximize_fusion_inner(func_ir, block, call_table, alias_map, up_direction=Tr i += 1 return order_changed -def expand_aliases(the_set, alias_map): +def expand_aliases(the_set, alias_map, arg_aliases): ret = set() for i in the_set: if i in alias_map: ret = ret.union(alias_map[i]) - else: - ret.add(i) + if i in arg_aliases: + ret = ret.union(arg_aliases) + ret.add(i) return ret -def _can_reorder_stmts(stmt, next_stmt, func_ir, call_table, alias_map): +def _can_reorder_stmts(stmt, next_stmt, func_ir, call_table, + alias_map, arg_aliases): """ Check dependencies to determine if a parfor can be reordered in the IR block with a non-parfor statement. @@ -3630,18 +3766,20 @@ def _can_reorder_stmts(stmt, next_stmt, func_ir, call_table, alias_map): # don't reorder calls with side effects (e.g. 
file close) # only read-read dependencies are OK # make sure there is no write-write, write-read dependencies - if (isinstance( - stmt, Parfor) and not isinstance( - next_stmt, Parfor) and not isinstance( - next_stmt, ir.Print) - and (not isinstance(next_stmt, ir.Assign) - or has_no_side_effect( - next_stmt.value, set(), call_table) + if (isinstance(stmt, Parfor) + and not isinstance(next_stmt, Parfor) + and not isinstance(next_stmt, ir.Print) + and (not isinstance(next_stmt, ir.Assign) + or has_no_side_effect(next_stmt.value, set(), call_table) or guard(is_assert_equiv, func_ir, next_stmt.value))): - stmt_accesses = expand_aliases({v.name for v in stmt.list_vars()}, alias_map) - stmt_writes = expand_aliases(get_parfor_writes(stmt), alias_map) - next_accesses = expand_aliases({v.name for v in next_stmt.list_vars()}, alias_map) - next_writes = expand_aliases(get_stmt_writes(next_stmt), alias_map) + stmt_accesses = expand_aliases({v.name for v in stmt.list_vars()}, + alias_map, arg_aliases) + stmt_writes = expand_aliases(get_parfor_writes(stmt), + alias_map, arg_aliases) + next_accesses = expand_aliases({v.name for v in next_stmt.list_vars()}, + alias_map, arg_aliases) + next_writes = expand_aliases(get_stmt_writes(next_stmt), + alias_map, arg_aliases) if len((stmt_writes & next_accesses) | (next_writes & stmt_accesses)) == 0: return True diff --git a/numba/pycc/compiler.py b/numba/pycc/compiler.py index bb7d2682143..4fea211261c 100644 --- a/numba/pycc/compiler.py +++ b/numba/pycc/compiler.py @@ -99,8 +99,8 @@ class _ModuleCompiler(object): lt._int8_star)) method_def_ptr = lc.Type.pointer(method_def_ty) - - env_def_ty = lc.Type.struct((lt._void_star, lt._int32)) + # The structure type constructed by PythonAPI.serialize_uncached() + env_def_ty = lc.Type.struct((lt._void_star, lt._int32, lt._void_star)) env_def_ptr = lc.Type.pointer(env_def_ty) def __init__(self, export_entries, module_name, use_nrt=False, diff --git a/numba/pycc/modulemixin.c b/numba/pycc/modulemixin.c index 1aa3dbb6657..e2bfeca63c2 100644 --- a/numba/pycc/modulemixin.c +++ b/numba/pycc/modulemixin.c @@ -59,6 +59,7 @@ extern void *nrt_atomic_add, *nrt_atomic_sub; typedef struct { const char *data; int len; + const char *hashbuf; } env_def_t; /* Environment GlobalVariable address type */ @@ -73,7 +74,7 @@ recreate_environment(PyObject *module, env_def_t env) EnvironmentObject *envobj; PyObject *env_consts; - env_consts = numba_unpickle(env.data, env.len); + env_consts = numba_unpickle(env.data, env.len, env.hashbuf); if (env_consts == NULL) return NULL; if (!PyList_Check(env_consts)) { diff --git a/numba/tests/test_comprehension.py b/numba/tests/test_comprehension.py index b269fd6460d..d0c4b402065 100644 --- a/numba/tests/test_comprehension.py +++ b/numba/tests/test_comprehension.py @@ -282,6 +282,16 @@ def comp_with_array_noinline(n): finally: ic.enable_inline_arraycall = True + def test_comp_with_array_noinline_issue_6053(self): + def comp_with_array_noinline(n): + lst = [0] + for i in range(n): + lst.append(i) + l = np.array(lst) + return l + + self.check(comp_with_array_noinline, 5, assert_allocate_list=True) + def test_comp_nest_with_array(self): def comp_nest_with_array(n): l = np.array([[i * j for j in range(n)] for i in range(n)]) diff --git a/numba/tests/test_deprecations.py b/numba/tests/test_deprecations.py index 3122b091db2..e7abdf51be4 100644 --- a/numba/tests/test_deprecations.py +++ b/numba/tests/test_deprecations.py @@ -11,7 +11,7 @@ def check_warning(self, warnings, expected_str, category): 
self.assertEqual(len(warnings), 1) self.assertEqual(warnings[0].category, category) self.assertIn(expected_str, str(warnings[0].message)) - self.assertIn("http://numba.pydata.org", str(warnings[0].message)) + self.assertIn("https://numba.pydata.org", str(warnings[0].message)) def test_jitfallback(self): # tests that @jit falling back to object mode raises a @@ -52,7 +52,7 @@ def foo_set(a): self.assertIn(msg, warn_msg) msg = ("\'reflected %s\' found for argument" % container) self.assertIn(msg, warn_msg) - self.assertIn("http://numba.pydata.org", warn_msg) + self.assertIn("https://numba.pydata.org", warn_msg) if __name__ == '__main__': diff --git a/numba/tests/test_dictobject.py b/numba/tests/test_dictobject.py index 2a139491a89..e4a3bdd5aed 100644 --- a/numba/tests/test_dictobject.py +++ b/numba/tests/test_dictobject.py @@ -1727,6 +1727,65 @@ def foo(): foo() + def test_mutation_not_carried_single_function(self): + # this is another pattern for using literally + + @njit + def nop(*args): + pass + + for fn, iv in (nop, None), (literally, {'a': 1, 'b': 2, 'c': 3}): + @njit + def baz(x): + pass + + def bar(z): + pass + + @overload(bar) + def ol_bar(z): + def impl(z): + fn(z) + baz(z) + return impl + + @njit + def foo(): + x = {'a': 1, 'b': 2, 'c': 3} + bar(x) + x['d'] = 4 + return x + + foo() + # baz should be specialised based on literally being invoked and + # the literal/unliteral arriving at the call site + larg = baz.signatures[0][0] + self.assertEqual(larg.initial_value, iv) + + def test_unify_across_function_call(self): + + @njit + def bar(x): + o = {1: 2} + if x: + o = {2: 3} + return o + + @njit + def foo(x): + if x: + d = {3: 4} + else: + d = bar(x) + return d + + e1 = Dict() + e1[3] = 4 + e2 = Dict() + e2[1] = 2 + self.assertEqual(foo(True), e1) + self.assertEqual(foo(False), e2) + class TestLiteralStrKeyDict(MemoryLeakMixin, TestCase): """ Tests for dictionaries with string keys that can map to anything!""" diff --git a/numba/tests/test_errorhandling.py b/numba/tests/test_errorhandling.py index fca546ccc47..83f06211b23 100644 --- a/numba/tests/test_errorhandling.py +++ b/numba/tests/test_errorhandling.py @@ -8,6 +8,7 @@ from numba import jit, njit, typed, int64, types from numba.core import errors import numba.core.typing.cffi_utils as cffi_support +from numba.experimental import structref from numba.extending import (overload, intrinsic, overload_method, overload_attribute) from numba.core.compiler import CompilerBase @@ -423,6 +424,25 @@ def foo(): excstr = str(raises.exception) self.assertIn("Type Restricted Function in function 'unknown'", excstr) + def test_missing_source(self): + + @structref.register + class ParticleType(types.StructRef): + pass + + class Particle(structref.StructRefProxy): + def __new__(cls, pos, mass): + return structref.StructRefProxy.__new__(cls, pos) + # didn't provide the required mass argument ----^ + + structref.define_proxy(Particle, ParticleType, ["pos", "mass"]) + + with self.assertRaises(errors.TypingError) as raises: + Particle(pos=1, mass=2) + + excstr = str(raises.exception) + self.assertIn("required positional argument: 'mass'", excstr) + class TestDeveloperSpecificErrorMessages(SerialMixin, unittest.TestCase): diff --git a/numba/tests/test_extending.py b/numba/tests/test_extending.py index 972fddd285d..6b4e0c4f47d 100644 --- a/numba/tests/test_extending.py +++ b/numba/tests/test_extending.py @@ -1770,5 +1770,100 @@ def foo(x): ) +class TestOverloadPreferLiteral(TestCase): + def test_overload(self): + def prefer_lit(x): + pass + + def 
non_lit(x): + pass + + def ov(x): + if isinstance(x, types.IntegerLiteral): + # With prefer_literal=False, this branch will not be reached. + if x.literal_value == 1: + def impl(x): + return 0xcafe + return impl + else: + raise errors.TypingError('literal value') + else: + def impl(x): + return x * 100 + return impl + + overload(prefer_lit, prefer_literal=True)(ov) + overload(non_lit)(ov) + + @njit + def check_prefer_lit(x): + return prefer_lit(1), prefer_lit(2), prefer_lit(x) + + a, b, c = check_prefer_lit(3) + self.assertEqual(a, 0xcafe) + self.assertEqual(b, 200) + self.assertEqual(c, 300) + + @njit + def check_non_lit(x): + return non_lit(1), non_lit(2), non_lit(x) + + a, b, c = check_non_lit(3) + self.assertEqual(a, 100) + self.assertEqual(b, 200) + self.assertEqual(c, 300) + + def test_overload_method(self): + def ov(self, x): + if isinstance(x, types.IntegerLiteral): + # With prefer_literal=False, this branch will not be reached. + if x.literal_value == 1: + def impl(self, x): + return 0xcafe + return impl + else: + raise errors.TypingError('literal value') + else: + def impl(self, x): + return x * 100 + return impl + + overload_method( + MyDummyType, "method_prefer_literal", + prefer_literal=True, + )(ov) + + overload_method( + MyDummyType, "method_non_literal", + prefer_literal=False, + )(ov) + + @njit + def check_prefer_lit(dummy, x): + return ( + dummy.method_prefer_literal(1), + dummy.method_prefer_literal(2), + dummy.method_prefer_literal(x), + ) + + a, b, c = check_prefer_lit(MyDummy(), 3) + self.assertEqual(a, 0xcafe) + self.assertEqual(b, 200) + self.assertEqual(c, 300) + + @njit + def check_non_lit(dummy, x): + return ( + dummy.method_non_literal(1), + dummy.method_non_literal(2), + dummy.method_non_literal(x), + ) + + a, b, c = check_non_lit(MyDummy(), 3) + self.assertEqual(a, 100) + self.assertEqual(b, 200) + self.assertEqual(c, 300) + + if __name__ == "__main__": unittest.main() diff --git a/numba/tests/test_lists.py b/numba/tests/test_lists.py index fcefb614bb6..0ae2bb366f6 100644 --- a/numba/tests/test_lists.py +++ b/numba/tests/test_lists.py @@ -1481,6 +1481,57 @@ def foo(): foo() + def test_mutation_not_carried_single_function(self): + # this is another pattern for using literally + + @njit + def nop(*args): + pass + + for fn, iv in (nop, None), (literally, [1, 2, 3]): + @njit + def baz(x): + pass + + def bar(z): + pass + + @overload(bar) + def ol_bar(z): + def impl(z): + fn(z) + baz(z) + return impl + + @njit + def foo(): + x = [1, 2, 3] + bar(x) + x.append(2) + return x + + foo() + # baz should be specialised based on literally being invoked and + # the literal/unliteral arriving at the call site + larg = baz.signatures[0][0] + self.assertEqual(larg.initial_value, iv) + + def test_list_of_list_ctor(self): + # see issue 6082 + @njit + def bar(x): + pass + + @njit + def foo(): + x = [[1, 2, 3, 4, 5], [1, 2, 3, 4, 6]] + bar(x) + + foo() + larg = bar.signatures[0][0] + self.assertEqual(larg.initial_value, None) + self.assertEqual(larg.dtype.initial_value, None) + class TestLiteralLists(MemoryLeakMixin, TestCase): diff --git a/numba/tests/test_parallel_backend.py b/numba/tests/test_parallel_backend.py index 34d99c0e933..d3e796e2914 100644 --- a/numba/tests/test_parallel_backend.py +++ b/numba/tests/test_parallel_backend.py @@ -862,33 +862,32 @@ class TestInitSafetyIssues(TestCase): _DEBUG = False + def run_cmd(self, cmdline): + popen = subprocess.Popen(cmdline, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE,) + # finish in _TEST_TIMEOUT seconds or kill it + timeout 
= threading.Timer(_TEST_TIMEOUT, popen.kill) + try: + timeout.start() + out, err = popen.communicate() + if popen.returncode != 0: + raise AssertionError( + "process failed with code %s: stderr follows\n%s\n" % + (popen.returncode, err.decode())) + finally: + timeout.cancel() + return out.decode(), err.decode() + @linux_only # only linux can leak semaphores def test_orphaned_semaphore(self): # sys path injection and separate usecase module to make sure everything # is importable by children of multiprocessing - def run_cmd(cmdline): - popen = subprocess.Popen(cmdline, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE,) - # finish in _TEST_TIMEOUT seconds or kill it - timeout = threading.Timer(_TEST_TIMEOUT, popen.kill) - try: - timeout.start() - out, err = popen.communicate() - if popen.returncode != 0: - raise AssertionError( - "process failed with code %s: stderr follows\n%s\n" % - (popen.returncode, err.decode())) - finally: - timeout.cancel() - return out.decode(), err.decode() - test_file = os.path.join(os.path.dirname(__file__), "orphaned_semaphore_usecase.py") - cmdline = [sys.executable, test_file] - out, err = run_cmd(cmdline) + out, err = self.run_cmd(cmdline) # assert no semaphore leaks reported on stderr self.assertNotIn("leaked semaphore", err) @@ -897,6 +896,27 @@ def run_cmd(cmdline): print("OUT:", out) print("ERR:", err) + def test_lazy_lock_init(self): + # checks based on https://github.com/numba/numba/pull/5724 + # looking for "lazy" process lock initialisation so as to avoid setting + # a multiprocessing context as part of import. + for meth in ('fork', 'spawn', 'forkserver'): + # if a context is available on the host check it can be set as the + # start method in a separate process + try: + multiprocessing.get_context(meth) + except ValueError: + continue + cmd = ("import numba; import multiprocessing;" + "multiprocessing.set_start_method('{}');" + "print(multiprocessing.get_context().get_start_method())") + cmdline = [sys.executable, "-c", cmd.format(meth)] + out, err = self.run_cmd(cmdline) + if self._DEBUG: + print("OUT:", out) + print("ERR:", err) + self.assertIn(meth, out) + @skip_parfors_unsupported @skip_no_omp diff --git a/numba/tests/test_parfors.py b/numba/tests/test_parfors.py index a4003686d5b..2869589a2e2 100644 --- a/numba/tests/test_parfors.py +++ b/numba/tests/test_parfors.py @@ -569,7 +569,7 @@ def test_blackscholes(self): def test_logistic_regression(self): args = (numba.float64[:], numba.float64[:,:], numba.float64[:], numba.int64) - self.assertTrue(countParfors(lr_impl, args) == 1) + self.assertTrue(countParfors(lr_impl, args) == 2) self.assertTrue(countArrayAllocs(lr_impl, args) == 1) @skip_parfors_unsupported @@ -1628,6 +1628,18 @@ def test_impl(a): x = np.arange(10) self.check(test_impl, x) + @skip_parfors_unsupported + def test_inplace_binop(self): + def test_impl(a, b): + b += a + return b + + X = np.arange(10) + 10 + Y = np.arange(10) + 100 + self.check(test_impl, X, Y) + self.assertTrue(countParfors(test_impl, + (types.Array(types.float64, 1, 'C'), + types.Array(types.float64, 1, 'C'))) == 1) class TestParforsLeaks(MemoryLeakMixin, TestParforsBase): def check(self, pyfunc, *args, **kwargs): @@ -3050,6 +3062,46 @@ def test_impl(a): self.check(test_impl, np.arange(3)) + @skip_parfors_unsupported + def test_issue5942_1(self): + # issue5942: tests statement reordering of + # aliased arguments. 
+ def test_impl(gg, gg_next): + gs = gg.shape + d = gs[0] + for i_gg in prange(d): + gg_next[i_gg, :] = gg[i_gg, :] + gg_next[i_gg, 0] += 1 + + return gg_next + + d = 4 + k = 2 + + gg = np.zeros((d, k), dtype = np.int32) + gg_next = np.zeros((d, k), dtype = np.int32) + self.check(test_impl, gg, gg_next) + + @skip_parfors_unsupported + def test_issue5942_2(self): + # issue5942: tests statement reordering + def test_impl(d, k): + gg = np.zeros((d, k), dtype = np.int32) + gg_next = np.zeros((d, k), dtype = np.int32) + + for i_gg in prange(d): + for n in range(k): + gg[i_gg, n] = i_gg + gg_next[i_gg, :] = gg[i_gg, :] + gg_next[i_gg, 0] += 1 + + return gg_next + + d = 4 + k = 2 + + self.check(test_impl, d, k) + class TestParforsOptions(TestParforsBase): diff --git a/numba/tests/test_serialize.py b/numba/tests/test_serialize.py index f345eda65ca..2bcf843458a 100644 --- a/numba/tests/test_serialize.py +++ b/numba/tests/test_serialize.py @@ -186,5 +186,23 @@ def foo(x): """ subprocess.check_call([sys.executable, "-c", code]) + +class TestSerializationMisc(TestCase): + def test_numba_unpickle(self): + # Test that _numba_unpickle is memorizing its output + from numba.core.serialize import _numba_unpickle + + random_obj = object() + bytebuf = pickle.dumps(random_obj) + hashed = hash(random_obj) + + got1 = _numba_unpickle(id(random_obj), bytebuf, hashed) + # not the original object + self.assertIsNot(got1, random_obj) + got2 = _numba_unpickle(id(random_obj), bytebuf, hashed) + # unpickled results are the same objects + self.assertIs(got1, got2) + + if __name__ == '__main__': unittest.main() diff --git a/numba/tests/test_stencils.py b/numba/tests/test_stencils.py index 261d6581e54..6a10523fc70 100644 --- a/numba/tests/test_stencils.py +++ b/numba/tests/test_stencils.py @@ -1260,7 +1260,7 @@ def computebound(mins, maxs): ast.copy_location(returner, node) add_kwarg = [ast.arg('neighborhood', None)] - defaults = [ast.Name(id='None', ctx=ast.Load())] + defaults = [] newargs = ast.arguments( args=node.args.args + diff --git a/numba/tests/test_svml.py b/numba/tests/test_svml.py index 459fdbbc653..cebac432c19 100644 --- a/numba/tests/test_svml.py +++ b/numba/tests/test_svml.py @@ -19,6 +19,9 @@ # a map of float64 vector lenghs with corresponding CPU architecture vlen2cpu = {2: 'nehalem', 4: 'haswell', 8: 'skylake-avx512'} +# force LLVM to use AVX512 registers for vectorization +# https://reviews.llvm.org/D67259 +vlen2cpu_features = {2: '', 4: '', 8: '-prefer-256-bit'} # K: SVML functions, V: python functions which are expected to be SIMD-vectorized # using SVML, explicit references to Python functions here are mostly for sake of @@ -182,7 +185,7 @@ class TestSVMLGeneration(TestCase): def _inject_test(cls, dtype, mode, vlen, flags): # unsupported combinations if dtype.startswith('complex') and mode != 'numpy': - return + return # TODO: address skipped tests below skipped = dtype.startswith('int') and vlen == 2 args = (dtype, mode, vlen, flags) @@ -192,7 +195,7 @@ def test_template(self): fn, contains, avoids = combo_svml_usecase(*args) # look for specific patters in the asm for a given target with override_env_config('NUMBA_CPU_NAME', vlen2cpu[vlen]), \ - override_env_config('NUMBA_CPU_FEATURES', ''): + override_env_config('NUMBA_CPU_FEATURES', vlen2cpu_features[vlen]): # recompile for overridden CPU try: jit = compile_isolated(fn, (numba.int64, ), flags=flags) @@ -302,6 +305,9 @@ def check(self, pyfunc, *args, **kwargs): std_pattern = kwargs.pop('std_pattern', None) fast_pattern = kwargs.pop('fast_pattern', 
None) cpu_name = kwargs.pop('cpu_name', 'skylake-avx512') + # force LLVM to use AVX512 registers for vectorization + # https://reviews.llvm.org/D67259 + cpu_features = kwargs.pop('cpu_features', '-prefer-256-bit') # python result py_expected = pyfunc(*self.copy_args(*args)) @@ -318,7 +324,7 @@ def check(self, pyfunc, *args, **kwargs): # look for specific patters in the asm for a given target with override_env_config('NUMBA_CPU_NAME', cpu_name), \ - override_env_config('NUMBA_CPU_FEATURES', ''): + override_env_config('NUMBA_CPU_FEATURES', cpu_features): # recompile for overridden CPU jitstd, jitfast = self.compile(pyfunc, *args) if std_pattern: diff --git a/numba/tests/test_tuples.py b/numba/tests/test_tuples.py index dfd93ad0e2a..f0ae01167c2 100644 --- a/numba/tests/test_tuples.py +++ b/numba/tests/test_tuples.py @@ -4,7 +4,7 @@ import numpy as np from numba.core.compiler import compile_isolated -from numba import njit, jit, typeof +from numba import njit, jit, typeof, literally from numba.core import types, errors, utils from numba.tests.support import TestCase, MemoryLeakMixin, tag import unittest @@ -218,6 +218,17 @@ def test_len(self): [types.UniTuple(types.int64, 3)]) self.assertPreciseEqual(cr.entry_point((4, 5, 6)), 3) + def test_index_literal(self): + # issue #6023, test non-static getitem with IntegerLiteral index + def pyfunc(tup, idx): + idx = literally(idx) + return tup[idx] + cfunc = njit(pyfunc) + + tup = (4, 3.1, 'sss') + for i in range(len(tup)): + self.assertPreciseEqual(cfunc(tup, i), tup[i]) + def test_index(self): pyfunc = tuple_index cr = compile_isolated(pyfunc, diff --git a/setup.py b/setup.py index 485bf2c4916..1d0bacf6640 100644 --- a/setup.py +++ b/setup.py @@ -1,10 +1,11 @@ -from setuptools import setup, Extension, find_packages -from distutils.command import build -from distutils.spawn import spawn -from distutils import sysconfig -import sys import os import platform +import sys +from distutils import sysconfig +from distutils.command import build +from distutils.spawn import spawn + +from setuptools import Extension, find_packages, setup import versioneer @@ -174,11 +175,6 @@ def check_file_at_path(path2file): found = p # the latest is used return found - # Search for Intel TBB, first check env var TBBROOT then conda locations - tbb_root = os.getenv('TBBROOT') - if not tbb_root: - tbb_root = check_file_at_path(['include', 'tbb', 'tbb.h']) - # Set various flags for use in TBB and openmp. On OSX, also find OpenMP! 
have_openmp = True if sys.platform.startswith('win'): @@ -205,33 +201,42 @@ def check_file_at_path(path2file): else: omplinkflags = ['-fopenmp'] - if tbb_root: - print("Using Intel TBB from:", tbb_root) - ext_np_ufunc_tbb_backend = Extension( - name='numba.np.ufunc.tbbpool', - sources=[ - 'numba/np/ufunc/tbbpool.cpp', - 'numba/np/ufunc/gufunc_scheduler.cpp', - ], - depends=['numba/np/ufunc/workqueue.h'], - include_dirs=[os.path.join(tbb_root, 'include')], - extra_compile_args=cpp11flags, - libraries=['tbb'], # TODO: if --debug or -g, use 'tbb_debug' - library_dirs=[ - # for Linux - os.path.join(tbb_root, 'lib', 'intel64', 'gcc4.4'), - # for MacOS - os.path.join(tbb_root, 'lib'), - # for Windows - os.path.join(tbb_root, 'lib', 'intel64', 'vc_mt'), - ], - ) - ext_np_ufunc_backends.append(ext_np_ufunc_tbb_backend) + # Disable tbb if forced by user with NUMBA_DISABLE_TBB=1 + if os.getenv("NUMBA_DISABLE_TBB"): + print("TBB disabled") else: - print("TBB not found") + # Search for Intel TBB, first check env var TBBROOT then conda locations + tbb_root = os.getenv('TBBROOT') + if not tbb_root: + tbb_root = check_file_at_path(['include', 'tbb', 'tbb.h']) + + if tbb_root: + print("Using Intel TBB from:", tbb_root) + ext_np_ufunc_tbb_backend = Extension( + name='numba.np.ufunc.tbbpool', + sources=[ + 'numba/np/ufunc/tbbpool.cpp', + 'numba/np/ufunc/gufunc_scheduler.cpp', + ], + depends=['numba/np/ufunc/workqueue.h'], + include_dirs=[os.path.join(tbb_root, 'include')], + extra_compile_args=cpp11flags, + libraries=['tbb'], # TODO: if --debug or -g, use 'tbb_debug' + library_dirs=[ + # for Linux + os.path.join(tbb_root, 'lib', 'intel64', 'gcc4.4'), + # for MacOS + os.path.join(tbb_root, 'lib'), + # for Windows + os.path.join(tbb_root, 'lib', 'intel64', 'vc_mt'), + ], + ) + ext_np_ufunc_backends.append(ext_np_ufunc_tbb_backend) + else: + print("TBB not found") - # Disable OpenMP if forced by user with NUMBA_NO_OPENMP=1 - if os.getenv('NUMBA_NO_OPENMP'): + # Disable OpenMP if forced by user with NUMBA_DISABLE_OPENMP=1 + if os.getenv('NUMBA_DISABLE_OPENMP'): print("OpenMP disabled") elif have_openmp: print("Using OpenMP from:", have_openmp) @@ -293,10 +298,10 @@ def check_file_at_path(path2file): packages = find_packages(include=["numba", "numba.*"]) -build_requires = [f'numpy >={min_numpy_build_version}'] +build_requires = ['numpy >={}'.format(min_numpy_build_version)] install_requires = [ - f'llvmlite >={min_llvmlite_version},<{max_llvmlite_version}', - f'numpy >={min_numpy_run_version}', + 'llvmlite >={},<{}'.format(min_llvmlite_version, max_llvmlite_version), + 'numpy >={}'.format(min_numpy_run_version), 'setuptools', ] @@ -333,11 +338,11 @@ def check_file_at_path(path2file): scripts=["numba/pycc/pycc", "bin/numba"], author="Anaconda, Inc.", author_email="numba-users@continuum.io", - url="http://numba.github.com", + url="https://numba.github.com", packages=packages, setup_requires=build_requires, install_requires=install_requires, - python_requires=f">={min_python_version}", + python_requires=">={}".format(min_python_version), license="BSD", cmdclass=cmdclass, )
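
A minimal sketch of the threading-backend gating that the setup.py hunk above introduces, for reference when building from source. Only the environment variable names (``NUMBA_DISABLE_TBB``, ``TBBROOT``, ``NUMBA_DISABLE_OPENMP``) and the printed messages come from the diff; the helper names below are illustrative and the ``Extension`` construction is elided:

    import os

    def want_tbb_backend():
        # Illustrative helper: users can opt out of building the TBB
        # threading-layer backend entirely via NUMBA_DISABLE_TBB=1.
        if os.getenv("NUMBA_DISABLE_TBB"):
            print("TBB disabled")
            return None
        # Otherwise look for TBB via TBBROOT first; the real setup.py also
        # falls back to probing conda-style locations via check_file_at_path().
        tbb_root = os.getenv("TBBROOT")
        if tbb_root:
            print("Using Intel TBB from:", tbb_root)
        else:
            print("TBB not found")
        return tbb_root

    def want_openmp_backend(have_openmp):
        # Illustrative helper: NUMBA_NO_OPENMP was renamed to
        # NUMBA_DISABLE_OPENMP in this release.
        if os.getenv("NUMBA_DISABLE_OPENMP"):
            print("OpenMP disabled")
            return False
        return bool(have_openmp)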