diff --git a/CHANGE_LOG b/CHANGE_LOG
index d266d552a55..68b0e38e4ad 100644
--- a/CHANGE_LOG
+++ b/CHANGE_LOG
@@ -1,7 +1,242 @@
-Version 0.51.0
---------------
+Version 0.51.0 (August 12, 2020)
+--------------------------------
+
+This release continues to add new features to Numba and also contains a
+significant number of bug fixes and stability improvements.
+
+Highlights of core feature changes include:
+
+* The compilation chain is now based on LLVM 10 (Valentin Haenel).
+* Numba has internally switched to prefer non-literal types over literal ones so
+  as to reduce function over-specialisation, with a view to speeding up
+  compile times (Siu Kwan Lam).
+* On the CUDA target: Support for CUDA Toolkit 11, Ampere, and Compute
+  Capability 8.0; Printing of ``SASS`` code for kernels; Callbacks to Python
+  functions can be inserted into CUDA streams, and streams are async awaitable;
+  Atomic ``nanmin`` and ``nanmax`` functions are added; Fixes for various
+  miscompilations and segfaults. (mostly Graham Markall; callbacks on
+  streams by Peter Würtz).
+
+Intel also kindly sponsored research and development that led to some exciting
+new features:
+
+* Support for heterogeneous immutable lists and heterogeneous immutable string
+  key dictionaries. Also optional initial/construction value capturing for all
+  lists and dictionaries containing literal values (Stuart Archibald).
+* A new pass-by-reference mutable structure extension type ``StructRef`` (Siu
+  Kwan Lam).
+* Object mode blocks are now cacheable, with the side effect of numerous bug
+  fixes and performance improvements in caching. This also permits caching of
+  functions defined in closures (Siu Kwan Lam).
+
+Deprecations to note:
+
+To align with other targets, the ``argtypes`` and ``restypes`` kwargs to
+``@cuda.jit`` are now deprecated, as is the ``bind`` kwarg.
+Further, the ``target`` kwarg to the ``numba.jit`` decorator family is
+deprecated.
+
+General Enhancements:
+
+* PR #5463: Add str(int) impl
+* PR #5526: Impl. np.asarray(literal)
+* PR #5619: Add support for multi-output ufuncs
+* PR #5711: Division with timedelta input
+* PR #5763: Support minlength argument to np.bincount
+* PR #5779: Return zero array from np.dot when the arguments are empty.
+* PR #5796: Add implementation for np.positive
+* PR #5849: Setitem for records when index is StringLiteral, including literal
+  unroll
+* PR #5856: Add support for conversion of inplace_binop to parfor.
+* PR #5893: Allocate 1D iteration space one at a time for more even
+  distribution.
+* PR #5922: Reduce objmode and unpickling overhead
+* PR #5944: re-enable OpenMP in wheels
+* PR #5946: Implement literal dictionaries and lists.
+* PR #5956: Update numba_sysinfo.py
+* PR #5978: Add structref as a mutable struct that is pass-by-ref
+* PR #5980: Deprecate target kwarg for numba.jit.
+* PR #6058: Add prefer_literal option to overload API
+
+Fixes:
+
+* PR #5674: Fix #3955. 
Allow `with objmode` to be cached +* PR #5724: Initialize process lock lazily to prevent multiprocessing issue +* PR #5783: Make np.divide and np.remainder code more similar +* PR #5808: Fix 5665 Block jit(nopython=True, forceobj=True) and suppress + njit(forceobj=True) +* PR #5834: Fix the is operator on Ellipsis +* PR #5838: Ensure ``Dispatcher.__eq__`` always returns a bool +* PR #5841: cleanup: Use PythonAPI.bool_from_bool in more places +* PR #5862: Do not leak loop iteration variables into the numba.np.npyimpl + namespace +* PR #5869: Update repomap +* PR #5879: Fix erroneous input mutation in linalg routines +* PR #5882: Type check function in jit decorator +* PR #5925: Use np.inf and -np.inf for max and min float values respectively. +* PR #5935: Fix default arguments with multiprocessing +* PR #5952: Fix "Internal error ... local variable 'errstr' referenced before + assignment during BoundFunction(...)" +* PR #5962: Fix SVML tests with LLVM 10 and AVX512 +* PR #5972: fix flake8 for numba/runtests.py +* PR #5995: Update setup.py with new llvmlite versions +* PR #5996: Set lower bound for llvmlite to 0.33 +* PR #6004: Fix problem in branch pruning with LiteralStrKeyDict +* PR #6017: Fixing up numba_do_raise +* PR #6028: Fix #6023 +* PR #6031: Continue 5821 +* PR #6035: Fix overspecialize of literal +* PR #6046: Fixes statement reordering bug in maximize fusion step. +* PR #6056: Fix issue on invalid inlining of non-empty build_list by + inline_arraycall +* PR #6057: fix aarch64/python_3.8 failure on master +* PR #6070: Fix overspecialized containers +* PR #6071: Remove f-strings in setup.py +* PR #6072: Fix for #6005 +* PR #6073: Fixes invalid C prototype in helper function. +* PR #6078: Duplicate NumPy's PyArray_DescrCheck macro +* PR #6081: Fix issue with cross drive use and relpath. +* PR #6083: Fix bug in initial value unify. + +CUDA Enhancements/Fixes: + +* PR #5359: Remove special-casing of 0d arrays +* PR #5709: CUDA: Refactoring of cuda.jit and kernel / dispatcher abstractions +* PR #5732: CUDA Docs: document ``forall`` method of kernels +* PR #5745: CUDA stream callbacks and async awaitable streams +* PR #5761: Add implmentation for int types for isnan and isinf for CUDA +* PR #5819: Add support for CUDA 11 and Ampere / CC 8.0 +* PR #5826: CUDA: Add function to get SASS for kernels +* PR #5846: CUDA: Allow disabling NVVM optimizations, and fix debug issues +* PR #5851: CUDA EMM enhancements - add default get_ipc_handle implementation, + skip a test conditionally +* PR #5852: CUDA: Fix ``cuda.test()`` +* PR #5857: CUDA docs: Add notes on resetting the EMM plugin +* PR #5859: CUDA: Fix reduce docs and style improvements +* PR #6016: Fixes change of list spelling in a cuda test. 
+* PR #6020: CUDA: Fix #5820, adding atomic nanmin / nanmax +* PR #6030: CUDA: Don't optimize IR before sending it to NVVM +* PR #6052: Fix dtype for atomic_add_double testsuite +* PR #6080: CUDA: Prevent auto-upgrade of atomic intrinsics + +Documentation Updates: + +* PR #5782: Host docs on Read the Docs +* PR #5830: doc: Mention that caching uses pickle +* PR #5963: Fix broken link to numpy ufunc signature docs +* PR #5975: restructure communication section +* PR #5981: Document bounds-checking behavior in python deviations page +* PR #5993: Docs for structref +* PR #6008: Small fix so bullet points are rendered by sphinx +* PR #6013: emphasize cuda kernel functions are asynchronous +* PR #6036: Update deprecation doc from numba.errors to numba.core.errors +* PR #6062: Change references to numba.pydata.org to https + +CI updates: + +* PR #5850: Updates the "New Issue" behaviour to better redirect users. +* PR #5940: Add discourse badge +* PR #5960: Setting mypy on CI -In development +Enhancements from user contributed PRs (with thanks!): + +* Aisha Tammy added the ability to switch off TBB support at compile time in + #5821 (continued in #6031 by Stuart Archibald). +* Alexander Stiebing fixed a reference before assignment bug in #5952. +* Alexey Kozlov fixed a bug in tuple getitem for literals in #6028. +* Andrew Eckart updated the repomap in #5869, added support for Read the Docs + in #5782, fixed a bug in the ``np.dot`` implementation to correctly handle + empty arrays in #5779 and added support for ``minlength`` to ``np.bincount`` + in #5763. +* ``@bitsisbits`` updated ``numba_sysinfo.py`` to handle HSA agents correctly in + #5956. +* Daichi Suzuo Fixed a bug in the threading backend initialisation sequence such + that it is now correctly a lazy lock in #5724. +* Eric Wieser contributed a number of patches, particularly in enhancing and + improving the ``ufunc`` capabilities: + + * #5359: Remove special-casing of 0d arrays + * #5834: Fix the is operator on Ellipsis + * #5619: Add support for multi-output ufuncs + * #5841: cleanup: Use PythonAPI.bool_from_bool in more places + * #5862: Do not leak loop iteration variables into the numba.np.npyimpl + namespace + * #5838: Ensure ``Dispatcher.__eq__`` always returns a bool + * #5830: doc: Mention that caching uses pickle + * #5783: Make np.divide and np.remainder code more similar + +* Ethan Pronovost added a guard to prevent the common mistake of applying a jit + decorator to the same function twice in #5881. +* Graham Markall contributed many patches to the CUDA target, as follows: + + * #6052: Fix dtype for atomic_add_double tests + * #6030: CUDA: Don't optimize IR before sending it to NVVM + * #5846: CUDA: Allow disabling NVVM optimizations, and fix debug issues + * #5826: CUDA: Add function to get SASS for kernels + * #5851: CUDA EMM enhancements - add default get_ipc_handle implementation, + skip a test conditionally + * #5709: CUDA: Refactoring of cuda.jit and kernel / dispatcher abstractions + * #5819: Add support for CUDA 11 and Ampere / CC 8.0 + * #6020: CUDA: Fix #5820, adding atomic nanmin / nanmax + * #5857: CUDA docs: Add notes on resetting the EMM plugin + * #5859: CUDA: Fix reduce docs and style improvements + * #5852: CUDA: Fix ``cuda.test()`` + * #5732: CUDA Docs: document ``forall`` method of kernels + +* Guilherme Leobas added support for ``str(int)`` in #5463 and + ``np.asarray(literal value)``` in #5526. +* Hameer Abbasi deprecated the ``target`` kwarg for ``numba.jit`` in #5980. 
+* Hannes Pahl added a badge to the Numba github page linking to the new + discourse forum in #5940 and also fixed a bug that permitted illegal + combinations of flags to be passed into ``@jit`` in #5808. +* Kayran Schmidt emphasized that CUDA kernel functions are asynchronous in the + documentation in #6013. +* Leonardo Uieda fixed a broken link to the NumPy ufunc signature docs in #5963. +* Lucio Fernandez-Arjona added mypy to CI and started adding type annotations to + the code base in #5960, also fixed a (de)serialization problem on the + dispatcher in #5935, improved the undefined variable error message in #5876, + added support for division with timedelta input in #5711 and implemented + ``setitem`` for records when the index is a ``StringLiteral`` in #5849. +* Ludovic Tiako documented Numba's bounds-checking behavior in the python + deviations page in #5981. +* Matt Roeschke changed all ``http`` references ``https`` in #6062. +* ``@niteya-shah`` implemented ``isnan`` and ``isinf`` for integer types on the + CUDA target in #5761 and implemented ``np.positive`` in #5796. +* Peter Würtz added CUDA stream callbacks and async awaitable streams in #5745. +* ``@rht`` fixed an invalid import referred to in the deprecation documentation + in #6036. +* Sergey Pokhodenko updated the SVML tests for LLVM 10 in #5962. +* Shyam Saladi fixed a Sphinx rendering bug in #6008. + +Authors: + +* Aisha Tammy +* Alexander Stiebing +* Alexey Kozlov +* Andrew Eckart +* ``@bitsisbits`` +* Daichi Suzuo +* Eric Wieser +* Ethan Pronovost +* Graham Markall +* Guilherme Leobas +* Hameer Abbasi +* Hannes Pahl +* Kayran Schmidt +* Kozlov, Alexey +* Leonardo Uieda +* Lucio Fernandez-Arjona +* Ludovic Tiako +* Matt Roeschke +* ``@niteya-shah`` +* Peter Würtz +* Sergey Pokhodenko +* Shyam Saladi +* ``@rht`` +* Siu Kwan Lam (core dev) +* Stuart Archibald (core dev) +* Todd A. Anderson (core dev) +* Valentin Haenel (core dev) Version 0.50.1 (Jun 24, 2020) diff --git a/docs/source/conf.py b/docs/source/conf.py index 3db68875355..e1402297ae2 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -122,7 +122,7 @@ # All sphinx_rtd_theme options. Default values commented out; uncomment to # change. html_theme_options = { - 'canonical_url': 'http://numba.pydata.org/numba-doc/latest/index.html', + 'canonical_url': 'https://numba.pydata.org/numba-doc/latest/index.html', # 'logo_only': False, # 'display_version': True, # 'prev_next_buttons_location': 'bottom', diff --git a/docs/source/cuda-reference/host.rst b/docs/source/cuda-reference/host.rst index 317d1c0ab48..d69b5de4e48 100644 --- a/docs/source/cuda-reference/host.rst +++ b/docs/source/cuda-reference/host.rst @@ -178,7 +178,7 @@ Programming Guide Streams section Streams are instances of :class:`numba.cuda.cudadrv.driver.Stream`: .. autoclass:: numba.cuda.cudadrv.driver.Stream - :members: synchronize, auto_synchronize + :members: synchronize, auto_synchronize, add_callback, async_done To create a new stream: diff --git a/docs/source/cuda-reference/kernel.rst b/docs/source/cuda-reference/kernel.rst index 8ab9c06b2ca..cd6e4fb3370 100644 --- a/docs/source/cuda-reference/kernel.rst +++ b/docs/source/cuda-reference/kernel.rst @@ -57,8 +57,8 @@ Dispatcher objects also provide several utility methods for inspection and creating a specialized instance: .. 
autoclass:: numba.cuda.compiler.Dispatcher - :members: inspect_asm, inspect_llvm, inspect_types, specialize, specialized, - extensions + :members: inspect_asm, inspect_llvm, inspect_sass, inspect_types, + specialize, specialized, extensions Intrinsic Attributes and Functions diff --git a/docs/source/reference/deprecation.rst b/docs/source/reference/deprecation.rst index ecdd99688f7..1f43b11c305 100644 --- a/docs/source/reference/deprecation.rst +++ b/docs/source/reference/deprecation.rst @@ -254,3 +254,22 @@ This feature will be moved with respect to this schedule: * Deprecation warnings will be issued in version 0.49.0 * Support for importing from ``numba.jitclass`` will be removed in version 0.52.0. + +Deprecation of the target kwarg +=============================== +There have been a number of users attempting to use the ``target`` keyword +argument that's meant for internal use only. We are deprecating this argument, +as alternative solutions are available to achieve the same behaviour. + +Recommendations +--------------- +Update the ``jit`` decorator as follows: + +* Change ``@numba.jit(..., target='cuda')`` to ``numba.cuda.jit(...)``. + +Schedule +-------- +This feature will be moved with respect to this schedule: + +* Deprecation warnings will be issued in 0.51.0. +* The target kwarg will be removed in version 0.53.0. diff --git a/docs/source/reference/envvars.rst b/docs/source/reference/envvars.rst index 3e9f888a814..f9946b3b61d 100644 --- a/docs/source/reference/envvars.rst +++ b/docs/source/reference/envvars.rst @@ -3,6 +3,10 @@ Environment variables ===================== +.. note:: This section relates to environment variables that impact Numba's + runtime, for compile time environment variables see + :ref:`numba-source-install-env_vars`. + Numba allows its behaviour to be changed through the use of environment variables. Unless otherwise mentioned, those variables have integer values and default to zero. diff --git a/docs/source/user/installing.rst b/docs/source/user/installing.rst index aa5c71df27c..37a4ffc47ac 100644 --- a/docs/source/user/installing.rst +++ b/docs/source/user/installing.rst @@ -170,6 +170,37 @@ Then you can build and install Numba from the top level of the source tree:: $ python setup.py install +.. _numba-source-install-env_vars: + +Build time environment variables and configuration of optional components +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Below are environment variables that are applicable to altering how Numba would +otherwise build by default along with information on configuration options. + +.. envvar:: NUMBA_DISABLE_OPENMP (default: not set) + + To disable compilation of the OpenMP threading backend set this environment + variable to a non-empty string when building. If not set (default): + + * For Linux and Windows it is necessary to provide OpenMP C headers and + runtime libraries compatible with the compiler tool chain mentioned above, + and for these to be accessible to the compiler via standard flags. + * For OSX the conda packages ``llvm-openmp`` and ``intel-openmp`` provide + suitable C headers and libraries. If the compilation requirements are not + met the OpenMP threading backend will not be compiled + +.. envvar:: NUMBA_DISABLE_TBB (default: not set) + + To disable the compilation of the TBB threading backend set this environment + variable to a non-empty string when building. If not set (default) the TBB C + headers and libraries must be available at compile time. 
If building with + ``conda build`` this requirement can be met by installing the ``tbb-devel`` + package. If not building with ``conda build`` the requirement can be met via a + system installation of TBB or through the use of the ``TBBROOT`` environment + variable to provide the location of the TBB installation. For more + information about setting ``TBBROOT`` see the `Intel documentation `_. + .. _numba-source-install-check: Dependency List @@ -177,7 +208,7 @@ Dependency List Numba has numerous required and optional dependencies which additionally may vary with target operating system and hardware. The following lists them all -(as of September 2019). +(as of July 2020). * Required build time: @@ -185,26 +216,24 @@ vary with target operating system and hardware. The following lists them all * ``numpy`` * ``llvmlite`` * Compiler toolchain mentioned above - * OpenMP C headers and runtime libraries compatible with the compiler - toolchain mentioned above and accessible to the compiler via standard flags - (Linux, Windows). + +* Required run time: + + * ``setuptools`` + * ``numpy`` + * ``llvmlite`` * Optional build time: + See :ref:`numba-source-install-env_vars` for more details about additional + options for the configuration and specification of these optional components. + * ``llvm-openmp`` (OSX) - provides headers for compiling OpenMP support into Numba's threading backend * ``intel-openmp`` (OSX) - provides OpenMP library support for Numba's threading backend. * ``tbb-devel`` - provides TBB headers/libraries for compiling TBB support into Numba's threading backend - * ``pickle5`` - provides Python 3.8 pickling features for faster pickling in - Python 3.6 and 3.7. - -* Required run time: - - * ``setuptools`` - * ``numpy`` - * ``llvmlite`` * Optional runtime are: @@ -234,6 +263,8 @@ vary with target operating system and hardware. The following lists them all inspection. `See here `_ for information on obtaining and installing. * ``graphviz`` - for some CFG inspection functionality. + * ``pickle5`` - provides Python 3.8 pickling features for faster pickling in + Python 3.6 and 3.7. * To build the documentation: @@ -299,4 +330,3 @@ further information. pci bus id: 1 (output truncated due to length) - diff --git a/numba/_helperlib.c b/numba/_helperlib.c index ad698cdf5b2..399c6c10415 100644 --- a/numba/_helperlib.c +++ b/numba/_helperlib.c @@ -825,143 +825,17 @@ static void traceback_add(const char *funcname, const char *filename, int lineno _PyErr_ChainExceptions(exc, val, tb); } -/* Logic for raising an arbitrary object. Adapted from CPython's ceval.c. - This *consumes* a reference count to its argument. */ -NUMBA_EXPORT_FUNC(int) -numba_do_raise(PyObject *exc_packed) -{ - PyObject *exc = NULL, *type = NULL, *value = NULL, *loc = NULL; + +/* + * Add traceback information to *loc* to the active exception. + * loc can be NULL, which causes this function to become a no-op. + */ +static +void traceback_add_loc(PyObject *loc) { const char *function_name_str = NULL, *filename_str = NULL; PyObject *function_name = NULL, *filename = NULL, *lineno = NULL; Py_ssize_t pos; - /* We support the following forms of raise: - raise - raise - raise */ - - /* could be a tuple from npm (some exc like thing, args, location) */ - if (PyTuple_CheckExact(exc_packed)) { - /* Unpack a (class/inst/tuple, arguments, location) tuple. 
*/ - if (!PyArg_ParseTuple(exc_packed, "OOO", &exc, &value, &loc)) { - Py_DECREF(exc_packed); - goto raise_error_w_loc; - } - - if (exc == Py_None) { - /* Reraise */ - PyThreadState *tstate = PyThreadState_GET(); - PyObject *tb; -#if (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION >= 7) - _PyErr_StackItem *tstate_exc = tstate->exc_info; -#else - PyThreadState *tstate_exc = tstate; -#endif - Py_DECREF(exc_packed); - type = tstate_exc->exc_type; - value = tstate_exc->exc_value; - tb = tstate_exc->exc_traceback; - if (type == Py_None) { - PyErr_SetString(PyExc_RuntimeError, - "No active exception to reraise"); - return 0; - } - Py_XINCREF(type); - Py_XINCREF(value); - Py_XINCREF(tb); - PyErr_Restore(type, value, tb); - return 1; - } - - /* the unpacked exc should be a class, value and loc are set from above - */ - Py_XINCREF(value); - Py_XINCREF(loc); - if (PyExceptionClass_Check(exc)) { - /* It is a class, type used here just as a tmp var */ - type = PyObject_CallObject(exc, value); - if (type == NULL) - goto raise_error_w_loc; - if (!PyExceptionInstance_Check(type)) { - PyErr_SetString(PyExc_TypeError, - "exceptions must derive from BaseException"); - goto raise_error_w_loc; - } - /* all ok, set type to the exc */ - Py_DECREF(type); - type = exc; - } else { - /* this should be unreachable as typing should catch it */ - /* Not something you can raise. You get an exception - anyway, just not what you specified :-) */ - Py_DECREF(exc_packed); - PyErr_SetString(PyExc_TypeError, - "exceptions must derive from BaseException"); - goto raise_error_w_loc; - } - - /* as this branch is exited: - * - type should be an exception class - * - value should be the args for the exception class instantiation - * - loc should be the location information (or None) - */ - } else { /* could be a reraise or an exception from objmode */ - exc = exc_packed; - if (exc == Py_None) { - /* Reraise */ - PyThreadState *tstate = PyThreadState_GET(); - PyObject *tb; -#if (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION >= 7) - _PyErr_StackItem *tstate_exc = tstate->exc_info; -#else - PyThreadState *tstate_exc = tstate; -#endif - Py_DECREF(exc); - type = tstate_exc->exc_type; - value = tstate_exc->exc_value; - tb = tstate_exc->exc_traceback; - if (type == Py_None) { - PyErr_SetString(PyExc_RuntimeError, - "No active exception to reraise"); - return 0; - } - Py_XINCREF(type); - Py_XINCREF(value); - Py_XINCREF(tb); - PyErr_Restore(type, value, tb); - return 1; - } - - /* exc should be an exception class or an instance of an exception */ - if (PyExceptionClass_Check(exc)) { - type = exc; - value = PyObject_CallObject(exc, value); - if (value == NULL) - goto raise_error; - if (!PyExceptionInstance_Check(value)) { - PyErr_SetString(PyExc_TypeError, - "exceptions must derive from BaseException"); - goto raise_error; - } - } - else if (PyExceptionInstance_Check(exc)) { - value = exc; - type = PyExceptionInstance_Class(exc); - Py_INCREF(type); - } - else { - /* Not something you can raise. 
You get an exception - anyway, just not what you specified :-) */ - Py_DECREF(exc); // exc points to exc_packed - PyErr_SetString(PyExc_TypeError, - "exceptions must derive from BaseException"); - goto raise_error; - } - } - - PyErr_SetObject(type, value); - -raise_error_w_loc: /* instance is instantiated/internal exception is raised, if loc is present * add a frame for it into the traceback */ if(loc && loc != Py_None && PyTuple_Check(loc)) @@ -977,31 +851,141 @@ numba_do_raise(PyObject *exc_packed) traceback_add(function_name_str, filename_str, \ (int)PyLong_AsLong(lineno)); } +} - /* PyErr_SetObject incref's its arguments */ - Py_XDECREF(value); - Py_XDECREF(type); - return 0; +/** + * Re-raise the current active exception. + * Called internal by process_raise() when *exc* is None. + */ +static +int reraise_exc_is_none(void) { + /* Reraise */ + PyThreadState *tstate = PyThreadState_GET(); + PyObject *tb, *type, *value; +#if (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION >= 7) + _PyErr_StackItem *tstate_exc = tstate->exc_info; +#else + PyThreadState *tstate_exc = tstate; +#endif + type = tstate_exc->exc_type; + value = tstate_exc->exc_value; + tb = tstate_exc->exc_traceback; + if (type == Py_None) { + PyErr_SetString(PyExc_RuntimeError, + "No active exception to reraise"); + return 0; + } + /* incref needed because PyErr_Restore DOES NOT */ + Py_XINCREF(type); + Py_XINCREF(value); + Py_XINCREF(tb); + PyErr_Restore(type, value, tb); + return 1; +} -raise_error: - Py_XDECREF(value); - Py_XDECREF(type); - return 0; +/* + * Set exception given the Exception type and the constructor argument. + * Equivalent to ``raise exc(value)``. + * PyExceptionClass_Check(exc) must be True. + * value can be NULL. + */ +static +int process_exception_class(PyObject *exc, PyObject *value) { + PyObject *type; + /* It is a class, type used here just as a tmp var */ + type = PyObject_CallObject(exc, value); + if (type == NULL){ + return 0; + } + if (!PyExceptionInstance_Check(type)) { + PyErr_SetString(PyExc_TypeError, + "exceptions must derive from BaseException"); + Py_DECREF(type); + return 0; + } + /* all ok, set type to the exc */ + Py_DECREF(type); + type = exc; + PyErr_SetObject(type, value); + return 1; } +/* + * Internal routine to process exceptions. + * exc cannot be NULL. It can be a None, Exception type, or Exception instance. + * value can be NULL for absent, or any PyObject valid for the exception. + */ +static +int process_raise(PyObject *exc, PyObject *value) { + /* exc is None */ + if (exc == Py_None) { + return reraise_exc_is_none(); + } + /* exc should be an exception class */ + else if (PyExceptionClass_Check(exc)) { + return process_exception_class(exc, value); + } + /* exc is an instance of an Exception */ + else if (PyExceptionInstance_Check(exc)) { + PyObject *type = PyExceptionInstance_Class(exc); + PyErr_SetObject(type, exc); + return 0; + } + else { + /* Not something you can raise. You get an exception + anyway, just not what you specified :-) */ + PyErr_SetString(PyExc_TypeError, + "exceptions must derive from BaseException"); + return 0; + } +} + +/* Logic for raising an arbitrary object. Adapted from CPython's ceval.c. + This *consumes* a reference count to its argument. 
*/ +NUMBA_EXPORT_FUNC(int) +numba_do_raise(PyObject *exc_packed) +{ + int status; + PyObject *exc = NULL, *value = NULL, *loc = NULL; + + /* We support the following forms of raise: + raise + raise + raise */ + + /* could be a tuple from npm (some exc like thing, args, location) */ + if (PyTuple_CheckExact(exc_packed)) { + /* Unpack a (class/inst/tuple, arguments, location) tuple. */ + if (!PyArg_ParseTuple(exc_packed, "OOO", &exc, &value, &loc)) { + traceback_add_loc(loc); + return 0; + } + } else { + /* could be a reraise or an exception from objmode */ + exc = exc_packed; + /* branch exit with value = NULL and loc = NULL */ + } + /* value is either NULL or borrowed */ + status = process_raise(exc, value); + traceback_add_loc(loc); + Py_DECREF(exc_packed); + return status; +} + + NUMBA_EXPORT_FUNC(PyObject *) -numba_unpickle(const char *data, int n) +numba_unpickle(const char *data, int n, const char *hashed) { - PyObject *buf, *obj; - static PyObject *loads; + PyObject *buf=NULL, *obj=NULL, *addr=NULL, *hashedbuf=NULL; + static PyObject *loads=NULL; /* Caching the pickle.loads function shaves a couple µs here. */ if (loads == NULL) { PyObject *picklemod; - picklemod = PyImport_ImportModule("pickle"); + picklemod = PyImport_ImportModule("numba.core.serialize"); if (picklemod == NULL) return NULL; - loads = PyObject_GetAttrString(picklemod, "loads"); + loads = PyObject_GetAttrString(picklemod, "_numba_unpickle"); Py_DECREF(picklemod); if (loads == NULL) return NULL; @@ -1010,7 +994,17 @@ numba_unpickle(const char *data, int n) buf = PyBytes_FromStringAndSize(data, n); if (buf == NULL) return NULL; - obj = PyObject_CallFunctionObjArgs(loads, buf, NULL); + /* SHA1 produces 160 bit or 20 bytes */ + hashedbuf = PyBytes_FromStringAndSize(hashed, 20); + if (hashedbuf == NULL) + goto error; + addr = PyLong_FromVoidPtr((void*)data); + if (addr == NULL) + goto error; + obj = PyObject_CallFunctionObjArgs(loads, addr, buf, hashedbuf, NULL); +error: + Py_XDECREF(addr); + Py_XDECREF(hashedbuf); Py_DECREF(buf); return obj; } diff --git a/numba/_numba_common.h b/numba/_numba_common.h index f406db58317..c5e67d9c6a3 100644 --- a/numba/_numba_common.h +++ b/numba/_numba_common.h @@ -17,4 +17,23 @@ #define VISIBILITY_HIDDEN #endif +/* + * Numba's version of the PyArray_DescrCheck macro from NumPy, use it as a + * direct replacement of NumPy's PyArray_DescrCheck to ensure binary + * compatibility. + * + * Details of why this is needed: + * NumPy 1.18 changed the definition of the PyArray_DescrCheck macro here: + * https://github.com/numpy/numpy/commit/6108b5d1e138d07e3c9f2a4e3b1933749ad0e698 + * the result of this being that building against NumPy <1.18 would prevent + * Numba running against NumPy >= 1.20 as noted here: + * https://github.com/numba/numba/issues/6041#issuecomment-665132199 + * + * This macro definition is copied from: + * https://github.com/numpy/numpy/commit/6108b5d1e138d07e3c9f2a4e3b1933749ad0e698#diff-ad2213da23136c5fc5883d9eb2d88666R26 + * + * NOTE: This is the NumPy 1.18 and above version of the macro. 
+ */ +#define NUMBA_PyArray_DescrCheck(op) PyObject_TypeCheck(op, &PyArrayDescr_Type) + #endif /* NUMBA_COMMON_H_ */ diff --git a/numba/_typeof.c b/numba/_typeof.c index 242150cccd0..ffe0e3a3c58 100644 --- a/numba/_typeof.c +++ b/numba/_typeof.c @@ -4,6 +4,7 @@ #include #include +#include "_numba_common.h" #include "_typeof.h" #include "_hashtable.h" @@ -372,7 +373,7 @@ compute_fingerprint(string_writer_t *w, PyObject *val) PyBuffer_Release(&buf); return 0; } - if (PyArray_DescrCheck(val)) { + if (NUMBA_PyArray_DescrCheck(val)) { TRY(string_writer_put_char, w, OP_NP_DTYPE); return compute_dtype_fingerprint(w, (PyArray_Descr *) val); } diff --git a/numba/core/analysis.py b/numba/core/analysis.py index ebf4c0c87c9..96cb2014cd9 100644 --- a/numba/core/analysis.py +++ b/numba/core/analysis.py @@ -608,6 +608,13 @@ def find_literally_calls(func_ir, argtypes): first_loc.setdefault(argindex, assign.loc) # Signal the dispatcher to force literal typing for pos in marked_args: - if not isinstance(argtypes[pos], (types.Literal, types.InitialValue)): + query_arg = argtypes[pos] + do_raise = (isinstance(query_arg, types.InitialValue) and + query_arg.initial_value is None) + if do_raise: + loc = first_loc[pos] + raise errors.ForceLiteralArg(marked_args, loc=loc) + + if not isinstance(query_arg, (types.Literal, types.InitialValue)): loc = first_loc[pos] raise errors.ForceLiteralArg(marked_args, loc=loc) diff --git a/numba/core/callconv.py b/numba/core/callconv.py index ec154128887..7a7bb85feb3 100644 --- a/numba/core/callconv.py +++ b/numba/core/callconv.py @@ -316,8 +316,9 @@ def get_exception(self, exc_id): msg = "unknown error %d in native function" % exc_id return SystemError, (msg,) - -excinfo_t = ir.LiteralStructType([GENERIC_POINTER, int32_t]) +# The structure type constructed by PythonAPI.serialize_uncached() +# i.e a {i8* pickle_buf, i32 pickle_bufsz, i8* hash_buf} +excinfo_t = ir.LiteralStructType([GENERIC_POINTER, int32_t, GENERIC_POINTER]) excinfo_ptr_t = ir.PointerType(excinfo_t) diff --git a/numba/core/codegen.py b/numba/core/codegen.py index 05a68e10d99..1e05c12bacd 100644 --- a/numba/core/codegen.py +++ b/numba/core/codegen.py @@ -742,7 +742,7 @@ def _check_llvm_bugs(self): raise RuntimeError( "LLVM will produce incorrect floating-point code " "in the current locale %s.\nPlease read " - "http://numba.pydata.org/numba-doc/latest/user/faq.html#llvm-locale-bug " + "https://numba.pydata.org/numba-doc/latest/user/faq.html#llvm-locale-bug " "for more information." 
% (loc,)) raise AssertionError("Unexpected IR:\n%s\n" % (ir_out,)) diff --git a/numba/core/cpu_options.py b/numba/core/cpu_options.py index 758f95b8479..c8019949a64 100644 --- a/numba/core/cpu_options.py +++ b/numba/core/cpu_options.py @@ -50,6 +50,7 @@ def __init__(self, value): self.enabled = value self.comprehension = value self.reduction = value + self.inplace_binop = value self.setitem = value self.numpy = value self.stencil = value @@ -59,6 +60,7 @@ def __init__(self, value): self.enabled = True self.comprehension = value.pop('comprehension', True) self.reduction = value.pop('reduction', True) + self.inplace_binop = value.pop('inplace_binop', True) self.setitem = value.pop('setitem', True) self.numpy = value.pop('numpy', True) self.stencil = value.pop('stencil', True) diff --git a/numba/core/debuginfo.py b/numba/core/debuginfo.py index 23f49d2b655..b26c7f30219 100644 --- a/numba/core/debuginfo.py +++ b/numba/core/debuginfo.py @@ -31,6 +31,12 @@ def mark_subprogram(self, function, name, loc): """ pass + @abc.abstractmethod + def initialize(self): + """Initialize the debug info. An opportunity for the debuginfo to + prepare any necessary data structures. + """ + @abc.abstractmethod def finalize(self): """Finalize the debuginfo by emitting all necessary metadata. @@ -52,6 +58,9 @@ def mark_location(self, builder, loc): def mark_subprogram(self, function, name, loc): pass + def initialize(self): + pass + def finalize(self): pass @@ -66,6 +75,11 @@ def __init__(self, module, filepath): self.filepath = os.path.abspath(filepath) self.difile = self._di_file() self.subprograms = [] + self.initialize() + + def initialize(self): + # Create the compile unit now because it is referenced when + # constructing subprograms self.dicompileunit = self._di_compile_unit() def _var_type(self, lltype, size): @@ -302,6 +316,7 @@ def _di_file(self): def _di_compile_unit(self): filepair = self._filepair() empty = self.module.add_metadata([self._const_int(0)]) + sp_metadata = self.module.add_metadata(self.subprograms) return self.module.add_metadata([ self._const_int(self.DI_Compile_unit), # tag filepair, # source directory and file pair @@ -374,3 +389,11 @@ def _di_location(self, line): None, # original scope ]) + def initialize(self): + pass + + def finalize(self): + # We create the compile unit at this point because subprograms is + # populated and can be referred to by the compile unit. + self.dicompileunit = self._di_compile_unit() + super().finalize() diff --git a/numba/core/decorators.py b/numba/core/decorators.py index e1870091ba6..cfe91168969 100644 --- a/numba/core/decorators.py +++ b/numba/core/decorators.py @@ -23,7 +23,7 @@ "Signatures should be passed as the first " "positional argument.") -def jit(signature_or_function=None, locals={}, target='cpu', cache=False, +def jit(signature_or_function=None, locals={}, cache=False, pipeline_class=None, boundscheck=False, **options): """ This decorator is used to compile a Python function into native code. @@ -41,7 +41,7 @@ def jit(signature_or_function=None, locals={}, target='cpu', cache=False, Mapping of local variable names to Numba types. Used to override the types deduced by Numba's type inference engine. - target: str + target (deprecated): str Specifies the target platform to compile for. Valid targets are cpu, gpu, npyufunc, and cuda. Defaults to cpu. 
@@ -145,6 +145,11 @@ def bar(x, y): raise DeprecationError(_msg_deprecated_signature_arg.format('restype')) if options.get('nopython', False) and options.get('forceobj', False): raise ValueError("Only one of 'nopython' or 'forceobj' can be True.") + if 'target' in options: + target = options.pop('target') + warnings.warn("The 'target' keyword argument is deprecated.", NumbaDeprecationWarning) + else: + target = options.pop('_target', 'cpu') options['boundscheck'] = boundscheck diff --git a/numba/core/errors.py b/numba/core/errors.py index 46426a9e002..3417a86777b 100644 --- a/numba/core/errors.py +++ b/numba/core/errors.py @@ -336,9 +336,9 @@ def termcolor(): please file a feature request at: https://github.com/numba/numba/issues/new To see Python/NumPy features supported by the latest release of Numba visit: -http://numba.pydata.org/numba-doc/latest/reference/pysupported.html +https://numba.pydata.org/numba-doc/latest/reference/pysupported.html and -http://numba.pydata.org/numba-doc/latest/reference/numpysupported.html +https://numba.pydata.org/numba-doc/latest/reference/numpysupported.html """ constant_inference_info = """ @@ -347,7 +347,7 @@ def termcolor(): however please first check that your code is valid for compilation, particularly with respect to string interpolation (not supported!) and the requirement of compile time constants as arguments to exceptions: -http://numba.pydata.org/numba-doc/latest/reference/pysupported.html?highlight=exceptions#constructs +https://numba.pydata.org/numba-doc/latest/reference/pysupported.html?highlight=exceptions#constructs If the code is valid and the unsupported functionality is important to you please file a feature request at: https://github.com/numba/numba/issues/new @@ -360,12 +360,12 @@ def termcolor(): the use of unsupported features or an issue in resolving types. To see Python/NumPy features supported by the latest release of Numba visit: -http://numba.pydata.org/numba-doc/latest/reference/pysupported.html +https://numba.pydata.org/numba-doc/latest/reference/pysupported.html and -http://numba.pydata.org/numba-doc/latest/reference/numpysupported.html +https://numba.pydata.org/numba-doc/latest/reference/numpysupported.html For more information about typing errors and how to debug them visit: -http://numba.pydata.org/numba-doc/latest/user/troubleshoot.html#my-code-doesn-t-compile +https://numba.pydata.org/numba-doc/latest/user/troubleshoot.html#my-code-doesn-t-compile If you think your code should work with Numba, please report the error message and traceback, along with a minimal reproducer at: diff --git a/numba/core/extending.py b/numba/core/extending.py index 96f90a6df33..8d8d8525e21 100644 --- a/numba/core/extending.py +++ b/numba/core/extending.py @@ -55,7 +55,8 @@ def generic(self): _overload_default_jit_options = {'no_cpython_wrapper': True} -def overload(func, jit_options={}, strict=True, inline='never'): +def overload(func, jit_options={}, strict=True, inline='never', + prefer_literal=False): """ A decorator marking the decorated function as typing and implementing *func* in nopython mode. @@ -101,6 +102,12 @@ def len_impl(seq): holds the information from the callee. The function should return Truthy to determine whether to inline, this essentially permitting custom inlining rules (typical use might be cost models). + + The *prefer_literal* option allows users to control if literal types should + be tried first or last. The default (`False`) is to use non-literal types. 
+ Implementations that can specialize based on literal values should set the + option to `True`. Note, this option maybe expanded in the near future to + allow for more control (e.g. disabling non-literal types). """ from numba.core.typing.templates import make_overload_template, infer_global @@ -110,7 +117,7 @@ def len_impl(seq): def decorate(overload_func): template = make_overload_template(func, overload_func, opts, strict, - inline) + inline, prefer_literal) infer(template) if callable(func): infer_global(func, types.Function(template)) @@ -207,6 +214,7 @@ def decorate(overload_func): template = make_overload_method_template( typ, attr, overload_func, inline=kwargs.get('inline', 'never'), + prefer_literal=kwargs.get('prefer_literal', False) ) infer_getattr(template) overload(overload_func, **kwargs)(overload_func) diff --git a/numba/core/inline_closurecall.py b/numba/core/inline_closurecall.py index 30cb64c4d51..be56fe05162 100644 --- a/numba/core/inline_closurecall.py +++ b/numba/core/inline_closurecall.py @@ -939,6 +939,8 @@ def _inline_arraycall(func_ir, cfg, visited, loop, swapped, enable_prange=False, list_var_def = get_definition(func_ir, list_var_def.value) # Check if the definition is a build_list require(isinstance(list_var_def, ir.Expr) and list_var_def.op == 'build_list') + # The build_list must be empty + require(len(list_var_def.items) == 0) # Look for list_append in "last" block in loop body, which should be a block that is # a post-dominator of the loop header. diff --git a/numba/core/ir_utils.py b/numba/core/ir_utils.py index 96a74095398..1d58c5c8b5b 100644 --- a/numba/core/ir_utils.py +++ b/numba/core/ir_utils.py @@ -2003,7 +2003,7 @@ def raise_on_unsupported_feature(func_ir, typemap): "in a function is unsupported (strange things happen!), use " "numba.gdb_breakpoint() to create additional breakpoints " "instead.\n\nRelevant documentation is available here:\n" - "http://numba.pydata.org/numba-doc/latest/user/troubleshoot.html" + "https://numba.pydata.org/numba-doc/latest/user/troubleshoot.html" "/troubleshoot.html#using-numba-s-direct-gdb-bindings-in-" "nopython-mode\n\nConflicting calls found at:\n %s") buf = '\n'.join([x.strformat() for x in gdb_calls]) @@ -2021,7 +2021,7 @@ def warn_deprecated(func_ir, typemap): arg = name.split('.')[1] fname = func_ir.func_id.func_qualname tyname = 'list' if isinstance(ty, types.List) else 'set' - url = ("http://numba.pydata.org/numba-doc/latest/reference/" + url = ("https://numba.pydata.org/numba-doc/latest/reference/" "deprecation.html#deprecation-of-reflection-for-list-and" "-set-types") msg = ("\nEncountered the use of a type that is scheduled for " diff --git a/numba/core/lowering.py b/numba/core/lowering.py index 1cd0bb5d831..1c9c19cd3b1 100644 --- a/numba/core/lowering.py +++ b/numba/core/lowering.py @@ -465,7 +465,8 @@ def lower_setitem(self, target_var, index_var, value_var, signature): target = self.context.cast(self.builder, target, targetty, targetty.type) else: - assert targetty == signature.args[0] + ul = types.unliteral + assert ul(targetty) == ul(signature.args[0]) index = self.context.cast(self.builder, index, indexty, signature.args[1]) diff --git a/numba/core/object_mode_passes.py b/numba/core/object_mode_passes.py index 99748e8b5a6..ce302a4b5a7 100644 --- a/numba/core/object_mode_passes.py +++ b/numba/core/object_mode_passes.py @@ -177,7 +177,7 @@ def backend_object_mode(): warnings.warn(errors.NumbaWarning(warn_msg, state.func_ir.loc)) - url = ("http://numba.pydata.org/numba-doc/latest/reference/" + url = 
("https://numba.pydata.org/numba-doc/latest/reference/" "deprecation.html#deprecation-of-object-mode-fall-" "back-behaviour-when-using-jit") msg = ("\nFall-back from the nopython compilation path to the " diff --git a/numba/core/pylowering.py b/numba/core/pylowering.py index b5783ceb748..2b8aa8c6002 100644 --- a/numba/core/pylowering.py +++ b/numba/core/pylowering.py @@ -5,6 +5,7 @@ import builtins import operator +import inspect from llvmlite.llvmpy.core import Type, Constant import llvmlite.llvmpy.core as lc @@ -75,10 +76,6 @@ def init(self): def pre_lower(self): super(PyLower, self).pre_lower() self.init_pyapi() - # Pre-computed for later use - from numba.core.dispatcher import OmittedArg - self.omitted_typobj = self.pyapi.unserialize( - self.pyapi.serialize_object(OmittedArg)) def post_lower(self): pass @@ -170,6 +167,15 @@ def lower_inst(self, inst): else: raise NotImplementedError(type(inst), inst) + @utils.cached_property + def _omitted_typobj(self): + """Return a `OmittedArg` type instance as a LLVM value suitable for + testing at runtime. + """ + from numba.core.dispatcher import OmittedArg + return self.pyapi.unserialize( + self.pyapi.serialize_object(OmittedArg)) + def lower_assign(self, inst): """ The returned object must have a new reference @@ -188,21 +194,28 @@ def lower_assign(self, inst): elif isinstance(value, ir.Yield): return self.lower_yield(value) elif isinstance(value, ir.Arg): + param = self.func_ir.func_id.pysig.parameters.get(value.name) + obj = self.fnargs[value.index] - # When an argument is omitted, the dispatcher hands it as - # _OmittedArg() - typobj = self.pyapi.get_type(obj) slot = cgutils.alloca_once_value(self.builder, obj) - is_omitted = self.builder.icmp_unsigned('==', typobj, - self.omitted_typobj) - with self.builder.if_else(is_omitted, likely=False) as (omitted, present): - with present: - self.incref(obj) - self.builder.store(obj, slot) - with omitted: - # The argument is omitted => get the default value - obj = self.pyapi.object_getattr_string(obj, 'value') - self.builder.store(obj, slot) + # Don't check for OmittedArg unless the argument has a default + if param is not None and param.default is inspect.Parameter.empty: + self.incref(obj) + self.builder.store(obj, slot) + else: + # When an argument is omitted, the dispatcher hands it as + # _OmittedArg() + typobj = self.pyapi.get_type(obj) + is_omitted = self.builder.icmp_unsigned('==', typobj, + self._omitted_typobj) + with self.builder.if_else(is_omitted, likely=False) as (omitted, present): + with present: + self.incref(obj) + self.builder.store(obj, slot) + with omitted: + # The argument is omitted => get the default value + obj = self.pyapi.object_getattr_string(obj, 'value') + self.builder.store(obj, slot) return self.builder.load(slot) else: diff --git a/numba/core/pythonapi.py b/numba/core/pythonapi.py index 0d85e0e17e6..93a960849b5 100644 --- a/numba/core/pythonapi.py +++ b/numba/core/pythonapi.py @@ -1,6 +1,7 @@ from collections import namedtuple import contextlib import pickle +import hashlib from llvmlite import ir from llvmlite.llvmpy.core import Type, Constant @@ -1298,27 +1299,37 @@ def unserialize(self, structptr): Unserialize some data. *structptr* should be a pointer to a {i8* data, i32 length} structure. 
""" - fnty = Type.function(self.pyobj, (self.voidptr, ir.IntType(32))) + fnty = Type.function(self.pyobj, + (self.voidptr, ir.IntType(32), self.voidptr)) fn = self._get_function(fnty, name="numba_unpickle") ptr = self.builder.extract_value(self.builder.load(structptr), 0) n = self.builder.extract_value(self.builder.load(structptr), 1) - return self.builder.call(fn, (ptr, n)) + hashed = self.builder.extract_value(self.builder.load(structptr), 2) + return self.builder.call(fn, (ptr, n, hashed)) def serialize_uncached(self, obj): """ Same as serialize_object(), but don't create a global variable, - simply return a literal {i8* data, i32 length} structure. + simply return a literal {i8* data, i32 length, i8* hashbuf} structure. """ # First make the array constant data = serialize.dumps(obj) assert len(data) < 2**31 name = ".const.pickledata.%s" % (id(obj) if config.DIFF_IR == 0 else "DIFF_IR") bdata = cgutils.make_bytearray(data) + # Make SHA1 hash on the pickled content + # NOTE: update buffer size in numba_unpickle() when changing the + # hash algorithm. + hashed = cgutils.make_bytearray(hashlib.sha1(data).digest()) arr = self.context.insert_unique_const(self.module, name, bdata) + hasharr = self.context.insert_unique_const( + self.module, f"{name}.sha1", hashed, + ) # Then populate the structure constant struct = ir.Constant.literal_struct([ arr.bitcast(self.voidptr), ir.Constant(ir.IntType(32), arr.type.pointee.count), + hasharr.bitcast(self.voidptr), ]) return struct diff --git a/numba/core/runtime/_nrt_python.c b/numba/core/runtime/_nrt_python.c index 97f7f19efb3..c3053bda134 100644 --- a/numba/core/runtime/_nrt_python.c +++ b/numba/core/runtime/_nrt_python.c @@ -10,6 +10,7 @@ #include #include "../../_arraystruct.h" +#include "../../_numba_common.h" #include "nrt.h" @@ -294,7 +295,7 @@ NRT_adapt_ndarray_to_python(arystruct_t* arystruct, int ndim, npy_intp *shape, *strides; int flags = 0; - if (!PyArray_DescrCheck(descr)) { + if (!NUMBA_PyArray_DescrCheck(descr)) { PyErr_Format(PyExc_TypeError, "expected dtype object, got '%.200s'", Py_TYPE(descr)->tp_name); diff --git a/numba/core/serialize.py b/numba/core/serialize.py index bac37ef7dca..ae5e7ac32be 100644 --- a/numba/core/serialize.py +++ b/numba/core/serialize.py @@ -133,6 +133,32 @@ def _rebuild_code(marshal_version, bytecode_magic, marshalled): return marshal.loads(marshalled) +# Keep unpickled object via `numba_unpickle` alive. +_unpickled_memo = {} + + +def _numba_unpickle(address, bytedata, hashed): + """Used by `numba_unpickle` from _helperlib.c + + Parameters + ---------- + address : int + bytedata : bytes + hashed : bytes + + Returns + ------- + obj : object + unpickled object + """ + key = (address, hashed) + try: + obj = _unpickled_memo[key] + except KeyError: + _unpickled_memo[key] = obj = pickle.loads(bytedata) + return obj + + def dumps(obj): """Similar to `pickle.dumps()`. Returns the serialized object in bytes. 
""" diff --git a/numba/core/typeconv/typeconv.py b/numba/core/typeconv/typeconv.py index 6f05e276a7f..08c05649caf 100644 --- a/numba/core/typeconv/typeconv.py +++ b/numba/core/typeconv/typeconv.py @@ -5,7 +5,7 @@ except ImportError as e: from numba.core.errors import feedback_details as reportme import sys - url = "http://numba.pydata.org/numba-doc/latest/developer/contributing.html" + url = "https://numba.pydata.org/numba-doc/latest/developer/contributing.html" dashes = '-' * 80 msg = ("Numba could not be imported.\nIf you are seeing this message and " "are undertaking Numba development work, you may need to re-run:\n\n" diff --git a/numba/core/typed_passes.py b/numba/core/typed_passes.py index 25d8f7a91a0..3756a3e7c22 100644 --- a/numba/core/typed_passes.py +++ b/numba/core/typed_passes.py @@ -304,7 +304,7 @@ def run_pass(self, state): # parfor calls the compiler chain again with a string if not (config.DISABLE_PERFORMANCE_WARNINGS or state.func_ir.loc.filename == ''): - url = ("http://numba.pydata.org/numba-doc/latest/user/" + url = ("https://numba.pydata.org/numba-doc/latest/user/" "parallel.html#diagnostics") msg = ("\nThe keyword argument 'parallel=True' was specified " "but no transformation for parallel execution was " diff --git a/numba/core/typeinfer.py b/numba/core/typeinfer.py index 89825d34244..e1a4eed79cd 100644 --- a/numba/core/typeinfer.py +++ b/numba/core/typeinfer.py @@ -1138,7 +1138,7 @@ def diagnose_imprecision(offender): the type of empty lists can be inferred, this is not always the case, see this documentation for help: -http://numba.pydata.org/numba-doc/latest/user/troubleshoot.html#my-code-has-an-untyped-list-problem +https://numba.pydata.org/numba-doc/latest/user/troubleshoot.html#my-code-has-an-untyped-list-problem """ if offender is not None: # This block deals with imprecise lists diff --git a/numba/core/types/containers.py b/numba/core/types/containers.py index 37a2573ac5b..87f4258c9dc 100644 --- a/numba/core/types/containers.py +++ b/numba/core/types/containers.py @@ -200,7 +200,7 @@ def can_convert_to(self, typingctx, other): return max(kinds) def __unliteral__(self): - return BaseTuple.from_types([unliteral(t) for t in self]) + return type(self).from_types([unliteral(t) for t in self]) class _HomogeneousTuple(Sequence, BaseTuple): @@ -253,6 +253,9 @@ def unify(self, typingctx, other): if dtype is not None: return UniTuple(dtype=dtype, count=self.count) + def __unliteral__(self): + return type(self)(dtype=unliteral(self.dtype), count=self.count) + class UniTupleIter(BaseContainerIterator): """ @@ -426,14 +429,22 @@ def copy(self, dtype=None, reflected=None): dtype = self.dtype if reflected is None: reflected = self.reflected - return List(dtype, reflected) + return List(dtype, reflected, self.initial_value) def unify(self, typingctx, other): if isinstance(other, List): dtype = typingctx.unify_pairs(self.dtype, other.dtype) reflected = self.reflected or other.reflected if dtype is not None: - return List(dtype, reflected) + siv = self.initial_value + oiv = other.initial_value + if siv is not None and oiv is not None: + use = siv + if siv is None: + use = oiv + return List(dtype, reflected, use) + else: + return List(dtype, reflected) @property def key(self): @@ -452,6 +463,10 @@ def __getitem__(self, args): """ return self.dtype + def __unliteral__(self): + return List(self.dtype, reflected=self.reflected, + initial_value=None) + class LiteralList(Literal, _HeterogeneousTuple): """A heterogeneous immutable list (basically a tuple with list semantics). 
@@ -695,11 +710,27 @@ def unify(self, typingctx, other): if isinstance(other, DictType): if not other.is_precise(): return self + else: + ukey_type = self.key_type == other.key_type + uvalue_type = self.value_type == other.value_type + if ukey_type and uvalue_type: + siv = self.initial_value + oiv = other.initial_value + siv_none = siv is None + oiv_none = oiv is None + if not siv_none and not oiv_none: + if siv == oiv: + return DictType(self.key_type, other.value_type, + siv) + return DictType(self.key_type, other.value_type) @property def key(self): return self.key_type, self.value_type, str(self.initial_value) + def __unliteral__(self): + return DictType(self.key_type, self.value_type) + class LiteralStrKeyDict(Literal, NamedTuple): """A Dictionary of string keys to heterogeneous values (basically a diff --git a/numba/core/types/functions.py b/numba/core/types/functions.py index 54d4c159e2c..7acff943c92 100644 --- a/numba/core/types/functions.py +++ b/numba/core/types/functions.py @@ -224,6 +224,16 @@ def raise_error(self): raise errors.TypingError(self.format()) +def _unlit_non_poison(ty): + """Apply unliteral(ty) and raise a TypingError if type is Poison. + """ + out = unliteral(ty) + if isinstance(out, types.Poison): + m = f"Poison type used in arguments; got {out}" + raise TypingError(m) + return out + + class BaseFunction(Callable): """ Base type class for some function types. @@ -266,18 +276,23 @@ def get_impl_key(self, sig): return self._impl_keys[sig.args] def get_call_type(self, context, args, kws): + prefer_lit = [True, False] # old behavior preferring literal + prefer_not = [False, True] # new behavior preferring non-literal failures = _ResolutionFailures(context, self, args, kws, depth=self._depth) self._depth += 1 for temp_cls in self.templates: temp = temp_cls(context) - for uselit in [True, False]: + # The template can override the default and prefer literal args + choice = prefer_lit if temp.prefer_literal else prefer_not + for uselit in choice: try: if uselit: sig = temp.apply(args, kws) else: - nolitargs = tuple([unliteral(a) for a in args]) - nolitkws = {k: unliteral(v) for k, v in kws.items()} + nolitargs = tuple([_unlit_non_poison(a) for a in args]) + nolitkws = {k: _unlit_non_poison(v) + for k, v in kws.items()} sig = temp.apply(nolitargs, nolitkws) except Exception as e: sig = None @@ -360,33 +375,45 @@ def get_call_type(self, context, args, kws): template = self.template(context) literal_e = None nonliteral_e = None + out = None - - # Try with Literal - try: - out = template.apply(args, kws) - except Exception as exc: - if isinstance(exc, errors.ForceLiteralArg): - raise exc - literal_e = exc - out = None - - # if the unliteral_args and unliteral_kws are the same as the literal - # ones, set up to not bother retrying - unliteral_args = tuple([unliteral(a) for a in args]) - unliteral_kws = {k: unliteral(v) for k, v in kws.items()} - skip = unliteral_args == args and kws == unliteral_kws - - # If the above template application failed and the non-literal args are - # different to the literal ones, try again with literals rewritten as - # non-literals - if not skip and out is None: - try: - out = template.apply(unliteral_args, unliteral_kws) - except Exception as exc: - if isinstance(exc, errors.ForceLiteralArg): - raise exc - nonliteral_e = exc + choice = [True, False] if template.prefer_literal else [False, True] + for uselit in choice: + if uselit: + # Try with Literal + try: + out = template.apply(args, kws) + except Exception as exc: + if isinstance(exc, 
errors.ForceLiteralArg): + raise exc + literal_e = exc + out = None + else: + break + else: + # if the unliteral_args and unliteral_kws are the same as the literal + # ones, set up to not bother retrying + unliteral_args = tuple([_unlit_non_poison(a) for a in args]) + unliteral_kws = {k: _unlit_non_poison(v) + for k, v in kws.items()} + skip = unliteral_args == args and kws == unliteral_kws + + # If the above template application failed and the non-literal args are + # different to the literal ones, try again with literals rewritten as + # non-literals + if not skip and out is None: + try: + out = template.apply(unliteral_args, unliteral_kws) + except Exception as exc: + if isinstance(exc, errors.ForceLiteralArg): + if template.prefer_literal: + # For template that prefers literal types, + # reaching here means that the literal types + # have failed typing as well. + raise exc + nonliteral_e = exc + else: + break if out is None and (nonliteral_e is not None or literal_e is not None): header = "- Resolution failure for {} arguments:\n{}\n" diff --git a/numba/core/typing/arraydecl.py b/numba/core/typing/arraydecl.py index 58001e47ed4..c75ed8ee5ec 100644 --- a/numba/core/typing/arraydecl.py +++ b/numba/core/typing/arraydecl.py @@ -416,6 +416,8 @@ def resolve_argsort(self, ary, args, kws): assert not args kwargs = dict(kws) kind = kwargs.pop('kind', types.StringLiteral('quicksort')) + if not isinstance(kind, types.StringLiteral): + raise errors.TypingError('"kind" must be a string literal') if kwargs: msg = "Unsupported keywords: {!r}" raise TypingError(msg.format([k for k in kwargs.keys()])) @@ -742,8 +744,9 @@ def generic_index(self, args, kws): assert not kws return signature(types.intp, recvr=self.this) -def install_array_method(name, generic): - my_attr = {"key": "array." + name, "generic": generic} +def install_array_method(name, generic, prefer_literal=True): + my_attr = {"key": "array." + name, "generic": generic, + "prefer_literal": prefer_literal} temp_class = type("Array_" + name, (AbstractTemplate,), my_attr) def array_attribute_attachment(self, ary): return types.BoundFunction(temp_class, ary) @@ -756,7 +759,7 @@ def array_attribute_attachment(self, ary): # Functions that return a machine-width type, to avoid overflows install_array_method("prod", generic_expand) -install_array_method("sum", sum_expand) +install_array_method("sum", sum_expand, prefer_literal=True) # Functions that return a machine-width type, to avoid overflows for fname in ["cumsum", "cumprod"]: diff --git a/numba/core/typing/npydecl.py b/numba/core/typing/npydecl.py index 878829ef9e8..2dbbed39be9 100644 --- a/numba/core/typing/npydecl.py +++ b/numba/core/typing/npydecl.py @@ -341,6 +341,10 @@ class Numpy_method_redirection(AbstractTemplate): array method of the same name (e.g. ndarray.sum). """ + # Arguments like *axis* can specialize on literals but also support + # non-literals + prefer_literal = True + def generic(self, args, kws): pysig = None if kws: diff --git a/numba/core/typing/templates.py b/numba/core/typing/templates.py index 71e4ed7dbf8..f77e3557ffd 100644 --- a/numba/core/typing/templates.py +++ b/numba/core/typing/templates.py @@ -251,7 +251,14 @@ class FunctionTemplate(ABC): # Set to true to disable unsafe cast. # subclass overide-able unsafe_casting = True + # Set to true to require exact match without casting. + # subclass overide-able exact_match_required = False + # Set to true to prefer literal arguments. + # Useful for definitions that specialize on literal but also support + # non-literals. 
+ # subclass overide-able + prefer_literal = False def __init__(self, context): self.context = context @@ -278,6 +285,28 @@ def get_impl_key(self, sig): key = key.im_func return key + @classmethod + def get_source_code_info(cls, impl): + """ + Gets the source information about function impl. + Returns: + + code - str: source code as a string + firstlineno - int: the first line number of the function impl + path - str: the path to file containing impl + + if any of the above are not available something generic is returned + """ + try: + code, firstlineno = inspect.getsourcelines(impl) + except OSError: # missing source, probably a string + code = "None available (built from string?)" + firstlineno = 0 + path = inspect.getsourcefile(impl) + if path is None: + path = " (built from string?)" + return code, firstlineno, path + @abstractmethod def get_template_info(self): """ @@ -296,6 +325,13 @@ def get_template_info(self): """ pass + def __str__(self): + info = self.get_template_info() + srcinfo = f"{info['filename']}:{info['lines'][0]}" + return f"<{self.__class__.__name__} {srcinfo}>" + + __repr__ = __str__ + class AbstractTemplate(FunctionTemplate): """ @@ -332,14 +368,14 @@ def unpack_opt(x): def get_template_info(self): impl = getattr(self, "generic") basepath = os.path.dirname(os.path.dirname(numba.__file__)) - code, firstlineno = inspect.getsourcelines(impl) - path = inspect.getsourcefile(impl) + + code, firstlineno, path = self.get_source_code_info(impl) sig = str(utils.pysignature(impl)) info = { 'kind': "overload", 'name': getattr(impl, '__qualname__', impl.__name__), 'sig': sig, - 'filename': os.path.relpath(path, start=basepath), + 'filename': utils.safe_relpath(path, start=basepath), 'lines': (firstlineno, firstlineno + len(code) - 1), 'docstring': impl.__doc__ } @@ -407,15 +443,14 @@ def unpack_opt(x): def get_template_info(self): impl = getattr(self, "generic") basepath = os.path.dirname(os.path.dirname(numba.__file__)) - code, firstlineno = inspect.getsourcelines(impl) - path = inspect.getsourcefile(impl) + code, firstlineno, path = self.get_source_code_info(impl) sig = str(utils.pysignature(impl)) info = { 'kind': "overload", 'name': getattr(self.key, '__name__', getattr(impl, '__qualname__', impl.__name__),), 'sig': sig, - 'filename': os.path.relpath(path, start=basepath), + 'filename': utils.safe_relpath(path, start=basepath), 'lines': (firstlineno, firstlineno + len(code) - 1), 'docstring': impl.__doc__ } @@ -682,6 +717,7 @@ def _build_impl(self, cache_key, args, kws): # should be using. sig, pyfunc = ovf_result args = sig.args + kws = {} cache_key = None # don't cache else: # Regular case @@ -699,6 +735,9 @@ def _build_impl(self, cache_key, args, kws): # Make dispatcher jitdecor = jit(nopython=True, **self._jit_options) disp = jitdecor(pyfunc) + # Make sure that the implementation can be fully compiled + disp_type = types.Dispatcher(disp) + disp_type.get_call_type(self.context, args, kws) if cache_key is not None: self._impl_cache[cache_key] = disp, args return disp, args @@ -712,7 +751,7 @@ def get_impl_key(self, sig): @classmethod def get_source_info(cls): - """Return a dictionary with information about the source code of the + """Return a dictionary with information about the source code of the implementation. 
Returns @@ -733,14 +772,13 @@ def get_source_info(cls): """ basepath = os.path.dirname(os.path.dirname(numba.__file__)) impl = cls._overload_func - code, firstlineno = inspect.getsourcelines(impl) - path = inspect.getsourcefile(impl) + code, firstlineno, path = cls.get_source_code_info(impl) sig = str(utils.pysignature(impl)) info = { 'kind': "overload", 'name': getattr(impl, '__qualname__', impl.__name__), 'sig': sig, - 'filename': os.path.relpath(path, start=basepath), + 'filename': utils.safe_relpath(path, start=basepath), 'lines': (firstlineno, firstlineno + len(code) - 1), 'docstring': impl.__doc__ } @@ -749,14 +787,13 @@ def get_source_info(cls): def get_template_info(self): basepath = os.path.dirname(os.path.dirname(numba.__file__)) impl = self._overload_func - code, firstlineno = inspect.getsourcelines(impl) - path = inspect.getsourcefile(impl) + code, firstlineno, path = self.get_source_code_info(impl) sig = str(utils.pysignature(impl)) info = { 'kind': "overload", 'name': getattr(impl, '__qualname__', impl.__name__), 'sig': sig, - 'filename': os.path.relpath(path, start=basepath), + 'filename': utils.safe_relpath(path, start=basepath), 'lines': (firstlineno, firstlineno + len(code) - 1), 'docstring': impl.__doc__ } @@ -764,7 +801,7 @@ def get_template_info(self): def make_overload_template(func, overload_func, jit_options, strict, - inline): + inline, prefer_literal=False): """ Make a template class for function *func* overloaded by *overload_func*. Compiler options are passed as a dictionary to *jit_options*. @@ -775,7 +812,7 @@ def make_overload_template(func, overload_func, jit_options, strict, dct = dict(key=func, _overload_func=staticmethod(overload_func), _impl_cache={}, _compiled_overloads={}, _jit_options=jit_options, _strict=strict, _inline=staticmethod(InlineOptions(inline)), - _inline_overloads={}) + _inline_overloads={}, prefer_literal=prefer_literal) return type(base)(name, (base,), dct) @@ -818,14 +855,13 @@ def get_impl_key(self, sig): def get_template_info(self): basepath = os.path.dirname(os.path.dirname(numba.__file__)) impl = self._definition_func - code, firstlineno = inspect.getsourcelines(impl) - path = inspect.getsourcefile(impl) + code, firstlineno, path = self.get_source_code_info(impl) sig = str(utils.pysignature(impl)) info = { 'kind': "intrinsic", 'name': getattr(impl, '__qualname__', impl.__name__), 'sig': sig, - 'filename': os.path.relpath(path, start=basepath), + 'filename': utils.safe_relpath(path, start=basepath), 'lines': (firstlineno, firstlineno + len(code) - 1), 'docstring': impl.__doc__ } @@ -967,6 +1003,7 @@ class MethodTemplate(AbstractTemplate): _inline = self._inline _overload_func = staticmethod(self._overload_func) _inline_overloads = self._inline_overloads + prefer_literal = self.prefer_literal def generic(_, args, kws): args = (typ,) + tuple(args) @@ -982,6 +1019,7 @@ def generic(_, args, kws): def make_overload_attribute_template(typ, attr, overload_func, inline, + prefer_literal=False, base=_OverloadAttributeTemplate): """ Make a template class for attribute *attr* of *typ* overloaded by @@ -994,18 +1032,21 @@ def make_overload_attribute_template(typ, attr, overload_func, inline, _inline=staticmethod(InlineOptions(inline)), _inline_overloads={}, _overload_func=staticmethod(overload_func), + prefer_literal=prefer_literal, ) - return type(base)(name, (base,), dct) + obj = type(base)(name, (base,), dct) + return obj -def make_overload_method_template(typ, attr, overload_func, inline): +def make_overload_method_template(typ, attr, 
overload_func, inline, + prefer_literal=False): """ Make a template class for method *attr* of *typ* overloaded by *overload_func*. """ return make_overload_attribute_template( typ, attr, overload_func, inline=inline, - base=_OverloadMethodTemplate, + base=_OverloadMethodTemplate, prefer_literal=prefer_literal, ) diff --git a/numba/core/utils.py b/numba/core/utils.py index 2dd3fdb21c0..a74e3ef9283 100644 --- a/numba/core/utils.py +++ b/numba/core/utils.py @@ -104,6 +104,23 @@ def erase_traceback(exc_value): return exc_value.with_traceback(None) +def safe_relpath(path, start=os.curdir): + """ + Produces a "safe" relative path, on windows relpath doesn't work across + drives as technically they don't share the same root. + See: https://bugs.python.org/issue7195 for details. + """ + # find the drive letters for path and start and if they are not the same + # then don't use relpath! + drive_letter = lambda x: os.path.splitdrive(os.path.abspath(x))[0] + drive_path = drive_letter(path) + drive_start = drive_letter(start) + if drive_path != drive_start: + return os.path.abspath(path) + else: + return os.path.relpath(path, start=start) + + # Mapping between operator module functions and the corresponding built-in # operators. diff --git a/numba/cpython/tupleobj.py b/numba/cpython/tupleobj.py index 7a547577f8a..34739623fff 100644 --- a/numba/cpython/tupleobj.py +++ b/numba/cpython/tupleobj.py @@ -187,6 +187,24 @@ def iternext_unituple(context, builder, sig, args, result): builder.store(nidx, iterval.index) +@overload(operator.getitem) +def getitem_literal_idx(tup, idx): + """ + Overloads BaseTuple getitem to cover cases where constant + inference and RewriteConstGetitems cannot replace it + with a static_getitem. + """ + if not (isinstance(tup, types.BaseTuple) + and isinstance(idx, types.IntegerLiteral)): + return None + + idx_val = idx.literal_value + def getitem_literal_idx_impl(tup, idx): + return tup[idx_val] + + return getitem_literal_idx_impl + + @lower_builtin('typed_getitem', types.BaseTuple, types.Any) def getitem_typed(context, builder, sig, args): tupty, _ = sig.args diff --git a/numba/cuda/codegen.py b/numba/cuda/codegen.py index e201a2101e4..bfd58c27ca7 100644 --- a/numba/cuda/codegen.py +++ b/numba/cuda/codegen.py @@ -12,22 +12,14 @@ class CUDACodeLibrary(CodeLibrary): + # We don't optimize the IR at the function or module level because it is + # optimized by NVVM after we've passed it on. + def _optimize_functions(self, ll_module): pass def _optimize_final_module(self): - # Run some lightweight optimization to simplify the module. 
- # This seems to workaround a libnvvm compilation bug (see #1341) - pmb = ll.PassManagerBuilder() - pmb.opt_level = 1 - pmb.disable_unit_at_a_time = False - pmb.disable_unroll_loops = True - pmb.loop_vectorize = False - pmb.slp_vectorize = False - - pm = ll.ModulePassManager() - pmb.populate(pm) - pm.run(self._final_module) + pass def _finalize_specific(self): # Fix global naming diff --git a/numba/cuda/compiler.py b/numba/cuda/compiler.py index b3324cf5b56..dd57817e438 100644 --- a/numba/cuda/compiler.py +++ b/numba/cuda/compiler.py @@ -1,7 +1,9 @@ import ctypes import inspect import os +import subprocess import sys +import tempfile import numpy as np @@ -51,7 +53,7 @@ def compile_cuda(pyfunc, return_type, args, debug=False, inline=False): @global_compiler_lock def compile_kernel(pyfunc, args, link, debug=False, inline=False, - fastmath=False, extensions=[], max_registers=None): + fastmath=False, extensions=[], max_registers=None, opt=True): cres = compile_cuda(pyfunc, types.void, args, debug=debug, inline=inline) fname = cres.fndesc.llvm_func_name lib, kernel = cres.target_context.prepare_cuda_kernel(cres.library, fname, @@ -65,6 +67,7 @@ def compile_kernel(pyfunc, args, link, debug=False, inline=False, type_annotation=cres.type_annotation, link=link, debug=debug, + opt=opt, call_helper=cres.call_helper, fastmath=fastmath, extensions=extensions, @@ -130,13 +133,43 @@ def compile_ptx_for_current_device(pyfunc, args, debug=False, device=False, fastmath=fastmath, cc=cc, opt=True) +def disassemble_cubin(cubin): + # nvdisasm only accepts input from a file, so we need to write out to a + # temp file and clean up afterwards. + fd = None + fname = None + try: + fd, fname = tempfile.mkstemp() + with open(fname, 'wb') as f: + f.write(cubin) + + try: + cp = subprocess.run(['nvdisasm', fname], check=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + except FileNotFoundError as e: + if e.filename == 'nvdisasm': + msg = ("nvdisasm is required for SASS inspection, and has not " + "been found.\n\nYou may need to install the CUDA " + "toolkit and ensure that it is available on your " + "PATH.\n") + raise RuntimeError(msg) + return cp.stdout.decode('utf-8') + finally: + if fd is not None: + os.close(fd) + if fname is not None: + os.unlink(fname) + + class DeviceFunctionTemplate(serialize.ReduceMixin): """Unmaterialized device function """ - def __init__(self, pyfunc, debug, inline): + def __init__(self, pyfunc, debug, inline, opt): self.py_func = pyfunc self.debug = debug self.inline = inline + self.opt = opt self._compileinfos = {} name = getattr(pyfunc, '__name__', 'unknown') self.__name__ = f"{name} ".format(name) @@ -187,6 +220,10 @@ def inspect_llvm(self, args): ------- llvmir : str """ + # Force a compilation to occur if none has yet - this can be needed if + # the user attempts to inspect LLVM IR or PTX before the function has + # been called for the given arguments from a jitted kernel. 
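+        # A sketch of when this matters (``add`` and the argument types are
+        # purely illustrative):
+        #
+        #     @cuda.jit(device=True)
+        #     def add(a, b):
+        #         return a + b
+        #
+        #     add.inspect_llvm((float32, float32))  # compiles first if needed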
+ self.compile(args) cres = self._compileinfos[args] mod = cres.library._final_module return str(mod) @@ -211,17 +248,18 @@ def inspect_ptx(self, args, nvvm_options={}): device = cuctx.device cc = device.compute_capability arch = nvvm.get_arch_option(*cc) - ptx = nvvm.llvm_to_ptx(llvmir, opt=3, arch=arch, **nvvm_options) + opt = 3 if self.opt else 0 + ptx = nvvm.llvm_to_ptx(llvmir, opt=opt, arch=arch, **nvvm_options) return ptx -def compile_device_template(pyfunc, debug=False, inline=False): +def compile_device_template(pyfunc, debug=False, inline=False, opt=True): """Create a DeviceFunctionTemplate object and register the object to the CUDA typing context. """ from .descriptor import CUDATargetDesc - dft = DeviceFunctionTemplate(pyfunc, debug=debug, inline=inline) + dft = DeviceFunctionTemplate(pyfunc, debug=debug, inline=inline, opt=opt) class device_function_template(AbstractTemplate): key = dft @@ -239,7 +277,7 @@ def get_template_info(cls): 'kind': "overload", 'name': getattr(cls.key, '__name__', "unknown"), 'sig': sig, - 'filename': os.path.relpath(path, start=basepath), + 'filename': utils.safe_relpath(path, start=basepath), 'lines': (firstlineno, firstlineno + len(code) - 1), 'docstring': pyfunc.__doc__ } @@ -371,7 +409,7 @@ def get(self): ptx = self.cache.get(cc) if ptx is None: arch = nvvm.get_arch_option(*cc) - ptx = nvvm.llvm_to_ptx(self.llvmir, opt=3, arch=arch, + ptx = nvvm.llvm_to_ptx(self.llvmir, arch=arch, **self._extra_options) self.cache[cc] = ptx if config.DUMP_ASSEMBLY: @@ -394,6 +432,7 @@ def __init__(self, entry_name, ptx, linking, max_registers): self.linking = linking self.cache = {} self.ccinfos = {} + self.cubins = {} self.max_registers = max_registers def get(self): @@ -408,16 +447,27 @@ def get(self): linker.add_ptx(ptx) for path in self.linking: linker.add_file_guess_ext(path) - cubin, _size = linker.complete() + cubin, size = linker.complete() compile_info = linker.info_log module = cuctx.create_module_image(cubin) # Load cufunc = module.get_function(self.entry_name) + + # Populate caches self.cache[device.id] = cufunc self.ccinfos[device.id] = compile_info + # We take a copy of the cubin because it's owned by the linker + cubin_ptr = ctypes.cast(cubin, ctypes.POINTER(ctypes.c_char)) + cubin_data = np.ctypeslib.as_array(cubin_ptr, shape=(size,)).copy() + self.cubins[device.id] = cubin_data return cufunc + def get_sass(self): + self.get() # trigger compilation + device = get_context().device + return disassemble_cubin(self.cubins[device.id]) + def get_info(self): self.get() # trigger compilation cuctx = get_context() @@ -453,12 +503,13 @@ class _Kernel(serialize.ReduceMixin): ''' def __init__(self, llvm_module, name, pretty_name, argtypes, call_helper, link=(), debug=False, fastmath=False, type_annotation=None, - extensions=[], max_registers=None): + extensions=[], max_registers=None, opt=True): super().__init__() # initialize CUfunction options = { 'debug': debug, - 'fastmath': fastmath + 'fastmath': fastmath, + 'opt': 3 if opt else 0 } ptx = CachedPTX(pretty_name, str(llvm_module), options=options) @@ -547,6 +598,14 @@ def inspect_asm(self): ''' return self._func.ptx.get().decode('ascii') + def inspect_sass(self): + ''' + Returns the SASS code for this kernel. + + Requires nvdisasm to be available on the PATH. 
+ ''' + return self._func.get_sass() + def inspect_types(self, file=None): ''' Produce a dump of the Python source of this function annotated with the @@ -881,10 +940,10 @@ def inspect_llvm(self, signature=None, compute_capability=None): def inspect_asm(self, signature=None, compute_capability=None): ''' - Return the generated assembly code for all signatures encountered thus - far, or the LLVM IR for a specific signature and compute_capability - if given. If the dispatcher is specialized, the assembly code for the - single specialization is returned. + Return the generated PTX assembly code for all signatures encountered + thus far, or the PTX assembly code for a specific signature and + compute_capability if given. If the dispatcher is specialized, the + assembly code for the single specialization is returned. ''' cc = compute_capability or get_current_device().compute_capability if signature is not None: @@ -895,6 +954,23 @@ def inspect_asm(self, signature=None, compute_capability=None): return dict((sig, defn.inspect_asm()) for sig, defn in self.definitions.items()) + def inspect_sass(self, signature=None, compute_capability=None): + ''' + Return the generated SASS code for all signatures encountered thus + far, or the SASS code for a specific signature and compute_capability + if given. + + Requires nvdisasm to be available on the PATH. + ''' + cc = compute_capability or get_current_device().compute_capability + if signature is not None: + return self.definitions[(cc, signature)].inspect_sass() + elif self.specialized: + return self.definition.inspect_sass() + else: + return dict((sig, defn.inspect_sass()) + for sig, defn in self.definitions.items()) + def inspect_types(self, file=None): ''' Produce a dump of the Python source of this function annotated with the diff --git a/numba/cuda/cudadrv/driver.py b/numba/cuda/cudadrv/driver.py index 0b0ee044e2b..414377229ff 100644 --- a/numba/cuda/cudadrv/driver.py +++ b/numba/cuda/cudadrv/driver.py @@ -20,6 +20,7 @@ import warnings import logging import threading +import asyncio from itertools import product from abc import ABCMeta, abstractmethod from ctypes import (c_int, byref, c_size_t, c_char, c_char_p, addressof, @@ -33,7 +34,7 @@ from numba.core import utils, errors, serialize, config from .error import CudaSupportError, CudaDriverError from .drvapi import API_PROTOTYPES -from .drvapi import cu_occupancy_b2d_size +from .drvapi import cu_occupancy_b2d_size, cu_stream_callback_pyobj from numba.cuda.cudadrv import enums, drvapi, _extras from numba.core.utils import longint as long from numba.cuda.envvars import get_numba_envvar @@ -44,6 +45,12 @@ SUPPORTS_IPC = sys.platform.startswith('linux') +_py_decref = ctypes.pythonapi.Py_DecRef +_py_incref = ctypes.pythonapi.Py_IncRef +_py_decref.argtypes = [ctypes.py_object] +_py_incref.argtypes = [ctypes.py_object] + + def make_logger(): logger = logging.getLogger(__name__) # is logging configured? @@ -1791,6 +1798,64 @@ def auto_synchronize(self): yield self self.synchronize() + def add_callback(self, callback, arg): + """ + Add a callback to a compute stream. + The user provided function is called from a driver thread once all + preceding stream operations are complete. + + Callback functions are called from a CUDA driver thread, not from + the thread that invoked `add_callback`. No CUDA API functions may + be called from within the callback function. 
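+
+        A minimal usage sketch::
+
+            def callback(stream, status, arg):
+                # Runs on a driver thread once prior work on the stream is done
+                print(status, arg)
+
+            stream.add_callback(callback, "some user data")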
+ + The duration of a callback function should be kept short, as the + callback will block later work in the stream and may block other + callbacks from being executed. + + Note: This function is marked as deprecated and may be replaced in a + future CUDA release. + + :param callback: Callback function with arguments (stream, status, arg). + :param arg: User data to be passed to the callback function. + """ + data = (self, callback, arg) + _py_incref(data) + driver.cuStreamAddCallback(self.handle, self._stream_callback, data, 0) + + @staticmethod + @cu_stream_callback_pyobj + def _stream_callback(handle, status, data): + try: + stream, callback, arg = data + callback(stream, status, arg) + except Exception as e: + warnings.warn(f"Exception in stream callback: {e}") + finally: + _py_decref(data) + + def async_done(self) -> asyncio.futures.Future: + """ + Return an awaitable that resolves once all preceding stream operations + are complete. + """ + loop = asyncio.get_running_loop() if utils.PYVERSION >= (3, 7) \ + else asyncio.get_event_loop() + future = loop.create_future() + + def resolver(future, status): + if future.done(): + return + elif status == 0: + future.set_result(None) + else: + future.set_exception(Exception(f"Stream error {status}")) + + def callback(stream, status, future): + loop.call_soon_threadsafe(resolver, future, status) + + self.add_callback(callback, future) + return future + class Event(object): def __init__(self, context, handle, finalizer=None): diff --git a/numba/cuda/cudadrv/drvapi.py b/numba/cuda/cudadrv/drvapi.py index 9fb6c95db65..8a78edb1e05 100644 --- a/numba/cuda/cudadrv/drvapi.py +++ b/numba/cuda/cudadrv/drvapi.py @@ -17,6 +17,8 @@ cu_function_attribute = c_int cu_ipc_mem_handle = (c_byte * _extras.CUDA_IPC_HANDLE_SIZE) # 64 bytes wide +cu_stream_callback_pyobj = CFUNCTYPE(None, cu_stream, c_int, py_object) + cu_occupancy_b2d_size = CFUNCTYPE(c_size_t, c_int) # See https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html @@ -159,6 +161,9 @@ # CUresult cuStreamSynchronize(CUstream hStream); 'cuStreamSynchronize': (c_int, cu_stream), +# CUresult cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void* userData, unsigned int flags) +'cuStreamAddCallback': (c_int, cu_stream, cu_stream_callback_pyobj, py_object, c_uint), + # CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, # unsigned int gridDimY, # unsigned int gridDimZ, diff --git a/numba/cuda/cudadrv/nvvm.py b/numba/cuda/cudadrv/nvvm.py index bf5acba9893..24569f99b43 100644 --- a/numba/cuda/cudadrv/nvvm.py +++ b/numba/cuda/cudadrv/nvvm.py @@ -202,7 +202,7 @@ def compile(self, **options): if options.pop('debug'): opts.append('-g') - if options.get('opt'): + if 'opt' in options: opts.append('-opt=%d' % options.pop('opt')) if options.get('arch'): @@ -500,6 +500,12 @@ def llvm_to_ptx(llvmir, **opts): for decl, fn in replacements: llvmir = llvmir.replace(decl, fn) + # llvm.numba_nvvm.atomic is used to prevent LLVM 9 onwards auto-upgrading + # these intrinsics into atomicrmw instructions, which are not recognized by + # NVVM. We can now replace them with the real intrinsic names, ready to + # pass to NVVM. + llvmir = llvmir.replace('llvm.numba_nvvm.atomic', 'llvm.nvvm.atomic') + llvmir = llvm39_to_34_ir(llvmir) cu.add_module(llvmir.encode('utf8')) cu.add_module(libdevice.get()) @@ -666,7 +672,11 @@ def _replace_llvm_memset_usage(m): Used as functor for `re.sub. 
""" params = list(m.group(1).split(',')) - align = re.search(r'align (\d+)', params[0]).group(1) + align_attr = re.search(r'align (\d+)', params[0]) + if not align_attr: + raise ValueError("No alignment attribute found on memset dest") + else: + align = align_attr.group(1) params.insert(-1, 'i32 {}'.format(align)) out = ', '.join(params) return '({})'.format(out) diff --git a/numba/cuda/cudaimpl.py b/numba/cuda/cudaimpl.py index 1ef9401fe7e..621f662b7c4 100644 --- a/numba/cuda/cudaimpl.py +++ b/numba/cuda/cudaimpl.py @@ -468,6 +468,12 @@ def ptx_round(context, builder, sig, args): ]) +@lower(math.isinf, types.Integer) +@lower(math.isnan, types.Integer) +def math_isinf_isnan_int(context, builder, sig, args): + return lc.Constant.int(lc.Type.int(1), 0) + + def gen_deg_rad(const): def impl(context, builder, sig, args): argty, = sig.args diff --git a/numba/cuda/decorators.py b/numba/cuda/decorators.py index 3abcafd9eaa..2e87a5b3f80 100644 --- a/numba/cuda/decorators.py +++ b/numba/cuda/decorators.py @@ -11,17 +11,17 @@ "positional argument.") -def jitdevice(func, link=[], debug=None, inline=False): +def jitdevice(func, link=[], debug=None, inline=False, opt=True): """Wrapper for device-jit. """ debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug if link: raise ValueError("link keyword invalid for device function") - return compile_device_template(func, debug=debug, inline=inline) + return compile_device_template(func, debug=debug, inline=inline, opt=opt) def jit(func_or_sig=None, argtypes=None, device=False, inline=False, - link=[], debug=None, **kws): + link=[], debug=None, opt=True, **kws): """ JIT compile a python function conforming to the CUDA Python specification. If a signature is supplied, then a function is returned that takes a @@ -51,6 +51,10 @@ def jit(func_or_sig=None, argtypes=None, device=False, inline=False, from which they are called. :param max_registers: Limit the kernel to using at most this number of registers per thread. Useful for increasing occupancy. + :param opt: Whether to compile from LLVM IR to PTX with optimization + enabled. When ``True``, ``-opt=3`` is passed to NVVM. When + ``False``, ``-opt=0`` is passed to NVVM. Defaults to ``True``. 
+ :type opt: bool """ debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug @@ -79,7 +83,7 @@ def autojitwrapper(func): debug=debug) else: def autojitwrapper(func): - return jit(func, device=device, debug=debug, **kws) + return jit(func, device=device, debug=debug, opt=opt, **kws) return autojitwrapper # func_or_sig is a function @@ -88,10 +92,11 @@ def autojitwrapper(func): return FakeCUDAKernel(func_or_sig, device=device, fastmath=fastmath, debug=debug) elif device: - return jitdevice(func_or_sig, debug=debug, **kws) + return jitdevice(func_or_sig, debug=debug, opt=opt, **kws) else: targetoptions = kws.copy() targetoptions['debug'] = debug + targetoptions['opt'] = opt targetoptions['link'] = link sigs = None return Dispatcher(func_or_sig, sigs, bind=bind, @@ -127,6 +132,7 @@ def kernel_jit(func): targetoptions = kws.copy() targetoptions['debug'] = debug targetoptions['link'] = link + targetoptions['opt'] = opt return Dispatcher(func, sigs, bind=bind, targetoptions=targetoptions) def device_jit(func): diff --git a/numba/cuda/envvars.py b/numba/cuda/envvars.py index c79edae38da..dfe29759ea9 100644 --- a/numba/cuda/envvars.py +++ b/numba/cuda/envvars.py @@ -8,7 +8,7 @@ def get_numbapro_envvar(envvar, default=None): # use vanilla get here so as to use `None` as a signal for not-set value = os.environ.get(envvar) if value is not None: - url = ("http://numba.pydata.org/numba-doc/latest/cuda/overview.html", + url = ("https://numba.pydata.org/numba-doc/latest/cuda/overview.html", "#cudatoolkit-lookup") msg = ("\nEnvironment variables with the 'NUMBAPRO' prefix are " "deprecated and consequently ignored, found use of %s=%s.\n\n" diff --git a/numba/cuda/nvvmutils.py b/numba/cuda/nvvmutils.py index 543a48539d2..c273dcd2bf5 100644 --- a/numba/cuda/nvvmutils.py +++ b/numba/cuda/nvvmutils.py @@ -12,8 +12,13 @@ def declare_atomic_cas_int32(lmod): return lmod.get_or_insert_function(fnty, fname) +# For atomic intrinsics, "numba_nvvm" prevents LLVM 9 onwards auto-upgrading +# them into atomicrmw instructions that are not recognized by NVVM. It is +# replaced with "nvvm" in llvm_to_ptx later, after the module has been parsed +# and dumped by LLVM. + def declare_atomic_add_float32(lmod): - fname = 'llvm.nvvm.atomic.load.add.f32.p0f32' + fname = 'llvm.numba_nvvm.atomic.load.add.f32.p0f32' fnty = lc.Type.function(lc.Type.float(), (lc.Type.pointer(lc.Type.float(), 0), lc.Type.float())) return lmod.get_or_insert_function(fnty, name=fname) @@ -21,7 +26,7 @@ def declare_atomic_add_float32(lmod): def declare_atomic_add_float64(lmod): if current_context().device.compute_capability >= (6, 0): - fname = 'llvm.nvvm.atomic.load.add.f64.p0f64' + fname = 'llvm.numba_nvvm.atomic.load.add.f64.p0f64' else: fname = '___numba_atomic_double_add' fnty = lc.Type.function(lc.Type.double(), diff --git a/numba/cuda/target.py b/numba/cuda/target.py index 6ac492efa39..d8465bf1912 100644 --- a/numba/cuda/target.py +++ b/numba/cuda/target.py @@ -38,7 +38,9 @@ def resolve_value_type(self, val): if not val._can_compile: raise ValueError('using cpu function on device ' 'but its compilation is disabled') - jd = jitdevice(val, debug=val.targetoptions.get('debug')) + opt = val.targetoptions.get('opt', True) + jd = jitdevice(val, debug=val.targetoptions.get('debug'), + opt=opt) # cache the device function for future use and to avoid # duplicated copy of the same function. 
val.__cudajitdevice = jd diff --git a/numba/cuda/testing.py b/numba/cuda/testing.py index 5a3c1562644..43146685153 100644 --- a/numba/cuda/testing.py +++ b/numba/cuda/testing.py @@ -1,5 +1,6 @@ import contextlib import os +import shutil import sys from numba.tests.support import ( @@ -59,6 +60,16 @@ def skip_under_cuda_memcheck(reason): return unittest.skipIf(os.environ.get('CUDA_MEMCHECK') is not None, reason) +def skip_without_nvdisasm(reason): + nvdisasm_path = shutil.which('nvdisasm') + return unittest.skipIf(nvdisasm_path is None, reason) + + +def skip_with_nvdisasm(reason): + nvdisasm_path = shutil.which('nvdisasm') + return unittest.skipIf(nvdisasm_path is not None, reason) + + class CUDATextCapture(object): def __init__(self, stream): diff --git a/numba/cuda/tests/cudadrv/test_streams.py b/numba/cuda/tests/cudadrv/test_streams.py new file mode 100644 index 00000000000..485498b6077 --- /dev/null +++ b/numba/cuda/tests/cudadrv/test_streams.py @@ -0,0 +1,104 @@ +import asyncio +import functools +import threading +import numpy as np +from numba import cuda +from numba.core import utils +from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim + + +def with_asyncio_loop(f): + @functools.wraps(f) + def runner(*args, **kwds): + loop = asyncio.new_event_loop() + loop.set_debug(True) + try: + return loop.run_until_complete(f(*args, **kwds)) + finally: + loop.close() + return runner + + +asyncio_create_task = asyncio.create_task if utils.PYVERSION >= (3, 7) \ + else asyncio.ensure_future + + +@skip_on_cudasim('CUDA Driver API unsupported in the simulator') +class TestCudaStream(CUDATestCase): + def test_add_callback(self): + def callback(stream, status, event): + event.set() + + stream = cuda.stream() + callback_event = threading.Event() + stream.add_callback(callback, callback_event) + self.assertTrue(callback_event.wait(1.0)) + + @with_asyncio_loop + async def test_async_done(self): + stream = cuda.stream() + await stream.async_done() + + @with_asyncio_loop + async def test_parallel_tasks(self): + async def async_cuda_fn(value_in: float) -> float: + stream = cuda.stream() + h_src, h_dst = cuda.pinned_array(8), cuda.pinned_array(8) + h_src[:] = value_in + d_ary = cuda.to_device(h_src, stream=stream) + d_ary.copy_to_host(h_dst, stream=stream) + await stream.async_done() + return h_dst.mean() + + values_in = [1, 2, 3, 4] + tasks = [asyncio_create_task(async_cuda_fn(v)) for v in values_in] + values_out = await asyncio.gather(*tasks) + self.assertTrue(np.allclose(values_in, values_out)) + + @with_asyncio_loop + async def test_multiple_async_done(self): + stream = cuda.stream() + done_aws = [stream.async_done() for _ in range(4)] + await asyncio.gather(*done_aws) + + @with_asyncio_loop + async def test_cancelled_future(self): + stream = cuda.stream() + done1, done2 = stream.async_done(), stream.async_done() + done1.cancel() + await done2 + self.assertTrue(done1.cancelled()) + self.assertTrue(done2.done()) + + +@skip_on_cudasim('CUDA Driver API unsupported in the simulator') +class TestFailingStream(CUDATestCase): + # This test can only be run in isolation because it corrupts the CUDA + # context, which cannot be recovered from within the same process. It is + # left here so that it can be run manually for debugging / testing purposes + # - or may be re-enabled if in future there is infrastructure added for + # running tests in a separate process (a subprocess cannot be used because + # CUDA will have been initialized before the fork, so it cannot be used in + # the child process). 
+ @unittest.skip + @with_asyncio_loop + async def test_failed_stream(self): + ctx = cuda.current_context() + module = ctx.create_module_ptx(""" + .version 6.5 + .target sm_30 + .address_size 64 + .visible .entry failing_kernel() { trap; } + """) + failing_kernel = module.get_function("failing_kernel") + + stream = cuda.stream() + failing_kernel.configure((1,), (1,), stream=stream).__call__() + done = stream.async_done() + with self.assertRaises(Exception): + await done + self.assertIsNotNone(done.exception()) + + +if __name__ == '__main__': + unittest.main() diff --git a/numba/cuda/tests/cudapy/test_atomics.py b/numba/cuda/tests/cudapy/test_atomics.py index 2c5b7491b46..95cdc702627 100644 --- a/numba/cuda/tests/cudapy/test_atomics.py +++ b/numba/cuda/tests/cudapy/test_atomics.py @@ -264,7 +264,7 @@ def assertCorrectFloat64Atomics(self, kernel, shared=True): @skip_unless_cc_50 def test_atomic_add_double(self): - idx = np.random.randint(0, 32, size=32) + idx = np.random.randint(0, 32, size=32, dtype=np.int64) ary = np.zeros(32, np.float64) cuda_func = cuda.jit('void(int64[:], float64[:])')(atomic_add_double) cuda_func[1, 32](idx, ary) @@ -295,7 +295,7 @@ def test_atomic_add_double_3(self): @skip_unless_cc_50 def test_atomic_add_double_global(self): - idx = np.random.randint(0, 32, size=32) + idx = np.random.randint(0, 32, size=32, dtype=np.int64) ary = np.zeros(32, np.float64) cuda_func = cuda.jit('void(int64[:], float64[:])')(atomic_add_double_global) cuda_func[1, 32](idx, ary) diff --git a/numba/cuda/tests/cudapy/test_constmem.py b/numba/cuda/tests/cudapy/test_constmem.py index 43d52809e42..11ad2a5431a 100644 --- a/numba/cuda/tests/cudapy/test_constmem.py +++ b/numba/cuda/tests/cudapy/test_constmem.py @@ -1,7 +1,8 @@ import numpy as np +import sys from numba import cuda -from numba.cuda.testing import unittest, CUDATestCase +from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim from numba.core.config import ENABLE_CUDASIM CONST_EMPTY = np.array([]) @@ -133,10 +134,14 @@ def test_const_array_3d(self): self.assertTrue(np.all(A == CONST3D)) if not ENABLE_CUDASIM: - self.assertIn( - 'ld.const.v2.u32', - jcuconst3d.ptx, - "load the two halves of the complex as u32s") + if cuda.runtime.get_version() in ((8, 0), (9, 0), (9, 1)): + complex_load = 'ld.const.v2.f32' + description = 'Load the complex as a vector of 2x f32' + else: + complex_load = 'ld.const.f32' + description = 'load each half of the complex as f32' + + self.assertIn(complex_load, jcuconst3d.ptx, description) def test_const_record_empty(self): jcuconstRecEmpty = cuda.jit('void(float64[:])')(cuconstRecEmpty) @@ -149,27 +154,50 @@ def test_const_record(self): B = np.zeros(2, dtype=int) jcuconst = cuda.jit(cuconstRec).specialize(A, B) - if not ENABLE_CUDASIM: - if not any(c in jcuconst.ptx for c in [ - # a vector load: the compiler fuses the load - # of the x and y fields into a single instruction! - 'ld.const.v2.u64', - - # for some reason Win64 / Py3 / CUDA 9.1 decides - # to do two u32 loads, and shifts and ors the - # values to get the float `x` field, then uses - # another ld.const.u32 to load the int `y` as - # a 32-bit value! 
- 'ld.const.u32', - ]): - raise AssertionError( - "the compiler should realise it doesn't " \ - "need to interpret the bytes as float!") - jcuconst[2, 1](A, B) np.testing.assert_allclose(A, CONST_RECORD['x']) np.testing.assert_allclose(B, CONST_RECORD['y']) + @skip_on_cudasim('PTX inspection not supported on the simulator') + def test_const_record_optimization(self): + A = np.zeros(2, dtype=float) + B = np.zeros(2, dtype=int) + jcuconst = cuda.jit(cuconstRec).specialize(A, B) + + old_runtime = cuda.runtime.get_version() in ((8, 0), (9, 0), (9, 1)) + windows = sys.platform.startswith('win') + + if old_runtime: + if windows: + # for some reason Win64 / Py3 / CUDA 9.1 decides to do two u32 + # loads, and shifts and ors the values to get the float `x` + # field, then uses another ld.const.u32 to load the int `y` as + # a 32-bit value! + self.assertIn('ld.const.u32', jcuconst.ptx, + 'load record fields as u32') + else: + # Load of the x and y fields fused into a single instruction + self.assertIn('ld.const.v2.f64', jcuconst.ptx, + 'load record fields as vector of 2x f64') + else: + # In newer toolkits, constant values are all loaded 8 bits at a + # time. Check that there are enough 8-bit loads for everything to + # have been loaded. This is possibly less than optimal, but is the + # observed behaviour with current toolkit versions when IR is not + # optimized before sending to NVVM. + u8_load_count = len([s for s in jcuconst.ptx.split() + if 'ld.const.u8' in s]) + + if windows: + # NumPy ints are 32-bit on Windows by default, so only 4 bytes + # for loading the int (and 8 for the float) + expected_load_count = 12 + else: + # int is 64-bit elsewhere + expected_load_count = 16 + self.assertGreaterEqual(u8_load_count, expected_load_count, + 'load record values as individual bytes') + def test_const_record_align(self): A = np.zeros(2, dtype=np.float64) B = np.zeros(2, dtype=np.float64) @@ -178,37 +206,57 @@ def test_const_record_align(self): E = np.zeros(2, dtype=np.float64) jcuconst = cuda.jit(cuconstRecAlign).specialize(A, B, C, D, E) - if not ENABLE_CUDASIM: - # Code generation differs slightly in 10.2 onwards - if cuda.runtime.get_version() >= (10, 2): - first_bytes = 'ld.const.v2.u8' - first_bytes_msg = 'load the first two bytes as a vector' - else: - first_bytes = 'ld.const.v4.u8' - first_bytes_msg = 'load the first three bytes as a vector' + jcuconst[2, 1](A, B, C, D, E) + np.testing.assert_allclose(A, CONST_RECORD_ALIGN['a']) + np.testing.assert_allclose(B, CONST_RECORD_ALIGN['b']) + np.testing.assert_allclose(C, CONST_RECORD_ALIGN['x']) + np.testing.assert_allclose(D, CONST_RECORD_ALIGN['y']) + np.testing.assert_allclose(E, CONST_RECORD_ALIGN['z']) + + @skip_on_cudasim('PTX inspection not supported on the simulator') + def test_const_record_align_optimization(self): + rtver = cuda.runtime.get_version() + + A = np.zeros(2, dtype=np.float64) + B = np.zeros(2, dtype=np.float64) + C = np.zeros(2, dtype=np.float64) + D = np.zeros(2, dtype=np.float64) + E = np.zeros(2, dtype=np.float64) + jcuconst = cuda.jit(cuconstRecAlign).specialize(A, B, C, D, E) + + if rtver >= (10, 2): + # Code generation differs slightly in 10.2 onwards - the first + # bytes are loaded as individual bytes, so we'll check that + # ld.const.u8 occurs at least four times (the first three bytes, + # then the last byte by itself) + msg = 'load first three bytes and last byte individually' + u8_load_count = len([s for s in jcuconst.ptx.split() + if 'ld.const.u8' in s]) + self.assertGreaterEqual(u8_load_count, 4, msg) + else: + 
# On earlier versions, a vector of 4 bytes is used to load the + # first three bytes. + first_bytes = 'ld.const.v4.u8' + first_bytes_msg = 'load the first three bytes as a vector' self.assertIn( first_bytes, jcuconst.ptx, first_bytes_msg) - self.assertIn( - 'ld.const.u32', - jcuconst.ptx, - 'load the uint32 natively') + self.assertIn( + 'ld.const.u32', + jcuconst.ptx, + 'load the uint32 natively') + # On 10.2 and above, we already checked for loading the last byte by + # itself - no need to repeat the check. + if rtver < (10, 2): self.assertIn( 'ld.const.u8', jcuconst.ptx, 'load the last byte by itself') - jcuconst[2, 1](A, B, C, D, E) - np.testing.assert_allclose(A, CONST_RECORD_ALIGN['a']) - np.testing.assert_allclose(B, CONST_RECORD_ALIGN['b']) - np.testing.assert_allclose(C, CONST_RECORD_ALIGN['x']) - np.testing.assert_allclose(D, CONST_RECORD_ALIGN['y']) - np.testing.assert_allclose(E, CONST_RECORD_ALIGN['z']) - if __name__ == '__main__': unittest.main() diff --git a/numba/cuda/tests/cudapy/test_debuginfo.py b/numba/cuda/tests/cudapy/test_debuginfo.py index f17051eb960..4300a29ce7f 100644 --- a/numba/cuda/tests/cudapy/test_debuginfo.py +++ b/numba/cuda/tests/cudapy/test_debuginfo.py @@ -50,6 +50,14 @@ def bar(x): self._check(bar, sig=(types.int32[:],), expect=False) + def test_issue_5835(self): + # Invalid debug metadata would segfault NVVM when any function was + # compiled with debug turned on and optimization off. This eager + # compilation should not crash anything. + @cuda.jit((types.int32[::1],), debug=True, opt=False) + def f(x): + x[0] = 0 + if __name__ == '__main__': unittest.main() diff --git a/numba/cuda/tests/cudapy/test_inspect.py b/numba/cuda/tests/cudapy/test_inspect.py index 0cbd97bfcb5..2f72e0a0994 100644 --- a/numba/cuda/tests/cudapy/test_inspect.py +++ b/numba/cuda/tests/cudapy/test_inspect.py @@ -1,7 +1,10 @@ +import numpy as np + from io import StringIO -from numba import cuda, float64, intp +from numba import cuda, float32, float64, int32, intp from numba.cuda.testing import unittest, CUDATestCase -from numba.cuda.testing import skip_on_cudasim +from numba.cuda.testing import (skip_on_cudasim, skip_with_nvdisasm, + skip_without_nvdisasm) @skip_on_cudasim('Simulator does not generate code to be inspected') @@ -70,6 +73,57 @@ def foo(x, y): self.assertIn("foo", asmdict[self.cc, (intp, intp)]) self.assertIn("foo", asmdict[self.cc, (float64, float64)]) + def _test_inspect_sass(self, kernel, name, sass): + # Ensure function appears in output + seen_function = False + for line in sass.split(): + if '.text' in line and name in line: + seen_function = True + self.assertTrue(seen_function) + + # Some instructions common to all supported architectures that should + # appear in the output + self.assertIn('S2R', sass) # Special register to register + self.assertIn('BRA', sass) # Branch + self.assertIn('EXIT', sass) # Exit program + + @skip_without_nvdisasm('nvdisasm needed for inspect_sass()') + def test_inspect_sass_eager(self): + @cuda.jit((float32[::1], int32[::1])) + def add(x, y): + i = cuda.grid(1) + if i < len(x): + x[i] += y[i] + + self._test_inspect_sass(add, 'add', add.inspect_sass()) + + @skip_without_nvdisasm('nvdisasm needed for inspect_sass()') + def test_inspect_sass_lazy(self): + @cuda.jit + def add(x, y): + i = cuda.grid(1) + if i < len(x): + x[i] += y[i] + + x = np.arange(10).astype(np.int32) + y = np.arange(10).astype(np.float32) + add[1, 10](x, y) + + signature = (int32[::1], float32[::1]) + self._test_inspect_sass(add, 'add', add.inspect_sass(signature)) 
+ + @skip_with_nvdisasm('Missing nvdisasm exception only generated when it is ' + 'not present') + def test_inspect_sass_nvdisasm_missing(self): + @cuda.jit((float32[::1],)) + def f(x): + x[0] = 0 + + with self.assertRaises(RuntimeError) as raises: + f.inspect_sass() + + self.assertIn('nvdisasm is required', str(raises.exception)) + if __name__ == '__main__': unittest.main() diff --git a/numba/cuda/tests/cudapy/test_math.py b/numba/cuda/tests/cudapy/test_math.py index cdb128a90c2..cf64c0fd7c6 100644 --- a/numba/cuda/tests/cudapy/test_math.py +++ b/numba/cuda/tests/cudapy/test_math.py @@ -1,7 +1,7 @@ import sys import numpy as np from numba.cuda.testing import unittest, CUDATestCase -from numba import cuda, float32, float64, int32 +from numba import cuda, float32, float64, int32, int64 import math @@ -198,6 +198,7 @@ def unary_template(self, func, npfunc, npdtype, npmtype, start, stop): cfunc[1, nelem](A, B) self.assertTrue(np.allclose(npfunc(A), B)) + def unary_bool_template_float32(self, func, npfunc, start=0, stop=1): self.unary_template(func, npfunc, np.float32, float32, start, stop) @@ -205,6 +206,15 @@ def unary_bool_template_float32(self, func, npfunc, start=0, stop=1): def unary_bool_template_float64(self, func, npfunc, start=0, stop=1): self.unary_template(func, npfunc, np.float64, float64, start, stop) + + def unary_bool_template_int32(self, func, npfunc, start=0, stop=49): + self.unary_template(func, npfunc, np.int32, int32, start, stop) + + + def unary_bool_template_int64(self, func, npfunc, start=0, stop=49): + self.unary_template(func, npfunc, np.int64, int64, start, stop) + + def unary_bool_template(self, func, npfunc, npdtype, npmtype, start, stop): nelem = 50 A = np.linspace(start, stop, nelem).astype(npdtype) @@ -555,6 +565,8 @@ def test_math_mod_binop(self): def test_math_isnan(self): self.unary_bool_template_float32(math_isnan, np.isnan) self.unary_bool_template_float64(math_isnan, np.isnan) + self.unary_bool_template_int32(math_isnan, np.isnan) + self.unary_bool_template_int64(math_isnan, np.isnan) #------------------------------------------------------------------------------ # test_math_isinf @@ -563,6 +575,8 @@ def test_math_isnan(self): def test_math_isinf(self): self.unary_bool_template_float32(math_isinf, np.isinf) self.unary_bool_template_float64(math_isinf, np.isinf) + self.unary_bool_template_int32(math_isinf, np.isnan) + self.unary_bool_template_int64(math_isinf, np.isnan) #------------------------------------------------------------------------------ # test_math_degrees diff --git a/numba/cuda/tests/cudapy/test_optimization.py b/numba/cuda/tests/cudapy/test_optimization.py new file mode 100644 index 00000000000..a33c80bac5b --- /dev/null +++ b/numba/cuda/tests/cudapy/test_optimization.py @@ -0,0 +1,82 @@ +import numpy as np + +from numba.cuda.testing import skip_on_cudasim, CUDATestCase +from numba import cuda, float64 +import unittest + + +def kernel_func(x): + x[0] = 1 + + +def device_func(x, y, z): + return x * y + z + + +# Fragments of code that are removed from kernel_func's PTX when optimization +# is on +removed_by_opt = ( '__local_depot0', 'call.uni', 'st.param.b64') + + +@skip_on_cudasim('Simulator does not optimize code') +class TestOptimization(CUDATestCase): + def test_eager_opt(self): + # Optimization should occur by default + kernel = cuda.jit((float64[::1],))(kernel_func) + ptx = kernel.inspect_asm() + + for fragment in removed_by_opt: + with self.subTest(fragment=fragment): + self.assertNotIn(fragment, ptx) + + def test_eager_noopt(self): + # 
Optimization disabled + kernel = cuda.jit((float64[::1],), opt=False)(kernel_func) + ptx = kernel.inspect_asm() + + for fragment in removed_by_opt: + with self.subTest(fragment=fragment): + self.assertIn(fragment, ptx) + + def test_lazy_opt(self): + # Optimization should occur by default + kernel = cuda.jit(kernel_func) + x = np.zeros(1, dtype=np.float64) + kernel[1, 1](x) + + # Grab the PTX for the one definition that has just been jitted + ptx = next(iter(kernel.inspect_asm()))[1] + + for fragment in removed_by_opt: + with self.subTest(fragment=fragment): + self.assertNotIn(fragment, ptx) + + def test_lazy_noopt(self): + # Optimization disabled + kernel = cuda.jit(opt=False)(kernel_func) + x = np.zeros(1, dtype=np.float64) + kernel[1, 1](x) + + # Grab the PTX for the one definition that has just been jitted + ptx = next(iter(kernel.inspect_asm().items()))[1] + + for fragment in removed_by_opt: + with self.subTest(fragment=fragment): + self.assertIn(fragment, ptx) + + def test_device_opt(self): + # Optimization should occur by default + device = cuda.jit(device=True)(device_func) + ptx = device.inspect_ptx((float64, float64, float64)).decode('utf-8') + self.assertIn('fma.rn.f64', ptx) + + def test_device_noopt(self): + # Optimization disabled + device = cuda.jit(device=True, opt=False)(device_func) + ptx = device.inspect_ptx((float64, float64, float64)).decode('utf-8') + # Fused-multiply adds should be disabled when not optimizing + self.assertNotIn('fma.rn.f64', ptx) + + +if __name__ == '__main__': + unittest.main() diff --git a/numba/cuda/tests/nocuda/test_nvvm.py b/numba/cuda/tests/nocuda/test_nvvm.py index 6bcac98c6f6..aca1587c7ea 100644 --- a/numba/cuda/tests/nocuda/test_nvvm.py +++ b/numba/cuda/tests/nocuda/test_nvvm.py @@ -5,6 +5,13 @@ import unittest +original = "call void @llvm.memset.p0i8.i64(" \ + "i8* align 4 %arg.x.41, i8 0, i64 %0, i1 false)" + +missing_align = "call void @llvm.memset.p0i8.i64(" \ + "i8* %arg.x.41, i8 0, i64 %0, i1 false)" + + @skip_on_cudasim('libNVVM not supported in simulator') @unittest.skipIf(utils.MACHINE_BITS == 32, "CUDA not support for 32-bit") @unittest.skipIf(not nvvm.is_available(), "No libNVVM") @@ -29,25 +36,9 @@ def test_nvvm_memset_fixup(self): In LLVM7 the alignment parameter can be implicitly provided as an attribute to pointer in the first argument. """ - def foo(x): - # Triggers a generation of llvm.memset - for i in range(x.size): - x[i] = 0 - - cukern = compile_kernel(foo, args=(types.int32[::1],), link=()) - original = cukern._func.ptx.llvmir - self.assertIn("call void @llvm.memset", original) fixed = nvvm.llvm39_to_34_ir(original) self.assertIn("call void @llvm.memset", fixed) - # Check original IR - for ln in original.splitlines(): - if 'call void @llvm.memset' in ln: - # Missing i32 4 in the 2nd last argument - self.assertRegexpMatches( - ln, - r'i64 %\d+, i1 false\)'.replace(' ', r'\s+'), - ) - # Check fixed IR + for ln in fixed.splitlines(): if 'call void @llvm.memset' in ln: # The i32 4 is the alignment @@ -56,6 +47,17 @@ def foo(x): r'i32 4, i1 false\)'.replace(' ', r'\s+'), ) + def test_nvvm_memset_fixup_missing_align(self): + """ + We require alignment to be specified as a parameter attribute to the + dest argument of a memset. 
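+        For example, ``i8* align 4 %dest`` carries the required attribute,
+        whereas a plain ``i8* %dest`` should raise ``ValueError``.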
+ """ + with self.assertRaises(ValueError) as e: + nvvm.llvm39_to_34_ir(missing_align) + + self.assertIn(str(e.exception), + "No alignment attribute found on memset dest") + if __name__ == '__main__': unittest.main() diff --git a/numba/experimental/jitclass/decorators.py b/numba/experimental/jitclass/decorators.py index aacd14c5366..eec874a47fe 100644 --- a/numba/experimental/jitclass/decorators.py +++ b/numba/experimental/jitclass/decorators.py @@ -36,7 +36,7 @@ def _warning_jitclass(spec): To be used in numba/__init__.py. This endpoint is deprecated. """ - url = ("http://numba.pydata.org/numba-doc/latest/reference/" + url = ("https://numba.pydata.org/numba-doc/latest/reference/" "deprecation.html#change-of-jitclass-location") msg = ("The 'numba.jitclass' decorator has moved to " diff --git a/numba/misc/gdb_hook.py b/numba/misc/gdb_hook.py index 8613e957d50..34b3548c157 100644 --- a/numba/misc/gdb_hook.py +++ b/numba/misc/gdb_hook.py @@ -22,7 +22,7 @@ def _confirm_gdb(): if not (os.path.exists(gdbloc) and os.path.isfile(gdbloc)): msg = ('Is gdb present? Location specified (%s) does not exist. The gdb' ' binary location can be set using Numba configuration, see: ' - 'http://numba.pydata.org/numba-doc/latest/reference/envvars.html' + 'https://numba.pydata.org/numba-doc/latest/reference/envvars.html' # noqa: E501 ) raise RuntimeError(msg % config.GDB_BINARY) # Is Yama being used as a kernel security module and if so is ptrace_scope diff --git a/numba/misc/literal.py b/numba/misc/literal.py index 855a4155e1e..2bc1225b7e4 100644 --- a/numba/misc/literal.py +++ b/numba/misc/literal.py @@ -15,6 +15,9 @@ def _ov_literally(obj): @overload(literal_unroll) def literal_unroll_impl(container): + if isinstance(container, types.Poison): + m = f"Invalid use of non-Literal type in literal_unroll({container})" + raise TypingError(m) def impl(container): return container diff --git a/numba/np/ufunc/dufunc.py b/numba/np/ufunc/dufunc.py index 1c01c2af132..093e02ec6d9 100644 --- a/numba/np/ufunc/dufunc.py +++ b/numba/np/ufunc/dufunc.py @@ -78,7 +78,7 @@ class DUFunc(serialize.ReduceMixin, _internal._DUFunc): def __init__(self, py_func, identity=None, cache=False, targetoptions={}): if isinstance(py_func, Dispatcher): py_func = py_func.py_func - dispatcher = jit(target='npyufunc', + dispatcher = jit(_target='npyufunc', cache=cache, **targetoptions)(py_func) self._initialize(dispatcher, identity) diff --git a/numba/np/ufunc/parallel.py b/numba/np/ufunc/parallel.py index b5387c03722..1b4a3b53ddf 100644 --- a/numba/np/ufunc/parallel.py +++ b/numba/np/ufunc/parallel.py @@ -287,28 +287,36 @@ def __exit__(self, *args): pass -try: - # Force the use of an RLock in the case a fork was used to start the - # process and thereby the init sequence, some of the threading backend - # init sequences are not fork safe. Also, windows global mp locks seem - # to be fine. - if "fork" in multiprocessing.get_start_method() or _windows: - _backend_init_process_lock = multiprocessing.get_context().RLock() - else: - _backend_init_process_lock = _nop() +_backend_init_process_lock = None -except OSError as e: - # probably lack of /dev/shm for semaphore writes, warn the user - msg = ("Could not obtain multiprocessing lock due to OS level error: %s\n" - "A likely cause of this problem is '/dev/shm' is missing or" - "read-only such that necessary semaphores cannot be written.\n" - "*** The responsibility of ensuring multiprocessing safe access to " - "this initialization sequence/module import is deferred to the " - "user! 
***\n") - warnings.warn(msg % str(e)) +def _set_init_process_lock(): + global _backend_init_process_lock + try: + # Force the use of an RLock in the case a fork was used to start the + # process and thereby the init sequence, some of the threading backend + # init sequences are not fork safe. Also, windows global mp locks seem + # to be fine. + if "fork" in multiprocessing.get_start_method() or _windows: + _backend_init_process_lock = multiprocessing.get_context().RLock() + else: + _backend_init_process_lock = _nop() + + except OSError as e: + + # probably lack of /dev/shm for semaphore writes, warn the user + msg = ( + "Could not obtain multiprocessing lock due to OS level error: %s\n" + "A likely cause of this problem is '/dev/shm' is missing or" + "read-only such that necessary semaphores cannot be written.\n" + "*** The responsibility of ensuring multiprocessing safe access to " + "this initialization sequence/module import is deferred to the " + "user! ***\n" + ) + warnings.warn(msg % str(e)) + + _backend_init_process_lock = _nop() - _backend_init_process_lock = _nop() _is_initialized = False @@ -361,6 +369,9 @@ def _check_tbb_version_compatible(): def _launch_threads(): + if not _backend_init_process_lock: + _set_init_process_lock() + with _backend_init_process_lock: with _backend_init_thread_lock: global _is_initialized diff --git a/numba/np/ufunc/ufuncbuilder.py b/numba/np/ufunc/ufuncbuilder.py index d878606e26c..8339c48c065 100644 --- a/numba/np/ufunc/ufuncbuilder.py +++ b/numba/np/ufunc/ufuncbuilder.py @@ -228,7 +228,7 @@ class UFuncBuilder(_BaseUFuncBuilder): def __init__(self, py_func, identity=None, cache=False, targetoptions={}): self.py_func = py_func self.identity = parse_identity(identity) - self.nb_func = jit(target='npyufunc', + self.nb_func = jit(_target='npyufunc', cache=cache, **targetoptions)(py_func) self._sigs = [] @@ -293,7 +293,7 @@ def __init__(self, py_func, signature, identity=None, cache=False, targetoptions={}): self.py_func = py_func self.identity = parse_identity(identity) - self.nb_func = jit(target='npyufunc', cache=cache)(py_func) + self.nb_func = jit(_target='npyufunc', cache=cache)(py_func) self.signature = signature self.sin, self.sout = parse_signature(signature) self.targetoptions = targetoptions diff --git a/numba/parfors/array_analysis.py b/numba/parfors/array_analysis.py index 61d329f439c..378faba4c3f 100644 --- a/numba/parfors/array_analysis.py +++ b/numba/parfors/array_analysis.py @@ -143,6 +143,8 @@ def assert_equiv(typingctx, *val): or isinstance(a, types.Integer) for a in val[0][1:] ) + if not isinstance(val[0][0], types.StringLiteral): + raise errors.TypingError('first argument must be a StringLiteral') def codegen(context, builder, sig, args): assert len(args) == 1 # it is a vararg tuple diff --git a/numba/parfors/parfor.py b/numba/parfors/parfor.py index 61810c411f1..988ce254716 100644 --- a/numba/parfors/parfor.py +++ b/numba/parfors/parfor.py @@ -1531,6 +1531,124 @@ def __init__(self, func_ir, typemap, calltypes, return_type, typingctx, self.flags = flags +class ConvertInplaceBinop: + """Parfor subpass to convert setitem on Arrays + """ + def __init__(self, pass_states): + """ + Parameters + ---------- + pass_states : ParforPassStates + """ + self.pass_states = pass_states + self.rewritten = [] + + def run(self, blocks): + pass_states = self.pass_states + # convert expressions like A += ... where A is an array. 
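+        # For illustration: under ``@njit(parallel=True)``, an update such as
+        #
+        #     A += B        # A and B arrays over the same iteration space
+        #
+        # is rewritten here into an element-wise parfor rather than being
+        # lowered as a serial in-place binop.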
+ topo_order = find_topo_order(blocks) + # variables available in the program so far (used for finding map + # functions in array_expr lowering) + for label in topo_order: + block = blocks[label] + new_body = [] + equiv_set = pass_states.array_analysis.get_equiv_set(label) + for instr in block.body: + if isinstance(instr, ir.Assign): + lhs = instr.target + expr = instr.value + if isinstance(expr, ir.Expr) and expr.op == 'inplace_binop': + loc = expr.loc + target = expr.lhs + value = expr.rhs + target_typ = pass_states.typemap[target.name] + value_typ = pass_states.typemap[value.name] + # Handle A op= ... + if isinstance(target_typ, types.npytypes.Array): + # RHS is an array + if isinstance(value_typ, types.npytypes.Array): + new_instr = self._inplace_binop_to_parfor(equiv_set, + loc, expr.immutable_fn, target, value) + self.rewritten.append( + dict(old=instr, new=new_instr, + reason='inplace_binop'), + ) + instr = [new_instr, ir.Assign(target, lhs, loc)] + if isinstance(instr, list): + new_body.extend(instr) + else: + new_body.append(instr) + block.body = new_body + + def _inplace_binop_to_parfor(self, equiv_set, loc, op, target, value): + """generate parfor from setitem node with a boolean or slice array indices. + The value can be either a scalar or an array variable, and if a boolean index + is used for the latter case, the same index must be used for the value too. + """ + pass_states = self.pass_states + scope = target.scope + arr_typ = pass_states.typemap[target.name] + el_typ = arr_typ.dtype + init_block = ir.Block(scope, loc) + value_typ = pass_states.typemap[value.name] + + size_vars = equiv_set.get_shape(target) + + # generate loopnests and size variables from target correlations + index_vars, loopnests = _mk_parfor_loops(pass_states.typemap, size_vars, scope, loc) + + # generate body + body_label = next_label() + body_block = ir.Block(scope, loc) + index_var, index_var_typ = _make_index_var( + pass_states.typemap, scope, index_vars, body_block) + + # Read value. + value_var = ir.Var(scope, mk_unique_var("$value_var"), loc) + pass_states.typemap[value_var.name] = value_typ.dtype + getitem_call = ir.Expr.getitem(value, index_var, loc) + pass_states.calltypes[getitem_call] = signature( + value_typ.dtype, value_typ, index_var_typ) + body_block.body.append(ir.Assign(getitem_call, value_var, loc)) + + # Read target + target_var = ir.Var(scope, mk_unique_var("$target_var"), loc) + pass_states.typemap[target_var.name] = el_typ + getitem_call = ir.Expr.getitem(target, index_var, loc) + pass_states.calltypes[getitem_call] = signature( + el_typ, arr_typ, index_var_typ) + body_block.body.append(ir.Assign(getitem_call, target_var, loc)) + + # Create temp to hold result. + expr_out_var = ir.Var(scope, mk_unique_var("$expr_out_var"), loc) + pass_states.typemap[expr_out_var.name] = el_typ + + # Create binop and assign result to temporary. 
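+        # i.e. the parfor body computes ``target[index] op value[index]`` and
+        # the setitem below stores the result back into ``target[index]``.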
+ binop_expr = ir.Expr.binop(op, target_var, value_var, loc) + body_block.body.append(ir.Assign(binop_expr, expr_out_var, loc)) + unified_type = self.pass_states.typingctx.unify_pairs(el_typ, value_typ.dtype) + pass_states.calltypes[binop_expr] = signature( + unified_type, unified_type, unified_type) + + # Write to target + setitem_node = ir.SetItem(target, index_var, expr_out_var, loc) + pass_states.calltypes[setitem_node] = signature( + types.none, arr_typ, index_var_typ, el_typ) + body_block.body.append(setitem_node) + + parfor = Parfor(loopnests, init_block, {}, loc, index_var, equiv_set, + ('inplace_binop', ''), pass_states.flags) + parfor.loop_body = {body_label: body_block} + if config.DEBUG_ARRAY_OPT >= 1: + print("parfor from inplace_binop") + parfor.dump() + return parfor + + def _type_getitem(self, args): + fnty = operator.getitem + return self.pass_states.typingctx.resolve_function_type(fnty, tuple(args), {}) + + class ConvertSetItemPass: """Parfor subpass to convert setitem on Arrays """ @@ -1550,7 +1668,6 @@ def run(self, blocks): topo_order = find_topo_order(blocks) # variables available in the program so far (used for finding map # functions in array_expr lowering) - avail_vars = [] for label in topo_order: block = blocks[label] new_body = [] @@ -1916,14 +2033,14 @@ def _numpy_map_to_parfor(self, equiv_set, call_name, lhs, args, kws, expr): value_assign = ir.Assign(value, expr_out_var, loc) body_block.body.append(value_assign) - parfor = Parfor(loopnests, init_block, {}, loc, index_var, equiv_set, - ('{} function'.format(call_name,), 'NumPy mapping'), - pass_states.flags) - setitem_node = ir.SetItem(lhs, index_var, expr_out_var, loc) pass_states.calltypes[setitem_node] = signature( types.none, pass_states.typemap[lhs.name], index_var_typ, el_typ) body_block.body.append(setitem_node) + + parfor = Parfor(loopnests, init_block, {}, loc, index_var, equiv_set, + ('{} function'.format(call_name,), 'NumPy mapping'), + pass_states.flags) parfor.loop_body = {body_label: body_block} if config.DEBUG_ARRAY_OPT >= 1: print("generated parfor for numpy map:") @@ -2601,6 +2718,8 @@ def run(self): ConvertReducePass(self).run(self.func_ir.blocks) if self.options.prange: ConvertLoopPass(self).run(self.func_ir.blocks) + if self.options.inplace_binop: + ConvertInplaceBinop(self).run(self.func_ir.blocks) # setup diagnostics now parfors are found self.diagnostics.setup(self.func_ir, self.options.fusion) @@ -2627,6 +2746,7 @@ def run(self): up_direction=False) dprint_func_ir(self.func_ir, "after maximize fusion down") self.fuse_parfors(self.array_analysis, self.func_ir.blocks) + dprint_func_ir(self.func_ir, "after first fuse") # push non-parfors up maximize_fusion(self.func_ir, self.func_ir.blocks, self.typemap) dprint_func_ir(self.func_ir, "after maximize fusion up") @@ -2763,6 +2883,7 @@ def fuse_parfors(self, array_analysis, blocks): def fuse_recursive_parfor(self, parfor, equiv_set): blocks = wrap_parfor_blocks(parfor) maximize_fusion(self.func_ir, blocks, self.typemap) + dprint_func_ir(self.func_ir, "after recursive maximize fusion down", blocks) arr_analysis = array_analysis.ArrayAnalysis(self.typingctx, self.func_ir, self.typemap, self.calltypes) arr_analysis.run(blocks, equiv_set) @@ -3587,14 +3708,26 @@ def maximize_fusion(func_ir, blocks, typemap, up_direction=True): so they are adjacent. 
""" call_table, _ = get_call_table(blocks) - alias_map, arg_aliases = find_potential_aliases(blocks, func_ir.arg_names, typemap, func_ir) + alias_map, arg_aliases = find_potential_aliases( + blocks, + func_ir.arg_names, + typemap, + func_ir + ) for block in blocks.values(): order_changed = True while order_changed: - order_changed = maximize_fusion_inner(func_ir, block, call_table, - alias_map, up_direction) + order_changed = maximize_fusion_inner( + func_ir, + block, + call_table, + alias_map, + arg_aliases, + up_direction + ) -def maximize_fusion_inner(func_ir, block, call_table, alias_map, up_direction=True): +def maximize_fusion_inner(func_ir, block, call_table, alias_map, + arg_aliases, up_direction=True): order_changed = False i = 0 # i goes to body[-3] (i+1 to body[-2]) since body[-1] is terminator and @@ -3602,9 +3735,10 @@ def maximize_fusion_inner(func_ir, block, call_table, alias_map, up_direction=Tr while i < len(block.body) - 2: stmt = block.body[i] next_stmt = block.body[i+1] - can_reorder = (_can_reorder_stmts(stmt, next_stmt, func_ir, call_table, alias_map) + can_reorder = (_can_reorder_stmts(stmt, next_stmt, func_ir, + call_table, alias_map, arg_aliases) if up_direction else _can_reorder_stmts(next_stmt, stmt, - func_ir, call_table, alias_map)) + func_ir, call_table, alias_map, arg_aliases)) if can_reorder: block.body[i] = next_stmt block.body[i+1] = stmt @@ -3612,16 +3746,18 @@ def maximize_fusion_inner(func_ir, block, call_table, alias_map, up_direction=Tr i += 1 return order_changed -def expand_aliases(the_set, alias_map): +def expand_aliases(the_set, alias_map, arg_aliases): ret = set() for i in the_set: if i in alias_map: ret = ret.union(alias_map[i]) - else: - ret.add(i) + if i in arg_aliases: + ret = ret.union(arg_aliases) + ret.add(i) return ret -def _can_reorder_stmts(stmt, next_stmt, func_ir, call_table, alias_map): +def _can_reorder_stmts(stmt, next_stmt, func_ir, call_table, + alias_map, arg_aliases): """ Check dependencies to determine if a parfor can be reordered in the IR block with a non-parfor statement. @@ -3630,18 +3766,20 @@ def _can_reorder_stmts(stmt, next_stmt, func_ir, call_table, alias_map): # don't reorder calls with side effects (e.g. 
file close) # only read-read dependencies are OK # make sure there is no write-write, write-read dependencies - if (isinstance( - stmt, Parfor) and not isinstance( - next_stmt, Parfor) and not isinstance( - next_stmt, ir.Print) - and (not isinstance(next_stmt, ir.Assign) - or has_no_side_effect( - next_stmt.value, set(), call_table) + if (isinstance(stmt, Parfor) + and not isinstance(next_stmt, Parfor) + and not isinstance(next_stmt, ir.Print) + and (not isinstance(next_stmt, ir.Assign) + or has_no_side_effect(next_stmt.value, set(), call_table) or guard(is_assert_equiv, func_ir, next_stmt.value))): - stmt_accesses = expand_aliases({v.name for v in stmt.list_vars()}, alias_map) - stmt_writes = expand_aliases(get_parfor_writes(stmt), alias_map) - next_accesses = expand_aliases({v.name for v in next_stmt.list_vars()}, alias_map) - next_writes = expand_aliases(get_stmt_writes(next_stmt), alias_map) + stmt_accesses = expand_aliases({v.name for v in stmt.list_vars()}, + alias_map, arg_aliases) + stmt_writes = expand_aliases(get_parfor_writes(stmt), + alias_map, arg_aliases) + next_accesses = expand_aliases({v.name for v in next_stmt.list_vars()}, + alias_map, arg_aliases) + next_writes = expand_aliases(get_stmt_writes(next_stmt), + alias_map, arg_aliases) if len((stmt_writes & next_accesses) | (next_writes & stmt_accesses)) == 0: return True diff --git a/numba/pycc/compiler.py b/numba/pycc/compiler.py index bb7d2682143..4fea211261c 100644 --- a/numba/pycc/compiler.py +++ b/numba/pycc/compiler.py @@ -99,8 +99,8 @@ class _ModuleCompiler(object): lt._int8_star)) method_def_ptr = lc.Type.pointer(method_def_ty) - - env_def_ty = lc.Type.struct((lt._void_star, lt._int32)) + # The structure type constructed by PythonAPI.serialize_uncached() + env_def_ty = lc.Type.struct((lt._void_star, lt._int32, lt._void_star)) env_def_ptr = lc.Type.pointer(env_def_ty) def __init__(self, export_entries, module_name, use_nrt=False, diff --git a/numba/pycc/modulemixin.c b/numba/pycc/modulemixin.c index 1aa3dbb6657..e2bfeca63c2 100644 --- a/numba/pycc/modulemixin.c +++ b/numba/pycc/modulemixin.c @@ -59,6 +59,7 @@ extern void *nrt_atomic_add, *nrt_atomic_sub; typedef struct { const char *data; int len; + const char *hashbuf; } env_def_t; /* Environment GlobalVariable address type */ @@ -73,7 +74,7 @@ recreate_environment(PyObject *module, env_def_t env) EnvironmentObject *envobj; PyObject *env_consts; - env_consts = numba_unpickle(env.data, env.len); + env_consts = numba_unpickle(env.data, env.len, env.hashbuf); if (env_consts == NULL) return NULL; if (!PyList_Check(env_consts)) { diff --git a/numba/tests/test_comprehension.py b/numba/tests/test_comprehension.py index b269fd6460d..d0c4b402065 100644 --- a/numba/tests/test_comprehension.py +++ b/numba/tests/test_comprehension.py @@ -282,6 +282,16 @@ def comp_with_array_noinline(n): finally: ic.enable_inline_arraycall = True + def test_comp_with_array_noinline_issue_6053(self): + def comp_with_array_noinline(n): + lst = [0] + for i in range(n): + lst.append(i) + l = np.array(lst) + return l + + self.check(comp_with_array_noinline, 5, assert_allocate_list=True) + def test_comp_nest_with_array(self): def comp_nest_with_array(n): l = np.array([[i * j for j in range(n)] for i in range(n)]) diff --git a/numba/tests/test_deprecations.py b/numba/tests/test_deprecations.py index 3122b091db2..e7abdf51be4 100644 --- a/numba/tests/test_deprecations.py +++ b/numba/tests/test_deprecations.py @@ -11,7 +11,7 @@ def check_warning(self, warnings, expected_str, category): 
self.assertEqual(len(warnings), 1) self.assertEqual(warnings[0].category, category) self.assertIn(expected_str, str(warnings[0].message)) - self.assertIn("http://numba.pydata.org", str(warnings[0].message)) + self.assertIn("https://numba.pydata.org", str(warnings[0].message)) def test_jitfallback(self): # tests that @jit falling back to object mode raises a @@ -52,7 +52,7 @@ def foo_set(a): self.assertIn(msg, warn_msg) msg = ("\'reflected %s\' found for argument" % container) self.assertIn(msg, warn_msg) - self.assertIn("http://numba.pydata.org", warn_msg) + self.assertIn("https://numba.pydata.org", warn_msg) if __name__ == '__main__': diff --git a/numba/tests/test_dictobject.py b/numba/tests/test_dictobject.py index 2a139491a89..e4a3bdd5aed 100644 --- a/numba/tests/test_dictobject.py +++ b/numba/tests/test_dictobject.py @@ -1727,6 +1727,65 @@ def foo(): foo() + def test_mutation_not_carried_single_function(self): + # this is another pattern for using literally + + @njit + def nop(*args): + pass + + for fn, iv in (nop, None), (literally, {'a': 1, 'b': 2, 'c': 3}): + @njit + def baz(x): + pass + + def bar(z): + pass + + @overload(bar) + def ol_bar(z): + def impl(z): + fn(z) + baz(z) + return impl + + @njit + def foo(): + x = {'a': 1, 'b': 2, 'c': 3} + bar(x) + x['d'] = 4 + return x + + foo() + # baz should be specialised based on literally being invoked and + # the literal/unliteral arriving at the call site + larg = baz.signatures[0][0] + self.assertEqual(larg.initial_value, iv) + + def test_unify_across_function_call(self): + + @njit + def bar(x): + o = {1: 2} + if x: + o = {2: 3} + return o + + @njit + def foo(x): + if x: + d = {3: 4} + else: + d = bar(x) + return d + + e1 = Dict() + e1[3] = 4 + e2 = Dict() + e2[1] = 2 + self.assertEqual(foo(True), e1) + self.assertEqual(foo(False), e2) + class TestLiteralStrKeyDict(MemoryLeakMixin, TestCase): """ Tests for dictionaries with string keys that can map to anything!""" diff --git a/numba/tests/test_errorhandling.py b/numba/tests/test_errorhandling.py index fca546ccc47..83f06211b23 100644 --- a/numba/tests/test_errorhandling.py +++ b/numba/tests/test_errorhandling.py @@ -8,6 +8,7 @@ from numba import jit, njit, typed, int64, types from numba.core import errors import numba.core.typing.cffi_utils as cffi_support +from numba.experimental import structref from numba.extending import (overload, intrinsic, overload_method, overload_attribute) from numba.core.compiler import CompilerBase @@ -423,6 +424,25 @@ def foo(): excstr = str(raises.exception) self.assertIn("Type Restricted Function in function 'unknown'", excstr) + def test_missing_source(self): + + @structref.register + class ParticleType(types.StructRef): + pass + + class Particle(structref.StructRefProxy): + def __new__(cls, pos, mass): + return structref.StructRefProxy.__new__(cls, pos) + # didn't provide the required mass argument ----^ + + structref.define_proxy(Particle, ParticleType, ["pos", "mass"]) + + with self.assertRaises(errors.TypingError) as raises: + Particle(pos=1, mass=2) + + excstr = str(raises.exception) + self.assertIn("required positional argument: 'mass'", excstr) + class TestDeveloperSpecificErrorMessages(SerialMixin, unittest.TestCase): diff --git a/numba/tests/test_extending.py b/numba/tests/test_extending.py index 972fddd285d..6b4e0c4f47d 100644 --- a/numba/tests/test_extending.py +++ b/numba/tests/test_extending.py @@ -1770,5 +1770,100 @@ def foo(x): ) +class TestOverloadPreferLiteral(TestCase): + def test_overload(self): + def prefer_lit(x): + pass + + def 
non_lit(x): + pass + + def ov(x): + if isinstance(x, types.IntegerLiteral): + # With prefer_literal=False, this branch will not be reached. + if x.literal_value == 1: + def impl(x): + return 0xcafe + return impl + else: + raise errors.TypingError('literal value') + else: + def impl(x): + return x * 100 + return impl + + overload(prefer_lit, prefer_literal=True)(ov) + overload(non_lit)(ov) + + @njit + def check_prefer_lit(x): + return prefer_lit(1), prefer_lit(2), prefer_lit(x) + + a, b, c = check_prefer_lit(3) + self.assertEqual(a, 0xcafe) + self.assertEqual(b, 200) + self.assertEqual(c, 300) + + @njit + def check_non_lit(x): + return non_lit(1), non_lit(2), non_lit(x) + + a, b, c = check_non_lit(3) + self.assertEqual(a, 100) + self.assertEqual(b, 200) + self.assertEqual(c, 300) + + def test_overload_method(self): + def ov(self, x): + if isinstance(x, types.IntegerLiteral): + # With prefer_literal=False, this branch will not be reached. + if x.literal_value == 1: + def impl(self, x): + return 0xcafe + return impl + else: + raise errors.TypingError('literal value') + else: + def impl(self, x): + return x * 100 + return impl + + overload_method( + MyDummyType, "method_prefer_literal", + prefer_literal=True, + )(ov) + + overload_method( + MyDummyType, "method_non_literal", + prefer_literal=False, + )(ov) + + @njit + def check_prefer_lit(dummy, x): + return ( + dummy.method_prefer_literal(1), + dummy.method_prefer_literal(2), + dummy.method_prefer_literal(x), + ) + + a, b, c = check_prefer_lit(MyDummy(), 3) + self.assertEqual(a, 0xcafe) + self.assertEqual(b, 200) + self.assertEqual(c, 300) + + @njit + def check_non_lit(dummy, x): + return ( + dummy.method_non_literal(1), + dummy.method_non_literal(2), + dummy.method_non_literal(x), + ) + + a, b, c = check_non_lit(MyDummy(), 3) + self.assertEqual(a, 100) + self.assertEqual(b, 200) + self.assertEqual(c, 300) + + if __name__ == "__main__": unittest.main() diff --git a/numba/tests/test_lists.py b/numba/tests/test_lists.py index fcefb614bb6..0ae2bb366f6 100644 --- a/numba/tests/test_lists.py +++ b/numba/tests/test_lists.py @@ -1481,6 +1481,57 @@ def foo(): foo() + def test_mutation_not_carried_single_function(self): + # this is another pattern for using literally + + @njit + def nop(*args): + pass + + for fn, iv in (nop, None), (literally, [1, 2, 3]): + @njit + def baz(x): + pass + + def bar(z): + pass + + @overload(bar) + def ol_bar(z): + def impl(z): + fn(z) + baz(z) + return impl + + @njit + def foo(): + x = [1, 2, 3] + bar(x) + x.append(2) + return x + + foo() + # baz should be specialised based on literally being invoked and + # the literal/unliteral arriving at the call site + larg = baz.signatures[0][0] + self.assertEqual(larg.initial_value, iv) + + def test_list_of_list_ctor(self): + # see issue 6082 + @njit + def bar(x): + pass + + @njit + def foo(): + x = [[1, 2, 3, 4, 5], [1, 2, 3, 4, 6]] + bar(x) + + foo() + larg = bar.signatures[0][0] + self.assertEqual(larg.initial_value, None) + self.assertEqual(larg.dtype.initial_value, None) + class TestLiteralLists(MemoryLeakMixin, TestCase): diff --git a/numba/tests/test_parallel_backend.py b/numba/tests/test_parallel_backend.py index 34d99c0e933..d3e796e2914 100644 --- a/numba/tests/test_parallel_backend.py +++ b/numba/tests/test_parallel_backend.py @@ -862,33 +862,32 @@ class TestInitSafetyIssues(TestCase): _DEBUG = False + def run_cmd(self, cmdline): + popen = subprocess.Popen(cmdline, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE,) + # finish in _TEST_TIMEOUT seconds or kill it + timeout 
= threading.Timer(_TEST_TIMEOUT, popen.kill) + try: + timeout.start() + out, err = popen.communicate() + if popen.returncode != 0: + raise AssertionError( + "process failed with code %s: stderr follows\n%s\n" % + (popen.returncode, err.decode())) + finally: + timeout.cancel() + return out.decode(), err.decode() + @linux_only # only linux can leak semaphores def test_orphaned_semaphore(self): # sys path injection and separate usecase module to make sure everything # is importable by children of multiprocessing - def run_cmd(cmdline): - popen = subprocess.Popen(cmdline, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE,) - # finish in _TEST_TIMEOUT seconds or kill it - timeout = threading.Timer(_TEST_TIMEOUT, popen.kill) - try: - timeout.start() - out, err = popen.communicate() - if popen.returncode != 0: - raise AssertionError( - "process failed with code %s: stderr follows\n%s\n" % - (popen.returncode, err.decode())) - finally: - timeout.cancel() - return out.decode(), err.decode() - test_file = os.path.join(os.path.dirname(__file__), "orphaned_semaphore_usecase.py") - cmdline = [sys.executable, test_file] - out, err = run_cmd(cmdline) + out, err = self.run_cmd(cmdline) # assert no semaphore leaks reported on stderr self.assertNotIn("leaked semaphore", err) @@ -897,6 +896,27 @@ def run_cmd(cmdline): print("OUT:", out) print("ERR:", err) + def test_lazy_lock_init(self): + # checks based on https://github.com/numba/numba/pull/5724 + # looking for "lazy" process lock initialisation so as to avoid setting + # a multiprocessing context as part of import. + for meth in ('fork', 'spawn', 'forkserver'): + # if a context is available on the host check it can be set as the + # start method in a separate process + try: + multiprocessing.get_context(meth) + except ValueError: + continue + cmd = ("import numba; import multiprocessing;" + "multiprocessing.set_start_method('{}');" + "print(multiprocessing.get_context().get_start_method())") + cmdline = [sys.executable, "-c", cmd.format(meth)] + out, err = self.run_cmd(cmdline) + if self._DEBUG: + print("OUT:", out) + print("ERR:", err) + self.assertIn(meth, out) + @skip_parfors_unsupported @skip_no_omp diff --git a/numba/tests/test_parfors.py b/numba/tests/test_parfors.py index a4003686d5b..2869589a2e2 100644 --- a/numba/tests/test_parfors.py +++ b/numba/tests/test_parfors.py @@ -569,7 +569,7 @@ def test_blackscholes(self): def test_logistic_regression(self): args = (numba.float64[:], numba.float64[:,:], numba.float64[:], numba.int64) - self.assertTrue(countParfors(lr_impl, args) == 1) + self.assertTrue(countParfors(lr_impl, args) == 2) self.assertTrue(countArrayAllocs(lr_impl, args) == 1) @skip_parfors_unsupported @@ -1628,6 +1628,18 @@ def test_impl(a): x = np.arange(10) self.check(test_impl, x) + @skip_parfors_unsupported + def test_inplace_binop(self): + def test_impl(a, b): + b += a + return b + + X = np.arange(10) + 10 + Y = np.arange(10) + 100 + self.check(test_impl, X, Y) + self.assertTrue(countParfors(test_impl, + (types.Array(types.float64, 1, 'C'), + types.Array(types.float64, 1, 'C'))) == 1) class TestParforsLeaks(MemoryLeakMixin, TestParforsBase): def check(self, pyfunc, *args, **kwargs): @@ -3050,6 +3062,46 @@ def test_impl(a): self.check(test_impl, np.arange(3)) + @skip_parfors_unsupported + def test_issue5942_1(self): + # issue5942: tests statement reordering of + # aliased arguments. 
+ def test_impl(gg, gg_next): + gs = gg.shape + d = gs[0] + for i_gg in prange(d): + gg_next[i_gg, :] = gg[i_gg, :] + gg_next[i_gg, 0] += 1 + + return gg_next + + d = 4 + k = 2 + + gg = np.zeros((d, k), dtype = np.int32) + gg_next = np.zeros((d, k), dtype = np.int32) + self.check(test_impl, gg, gg_next) + + @skip_parfors_unsupported + def test_issue5942_2(self): + # issue5942: tests statement reordering + def test_impl(d, k): + gg = np.zeros((d, k), dtype = np.int32) + gg_next = np.zeros((d, k), dtype = np.int32) + + for i_gg in prange(d): + for n in range(k): + gg[i_gg, n] = i_gg + gg_next[i_gg, :] = gg[i_gg, :] + gg_next[i_gg, 0] += 1 + + return gg_next + + d = 4 + k = 2 + + self.check(test_impl, d, k) + class TestParforsOptions(TestParforsBase): diff --git a/numba/tests/test_serialize.py b/numba/tests/test_serialize.py index f345eda65ca..2bcf843458a 100644 --- a/numba/tests/test_serialize.py +++ b/numba/tests/test_serialize.py @@ -186,5 +186,23 @@ def foo(x): """ subprocess.check_call([sys.executable, "-c", code]) + +class TestSerializationMisc(TestCase): + def test_numba_unpickle(self): + # Test that _numba_unpickle is memorizing its output + from numba.core.serialize import _numba_unpickle + + random_obj = object() + bytebuf = pickle.dumps(random_obj) + hashed = hash(random_obj) + + got1 = _numba_unpickle(id(random_obj), bytebuf, hashed) + # not the original object + self.assertIsNot(got1, random_obj) + got2 = _numba_unpickle(id(random_obj), bytebuf, hashed) + # unpickled results are the same objects + self.assertIs(got1, got2) + + if __name__ == '__main__': unittest.main() diff --git a/numba/tests/test_stencils.py b/numba/tests/test_stencils.py index 261d6581e54..6a10523fc70 100644 --- a/numba/tests/test_stencils.py +++ b/numba/tests/test_stencils.py @@ -1260,7 +1260,7 @@ def computebound(mins, maxs): ast.copy_location(returner, node) add_kwarg = [ast.arg('neighborhood', None)] - defaults = [ast.Name(id='None', ctx=ast.Load())] + defaults = [] newargs = ast.arguments( args=node.args.args + diff --git a/numba/tests/test_svml.py b/numba/tests/test_svml.py index 459fdbbc653..cebac432c19 100644 --- a/numba/tests/test_svml.py +++ b/numba/tests/test_svml.py @@ -19,6 +19,9 @@ # a map of float64 vector lenghs with corresponding CPU architecture vlen2cpu = {2: 'nehalem', 4: 'haswell', 8: 'skylake-avx512'} +# force LLVM to use AVX512 registers for vectorization +# https://reviews.llvm.org/D67259 +vlen2cpu_features = {2: '', 4: '', 8: '-prefer-256-bit'} # K: SVML functions, V: python functions which are expected to be SIMD-vectorized # using SVML, explicit references to Python functions here are mostly for sake of @@ -182,7 +185,7 @@ class TestSVMLGeneration(TestCase): def _inject_test(cls, dtype, mode, vlen, flags): # unsupported combinations if dtype.startswith('complex') and mode != 'numpy': - return + return # TODO: address skipped tests below skipped = dtype.startswith('int') and vlen == 2 args = (dtype, mode, vlen, flags) @@ -192,7 +195,7 @@ def test_template(self): fn, contains, avoids = combo_svml_usecase(*args) # look for specific patters in the asm for a given target with override_env_config('NUMBA_CPU_NAME', vlen2cpu[vlen]), \ - override_env_config('NUMBA_CPU_FEATURES', ''): + override_env_config('NUMBA_CPU_FEATURES', vlen2cpu_features[vlen]): # recompile for overridden CPU try: jit = compile_isolated(fn, (numba.int64, ), flags=flags) @@ -302,6 +305,9 @@ def check(self, pyfunc, *args, **kwargs): std_pattern = kwargs.pop('std_pattern', None) fast_pattern = kwargs.pop('fast_pattern', 
None) cpu_name = kwargs.pop('cpu_name', 'skylake-avx512') + # force LLVM to use AVX512 registers for vectorization + # https://reviews.llvm.org/D67259 + cpu_features = kwargs.pop('cpu_features', '-prefer-256-bit') # python result py_expected = pyfunc(*self.copy_args(*args)) @@ -318,7 +324,7 @@ def check(self, pyfunc, *args, **kwargs): # look for specific patters in the asm for a given target with override_env_config('NUMBA_CPU_NAME', cpu_name), \ - override_env_config('NUMBA_CPU_FEATURES', ''): + override_env_config('NUMBA_CPU_FEATURES', cpu_features): # recompile for overridden CPU jitstd, jitfast = self.compile(pyfunc, *args) if std_pattern: diff --git a/numba/tests/test_tuples.py b/numba/tests/test_tuples.py index dfd93ad0e2a..f0ae01167c2 100644 --- a/numba/tests/test_tuples.py +++ b/numba/tests/test_tuples.py @@ -4,7 +4,7 @@ import numpy as np from numba.core.compiler import compile_isolated -from numba import njit, jit, typeof +from numba import njit, jit, typeof, literally from numba.core import types, errors, utils from numba.tests.support import TestCase, MemoryLeakMixin, tag import unittest @@ -218,6 +218,17 @@ def test_len(self): [types.UniTuple(types.int64, 3)]) self.assertPreciseEqual(cr.entry_point((4, 5, 6)), 3) + def test_index_literal(self): + # issue #6023, test non-static getitem with IntegerLiteral index + def pyfunc(tup, idx): + idx = literally(idx) + return tup[idx] + cfunc = njit(pyfunc) + + tup = (4, 3.1, 'sss') + for i in range(len(tup)): + self.assertPreciseEqual(cfunc(tup, i), tup[i]) + def test_index(self): pyfunc = tuple_index cr = compile_isolated(pyfunc, diff --git a/setup.py b/setup.py index 485bf2c4916..1d0bacf6640 100644 --- a/setup.py +++ b/setup.py @@ -1,10 +1,11 @@ -from setuptools import setup, Extension, find_packages -from distutils.command import build -from distutils.spawn import spawn -from distutils import sysconfig -import sys import os import platform +import sys +from distutils import sysconfig +from distutils.command import build +from distutils.spawn import spawn + +from setuptools import Extension, find_packages, setup import versioneer @@ -174,11 +175,6 @@ def check_file_at_path(path2file): found = p # the latest is used return found - # Search for Intel TBB, first check env var TBBROOT then conda locations - tbb_root = os.getenv('TBBROOT') - if not tbb_root: - tbb_root = check_file_at_path(['include', 'tbb', 'tbb.h']) - # Set various flags for use in TBB and openmp. On OSX, also find OpenMP! 
have_openmp = True if sys.platform.startswith('win'): @@ -205,33 +201,42 @@ def check_file_at_path(path2file): else: omplinkflags = ['-fopenmp'] - if tbb_root: - print("Using Intel TBB from:", tbb_root) - ext_np_ufunc_tbb_backend = Extension( - name='numba.np.ufunc.tbbpool', - sources=[ - 'numba/np/ufunc/tbbpool.cpp', - 'numba/np/ufunc/gufunc_scheduler.cpp', - ], - depends=['numba/np/ufunc/workqueue.h'], - include_dirs=[os.path.join(tbb_root, 'include')], - extra_compile_args=cpp11flags, - libraries=['tbb'], # TODO: if --debug or -g, use 'tbb_debug' - library_dirs=[ - # for Linux - os.path.join(tbb_root, 'lib', 'intel64', 'gcc4.4'), - # for MacOS - os.path.join(tbb_root, 'lib'), - # for Windows - os.path.join(tbb_root, 'lib', 'intel64', 'vc_mt'), - ], - ) - ext_np_ufunc_backends.append(ext_np_ufunc_tbb_backend) + # Disable tbb if forced by user with NUMBA_DISABLE_TBB=1 + if os.getenv("NUMBA_DISABLE_TBB"): + print("TBB disabled") else: - print("TBB not found") + # Search for Intel TBB, first check env var TBBROOT then conda locations + tbb_root = os.getenv('TBBROOT') + if not tbb_root: + tbb_root = check_file_at_path(['include', 'tbb', 'tbb.h']) + + if tbb_root: + print("Using Intel TBB from:", tbb_root) + ext_np_ufunc_tbb_backend = Extension( + name='numba.np.ufunc.tbbpool', + sources=[ + 'numba/np/ufunc/tbbpool.cpp', + 'numba/np/ufunc/gufunc_scheduler.cpp', + ], + depends=['numba/np/ufunc/workqueue.h'], + include_dirs=[os.path.join(tbb_root, 'include')], + extra_compile_args=cpp11flags, + libraries=['tbb'], # TODO: if --debug or -g, use 'tbb_debug' + library_dirs=[ + # for Linux + os.path.join(tbb_root, 'lib', 'intel64', 'gcc4.4'), + # for MacOS + os.path.join(tbb_root, 'lib'), + # for Windows + os.path.join(tbb_root, 'lib', 'intel64', 'vc_mt'), + ], + ) + ext_np_ufunc_backends.append(ext_np_ufunc_tbb_backend) + else: + print("TBB not found") - # Disable OpenMP if forced by user with NUMBA_NO_OPENMP=1 - if os.getenv('NUMBA_NO_OPENMP'): + # Disable OpenMP if forced by user with NUMBA_DISABLE_OPENMP=1 + if os.getenv('NUMBA_DISABLE_OPENMP'): print("OpenMP disabled") elif have_openmp: print("Using OpenMP from:", have_openmp) @@ -293,10 +298,10 @@ def check_file_at_path(path2file): packages = find_packages(include=["numba", "numba.*"]) -build_requires = [f'numpy >={min_numpy_build_version}'] +build_requires = ['numpy >={}'.format(min_numpy_build_version)] install_requires = [ - f'llvmlite >={min_llvmlite_version},<{max_llvmlite_version}', - f'numpy >={min_numpy_run_version}', + 'llvmlite >={},<{}'.format(min_llvmlite_version, max_llvmlite_version), + 'numpy >={}'.format(min_numpy_run_version), 'setuptools', ] @@ -333,11 +338,11 @@ def check_file_at_path(path2file): scripts=["numba/pycc/pycc", "bin/numba"], author="Anaconda, Inc.", author_email="numba-users@continuum.io", - url="http://numba.github.com", + url="https://numba.github.com", packages=packages, setup_requires=build_requires, install_requires=install_requires, - python_requires=f">={min_python_version}", + python_requires=">={}".format(min_python_version), license="BSD", cmdclass=cmdclass, )
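
A minimal sketch of the threading-backend gating that the setup.py hunk above introduces, for reference when building from source. Only the environment variable names (``NUMBA_DISABLE_TBB``, ``TBBROOT``, ``NUMBA_DISABLE_OPENMP``) and the printed messages come from the diff; the helper names below are illustrative and the ``Extension`` construction is elided:

    import os

    def want_tbb_backend():
        # Illustrative helper: users can opt out of building the TBB
        # threading-layer backend entirely via NUMBA_DISABLE_TBB=1.
        if os.getenv("NUMBA_DISABLE_TBB"):
            print("TBB disabled")
            return None
        # Otherwise look for TBB via TBBROOT first; the real setup.py also
        # falls back to probing conda-style locations via check_file_at_path().
        tbb_root = os.getenv("TBBROOT")
        if tbb_root:
            print("Using Intel TBB from:", tbb_root)
        else:
            print("TBB not found")
        return tbb_root

    def want_openmp_backend(have_openmp):
        # Illustrative helper: NUMBA_NO_OPENMP was renamed to
        # NUMBA_DISABLE_OPENMP in this release.
        if os.getenv("NUMBA_DISABLE_OPENMP"):
            print("OpenMP disabled")
            return False
        return bool(have_openmp)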