Skip to content

Enable offloading for numba.njit in dpctl.device_context #630

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Nov 25, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,22 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

## [0.17.3] - 2021-11-xx

### Fixed
* Enable offloading for `numba.njit` in `dpctl.device_context` (#630)
* Fix upload conditions for main and release branches (#610)

## [0.17.2] - 2021-11-15

### Changes
### Changed
* Use llvm-spirv from bin-llvm during build for Linux and Windows (#626, #627)

## [0.17.1] - 2021-11-10

### Changes
### Changed
* Update clang to icx (#622)

## [0.17.0] - 2021-11-03
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def div_kernel(dst, src, m):
dst[i] = src[i] // m

import dpctl
with numba_dppy.offload_to_sycl_device(dpctl.SyclQueue()):
with dpctl.device_context(dpctl.SyclQueue()):
X = np.arange(10)
Y = np.arange(10)
div_kernel[10, numba_dppy.DEFAULT_LOCAL_SIZE](Y, X, 5)
Expand Down
4 changes: 2 additions & 2 deletions conda-recipe/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,13 @@ requirements:
- setuptools
- cython
- numba 0.54*|0.55*
- dpctl 0.10*|0.11*
- dpctl 0.11*
- dpnp 0.8*|0.9* # [linux]
- wheel
run:
- python
- numba 0.54*|0.55*
- dpctl 0.10*|0.11*
- dpctl 0.11*
- spirv-tools
- llvm-spirv 11.*
- dpnp 0.8*|0.9* # [linux]
Expand Down
2 changes: 1 addition & 1 deletion numba_dppy/examples/atomic_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def atomic_add(a):
print("Using device ...")
device.print_device_info()

with dppy.offload_to_sycl_device(device):
with dpctl.device_context(device):
atomic_add[global_size, dppy.DEFAULT_LOCAL_SIZE](a)

# Expected 100, because global_size = 100
Expand Down
2 changes: 1 addition & 1 deletion numba_dppy/examples/auto_offload_examples/sum-1d.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def main():
print("Using device ...")
device.print_device_info()

with dppy.offload_to_sycl_device(device):
with dpctl.device_context(device):
c = f1(a, b)

print("RESULT c:", c, hex(c.ctypes.data))
Expand Down
2 changes: 1 addition & 1 deletion numba_dppy/examples/auto_offload_examples/sum-2d.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def main():
print("Using device ...")
device.print_device_info()

with dppy.offload_to_sycl_device(device):
with dpctl.device_context(device):
c = f1(a, b)

print("c:", c, hex(c.ctypes.data))
Expand Down
2 changes: 1 addition & 1 deletion numba_dppy/examples/auto_offload_examples/sum-3d.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def main():
print("Using device ...")
device.print_device_info()

with dppy.offload_to_sycl_device(device):
with dpctl.device_context(device):
c = f1(a, b)

print("c:", c, hex(c.ctypes.data))
Expand Down
2 changes: 1 addition & 1 deletion numba_dppy/examples/auto_offload_examples/sum-4d.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def main():
print("Using device ...")
device.print_device_info()

with dppy.offload_to_sycl_device(device):
with dpctl.device_context(device):
c = f1(a, b)

print("c:", c, hex(c.ctypes.data))
Expand Down
2 changes: 1 addition & 1 deletion numba_dppy/examples/auto_offload_examples/sum-5d.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def main():
print("Using device ...")
device.print_device_info()

with dppy.offload_to_sycl_device(device):
with dpctl.device_context(device):
c = f1(a, b)

print("c:", c, hex(c.ctypes.data))
Expand Down
4 changes: 2 additions & 2 deletions numba_dppy/examples/barrier.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def twice(A):
print("Using device ...")
device.print_device_info()

with dppy.offload_to_sycl_device(device):
with dpctl.device_context(device):
twice[N, dppy.DEFAULT_LOCAL_SIZE](arr)

# the output should be `arr * 2`, i.e. [0, 2, 4, 6, ...]
Expand Down Expand Up @@ -80,7 +80,7 @@ def reverse_array(A):
print("Using device ...")
device.print_device_info()

with dppy.offload_to_sycl_device(device):
with dpctl.device_context(device):
reverse_array[blocksize, dppy.DEFAULT_LOCAL_SIZE](arr)

# the output should be `orig[::-1] + orig`, i.e. [9, 9, 9, ...]
Expand Down
2 changes: 1 addition & 1 deletion numba_dppy/examples/blacksholes_kernel.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def main():
print("Using device ...")
device.print_device_info()

with dppy.offload_to_sycl_device(device):
with dpctl.device_context(device):
for i in range(iterations):
black_scholes_dppy[blockdim, griddim](
callResult,
Expand Down
2 changes: 1 addition & 1 deletion numba_dppy/examples/blacksholes_njit.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def main():
print("Using device ...")
device.print_device_info()

with dppy.offload_to_sycl_device(device):
with dpctl.device_context(device):
run(iter)

print("Done...")
Expand Down
2 changes: 1 addition & 1 deletion numba_dppy/examples/debug/dppy_func.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def main():
print("Using device ...")
device.print_device_info()

with dppy.offload_to_sycl_device(device):
with dpctl.device_context(device):
driver(a, b, c, global_size)

print("Done...")
Expand Down
2 changes: 1 addition & 1 deletion numba_dppy/examples/debug/dppy_numba_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def main():

if args.api == "numba-dppy":
device = dpctl.select_default_device()
with dppy.offload_to_sycl_device(device):
with dpctl.device_context(device):
dppy_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c)
else:
numba_func_driver(a, b, c)
Expand Down
2 changes: 1 addition & 1 deletion numba_dppy/examples/debug/simple_dppy_func.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def kernel_sum(a_in_kernel, b_in_kernel, c_in_kernel):
c = np.empty_like(a)

device = dpctl.SyclDevice("opencl:gpu")
with dppy.offload_to_sycl_device(device):
with dpctl.device_context(device):
kernel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c)

print("Done...")
2 changes: 1 addition & 1 deletion numba_dppy/examples/debug/simple_sum.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def data_parallel_sum(a, b, c):
c = np.ones_like(a)

device = dpctl.SyclDevice("opencl:gpu")
with dppy.offload_to_sycl_device(device):
with dpctl.device_context(device):
data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c)

print("Done...")
2 changes: 1 addition & 1 deletion numba_dppy/examples/debug/sum.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def main():
print("Using device ...")
device.print_device_info()

with dppy.offload_to_sycl_device(device):
with dpctl.device_context(device):
driver(a, b, c, global_size)

print("Done...")
Expand Down
2 changes: 1 addition & 1 deletion numba_dppy/examples/debug/sum_local_vars.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def data_parallel_sum(a, b, c):
c = np.ones_like(a)

device = dpctl.SyclDevice("opencl:gpu")
with dppy.offload_to_sycl_device(device):
with dpctl.device_context(device):
data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c)

print("Done...")
2 changes: 1 addition & 1 deletion numba_dppy/examples/debug/sum_local_vars_revive.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def data_parallel_sum(a, b, c):
c = np.ones_like(a)

device = dpctl.SyclDevice("opencl:gpu")
with dppy.offload_to_sycl_device(device):
with dpctl.device_context(device):
data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c)

print("Done...")
2 changes: 1 addition & 1 deletion numba_dppy/examples/dppy_func.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def main():
print("Using device ...")
device.print_device_info()

with dppy.offload_to_sycl_device(device):
with dpctl.device_context(device):
driver(a, b, N)

print("Done...")
Expand Down
6 changes: 2 additions & 4 deletions numba_dppy/examples/dppy_with_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
numba. The optimizer automatically detects data-parallel code
regions in a numba.jit function and then offloads the data-parallel
regions to a SYCL device. The optimizer is triggered when a numba.jit
function is invoked inside ``numba_dppy.offload_to_sycl_device`` scope.
function is invoked inside ``dpctl.device_context`` scope.

This example demonstrates the usage of numba_dppy's automatic offload
functionality. Note that numba_dppy should be installed in your
Expand All @@ -28,8 +28,6 @@
import numpy as np
from numba import njit, prange

import numba_dppy as dppy


@njit
def add_two_arrays(b, c):
Expand All @@ -51,7 +49,7 @@ def main():
print("Using device ...")
device.print_device_info()

with dppy.offload_to_sycl_device(device):
with dpctl.device_context(device):
result = add_two_arrays(b, c)

print("Result :", result)
Expand Down
2 changes: 1 addition & 1 deletion numba_dppy/examples/matmul.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def main():
print("Using device ...")
device.print_device_info()

with dppy.offload_to_sycl_device(device):
with dpctl.device_context(device):
driver(a, b, c)

# Host compute using standard NumPy
Expand Down
2 changes: 1 addition & 1 deletion numba_dppy/examples/pairwise_distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def main():
print("Using device ...")
device.print_device_info()

with dppy.offload_to_sycl_device(device):
with dpctl.device_context(device):
times = driver()

times = np.asarray(times, dtype=np.float32)
Expand Down
6 changes: 2 additions & 4 deletions numba_dppy/examples/rand.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,13 @@
a JIT function on a SYCL device using dpnp
(https://github.com/IntelPython/dpnp). As with the rest of numba_dppy examples,
this feature is also available by simply invoking a ``numba.jit`` function with
the numpy.random calls from within a ``numba_dppy.offload_to_sycl_device``scope.
the numpy.random calls from within a ``dpctl.device_context`` scope.
"""

import dpctl
import numba
import numpy as np

import numba_dppy as dppy


@numba.njit
def rand():
Expand Down Expand Up @@ -57,7 +55,7 @@ def main():
print("Using device ...")
device.print_device_info()

with dppy.offload_to_sycl_device(device):
with dpctl.device_context(device):
result = rand()
# Random values in a given shape (3, 2)
print(result)
Expand Down
2 changes: 1 addition & 1 deletion numba_dppy/examples/sum.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def main():
print("Using device ...")
device.print_device_info()

with dppy.offload_to_sycl_device(device):
with dpctl.device_context(device):
driver(a, b, c, global_size)

print("Done...")
Expand Down
2 changes: 1 addition & 1 deletion numba_dppy/examples/sum2D.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def main():
print("Using device ...")
device.print_device_info()

with dppy.offload_to_sycl_device(device):
with dpctl.device_context(device):
driver(a, b, c, global_size)

print(c)
Expand Down
2 changes: 1 addition & 1 deletion numba_dppy/examples/sum_ndarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def main():
print("Using device ...")
device.print_device_info()

with dppy.offload_to_sycl_device(device):
with dpctl.device_context(device):
print("before A: ", a)
print("before B: ", b)
data_parallel_sum[global_size, local_size](a, b, c)
Expand Down
2 changes: 1 addition & 1 deletion numba_dppy/examples/sum_reduction.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def sum_reduce(A):
print("Using device ...")
device.print_device_info()

with dppy.offload_to_sycl_device(device):
with dpctl.device_context(device):
while total > 1:
global_size = total // 2
sum_reduction_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](
Expand Down
2 changes: 1 addition & 1 deletion numba_dppy/examples/sum_reduction_ocl.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def sum_reduce(A):
print("Using device ...")
device.print_device_info()

with dppy.offload_to_sycl_device(device):
with dpctl.device_context(device):
sum_reduction_kernel[global_size, work_group_size](A, partial_sums)

final_sum = 0
Expand Down
2 changes: 1 addition & 1 deletion numba_dppy/examples/sum_reduction_recursive_ocl.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def sum_reduce(A):
print("Using device ...")
device.print_device_info()

with dppy.offload_to_sycl_device(device):
with dpctl.device_context(device):
inp_buf = dpctl_mem.MemoryUSMShared(A.size * A.dtype.itemsize)
inp_ndarray = np.ndarray(A.shape, buffer=inp_buf, dtype=A.dtype)
np.copyto(inp_ndarray, A)
Expand Down
2 changes: 1 addition & 1 deletion numba_dppy/examples/usm_ndarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def main():
print("Using device ...")
device.print_device_info()

with dppy.offload_to_sycl_device(device):
with dpctl.device_context(device):
da = dpt.usm_ndarray(a.shape, dtype=a.dtype, buffer="shared")
da.usm_data.copy_from_host(a.reshape((-1)).view("|u1"))

Expand Down
2 changes: 1 addition & 1 deletion numba_dppy/examples/vectorize.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def test_njit():
print("Using device ...")
device.print_device_info()

with dppy.offload_to_sycl_device(device):
with dpctl.device_context(device):
C = ufunc_kernel(A, B)

print(C)
Expand Down
35 changes: 24 additions & 11 deletions numba_dppy/retarget.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,17 +36,30 @@ def compile_retarget(self, cpu_disp):
return kernel


first_level_cache = dict()
_first_level_cache = dict()


@contextmanager
def offload_to_sycl_device(dpctl_device):
with dpctl.device_context(dpctl_device) as sycl_queue:
filter_string = sycl_queue.sycl_device.filter_string
retarget = first_level_cache.get(filter_string, None)
def _retarget(sycl_queue):
filter_string = sycl_queue.sycl_device.filter_string

if retarget is None:
retarget = DPPYRetarget(filter_string)
first_level_cache[filter_string] = retarget
with TargetConfig.switch_target(retarget):
yield sycl_queue
result = _first_level_cache.get(filter_string)

if not result:
result = DPPYRetarget(filter_string)
_first_level_cache[filter_string] = result

return result


def _retarget_context_manager(sycl_queue):
"""Return context manager for retargeting njit offloading."""
retarget = _retarget(sycl_queue)
return TargetConfig.switch_target(retarget)


def _register_context_factory():
dpctl.nested_context_factories.append(_retarget_context_manager)


_register_context_factory()
offload_to_sycl_device = dpctl.device_context
Loading