diff --git a/.flake8 b/.flake8 index 44679ac08e..0362a55380 100644 --- a/.flake8 +++ b/.flake8 @@ -29,7 +29,6 @@ per-file-ignores = dpctl/tensor/numpy_usm_shared.py: F821 dpctl/tests/_cython_api.pyx: E999, E225, E227, E402 dpctl/utils/_compute_follows_data.pyx: E999, E225, E227 - examples/cython/sycl_buffer/_buffer_example.pyx: E999, E225, E402 - examples/cython/sycl_direct_linkage/_buffer_example.pyx: E999, E225, E402 - examples/cython/usm_memory/blackscholes.pyx: E999, E225, E226, E402 + examples/cython/sycl_buffer/syclbuffer/_buffer_example.pyx: E999, E225, E402 + examples/cython/usm_memory/blackscholes/blackscholes.pyx: E999, E225, E226, E402 examples/cython/use_dpctl_sycl/use_dpctl_sycl/_cython_api.pyx: E999, E225, E226, E402 diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml index 7bbc5fffd1..b41795131c 100644 --- a/.github/workflows/conda-package.yml +++ b/.github/workflows/conda-package.yml @@ -423,7 +423,7 @@ jobs: CHANNELS="-c $GITHUB_WORKSPACE/channel -c dppy/label/dev -c intel -c main --override-channels" export PACKAGE_VERSION=$(python -c "${VER_SCRIPT1} ${VER_SCRIPT2}") conda install -n examples -y ${CHANNELS} dpctl=${PACKAGE_VERSION} dpnp">=0.10.1" || exit 1 - - name: Build and run examples with native extensions + - name: Build and run examples of pybind11 extensions shell: bash -l {0} run: | source $CONDA/etc/profile.d/conda.sh @@ -431,23 +431,17 @@ jobs: conda activate examples conda list cd examples/pybind11 - for d in $(ls) + for d in $(find . -maxdepth 1 -type d -not -path ".") do pushd $d export MKLROOT=${CONDA_PREFIX} export TBBROOT=${CONDA_PREFIX} conda activate --stack build_env - if [ -e CMakeLists.txt ] - then - CC=icx CXX=icpx python setup.py build_ext --inplace -G Ninja -- \ - -DTBB_LIBRARY_DIR=${TBBROOT}/lib \ - -DMKL_LIBRARY_DIR=${MKLROOT}/lib \ - -DMKL_INCLUDE_DIR=${MKLROOT}/include \ - -DTBB_INCLUDE_DIR=${TBBROOT}/include || exit 1 - else - CC=icx CXX=icpx CFLAGS="-fsycl" LDSHARED="dpcpp -shared" \ - python setup.py build_ext --inplace || exit 1 - fi + CC=icx CXX=icpx python setup.py build_ext --inplace -G Ninja -- \ + -DTBB_LIBRARY_DIR=${TBBROOT}/lib \ + -DMKL_LIBRARY_DIR=${MKLROOT}/lib \ + -DMKL_INCLUDE_DIR=${MKLROOT}/include \ + -DTBB_INCLUDE_DIR=${TBBROOT}/include || exit 1 conda deactivate if [ -e tests ] then @@ -457,24 +451,32 @@ jobs: fi popd done - cd ../cython - for d in $(ls) + - name: Build and run examples of Cython extensions + shell: bash -l {0} + run: | + source $CONDA/etc/profile.d/conda.sh + export OCL_ICD_FILENAMES=libintelocl.so + conda activate examples + conda list + cd examples/cython + for d in $(find . -maxdepth 1 -type d -not -path ".") do pushd $d conda activate --stack build_env - CC=dpcpp CXX=dpcpp LDSHARED="dpcpp -shared" \ - python setup.py build_ext --inplace || exit 1 + python setup.py build_ext --inplace || exit 1 conda deactivate - if [ -e tests ] - then - LD_LIBRARY_PATH=${CONDA_PREFIX}/lib python -m pytest tests || exit 1 - else - LD_LIBRARY_PATH=${CONDA_PREFIX}/lib python run.py || exit 1 - fi + python -m pytest tests || exit 1 popd done - cd ../c - for d in $(ls) + - name: Build and run examples of C-extensions + shell: bash -l {0} + run: | + source $CONDA/etc/profile.d/conda.sh + export OCL_ICD_FILENAMES=libintelocl.so + conda activate examples + conda list + cd examples/c + for d in $(find . -maxdepth 1 -type d -not -path ".") do pushd $d conda activate --stack build_env diff --git a/README.md b/README.md index cb84c3d362..48140b912a 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ The compute kernel can be a code: * that is part of a library, such as oneMKL The `dpctl` library is built upon the [SYCL -standard](https://www.khronos.org/sycl/). It also implements Python +standard](https://www.khronos.org/sycl/). It implements Python bindings for a subset of the standard [runtime classes](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#_sycl_runtime_classes) that allow users to: * query platforms @@ -57,7 +57,7 @@ To get the library from the latest oneAPI release, follow the instructions from Intel(R) [oneAPI installation guide](https://www.intel.com/content/www/us/en/developer/articles/guide/installation-guide-for-oneapi-toolkits.html). -> **NOTE:** You need to install the Intel(R) oneAPI Basekit to get +> **NOTE:** You need to install the Intel(R) oneAPI AI Analytics Tookit to get >IDP and `dpctl`. @@ -85,7 +85,7 @@ To try out the current master, install it from our development channel on Anaconda cloud: ```bash -conda install dpctl -c dppy\label\dev +conda install dpctl -c dppy/label/dev ``` # Building @@ -93,34 +93,30 @@ conda install dpctl -c dppy\label\dev Refer to our [Documentation](https://intelpython.github.io/dpctl) for more information on setting up a development environment and building `dpctl` from the source. -# Running Examples -Find our examples [here](examples). +# Examples -To run these examples, use: +Our examples are located in the [examples/](examples) folder and are organized in sub-folders. Examples +in the [Python/](examples/python) folder demonstrate how to inspect the heterogeneous platform, +select a device, create an execution queue, and how to control device memory allocation and +execution placement. -```bash -for script in `ls examples/python/`; - do echo "executing ${script}"; - python examples/python/${script}; -done -``` - -## Cython extensions -See examples of building Cython extensions with DPC++ compiler that interoperates -with `dpctl` in the [cython folder](examples\cython). +Examples in [Cython/](examples/cython), [C/](examples/c), and [Pybind11](examples/pybind11) folders +demonstrate creation of SYCL-powered native Python extensions. Please refer to each folder's README +document for directions on how to build and use each example. -To build these examples, run: -```bash -CC=icx CXX=dpcpp python setup.py build_ext --inplace -``` -To execute extensions, refer to the `run.py` script in each folder. # Running Tests -Tests are located [here](dpctl/tests). +Tests are located in folder [dpctl/tests](dpctl/tests). To run the tests, use: ```bash pytest --pyargs dpctl ``` + +Running full test suite requires working C++ compiler. To run the test suite without one, use: + +```bash +pytest --pyargs dpctl -k "not test_cython_api" +``` diff --git a/examples/cython/sycl_direct_linkage/run.py b/dpctl/tensor/__init__.pxd similarity index 68% rename from examples/cython/sycl_direct_linkage/run.py rename to dpctl/tensor/__init__.pxd index 831f01680e..363e58620d 100644 --- a/examples/cython/sycl_direct_linkage/run.py +++ b/dpctl/tensor/__init__.pxd @@ -1,6 +1,6 @@ # Data Parallel Control (dpctl) # -# Copyright 2020-2022 Intel Corporation +# Copyright 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,13 +14,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np -import syclbuffer_naive as sb +""" This file declares the extension types and functions for the Cython API + implemented in _usmarray.pyx file. +""" -X = np.random.randn(20, 10) +# distutils: language = c++ +# cython: language_level=3 -# compute column-wise total with NumPy's own host code -print(X.sum(axis=0)) - -# compute column-wise total with SYCL extension -print(sb.columnwise_total(X)) +from dpctl.tensor._usmarray cimport * diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 0000000000..b5cd2033ab --- /dev/null +++ b/examples/README.md @@ -0,0 +1,48 @@ +# Examples of using `dpctl` + +The `dpctl` is a foundational package facilitating use of [SYCL](sycl) to extend Python's reach to heterogeneous systems. + +## Python + +The `dpctl` provides Python API to SYCL runtime permitting user to +inspect the heterogeneous [platform], +[select](device_seelection) amongst available devices, +query [properties](device_descriptors) of created devices, +and construct [queues] to specify execution placement of offloaded computation. + +Additionally, `dpctl.tensor` submodule allows to create ND-arrays on devices and manipulate them using `dpctl.tensor` library of array computation operations specified in [Python Array API standard](array_api). + +Examples of this functionality are located in the [python](python) folder. + +## Cython + +The `dpctl` integrates with [Cython], a native extension generator, to facilitate building +SYCL-powered Python extensions. + +Examples of Python extensions written using Cython are located in the [cython](cython) folder. + +## Pybind11 + +Since [SYCL](sycl) is based on C++, [pybind11] is a natural tool of choice to author SYCL-powered +Python extensions. The `dpctl` provides `dpctl4pybind11.hpp` integration header to provide natural +mapping between SYCL C++ classes and `dpctl` Python types. + +Examples of Python extensions created with `pybind11` are located in the [pybind11](pybind11) folder. + +## C + +The `dpctl` implements `DPCTLSyclInterface` C library and C-API to work with Python objects and types +implemented in `dpctl`. Use integration headers `dpctl_sycl_interface.h` and `dpctl_capi.h` to access +this functionality. + +Examples of Python extensions created using C are located in [c](c) folder. + + +[platform]: https://intelpython.github.io/dpctl/latest/docfiles/user_guides/manual/dpctl/platforms.html +[device_selection]: https://intelpython.github.io/dpctl/latest/docfiles/user_guides/manual/dpctl/device_selection.html +[device_descriptors]: https://intelpython.github.io/dpctl/latest/docfiles/user_guides/manual/dpctl/devices.html#device-aspects-and-information-descriptors +[queues]: https://intelpython.github.io/dpctl/latest/docfiles/user_guides/manual/dpctl/queues.html +[array_api]: https://data-apis.org/array-api/ +[sycl]: https://registry.khronos.org/SYCL/ +[Cython]: https://cython.org/ +[pybind11]: https://pybind11.readthedocs.io diff --git a/examples/c/README.md b/examples/c/README.md new file mode 100644 index 0000000000..0deff25116 --- /dev/null +++ b/examples/c/README.md @@ -0,0 +1,8 @@ +# Examples C-based Python extensions using `dpctl` + +The `dpctl` implements `DPCTLSyclInterface` C library as well as provides C-API to work with Python objects +and types implemented in `dpctl`. Use integration headers `dpctl_sycl_interface.h` and `dpctl_capi.h` to access +this functionality. + +Use `python -m dpctl --includes` to get include compiler options and `python -m dpctl --library` to get linking options to link +to `SyclInterface` library. diff --git a/examples/c/py_sycl_ls/README.md b/examples/c/py_sycl_ls/README.md new file mode 100644 index 0000000000..f3429a534d --- /dev/null +++ b/examples/c/py_sycl_ls/README.md @@ -0,0 +1,19 @@ +# Python module to enumerate SYCL devices + +## Building + +```bash +python setup.py build_ext --inplace +``` + +## Testing + +``` +pytest -m tests +``` + +## Running + +``` +python -m py_sycl_ls +``` diff --git a/examples/c/py_sycl_ls/setup.py b/examples/c/py_sycl_ls/setup.py index fc9183aec8..21f7024c99 100644 --- a/examples/c/py_sycl_ls/setup.py +++ b/examples/c/py_sycl_ls/setup.py @@ -15,7 +15,6 @@ # limitations under the License. import os.path -import sysconfig from setuptools import Extension, setup @@ -41,7 +40,6 @@ ], include_dirs=[ dpctl.get_include(), - os.path.join(sysconfig.get_paths()["include"], ".."), ], library_dirs=[ os.path.join(dpctl.get_include(), ".."), diff --git a/examples/c/py_sycl_ls/src/py_sycl-ls.c b/examples/c/py_sycl_ls/src/py_sycl-ls.c index 64a331c92f..8e6bb855ea 100644 --- a/examples/c/py_sycl_ls/src/py_sycl-ls.c +++ b/examples/c/py_sycl_ls/src/py_sycl-ls.c @@ -27,9 +27,7 @@ // clang-format off #include "Python.h" #include "dpctl_capi.h" -#include "syclinterface/dpctl_sycl_platform_interface.h" -#include "syclinterface/dpctl_sycl_platform_manager.h" -#include "syclinterface/dpctl_utils.h" +#include "dpctl_sycl_interface.h" // clang-format on PyObject *sycl_ls(PyObject *self_unused, PyObject *args) diff --git a/examples/cython/README.md b/examples/cython/README.md new file mode 100644 index 0000000000..8d7490c116 --- /dev/null +++ b/examples/cython/README.md @@ -0,0 +1,9 @@ +# Examples of data-parallel Python extensions written in Cython + +The `dpctl` package provides Cython definition files for types it defines. + +Use `cimport dpctl as c_dpctl`, `cimport dpctl.memory as c_dpm`, or `cimport dpctl.tensor as c_dpt` +to use these definitions. + +Cython definition fille `dpctl.sycl` provides incomplete definitions of core SYCL runtime classes as +well as conversion routine between `SyclInterface` reference types and SYCL runtime classes. diff --git a/examples/cython/sycl_buffer/README.md b/examples/cython/sycl_buffer/README.md index 86ea0bc362..c66053b618 100644 --- a/examples/cython/sycl_buffer/README.md +++ b/examples/cython/sycl_buffer/README.md @@ -6,7 +6,8 @@ Cython function expecting a 2D array in a C-contiguous layout that computes column-wise total by using SYCL oneMKL (as GEMV call with an all-units vector). -The example illustrates compiling SYCL extension linking to oneMKL. +The example illustrates compiling SYCL extension that is linking to +oneMKL. ## Compiling @@ -15,66 +16,40 @@ The example illustrates compiling SYCL extension linking to oneMKL. To compile the example, run: ``` -CC=icx CXX=dpcpp python setup.py build_ext --inplace +python setup.py develop ``` ## Running ``` -# SYCL_DEVICE_FILTER=opencl sets SYCL backend to OpenCL to avoid a -# transient issue with MKL's using the default Level-0 backend -(idp) [08:16:12 ansatnuc04 simple]$ SYCL_DEVICE_FILTER=opencl ipython -Python 3.7.7 (default, Jul 14 2020, 22:02:37) +(dev_dpctl) opavlyk@opavlyk-mobl:~/repos/dpctl/examples/cython/sycl_buffer$ ipython +Python 3.9.12 (main, Jun 1 2022, 11:38:51) Type 'copyright', 'credits' or 'license' for more information -IPython 7.17.0 -- An enhanced Interactive Python. Type '?' for help. +IPython 8.4.0 -- An enhanced Interactive Python. Type '?' for help. In [1]: import syclbuffer as sb, numpy as np, dpctl -In [2]: x = np.random.randn(10**4, 2500) +In [2]: x = np.random.randn(10**6, 7).astype(np.float32) -In [3]: %time m1 = np.sum(x, axis=0) -CPU times: user 22.3 ms, sys: 160 µs, total: 22.5 ms -Wall time: 21.2 ms +In [3]: sb.columnwise_total(x) +Out[3]: +array([ -810.02496 , 42.692146, -786.71075 , -1417.643 , + -1096.2424 , 212.33067 , 18.40631 ], dtype=float32) -In [4]: %time m = sb.columnwise_total(x) # first time is slower, due to JIT overhead -CPU times: user 207 ms, sys: 36.1 ms, total: 243 ms -Wall time: 248 ms - -In [5]: %time m = sb.columnwise_total(x) -CPU times: user 8.89 ms, sys: 4.12 ms, total: 13 ms -Wall time: 12.4 ms - -In [6]: %time m = sb.columnwise_total(x) -CPU times: user 4.82 ms, sys: 8.06 ms, total: 12.9 ms -Wall time: 12.3 ms +In [4]: np.sum(x, axis=0) +Out[4]: +array([ -810.03296 , 42.68893 , -786.7023 , -1417.648 , + -1096.2699 , 212.32564 , 18.412518], dtype=float32) ``` ### Running bench.py: ``` -========== Executing warm-up ========== -NumPy result: [1. 1. 1. ... 1. 1. 1.] -SYCL(Intel(R) Core(TM) i7-10710U CPU @ 1.10GHz) result: [1. 1. 1. ... 1. 1. 1.] -SYCL(Intel(R) Gen9 HD Graphics NEO) result: [1. 1. 1. ... 1. 1. 1.] -Times for 'opencl:cpu:0' -[2.864787499012891, 2.690436460019555, 2.5902308400254697, 2.5802528870408423, 2.538990616973024] -Times for 'opencl:gpu:0' -[1.9769684099592268, 2.3491444009705447, 2.293720397981815, 2.391633405990433, 1.9465659779962152] -Times for NumPy -[3.4011058019823395, 3.07286038500024, 3.0390414349967614, 3.0305576199898496, 3.002687797998078] +$ python scripts/bench.py ``` -### Running run.py: +### Running tests: ``` -(idp) [09:14:53 ansatnuc04 sycl_buffer]$ SYCL_DEVICE_FILTER=opencl python run.py -Result computed by NumPy -[ 0.27170187 -23.36798583 7.31326489 -1.95121928] -Result computed by SYCL extension -[ 0.27170187 -23.36798583 7.31326489 -1.95121928] - -Running on: Intel(R) Gen9 HD Graphics NEO -[ 0.27170187 -23.36798583 7.31326489 -1.95121928] -Running on: Intel(R) Core(TM) i7-10710U CPU @ 1.10GHz -[ 0.27170187 -23.36798583 7.31326489 -1.95121928] +$ python -m pytest tests ``` diff --git a/examples/cython/sycl_buffer/_buffer_example.pyx b/examples/cython/sycl_buffer/_buffer_example.pyx deleted file mode 100644 index 5dfdcb3ff0..0000000000 --- a/examples/cython/sycl_buffer/_buffer_example.pyx +++ /dev/null @@ -1,64 +0,0 @@ -# Data Parallel Control (dpctl) -# -# Copyright 2020-2022 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# distutils: language = c++ -# cython: language_level=3 - -cimport numpy as cnp - -import numpy as np - -cimport dpctl as c_dpctl - -import dpctl - - -cdef extern from "use_sycl_buffer.h": - int c_columnwise_total( - c_dpctl.DPCTLSyclQueueRef q, size_t n, size_t m, double *m, double *ct - ) nogil - int c_columnwise_total_no_mkl( - c_dpctl.DPCTLSyclQueueRef q, size_t n, size_t m, double *m, double *ct - ) nogil - - -def columnwise_total(double[:, ::1] v, method='mkl', queue=None): - cdef cnp.ndarray res_array = np.empty((v.shape[1],), dtype='d') - cdef double[::1] res_memslice = res_array - cdef int ret_status - cdef c_dpctl.SyclQueue q - cdef c_dpctl.DPCTLSyclQueueRef q_ref - - if (queue is None): - q = c_dpctl.SyclQueue() - elif isinstance(queue, dpctl.SyclQueue): - q = queue - else: - q = c_dpctl.SyclQueue(queue) - q_ref = q.get_queue_ref() - - if method == 'mkl': - with nogil: - ret_status = c_columnwise_total( - q_ref, v.shape[0], v.shape[1], &v[0, 0], &res_memslice[0] - ) - else: - with nogil: - ret_status = c_columnwise_total_no_mkl( - q_ref, v.shape[0], v.shape[1], &v[0, 0], &res_memslice[0] - ) - - return res_array diff --git a/examples/cython/sycl_buffer/bench.py b/examples/cython/sycl_buffer/bench.py deleted file mode 100644 index 491816b9bb..0000000000 --- a/examples/cython/sycl_buffer/bench.py +++ /dev/null @@ -1,70 +0,0 @@ -# Data Parallel Control (dpctl) -# -# Copyright 2020-2022 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import timeit - -import numpy as np -import syclbuffer as sb - -import dpctl - -X = np.full((10**4, 4098), 1e-4, dtype="d") - -# warm-up -print("=" * 10 + " Executing warm-up " + "=" * 10) -print("NumPy result: ", X.sum(axis=0)) - -q = dpctl.SyclQueue("opencl:cpu") -print( - "SYCL({}) result: {}".format( - q.sycl_device.name, - sb.columnwise_total(X, queue=q), - ) -) - -q = dpctl.SyclQueue("opencl:gpu") -print( - "SYCL({}) result: {}".format( - q.sycl_device.name, - sb.columnwise_total(X, queue=q), - ) -) - - -print("Times for 'opencl:cpu'") -print( - timeit.repeat( - stmt="sb.columnwise_total(X, queue=q)", - setup='q = dpctl.SyclQueue("opencl:cpu"); ' - "sb.columnwise_total(X, queue=q)", # do not count JIT compilation - number=100, - globals=globals(), - ) -) - -print("Times for 'opencl:gpu'") -print( - timeit.repeat( - stmt="sb.columnwise_total(X, queue=q)", - setup='q = dpctl.SyclQueue("opencl:gpu"); ' - "sb.columnwise_total(X, queue=q)", - number=100, - globals=globals(), - ) -) - -print("Times for NumPy") -print(timeit.repeat(stmt="X.sum(axis=0)", number=100, globals=globals())) diff --git a/examples/cython/sycl_buffer/run.py b/examples/cython/sycl_buffer/run.py deleted file mode 100644 index 401edc2d75..0000000000 --- a/examples/cython/sycl_buffer/run.py +++ /dev/null @@ -1,53 +0,0 @@ -# Data Parallel Control (dpctl) -# -# Copyright 2020-2022 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import syclbuffer as sb - -import dpctl - -X = np.random.randn(100, 4) - -print("Result computed by NumPy") -print(X.sum(axis=0)) - -try: - res = sb.columnwise_total(X) - print("Result computed by SYCL extension using default offloading target") - print(res) -except dpctl.SyclQueueCreationError: - print( - "Could not create SyclQueue for default selected device. Nothing to do." - ) - exit(0) - -print("") - -# controlling where to offload - -try: - q = dpctl.SyclQueue("opencl:gpu") - print("Running on: ", q.sycl_device.name) - print(sb.columnwise_total(X, queue=q)) -except dpctl.SyclQueueCreationError: - print("Not running onf opencl:gpu, queue could not be created") - -try: - q = dpctl.SyclQueue("opencl:cpu") - print("Running on: ", q.sycl_device.name) - print(sb.columnwise_total(X, queue=q)) -except dpctl.SyclQueueCreationError: - print("Not running onf opencl:cpu, queue could not be created") diff --git a/examples/cython/sycl_buffer/scripts/bench.py b/examples/cython/sycl_buffer/scripts/bench.py new file mode 100644 index 0000000000..03168b8b81 --- /dev/null +++ b/examples/cython/sycl_buffer/scripts/bench.py @@ -0,0 +1,92 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import timeit + +import numpy as np +import syclbuffer as sb + +import dpctl + + +class Skipped: + def __init__(self, msg): + self.msg = msg + + def __str__(self): + return f"Skipped(reason='{self.msg}')" + + def __repr__(self): + return self.__str__() + + +def bench_offload(selector_string, X): + try: + q = dpctl.SyclQueue(selector_string) + except dpctl.SyclQueueCreationError: + return Skipped( + f"Skipping run for {selector_string}, queue could nor be created" + ) + return timeit.repeat( + stmt="sb.columnwise_total(X, queue=q)", + setup="q = dpctl.SyclQueue(selector_string); " + "sb.columnwise_total(X, queue=q)", # do not count JIT compilation + number=100, + globals={ + "q": q, + "X": X, + "dpctl": dpctl, + "sb": sb, + "selector_string": selector_string, + }, + ) + + +def run_offload(selector_string, X): + try: + q = dpctl.SyclQueue(selector_string) + except dpctl.SyclQueueCreationError: + return Skipped( + f"Skipping run for {selector_string}, queue could nor be created" + ) + return "SYCL({}) result: {}".format( + q.sycl_device.name, + sb.columnwise_total(X, queue=q), + ) + + +X = np.full((10**6, 15), 1e-4, dtype="f4") + +print(f"Matrix size: {X.shape}, dtype = {X.dtype}") + +# warm-up +print("=" * 10 + " Executing warm-up " + "=" * 10) +print("NumPy result: ", X.sum(axis=0)) + +for ss in ["opencl:cpu", "opencl:gpu", "level_zero:gpu"]: + print("Result for '" + ss + "': {}".format(run_offload(ss, X))) + +print("=" * 10 + " Running bechmarks " + "=" * 10) + +for ss in ["opencl:cpu", "opencl:gpu", "level_zero:gpu"]: + print("Timing offload to '" + ss + "': {}".format(bench_offload(ss, X))) + + +print( + "Times for NumPy: {}".format( + timeit.repeat(stmt="X.sum(axis=0)", number=100, globals=globals()) + ) +) diff --git a/examples/cython/sycl_buffer/setup.py b/examples/cython/sycl_buffer/setup.py index 2bee545617..54eb47df5d 100644 --- a/examples/cython/sycl_buffer/setup.py +++ b/examples/cython/sycl_buffer/setup.py @@ -1,6 +1,6 @@ # Data Parallel Control (dpctl) # -# Copyright 2020-2022 Intel Corporation +# Copyright 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,60 +14,60 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os.path -import sysconfig - -import numpy as np from setuptools import Extension, setup +from setuptools.command.build_ext import build_ext import dpctl + +class custom_build_ext(build_ext): + def build_extensions(self): + self.compiler.set_executable("compiler_so", "icpx -fsycl -fPIC") + self.compiler.set_executable("compiler_cxx", "icpx -fsycl -fPIC") + self.compiler.set_executable( + "linker_so", + "icpx -fsycl -shared -fpic -fsycl-device-code-split=per_kernel", + ) + build_ext.build_extensions(self) + + +ext_modules = [ + Extension( + name="syclbuffer._syclbuffer", + sources=[ + "syclbuffer/_buffer_example.pyx", + ], + depends=[ + "src/use_sycl_buffer.hpp", + ], + include_dirs=[ + ".", + "./src", + dpctl.get_include(), + ], + extra_compile_args=[ + "-Wall", + "-Wextra", + "-fsycl", + ], + extra_link_args=["-fPIC"], + language="c++", + ) +] + setup( name="syclbuffer", version="0.0.0", description="An example of Cython extension calling SYCL routines", long_description=""" Example of using SYCL to work on host allocated NumPy array using - SYCL buffers by calling oneMKL functions. + SYCL buffers and SYCL functions. See README.md for more details. """, license="Apache 2.0", author="Intel Corporation", url="https://github.com/IntelPython/dpctl", - ext_modules=[ - Extension( - name="syclbuffer", - sources=[ - "_buffer_example.pyx", - "use_sycl_buffer.cpp", - ], - include_dirs=[ - ".", - np.get_include(), - dpctl.get_include(), - os.path.join(sysconfig.get_paths()["include"], ".."), - ], - library_dirs=[ - os.path.join(sysconfig.get_paths()["stdlib"], ".."), - ], - libraries=["sycl"] - + [ - "mkl_sycl", - "mkl_intel_ilp64", - "mkl_tbb_thread", - "mkl_core", - "tbb", - ], - runtime_library_dirs=[], - extra_compile_args=[ - "-Wall", - "-Wextra", - "-fsycl", - "-fsycl-unnamed-lambda", - ], - extra_link_args=["-fPIC"], - language="c++", - ) - ], + ext_modules=ext_modules, + cmdclass={"build_ext": custom_build_ext}, ) diff --git a/examples/cython/sycl_buffer/src/use_sycl_buffer.hpp b/examples/cython/sycl_buffer/src/use_sycl_buffer.hpp new file mode 100644 index 0000000000..7f7ad1bd95 --- /dev/null +++ b/examples/cython/sycl_buffer/src/use_sycl_buffer.hpp @@ -0,0 +1,87 @@ +//=- use_sycl_buffer.cpp - Example of SYCL code to be called from Cython =// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements SYCL code to compute columnwise total of a matrix, +/// provided as host C-contiguous allocation. SYCL kernels access this memory +/// using `sycl::buffer`. Two routines are provided. One solves the task by +/// calling BLAS function GEMV from Intel(R) Math Kernel Library, the other +/// performs the computation using DPC++ reduction group function and atomics. +/// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include + +inline size_t upper_multiple(size_t n, size_t wg) +{ + return wg * ((n + wg - 1) / wg); +} + +template +void columnwise_total(sycl::queue q, + size_t n, + size_t m, + const dataT *mat, + dataT *ct) +{ + sycl::buffer mat_buffer = sycl::buffer(mat, sycl::range<2>(n, m)); + sycl::buffer ct_buffer = sycl::buffer(ct, sycl::range<1>(m)); + + q.submit([&](sycl::handler &h) { + sycl::accessor ct_acc{ + ct_buffer, h, sycl::write_only, {sycl::property::no_init{}}}; + h.parallel_for(sycl::range<1>(m), + [=](sycl::id<1> i) { ct_acc[i] = dataT(0); }); + }); + + const sycl::device &d = q.get_device(); + const auto &sg_sizes = d.get_info(); + size_t wg = + 2 * (*std::max_element(std::begin(sg_sizes), std::end(sg_sizes))); + + q.submit([&](sycl::handler &h) { + sycl::accessor mat_acc{mat_buffer, h, sycl::read_only}; + sycl::accessor ct_acc{ct_buffer, h}; + + sycl::range<2> global{upper_multiple(n, wg), m}; + sycl::range<2> local{wg, 1}; + + h.parallel_for( + sycl::nd_range<2>(global, local), [=](sycl::nd_item<2> it) { + size_t i = it.get_global_id(0); + dataT group_sum = sycl::reduce_over_group( + it.get_group(), + (i < n) ? mat_acc[it.get_global_id()] : dataT(0), + std::plus()); + if (it.get_group().leader()) { + size_t j = it.get_global_id(1); + sycl::atomic_ref( + ct_acc[j]) += group_sum; + } + }); + }); + + return; +} diff --git a/examples/cython/sycl_buffer/syclbuffer/__init__.py b/examples/cython/sycl_buffer/syclbuffer/__init__.py new file mode 100644 index 0000000000..6ae708ed28 --- /dev/null +++ b/examples/cython/sycl_buffer/syclbuffer/__init__.py @@ -0,0 +1,28 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ._syclbuffer import columnwise_total + +__doc__ = """ +This is a toy example module illustrating use of SYCL-based code +to operate on NumPy arrays addressing memory allocated by standard +Python memory allocator. +""" +__license__ = "Apache 2.0" + +__all__ = [ + "columnwise_total", +] diff --git a/examples/cython/sycl_buffer/syclbuffer/_buffer_example.pyx b/examples/cython/sycl_buffer/syclbuffer/_buffer_example.pyx new file mode 100644 index 0000000000..e3f9267acd --- /dev/null +++ b/examples/cython/sycl_buffer/syclbuffer/_buffer_example.pyx @@ -0,0 +1,91 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# distutils: language = c++ +# cython: language_level=3 + +cimport cython + +cimport dpctl as c_dpctl +from dpctl.sycl cimport queue as dpcpp_queue +from dpctl.sycl cimport unwrap_queue + +import numpy as np + +import dpctl + + +cdef extern from "use_sycl_buffer.hpp": + void native_columnwise_total "columnwise_total"[T]( + dpcpp_queue, # execution queue + size_t, # number of rows of the input matrix + size_t, # number of columns of the input matrix + const T *, # data pointer of the input matrix + T * # pointer for the resulting vector + ) nogil except+ + + +def columnwise_total(cython.floating[:, ::1] mat, queue=None): + """ columntiwse_total(mat, queue=None) + + Returns column-wise total of the input matrix. + + Args: + mat: ndarray + C-contiguous non-empty matrix of single- or double-precision + floating point type. + queue: dpctl.SyclQueue or None + Execution queue targeting a SYCL device for offload. Default + value of `None` means use default-constructed `dpctl.SyclQueue` + that targets default-selected device. + + Note: + It is advantageous to create `dpctl.SyclQueue` and reuse it as queue + construction may be expensive. + """ + cdef cython.floating[:] res_memslice + cdef c_dpctl.SyclQueue q + cdef dpcpp_queue* exec_queue_ptr = NULL + cdef size_t n_cols + cdef size_t n_rows + + n_rows = mat.shape[0] + n_cols = mat.shape[1] + + if cython.floating is float: + res_memslice = np.empty(n_cols, dtype=np.single) + elif cython.floating is double: + res_memslice = np.empty(n_cols, dtype=np.double) + else: + raise TypeError( + "Use single or double precision floating point types are supported" + ) + + if (queue is None): + # use default-constructed queue + q = c_dpctl.SyclQueue() + elif isinstance(queue, dpctl.SyclQueue): + q = queue + else: + q = c_dpctl.SyclQueue(queue) + exec_queue_ptr = unwrap_queue(q.get_queue_ref()) + + with nogil: + native_columnwise_total( + exec_queue_ptr[0], n_rows, n_cols, &mat[0,0], &res_memslice[0] + ) + + return np.asarray(res_memslice) diff --git a/examples/cython/sycl_buffer/tests/test_example.py b/examples/cython/sycl_buffer/tests/test_example.py new file mode 100644 index 0000000000..fcb0126702 --- /dev/null +++ b/examples/cython/sycl_buffer/tests/test_example.py @@ -0,0 +1,24 @@ +import numpy as np +import pytest +from syclbuffer import columnwise_total + +import dpctl + + +def test_columnwise_total(): + x = np.array([[2, 3], [3, 4], [5, 6], [7, 8], [9, 10]], dtype=np.float32) + + try: + q = dpctl.SyclQueue() + except dpctl.SyclQueueCreationError: + pytest.skip("Could not create default-constructed queue") + + ref = x.sum(axis=0) + res1 = columnwise_total(x) + assert res1.shape == (2,) + + res2 = columnwise_total(x, queue=q) + assert res2.shape == (2,) + + assert np.allclose(res1, ref) + assert np.allclose(res2, ref) diff --git a/examples/cython/sycl_buffer/use_sycl_buffer.cpp b/examples/cython/sycl_buffer/use_sycl_buffer.cpp deleted file mode 100644 index 1d48a1a17b..0000000000 --- a/examples/cython/sycl_buffer/use_sycl_buffer.cpp +++ /dev/null @@ -1,138 +0,0 @@ -//=- use_sycl_buffer.cpp - Example of SYCL code to be called from Cython =// -// -// Data Parallel Control (dpctl) -// -// Copyright 2020-2022 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This file implements SYCL code to compute columnwise total of a matrix, -/// provided as host C-contiguous allocation. SYCL kernels access this memory -/// using `sycl::buffer`. Two routines are provided. One solves the task by -/// calling BLAS function GEMV from Intel(R) Math Kernel Library, the other -/// performs the computation using DPC++ reduction group function and atomics. -/// -//===----------------------------------------------------------------------===// - -#include "use_sycl_buffer.h" -#include "dpctl_sycl_interface.h" -#include -#include - -int c_columnwise_total(DPCTLSyclQueueRef q_ref, - size_t n, - size_t m, - double *mat, - double *ct) -{ - - sycl::queue q = *(reinterpret_cast(q_ref)); - - sycl::buffer mat_buffer = - sycl::buffer(mat, sycl::range<1>(n * m)); - sycl::buffer ct_buffer = sycl::buffer(ct, sycl::range<1>(m)); - - double *ones = reinterpret_cast(malloc(n * sizeof(double))); - { - sycl::buffer ones_buffer = - sycl::buffer(ones, sycl::range<1>(n)); - - try { - auto ev = q.submit([&](sycl::handler &cgh) { - auto ones_acc = - ones_buffer.get_access(cgh); - cgh.fill(ones_acc, double(1.0)); - }); - - ev.wait_and_throw(); - } catch (sycl::exception const &e) { - std::cout << "\t\tCaught synchronous SYCL exception during fill:\n" - << e.what() << std::endl; - goto cleanup; - } - - try { - oneapi::mkl::blas::row_major::gemv( - q, oneapi::mkl::transpose::trans, n, m, double(1.0), mat_buffer, - m, ones_buffer, 1, double(0.0), ct_buffer, 1); - q.wait(); - } catch (sycl::exception const &e) { - std::cout << "\t\tCaught synchronous SYCL exception during GEMV:\n" - << e.what() << std::endl; - goto cleanup; - } - } - - free(ones); - return 0; - -cleanup: - free(ones); - return -1; -} - -inline size_t upper_multiple(size_t n, size_t wg) -{ - return wg * ((n + wg - 1) / wg); -} - -int c_columnwise_total_no_mkl(DPCTLSyclQueueRef q_ref, - size_t n, - size_t m, - double *mat, - double *ct) -{ - - sycl::queue q = *(reinterpret_cast(q_ref)); - - sycl::buffer mat_buffer = - sycl::buffer(mat, sycl::range<2>(n, m)); - sycl::buffer ct_buffer = sycl::buffer(ct, sycl::range<1>(m)); - - auto e = q.submit([&](sycl::handler &h) { - sycl::accessor ct_acc{ct_buffer, h, sycl::write_only}; - h.parallel_for(sycl::range<1>(m), - [=](sycl::id<1> i) { ct_acc[i] = 0.0; }); - }); - - constexpr size_t wg = 256; - auto e2 = q.submit([&](sycl::handler &h) { - sycl::accessor mat_acc{mat_buffer, h, sycl::read_only}; - sycl::accessor ct_acc{ct_buffer, h}; - h.depends_on(e); - - sycl::range<2> global{upper_multiple(n, wg), m}; - sycl::range<2> local{wg, 1}; - - h.parallel_for( - sycl::nd_range<2>(global, local), [=](sycl::nd_item<2> it) { - size_t i = it.get_global_id(0); - size_t j = it.get_global_id(1); - double group_sum = sycl::reduce_over_group( - it.get_group(), (i < n) ? mat_acc[it.get_global_id()] : 0.0, - std::plus()); - if (it.get_local_id(0) == 0) { - sycl::atomic_ref( - ct_acc[j]) += group_sum; - } - }); - }); - - e2.wait_and_throw(); - return 0; -} diff --git a/examples/cython/sycl_buffer/use_sycl_buffer.h b/examples/cython/sycl_buffer/use_sycl_buffer.h deleted file mode 100644 index 5053321046..0000000000 --- a/examples/cython/sycl_buffer/use_sycl_buffer.h +++ /dev/null @@ -1,31 +0,0 @@ -// Data Parallel Control (dpctl) -// -// Copyright 2020-2022 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -//===----------------------------------------------------------------------===// - -#include "dpctl_sycl_interface.h" -#include - -extern int c_columnwise_total(DPCTLSyclQueueRef q, - size_t n, - size_t m, - double *mat, - double *ct); -extern int c_columnwise_total_no_mkl(DPCTLSyclQueueRef q, - size_t n, - size_t m, - double *mat, - double *ct); diff --git a/examples/cython/sycl_direct_linkage/.gitignore b/examples/cython/sycl_direct_linkage/.gitignore deleted file mode 100644 index 201efb635e..0000000000 --- a/examples/cython/sycl_direct_linkage/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -_buffer_example.cpp -*.cpython*.so -*~ diff --git a/examples/cython/sycl_direct_linkage/README.md b/examples/cython/sycl_direct_linkage/README.md deleted file mode 100644 index 0d87ac8bf4..0000000000 --- a/examples/cython/sycl_direct_linkage/README.md +++ /dev/null @@ -1,56 +0,0 @@ -# Example of sycl_direct_linkage Usage - -This Cython extension does not directly use dpctl and links to SYCL. -It exposes the `columnwise_total` function that uses oneMKL to compute -totals for each column of its argument matrix in double precision -expected as an ordinary NumPy array in a C-contiguous layout. - -This function performs the following steps: - - 1. Creates a SYCL queue using the default device selector - 2. Creates SYCL buffer around the matrix data - 3. Creates a vector `v_ones` with all elements being ones - and allocates memory for the result. - 4. Calls oneMKL to compute xGEMV as dot(v_ones, M) - 5. Returns the result as NumPy array - -This extension does not allow to control the device or queue, to -which execution of kernel is being scheduled. - -A related example "sycl_buffer" modifies this example in that it uses -`dpctl` to retrieve the current queue allowing a user to control the queue -and avoid the overhead of queue creation. - -To illustrate the queue creation overhead in each call, compare the execution of the default queue, -which is Intel(R) Gen9 GPU on an OpenCL backend: - -``` -(idp) [11:24:38 ansatnuc04 sycl_direct_linkage]$ SYCL_DEVICE_FILTER=opencl:gpu python bench.py -========== Executing warm-up ========== -NumPy result: [1. 1. 1. ... 1. 1. 1.] -SYCL(default_device) result: [1. 1. 1. ... 1. 1. 1.] -Running time of 100 calls to columnwise_total on matrix with shape (10000, 4098) -Times for default_selector, inclusive of queue creation: -[19.384219504892826, 19.49932464491576, 19.613155928440392, 19.64031868893653, 19.752969074994326] -Times for NumPy -[3.5394036192446947, 3.498957809060812, 3.4925728561356664, 3.5036555202677846, 3.493739523924887] -``` - -to the timing when the `dpctl` queue is being reused: - -``` -(idp) [11:29:14 ansatnuc04 sycl_buffer]$ python bench.py -========== Executing warm-up ========== -NumPy result: [1. 1. 1. ... 1. 1. 1.] -SYCL(Intel(R) Core(TM) i7-10710U CPU @ 1.10GHz) result: [1. 1. 1. ... 1. 1. 1.] -SYCL(Intel(R) Graphics Gen9 [0x9bca]) result: [1. 1. 1. ... 1. 1. 1.] -Times for 'opencl:cpu:0' -[2.9164800881408155, 2.8714500251226127, 2.9770236839540303, 2.913622073829174, 2.7949972581118345] -Times for 'opencl:gpu:0' -[9.529508924111724, 10.288004886358976, 10.189113245811313, 10.197128206957132, 10.26169267296791] -Times for NumPy -[3.4809365631081164, 3.42917942116037, 3.42471009073779, 3.3689011191017926, 3.4336009239777923] -``` - -The overhead of the ``sycl::queue`` creation per call is approximately comparable with the time of -the actual computation execution. diff --git a/examples/cython/sycl_direct_linkage/_buffer_example.pyx b/examples/cython/sycl_direct_linkage/_buffer_example.pyx deleted file mode 100644 index 627f388f6d..0000000000 --- a/examples/cython/sycl_direct_linkage/_buffer_example.pyx +++ /dev/null @@ -1,53 +0,0 @@ -# Data Parallel Control (dpctl) -# -# Copyright 2020-2022 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# distutils: language = c++ -# cython: language_level=3 - -cimport numpy as cnp - -import numpy as np - -from cython.operator cimport dereference as deref - - -cdef extern from "CL/sycl.hpp" namespace "sycl": - cdef cppclass queue nogil: - pass - - -cdef extern from "sycl_function.hpp": - int c_columnwise_total( - queue q, size_t n, size_t m, double *m, double *ct - ) nogil - - -def columnwise_total(double[:, ::1] v): - cdef cnp.ndarray res_array = np.empty((v.shape[1],), dtype='d') - cdef double[::1] res_memslice = res_array - cdef int ret_status - cdef queue* q - - q = new queue() - - with nogil: - ret_status = c_columnwise_total( - deref(q), v.shape[0], v.shape[1], &v[0, 0], &res_memslice[0] - ) - - del q - - return res_array diff --git a/examples/cython/sycl_direct_linkage/bench.py b/examples/cython/sycl_direct_linkage/bench.py deleted file mode 100644 index 4955bb2f0c..0000000000 --- a/examples/cython/sycl_direct_linkage/bench.py +++ /dev/null @@ -1,46 +0,0 @@ -# Data Parallel Control (dpctl) -# -# Copyright 2020-2022 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import timeit - -import numpy as np -import syclbuffer_naive as sb - -X = np.full((10**4, 4098), 1e-4, dtype="d") - -# warm-up -print("=" * 10 + " Executing warm-up " + "=" * 10) -print("NumPy result: ", X.sum(axis=0)) -print( - "SYCL(default_device) result: {}".format( - sb.columnwise_total(X), - ) -) -print( - "Running time of 100 calls to columnwise_total on matrix with " - "shape {}".format(X.shape) -) -print("Times for default_selector, inclusive of queue creation:") -print( - timeit.repeat( - stmt="sb.columnwise_total(X)", - setup="sb.columnwise_total(X)", # ensure JIT compilation is not counted - number=100, - globals=globals(), - ) -) -print("Times for NumPy") -print(timeit.repeat(stmt="X.sum(axis=0)", number=100, globals=globals())) diff --git a/examples/cython/sycl_direct_linkage/setup.py b/examples/cython/sycl_direct_linkage/setup.py deleted file mode 100644 index 44a277eee8..0000000000 --- a/examples/cython/sycl_direct_linkage/setup.py +++ /dev/null @@ -1,77 +0,0 @@ -# Data Parallel Control (dpctl) -# -# Copyright 2020-2022 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os.path -import sysconfig - -import numpy as np -from setuptools import Extension, setup - -import dpctl - -setup( - name="syclbuffer", - version="0.0.0", - description="An example of Cython extension calling SYCL routines", - long_description=""" - Example of using SYCL to work on host allocated NumPy array using - SYCL buffers by calling oneMKL functions. - - This extension create SYCL queue in the scope of the function call - incurring large performance overhead. See `sycl_buffer/` example, - where user-constructed `dpctl.SyclQueue` can be given by the user. - - See README.md for more details. - """, - license="Apache 2.0", - author="Intel Corporation", - url="https://github.com/IntelPython/dpctl", - ext_modules=[ - Extension( - name="syclbuffer_naive", - sources=[ - "_buffer_example.pyx", - "sycl_function.cpp", - ], - include_dirs=[ - ".", - np.get_include(), - dpctl.get_include(), - os.path.join(sysconfig.get_paths()["include"], ".."), - ], - library_dirs=[ - os.path.join(sysconfig.get_paths()["stdlib"], ".."), - ], - libraries=["sycl"] - + [ - "mkl_sycl", - "mkl_intel_ilp64", - "mkl_tbb_thread", - "mkl_core", - "tbb", - ], - runtime_library_dirs=[], - extra_compile_args=[ - "-Wall", - "-Wextra", - "-fsycl", - "-fsycl-unnamed-lambda", - ], - extra_link_args=["-fPIC"], - language="c++", - ) - ], -) diff --git a/examples/cython/sycl_direct_linkage/sycl_function.cpp b/examples/cython/sycl_direct_linkage/sycl_function.cpp deleted file mode 100644 index b8db50bdff..0000000000 --- a/examples/cython/sycl_direct_linkage/sycl_function.cpp +++ /dev/null @@ -1,81 +0,0 @@ -//=- use_sycl_buffer.cpp - Example of SYCL code to be called from Cython =// -// -// Data Parallel Control (dpctl) -// -// Copyright 2020-2022 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This file implements SYCL code to compute columnwise total of a matrix, -/// provided as host C-contiguous allocation. SYCL kernels access this memory -/// using `sycl::buffer`. The routine solves the task by calling BLAS function -// GEMV from Intel(R) Math Kernel Library. -/// -//===----------------------------------------------------------------------===// - -#include "sycl_function.hpp" -#include "mkl.h" -#include -#include - -int c_columnwise_total(sycl::queue q, - size_t n, - size_t m, - double *mat, - double *ct) -{ - sycl::buffer mat_buffer = - sycl::buffer(mat, sycl::range<1>(n * m)); - sycl::buffer ct_buffer = sycl::buffer(ct, sycl::range<1>(m)); - - double *ones = reinterpret_cast(malloc(n * sizeof(double))); - { - sycl::buffer ones_buffer = - sycl::buffer(ones, sycl::range<1>(n)); - - try { - auto ev = q.submit([&](sycl::handler &cgh) { - auto ones_acc = - ones_buffer.get_access(cgh); - cgh.fill(ones_acc, double(1.0)); - }); - - ev.wait_and_throw(); - } catch (sycl::exception const &e) { - std::cout << "\t\tCaught synchronous SYCL exception during fill:\n" - << e.what() << std::endl; - goto cleanup; - } - - try { - oneapi::mkl::blas::row_major::gemv( - q, oneapi::mkl::transpose::trans, n, m, double(1.0), mat_buffer, - m, ones_buffer, 1, double(0.0), ct_buffer, 1); - q.wait(); - } catch (sycl::exception const &e) { - std::cout << "\t\tCaught synchronous SYCL exception during GEMV:\n" - << e.what() << std::endl; - goto cleanup; - } - } - - free(ones); - return 0; - -cleanup: - free(ones); - return -1; -} diff --git a/examples/cython/sycl_direct_linkage/sycl_function.hpp b/examples/cython/sycl_direct_linkage/sycl_function.hpp deleted file mode 100644 index 9e7dc56f6d..0000000000 --- a/examples/cython/sycl_direct_linkage/sycl_function.hpp +++ /dev/null @@ -1,24 +0,0 @@ -// Data Parallel Control (dpctl) -// -// Copyright 2020-2022 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -#include - -int c_columnwise_total(sycl::queue, - size_t n, - size_t m, - double *mat, - double *ct); diff --git a/examples/cython/usm_memory/README.md b/examples/cython/usm_memory/README.md index df952f3530..bf359ffc2b 100644 --- a/examples/cython/usm_memory/README.md +++ b/examples/cython/usm_memory/README.md @@ -1,27 +1,44 @@ # Working with USM Memory +This example demonstrates building of an extension that works with +`dpctl.tensor.usm_ndarray` container. + +It implements two Python functions: `blackscholes.populate_params` and +`blackscholes.black_scholes_price`. The first one uses MKL's device RNG +implementation to populate option parameters from uniform distribution +in user-specified ranges, and the other one takes the array with option +parameters and produces array with call and put European vanilla option +prices. + ## Building > **NOTE:** Make sure oneAPI is activated, $ONEAPI_ROOT must be set. To build the example, run: ``` -$ CC=icx CXX=dpcpp LD_SHARED="dpcpp -shared" \ - CXXFLAGS=-fno-sycl-early-optimizations python setup.py build_ext --inplace +$ python setup.py build_ext --inplace ``` -## Running +## Testing ``` -$ python run.py +$ pytest tests/ +``` + +## Running benchmark + +``` +$ python scripts/bench.py ``` It gives the example output: ``` -True -Using : Intel(R) Core(TM) i7-10710U CPU @ 1.10GHz -Elapsed: 0.9255791641771793 -Using : Intel(R) Gen9 -Elapsed: 0.32811625860631466 +(dev_dpctl) opavlyk@opavlyk-mobl:~/repos/dpctl/examples/cython/usm_memory$ python scripts/bench.py +Pricing 30,000,000 vanilla European options using Black-Scholes-Merton formula + +Using : 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz +Wall times : [0.07042762002674863, 0.047108696977375075, 0.04325491201598197, 0.045397296984447166, 0.0433025429956615] for dtype=float32 +Using : Intel(R) Graphics [0x9a49] +Wall times : [0.1194021370029077, 0.0720841379952617, 0.0647223969863262, 0.06645121600013226, 0.06911522900918499] for dtype=float32 ``` diff --git a/examples/cython/usm_memory/blackscholes.pyx b/examples/cython/usm_memory/blackscholes.pyx deleted file mode 100644 index 1d76dd78bf..0000000000 --- a/examples/cython/usm_memory/blackscholes.pyx +++ /dev/null @@ -1,150 +0,0 @@ -# Data Parallel Control (dpctl) -# -# Copyright 2020-2022 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# cython: language_level=3 -# distutils: language=c++ - -cimport numpy as cnp -from cython cimport floating - -cimport dpctl as c_dpctl -cimport dpctl.memory as c_dpctl_mem - -import numpy as np - -import dpctl - - -cdef extern from "sycl_blackscholes.hpp": - cdef void cpp_blackscholes[T]( - c_dpctl.DPCTLSyclQueueRef, size_t n_opts, T* option_params, T* callput - ) except + - cdef void cpp_populate_params[T]( - c_dpctl.DPCTLSyclQueueRef, - size_t n_opts, - T* option_params, - T pl, - T ph, - T sl, - T sh, - T tl, - T th, - T rl, - T rh, - T vl, - T vh, - int seed - ) except + - -cdef c_dpctl.SyclQueue from_queue_keyword(queue): - if (queue is None): - return c_dpctl.SyclQueue() - elif isinstance(queue, dpctl.SyclQueue): - return queue - else: - return c_dpctl.SyclQueue(queue) - # use default - return c_dpctl.SyclQueue() - - -def black_scholes_price(floating[:, ::1] option_params, queue=None): - cdef size_t n_opts = option_params.shape[0] - cdef size_t n_params = option_params.shape[1] - cdef size_t n_bytes = 0 - cdef c_dpctl.SyclQueue q - cdef c_dpctl.DPCTLSyclQueueRef q_ptr - cdef c_dpctl_mem.MemoryUSMShared mobj - cdef floating[:, :] call_put_prices - cdef cnp.ndarray callput_arr - cdef double* dp1 - cdef double* dp2 - cdef float* fp1 - cdef float* fp2 - - if (n_params != 5): - raise ValueError(( - "Array of option parameters has unexpected number of " - "columns {} != 5. Each row must specify (current_price, " - "strike_price, maturity, interest_rate, volatility)." - ).format(n_params) - ) - - q = from_queue_keyword(queue) - q_ptr = q.get_queue_ref() - if (floating is double): - n_bytes = 2*n_opts * sizeof(double) - mobj = c_dpctl_mem.MemoryUSMShared(n_bytes, queue=q) - callput_arr = np.ndarray((n_opts, 2), buffer=mobj, dtype='d') - call_put_prices = callput_arr - dp1 = &option_params[0, 0] - dp2 = &call_put_prices[0, 0] - cpp_blackscholes[double](q_ptr, n_opts, dp1, dp2) - elif (floating is float): - n_bytes = 2*n_opts * sizeof(float) - mobj = c_dpctl_mem.MemoryUSMShared(n_bytes, queue=q) - callput_arr = np.ndarray((n_opts, 2), buffer=mobj, dtype='f') - call_put_prices = callput_arr - fp1 = &option_params[0, 0] - fp2 = &call_put_prices[0, 0] - cpp_blackscholes[float](q_ptr, n_opts, fp1, fp2) - - return callput_arr - - -def populate_params( - floating[:, ::1] option_params, - pl, - ph, - sl, - sh, - tl, - th, - rl, - rh, - vl, - vh, - int seed, - queue=None -): - cdef size_t n_opts = option_params.shape[0] - cdef size_t n_params = option_params.shape[1] - - cdef c_dpctl.SyclQueue q - cdef c_dpctl.DPCTLSyclQueueRef q_ptr - cdef double* dp = NULL - cdef float* fp = NULL - - if (n_params != 5): - raise ValueError( - "Array of option parameters has unexpected number of " - "columns {} != 5. Each row must specify (current_price, " - "strike_price, maturity, interest_rate, volatility).".format( - n_params - ) - ) - - q = from_queue_keyword(queue) - q_ptr = q.get_queue_ref() - if (floating is double): - dp = &option_params[0, 0] - cpp_populate_params[double]( - q_ptr, n_opts, dp, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, seed - ) - elif (floating is float): - fp = &option_params[0, 0] - cpp_populate_params[float]( - q_ptr, n_opts, fp, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, seed - ) diff --git a/examples/cython/usm_memory/blackscholes/__init__.py b/examples/cython/usm_memory/blackscholes/__init__.py new file mode 100644 index 0000000000..b8c85dd09f --- /dev/null +++ b/examples/cython/usm_memory/blackscholes/__init__.py @@ -0,0 +1,29 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ._blackscholes_usm import black_scholes_price, populate_params + +__doc__ = """ +This is a toy example module illustrating use of SYCL-based code +to operate on NumPy arrays addressing memory allocated by standard +Python memory allocator. +""" +__license__ = "Apache 2.0" + +__all__ = [ + "black_scholes_price", + "populate_params", +] diff --git a/examples/cython/usm_memory/blackscholes/blackscholes.pyx b/examples/cython/usm_memory/blackscholes/blackscholes.pyx new file mode 100644 index 0000000000..73bffb0be9 --- /dev/null +++ b/examples/cython/usm_memory/blackscholes/blackscholes.pyx @@ -0,0 +1,208 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2022 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# cython: language_level=3 +# distutils: language=c++ + +cimport numpy as cnp +from cython cimport floating + +cimport dpctl as c_dpctl +cimport dpctl.tensor as c_dpt +from dpctl.sycl cimport queue as dpcpp_queue +from dpctl.sycl cimport unwrap_queue + +import numpy as np + +import dpctl +import dpctl.tensor as dpt + + +cdef extern from "sycl_blackscholes.hpp": + cdef void cpp_blackscholes[T]( + dpcpp_queue, size_t n_opts, T* option_params, T* callput + ) except + + cdef void cpp_populate_params[T]( + dpcpp_queue, + size_t n_opts, + T* option_params, + T pl, + T ph, + T sl, + T sh, + T tl, + T th, + T rl, + T rh, + T vl, + T vh, + int seed + ) except + + + +def black_scholes_price(c_dpt.usm_ndarray option_params_arr): + """black_scholes_price(params) + + Applies Black-Scholes-Merton formula to compute call and put European option prices. + + Args: + option_params_arr: usm_ndarray + Floating point array with shape (n_opts, 5) containing + (price, strike, maturity, rate, volatility) per each option. + Returns: + usm_ndarray + Floating point array with shape (n_opts, 2) containing (call_price, put_price) + per each option. + """ + cdef size_t n_opts = 0 + cdef size_t n_params = 0 + cdef size_t n_bytes = 0 + cdef c_dpctl.SyclQueue q + cdef dpcpp_queue* exec_q_ptr = NULL + cdef c_dpt.usm_ndarray call_put_prices + cdef double* dp1 = NULL + cdef double* dp2 = NULL + cdef float* fp1 = NULL + cdef float* fp2 = NULL + cdef int flags_ = 0 + cdef int typenum_ = 0 + + if option_params_arr.get_ndim() != 2: + raise ValueError("Option parameter array must be 2-dimensional") + + n_opts = option_params_arr.get_shape()[0] + n_params = option_params_arr.get_shape()[1] + + if (n_params != 5): + raise ValueError(( + "Array of option parameters has unexpected number of " + "columns {} != 5. Each row must specify (current_price, " + "strike_price, maturity, interest_rate, volatility)." + ).format(n_params) + ) + + flags_ = option_params_arr.get_flags() + if (not (flags_ & c_dpt.USM_ARRAY_C_CONTIGUOUS)): + raise ValueError("Only C-contiguous arrays are supported") + + q = option_params_arr.get_sycl_queue() + exec_q_ptr = unwrap_queue(q.get_queue_ref()) + typenum_ = option_params_arr.get_typenum() + + if (typenum_ == c_dpt.UAR_DOUBLE): + call_put_prices = dpt.empty((n_opts, 2), dtype='d', sycl_queue=q) + dp1 = option_params_arr.get_data() + dp2 = call_put_prices.get_data() + cpp_blackscholes[double](exec_q_ptr[0], n_opts, dp1, dp2) + elif (typenum_ == c_dpt.UAR_FLOAT): + call_put_prices = dpt.empty((n_opts, 2), dtype='f', sycl_queue=q) + fp1 = option_params_arr.get_data() + fp2 = call_put_prices.get_data() + cpp_blackscholes[float](exec_q_ptr[0], n_opts, fp1, fp2) + else: + raise ValueError("Unsupported data-type") + + return call_put_prices + + +def populate_params( + c_dpt.usm_ndarray option_params_arr, + pl, + ph, + sl, + sh, + tl, + th, + rl, + rh, + vl, + vh, + int seed +): + """ populate_params(params, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, seed) + + Args: + params: usm_narray + Array of shape (n_opts, 5) to populate with price, strike, time to + maturity, interest rate, volatility rate per option using uniform + distribution with provided distribution parameters. + pl: float + Lower bound for distribution of option price parameter + ph: float + Upper bound for distribution of option price parameter + sl: float + Lower bound for distribution of option strike parameter + sh: float + Upper bound for distribution of option strike parameter + tl: float + Lower bound for distribution of option time to maturity parameter + th: float + Upper bound for distribution of option time to maturity parameter + rl: float + Lower bound for distribution of option interest rate parameter + rh: float + Upper bound for distribution of option interest rate parameter + vl: float + Lower bound for distribution of option volatility parameter + vh: float + Upper bound for distribution of option volatility parameter + seed: int + Pseudo-random number generator parameter + """ + cdef size_t n_opts = 0 + cdef size_t n_params = 0 + cdef c_dpctl.SyclQueue sycl_queue + cdef dpcpp_queue* exec_q_ptr = NULL + cdef double* dp = NULL + cdef float* fp = NULL + cdef int typenum_ = 0 + cdef int flags_ = 0 + + if option_params_arr.get_ndim() != 2: + raise ValueError("Option parameter array must be 2-dimensional") + + n_opts = option_params_arr.get_shape()[0] + n_params = option_params_arr.get_shape()[1] + + if (n_params != 5): + raise ValueError( + "Array of option parameters has unexpected number of " + "columns {} != 5. Each row must specify (current_price, " + "strike_price, maturity, interest_rate, volatility).".format( + n_params + ) + ) + + flags_ = option_params_arr.get_flags() + if (not (flags_ & c_dpt.USM_ARRAY_C_CONTIGUOUS)): + raise ValueError("Only C-contiguous arrays are supported") + + exec_q_ptr = unwrap_queue(option_params_arr.get_queue_ref()) + + typenum_ = option_params_arr.get_typenum() + + if (typenum_ == c_dpt.UAR_DOUBLE): + dp = option_params_arr.get_data() + cpp_populate_params[double]( + exec_q_ptr[0], n_opts, dp, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, seed + ) + elif (typenum_ == c_dpt.UAR_FLOAT): + fp = option_params_arr.get_data() + cpp_populate_params[float]( + exec_q_ptr[0], n_opts, fp, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, seed + ) + else: + raise ValueError("Unsupported data-type") diff --git a/examples/cython/usm_memory/run.py b/examples/cython/usm_memory/scripts/bench.py similarity index 60% rename from examples/cython/usm_memory/run.py rename to examples/cython/usm_memory/scripts/bench.py index dcf2f05a72..1b9c917984 100644 --- a/examples/cython/usm_memory/run.py +++ b/examples/cython/usm_memory/scripts/bench.py @@ -18,50 +18,22 @@ import timeit -import blackscholes_usm as bs -import numpy as np -from reference_black_scholes import ref_python_black_scholes +import blackscholes as bs import dpctl -import dpctl.memory as dpctl_mem +import dpctl.tensor as dpt def gen_option_params( n_opts, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, dtype, queue=None ): - nbytes = n_opts * 5 * np.dtype(dtype).itemsize - usm_mem = dpctl_mem.MemoryUSMShared(nbytes, queue=queue) - params = np.ndarray(shape=(n_opts, 5), buffer=usm_mem, dtype=dtype) + params = dpt.empty((n_opts, 5), dtype=dtype, sycl_queue=queue) seed = 1234 - bs.populate_params( - params, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, seed, queue=queue - ) + bs.populate_params(params, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, seed) return params -# ==== dry run === -usm_mem = dpctl_mem.MemoryUSMShared(3 * 5 * np.dtype("d").itemsize) -opts = np.ndarray((3, 5), buffer=usm_mem, dtype="d") -# copy from Host NumPy to USM buffer -opts[:, :] = np.array( - [ - [81.2, 81.8, 29, 0.01, 0.02], - [24.24, 22.1, 10, 0.02, 0.08], - [100, 100, 30, 0.01, 0.12], - ] -) -# GPU computation -Xgpu = bs.black_scholes_price(opts) - -# compute prices in CPython -X_ref = np.array([ref_python_black_scholes(*opt) for opt in opts], dtype="d") - -print( - "Correctness check: allclose(Xgpu, Xref) == ", - np.allclose(Xgpu, X_ref, atol=1e-5), -) - -n_opts = 3 * 10**6 +n_opts = 3 * 10**7 # compute on CPU sycl device @@ -91,21 +63,31 @@ def gen_option_params( 0.05, 0.01, 0.05, - "d", + "f", queue=q, ) opt_params_list.append(opt_params) times_dict = dict() +dtype_dict = dict() + for q, params in zip(queues, opt_params_list): times_list = [] for _ in range(5): t0 = timeit.default_timer() - X1 = bs.black_scholes_price(params, queue=q) + X1 = bs.black_scholes_price(params) t1 = timeit.default_timer() times_list.append(t1 - t0) times_dict[q.name] = times_list + dtype_dict[q.name] = params.dtype +print( + f"Pricing {n_opts:,} vanilla European options using " + "Black-Scholes-Merton formula" +) +print("") for dev_name, wall_times in times_dict.items(): print("Using : {}".format(dev_name)) - print("Wall times : {}".format(wall_times)) + print( + "Wall times : {} for dtype={}".format(wall_times, dtype_dict[dev_name]) + ) diff --git a/examples/cython/usm_memory/setup.py b/examples/cython/usm_memory/setup.py index 5d87a271a4..2f594d4656 100644 --- a/examples/cython/usm_memory/setup.py +++ b/examples/cython/usm_memory/setup.py @@ -1,6 +1,6 @@ # Data Parallel Control (dpctl) # -# Copyright 2020-2022 Intel Corporation +# Copyright 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -19,9 +19,61 @@ import numpy as np from setuptools import Extension, setup +from setuptools.command.build_ext import build_ext import dpctl + +class custom_build_ext(build_ext): + def build_extensions(self): + self.compiler.set_executable("compiler_so", "icpx -fsycl -fPIC") + self.compiler.set_executable("compiler_cxx", "icpx -fsycl -fPIC") + self.compiler.set_executable( + "linker_so", + "icpx -fsycl -shared -fpic -fsycl-device-code-split=per_kernel", + ) + build_ext.build_extensions(self) + + +ext_modules = [ + Extension( + name="blackscholes._blackscholes_usm", + sources=[ + "blackscholes/blackscholes.pyx", + ], + depends=[ + "src/sycl_black_scholes.hpp", + ], + include_dirs=[ + "./src", + np.get_include(), + dpctl.get_include(), + os.path.join(sysconfig.get_paths()["include"], ".."), + ], + library_dirs=[ + os.path.join(sysconfig.get_paths()["stdlib"], ".."), + ], + libraries=["sycl"] + + [ + "mkl_sycl", + "mkl_intel_ilp64", + "mkl_tbb_thread", + "mkl_core", + "tbb", + ], + runtime_library_dirs=[], + extra_compile_args=[ + "-Wall", + "-Wextra", + "-fsycl", + "-fno-fast-math", + ], + extra_link_args=["-fPIC"], + language="c++", + ) +] + + setup( name="blackscholes_usm", version="0.0.0", @@ -34,39 +86,6 @@ license="Apache 2.0", author="Intel Corporation", url="https://github.com/IntelPython/dpctl", - ext_modules=[ - Extension( - name="blackscholes_usm", - sources=[ - "blackscholes.pyx", - "sycl_blackscholes.cpp", - ], - include_dirs=[ - ".", - np.get_include(), - dpctl.get_include(), - os.path.join(sysconfig.get_paths()["include"], ".."), - ], - library_dirs=[ - os.path.join(sysconfig.get_paths()["stdlib"], ".."), - ], - libraries=["sycl"] - + [ - "mkl_sycl", - "mkl_intel_ilp64", - "mkl_tbb_thread", - "mkl_core", - "tbb", - ], - runtime_library_dirs=[], - extra_compile_args=[ - "-Wall", - "-Wextra", - "-fsycl", - "-fsycl-unnamed-lambda", - ], - extra_link_args=["-fPIC"], - language="c++", - ) - ], + ext_modules=ext_modules, + cmdclass={"build_ext": custom_build_ext}, ) diff --git a/examples/cython/usm_memory/src/sycl_blackscholes.hpp b/examples/cython/usm_memory/src/sycl_blackscholes.hpp new file mode 100644 index 0000000000..81ed9e0c04 --- /dev/null +++ b/examples/cython/usm_memory/src/sycl_blackscholes.hpp @@ -0,0 +1,178 @@ +//=- sycl_blackscholes.cpp - Example of SYCL code to be called from Cython =// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements SYCL code to price European vanilla options using +/// Black-Scholes formula, as well as code to generate option parameters using +/// SYCL device random number generation library from Intel(R) Math Kernel +/// Library. +/// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +template class black_scholes_kernel; + +constexpr int n_params = 5; +constexpr int n_params_next_pow2 = 8; + +constexpr int n_prices = 2; +constexpr int PRICE = 0; +constexpr int STRIKE = 1; +constexpr int MATURITY = 2; +constexpr int RATE = 3; +constexpr int VOLATILITY = 4; +constexpr int CALL = 0; +constexpr int PUT = 1; + +template +void cpp_blackscholes(sycl::queue q, size_t n_opts, T *params, T *callput) +{ + using data_t = T; + + auto e = q.submit([&](sycl::handler &cgh) { + data_t zero = data_t(0), one = data_t(1), two = data_t(2); + data_t quarter = one / data_t(4); + data_t half = one / two; + + cgh.parallel_for>( + sycl::range<1>(n_opts), + [=](sycl::id<1> idx) + { + const size_t i = n_params * idx[0]; + const data_t opt_price = params[i + PRICE]; + const data_t opt_strike = params[i + STRIKE]; + const data_t opt_maturity = params[i + MATURITY]; + const data_t opt_rate = params[i + RATE]; + const data_t opt_volatility = params[i + VOLATILITY]; + data_t a, b, c, y, z, e, d1, d1c, d2, d2c, w1, w2; + data_t mr = -opt_rate, + sig_sig_two = two * opt_volatility * opt_volatility; + + a = sycl::log(opt_price / opt_strike); + b = opt_maturity * mr; + z = opt_maturity * sig_sig_two; + + c = quarter * z; + e = sycl::exp(b); + y = sycl::rsqrt(z); + + a = b - a; + w1 = (a - c) * y; + w2 = (a + c) * y; + + if (w1 < zero) { + d1 = sycl::erfc(w1) * half; + d1c = one - d1; + } + else { + d1c = sycl::erfc(-w1) * half; + d1 = one - d1c; + } + if (w2 < zero) { + d2 = sycl::erfc(w2) * half; + d2c = one - d2; + } + else { + d2c = sycl::erfc(-w2) * half; + d2 = one - d2c; + } + + e *= opt_strike; + data_t call_price = opt_price * d1 - e * d2; + data_t put_price = e * d2c - opt_price * d1c; + + const size_t callput_i = n_prices * idx[0]; + callput[callput_i + CALL] = call_price; + callput[callput_i + PUT] = put_price; + }); + }); + + e.wait_and_throw(); + + return; +} + +template +void cpp_populate_params(sycl::queue q, + size_t n_opts, + T *params, + T pl, + T ph, + T sl, + T sh, + T tl, + T th, + T rl, + T rh, + T vl, + T vh, + int seed) +{ + + sycl::event e = q.submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl::range<1>(n_opts), [=](sycl::item<1> idx) { + size_t i = n_params * idx.get_id(0); + size_t j = n_params_next_pow2 * idx.get_id(0); + + // create engine to sample 5 parameters per workers + oneapi::mkl::rng::device::philox4x32x10 engine( + seed, j); + oneapi::mkl::rng::device::uniform distr; + + sycl::vec res = + oneapi::mkl::rng::device::generate(distr, engine); + + { + const int pos = PRICE; + auto u = res[pos]; + params[i + pos] = pl * u + ph * (T(1) - u); + } + { + const int pos = STRIKE; + auto u = res[pos]; + params[i + pos] = sl * u + sh * (T(1) - u); + } + { + const int pos = MATURITY; + auto u = res[pos]; + params[i + pos] = tl * u + th * (T(1) - u); + } + { + const int pos = RATE; + auto u = res[pos]; + params[i + pos] = rl * u + rh * (T(1) - u); + } + { + const int pos = VOLATILITY; + auto u = res[pos]; + params[i + pos] = vl * u + vh * (T(1) - u); + } + }); + }); + + e.wait_and_throw(); + + return; +} diff --git a/examples/cython/usm_memory/sycl_blackscholes.cpp b/examples/cython/usm_memory/sycl_blackscholes.cpp deleted file mode 100644 index e8bac4703e..0000000000 --- a/examples/cython/usm_memory/sycl_blackscholes.cpp +++ /dev/null @@ -1,245 +0,0 @@ -//=- sycl_blackscholes.cpp - Example of SYCL code to be called from Cython =// -// -// Data Parallel Control (dpctl) -// -// Copyright 2020-2022 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This file implements SYCL code to price European vanilla options using -/// Black-Scholes formula, as well as code to generate option parameters using -/// SYCL device random number generation library from Intel(R) Math Kernel -/// Library. -/// -//===----------------------------------------------------------------------===// - -#include "sycl_blackscholes.hpp" -#include "dpctl_sycl_interface.h" -#include -#include -#include - -template class black_scholes_kernel; - -constexpr int n_params = 5; -constexpr int n_params_next_pow2 = 8; - -constexpr int n_prices = 2; -constexpr int PRICE = 0; -constexpr int STRIKE = 1; -constexpr int MATURITY = 2; -constexpr int RATE = 3; -constexpr int VOLATILITY = 4; -constexpr int CALL = 0; -constexpr int PUT = 1; - -template -extern void -cpp_blackscholes(DPCTLSyclQueueRef q_ptr, size_t n_opts, T *params, T *callput) -{ - using data_t = T; - - sycl::queue q = *(reinterpret_cast(q_ptr)); - - auto ctx = q.get_context(); - { - sycl::usm::alloc params_type = sycl::get_pointer_type(params, ctx); - if (params_type != sycl::usm::alloc::shared) { - throw std::runtime_error("Input option_params to cpp_blackscholes " - "is not a USM-shared pointer."); - } - } - { - sycl::usm::alloc callput_type = sycl::get_pointer_type(callput, ctx); - if (callput_type != sycl::usm::alloc::shared) { - throw std::runtime_error("Input callput to cpp_blackscholes is not " - "a USM-shared pointer."); - } - } - - auto e = q.submit([&](sycl::handler &cgh) { - data_t zero = data_t(0), one = data_t(1), two = data_t(2); - data_t quarter = one / data_t(4); - data_t half = one / two; - - cgh.parallel_for>( - sycl::range<1>(n_opts), - [=](sycl::id<1> idx) - { - const size_t i = n_params * idx[0]; - const data_t opt_price = params[i + PRICE]; - const data_t opt_strike = params[i + STRIKE]; - const data_t opt_maturity = params[i + MATURITY]; - const data_t opt_rate = params[i + RATE]; - const data_t opt_volatility = params[i + VOLATILITY]; - data_t a, b, c, y, z, e, d1, d1c, d2, d2c, w1, w2; - data_t mr = -opt_rate, - sig_sig_two = two * opt_volatility * opt_volatility; - - a = sycl::log(opt_price / opt_strike); - b = opt_maturity * mr; - z = opt_maturity * sig_sig_two; - - c = quarter * z; - e = sycl::exp(b); - y = sycl::rsqrt(z); - - a = b - a; - w1 = (a - c) * y; - w2 = (a + c) * y; - - if (w1 < zero) { - d1 = sycl::erfc(w1) * half; - d1c = one - d1; - } - else { - d1c = sycl::erfc(-w1) * half; - d1 = one - d1c; - } - if (w2 < zero) { - d2 = sycl::erfc(w2) * half; - d2c = one - d2; - } - else { - d2c = sycl::erfc(-w2) * half; - d2 = one - d2c; - } - - e *= opt_strike; - data_t call_price = opt_price * d1 - e * d2; - data_t put_price = e * d2c - opt_price * d1c; - - const size_t callput_i = n_prices * idx[0]; - callput[callput_i + CALL] = call_price; - callput[callput_i + PUT] = put_price; - }); - }); - - e.wait_and_throw(); - - return; -} - -template -void cpp_populate_params(DPCTLSyclQueueRef q_ptr, - size_t n_opts, - T *params, - T pl, - T ph, - T sl, - T sh, - T tl, - T th, - T rl, - T rh, - T vl, - T vh, - int seed) -{ - sycl::queue q = *(reinterpret_cast(q_ptr)); - - auto ctx = q.get_context(); - { - sycl::usm::alloc params_type = sycl::get_pointer_type(params, ctx); - if (params_type != sycl::usm::alloc::shared) { - throw std::runtime_error("Input option_params to cpp_blackscholes " - "is not a USM-shared pointer."); - } - } - - sycl::event e = q.submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::range<1>(n_opts), [=](sycl::item<1> idx) { - size_t i = n_params * idx.get_id(0); - size_t j = n_params_next_pow2 * idx.get_id(0); - - // create engine to sample 5 parameters per workers - oneapi::mkl::rng::device::philox4x32x10 engine( - seed, j); - oneapi::mkl::rng::device::uniform distr; - - sycl::vec res = - oneapi::mkl::rng::device::generate(distr, engine); - - { - const int pos = PRICE; - auto u = res[pos]; - params[i + pos] = pl * u + ph * (T(1) - u); - } - { - const int pos = STRIKE; - auto u = res[pos]; - params[i + pos] = sl * u + sh * (T(1) - u); - } - { - const int pos = MATURITY; - auto u = res[pos]; - params[i + pos] = tl * u + th * (T(1) - u); - } - { - const int pos = RATE; - auto u = res[pos]; - params[i + pos] = rl * u + rh * (T(1) - u); - } - { - const int pos = VOLATILITY; - auto u = res[pos]; - params[i + pos] = vl * u + vh * (T(1) - u); - } - }); - }); - - e.wait_and_throw(); -} - -// instantation for object files to not be empty - -template void cpp_blackscholes(DPCTLSyclQueueRef q_ptr, - size_t n_opts, - double *params, - double *callput); -template void cpp_blackscholes(DPCTLSyclQueueRef q_ptr, - size_t n_opts, - float *params, - float *callput); - -template void cpp_populate_params(DPCTLSyclQueueRef q_ptr, - size_t n_opts, - double *params, - double pl, - double ph, - double sl, - double sh, - double tl, - double th, - double rl, - double rh, - double vl, - double vh, - int seed); -template void cpp_populate_params(DPCTLSyclQueueRef q_ptr, - size_t n_opts, - float *params, - float pl, - float ph, - float sl, - float sh, - float tl, - float th, - float rl, - float rh, - float vl, - float vh, - int seed); diff --git a/examples/cython/usm_memory/sycl_blackscholes.hpp b/examples/cython/usm_memory/sycl_blackscholes.hpp deleted file mode 100644 index 4d5d449fb3..0000000000 --- a/examples/cython/usm_memory/sycl_blackscholes.hpp +++ /dev/null @@ -1,48 +0,0 @@ -//=- sycl_blackscholes.hpp - Example of SYCL code to be called from Cython =// -// -// Data Parallel Control (dpctl) -// -// Copyright 2020-2022 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This file exports C++ functions to be called from Cython-generated -/// extensions. -/// -//===----------------------------------------------------------------------===// - -#include "dpctl_sycl_interface.h" -#include - -template -extern void -cpp_blackscholes(DPCTLSyclQueueRef q, size_t n_opts, T *params, T *callput); - -template -extern void cpp_populate_params(DPCTLSyclQueueRef q, - size_t n_opts, - T *params, - T pl, - T ph, - T sl, - T sh, - T tl, - T th, - T rl, - T rh, - T vl, - T vh, - int seed); diff --git a/examples/cython/usm_memory/reference_black_scholes.py b/examples/cython/usm_memory/tests/test_black_scholes.py similarity index 54% rename from examples/cython/usm_memory/reference_black_scholes.py rename to examples/cython/usm_memory/tests/test_black_scholes.py index e4cf554b64..2cd86cde82 100644 --- a/examples/cython/usm_memory/reference_black_scholes.py +++ b/examples/cython/usm_memory/tests/test_black_scholes.py @@ -16,6 +16,13 @@ import math +import blackscholes +import numpy as np +import pytest + +import dpctl +import dpctl.tensor as dpt + def ref_python_black_scholes(price, strike, t, rate, vol): mr = -rate @@ -53,3 +60,35 @@ def ref_python_black_scholes(price, strike, t, rate, vol): call = P * d1 - Se * d2 put = Se * d2c - P * d1c return (call, put) + + +@pytest.mark.parametrize("dtype", [dpt.float32, dpt.float64]) +def test_black_scholes_merton(dtype): + try: + q = dpctl.SyclQueue() + except dpctl.SyclQueueCreationError: + pytest.skip("Unable to create queue") + if dtype == dpt.float64 and not q.sycl_device.has_aspect_fp64: + pytest.skip(f"Hardware {q.sycl_device.name} does not support {dtype}") + opts = dpt.empty((3, 5), dtype=dtype) + # copy from Host NumPy to USM buffer + opts[:, :] = dpt.asarray( + [ + [81.2, 81.8, 29, 0.01, 0.02], + [24.24, 22.1, 10, 0.02, 0.08], + [100, 100, 30, 0.01, 0.12], + ], + dtype=dtype, + ) + X = blackscholes.black_scholes_price(opts) + + # compute prices in Python + X_ref = np.array( + [ref_python_black_scholes(*opt) for opt in dpt.asnumpy(opts)], + dtype=dtype, + ) + + tol = 64 * dpt.finfo(dtype).eps + assert np.allclose(dpt.asnumpy(X), X_ref, atol=tol, rtol=tol), np.abs( + dpt.asnumpy(X) - X_ref + ).max() diff --git a/examples/pybind11/README.md b/examples/pybind11/README.md new file mode 100644 index 0000000000..e9cf832885 --- /dev/null +++ b/examples/pybind11/README.md @@ -0,0 +1,9 @@ +# Examples of data-parallel Python extensions written with pybind11 + +The `dpctl` provides integration header `dpctl4pybind11.hpp` which implements type casters +establishing mapping between `dpctl.SyclQueue` and `sycl::queue`, `dpctl.SyclDevice` and `sycl::device`, +`dpctl.SyclEvent` and `sycl::event`, etc. + +The header also defines C++ classes `dpctl::tensor::usm_ndarray` and `dpctl::memory::usm_memory` which +derive from `pybind11::object` and encapsulate Python objects of types `dpctl.tensor.usm_ndarray` and +`dpctl.memory._Memory` respectively. diff --git a/examples/python/README.md b/examples/python/README.md new file mode 100644 index 0000000000..194952c8b3 --- /dev/null +++ b/examples/python/README.md @@ -0,0 +1 @@ +# Python examples of using `dpctl`