diff --git a/README.md b/README.md index 4a09b2b7bc..e8ef93908b 100644 --- a/README.md +++ b/README.md @@ -76,11 +76,17 @@ Examples ======== See examples in folder `examples`. -Run examples: +Run python examples: ```bash -python examples/create_sycl_queues.py +for script in `ls examples/python/`; do echo "executing ${script}"; python examples/python/${script}; done ``` +Examples of building Cython extensions with DPC++ compiler, that interoperate with dpCtl can be found in +folder `cython`. + +Each example in `cython` folder can be built using `CC=clang CXX=dpcpp python setup.py build_ext --inplace`. +Please refer to `run.py` script in respective folders to execute extensions. + Tests ===== See tests in folder `dpctl/tests`. diff --git a/examples/cython/sycl_buffer/README.md b/examples/cython/sycl_buffer/README.md new file mode 100644 index 0000000000..47e682d8cb --- /dev/null +++ b/examples/cython/sycl_buffer/README.md @@ -0,0 +1,80 @@ +#1 Example of SYCL extension working NumPy array input via SYCL buffers + + +#2 Decription + +Cython function expecting a 2D array in C-contiguous layout that +computes column-wise total by using SYCL oneMKL (as GEMV call with +an all units vector). + +Example illustrates compiling SYCL extension, linking to oneMKL. + + +#2 Compiling + +``` +# make sure oneAPI is activated, $ONEAPI_ROOT must be set +CC=clang CXX=dpcpp python setup.py build_ext --inplace +``` + + +#2 Running + +``` +# SYCL_BE=PI_OPENCL sets SYCL backend to OpenCL to avoid a +# transient issue with MKL's using the default Level-0 backend +(idp) [08:16:12 ansatnuc04 simple]$ SYCL_BE=PI_OPENCL ipython +Python 3.7.7 (default, Jul 14 2020, 22:02:37) +Type 'copyright', 'credits' or 'license' for more information +IPython 7.17.0 -- An enhanced Interactive Python. Type '?' for help. + +In [1]: import syclbuffer as sb, numpy as np, dpctl + +In [2]: x = np.random.randn(10**4, 2500) + +In [3]: %time m1 = np.sum(x, axis=0) +CPU times: user 22.3 ms, sys: 160 µs, total: 22.5 ms +Wall time: 21.2 ms + +In [4]: %time m = sb.columnwise_total(x) # first time is slower, due to JIT overhead +CPU times: user 207 ms, sys: 36.1 ms, total: 243 ms +Wall time: 248 ms + +In [5]: %time m = sb.columnwise_total(x) +CPU times: user 8.89 ms, sys: 4.12 ms, total: 13 ms +Wall time: 12.4 ms + +In [6]: %time m = sb.columnwise_total(x) +CPU times: user 4.82 ms, sys: 8.06 ms, total: 12.9 ms +Wall time: 12.3 ms +``` + +Running bench.py: + +``` +========== Executing warm-up ========== +NumPy result: [1. 1. 1. ... 1. 1. 1.] +SYCL(Intel(R) Core(TM) i7-10710U CPU @ 1.10GHz) result: [1. 1. 1. ... 1. 1. 1.] +SYCL(Intel(R) Gen9 HD Graphics NEO) result: [1. 1. 1. ... 1. 1. 1.] +Times for 'opencl:cpu:0' +[2.864787499012891, 2.690436460019555, 2.5902308400254697, 2.5802528870408423, 2.538990616973024] +Times for 'opencl:gpu:0' +[1.9769684099592268, 2.3491444009705447, 2.293720397981815, 2.391633405990433, 1.9465659779962152] +Times for NumPy +[3.4011058019823395, 3.07286038500024, 3.0390414349967614, 3.0305576199898496, 3.002687797998078] +``` + +Running run.py: + +``` +(idp) [09:14:53 ansatnuc04 sycl_buffer]$ SYCL_BE=PI_OPENCL python run.py +Result computed by NumPy +[ 0.27170187 -23.36798583 7.31326489 -1.95121928] +Result computed by SYCL extension +[ 0.27170187 -23.36798583 7.31326489 -1.95121928] + +Running on: Intel(R) Gen9 HD Graphics NEO +[ 0.27170187 -23.36798583 7.31326489 -1.95121928] +Running on: Intel(R) Core(TM) i7-10710U CPU @ 1.10GHz +[ 0.27170187 -23.36798583 7.31326489 -1.95121928] +``` \ No newline at end of file diff --git a/examples/cython/sycl_buffer/_buffer_example.pyx b/examples/cython/sycl_buffer/_buffer_example.pyx new file mode 100644 index 0000000000..d1ade59c92 --- /dev/null +++ b/examples/cython/sycl_buffer/_buffer_example.pyx @@ -0,0 +1,28 @@ +cimport numpy as cnp +import numpy as np + +cimport dpctl as c_dpctl +import dpctl + +cdef extern from "use_sycl_buffer.h": + int c_columnwise_total(c_dpctl.DPPLSyclQueueRef q, size_t n, size_t m, double *m, double *ct) nogil + int c_columnwise_total_no_mkl(c_dpctl.DPPLSyclQueueRef q, size_t n, size_t m, double *m, double *ct) nogil + +def columnwise_total(double[:, ::1] v, method='mkl'): + cdef cnp.ndarray res_array = np.empty((v.shape[1],), dtype='d') + cdef double[::1] res_memslice = res_array + cdef int ret_status + cdef c_dpctl.SyclQueue q + cdef c_dpctl.DPPLSyclQueueRef q_ref + + q = c_dpctl.get_current_queue() + q_ref = q.get_queue_ref() + + if method == 'mkl': + with nogil: + ret_status = c_columnwise_total(q_ref, v.shape[0], v.shape[1], &v[0,0], &res_memslice[0]) + else: + with nogil: + ret_status = c_columnwise_total_no_mkl(q_ref, v.shape[0], v.shape[1], &v[0,0], &res_memslice[0]) + + return res_array diff --git a/examples/cython/sycl_buffer/bench.py b/examples/cython/sycl_buffer/bench.py new file mode 100644 index 0000000000..0c6d94d189 --- /dev/null +++ b/examples/cython/sycl_buffer/bench.py @@ -0,0 +1,51 @@ +import dpctl +import syclbuffer as sb +import numpy as np + +X = np.full((10 ** 4, 4098), 1e-4, dtype="d") + +# warm-up +print("=" * 10 + " Executing warm-up " + "=" * 10) +print("NumPy result: ", X.sum(axis=0)) + +dpctl.set_default_queue("opencl", "cpu", 0) +print( + "SYCL({}) result: {}".format( + dpctl.get_current_queue().get_sycl_device().get_device_name(), + sb.columnwise_total(X), + ) +) + +dpctl.set_default_queue("opencl", "gpu", 0) +print( + "SYCL({}) result: {}".format( + dpctl.get_current_queue().get_sycl_device().get_device_name(), + sb.columnwise_total(X), + ) +) + +import timeit + +print("Times for 'opencl:cpu:0'") +print( + timeit.repeat( + stmt="sb.columnwise_total(X)", + setup='dpctl.set_default_queue("opencl", "cpu", 0); ' + "sb.columnwise_total(X)", # ensure JIT compilation is not counted + number=100, + globals=globals(), + ) +) + +print("Times for 'opencl:gpu:0'") +print( + timeit.repeat( + stmt="sb.columnwise_total(X)", + setup='dpctl.set_default_queue("opencl", "gpu", 0); sb.columnwise_total(X)', + number=100, + globals=globals(), + ) +) + +print("Times for NumPy") +print(timeit.repeat(stmt="X.sum(axis=0)", number=100, globals=globals())) diff --git a/examples/cython/sycl_buffer/run.py b/examples/cython/sycl_buffer/run.py new file mode 100644 index 0000000000..4e279a84e2 --- /dev/null +++ b/examples/cython/sycl_buffer/run.py @@ -0,0 +1,22 @@ +import syclbuffer as sb +import numpy as np + +X = np.random.randn(100, 4) + +print("Result computed by NumPy") +print(X.sum(axis=0)) +print("Result computed by SYCL extension") +print(sb.columnwise_total(X)) + + +print("") +# controlling where to offload +import dpctl + +with dpctl.device_context("opencl:gpu"): + print("Running on: ", dpctl.get_current_queue().get_sycl_device().get_device_name()) + print(sb.columnwise_total(X)) + +with dpctl.device_context("opencl:cpu"): + print("Running on: ", dpctl.get_current_queue().get_sycl_device().get_device_name()) + print(sb.columnwise_total(X)) diff --git a/examples/cython/sycl_buffer/setup.py b/examples/cython/sycl_buffer/setup.py new file mode 100644 index 0000000000..ef9b6f3b78 --- /dev/null +++ b/examples/cython/sycl_buffer/setup.py @@ -0,0 +1,67 @@ +import sys +from os.path import join, exists, abspath, dirname +from os import getcwd +from os import environ +from Cython.Build import cythonize + + +def configuration(parent_package="", top_path=None): + from numpy.distutils.misc_util import Configuration + from numpy.distutils.system_info import get_info + import numpy as np + import dpctl + + config = Configuration("", parent_package, top_path) + + oneapi_root = environ.get("ONEAPI_ROOT", None) + if not oneapi_root: + raise ValueError("ONEAPI_ROOT must be set, typical value is /opt/intel/oneapi") + + mkl_info = { + "include_dirs": [join(oneapi_root, "mkl", "include")], + "library_dirs": [ + join(oneapi_root, "mkl", "lib"), + join(oneapi_root, "mkl", "lib", "intel64"), + ], + "libraries": [ + "mkl_sycl", + "mkl_intel_ilp64", + "mkl_tbb_thread", + "mkl_core", + "tbb", + "iomp5", + ], + } + + mkl_include_dirs = mkl_info.get("include_dirs") + mkl_library_dirs = mkl_info.get("library_dirs") + mkl_libraries = mkl_info.get("libraries") + + pdir = dirname(__file__) + wdir = join(pdir) + + eca = ["-Wall", "-Wextra", "-fsycl", "-fsycl-unnamed-lambda"] + + config.add_extension( + name="syclbuffer", + sources=[ + join(pdir, "_buffer_example.pyx"), + join(wdir, "use_sycl_buffer.cpp"), + join(wdir, "use_sycl_buffer.h"), + ], + include_dirs=[wdir, np.get_include(), dpctl.get_include()] + mkl_include_dirs, + libraries=["sycl"] + mkl_libraries, + runtime_library_dirs=mkl_library_dirs, + extra_compile_args=eca, # + ['-O0', '-g', '-ggdb'], + extra_link_args=["-fPIC"], + language="c++", + ) + + config.ext_modules = cythonize(config.ext_modules, include_path=[pdir, wdir]) + return config + + +if __name__ == "__main__": + from numpy.distutils.core import setup + + setup(configuration=configuration) diff --git a/examples/cython/sycl_buffer/use_sycl_buffer.cpp b/examples/cython/sycl_buffer/use_sycl_buffer.cpp new file mode 100644 index 0000000000..0c42332ea7 --- /dev/null +++ b/examples/cython/sycl_buffer/use_sycl_buffer.cpp @@ -0,0 +1,109 @@ +#include +#include "use_sycl_buffer.h" +#include +#include "dppl_sycl_types.h" + +int +c_columnwise_total(DPPLSyclQueueRef q_ref, size_t n, size_t m, double *mat, double *ct) { + + sycl::queue q = *(reinterpret_cast(q_ref)); + + sycl::buffer mat_buffer = sycl::buffer(mat, sycl::range<1>(n * m)); + sycl::buffer ct_buffer = sycl::buffer(ct, sycl::range<1>(m)); + + double *ones = reinterpret_cast(malloc(n * sizeof(double))); + { + sycl::buffer ones_buffer = sycl::buffer(ones, sycl::range<1>(n)); + + try { + auto ev = q.submit([&](sycl::handler &cgh) { + auto ones_acc = ones_buffer.get_access(cgh); + cgh.fill(ones_acc, double(1.0)); + }); + + ev.wait_and_throw(); + } + catch (sycl::exception const& e) { + std::cout << "\t\tCaught synchronous SYCL exception during fill:\n" + << e.what() << std::endl << "OpenCL status: " << e.get_cl_code() << std::endl; + goto cleanup; + } + + try { + oneapi::mkl::blas::row_major::gemv( + q, + oneapi::mkl::transpose::trans, + n, m, double(1.0), mat_buffer, m, + ones_buffer, 1, + double(0.0), ct_buffer, 1); + q.wait(); + } + catch (sycl::exception const &e) { + std::cout << "\t\tCaught synchronous SYCL exception during GEMV:\n" + << e.what() << std::endl << "OpenCL status: " << e.get_cl_code() << std::endl; + goto cleanup; + } + } + + free(ones); + return 0; + + cleanup: + free(ones); + return -1; +} + +inline size_t upper_multiple(size_t n, size_t wg) { return wg * ((n + wg - 1)/wg); } + +int +c_columnwise_total_no_mkl(DPPLSyclQueueRef q_ref, size_t n, size_t m, double *mat, double *ct) { + + sycl::queue q = *(reinterpret_cast(q_ref)); + + sycl::buffer mat_buffer = sycl::buffer(mat, sycl::range<2>(n, m)); + sycl::buffer ct_buffer = sycl::buffer(ct, sycl::range<1>(m)); + + auto e = q.submit( + [&](sycl::handler &h) { + sycl::accessor ct_acc {ct_buffer, h, sycl::write_only}; + h.parallel_for( + sycl::range<1>(m), + [=](sycl::id<1> i){ + ct_acc[i] = 0.0; + }); + }); + + constexpr size_t wg = 256; + auto e2 = q.submit( + [&](sycl::handler &h) { + + sycl::accessor mat_acc {mat_buffer, h, sycl::read_only}; + sycl::accessor ct_acc {ct_buffer, h}; + h.depends_on(e); + + sycl::range<2> global {upper_multiple(n, wg), m}; + sycl::range<2> local {wg, 1}; + + h.parallel_for( + sycl::nd_range<2>(global, local), + [=](sycl::nd_item<2> it) { + size_t i = it.get_global_id(0); + size_t j = it.get_global_id(1); + double group_sum = sycl::ONEAPI::reduce( + it.get_group(), + (i < n) ? mat_acc[it.get_global_id()] : 0.0, + std::plus() + ); + if (it.get_local_id(0) == 0) { + sycl::ONEAPI::atomic_ref< + double, + sycl::ONEAPI::memory_order::relaxed, + sycl::ONEAPI::memory_scope::system, + sycl::access::address_space::global_space>(ct_acc[j]) += group_sum; + } + }); + }); + + e2.wait_and_throw(); + return 0; +} diff --git a/examples/cython/sycl_buffer/use_sycl_buffer.h b/examples/cython/sycl_buffer/use_sycl_buffer.h new file mode 100644 index 0000000000..f3ee924861 --- /dev/null +++ b/examples/cython/sycl_buffer/use_sycl_buffer.h @@ -0,0 +1,7 @@ +#include +#include "dppl_sycl_types.h" + +extern int c_columnwise_total( + DPPLSyclQueueRef q, size_t n, size_t m, double *mat, double *ct); +extern int c_columnwise_total_no_mkl( + DPPLSyclQueueRef q, size_t n, size_t m, double *mat, double *ct); diff --git a/examples/cython/sycl_direct_linkage/_buffer_example.pyx b/examples/cython/sycl_direct_linkage/_buffer_example.pyx new file mode 100644 index 0000000000..e9aca0fcfe --- /dev/null +++ b/examples/cython/sycl_direct_linkage/_buffer_example.pyx @@ -0,0 +1,25 @@ +cimport numpy as cnp +import numpy as np +from cython.operator cimport dereference as deref + +cdef extern from "CL/sycl.hpp" namespace "cl::sycl": + cdef cppclass queue nogil: + pass + +cdef extern from "sycl_function.hpp": + int c_columnwise_total(queue& q, size_t n, size_t m, double *m, double *ct) nogil + +def columnwise_total(double[:, ::1] v): + cdef cnp.ndarray res_array = np.empty((v.shape[1],), dtype='d') + cdef double[::1] res_memslice = res_array + cdef int ret_status + cdef queue* q + + q = new queue() + + with nogil: + ret_status = c_columnwise_total(deref(q), v.shape[0], v.shape[1], &v[0,0], &res_memslice[0]) + + del q + + return res_array diff --git a/examples/cython/sycl_direct_linkage/run.py b/examples/cython/sycl_direct_linkage/run.py new file mode 100644 index 0000000000..ed9597add1 --- /dev/null +++ b/examples/cython/sycl_direct_linkage/run.py @@ -0,0 +1,10 @@ +import syclbuffer as sb +import numpy as np + +X = np.random.randn(20, 10) + +# compute column-wise total with NumPy's own host code +print(X.sum(axis=0)) + +# compute column-wise total with SYCL extension +print(sb.columnwise_total(X)) diff --git a/examples/cython/sycl_direct_linkage/setup.py b/examples/cython/sycl_direct_linkage/setup.py new file mode 100644 index 0000000000..495838b1fd --- /dev/null +++ b/examples/cython/sycl_direct_linkage/setup.py @@ -0,0 +1,67 @@ +import sys +from os.path import join, exists, abspath, dirname +from os import getcwd +from os import environ +from Cython.Build import cythonize + + +def configuration(parent_package="", top_path=None): + from numpy.distutils.misc_util import Configuration + from numpy.distutils.system_info import get_info + import numpy as np + import dpctl + + config = Configuration("", parent_package, top_path) + + oneapi_root = environ.get("ONEAPI_ROOT", None) + if not oneapi_root: + raise ValueError("ONEAPI_ROOT must be set, typical value is /opt/intel/oneapi") + + mkl_info = { + "include_dirs": [join(oneapi_root, "mkl", "include")], + "library_dirs": [ + join(oneapi_root, "mkl", "lib"), + join(oneapi_root, "mkl", "lib", "intel64"), + ], + "libraries": [ + "mkl_sycl", + "mkl_intel_ilp64", + "mkl_tbb_thread", + "mkl_core", + "tbb", + "iomp5", + ], + } + + mkl_include_dirs = mkl_info.get("include_dirs") + mkl_library_dirs = mkl_info.get("library_dirs") + mkl_libraries = mkl_info.get("libraries") + + pdir = dirname(__file__) + wdir = join(pdir) + + eca = ["-Wall", "-Wextra", "-fsycl", "-fsycl-unnamed-lambda"] + + config.add_extension( + name="syclbuffer_naive", + sources=[ + join(pdir, "_buffer_example.pyx"), + join(pdir, "sycl_function.cpp"), + join(pdir, "sycl_function.hpp"), + ], + include_dirs=[wdir, np.get_include(), dpctl.get_include()] + mkl_include_dirs, + libraries=["sycl"] + mkl_libraries, + runtime_library_dirs=mkl_library_dirs, + extra_compile_args=eca, # + ['-O0', '-g', '-ggdb'], + extra_link_args=["-fPIC"], + language="c++", + ) + + config.ext_modules = cythonize(config.ext_modules, include_path=[pdir, wdir]) + return config + + +if __name__ == "__main__": + from numpy.distutils.core import setup + + setup(configuration=configuration) diff --git a/examples/cython/sycl_direct_linkage/sycl_function.cpp b/examples/cython/sycl_direct_linkage/sycl_function.cpp new file mode 100644 index 0000000000..d9d8065f3e --- /dev/null +++ b/examples/cython/sycl_direct_linkage/sycl_function.cpp @@ -0,0 +1,51 @@ +#include +#include "sycl_function.hpp" +#include "mkl_blas_sycl.hpp" +#include "mkl.h" + +int c_columnwise_total(cl::sycl::queue &q, size_t n, size_t m, double *mat, double *ct) { + sycl::buffer mat_buffer = sycl::buffer(mat, sycl::range<1>(n * m)); + sycl::buffer ct_buffer = sycl::buffer(ct, sycl::range<1>(m)); + + double *ones = reinterpret_cast(malloc(n * sizeof(double))); + { + sycl::buffer ones_buffer = sycl::buffer(ones, sycl::range<1>(n)); + + try { + auto ev = q.submit([&](sycl::handler &cgh) { + auto ones_acc = ones_buffer.get_access(cgh); + cgh.fill(ones_acc, double(1.0)); + }); + + ev.wait_and_throw(); + } + catch (sycl::exception const& e) { + std::cout << "\t\tCaught synchronous SYCL exception during fill:\n" + << e.what() << std::endl << "OpenCL status: " << e.get_cl_code() << std::endl; + goto cleanup; + } + + try { + oneapi::mkl::blas::row_major::gemv( + q, + oneapi::mkl::transpose::trans, + n, m, double(1.0), mat_buffer, m, + ones_buffer, 1, + double(0.0), ct_buffer, 1); + q.wait(); + } + catch (sycl::exception const &e) { + std::cout << "\t\tCaught synchronous SYCL exception during GEMV:\n" + << e.what() << std::endl << "OpenCL status: " << e.get_cl_code() << std::endl; + goto cleanup; + } + } + + free(ones); + return 0; + + cleanup: + free(ones); + return -1; +} + diff --git a/examples/cython/sycl_direct_linkage/sycl_function.hpp b/examples/cython/sycl_direct_linkage/sycl_function.hpp new file mode 100644 index 0000000000..51e5e8474b --- /dev/null +++ b/examples/cython/sycl_direct_linkage/sycl_function.hpp @@ -0,0 +1,3 @@ +#include + +int c_columnwise_total(cl::sycl::queue&, size_t n, size_t m, double *mat, double *ct); diff --git a/examples/cython/usm_memory/README.md b/examples/cython/usm_memory/README.md new file mode 100644 index 0000000000..be3a7c6ce4 --- /dev/null +++ b/examples/cython/usm_memory/README.md @@ -0,0 +1,28 @@ +#1 Example of working with USM memory + +#2 Description + +#2 Building + +Make sure oneAPI is activated. Environment variable `$ONEAPI_ROOT` must be set. + + +``` +$ CC=clang CXX=dpcpp LD_SHARED="dpcpp -shared" python setup.py build_ext --inplace +``` + +#2 Running + +``` +$ python run.py +``` + +which gives sample output: + +``` +True +Using : Intel(R) Core(TM) i7-10710U CPU @ 1.10GHz +Elapsed: 0.9255791641771793 +Using : Intel(R) Gen9 +Elapsed: 0.32811625860631466 +``` \ No newline at end of file diff --git a/examples/cython/usm_memory/blackscholes.pyx b/examples/cython/usm_memory/blackscholes.pyx new file mode 100644 index 0000000000..6495265461 --- /dev/null +++ b/examples/cython/usm_memory/blackscholes.pyx @@ -0,0 +1,76 @@ +# cython: language_level=3 +# distutils: language=c++ + +cimport dpctl as c_dpctl +cimport dpctl._memory as c_dpctl_mem +cimport numpy as cnp +from cython cimport floating + +import dpctl +import numpy as np + +cdef extern from "sycl_blackscholes.hpp": + cdef void cpp_blackscholes[T](c_dpctl.DPPLSyclQueueRef, size_t n_opts, T* option_params, T* callput) + cdef void cpp_populate_params[T](c_dpctl.DPPLSyclQueueRef, size_t n_opts, T* option_params, T pl, T ph, T sl, T sh, T tl, T th, T rl, T rh, T vl, T vh, int seed) + +def black_scholes_price(floating[:, ::1] option_params): + cdef size_t n_opts = option_params.shape[0] + cdef size_t n_params = option_params.shape[1] + cdef c_dpctl.SyclQueue q + cdef c_dpctl.DPPLSyclQueueRef q_ptr + cdef c_dpctl_mem.MemoryUSMShared mobj + cdef floating[:, :] call_put_prices + cdef cnp.ndarray callput_arr + cdef double* dp1 + cdef double* dp2 + cdef float* fp1 + cdef float* fp2 + + if (n_params != 5): + raise ValueError(( + "Array of option parameters has unexpected number of columns {} != 5. " + "Each row must specify (current_price, strike_price, maturity, interest_rate, volatility)." + ).format(n_params)) + + q = c_dpctl.get_current_queue() + q_ptr = q.get_queue_ref() + if (floating is double): + mobj = c_dpctl_mem.MemoryUSMShared(nbytes=2*n_opts * sizeof(double)) + callput_arr = np.ndarray((n_opts, 2), buffer=mobj, dtype='d') + call_put_prices = callput_arr + dp1 = &option_params[0,0] + dp2 = &call_put_prices[0,0]; + cpp_blackscholes[double](q_ptr, n_opts, dp1, dp2) + elif (floating is float): + mobj = c_dpctl_mem.MemoryUSMShared(nbytes=2*n_opts * sizeof(float)) + callput_arr = np.ndarray((n_opts, 2), buffer=mobj, dtype='f') + call_put_prices = callput_arr + fp1 = &option_params[0,0] + fp2 = &call_put_prices[0,0] + cpp_blackscholes[float](q_ptr, n_opts, fp1, fp2) + + return callput_arr + +def populate_params(floating[:, ::1] option_params, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, int seed): + cdef size_t n_opts = option_params.shape[0] + cdef size_t n_params = option_params.shape[1] + + cdef c_dpctl.SyclQueue q + cdef c_dpctl.DPPLSyclQueueRef q_ptr + cdef double* dp + cdef float* fp + + if (n_params != 5): + raise ValueError(( + "Array of option parameters has unexpected number of columns {} != 5. " + "Each row must specify (current_price, strike_price, maturity, interest_rate, volatility)." + ).format(n_params)) + + q = c_dpctl.get_current_queue() + q_ptr = q.get_queue_ref() + if (floating is double): + dp = &option_params[0,0] + cpp_populate_params[double](q_ptr, n_opts, dp, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, seed) + elif (floating is float): + fp = &option_params[0,0] + cpp_populate_params[float](q_ptr, n_opts, fp, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, seed) diff --git a/examples/cython/usm_memory/reference_black_scholes.py b/examples/cython/usm_memory/reference_black_scholes.py new file mode 100644 index 0000000000..ae01312932 --- /dev/null +++ b/examples/cython/usm_memory/reference_black_scholes.py @@ -0,0 +1,39 @@ +import math + + +def ref_python_black_scholes(price, strike, t, rate, vol): + mr = -rate + sig_sig_two = vol * vol * 2 + + P = price + S = strike + T = t + + a = math.log(P / S) + b = T * mr + + z = T * sig_sig_two + c = 0.25 * z + y = 1 / math.sqrt(z) + + Se = math.exp(b) * S + + w1 = (a - b + c) * y + w2 = (a - b - c) * y + + if w1 > 0: + d1 = 0.5 * math.erfc(-w1) + d1c = 1.0 - d1 + else: + d1c = 0.5 * math.erfc(w1) + d1 = 1.0 - d1c + if w2 > 0: + d2 = 0.5 * math.erfc(-w2) + d2c = 1.0 - d2 + else: + d2c = 0.5 * math.erfc(w2) + d2 = 1.0 - d2c + + call = P * d1 - Se * d2 + put = Se * d2c - P * d1c + return (call, put) diff --git a/examples/cython/usm_memory/run.py b/examples/cython/usm_memory/run.py new file mode 100644 index 0000000000..422c4baaf1 --- /dev/null +++ b/examples/cython/usm_memory/run.py @@ -0,0 +1,76 @@ +# coding: utf-8 +import dpctl._memory as dpctl_mem +import blackscholes_usm as bs +import numpy as np, dpctl +from reference_black_scholes import ref_python_black_scholes + + +def gen_option_params(n_opts, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, dtype): + usm_mem = dpctl_mem.MemoryUSMShared(n_opts * 5 * np.dtype(dtype).itemsize) + # usm_mem2 = dpctl_mem.MemoryUSMDevice(n_opts * 5 * np.dtype(dtype).itemsize) + params = np.ndarray(shape=(n_opts, 5), buffer=usm_mem, dtype=dtype) + seed = 1234 + bs.populate_params(params, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, seed) + return params + + +# ==== dry run === +usm_mem = dpctl_mem.MemoryUSMShared(3 * 5 * np.dtype("d").itemsize) +opts = np.ndarray((3, 5), buffer=usm_mem, dtype="d") +# copy from Host NumPy to USM buffer +opts[:, :] = np.array( + [ + [81.2, 81.8, 29, 0.01, 0.02], + [24.24, 22.1, 10, 0.02, 0.08], + [100, 100, 30, 0.01, 0.12], + ] +) +# GPU computation +Xgpu = bs.black_scholes_price(opts) + +# compute prices in CPython +X_ref = np.array([ref_python_black_scholes(*opt) for opt in opts], dtype="d") + +print(np.allclose(Xgpu, X_ref, atol=1e-5)) + +n_opts = 3 * 10 ** 6 + +# compute on CPU sycl device +import timeit + +for _ in range(3): + + dpctl.set_default_queue("opencl", "cpu", 0) + print( + "Using : {}".format( + dpctl.get_current_queue().get_sycl_device().get_device_name() + ) + ) + + t0 = timeit.default_timer() + opts1 = gen_option_params( + n_opts, 20.0, 30.0, 22.0, 29.0, 18.0, 24.0, 0.01, 0.05, 0.01, 0.05, "d" + ) + X1 = bs.black_scholes_price(opts1) + t1 = timeit.default_timer() + + print("Elapsed: {}".format(t1 - t0)) + + # compute on GPU sycl device + dpctl.set_default_queue("level0", "gpu", 0) + print( + "Using : {}".format( + dpctl.get_current_queue().get_sycl_device().get_device_name() + ) + ) + + t0 = timeit.default_timer() + opts2 = gen_option_params( + n_opts, 20.0, 30.0, 22.0, 29.0, 18.0, 24.0, 0.01, 0.05, 0.01, 0.05, "d" + ) + X2 = bs.black_scholes_price(opts2) + t1 = timeit.default_timer() + print("Elapsed: {}".format(t1 - t0)) + +print(np.abs(opts1 - opts2).max()) +print(np.abs(X2 - X1).max()) diff --git a/examples/cython/usm_memory/setup.py b/examples/cython/usm_memory/setup.py new file mode 100644 index 0000000000..4f3fced830 --- /dev/null +++ b/examples/cython/usm_memory/setup.py @@ -0,0 +1,67 @@ +import sys +from os.path import join, exists, abspath, dirname +from os import getcwd +from os import environ +from Cython.Build import cythonize + + +def configuration(parent_package="", top_path=None): + from numpy.distutils.misc_util import Configuration + from numpy.distutils.system_info import get_info + import numpy as np + import dpctl + + config = Configuration("", parent_package, top_path) + + oneapi_root = environ.get("ONEAPI_ROOT", None) + if not oneapi_root: + raise ValueError("ONEAPI_ROOT must be set, typical value is /opt/intel/oneapi") + + mkl_info = { + "include_dirs": [join(oneapi_root, "mkl", "include")], + "library_dirs": [ + join(oneapi_root, "mkl", "lib"), + join(oneapi_root, "mkl", "lib", "intel64"), + ], + "libraries": [ + "mkl_sycl", + "mkl_intel_ilp64", + "mkl_tbb_thread", + "mkl_core", + "tbb", + "iomp5", + ], + } + + mkl_include_dirs = mkl_info.get("include_dirs") + mkl_library_dirs = mkl_info.get("library_dirs") + mkl_libraries = mkl_info.get("libraries") + + pdir = dirname(__file__) + wdir = join(pdir) + + eca = ["-Wall", "-Wextra", "-fsycl", "-fsycl-unnamed-lambda"] + + config.add_extension( + name="blackscholes_usm", + sources=[ + join(pdir, "blackscholes.pyx"), + join(wdir, "sycl_blackscholes.cpp"), + join(wdir, "sycl_blackscholes.hpp"), + ], + include_dirs=[wdir, np.get_include(), dpctl.get_include()] + mkl_include_dirs, + libraries=["sycl"] + mkl_libraries, + runtime_library_dirs=mkl_library_dirs, + extra_compile_args=eca, # + ['-O0', '-g', '-ggdb'], + extra_link_args=["-fPIC"], + language="c++", + ) + + config.ext_modules = cythonize(config.ext_modules, include_path=[pdir, wdir]) + return config + + +if __name__ == "__main__": + from numpy.distutils.core import setup + + setup(configuration=configuration) diff --git a/examples/cython/usm_memory/sycl_blackscholes.cpp b/examples/cython/usm_memory/sycl_blackscholes.cpp new file mode 100644 index 0000000000..550a01c622 --- /dev/null +++ b/examples/cython/usm_memory/sycl_blackscholes.cpp @@ -0,0 +1,170 @@ +#include +#include "dppl_sycl_types.h" +#include "sycl_blackscholes.hpp" +#include "mkl_rng_sycl_device.hpp" + +template +class black_scholes_kernel; + +constexpr int n_params = 5; +constexpr int n_params_next_pow2 = 8; + +constexpr int n_prices = 2; +constexpr int PRICE = 0; +constexpr int STRIKE = 1; +constexpr int MATURITY = 2; +constexpr int RATE = 3; +constexpr int VOLATILITY = 4; +constexpr int CALL = 0; +constexpr int PUT = 1; + +template +extern void cpp_blackscholes(DPPLSyclQueueRef q_ptr, size_t n_opts, T* params, T* callput) { + using data_t = T; + + sycl::queue q = *(reinterpret_cast(q_ptr)); + + auto ctx = q.get_context(); + { + sycl::usm::alloc params_type = sycl::get_pointer_type(params, ctx); + if (params_type != sycl::usm::alloc::shared) { + throw std::runtime_error("Input option_params to cpp_blackscholes is not a USM-shared pointer."); + } + } + { + sycl::usm::alloc callput_type = sycl::get_pointer_type(callput, ctx); + if (callput_type != sycl::usm::alloc::shared) { + throw std::runtime_error("Input callput to cpp_blackscholes is not a USM-shared pointer."); + } + } + + auto e = q.submit( + [&](sycl::handler &cgh){ + + data_t zero = data_t(0), one = data_t(1), two = data_t(2); + data_t quarter = one / data_t(4); + data_t half = one / two; + + cgh.parallel_for>( + sycl::range<1>(n_opts), + [=](sycl::id<1> idx) { + const size_t i = n_params * idx[0]; + const data_t opt_price = params[i + PRICE]; + const data_t opt_strike = params[i + STRIKE]; + const data_t opt_maturity = params[i + MATURITY]; + const data_t opt_rate = params[i + RATE]; + const data_t opt_volatility = params[i + VOLATILITY]; + data_t a, b, c, y, z, e, d1, d1c, d2, d2c, w1, w2; + data_t mr = -opt_rate, sig_sig_two = two * opt_volatility * opt_volatility; + + a = cl::sycl::log( opt_price / opt_strike ); + b = opt_maturity * mr; + z = opt_maturity * sig_sig_two; + + c = quarter * z; + e = cl::sycl::exp( b ); + y = cl::sycl::rsqrt( z ); + + a = b - a; + w1 = ( a - c ) * y; + w2 = ( a + c ) * y; + + if (w1 < zero) { + d1 = cl::sycl::erfc(w1) * half; + d1c = one - d1; + } else { + d1c = cl::sycl::erfc(-w1) * half; + d1 = one - d1c; + } + if (w2 < zero) { + d2 = cl::sycl::erfc(w2) * half; + d2c = one - d2; + } else { + d2c = cl::sycl::erfc(-w2) * half; + d2 = one - d2c; + } + + e *= opt_strike; + data_t call_price = opt_price * d1 - e * d2; + data_t put_price = e * d2c - opt_price * d1c; + + const size_t callput_i = n_prices * idx[0]; + callput[callput_i + CALL] = call_price; + callput[callput_i + PUT ] = put_price; + }); + }); + + e.wait_and_throw(); + + return; +} + +template +void cpp_populate_params(DPPLSyclQueueRef q_ptr, size_t n_opts, T* params, T pl, T ph, T sl, T sh, T tl, T th, T rl, T rh, T vl, T vh, int seed) { + sycl::queue q = *(reinterpret_cast(q_ptr)); + + auto ctx = q.get_context(); + { + sycl::usm::alloc params_type = sycl::get_pointer_type(params, ctx); + if (params_type != sycl::usm::alloc::shared) { + throw std::runtime_error("Input option_params to cpp_blackscholes is not a USM-shared pointer."); + } + } + + sycl::event e = q.submit( + [&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::range<1>(n_opts), + [=](sycl::item<1> idx) { + size_t i = n_params * idx.get_id(0); + size_t j = n_params_next_pow2 * idx.get_id(0); + + // create engine to sample 5 parameters per workers + oneapi::mkl::rng::device::philox4x32x10 engine(seed, j); + oneapi::mkl::rng::device::uniform distr; + + sycl::vec res = oneapi::mkl::rng::device::generate(distr, engine); + + { + const int pos = PRICE; + auto u = res[pos]; + params[i + pos] = pl * u + ph * (T(1)-u); + } + { + const int pos = STRIKE; + auto u = res[pos]; + params[i + pos] = sl * u + sh * (T(1)-u); + } + { + const int pos = MATURITY; + auto u = res[pos]; + params[i + pos] = tl * u + th * (T(1)-u); + } + { + const int pos = RATE; + auto u = res[pos]; + params[i + pos] = rl * u + rh * (T(1)-u); + } + { + const int pos = VOLATILITY; + auto u = res[pos]; + params[i + pos] = vl * u + vh * (T(1)-u); + } + }); + }); + + e.wait_and_throw(); +} + +// instantation for object files to not be empty + +template void cpp_blackscholes(DPPLSyclQueueRef q_ptr, size_t n_opts, double* params, double* callput); +template void cpp_blackscholes(DPPLSyclQueueRef q_ptr, size_t n_opts, float* params, float* callput); + + +template void cpp_populate_params(DPPLSyclQueueRef q_ptr, size_t n_opts, double* params, + double pl, double ph, double sl, double sh, double tl, double th, + double rl, double rh, double vl, double vh, int seed); +template void cpp_populate_params(DPPLSyclQueueRef q_ptr, size_t n_opts, float* params, + float pl, float ph, float sl, float sh, float tl, float th, + float rl, float rh, float vl, float vh, int seed); diff --git a/examples/cython/usm_memory/sycl_blackscholes.hpp b/examples/cython/usm_memory/sycl_blackscholes.hpp new file mode 100644 index 0000000000..36594810dc --- /dev/null +++ b/examples/cython/usm_memory/sycl_blackscholes.hpp @@ -0,0 +1,10 @@ +#include +#include "dppl_sycl_types.h" + +template +extern void cpp_blackscholes(DPPLSyclQueueRef q, size_t n_opts, T* params, T* callput); + +template +extern void cpp_populate_params(DPPLSyclQueueRef q, size_t n_opts, T* params, + T pl, T ph, T sl, T sh, T tl, T th, T rl, T rh, T vl, T vh, + int seed); diff --git a/examples/create_sycl_queues.py b/examples/python/create_sycl_queues.py similarity index 80% rename from examples/create_sycl_queues.py rename to examples/python/create_sycl_queues.py index 6fc6cdc9fa..bdf8368044 100644 --- a/examples/create_sycl_queues.py +++ b/examples/python/create_sycl_queues.py @@ -1,10 +1,11 @@ from __future__ import print_function -from dpctl import runtime, device_context, device_type +import dpctl +from dpctl import device_context, device_type # Global runtime object inside dpctl -rt = runtime +rt = dpctl # Print metadata about the runtime rt.dump() @@ -19,11 +20,11 @@ # the with device_context scope gets reset to what ever context was set # at entry of the scope. For this case, the context would go back to the # default context -with device_context(device_type.cpu, 0) as cpu_queue: +with device_context("opencl:cpu:0") as cpu_queue: print("========================================") print("Current context inside with scope") print("========================================") - rt.dump_queue(cpu_queue) + cpu_queue.get_sycl_device().dump_device_info() # Note the current context can be either directly accessed by using # the "cpu_queue" object, or it can be accessed via the runtime's @@ -31,10 +32,10 @@ print("========================================") print("Looking up current context using runtime") print("========================================") - rt.dump_queue(rt.get_current_queue()) + rt.get_current_queue().get_sycl_device().dump_device_info() print("========================================") print("Current context after exiting with scope") print("========================================") -rt.dump_queue(rt.get_current_queue()) +rt.get_current_queue().get_sycl_device().dump_device_info() diff --git a/examples/python/usm_memory_allocation.py b/examples/python/usm_memory_allocation.py new file mode 100644 index 0000000000..82b989f50b --- /dev/null +++ b/examples/python/usm_memory_allocation.py @@ -0,0 +1,22 @@ +import dpctl +import dpctl.memory as dpmem + +# allocate USM-shared byte-buffer +ms = dpmem.MemoryUSMShared(16) + +# allocate USM-device byte-buffer +md = dpmem.MemoryUSMDevice(16) + +# allocate USM-host byte-buffer +mh = dpmem.MemoryUSMHost(16) + +# specify alignment +mda = dpmem.MemoryUSMDevice(128, alignment=16) + +# allocate using given queue, +# i.e. on the device and bound to the context stored in the queue +mdq = dpmem.MemoryUSMDevice(256, queue=mda._queue) + +# information about device associate with USM buffer +print("Allocation performed on device:") +mda._queue.get_sycl_device().dump_device_info() diff --git a/examples/python/usm_memory_host_access.py b/examples/python/usm_memory_host_access.py new file mode 100644 index 0000000000..c38807be20 --- /dev/null +++ b/examples/python/usm_memory_host_access.py @@ -0,0 +1,38 @@ +import dpctl +import dpctl.memory as dpmem + +# USM-shared and USM-host pointers are host-accessible, +# meaning they are accessible from Python, therefore +# they implement Pyton buffer protocol + +# allocate 1K of USM-shared buffer +ms = dpmem.MemoryUSMShared(1024) + +# create memoryview into USM-shared buffer +msv = memoryview(ms) + +# populate buffer from host one byte at a type +for i in range(len(ms)): + ir = i % 256 + msv[i] = ir ** 2 % 256 + +mh = dpmem.MemoryUSMHost(64) +mhv = memoryview(mh) + +# copy content of block of USM-shared buffer to +# USM-host buffer +mhv[:] = msv[78 : 78 + len(mh)] + +print("Byte-values of the USM-host buffer") +print(list(mhv)) + +# USM-device buffer is not host accessible +md = dpmem.MemoryUSMDevice(16) +try: + mdv = memoryview(md) +except Exception as e: + print("") + print( + "An expected exception was raised during attempted construction of memoryview from USM-device memory object." + ) + print("\t", e) diff --git a/examples/python/usm_memory_operation.py b/examples/python/usm_memory_operation.py new file mode 100644 index 0000000000..efdb7861fd --- /dev/null +++ b/examples/python/usm_memory_operation.py @@ -0,0 +1,30 @@ +import dpctl +import dpctl.memory as dpmem +import numpy as np + +ms = dpmem.MemoryUSMShared(32) +md = dpmem.MemoryUSMDevice(32) + +host_buf = np.random.randint(0, 42, dtype=np.uint8, size=32) + +# copy host byte-like object to USM-device buffer +md.copy_from_host(host_buf) + +# copy USM-device buffer to USM-shared buffer in parallel (using sycl::queue::memcpy) +ms.copy_from_device(md) + +# build numpy array reusing host-accessible USM-shared memory +X = np.ndarray((len(ms),), buffer=ms, dtype=np.uint8) + +# Display Python object NumPy ndarray is viewing into +print("numpy.ndarray.base: ", X.base) +print("") + +# Print content of the view +print("View..........: ", X) + +# Print content of the original host buffer +print("host_buf......: ", host_buf) + +# use copy_to_host to retrieve memory of USM-device memory +print("copy_to_host(): ", md.copy_to_host())