diff --git a/examples/pybind11/external_usm_allocation/CMakeLists.txt b/examples/pybind11/external_usm_allocation/CMakeLists.txt new file mode 100644 index 0000000000..6f42a3e6d3 --- /dev/null +++ b/examples/pybind11/external_usm_allocation/CMakeLists.txt @@ -0,0 +1,36 @@ +cmake_minimum_required(VERSION 3.21) + +project(external_usm_allocation LANGUAGES CXX) + +set(DPCTL_CMAKE_MODULES_PATH "${CMAKE_SOURCE_DIR}/../../../cmake") +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${DPCTL_CMAKE_MODULES_PATH}) +find_package(IntelDPCPP REQUIRED PATHS ${DPCTL_CMAKE_MODULES_PATH} NO_DEFAULT_PATH) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED True) + +# Fetch pybind11 +include(FetchContent) +FetchContent_Declare( + pybind11 + URL https://github.com/pybind/pybind11/archive/refs/tags/v2.9.2.tar.gz + URL_HASH SHA256=6bd528c4dbe2276635dc787b6b1f2e5316cf6b49ee3e150264e455a0d68d19c1 +) +FetchContent_MakeAvailable(pybind11) + +find_package(PythonExtensions REQUIRED) +find_package(Dpctl REQUIRED) +find_package(NumPy REQUIRED) + +set(py_module_name _external_usm_alloc) +pybind11_add_module(${py_module_name} + MODULE + external_usm_allocation/_usm_alloc_example.cpp +) +target_include_directories(${py_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS}) +target_compile_options(${py_module_name} PRIVATE -Wno-deprecated-declarations) +install(TARGETS ${py_module_name} + DESTINATION external_usm_allocation +) + +set(ignoreMe "${SKBUILD}") diff --git a/examples/pybind11/external_usm_allocation/README.md b/examples/pybind11/external_usm_allocation/README.md index 7431ba4ab6..0b8dd62dc2 100644 --- a/examples/pybind11/external_usm_allocation/README.md +++ b/examples/pybind11/external_usm_allocation/README.md @@ -10,6 +10,7 @@ to dpctl.memory entities using `__sycl_usm_array_interface__`. ``` source /opt/intel/oneapi/compiler/latest/env/vars.sh CXX=dpcpp CC=dpcpp python setup.py build_ext --inplace +python -m pytest tests python example.py ``` @@ -17,7 +18,7 @@ python example.py ``` (idp) [12:43:20 ansatnuc04 external_usm_allocation]$ python example.py - + {'data': [94846745444352, True], 'shape': (5, 5), 'strides': None, 'version': 1, 'typestr': '|f8', 'syclobj': } shared diff --git a/examples/pybind11/external_usm_allocation/example.py b/examples/pybind11/external_usm_allocation/example.py index 4455616146..6001d87eca 100644 --- a/examples/pybind11/external_usm_allocation/example.py +++ b/examples/pybind11/external_usm_allocation/example.py @@ -16,7 +16,7 @@ # coding: utf-8 -import external_usm_alloc as eua +import external_usm_allocation as eua import numpy as np import dpctl diff --git a/examples/pybind11/external_usm_allocation/external_usm_allocation/__init__.py b/examples/pybind11/external_usm_allocation/external_usm_allocation/__init__.py new file mode 100644 index 0000000000..1af16147fa --- /dev/null +++ b/examples/pybind11/external_usm_allocation/external_usm_allocation/__init__.py @@ -0,0 +1,27 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# coding: utf-8 + +from ._external_usm_alloc import DMatrix + +__all__ = ["DMatrix"] + +__doc__ = """ + Example of implementing C++ class with its own USM memory allocation logic +and interfacing that allocation with `dpctl` by implementing +`__sycl_usm_array_interface__`. +""" diff --git a/examples/pybind11/external_usm_allocation/_usm_alloc_example.cpp b/examples/pybind11/external_usm_allocation/external_usm_allocation/_usm_alloc_example.cpp similarity index 99% rename from examples/pybind11/external_usm_allocation/_usm_alloc_example.cpp rename to examples/pybind11/external_usm_allocation/external_usm_allocation/_usm_alloc_example.cpp index 30216dac55..2b8212076b 100644 --- a/examples/pybind11/external_usm_allocation/_usm_alloc_example.cpp +++ b/examples/pybind11/external_usm_allocation/external_usm_allocation/_usm_alloc_example.cpp @@ -132,7 +132,7 @@ py::list tolist(DMatrix &m) return rows; } -PYBIND11_MODULE(external_usm_alloc, m) +PYBIND11_MODULE(_external_usm_alloc, m) { // Import the dpctl extensions import_dpctl(); diff --git a/examples/pybind11/external_usm_allocation/setup.py b/examples/pybind11/external_usm_allocation/setup.py index 2802075d41..86abd2854a 100644 --- a/examples/pybind11/external_usm_allocation/setup.py +++ b/examples/pybind11/external_usm_allocation/setup.py @@ -14,21 +14,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -from pybind11.setup_helpers import Pybind11Extension -from setuptools import setup +from skbuild import setup -import dpctl - -ext_modules = [ - Pybind11Extension( - "external_usm_alloc", - ["./_usm_alloc_example.cpp"], - include_dirs=[dpctl.get_include()], - extra_compile_args=["-fPIC"], - extra_link_args=["-fPIC"], - libraries=["sycl"], - language="c++", - ) -] - -setup(name="external_usm_alloc", ext_modules=ext_modules) +setup( + name="external_usm_allocation", + version="0.0.1", + description="an example of SYCL-powered Python package (with pybind11)", + author="Intel Scripting", + license="Apache 2.0", + packages=["external_usm_allocation"], +) diff --git a/examples/pybind11/external_usm_allocation/tests/test_dmatrix.py b/examples/pybind11/external_usm_allocation/tests/test_dmatrix.py new file mode 100644 index 0000000000..96bfb951e8 --- /dev/null +++ b/examples/pybind11/external_usm_allocation/tests/test_dmatrix.py @@ -0,0 +1,48 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# coding: utf-8 + +import external_usm_allocation as eua +import numpy as np + +import dpctl +import dpctl.memory as dpm + + +def test_dmatrix(): + q = dpctl.SyclQueue() + matr = eua.DMatrix(q, 5, 5) + assert hasattr(matr, "__sycl_usm_array_interface__") + + blob = dpm.as_usm_memory(matr) + assert blob.get_usm_type() == "shared" + + Xh = np.array( + [ + [1, 1, 1, 2, 2], + [1, 0, 1, 2, 2], + [1, 1, 0, 2, 2], + [0, 0, 0, 3, -1], + [0, 0, 0, -1, 5], + ], + dtype="d", + ) + host_bytes_view = Xh.reshape((-1)).view(np.ubyte) + blob.copy_from_host(host_bytes_view) + + list_of_lists = matr.tolist() + assert list_of_lists == Xh.tolist() diff --git a/examples/pybind11/onemkl_gemv/CMakeLists.txt b/examples/pybind11/onemkl_gemv/CMakeLists.txt index 3ba71c6252..848e8c7727 100644 --- a/examples/pybind11/onemkl_gemv/CMakeLists.txt +++ b/examples/pybind11/onemkl_gemv/CMakeLists.txt @@ -38,7 +38,7 @@ pybind11_add_module(${py_module_name} sycl_gemm/_onemkl.cpp ) target_include_directories(${py_module_name} - PUBLIC ${MKL_INCLUDE_DIR} ${TBB_INCLUDE_DIR} + PUBLIC ${MKL_INCLUDE_DIR} ${TBB_INCLUDE_DIR} sycl_gemm ) target_link_libraries(${py_module_name} PRIVATE ${mkl_sycl} ${mkl_intel_ilp64} ${mkl_tbb_thread} ${mkl_core} ${tbb} @@ -53,4 +53,18 @@ set_source_files_properties(${_sycl_gemm_sources} COMPILE_OPTIONS "-O3;-Wno-deprecated-declarations" ) +add_executable(standalone_cpp + EXCLUDE_FROM_ALL + cpp/main.cpp +) +target_compile_options(standalone_cpp + PRIVATE -O3 -Wno-deprecated-declarations +) +target_include_directories(standalone_cpp + PUBLIC ${MKL_INCLUDE_DIR} ${TBB_INCLUDE_DIR} sycl_gemm + ) +target_link_libraries(standalone_cpp + PRIVATE ${mkl_sycl} ${mkl_intel_ilp64} ${mkl_tbb_thread} ${mkl_core} ${tbb} +) + set(ignoreMe "${SKBUILD}") diff --git a/examples/pybind11/onemkl_gemv/README.md b/examples/pybind11/onemkl_gemv/README.md index 9645a4dd92..e984a5e013 100644 --- a/examples/pybind11/onemkl_gemv/README.md +++ b/examples/pybind11/onemkl_gemv/README.md @@ -1,9 +1,15 @@ Example of SYCL built pybind11 extension -To build, use (assumes scikit-build and dpcpp) is installed +To build, use (assumes scikit-build and dpcpp is installed): ```sh -python setup.py develop -- -G "Ninja" -DCMAKE_C_COMPILER:PATH=icx -DCMAKE_CXX_COMPILER:PATH=icpx -DTBB_LIBRARY_DIR=$CONDA_PREFIX/lib -DMKL_LIBRARY_DIR=${CONDA_PREFIX}/lib -DMKL_INCLUDE_DIR=${CONDA_PREFIX}/include -DTBB_INCLUDE_DIR=${CONDA_PREFIX}/include +python setup.py develop -- -G "Ninja" \ + -DCMAKE_C_COMPILER:PATH=icx \ + -DCMAKE_CXX_COMPILER:PATH=icpx \ + -DTBB_LIBRARY_DIR=$CONDA_PREFIX/lib \ + -DMKL_LIBRARY_DIR=${CONDA_PREFIX}/lib \ + -DMKL_INCLUDE_DIR=${CONDA_PREFIX}/include \ + -DTBB_INCLUDE_DIR=${CONDA_PREFIX}/include ``` To run test suite @@ -11,3 +17,16 @@ To run test suite ```sh python -m pytest tests ``` + +To compare Python overhead, + +``` +# build standad-alone executable +cmake --build $(find . -name cmake-build) --target standalone_cpp +# execute it +$(find . -name cmake-build)/standalone_cpp 1000 11 +# launch Python computatin +python sycl_timing_solver.py 1000 11 +``` + +Compare host times vs. C++ wall-clock times while making sure that the number of iterations is the same diff --git a/examples/pybind11/onemkl_gemv/cpp/main.cpp b/examples/pybind11/onemkl_gemv/cpp/main.cpp new file mode 100644 index 0000000000..c0f9a6115b --- /dev/null +++ b/examples/pybind11/onemkl_gemv/cpp/main.cpp @@ -0,0 +1,128 @@ +#include "cg_solver.hpp" +#include +#include +#include +#include + +using T = double; + +int main(int argc, char *argv[]) +{ + size_t n = 1000; + size_t rank = 16; + + if (argc > 1) { + n = std::stoi(argv[1]); + } + + if (argc > 2) { + rank = std::stoi(argv[2]); + } + + std::cout << "Solving " << n << " by " << n << " diagonal system with rank-" + << rank << " perturbation." << std::endl; + + sycl::queue q; + + // USM allocation for data needed by program + size_t buf_size = n * n + rank * n + 2 * n; + T *buf = sycl::malloc_device(buf_size, q); + sycl::event memset_ev = q.fill(buf, T(0), buf_size); + + T *Amat = buf; + T *umat = buf + n * n; + T *bvec = umat + rank * n; + T *sol_vec = bvec + n; + + sycl::event set_diag_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on({memset_ev}); + cgh.parallel_for({n}, [=](sycl::id<1> id) { + auto i = id[0]; + Amat[i * (n + 1)] = T(1); + }); + }); + + oneapi::mkl::rng::philox4x32x10 engine(q, 7777); + oneapi::mkl::rng::gaussian + distr(0.0, 1.0); + + // populate umat and bvec in one call + sycl::event umat_rand_ev = + oneapi::mkl::rng::generate(distr, engine, n * rank + n, umat); + + sycl::event syrk_ev = oneapi::mkl::blas::row_major::syrk( + q, oneapi::mkl::uplo::U, oneapi::mkl::transpose::N, n, rank, T(1), umat, + rank, T(1), Amat, n, {umat_rand_ev, set_diag_ev}); + + // need to transpose + sycl::event transpose_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(syrk_ev); + cgh.parallel_for({n * n}, [=](sycl::id<1> id) { + size_t i = id[0]; + size_t i0 = i / n; + size_t i1 = i - i0 * n; + if (i0 > i1) { + Amat[i] = Amat[i1 * n + i0]; + } + }); + }); + + q.wait(); + + constexpr int reps = 6; + + std::vector time; + std::vector conv_iters; + + time.reserve(reps); + conv_iters.reserve(reps); + for (int i = 0; i < reps; ++i) { + auto start = std::chrono::high_resolution_clock::now(); + int conv_iter_count = cg_solver::cg_solve(q, n, Amat, bvec, sol_vec); + auto end = std::chrono::high_resolution_clock::now(); + + time.push_back( + std::chrono::duration_cast(end - start) + .count() * + 1e-06); + + conv_iters.push_back(conv_iter_count); + } + + std::cout << "Converged in : "; + for (auto &el : conv_iters) { + std::cout << el << " , "; + } + std::cout << std::endl; + + std::cout << "Wall-clock cg_solve execution times: "; + for (auto &el : time) { + std::cout << el << " , "; + } + std::cout << std::endl; + + T *Ax = sycl::malloc_device(2 * n + 1, q); + T *delta = Ax + n; + + sycl::event gemv_ev = oneapi::mkl::blas::row_major::gemv( + q, oneapi::mkl::transpose::N, n, n, T(1), Amat, n, sol_vec, 1, T(0), Ax, + 1); + + sycl::event sub_ev = oneapi::mkl::vm::sub(q, n, Ax, bvec, delta, {gemv_ev}, + oneapi::mkl::vm::mode::ha); + + T *n2 = delta + n; + sycl::event dot_ev = oneapi::mkl::blas::row_major::dot( + q, n, delta, 1, delta, 1, n2, {sub_ev}); + + T n2_host{}; + q.copy(n2, &n2_host, 1, {dot_ev}).wait_and_throw(); + + std::cout << "Redisual norm squared: " << n2_host << std::endl; + + q.wait_and_throw(); + sycl::free(Ax, q); + sycl::free(buf, q); + + return 0; +} diff --git a/examples/pybind11/onemkl_gemv/setup.py b/examples/pybind11/onemkl_gemv/setup.py index dd00d8e1a9..1ed65e3af8 100644 --- a/examples/pybind11/onemkl_gemv/setup.py +++ b/examples/pybind11/onemkl_gemv/setup.py @@ -1,3 +1,19 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from skbuild import setup setup( diff --git a/examples/pybind11/onemkl_gemv/solve.py b/examples/pybind11/onemkl_gemv/solve.py new file mode 100644 index 0000000000..6304808078 --- /dev/null +++ b/examples/pybind11/onemkl_gemv/solve.py @@ -0,0 +1,258 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import sycl_gemm + +import dpctl +import dpctl.tensor as dpt + + +def empty_like(A): + return dpt.empty(A.shape, A.dtype, device=A.device) + + +def chebyshev(A, b, x0, nIters, lMax, lMin, depends=[]): + """Chebyshev iterative solver using SYCL routines""" + d = (lMax + lMin) / 2 + c = (lMax - lMin) / 2 + + x = dpt.copy(x0) + exec_queue = A.sycl_queue + assert exec_queue == x.sycl_queue + Ax = empty_like(A[:, 0]) + r = empty_like(Ax) + p = empty_like(Ax) + + e_x = dpctl.SyclEvent() + # Ax = A @ x + _, e_dot = sycl_gemm.gemv(exec_queue, A, x, Ax, depends=depends) + # r = b - Ax + _, e_sub = sycl_gemm.sub(exec_queue, b, Ax, r, depends=[e_dot]) + r_ev = e_sub + for i in range(nIters): + z = r + z_ev = r_ev + if i == 0: + p[:] = z + alpha = 1 / d + _, e_axpby = dpctl.SyclEvent(), dpctl.SyclEvent() + elif i == 1: + beta = 0.5 * (c * alpha) ** 2 + alpha = 1 / (d - beta / alpha) + # p = z + beta * p + _, e_axpby = sycl_gemm.axpby_inplace( + exec_queue, 1, z, beta, p, depends=[z_ev] + ) + else: + beta = (c / 2 * alpha) ** 2 + alpha = 1 / (d - beta / alpha) + # p = z + beta * p + _, e_axpby = sycl_gemm.axpby_inplace( + exec_queue, 1, z, beta, p, depends=[z_ev] + ) + # x = x + alpha * p + _, e_x = sycl_gemm.axpby_inplace( + exec_queue, alpha, p, 1, x, depends=[e_axpby, e_x] + ) + # Ax = A @ x + _, e_dot = sycl_gemm.gemv(exec_queue, A, x, Ax, depends=[e_x]) + # r = b - Ax + _, e_sub = sycl_gemm.sub(exec_queue, b, Ax, r, depends=[e_dot]) + # residual = dot(r, r) + residual = sycl_gemm.norm_squared_blocking( + exec_queue, r, depends=[e_sub] + ) + if residual <= 1e-29: + print(f"chebyshev: converged in {i} iters") + break + exec_queue.wait() # wait for all host tasks to complete + return x + + +def check_with_numpy(A, b): + """Direct solver using numpy""" + import numpy as np + + return np.linalg.solve(Anp, bnp) + + +def chebyshev_numpy(A, b, x0, nIters, lMax, lMin): + """Chebyshev iterative solver using numpy""" + d = (lMax + lMin) / 2 + c = (lMax - lMin) / 2 + + x = x0 + + Ax = np.dot(A, x) + r = b - Ax + for i in range(nIters): + z = r + if i == 0: + p = z + alpha = 1 / d + elif i == 1: + beta = 0.5 * (c * alpha) ** 2 + alpha = 1 / (d - beta / alpha) + p = z + beta * p + else: + beta = (c / 2 * alpha) ** 2 + alpha = 1 / (d - beta / alpha) + p = z + beta * p + x = x + alpha * p + Ax = np.dot(A, x) + r = b - Ax + residual = np.dot(r, r) + if residual <= 1e-29: + print(f"chebyshev_numpy: converged in {i} iters") + break + return x + + +def cg_solve(A, b): + """ + Conjugate gradient solver for A @ x == b. + + Returns tuple: (x, converged) + + converged is False if solver has not converged, or the iteration number + """ + exec_queue = A.sycl_queue + x = dpt.zeros(b.shape, dtype=b.dtype) + Ap = empty_like(x) + + all_host_tasks = [] + r = dpt.copy(b) + p = dpt.copy(b) + rsold = sycl_gemm.norm_squared_blocking(exec_queue, r) + if rsold < 1e-20: + return (b, 0) + converged = False + max_iters = b.shape[0] + + e_p = dpctl.SyclEvent() + e_x = dpctl.SyclEvent() + for i in range(max_iters): + # Ap = A @ p + he_dot, e_dot = sycl_gemm.gemv(exec_queue, A, p, Ap, depends=[e_p]) + all_host_tasks.append(he_dot) + # alpha = rsold / dot(p, Ap) + alpha = rsold / sycl_gemm.dot_blocking( + exec_queue, p, Ap, depends=[e_dot] + ) + # x = x + alpha * p + he1_x_update, e1_x_update = sycl_gemm.axpby_inplace( + exec_queue, alpha, p, 1, x, depends=[e_p, e_x] + ) + all_host_tasks.append(he1_x_update) + e_x = e1_x_update + + # r = r - alpha * Ap + he2_r_update, e2_r_update = sycl_gemm.axpby_inplace( + exec_queue, -alpha, Ap, 1, r, depends=[e_p] + ) + all_host_tasks.append(he2_r_update) + + # rsnew = dot(r, r) + rsnew = sycl_gemm.norm_squared_blocking( + exec_queue, r, depends=[e2_r_update] + ) + if rsnew < 1e-20: + e1_x_update.wait() + converged = i + break + beta = rsnew / rsold + + # p = r + beta * p + he3_p_update, e3_p_update = sycl_gemm.axpby_inplace( + exec_queue, 1, r, beta, p, depends=[e2_r_update] + ) + + rsold = rsnew + all_host_tasks.append(he3_p_update) + e_p = e3_p_update + e_x = e1_x_update + + dpctl.SyclEvent.wait_for(all_host_tasks) + return x, converged + + +def cg_solve_numpy(A, b): + x = np.zeros_like(b) + r = b - np.dot(A, x) + p = r + rsold = np.dot(r, r) + converged = False + max_iters = b.shape[0] + + for i in range(max_iters): + Ap = np.dot(A, p) + alpha = rsold / np.dot(p, Ap) + x = x + alpha * p + r = r - alpha * Ap + rsnew = np.dot(r, r) + + if rsnew < 1e-20: + converged = i + break + + beta = rsnew / rsold + p = r + beta * p + rsold = rsnew + + return (x, converged) + + +if __name__ == "__main__": + n = 32 + Anp = ( + 2 * np.eye(n, n, k=0, dtype="d") + + np.eye(n, n, k=1, dtype="d") + + np.eye(n, n, k=-1, dtype="d") + ) + bnp = np.geomspace(0.5, 2, n, dtype="d") + # bounds on eigenvalues of cartan matrix are, needed only + # for the Chebyshev solver + lambda_max = 4 + lambda_min = 4 * np.square(np.sin(np.pi / (2 * (n + 2)))) + + q = dpctl.SyclQueue(property="enable_profiling") + q.print_device_info() + A = dpt.asarray(Anp, dtype="d", usm_type="device", sycl_queue=q) + dev = A.device + b = dpt.asarray(bnp, dtype="d", usm_type="device", device=dev) + x0 = b + t = dpctl.SyclTimer() + with t(dev.sycl_queue): + x, conv = cg_solve(A, b) + print("SYCL solver, 1st run: ", (conv, t.dt)) + with t(dev.sycl_queue): + x, conv = cg_solve(A, b) + print("SYCL solver, 2nd run: ", (conv, t.dt)) + + x_ref = check_with_numpy(Anp, bnp) # solve usign LU solver + + with t(dev.sycl_queue): + x_np, conv = cg_solve_numpy(dpt.asnumpy(A), dpt.asnumpy(b)) + print("NumPy's powered CG solver: ", (conv, t.dt)) + print( + "SYCL cg-solver solution close to reference: ", + np.allclose(dpt.asnumpy(x), x_ref), + ) + print( + "NumPy's cg-solver solution close to reference: ", + np.allclose(x_np, x_ref), + ) diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py b/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py index 07d7ec6ced..defce5b76f 100644 --- a/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py +++ b/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py @@ -1,3 +1,33 @@ -from ._onemkl import gemv +# Data Parallel Control (dpctl) +# +# Copyright 2020-2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. -__all__ = ["gemv"] +from ._onemkl import ( + axpby_inplace, + cpp_cg_solve, + dot_blocking, + gemv, + norm_squared_blocking, + sub, +) + +__all__ = [ + "gemv", + "sub", + "axpby_inplace", + "norm_squared_blocking", + "dot_blocking", + "cpp_cg_solve", +] diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp index 569428004f..0531b8880f 100644 --- a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp +++ b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp @@ -1,8 +1,38 @@ +//==- _onemkl.cpp - Example of Pybind11 extension working with -===// +// dpctl Python objects. +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2021 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements Pybind11-generated extension exposing functions that +/// take dpctl Python objects, such as dpctl.SyclQueue, dpctl.SyclDevice, and +/// dpctl.tensor.usm_ndarray as arguments. +/// +//===----------------------------------------------------------------------===// + // clang-format off #include #include +#include "cg_solver.hpp" #include #include +#include #include "dpctl4pybind11.hpp" // clang-format on @@ -11,11 +41,11 @@ namespace py = pybind11; using dpctl::utils::keep_args_alive; std::pair -gemv(sycl::queue q, - dpctl::tensor::usm_ndarray matrix, - dpctl::tensor::usm_ndarray vector, - dpctl::tensor::usm_ndarray result, - const std::vector &depends = {}) +py_gemv(sycl::queue q, + dpctl::tensor::usm_ndarray matrix, + dpctl::tensor::usm_ndarray vector, + dpctl::tensor::usm_ndarray result, + const std::vector &depends = {}) { if (matrix.get_ndim() != 2 || vector.get_ndim() != 1 || result.get_ndim() != 1) { @@ -32,6 +62,15 @@ gemv(sycl::queue q, throw std::runtime_error("Inconsistent shapes."); } + auto q_ctx = q.get_context(); + if (q_ctx != matrix.get_queue().get_context() || + q_ctx != vector.get_queue().get_context() || + q_ctx != result.get_queue().get_context()) + { + throw std::runtime_error( + "USM allocation is not bound to the context in execution queue."); + } + int mat_flags = matrix.get_flags(); int v_flags = vector.get_flags(); int r_flags = result.get_flags(); @@ -110,9 +149,489 @@ gemv(sycl::queue q, return std::make_pair(ht_event, res_ev); } +template +sycl::event sub_impl(sycl::queue q, + size_t n, + const char *v1_i, + const char *v2_i, + char *r_i, + const std::vector &depends = {}) +{ + const T *v1 = reinterpret_cast(v1_i); + const T *v2 = reinterpret_cast(v2_i); + T *r = reinterpret_cast(r_i); + + sycl::event r_ev = cg_solver::detail::sub(q, n, v1, v2, r, depends); + + return r_ev; +} + +// out_r = in_v1 - in_v2 +std::pair +py_sub(sycl::queue q, + dpctl::tensor::usm_ndarray in_v1, + dpctl::tensor::usm_ndarray in_v2, + dpctl::tensor::usm_ndarray out_r, + const std::vector &depends = {}) +{ + if (in_v1.get_ndim() != 1 || in_v2.get_ndim() != 1 || out_r.get_ndim() != 1) + { + throw std::runtime_error("Inconsistent dimensions, expecting vectors"); + } + + py::ssize_t n = in_v1.get_shape(0); // get length of the vector + + if (n != in_v2.get_shape(0) || n != out_r.get_shape(0)) { + throw std::runtime_error("Vectors must have the same length"); + } + + if (q.get_context() != in_v1.get_queue().get_context() || + q.get_context() != in_v2.get_queue().get_context() || + q.get_context() != out_r.get_queue().get_context()) + { + throw std::runtime_error( + "USM allocation is not bound to the context in execution queue"); + } + + int in_v1_flags = in_v1.get_flags(); + int in_v2_flags = in_v2.get_flags(); + int out_r_flags = out_r.get_flags(); + + if (!((in_v1_flags & (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS)) && + (in_v2_flags & (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS)) && + (out_r_flags & (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS)))) + { + throw std::runtime_error("Vectors must be contiguous."); + } + + int in_v1_typenum = in_v1.get_typenum(); + int in_v2_typenum = in_v2.get_typenum(); + int out_r_typenum = out_r.get_typenum(); + + if ((in_v2_typenum != in_v1_typenum) || (out_r_typenum != in_v1_typenum) || + !((in_v1_typenum == UAR_DOUBLE) || (in_v1_typenum == UAR_FLOAT) || + (in_v1_typenum == UAR_CDOUBLE) || (in_v1_typenum == UAR_CFLOAT))) + { + throw std::runtime_error( + "Only real and complex floating point arrays are supported."); + } + + const char *in_v1_typeless_ptr = in_v1.get_data(); + const char *in_v2_typeless_ptr = in_v2.get_data(); + char *out_r_typeless_ptr = out_r.get_data(); + + sycl::event res_ev; + if (out_r_typenum == UAR_DOUBLE) { + using T = double; + res_ev = sub_impl(q, n, in_v1_typeless_ptr, in_v2_typeless_ptr, + out_r_typeless_ptr, depends); + } + else if (out_r_typenum == UAR_FLOAT) { + using T = float; + res_ev = sub_impl(q, n, in_v1_typeless_ptr, in_v2_typeless_ptr, + out_r_typeless_ptr, depends); + } + else if (out_r_typenum == UAR_CDOUBLE) { + using T = std::complex; + res_ev = sub_impl(q, n, in_v1_typeless_ptr, in_v2_typeless_ptr, + out_r_typeless_ptr, depends); + } + else if (out_r_typenum == UAR_CFLOAT) { + using T = std::complex; + res_ev = sub_impl(q, n, in_v1_typeless_ptr, in_v2_typeless_ptr, + out_r_typeless_ptr, depends); + } + else { + throw std::runtime_error("Type dispatch ran into trouble."); + } + + sycl::event ht_event = keep_args_alive(q, {in_v1, in_v2, out_r}, {res_ev}); + + return std::make_pair(ht_event, res_ev); +} + +template class axpy_inplace_kern; +template class axpby_inplace_kern; + +template +sycl::event axpby_inplace_impl(sycl::queue q, + size_t nelems, + py::object pyobj_a, + const char *x_typeless, + py::object pyobj_b, + char *y_typeless, + const std::vector depends = {}) +{ + T a = py::cast(pyobj_a); + T b = py::cast(pyobj_b); + + const T *x = reinterpret_cast(x_typeless); + T *y = reinterpret_cast(y_typeless); + + sycl::event res_ev = + cg_solver::detail::axpby_inplace(q, nelems, a, x, b, y, depends); + + return res_ev; +} + +// y = a * x + b * y +std::pair +py_axpby_inplace(sycl::queue q, + py::object a, + dpctl::tensor::usm_ndarray x, + py::object b, + dpctl::tensor::usm_ndarray y, + const std::vector &depends = {}) +{ + + if (x.get_ndim() != 1 || y.get_ndim() != 1) { + throw std::runtime_error("Inconsistent dimensions, expecting vectors"); + } + + py::ssize_t n = x.get_shape(0); // get length of the vector + + if (n != y.get_shape(0)) { + throw std::runtime_error("Vectors must have the same length"); + } + + if (q.get_context() != x.get_queue().get_context() || + q.get_context() != y.get_queue().get_context()) + { + throw std::runtime_error( + "USM allocation is not bound to the context in execution queue"); + } + + int x_flags = x.get_flags(); + int y_flags = y.get_flags(); + + if (!((x_flags & (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS)) && + (y_flags & (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS)))) + { + throw std::runtime_error("Vectors must be contiguous."); + } + + int x_typenum = x.get_typenum(); + int y_typenum = y.get_typenum(); + + if ((x_typenum != y_typenum) || + !((x_typenum == UAR_DOUBLE) || (x_typenum == UAR_FLOAT) || + (x_typenum == UAR_CDOUBLE) || (x_typenum == UAR_CFLOAT))) + { + throw std::runtime_error( + "Only real and complex floating point arrays are supported."); + } + + const char *x_typeless_ptr = x.get_data(); + char *y_typeless_ptr = y.get_data(); + + sycl::event res_ev; + if (x_typenum == UAR_DOUBLE) { + using T = double; + res_ev = axpby_inplace_impl(q, n, a, x_typeless_ptr, b, + y_typeless_ptr, depends); + } + else if (x_typenum == UAR_FLOAT) { + using T = float; + res_ev = axpby_inplace_impl(q, n, a, x_typeless_ptr, b, + y_typeless_ptr, depends); + } + else if (x_typenum == UAR_CDOUBLE) { + using T = std::complex; + res_ev = axpby_inplace_impl(q, n, a, x_typeless_ptr, b, + y_typeless_ptr, depends); + } + else if (x_typenum == UAR_CFLOAT) { + using T = std::complex; + res_ev = axpby_inplace_impl(q, n, a, x_typeless_ptr, b, + y_typeless_ptr, depends); + } + else { + throw std::runtime_error("Type dispatch ran into trouble."); + } + + sycl::event ht_event = keep_args_alive(q, {x, y}, {res_ev}); + + return std::make_pair(ht_event, res_ev); +} + +template +T norm_squared_blocking_impl(sycl::queue q, + size_t nelems, + const char *r_typeless, + const std::vector &depends = {}) +{ + const T *r = reinterpret_cast(r_typeless); + + return cg_solver::detail::norm_squared_blocking(q, nelems, r, depends); +} + +template class complex_norm_squared_blocking_kern; + +template +T complex_norm_squared_blocking_impl( + sycl::queue q, + size_t nelems, + const char *r_typeless, + const std::vector &depends = {}) +{ + const std::complex *r = + reinterpret_cast *>(r_typeless); + + return cg_solver::detail::complex_norm_squared_blocking(q, nelems, r, + depends); +} + +py::object py_norm_squared_blocking(sycl::queue q, + dpctl::tensor::usm_ndarray r, + const std::vector depends = {}) +{ + if (r.get_ndim() != 1) { + throw std::runtime_error("Expecting a vector"); + } + + py::ssize_t n = r.get_shape(0); // get length of the vector + + int r_flags = r.get_flags(); + + if (!(r_flags & (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS))) { + throw std::runtime_error("Vector must be contiguous."); + } + + if (q.get_context() != r.get_queue().get_context()) { + throw std::runtime_error( + "USM allocation is not bound to the context in execution queue"); + } + + int r_typenum = r.get_typenum(); + if ((r_typenum != UAR_DOUBLE) && (r_typenum != UAR_FLOAT) && + (r_typenum != UAR_CDOUBLE) && (r_typenum != UAR_CFLOAT)) + { + throw std::runtime_error( + "Only real and complex floating point arrays are supported."); + } + + const char *r_typeless_ptr = r.get_data(); + py::object res; + + if (r_typenum == UAR_DOUBLE) { + using T = double; + T n_sq = norm_squared_blocking_impl(q, n, r_typeless_ptr, depends); + res = py::float_(n_sq); + } + else if (r_typenum == UAR_FLOAT) { + using T = float; + T n_sq = norm_squared_blocking_impl(q, n, r_typeless_ptr, depends); + res = py::float_(n_sq); + } + else if (r_typenum == UAR_CDOUBLE) { + using T = std::complex; + double n_sq = complex_norm_squared_blocking_impl( + q, n, r_typeless_ptr, depends); + res = py::float_(n_sq); + } + else if (r_typenum == UAR_CFLOAT) { + using T = std::complex; + float n_sq = complex_norm_squared_blocking_impl( + q, n, r_typeless_ptr, depends); + res = py::float_(n_sq); + } + else { + throw std::runtime_error("Type dispatch ran into trouble."); + } + + return res; +} + +py::object py_dot_blocking(sycl::queue q, + dpctl::tensor::usm_ndarray v1, + dpctl::tensor::usm_ndarray v2, + const std::vector &depends = {}) +{ + if (v1.get_ndim() != 1 || v2.get_ndim() != 1) { + throw std::runtime_error("Expecting two vectors"); + } + + py::ssize_t n = v1.get_shape(0); // get length of the vector + + if (n != v2.get_shape(0)) { + throw std::runtime_error("Length of vectors are not the same"); + } + + int v1_flags = v1.get_flags(); + int v2_flags = v2.get_flags(); + + if (!(v1_flags & (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS)) || + !(v2_flags & (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS))) + { + throw std::runtime_error("Vectors must be contiguous."); + } + + if (q.get_context() != v1.get_queue().get_context() || + q.get_context() != v2.get_queue().get_context()) + { + throw std::runtime_error( + "USM allocation is not bound to the context in execution queue"); + } + + int v1_typenum = v1.get_typenum(); + int v2_typenum = v2.get_typenum(); + + if ((v1_typenum != v2_typenum) || + ((v1_typenum != UAR_DOUBLE) && (v1_typenum != UAR_FLOAT) && + (v1_typenum != UAR_CDOUBLE) && (v1_typenum != UAR_CFLOAT))) + { + throw py::value_error( + "Data types of vectors must be the same. " + "Only real and complex floating types are supported."); + } + + const char *v1_typeless_ptr = v1.get_data(); + const char *v2_typeless_ptr = v2.get_data(); + py::object res; + + if (v1_typenum == UAR_DOUBLE) { + using T = double; + T *res_usm = sycl::malloc_device(1, q); + sycl::event dot_ev = oneapi::mkl::blas::row_major::dot( + q, n, reinterpret_cast(v1_typeless_ptr), 1, + reinterpret_cast(v2_typeless_ptr), 1, res_usm, depends); + T res_v{}; + q.copy(res_usm, &res_v, 1, {dot_ev}).wait_and_throw(); + res = py::float_(res_v); + } + else if (v1_typenum == UAR_FLOAT) { + using T = float; + T *res_usm = sycl::malloc_device(1, q); + sycl::event dot_ev = oneapi::mkl::blas::row_major::dot( + q, n, reinterpret_cast(v1_typeless_ptr), 1, + reinterpret_cast(v2_typeless_ptr), 1, res_usm, depends); + T res_v(0); + q.copy(res_usm, &res_v, 1, {dot_ev}).wait_and_throw(); + res = py::float_(res_v); + } + else if (v1_typenum == UAR_CDOUBLE) { + using T = std::complex; + T *res_usm = sycl::malloc_device(1, q); + sycl::event dotc_ev = oneapi::mkl::blas::row_major::dotc( + q, n, reinterpret_cast(v1_typeless_ptr), 1, + reinterpret_cast(v2_typeless_ptr), 1, res_usm, depends); + T res_v{}; + q.copy(res_usm, &res_v, 1, {dotc_ev}).wait_and_throw(); + res = py::cast(res_v); + } + else if (v1_typenum == UAR_CFLOAT) { + using T = std::complex; + T *res_usm = sycl::malloc_device(1, q); + sycl::event dotc_ev = oneapi::mkl::blas::row_major::dotc( + q, n, reinterpret_cast(v1_typeless_ptr), 1, + reinterpret_cast(v2_typeless_ptr), 1, res_usm, depends); + T res_v{}; + q.copy(res_usm, &res_v, 1, {dotc_ev}).wait_and_throw(); + res = py::cast(res_v); + } + else { + throw std::runtime_error("Type dispatch ran into trouble."); + } + + return res; +} + +int py_cg_solve(sycl::queue exec_q, + dpctl::tensor::usm_ndarray Amat, + dpctl::tensor::usm_ndarray bvec, + dpctl::tensor::usm_ndarray xvec, + double rs_tol, + const std::vector &depends = {}) +{ + if (Amat.get_ndim() != 2 || bvec.get_ndim() != 1 || xvec.get_ndim() != 1) { + throw py::value_error("Expecting a matrix and two vectors"); + } + + py::ssize_t n0 = Amat.get_shape(0); + py::ssize_t n1 = Amat.get_shape(1); + + if (n0 != n1) { + throw py::value_error("Matrix must be square."); + } + + if (n0 != bvec.get_shape(0) || n0 != xvec.get_shape(0)) { + throw py::value_error( + "Dimensions of the matrix and vectors are not consistent."); + } + + bool all_contig = (Amat.get_flags() & USM_ARRAY_C_CONTIGUOUS) && + (bvec.get_flags() & USM_ARRAY_C_CONTIGUOUS) && + (xvec.get_flags() & USM_ARRAY_C_CONTIGUOUS); + if (!all_contig) { + throw py::value_error("All inputs must be C-contiguous"); + } + + int A_typenum = Amat.get_typenum(); + int b_typenum = bvec.get_typenum(); + int x_typenum = xvec.get_typenum(); + + if (A_typenum != b_typenum || A_typenum != x_typenum) { + throw py::value_error("All arrays must have the same type"); + } + + if (exec_q.get_context() != Amat.get_queue().get_context() || + exec_q.get_context() != bvec.get_queue().get_context() || + exec_q.get_context() != xvec.get_queue().get_context()) + { + throw std::runtime_error( + "USM allocations are not bound to context in execution queue"); + } + + const char *A_ch = Amat.get_data(); + const char *b_ch = bvec.get_data(); + char *x_ch = xvec.get_data(); + + if (A_typenum == UAR_DOUBLE) { + using T = double; + int iters = cg_solver::cg_solve( + exec_q, n0, reinterpret_cast(A_ch), + reinterpret_cast(b_ch), reinterpret_cast(x_ch), + depends, static_cast(rs_tol)); + + return iters; + } + else if (A_typenum == UAR_FLOAT) { + using T = float; + int iters = cg_solver::cg_solve( + exec_q, n0, reinterpret_cast(A_ch), + reinterpret_cast(b_ch), reinterpret_cast(x_ch), + depends, static_cast(rs_tol)); + + return iters; + } + else { + throw std::runtime_error( + "Unsupported data type. Use single or double precision."); + } +} + PYBIND11_MODULE(_onemkl, m) { // Import the dpctl extensions import_dpctl(); - m.def("gemv", &gemv, "Uses oneMKL to compute dot(matrix, vector)"); + + m.def("gemv", &py_gemv, "Uses oneMKL to compute dot(matrix, vector)", + py::arg("exec_queue"), py::arg("Amatrix"), py::arg("xvec"), + py::arg("resvec"), py::arg("depends") = py::list()); + m.def("sub", &py_sub, "Subtraction: out = v1 - v2", py::arg("exec_queue"), + py::arg("in1"), py::arg("in2"), py::arg("out"), + py::arg("depends") = py::list()); + m.def("axpby_inplace", &py_axpby_inplace, "y = a * x + b * y", + py::arg("exec_queue"), py::arg("a"), py::arg("x"), py::arg("b"), + py::arg("y"), py::arg("depends") = py::list()); + m.def("norm_squared_blocking", &py_norm_squared_blocking, "norm(r)**2", + py::arg("exec_queue"), py::arg("r"), py::arg("depends") = py::list()); + m.def("dot_blocking", &py_dot_blocking, "", py::arg("exec_queue"), + py::arg("v1"), py::arg("v2"), py::arg("depends") = py::list()); + + m.def("cpp_cg_solve", &py_cg_solve, + "Dispatch to call C++ implementation of cg_solve", + py::arg("exec_queue"), py::arg("Amat"), py::arg("bvec"), + py::arg("xvec"), py::arg("rs_squared_tolerance") = py::float_(1e-20), + py::arg("depends") = py::list()); } diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/cg_solver.hpp b/examples/pybind11/onemkl_gemv/sycl_gemm/cg_solver.hpp new file mode 100644 index 0000000000..6e0396c7a1 --- /dev/null +++ b/examples/pybind11/onemkl_gemv/sycl_gemm/cg_solver.hpp @@ -0,0 +1,226 @@ +//==- _cg_solver.cpp - C++ impl of conjugate gradient linear solver -==// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2021 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements Pybind11-generated extension exposing functions that +/// take dpctl Python objects, such as dpctl.SyclQueue, dpctl.SyclDevice, and +/// dpctl.tensor.usm_ndarray as arguments. +/// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include + +namespace cg_solver +{ +namespace detail +{ + +template class sub_kern; + +template +sycl::event sub(sycl::queue &q, + size_t n, + const T *v1, + const T *v2, + T *r, + const std::vector &depends = {}) +{ + sycl::event r_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.parallel_for>(sycl::range<1>{n}, [=](sycl::id<1> id) { + auto i = id.get(0); + r[i] = v1[i] - v2[i]; + }); + }); + + return r_ev; +} + +template class axpy_inplace_kern; +template class axpby_inplace_kern; + +template +sycl::event axpby_inplace(sycl::queue &q, + size_t nelems, + T a, + const T *x, + T b, + T *y, + const std::vector depends = {}) +{ + + sycl::event res_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + if (b == T(1)) { + cgh.parallel_for>(sycl::range<1>{nelems}, + [=](sycl::id<1> id) { + auto i = id.get(0); + y[i] += a * x[i]; + }); + } + else { + cgh.parallel_for>( + sycl::range<1>{nelems}, [=](sycl::id<1> id) { + auto i = id.get(0); + y[i] = b * y[i] + a * x[i]; + }); + } + }); + + return res_ev; +} + +template class norm_squared_blocking_kern; + +template +T norm_squared_blocking(sycl::queue &q, + size_t nelems, + const T *r, + const std::vector &depends = {}) +{ + sycl::buffer sum_sq_buf(sycl::range<1>{1}); + + q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + auto sum_sq_reduction = sycl::reduction( + sum_sq_buf, cgh, sycl::plus(), + {sycl::property::reduction::initialize_to_identity{}}); + cgh.parallel_for>( + sycl::range<1>{nelems}, sum_sq_reduction, + [=](sycl::id<1> id, auto &sum_sq) { + auto i = id.get(0); + sum_sq += r[i] * r[i]; + }); + }).wait_and_throw(); + + sycl::host_accessor ha(sum_sq_buf); + return ha[0]; +} + +template class complex_norm_squared_blocking_kern; + +template +T complex_norm_squared_blocking(sycl::queue &q, + size_t nelems, + const std::complex *r, + const std::vector &depends = {}) +{ + sycl::buffer sum_sq_buf(sycl::range<1>{1}); + + q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + auto sum_sq_reduction = sycl::reduction( + sum_sq_buf, cgh, sycl::plus(), + {sycl::property::reduction::initialize_to_identity{}}); + cgh.parallel_for>( + sycl::range<1>{nelems}, sum_sq_reduction, + [=](sycl::id<1> id, auto &sum_sq) { + auto i = id.get(0); + sum_sq += + r[i].real() * r[i].real() + r[i].imag() * r[i].imag(); + }); + }).wait_and_throw(); + + sycl::host_accessor ha(sum_sq_buf); + return ha[0]; +} + +} // namespace detail + +template +int cg_solve(sycl::queue exec_q, + std::int64_t n, + const T *Amat, + const T *bvec, + T *sol_vec, + const std::vector &depends = {}, + T rs_threshold = T(1e-20)) +{ + T *x_vec = sol_vec; + sycl::event fill_ev = exec_q.fill(x_vec, T(0), n, depends); + + // n for r, n for p, n for Ap and 1 for pAp_dot_dev + T *r = sycl::malloc_device(3 * n + 1, exec_q); + T *p = r + n; + T *Ap = p + n; + T *pAp_dot_dev = Ap + n; + + sycl::event copy_to_r_ev = exec_q.copy(bvec, r, n, depends); + sycl::event copy_to_p_ev = exec_q.copy(bvec, p, n, depends); + T rs_old = detail::norm_squared_blocking(exec_q, n, r, {copy_to_r_ev}); + + std::int64_t max_iters = n; + + if (rs_old < rs_threshold) { + sycl::free(r, exec_q); + return 0; + } + + int converged_at = max_iters; + sycl::event prev_dep = copy_to_p_ev; + sycl::event e_x = fill_ev; + + for (std::int64_t i = 0; i < max_iters; ++i) { + sycl::event gemv_ev = oneapi::mkl::blas::row_major::gemv( + exec_q, oneapi::mkl::transpose::N, n, n, T(1), Amat, n, p, 1, T(0), + Ap, 1, {prev_dep}); + + sycl::event pAp_dot_ev = oneapi::mkl::blas::row_major::dot( + exec_q, n, p, 1, Ap, 1, pAp_dot_dev, {prev_dep, gemv_ev}); + + T pAp_dot_host{}; + exec_q.copy(pAp_dot_dev, &pAp_dot_host, 1, {pAp_dot_ev}) + .wait_and_throw(); + T alpha = rs_old / pAp_dot_host; + + // x = x + alpha * p + sycl::event x_update_ev = + detail::axpby_inplace(exec_q, n, alpha, p, T(1), x_vec, {e_x}); + + // r = r - alpha * Ap + sycl::event r_update_ev = + detail::axpby_inplace(exec_q, n, -alpha, Ap, T(1), r); + + T rs_new = detail::norm_squared_blocking(exec_q, n, r, {r_update_ev}); + + if (rs_new < rs_threshold) { + converged_at = i; + x_update_ev.wait(); + break; + } + + T beta = rs_new / rs_old; + + // p = r + beta * p + prev_dep = + detail::axpby_inplace(exec_q, n, T(1), r, beta, p, {r_update_ev}); + e_x = x_update_ev; + + rs_old = rs_new; + } + + sycl::free(r, exec_q); + return converged_at; +} + +} // namespace cg_solver diff --git a/examples/pybind11/onemkl_gemv/sycl_timing_solver.py b/examples/pybind11/onemkl_gemv/sycl_timing_solver.py new file mode 100644 index 0000000000..42178ee0ed --- /dev/null +++ b/examples/pybind11/onemkl_gemv/sycl_timing_solver.py @@ -0,0 +1,92 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys + +import numpy as np + +# coding: utf-8 +import solve +import sycl_gemm + +import dpctl +import dpctl.tensor as dpt + +argv = sys.argv + +n = 1000 +rank = 11 + +if len(argv) > 1: + n = int(argv[1]) +if len(argv) > 2: + rank = int(argv[2]) + + +print( + f"Solving {n} by {n} diagonal linear " + f"system with rank {rank} perturbation." +) + +Anp = np.eye(n, n) + (lambda x: x.T @ x)(np.random.randn(rank, n)) +bnp = np.random.rand(n) + +q = dpctl.SyclQueue(property=["enable_profiling"]) +q.print_device_info() +if q.is_in_order: + print("Using in-order queue") +else: + print("Using not in-order queue") + +api_dev = dpctl.tensor.Device.create_device(q) +A = dpt.asarray(Anp, "d", device=api_dev) +b = dpt.asarray(bnp, "d", device=api_dev) + +timer = dpctl.SyclTimer(time_scale=1e3) + +iters = [] +for i in range(6): + with timer(api_dev.sycl_queue): + x, conv_in = solve.cg_solve(A, b) + + print(i, "(host_dt, device_dt)=", timer.dt) + iters.append(conv_in) + +print("Converged in: ", iters) + +r = dpt.empty_like(b) +hev, ev = sycl_gemm.gemv(q, A, x, r) +delta = dpt.empty_like(b) +hev2, ev2 = sycl_gemm.sub(q, r, b, delta, [ev]) +rs = sycl_gemm.norm_squared_blocking(q, delta) +dpctl.SyclEvent.wait_for([hev, hev2]) +print(f"Python solution residual norm squared: {rs}") + +x_cpp = dpt.empty_like(b) +iters = [] +for i in range(6): + with timer(api_dev.sycl_queue): + conv_in = sycl_gemm.cpp_cg_solve(q, A, b, x_cpp) + + print(i, "(host_dt, device_dt)=", timer.dt) + iters.append(conv_in) + +print("Converged in: ", iters) +hev, ev = sycl_gemm.gemv(q, A, x_cpp, r) +hev2, ev2 = sycl_gemm.sub(q, r, b, delta, [ev]) +rs = sycl_gemm.norm_squared_blocking(q, delta) +dpctl.SyclEvent.wait_for([hev, hev2]) +print(f"cpp_cg_solve solution residual norm squared: {rs}") diff --git a/examples/pybind11/onemkl_gemv/tests/test_gemm.py b/examples/pybind11/onemkl_gemv/tests/test_gemm.py index 7a78fba346..6090322c75 100644 --- a/examples/pybind11/onemkl_gemv/tests/test_gemm.py +++ b/examples/pybind11/onemkl_gemv/tests/test_gemm.py @@ -1,6 +1,12 @@ import numpy as np import pytest -from sycl_gemm import gemv +from sycl_gemm import ( + axpby_inplace, + dot_blocking, + gemv, + norm_squared_blocking, + sub, +) import dpctl import dpctl.tensor as dpt @@ -15,7 +21,59 @@ def test_gemv(): r = dpt.empty((5,), dtype="d", sycl_queue=q) M = dpt.asarray(Mnp, sycl_queue=q) v = dpt.asarray(vnp, sycl_queue=q) - hev, ev = gemv(M.sycl_queue, M, v, r, []) + hev, ev = gemv(q, M, v, r, []) hev.wait() rnp = dpt.asnumpy(r) assert np.allclose(rnp, Mnp @ vnp) + + +def test_sub(): + try: + q = dpctl.SyclQueue() + except dpctl.SyclQueueCreationError: + pytest.skip("Queue could not be created") + anp, bnp = np.random.randn(5), np.random.randn(5) + r = dpt.empty((5,), dtype="d", sycl_queue=q) + a = dpt.asarray(anp, sycl_queue=q) + b = dpt.asarray(bnp, sycl_queue=q) + hev, ev = sub(q, a, b, r, []) + hev.wait() + rnp = dpt.asnumpy(r) + assert np.allclose(rnp + bnp, anp) + + +def test_axpby(): + try: + q = dpctl.SyclQueue() + except dpctl.SyclQueueCreationError: + pytest.skip("Queue could not be created") + xnp, pnp = np.random.randn(5), np.random.randn(5) + x = dpt.asarray(xnp, sycl_queue=q) + p = dpt.asarray(pnp, sycl_queue=q) + hev, ev = axpby_inplace(q, 0.5, x, -0.7, p, []) + hev.wait() + rnp = dpt.asnumpy(p) + assert np.allclose(rnp, 0.5 * xnp - 0.7 * pnp) + + +def test_dot(): + try: + q = dpctl.SyclQueue() + except dpctl.SyclQueueCreationError: + pytest.skip("Queue could not be created") + anp, bnp = np.random.randn(5), np.random.randn(5) + a = dpt.asarray(anp, sycl_queue=q) + b = dpt.asarray(bnp, sycl_queue=q) + dot_res = dot_blocking(q, a, b) + assert np.allclose(dot_res, np.dot(anp, bnp)) + + +def test_norm_squared(): + try: + q = dpctl.SyclQueue() + except dpctl.SyclQueueCreationError: + pytest.skip("Queue could not be created") + anp = np.random.randn(5) + a = dpt.asarray(anp, sycl_queue=q) + dot_res = norm_squared_blocking(q, a) + assert np.allclose(dot_res, np.dot(anp, anp)) diff --git a/examples/pybind11/use_dpctl_syclqueue/CMakeLists.txt b/examples/pybind11/use_dpctl_syclqueue/CMakeLists.txt new file mode 100644 index 0000000000..a788c56bae --- /dev/null +++ b/examples/pybind11/use_dpctl_syclqueue/CMakeLists.txt @@ -0,0 +1,36 @@ +cmake_minimum_required(VERSION 3.21) + +project(use_queue_device LANGUAGES CXX) + +set(DPCTL_CMAKE_MODULES_PATH "${CMAKE_SOURCE_DIR}/../../../cmake") +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${DPCTL_CMAKE_MODULES_PATH}) +find_package(IntelDPCPP REQUIRED PATHS ${DPCTL_CMAKE_MODULES_PATH} NO_DEFAULT_PATH) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED True) + +# Fetch pybind11 +include(FetchContent) +FetchContent_Declare( + pybind11 + URL https://github.com/pybind/pybind11/archive/refs/tags/v2.9.2.tar.gz + URL_HASH SHA256=6bd528c4dbe2276635dc787b6b1f2e5316cf6b49ee3e150264e455a0d68d19c1 +) +FetchContent_MakeAvailable(pybind11) + +find_package(PythonExtensions REQUIRED) +find_package(Dpctl REQUIRED) +find_package(NumPy REQUIRED) + +set(py_module_name _use_queue_device) +pybind11_add_module(${py_module_name} + MODULE + use_queue_device/_example.cpp +) +target_include_directories(${py_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS}) +target_compile_options(${py_module_name} PRIVATE -Wno-deprecated-declarations) +install(TARGETS ${py_module_name} + DESTINATION use_queue_device +) + +set(ignoreMe "${SKBUILD}") diff --git a/examples/pybind11/use_dpctl_syclqueue/README.md b/examples/pybind11/use_dpctl_syclqueue/README.md index 3e7b6804a9..f3ef6e92e5 100644 --- a/examples/pybind11/use_dpctl_syclqueue/README.md +++ b/examples/pybind11/use_dpctl_syclqueue/README.md @@ -9,7 +9,8 @@ extensions. ``` source /opt/intel/oneapi/compiler/latest/env/vars.sh -CXX=dpcpp CC=dpcpp python setup.py build_ext --inplace +CXX=icpx python setup.py build_ext --inplace +python -m pytest tests python example.py ``` diff --git a/examples/pybind11/use_dpctl_syclqueue/example.py b/examples/pybind11/use_dpctl_syclqueue/example.py index 7189adecb8..698e43baf9 100644 --- a/examples/pybind11/use_dpctl_syclqueue/example.py +++ b/examples/pybind11/use_dpctl_syclqueue/example.py @@ -17,7 +17,7 @@ # coding: utf-8 import numpy as np -import use_queue_device_ext as eg +import use_queue_device as eg import dpctl diff --git a/examples/pybind11/use_dpctl_syclqueue/setup.py b/examples/pybind11/use_dpctl_syclqueue/setup.py index 34eeebe38c..4d09be820e 100644 --- a/examples/pybind11/use_dpctl_syclqueue/setup.py +++ b/examples/pybind11/use_dpctl_syclqueue/setup.py @@ -14,20 +14,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -from pybind11.setup_helpers import Pybind11Extension -from setuptools import setup +from skbuild import setup -import dpctl - -exts = [ - Pybind11Extension( - "use_queue_device_ext", - ["./_example.cpp"], - include_dirs=[dpctl.get_include()], - extra_compile_args=["-fPIC"], - extra_link_args=["-fPIC"], - language="c++", - ), -] - -setup(name="pybind11_example", ext_modules=exts) +setup( + name="use_queue_device", + version="0.0.1", + description="an example of SYCL-powered Python package (with pybind11)", + author="Intel Scripting", + license="Apache 2.0", + packages=["use_queue_device"], +) diff --git a/examples/pybind11/use_dpctl_syclqueue/tests/test_queue_device.py b/examples/pybind11/use_dpctl_syclqueue/tests/test_queue_device.py new file mode 100644 index 0000000000..120cc1fd9e --- /dev/null +++ b/examples/pybind11/use_dpctl_syclqueue/tests/test_queue_device.py @@ -0,0 +1,57 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# coding: utf-8 + +import numpy as np +import use_queue_device as uqd + +import dpctl + + +def test_compute_units(): + q = dpctl.SyclQueue() + mcu = uqd.get_max_compute_units(q) + + assert type(mcu) is int + assert mcu == q.sycl_device.max_compute_units + + +def test_global_memory(): + d = dpctl.SyclDevice() + gm = uqd.get_device_global_mem_size(d) + assert type(gm) is int + assert gm == d.global_mem_size + + +def test_local_memory(): + d = dpctl.SyclDevice() + lm = uqd.get_device_local_mem_size(d) + assert type(lm) is int + assert lm == d.local_mem_size + + +def test_offload_array_mod(): + execution_queue = dpctl.SyclQueue() + X = np.random.randint(low=1, high=2**16 - 1, size=10**6, dtype=np.int64) + modulus_p = 347 + + # Y is a regular NumPy array with NumPy allocated host memory + Y = uqd.offloaded_array_mod(execution_queue, X, modulus_p) + + Ynp = X % modulus_p + + assert np.array_equal(Y, Ynp) diff --git a/examples/pybind11/use_dpctl_syclqueue/use_queue_device/__init__.py b/examples/pybind11/use_dpctl_syclqueue/use_queue_device/__init__.py new file mode 100644 index 0000000000..ccbca8fd73 --- /dev/null +++ b/examples/pybind11/use_dpctl_syclqueue/use_queue_device/__init__.py @@ -0,0 +1,42 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# coding: utf-8 + +from ._use_queue_device import ( + get_device_global_mem_size, + get_device_local_mem_size, + get_max_compute_units, + offloaded_array_mod, +) + +__all__ = [ + "get_max_compute_units", + "get_device_global_mem_size", + "get_device_local_mem_size", + "offloaded_array_mod", +] + +__doc__ = """ +Example pybind11 extension demonstrating binding of dpctl entities to +SYCL entities. + +dpctl provides type casters that bind ``sycl::queue`` to `dpctl.SyclQueue`, +``sycl::device`` to `dpctl.SyclDevice`, etc. + +Use of these type casters simplifies writing of Python extensions and compile +then using SYCL C++ compilers, such as Intel(R) oneAPI DPC++ compiler. +""" diff --git a/examples/pybind11/use_dpctl_syclqueue/_example.cpp b/examples/pybind11/use_dpctl_syclqueue/use_queue_device/_example.cpp similarity index 98% rename from examples/pybind11/use_dpctl_syclqueue/_example.cpp rename to examples/pybind11/use_dpctl_syclqueue/use_queue_device/_example.cpp index 7b26a35858..54474a0ca1 100644 --- a/examples/pybind11/use_dpctl_syclqueue/_example.cpp +++ b/examples/pybind11/use_dpctl_syclqueue/use_queue_device/_example.cpp @@ -84,7 +84,7 @@ offloaded_array_mod(sycl::queue &q, return res; } -PYBIND11_MODULE(use_queue_device_ext, m) +PYBIND11_MODULE(_use_queue_device, m) { // Import the dpctl extensions import_dpctl();