From c2042e9e1ed71c0dd6a38d7faafd7cde001acb29 Mon Sep 17 00:00:00 2001 From: Ruyman Reyes Date: Thu, 17 Jun 2021 20:35:05 +0100 Subject: [PATCH 1/4] Mixing CUDA Fortran and SYCL CUDA Example showing how to link together a program written in CUDA FORTRAN that calls a routine that contains a SYCL program running on CUDA --- example-06/Makefile | 22 +++++++++++++++++++ example-06/saxpy.cpp | 25 +++++++++++++++++++++ example-06/saxpy.cuf | 52 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 99 insertions(+) create mode 100644 example-06/Makefile create mode 100644 example-06/saxpy.cpp create mode 100644 example-06/saxpy.cuf diff --git a/example-06/Makefile b/example-06/Makefile new file mode 100644 index 0000000..08283f7 --- /dev/null +++ b/example-06/Makefile @@ -0,0 +1,22 @@ +CXX=clang++ +FORT=nvfortran +FFLAGS=-c++libs -cuda +CXXFLAGS=-fsycl -fsycl-targets=nvptx64-nvidia-cuda-sycldevice -fsycl-unnamed-lambda + + +default: final.exe + +saxpy_sycl.so: saxpy.cpp + $(CXX) $(CXXFLAGS) -fPIC --shared saxpy.cpp -o saxpy_sycl.so + +saxpy_cuf.o: saxpy.cuf + $(FORT) $(FFLAGS) -c saxpy.cuf -o saxpy_cuf.o + +final.exe: saxpy_cuf.o saxpy_sycl.so + $(FORT) $(FFLAGS) -o final.exe saxpy_cuf.o saxpy_sycl.so -L/home/ruyman/soft/dpcpp/lib/ -lsycl + +.PHONY: clean + +clean: + rm -f saxpy_cuf.o saxpy_sycl.so final.exe mathops.mod + diff --git a/example-06/saxpy.cpp b/example-06/saxpy.cpp new file mode 100644 index 0000000..7a76b2a --- /dev/null +++ b/example-06/saxpy.cpp @@ -0,0 +1,25 @@ +#include +#include + +extern "C" { + void saxpy_sycl_cuda_wrapper (float* x, float* y, float a, int N); +}; + + +void saxpy_sycl_cuda_wrapper (float* x, float* y, float a, int N) { + sycl::context c{sycl::property::context::cuda::use_primary_context()}; + sycl::queue q{c, c.get_devices()[0]}; + { + sycl::buffer bX {x, sycl::range<1>(N)}; + + q.submit([&](sycl::handler& h) { + auto aX = bX.get_access(h); + h.single_task([=]() { + aX[0] = 3.f; + }); + }); + + q.wait_and_throw(); + } + return; +} diff --git a/example-06/saxpy.cuf b/example-06/saxpy.cuf new file mode 100644 index 0000000..265eec8 --- /dev/null +++ b/example-06/saxpy.cuf @@ -0,0 +1,52 @@ +module mathOps +contains + attributes(global) subroutine saxpy(x, y, a) + implicit none + real :: x(:), y(:) + real, value :: a + integer :: i, n + n = size(x) + i = blockDim%x * (blockIdx%x - 1) + threadIdx%x + if (i <= n) y(i) = y(i) + a*x(i) + end subroutine saxpy +end module mathOps + +program testSaxpy + use mathOps + use cudafor + + implicit none + +interface saxpy_sycl + subroutine saxpy_call(x, y, a, N) & + bind(C,name='saxpy_sycl_cuda_wrapper') + implicit none + real :: x(:), y(:) + real, value :: a + integer, value :: N + end subroutine +end interface + + + integer, parameter :: N = 1024 + real :: x(N), y(N), a + real, device :: x_d(N), y_d(N) + type(dim3) :: grid, tBlock + + tBlock = dim3(256,1,1) + grid = dim3(ceiling(real(N)/tBlock%x),1,1) + + write (*,*) 'CUDA version: ' + x = 1.0; y = 2.0; a = 2.0 + x_d = x + y_d = y + call saxpy<<>>(x_d, y_d, a) + y = y_d + write(*,*) 'Max error: ', maxval(abs(y-4.0)) + write(*,*) 'N ', N + + write (*,*) 'SYCL version: ' + call saxpy_call(x, y, a, N); + write(*,*) 'Max error: ', maxval(abs(y-4.0)) + +end program testSaxpy From ffe82475385c6a8c46208183b317d20d9ff8cc2e Mon Sep 17 00:00:00 2001 From: Ruyman Reyes Date: Fri, 18 Jun 2021 16:51:24 +0100 Subject: [PATCH 2/4] Fixed: SYCL code does a proper SAXPY operation --- example-06/saxpy.cpp | 7 +++++-- example-06/saxpy.cuf | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/example-06/saxpy.cpp b/example-06/saxpy.cpp index 7a76b2a..d36731a 100644 --- a/example-06/saxpy.cpp +++ b/example-06/saxpy.cpp @@ -11,11 +11,14 @@ void saxpy_sycl_cuda_wrapper (float* x, float* y, float a, int N) { sycl::queue q{c, c.get_devices()[0]}; { sycl::buffer bX {x, sycl::range<1>(N)}; + sycl::buffer bY {y, sycl::range<1>(N)}; q.submit([&](sycl::handler& h) { auto aX = bX.get_access(h); - h.single_task([=]() { - aX[0] = 3.f; + auto aY = bY.get_access(h); + h.parallel_for(sycl::range<1>(N), [=](sycl::id<1> id) { + if (id[0] < N) + aY[id] = aX[id] * a + aY[id]; }); }); diff --git a/example-06/saxpy.cuf b/example-06/saxpy.cuf index 265eec8..85efad9 100644 --- a/example-06/saxpy.cuf +++ b/example-06/saxpy.cuf @@ -46,6 +46,7 @@ end interface write(*,*) 'N ', N write (*,*) 'SYCL version: ' + y = 2.0; call saxpy_call(x, y, a, N); write(*,*) 'Max error: ', maxval(abs(y-4.0)) From 67ac99240dec878114dd48b164b1e210ff439cab Mon Sep 17 00:00:00 2001 From: Ruyman Reyes Date: Fri, 18 Jun 2021 20:46:05 +0100 Subject: [PATCH 3/4] Adding README --- example-06/README.md | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 example-06/README.md diff --git a/example-06/README.md b/example-06/README.md new file mode 100644 index 0000000..d819059 --- /dev/null +++ b/example-06/README.md @@ -0,0 +1,32 @@ +CUDA Frotran and SYCL integration +====================================== + +This directory shows an example of how to call a SYCL function +from a CUDA fortran code. + +The SYCL routine is called using the Fortran ISO bindings like +any other C function. + +```fortran +interface saxpy_sycl + subroutine saxpy_call(x, y, a, N) & + bind(C,name='saxpy_sycl_cuda_wrapper') + implicit none + real :: x(:), y(:) + real, value :: a + integer, value :: N + end subroutine +end interface +``` + +The SYCL code implemented in the C++ version of the code works as usual with one minor modification: +Uses the CUDA Primary context to enable inter-operating with the CUDA Fortran code, ensuring the same resources are shared. + +The following snipped highligts the construction of a SYCL context associated with the Primary context. + +```cpp +sycl::context c{sycl::property::context::cuda::use_primary_context()}; +sycl::queue q{c, c.get_devices()[0]}; +``` + + From d66be3dafadd0995f40a093e77ed44d61fb20adb Mon Sep 17 00:00:00 2001 From: Ruyman Reyes Date: Sun, 27 Jun 2021 15:34:52 +0100 Subject: [PATCH 4/4] Using default stream for synchronization --- example-06/Makefile | 4 ++-- example-06/README.md | 6 +++++- example-06/saxpy.cpp | 34 +++++++++++++++++----------------- 3 files changed, 24 insertions(+), 20 deletions(-) diff --git a/example-06/Makefile b/example-06/Makefile index 08283f7..b7685f0 100644 --- a/example-06/Makefile +++ b/example-06/Makefile @@ -2,7 +2,7 @@ CXX=clang++ FORT=nvfortran FFLAGS=-c++libs -cuda CXXFLAGS=-fsycl -fsycl-targets=nvptx64-nvidia-cuda-sycldevice -fsycl-unnamed-lambda - +DPCPP_PATH=/home/ruyman/sycl_workspace/build_dpcpp/install default: final.exe @@ -13,7 +13,7 @@ saxpy_cuf.o: saxpy.cuf $(FORT) $(FFLAGS) -c saxpy.cuf -o saxpy_cuf.o final.exe: saxpy_cuf.o saxpy_sycl.so - $(FORT) $(FFLAGS) -o final.exe saxpy_cuf.o saxpy_sycl.so -L/home/ruyman/soft/dpcpp/lib/ -lsycl + $(FORT) $(FFLAGS) -o final.exe saxpy_cuf.o saxpy_sycl.so -L${DPCPP_PATH}/lib/ -lsycl .PHONY: clean diff --git a/example-06/README.md b/example-06/README.md index d819059..6b0ac88 100644 --- a/example-06/README.md +++ b/example-06/README.md @@ -23,10 +23,14 @@ The SYCL code implemented in the C++ version of the code works as usual with one Uses the CUDA Primary context to enable inter-operating with the CUDA Fortran code, ensuring the same resources are shared. The following snipped highligts the construction of a SYCL context associated with the Primary context. +To ensure synchronization with the CUDA Fortran code, the queue will also be mapped to the default CUDA +stream, instead of creating a new stream. +It is possible to create a normal stream, just by using the default SYCL queue constructor on the CUDA +context. Said queue will run concurrently (i.e. won't sync) to the main queue. ```cpp sycl::context c{sycl::property::context::cuda::use_primary_context()}; -sycl::queue q{c, c.get_devices()[0]}; +sycl::queue q{c, c.get_devices()[0], sycl::property::queue::cuda::use_default_stream()}; ``` diff --git a/example-06/saxpy.cpp b/example-06/saxpy.cpp index d36731a..c0c75e0 100644 --- a/example-06/saxpy.cpp +++ b/example-06/saxpy.cpp @@ -2,27 +2,27 @@ #include extern "C" { - void saxpy_sycl_cuda_wrapper (float* x, float* y, float a, int N); + void saxpy_sycl_cuda_wrapper (float* x, float* y, float a, int N); }; void saxpy_sycl_cuda_wrapper (float* x, float* y, float a, int N) { - sycl::context c{sycl::property::context::cuda::use_primary_context()}; - sycl::queue q{c, c.get_devices()[0]}; - { - sycl::buffer bX {x, sycl::range<1>(N)}; - sycl::buffer bY {y, sycl::range<1>(N)}; + sycl::context c{sycl::property::context::cuda::use_primary_context()}; + sycl::queue q{c, c.get_devices()[0], sycl::property::queue::cuda::use_default_stream()}; + { + sycl::buffer bX {x, sycl::range<1>(N)}; + sycl::buffer bY {y, sycl::range<1>(N)}; - q.submit([&](sycl::handler& h) { - auto aX = bX.get_access(h); - auto aY = bY.get_access(h); - h.parallel_for(sycl::range<1>(N), [=](sycl::id<1> id) { - if (id[0] < N) - aY[id] = aX[id] * a + aY[id]; - }); - }); + q.submit([&](sycl::handler& h) { + auto aX = bX.get_access(h); + auto aY = bY.get_access(h); + h.parallel_for(sycl::range<1>(N), [=](sycl::id<1> id) { + if (id[0] < N) + aY[id] = aX[id] * a + aY[id]; + }); + }); - q.wait_and_throw(); - } - return; + q.wait_and_throw(); + } + return; }