From 245ccd02ce961ad2ba6f6cbf7d9783d269bf8dc9 Mon Sep 17 00:00:00 2001
From: Ruyman Reyes Castro
Date: Thu, 9 Feb 2023 16:44:52 +0000
Subject: [PATCH 1/7] Ignoring VIM temporary files

---
 .gitignore | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 55f356e..8fe35b5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -34,4 +34,7 @@
 # Temporaries
 *~
 *#
-*/build
\ No newline at end of file
+*/build
+]
+# vim
+*.swp

From 0c886cb3a3281da186100f41155a660b6943931a Mon Sep 17 00:00:00 2001
From: Ruyman Reyes Castro
Date: Thu, 9 Feb 2023 17:16:34 +0000
Subject: [PATCH 2/7] Vector Addition example update

Works with the latest DPC++ and SYCL 2020 features.
README updated to reflect that the CUDA backend now has USM support.
---
 examples/vector_addition/README.md           |  3 ---
 examples/vector_addition/vector_addition.cpp | 12 +++++++-----
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/examples/vector_addition/README.md b/examples/vector_addition/README.md
index b82357a..133fc51 100644
--- a/examples/vector_addition/README.md
+++ b/examples/vector_addition/README.md
@@ -7,9 +7,6 @@
 to highlight how to build an application with SYCL for CUDA using DPC++ support,
 for which an example CMakefile is provided.
 For detailed documentation on how to migrate from CUDA to SYCL, see
 [SYCL For CUDA Developers](https://developer.codeplay.com/products/computecpp/ce/guides/sycl-for-cuda-developers).
 
-Note currently the CUDA backend does not support the [USM](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc) extension, so we use
-`sycl::buffer` and `sycl::accessors` instead.
-
 Pre-requisites
 ---------------

diff --git a/examples/vector_addition/vector_addition.cpp b/examples/vector_addition/vector_addition.cpp
index 9fdaf87..98a8143 100644
--- a/examples/vector_addition/vector_addition.cpp
+++ b/examples/vector_addition/vector_addition.cpp
@@ -34,10 +34,12 @@ int main(int argc, char *argv[]) {
 
   // Initialize input data
   {
-    const auto dwrite_t = sycl::access::mode::discard_write;
+    const auto dwrite_t = sycl::write_only;
+
+
+    sycl::host_accessor h_a{bufA, sycl::write_only};
+    sycl::host_accessor h_b{bufB, sycl::write_only};
 
-    auto h_a = bufA.get_access();
-    auto h_b = bufB.get_access();
     for (int i = 0; i < N; i++) {
       h_a[i] = sin(i) * sin(i);
       h_b[i] = cos(i) * cos(i);
@@ -70,8 +72,8 @@
   myQueue.submit(cg);
 
   {
-    const auto read_t = sycl::access::mode::read;
-    auto h_c = bufC.get_access();
+    sycl::host_accessor h_c{bufC, sycl::read_only};
+
     double sum = 0.0f;
     for (int i = 0; i < N; i++) {
       sum += h_c[i];

From 283e69d46dccd91d4324f3c0bfbde4d391f2de30 Mon Sep 17 00:00:00 2001
From: Ruyman Reyes Castro
Date: Mon, 13 Feb 2023 11:59:36 +0000
Subject: [PATCH 3/7] Removing unnecessary CUDA Driver types

Since https://github.com/intel/llvm/pull/8197, the SYCL CUDA backend uses the
CUDA primary context by default, so setting the context individually is no
longer required.
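
For reference, a minimal self-contained sketch of the resulting interop flow
follows. It is illustrative only and not part of the diff below: the header
names, the ext_oneapi_cuda backend template arguments, the float* casts and
the omission of error checking are assumptions about the DPC++ CUDA backend
rather than code taken from these examples.

    #include <sycl/sycl.hpp>

    #include <cublas_v2.h>
    #include <cuda.h>

    #include <vector>

    int main() {
      constexpr int N = 16;
      std::vector<float> h_A(N * N, 1.0f), h_B(N * N, 1.0f), h_C(N * N, 0.0f);
      float alpha = 1.0f, beta = 0.0f;

      sycl::queue q{sycl::gpu_selector_v};
      cublasHandle_t handle;
      cublasCreate(&handle);

      {
        sycl::buffer<float> b_A{h_A.data(), sycl::range<1>{h_A.size()}};
        sycl::buffer<float> b_B{h_B.data(), sycl::range<1>{h_B.size()}};
        sycl::buffer<float> b_C{h_C.data(), sycl::range<1>{h_C.size()}};

        q.submit([&](sycl::handler &h) {
           auto d_A = b_A.get_access<sycl::access::mode::read>(h);
           auto d_B = b_B.get_access<sycl::access::mode::read>(h);
           auto d_C = b_C.get_access<sycl::access::mode::write>(h);

           h.host_task([=](sycl::interop_handle ih) {
             // The queue's CUDA (primary) context is already active for this
             // host task, so no explicit context switch is performed here.
             auto cuStream =
                 ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
             cublasSetStream(handle, cuStream);
             auto cuA = reinterpret_cast<float *>(
                 ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(d_A));
             auto cuB = reinterpret_cast<float *>(
                 ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(d_B));
             auto cuC = reinterpret_cast<float *>(
                 ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(d_C));
             // C = alpha * A * B + beta * C on the stream backing the queue.
             cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha,
                         cuA, N, cuB, N, &beta, cuC, N);
             cuStreamSynchronize(cuStream);
           });
         }).wait();
      } // b_C's destructor copies the result back into h_C.

      cublasDestroy(handle);
      return 0;
    }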
---
 examples/sgemm_interop/sycl_sgemm.cpp     | 10 +++++-----
 examples/sgemm_interop/sycl_sgemm_usm.cpp | 11 ++++-------
 2 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/examples/sgemm_interop/sycl_sgemm.cpp b/examples/sgemm_interop/sycl_sgemm.cpp
index 0e90beb..119368f 100644
--- a/examples/sgemm_interop/sycl_sgemm.cpp
+++ b/examples/sgemm_interop/sycl_sgemm.cpp
@@ -25,11 +25,10 @@
 #include
 #include
 
-#include
-#include
+#include
 
-#include
 #include
+#include
 
 #define CHECK_ERROR(FUNC) checkCudaErrorMsg(FUNC, " " #FUNC)
@@ -47,12 +46,14 @@ void inline checkCudaErrorMsg(cudaError status, const char *msg) {
   }
 }
 
-void inline checkCudaErrorMsg(CUresult status, const char *msg) {
+/*
+ * void inline checkCudaErrorMsg(CUresult status, const char *msg) {
   if (status != CUDA_SUCCESS) {
     std::cout << "ERROR CUDA: " << msg << " - " << status << std::endl;
     exit(EXIT_FAILURE);
   }
 }
+*/
 
 class CUDASelector : public sycl::device_selector {
 public:
@@ -104,7 +105,6 @@ int main() {
     auto d_C = b_C.get_access(h);
 
     h.host_task([=](sycl::interop_handle ih) {
-      cuCtxSetCurrent(ih.get_native_context());
       auto cuStream = ih.get_native_queue();
       cublasSetStream(handle, cuStream);
       auto cuA = reinterpret_cast(ih.get_native_mem(d_A));
diff --git a/examples/sgemm_interop/sycl_sgemm_usm.cpp b/examples/sgemm_interop/sycl_sgemm_usm.cpp
index 2468bf4..617afd7 100644
--- a/examples/sgemm_interop/sycl_sgemm_usm.cpp
+++ b/examples/sgemm_interop/sycl_sgemm_usm.cpp
@@ -25,11 +25,10 @@
 #include
 #include
 
-#include
-#include
+#include
 
-#include
 #include
+#include
 
 #define CHECK_ERROR(FUNC) checkCudaErrorMsg(FUNC, " " #FUNC)
@@ -47,12 +46,12 @@ void inline checkCudaErrorMsg(cudaError status, const char *msg) {
   }
 }
 
-void inline checkCudaErrorMsg(CUresult status, const char *msg) {
+/* void inline checkCudaErrorMsg(CUresult status, const char *msg) {
   if (status != CUDA_SUCCESS) {
     std::cout << "ERROR CUDA: " << msg << " - " << status << std::endl;
     exit(EXIT_FAILURE);
   }
-}
+} */
 
 class CUDASelector : public sycl::device_selector {
 public:
@@ -107,9 +106,7 @@ int main() {
 
   q.submit([&](handler &h) {
 
     h.host_task([=](sycl::interop_handle ih) {
-
       // Set the correct cuda context & stream
-      cuCtxSetCurrent(ih.get_native_context());
       auto cuStream = ih.get_native_queue();
       cublasSetStream(handle, cuStream);

From 0d4facbd129df94339582227fcb5e9c0798e6a30 Mon Sep 17 00:00:00 2001
From: Ruyman Reyes Castro
Date: Mon, 13 Feb 2023 12:09:34 +0000
Subject: [PATCH 4/7] Using modern queue construction

SYCL 1.2.1 device selectors have been deprecated in favour of a new
simplified form using lambdas.
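
As an illustration of the new form (hypothetical code, not part of the diff
below), any callable that takes a const sycl::device& and returns an int can
be passed straight to the queue constructor; the device with the highest
non-negative score is chosen and a negative score rejects the device:

    #include <sycl/sycl.hpp>

    #include <iostream>

    // Standalone selector similar to the lambda used in the examples.
    int cuda_selector(const sycl::device &d) {
      return d.get_backend() == sycl::backend::ext_oneapi_cuda ? 1 : -1;
    }

    int main() {
      // Throws a sycl::exception if no device scores zero or higher,
      // i.e. if no CUDA device is available.
      sycl::queue q{cuda_selector};
      std::cout << "Running on "
                << q.get_device().get_info<sycl::info::device::name>() << "\n";
      return 0;
    }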
---
 examples/sgemm_interop/sycl_sgemm.cpp     | 23 +----------------------
 examples/sgemm_interop/sycl_sgemm_usm.cpp | 20 +-------------------
 2 files changed, 2 insertions(+), 41 deletions(-)

diff --git a/examples/sgemm_interop/sycl_sgemm.cpp b/examples/sgemm_interop/sycl_sgemm.cpp
index 119368f..14bbd02 100644
--- a/examples/sgemm_interop/sycl_sgemm.cpp
+++ b/examples/sgemm_interop/sycl_sgemm.cpp
@@ -46,27 +46,6 @@ void inline checkCudaErrorMsg(cudaError status, const char *msg) {
   }
 }
 
-/*
- * void inline checkCudaErrorMsg(CUresult status, const char *msg) {
-  if (status != CUDA_SUCCESS) {
-    std::cout << "ERROR CUDA: " << msg << " - " << status << std::endl;
-    exit(EXIT_FAILURE);
-  }
-}
-*/
-
-class CUDASelector : public sycl::device_selector {
-public:
-  int operator()(const sycl::device &device) const override {
-    if(device.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda){
-      std::cout << " CUDA device found " << std::endl;
-      return 1;
-    } else{
-      return -1;
-    }
-  }
-};
-
 int main() {
   using namespace sycl;
 
@@ -89,7 +68,7 @@ int main() {
   // B is a matrix fill with 1
   std::fill(std::begin(h_B), std::end(h_B), 1.0f);
 
-  sycl::queue q{CUDASelector()};
+  sycl::queue q{[](auto& d) { return (d.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda); }};
 
   cublasHandle_t handle;
   CHECK_ERROR(cublasCreate(&handle));
diff --git a/examples/sgemm_interop/sycl_sgemm_usm.cpp b/examples/sgemm_interop/sycl_sgemm_usm.cpp
index 617afd7..f658872 100644
--- a/examples/sgemm_interop/sycl_sgemm_usm.cpp
+++ b/examples/sgemm_interop/sycl_sgemm_usm.cpp
@@ -46,24 +46,6 @@ void inline checkCudaErrorMsg(cudaError status, const char *msg) {
   }
 }
 
-/* void inline checkCudaErrorMsg(CUresult status, const char *msg) {
-  if (status != CUDA_SUCCESS) {
-    std::cout << "ERROR CUDA: " << msg << " - " << status << std::endl;
-    exit(EXIT_FAILURE);
-  }
-} */
-
-class CUDASelector : public sycl::device_selector {
-public:
-  int operator()(const sycl::device &device) const override {
-    if(device.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda){
-      std::cout << " CUDA device found " << std::endl;
-      return 1;
-    } else{
-      return -1;
-    }
-  }
-};
 
 int main() {
   using namespace sycl;
@@ -87,7 +69,7 @@ int main() {
   // B is a matrix fill with 1
   std::fill(std::begin(h_B), std::end(h_B), 1.0f);
 
-  sycl::queue q{CUDASelector()};
+  sycl::queue q{[](auto& d) { return (d.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda); }};
 
   // Allocate memory on the device
   float* d_A = sycl::malloc_device(WIDTH*HEIGHT,q);

From 6cfee58aff8960c0c918ab5731fc482919469369 Mon Sep 17 00:00:00 2001
From: Ruyman Reyes Castro
Date: Mon, 13 Feb 2023 12:12:14 +0000
Subject: [PATCH 5/7] Format files

Run clang-format on files, separate commit to avoid noise
---
 examples/sgemm_interop/sycl_sgemm.cpp | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/examples/sgemm_interop/sycl_sgemm.cpp b/examples/sgemm_interop/sycl_sgemm.cpp
index 14bbd02..e1b54b2 100644
--- a/examples/sgemm_interop/sycl_sgemm.cpp
+++ b/examples/sgemm_interop/sycl_sgemm.cpp
@@ -27,8 +27,8 @@
 
 #include
 
-#include
 #include
+#include
 
 #define CHECK_ERROR(FUNC) checkCudaErrorMsg(FUNC, " " #FUNC)
@@ -68,7 +68,9 @@ int main() {
   // B is a matrix fill with 1
   std::fill(std::begin(h_B), std::end(h_B), 1.0f);
 
-  sycl::queue q{[](auto& d) { return (d.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda); }};
+  sycl::queue q{[](auto &d) {
+    return (d.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda);
+  }};
 
   cublasHandle_t handle;
   CHECK_ERROR(cublasCreate(&handle));
@@ -86,9 +88,12 @@ int main() {
     h.host_task([=](sycl::interop_handle ih) {
       auto cuStream = ih.get_native_queue();
       cublasSetStream(handle, cuStream);
-      auto cuA = reinterpret_cast(ih.get_native_mem(d_A));
-      auto cuB = reinterpret_cast(ih.get_native_mem(d_B));
-      auto cuC = reinterpret_cast(ih.get_native_mem(d_C));
+      auto cuA = reinterpret_cast(
+          ih.get_native_mem(d_A));
+      auto cuB = reinterpret_cast(
+          ih.get_native_mem(d_B));
+      auto cuC = reinterpret_cast(
+          ih.get_native_mem(d_C));
 
       CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, HEIGHT,
                               WIDTH, &ALPHA, cuA, WIDTH, cuB, WIDTH, &BETA,

From ce4abc22fef41f421df8d27d92156599d9e08f3f Mon Sep 17 00:00:00 2001
From: Ruyman Reyes Castro
Date: Wed, 19 Apr 2023 17:13:35 +0100
Subject: [PATCH 6/7] Explicitly setting CUDA context on host task

Because of the changes to the SYCL context, it is now necessary to set the
active CUDA context manually inside the host task.

Note there was some clang-formatting here as well.
---
 examples/sgemm_interop/sycl_sgemm.cpp     |  2 ++
 examples/sgemm_interop/sycl_sgemm_usm.cpp | 39 ++++++++++++-----------
 2 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/examples/sgemm_interop/sycl_sgemm.cpp b/examples/sgemm_interop/sycl_sgemm.cpp
index e1b54b2..eb728aa 100644
--- a/examples/sgemm_interop/sycl_sgemm.cpp
+++ b/examples/sgemm_interop/sycl_sgemm.cpp
@@ -86,6 +86,8 @@ int main() {
     auto d_C = b_C.get_access(h);
 
     h.host_task([=](sycl::interop_handle ih) {
+      // Set the correct cuda context & stream
+      cuCtxSetCurrent(ih.get_native_context());
       auto cuStream = ih.get_native_queue();
       cublasSetStream(handle, cuStream);
       auto cuA = reinterpret_cast(
diff --git a/examples/sgemm_interop/sycl_sgemm_usm.cpp b/examples/sgemm_interop/sycl_sgemm_usm.cpp
index f658872..235bbd1 100644
--- a/examples/sgemm_interop/sycl_sgemm_usm.cpp
+++ b/examples/sgemm_interop/sycl_sgemm_usm.cpp
@@ -27,8 +27,8 @@
 
 #include
 
-#include
 #include
+#include
 
 #define CHECK_ERROR(FUNC) checkCudaErrorMsg(FUNC, " " #FUNC)
@@ -46,7 +46,6 @@ void inline checkCudaErrorMsg(cudaError status, const char *msg) {
   }
 }
 
-
 int main() {
   using namespace sycl;
 
@@ -69,12 +68,14 @@ int main() {
   // B is a matrix fill with 1
   std::fill(std::begin(h_B), std::end(h_B), 1.0f);
 
-  sycl::queue q{[](auto& d) { return (d.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda); }};
+  sycl::queue q{[](auto &d) {
+    return (d.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda);
+  }};
 
   // Allocate memory on the device
-  float* d_A = sycl::malloc_device(WIDTH*HEIGHT,q);
-  float* d_B = sycl::malloc_device(WIDTH*HEIGHT,q);
-  float* d_C = sycl::malloc_device(WIDTH*HEIGHT,q);
+  float *d_A = sycl::malloc_device(WIDTH * HEIGHT, q);
+  float *d_B = sycl::malloc_device(WIDTH * HEIGHT, q);
+  float *d_C = sycl::malloc_device(WIDTH * HEIGHT, q);
 
   // Copy matrices A & B to device from host vectors
   const size_t numBytes = WIDTH * HEIGHT * sizeof(float);
@@ -86,19 +87,19 @@ int main() {
   CHECK_ERROR(cublasCreate(&handle));
 
   q.submit([&](handler &h) {
-
-    h.host_task([=](sycl::interop_handle ih) {
-      // Set the correct cuda context & stream
-      auto cuStream = ih.get_native_queue();
-      cublasSetStream(handle, cuStream);
-
-      // Call generalised matrix-matrix multiply
-      CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, HEIGHT,
-                              WIDTH, &ALPHA, d_A, WIDTH, d_B, WIDTH, &BETA,
-                              d_C, WIDTH));
-      cuStreamSynchronize(cuStream);
-    });
-  }).wait();
+    h.host_task([=](sycl::interop_handle ih) {
+      // Set the correct cuda context & stream
+      cuCtxSetCurrent(ih.get_native_context());
+      auto cuStream = ih.get_native_queue();
+      cublasSetStream(handle, cuStream);
+
+      // Call generalised matrix-matrix multiply
+      CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, HEIGHT,
+                              WIDTH, &ALPHA, d_A, WIDTH, d_B, WIDTH, &BETA,
+                              d_C, WIDTH));
+      cuStreamSynchronize(cuStream);
+    });
+  }).wait();
 
   // Copy the result back to host
   q.memcpy(h_C.data(), d_C, numBytes).wait();

From a06022bcfe44a795fc10b372717abe837db436b8 Mon Sep 17 00:00:00 2001
From: Ruyman Reyes Castro
Date: Wed, 19 Apr 2023 17:23:49 +0100
Subject: [PATCH 7/7] Addressing feedback from Gordon

---
 examples/vector_addition/vector_addition.cpp | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/examples/vector_addition/vector_addition.cpp b/examples/vector_addition/vector_addition.cpp
index 98a8143..5e92d94 100644
--- a/examples/vector_addition/vector_addition.cpp
+++ b/examples/vector_addition/vector_addition.cpp
@@ -34,9 +34,6 @@ int main(int argc, char *argv[]) {
 
   // Initialize input data
   {
-    const auto dwrite_t = sycl::write_only;
-
-
     sycl::host_accessor h_a{bufA, sycl::write_only};
     sycl::host_accessor h_b{bufB, sycl::write_only};
 
@@ -65,8 +62,7 @@
     auto b = bufB.get_access(h);
     auto c = bufC.get_access(h);
 
-    h.parallel_for(VecSize,
-                   [=](sycl::id<1> i) { c[i] = a[i] + b[i]; });
+    h.parallel_for(VecSize, [=](sycl::id<1> i) { c[i] = a[i] + b[i]; });
   };
 
   myQueue.submit(cg);
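
For context, after this series the vector-addition example uses SYCL 2020
host_accessors on both the initialization and read-back side. The sketch below
is a self-contained illustration of that flow only; the device selection,
problem size and kernel body are simplified stand-ins rather than a copy of
the example source.

    #include <sycl/sycl.hpp>

    #include <cmath>
    #include <iostream>

    int main() {
      constexpr size_t N = 1024;

      sycl::queue q{sycl::default_selector_v};
      sycl::buffer<double> bufA{sycl::range<1>{N}};
      sycl::buffer<double> bufB{sycl::range<1>{N}};
      sycl::buffer<double> bufC{sycl::range<1>{N}};

      { // Initialize input data through writable host_accessors.
        sycl::host_accessor h_a{bufA, sycl::write_only};
        sycl::host_accessor h_b{bufB, sycl::write_only};
        for (size_t i = 0; i < N; i++) {
          h_a[i] = std::sin(i) * std::sin(i);
          h_b[i] = std::cos(i) * std::cos(i);
        }
      }

      q.submit([&](sycl::handler &h) {
        sycl::accessor a{bufA, h, sycl::read_only};
        sycl::accessor b{bufB, h, sycl::read_only};
        sycl::accessor c{bufC, h, sycl::write_only};
        h.parallel_for(sycl::range<1>{N},
                       [=](sycl::id<1> i) { c[i] = a[i] + b[i]; });
      });

      double sum = 0.0;
      { // Reading through a host_accessor waits for the kernel to finish.
        sycl::host_accessor h_c{bufC, sycl::read_only};
        for (size_t i = 0; i < N; i++) {
          sum += h_c[i];
        }
      }
      // sin^2(i) + cos^2(i) == 1, so the sum should be approximately N.
      std::cout << "Sum = " << sum << " (expected " << N << ")\n";
      return 0;
    }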