From 245ccd02ce961ad2ba6f6cbf7d9783d269bf8dc9 Mon Sep 17 00:00:00 2001
From: Ruyman Reyes Castro
Date: Thu, 9 Feb 2023 16:44:52 +0000
Subject: [PATCH 1/7] Ignoring VIM temporary files

---
 .gitignore | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 55f356e..8fe35b5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -34,4 +34,7 @@
 # Temporaries
 *~
 *#
-*/build
\ No newline at end of file
+*/build
+]
+# vim
+*.swp

From 0c886cb3a3281da186100f41155a660b6943931a Mon Sep 17 00:00:00 2001
From: Ruyman Reyes Castro
Date: Thu, 9 Feb 2023 17:16:34 +0000
Subject: [PATCH 2/7] Vector Addition example update

Works with the latest DPC++ and SYCL 2020 features.
README updated to reflect that the CUDA backend now has USM support.
---
 examples/vector_addition/README.md           |  3 ---
 examples/vector_addition/vector_addition.cpp | 12 +++++++-----
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/examples/vector_addition/README.md b/examples/vector_addition/README.md
index b82357a..133fc51 100644
--- a/examples/vector_addition/README.md
+++ b/examples/vector_addition/README.md
@@ -7,9 +7,6 @@
 to highlight how to build an application with SYCL for CUDA using DPC++ support,
 for which an example CMakefile is provided.
 For detailed documentation on how to migrate from CUDA to SYCL, see
 [SYCL For CUDA Developers](https://developer.codeplay.com/products/computecpp/ce/guides/sycl-for-cuda-developers).
 
-Note currently the CUDA backend does not support the [USM](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc) extension, so we use
-`sycl::buffer` and `sycl::accessors` instead.
-
 Pre-requisites
 ---------------

diff --git a/examples/vector_addition/vector_addition.cpp b/examples/vector_addition/vector_addition.cpp
index 9fdaf87..98a8143 100644
--- a/examples/vector_addition/vector_addition.cpp
+++ b/examples/vector_addition/vector_addition.cpp
@@ -34,10 +34,12 @@ int main(int argc, char *argv[]) {
 
   // Initialize input data
   {
-    const auto dwrite_t = sycl::access::mode::discard_write;
+    const auto dwrite_t = sycl::write_only;
+
+
+    sycl::host_accessor h_a{bufA, sycl::write_only};
+    sycl::host_accessor h_b{bufB, sycl::write_only};
 
-    auto h_a = bufA.get_access();
-    auto h_b = bufB.get_access();
     for (int i = 0; i < N; i++) {
       h_a[i] = sin(i) * sin(i);
       h_b[i] = cos(i) * cos(i);
@@ -70,8 +72,8 @@
   myQueue.submit(cg);
 
   {
-    const auto read_t = sycl::access::mode::read;
-    auto h_c = bufC.get_access();
+    sycl::host_accessor h_c{bufC, sycl::read_only};
+
     double sum = 0.0f;
     for (int i = 0; i < N; i++) {
       sum += h_c[i];

From 283e69d46dccd91d4324f3c0bfbde4d391f2de30 Mon Sep 17 00:00:00 2001
From: Ruyman Reyes Castro
Date: Mon, 13 Feb 2023 11:59:36 +0000
Subject: [PATCH 3/7] Removing unnecessary CUDA Driver types

Since https://github.com/intel/llvm/pull/8197, the SYCL CUDA backend uses the
CUDA primary context by default, so setting the context individually is no
longer required.
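
For reference, a minimal self-contained sketch of the resulting interop flow
follows. It is illustrative only and not part of the diff below: the header
names, the ext_oneapi_cuda backend template arguments, the float* casts and
the omission of error checking are assumptions about the DPC++ CUDA backend
rather than code taken from these examples.

    #include <sycl/sycl.hpp>

    #include <cublas_v2.h>
    #include <cuda.h>

    #include <vector>

    int main() {
      constexpr int N = 16;
      std::vector<float> h_A(N * N, 1.0f), h_B(N * N, 1.0f), h_C(N * N, 0.0f);
      float alpha = 1.0f, beta = 0.0f;

      sycl::queue q{sycl::gpu_selector_v};
      cublasHandle_t handle;
      cublasCreate(&handle);

      {
        sycl::buffer<float> b_A{h_A.data(), sycl::range<1>{h_A.size()}};
        sycl::buffer<float> b_B{h_B.data(), sycl::range<1>{h_B.size()}};
        sycl::buffer<float> b_C{h_C.data(), sycl::range<1>{h_C.size()}};

        q.submit([&](sycl::handler &h) {
           auto d_A = b_A.get_access<sycl::access::mode::read>(h);
           auto d_B = b_B.get_access<sycl::access::mode::read>(h);
           auto d_C = b_C.get_access<sycl::access::mode::write>(h);

           h.host_task([=](sycl::interop_handle ih) {
             // The queue's CUDA (primary) context is already active for this
             // host task, so no explicit context switch is performed here.
             auto cuStream =
                 ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
             cublasSetStream(handle, cuStream);
             auto cuA = reinterpret_cast<float *>(
                 ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(d_A));
             auto cuB = reinterpret_cast<float *>(
                 ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(d_B));
             auto cuC = reinterpret_cast<float *>(
                 ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(d_C));
             // C = alpha * A * B + beta * C on the stream backing the queue.
             cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha,
                         cuA, N, cuB, N, &beta, cuC, N);
             cuStreamSynchronize(cuStream);
           });
         }).wait();
      } // b_C's destructor copies the result back into h_C.

      cublasDestroy(handle);
      return 0;
    }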
---
 examples/sgemm_interop/sycl_sgemm.cpp     | 10 +++++-----
 examples/sgemm_interop/sycl_sgemm_usm.cpp | 11 ++++-------
 2 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/examples/sgemm_interop/sycl_sgemm.cpp b/examples/sgemm_interop/sycl_sgemm.cpp
index 0e90beb..119368f 100644
--- a/examples/sgemm_interop/sycl_sgemm.cpp
+++ b/examples/sgemm_interop/sycl_sgemm.cpp
@@ -25,11 +25,10 @@
 #include
 #include
 
-#include
-#include
+#include
 
-#include
 #include
+#include
 
 #define CHECK_ERROR(FUNC) checkCudaErrorMsg(FUNC, " " #FUNC)
@@ -47,12 +46,14 @@ void inline checkCudaErrorMsg(cudaError status, const char *msg) {
   }
 }
 
-void inline checkCudaErrorMsg(CUresult status, const char *msg) {
+/*
+ * void inline checkCudaErrorMsg(CUresult status, const char *msg) {
   if (status != CUDA_SUCCESS) {
     std::cout << "ERROR CUDA: " << msg << " - " << status << std::endl;
     exit(EXIT_FAILURE);
   }
 }
+*/
 
 class CUDASelector : public sycl::device_selector {
 public:
@@ -104,7 +105,6 @@ int main() {
     auto d_C = b_C.get_access(h);
 
     h.host_task([=](sycl::interop_handle ih) {
-      cuCtxSetCurrent(ih.get_native_context());
       auto cuStream = ih.get_native_queue();
       cublasSetStream(handle, cuStream);
       auto cuA = reinterpret_cast(ih.get_native_mem(d_A));
diff --git a/examples/sgemm_interop/sycl_sgemm_usm.cpp b/examples/sgemm_interop/sycl_sgemm_usm.cpp
index 2468bf4..617afd7 100644
--- a/examples/sgemm_interop/sycl_sgemm_usm.cpp
+++ b/examples/sgemm_interop/sycl_sgemm_usm.cpp
@@ -25,11 +25,10 @@
 #include
 #include
 
-#include
-#include
+#include
 
-#include
 #include
+#include
 
 #define CHECK_ERROR(FUNC) checkCudaErrorMsg(FUNC, " " #FUNC)
@@ -47,12 +46,12 @@ void inline checkCudaErrorMsg(cudaError status, const char *msg) {
   }
 }
 
-void inline checkCudaErrorMsg(CUresult status, const char *msg) {
+/* void inline checkCudaErrorMsg(CUresult status, const char *msg) {
   if (status != CUDA_SUCCESS) {
     std::cout << "ERROR CUDA: " << msg << " - " << status << std::endl;
     exit(EXIT_FAILURE);
   }
-}
+} */
 
 class CUDASelector : public sycl::device_selector {
 public:
@@ -107,9 +106,7 @@ int main() {
 
   q.submit([&](handler &h) {
 
     h.host_task([=](sycl::interop_handle ih) {
-
       // Set the correct cuda context & stream
-      cuCtxSetCurrent(ih.get_native_context());
       auto cuStream = ih.get_native_queue();
       cublasSetStream(handle, cuStream);

From 0d4facbd129df94339582227fcb5e9c0798e6a30 Mon Sep 17 00:00:00 2001
From: Ruyman Reyes Castro
Date: Mon, 13 Feb 2023 12:09:34 +0000
Subject: [PATCH 4/7] Using modern queue construction

SYCL 1.2.1 device selectors have been deprecated in favour of a new
simplified form using lambdas.
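
As an illustration of the new form (hypothetical code, not part of the diff
below), any callable that takes a const sycl::device& and returns an int can
be passed straight to the queue constructor; the device with the highest
non-negative score is chosen and a negative score rejects the device:

    #include <sycl/sycl.hpp>

    #include <iostream>

    // Standalone selector similar to the lambda used in the examples.
    int cuda_selector(const sycl::device &d) {
      return d.get_backend() == sycl::backend::ext_oneapi_cuda ? 1 : -1;
    }

    int main() {
      // Throws a sycl::exception if no device scores zero or higher,
      // i.e. if no CUDA device is available.
      sycl::queue q{cuda_selector};
      std::cout << "Running on "
                << q.get_device().get_info<sycl::info::device::name>() << "\n";
      return 0;
    }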
---
 examples/sgemm_interop/sycl_sgemm.cpp     | 23 +----------------------
 examples/sgemm_interop/sycl_sgemm_usm.cpp | 20 +-------------------
 2 files changed, 2 insertions(+), 41 deletions(-)

diff --git a/examples/sgemm_interop/sycl_sgemm.cpp b/examples/sgemm_interop/sycl_sgemm.cpp
index 119368f..14bbd02 100644
--- a/examples/sgemm_interop/sycl_sgemm.cpp
+++ b/examples/sgemm_interop/sycl_sgemm.cpp
@@ -46,27 +46,6 @@ void inline checkCudaErrorMsg(cudaError status, const char *msg) {
   }
 }
 
-/*
- * void inline checkCudaErrorMsg(CUresult status, const char *msg) {
-  if (status != CUDA_SUCCESS) {
-    std::cout << "ERROR CUDA: " << msg << " - " << status << std::endl;
-    exit(EXIT_FAILURE);
-  }
-}
-*/
-
-class CUDASelector : public sycl::device_selector {
-public:
-  int operator()(const sycl::device &device) const override {
-    if(device.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda){
-      std::cout << " CUDA device found " << std::endl;
-      return 1;
-    } else{
-      return -1;
-    }
-  }
-};
-
 int main() {
   using namespace sycl;
 
@@ -89,7 +68,7 @@ int main() {
   // B is a matrix fill with 1
   std::fill(std::begin(h_B), std::end(h_B), 1.0f);
 
-  sycl::queue q{CUDASelector()};
+  sycl::queue q{[](auto& d) { return (d.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda); }};
 
   cublasHandle_t handle;
   CHECK_ERROR(cublasCreate(&handle));
diff --git a/examples/sgemm_interop/sycl_sgemm_usm.cpp b/examples/sgemm_interop/sycl_sgemm_usm.cpp
index 617afd7..f658872 100644
--- a/examples/sgemm_interop/sycl_sgemm_usm.cpp
+++ b/examples/sgemm_interop/sycl_sgemm_usm.cpp
@@ -46,24 +46,6 @@ void inline checkCudaErrorMsg(cudaError status, const char *msg) {
   }
 }
 
-/* void inline checkCudaErrorMsg(CUresult status, const char *msg) {
-  if (status != CUDA_SUCCESS) {
-    std::cout << "ERROR CUDA: " << msg << " - " << status << std::endl;
-    exit(EXIT_FAILURE);
-  }
-} */
-
-class CUDASelector : public sycl::device_selector {
-public:
-  int operator()(const sycl::device &device) const override {
-    if(device.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda){
-      std::cout << " CUDA device found " << std::endl;
-      return 1;
-    } else{
-      return -1;
-    }
-  }
-};
 
 int main() {
   using namespace sycl;
@@ -87,7 +69,7 @@ int main() {
   // B is a matrix fill with 1
   std::fill(std::begin(h_B), std::end(h_B), 1.0f);
 
-  sycl::queue q{CUDASelector()};
+  sycl::queue q{[](auto& d) { return (d.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda); }};
 
   // Allocate memory on the device
   float* d_A = sycl::malloc_device(WIDTH*HEIGHT,q);

From 6cfee58aff8960c0c918ab5731fc482919469369 Mon Sep 17 00:00:00 2001
From: Ruyman Reyes Castro
Date: Mon, 13 Feb 2023 12:12:14 +0000
Subject: [PATCH 5/7] Format files

Run clang-format on files, separate commit to avoid noise
---
 examples/sgemm_interop/sycl_sgemm.cpp | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/examples/sgemm_interop/sycl_sgemm.cpp b/examples/sgemm_interop/sycl_sgemm.cpp
index 14bbd02..e1b54b2 100644
--- a/examples/sgemm_interop/sycl_sgemm.cpp
+++ b/examples/sgemm_interop/sycl_sgemm.cpp
@@ -27,8 +27,8 @@
 
 #include
 
-#include
 #include
+#include
 
 #define CHECK_ERROR(FUNC) checkCudaErrorMsg(FUNC, " " #FUNC)
@@ -68,7 +68,9 @@ int main() {
   // B is a matrix fill with 1
   std::fill(std::begin(h_B), std::end(h_B), 1.0f);
 
-  sycl::queue q{[](auto& d) { return (d.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda); }};
+  sycl::queue q{[](auto &d) {
+    return (d.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda);
+  }};
 
   cublasHandle_t handle;
   CHECK_ERROR(cublasCreate(&handle));
@@ -86,9 +88,12 @@ int main() {
     h.host_task([=](sycl::interop_handle ih) {
       auto cuStream = ih.get_native_queue();
       cublasSetStream(handle, cuStream);
-      auto cuA = reinterpret_cast(ih.get_native_mem(d_A));
-      auto cuB = reinterpret_cast(ih.get_native_mem(d_B));
-      auto cuC = reinterpret_cast(ih.get_native_mem(d_C));
+      auto cuA = reinterpret_cast(
+          ih.get_native_mem(d_A));
+      auto cuB = reinterpret_cast(
+          ih.get_native_mem(d_B));
+      auto cuC = reinterpret_cast(
+          ih.get_native_mem(d_C));
 
       CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, HEIGHT,
                               WIDTH, &ALPHA, cuA, WIDTH, cuB, WIDTH, &BETA,

From ce4abc22fef41f421df8d27d92156599d9e08f3f Mon Sep 17 00:00:00 2001
From: Ruyman Reyes Castro
Date: Wed, 19 Apr 2023 17:13:35 +0100
Subject: [PATCH 6/7] Explicitly setting CUDA context on host task

Because of the changes to the SYCL context, it is now necessary to set the
active CUDA context manually inside the host task.

Note there was some clang-formatting here as well.
---
 examples/sgemm_interop/sycl_sgemm.cpp     |  2 ++
 examples/sgemm_interop/sycl_sgemm_usm.cpp | 39 ++++++++++++-----------
 2 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/examples/sgemm_interop/sycl_sgemm.cpp b/examples/sgemm_interop/sycl_sgemm.cpp
index e1b54b2..eb728aa 100644
--- a/examples/sgemm_interop/sycl_sgemm.cpp
+++ b/examples/sgemm_interop/sycl_sgemm.cpp
@@ -86,6 +86,8 @@ int main() {
     auto d_C = b_C.get_access(h);
 
     h.host_task([=](sycl::interop_handle ih) {
+      // Set the correct cuda context & stream
+      cuCtxSetCurrent(ih.get_native_context());
       auto cuStream = ih.get_native_queue();
       cublasSetStream(handle, cuStream);
       auto cuA = reinterpret_cast(
diff --git a/examples/sgemm_interop/sycl_sgemm_usm.cpp b/examples/sgemm_interop/sycl_sgemm_usm.cpp
index f658872..235bbd1 100644
--- a/examples/sgemm_interop/sycl_sgemm_usm.cpp
+++ b/examples/sgemm_interop/sycl_sgemm_usm.cpp
@@ -27,8 +27,8 @@
 
 #include
 
-#include
 #include
+#include
 
 #define CHECK_ERROR(FUNC) checkCudaErrorMsg(FUNC, " " #FUNC)
@@ -46,7 +46,6 @@ void inline checkCudaErrorMsg(cudaError status, const char *msg) {
   }
 }
 
-
 int main() {
   using namespace sycl;
 
@@ -69,12 +68,14 @@ int main() {
   // B is a matrix fill with 1
   std::fill(std::begin(h_B), std::end(h_B), 1.0f);
 
-  sycl::queue q{[](auto& d) { return (d.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda); }};
+  sycl::queue q{[](auto &d) {
+    return (d.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda);
+  }};
 
   // Allocate memory on the device
-  float* d_A = sycl::malloc_device(WIDTH*HEIGHT,q);
-  float* d_B = sycl::malloc_device(WIDTH*HEIGHT,q);
-  float* d_C = sycl::malloc_device(WIDTH*HEIGHT,q);
+  float *d_A = sycl::malloc_device(WIDTH * HEIGHT, q);
+  float *d_B = sycl::malloc_device(WIDTH * HEIGHT, q);
+  float *d_C = sycl::malloc_device(WIDTH * HEIGHT, q);
 
   // Copy matrices A & B to device from host vectors
   const size_t numBytes = WIDTH * HEIGHT * sizeof(float);
@@ -86,19 +87,19 @@ int main() {
   CHECK_ERROR(cublasCreate(&handle));
 
   q.submit([&](handler &h) {
-
-    h.host_task([=](sycl::interop_handle ih) {
-      // Set the correct cuda context & stream
-      auto cuStream = ih.get_native_queue();
-      cublasSetStream(handle, cuStream);
-
-      // Call generalised matrix-matrix multiply
-      CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, HEIGHT,
-                              WIDTH, &ALPHA, d_A, WIDTH, d_B, WIDTH, &BETA,
-                              d_C, WIDTH));
-      cuStreamSynchronize(cuStream);
-    });
-  }).wait();
+    h.host_task([=](sycl::interop_handle ih) {
+      // Set the correct cuda context & stream
+      cuCtxSetCurrent(ih.get_native_context());
+      auto cuStream = ih.get_native_queue();
+      cublasSetStream(handle, cuStream);
+
+      // Call generalised matrix-matrix multiply
+      CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, HEIGHT,
+                              WIDTH, &ALPHA, d_A, WIDTH, d_B, WIDTH, &BETA,
+                              d_C, WIDTH));
+      cuStreamSynchronize(cuStream);
+    });
+  }).wait();
 
   // Copy the result back to host
   q.memcpy(h_C.data(), d_C, numBytes).wait();

From a06022bcfe44a795fc10b372717abe837db436b8 Mon Sep 17 00:00:00 2001
From: Ruyman Reyes Castro
Date: Wed, 19 Apr 2023 17:23:49 +0100
Subject: [PATCH 7/7] Addressing feedback from Gordon

---
 examples/vector_addition/vector_addition.cpp | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/examples/vector_addition/vector_addition.cpp b/examples/vector_addition/vector_addition.cpp
index 98a8143..5e92d94 100644
--- a/examples/vector_addition/vector_addition.cpp
+++ b/examples/vector_addition/vector_addition.cpp
@@ -34,9 +34,6 @@ int main(int argc, char *argv[]) {
 
   // Initialize input data
   {
-    const auto dwrite_t = sycl::write_only;
-
-
     sycl::host_accessor h_a{bufA, sycl::write_only};
     sycl::host_accessor h_b{bufB, sycl::write_only};
 
@@ -65,8 +62,7 @@
     auto b = bufB.get_access(h);
     auto c = bufC.get_access(h);
 
-    h.parallel_for(VecSize,
-                   [=](sycl::id<1> i) { c[i] = a[i] + b[i]; });
+    h.parallel_for(VecSize, [=](sycl::id<1> i) { c[i] = a[i] + b[i]; });
   };
 
   myQueue.submit(cg);
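
For context, after this series the vector-addition example uses SYCL 2020
host_accessors on both the initialization and read-back side. The sketch below
is a self-contained illustration of that flow only; the device selection,
problem size and kernel body are simplified stand-ins rather than a copy of
the example source.

    #include <sycl/sycl.hpp>

    #include <cmath>
    #include <iostream>

    int main() {
      constexpr size_t N = 1024;

      sycl::queue q{sycl::default_selector_v};
      sycl::buffer<double> bufA{sycl::range<1>{N}};
      sycl::buffer<double> bufB{sycl::range<1>{N}};
      sycl::buffer<double> bufC{sycl::range<1>{N}};

      { // Initialize input data through writable host_accessors.
        sycl::host_accessor h_a{bufA, sycl::write_only};
        sycl::host_accessor h_b{bufB, sycl::write_only};
        for (size_t i = 0; i < N; i++) {
          h_a[i] = std::sin(i) * std::sin(i);
          h_b[i] = std::cos(i) * std::cos(i);
        }
      }

      q.submit([&](sycl::handler &h) {
        sycl::accessor a{bufA, h, sycl::read_only};
        sycl::accessor b{bufB, h, sycl::read_only};
        sycl::accessor c{bufC, h, sycl::write_only};
        h.parallel_for(sycl::range<1>{N},
                       [=](sycl::id<1> i) { c[i] = a[i] + b[i]; });
      });

      double sum = 0.0;
      { // Reading through a host_accessor waits for the kernel to finish.
        sycl::host_accessor h_c{bufC, sycl::read_only};
        for (size_t i = 0; i < N; i++) {
          sum += h_c[i];
        }
      }
      // sin^2(i) + cos^2(i) == 1, so the sum should be approximately N.
      std::cout << "Sum = " << sum << " (expected " << N << ")\n";
      return 0;
    }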