From 7944d20051e8c99bcd64ec69c798c61f1a7f4ccf Mon Sep 17 00:00:00 2001 From: m moadeli Date: Thu, 8 Jun 2023 14:07:19 +0100 Subject: [PATCH 01/11] Improve guessLocalWorkSize to avoid poor performance when having prime number for range dimensions. --- sycl/plugins/cuda/pi_cuda.cpp | 46 ++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index e7957a139a6ea..43459ef8349f1 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -295,6 +295,20 @@ int getAttribute(pi_device device, CUdevice_attribute attribute) { } /// \endcond +bool isPrime(size_t number) { + if (number < 2) + return false; + if (number == 2) + return true; + if (number % 2 == 0) + return false; + for (int i = 3; (i * i) <= number; i += 2) { + if (number % i == 0) + return false; + } + return true; +} + // Determine local work sizes that result in uniform work groups. // The default threadsPerBlock only require handling the first work_dim // dimension. 
@@ -305,29 +319,37 @@ void guessLocalWorkSize(_pi_device *device, size_t *threadsPerBlock, assert(threadsPerBlock != nullptr); assert(global_work_size != nullptr); assert(kernel != nullptr); - int minGrid, maxBlockSize, gridDim[3]; + int minGrid, maxBlockSize, maxBlockDim[3]; - cuDeviceGetAttribute(&gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, + cuDeviceGetAttribute(&maxBlockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, device->get()); - cuDeviceGetAttribute(&gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, + cuDeviceGetAttribute(&maxBlockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, device->get()); - threadsPerBlock[1] = ((global_work_size[1] - 1) / gridDim[1]) + 1; - threadsPerBlock[2] = ((global_work_size[2] - 1) / gridDim[2]) + 1; - PI_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize( &minGrid, &maxBlockSize, kernel->get(), NULL, local_size, maxThreadsPerBlock[0])); - gridDim[0] = maxBlockSize / (threadsPerBlock[1] * threadsPerBlock[2]); - - threadsPerBlock[0] = - std::min(maxThreadsPerBlock[0], - std::min(global_work_size[0], static_cast(gridDim[0]))); + threadsPerBlock[2] = std::min(global_work_size[2], size_t(maxBlockDim[2])); + threadsPerBlock[1] = + std::min(global_work_size[1], std::min(maxBlockSize / threadsPerBlock[2], + size_t(maxBlockDim[1]))); + maxBlockDim[0] = maxBlockSize / (threadsPerBlock[1] * threadsPerBlock[2]); + threadsPerBlock[0] = std::min( + maxThreadsPerBlock[0], + std::min(global_work_size[0], static_cast(maxBlockDim[0]))); + + // When global_work_size[0] is prime threadPerBlock[0] will later computed as + // 1, which is not efficient configuration. In such case we use + // global_work_size[0] to compute threadPerBlock[0]. + int x_global_work_size = (isPrime(global_work_size[0]) && + (threadsPerBlock[0] != global_work_size[0])) + ? global_work_size[0] + 1 + : global_work_size[0]; // Find a local work group size that is a divisor of the global // work group size to produce uniform work groups. 
- while (0u != (global_work_size[0] % threadsPerBlock[0])) { + while (0u != (x_global_work_size % threadsPerBlock[0])) { --threadsPerBlock[0]; } } From 2962217c3816689a044c086121b8f32ac7a5e93b Mon Sep 17 00:00:00 2001 From: m moadeli Date: Thu, 8 Jun 2023 14:37:42 +0100 Subject: [PATCH 02/11] Address merge conflicts --- sycl/plugins/cuda/pi_cuda.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 4dc8357e8c9c1..9a5dd5710f55a 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -353,7 +353,8 @@ void guessLocalWorkSize(_pi_device *device, size_t *threadsPerBlock, // Find a local work group size that is a divisor of the global // work group size to produce uniform work groups. - while (0u != (x_global_work_size % threadsPerBlock[0]) || !isPowerOf2(threadsPerBlock[0]) { + while (0u != (x_global_work_size % threadsPerBlock[0]) || + !isPowerOf2(threadsPerBlock[0]) { --threadsPerBlock[0]; } } @@ -649,8 +650,8 @@ _pi_event::_pi_event(pi_context context, CUevent eventNative) : commandType_{PI_COMMAND_TYPE_USER}, refCount_{1}, has_ownership_{false}, hasBeenWaitedOn_{false}, isRecorded_{false}, isStarted_{false}, streamToken_{std::numeric_limits::max()}, evEnd_{eventNative}, - evStart_{nullptr}, evQueued_{nullptr}, queue_{nullptr}, - context_{context} { + evStart_{nullptr}, evQueued_{nullptr}, queue_{nullptr}, context_{ + context} { cuda_piContextRetain(context_); } @@ -2180,7 +2181,8 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name, cuDeviceGetPCIBusId(AddressBuffer, AddressBufferSize, device->get()) == CUDA_SUCCESS); // CUDA API (8.x - 12.1) guarantees 12 bytes + \0 are written - sycl::detail::pi::assertion(strnlen(AddressBuffer, AddressBufferSize) == 12); + sycl::detail::pi::assertion(strnlen(AddressBuffer, AddressBufferSize) == + 12); return getInfoArray(strnlen(AddressBuffer, AddressBufferSize - 1) + 1, 
param_value_size, param_value, param_value_size_ret, AddressBuffer); From d7f1e4c21ddb06f35b4ac2d0dd21e873af92e2b4 Mon Sep 17 00:00:00 2001 From: m moadeli Date: Thu, 8 Jun 2023 14:42:02 +0100 Subject: [PATCH 03/11] Fix a mistake in comment. --- .gitignore | 1 + sycl/plugins/cuda/pi_cuda.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 14482c643868c..c6a76c6512dac 100644 --- a/.gitignore +++ b/.gitignore @@ -28,6 +28,7 @@ # Nested build directory /build* +/sycl/test-e2e/build* !buildbot #==============================================================================# diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 9a5dd5710f55a..b5f086d6c0896 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -341,7 +341,7 @@ void guessLocalWorkSize(_pi_device *device, size_t *threadsPerBlock, // When global_work_size[0] is prime threadPerBlock[0] will later computed as // 1, which is not efficient configuration. In such case we use - // global_work_size[0] to compute threadPerBlock[0]. + // global_work_size[0] + 1 to compute threadPerBlock[0]. int x_global_work_size = (isPrime(global_work_size[0]) && (threadsPerBlock[0] != global_work_size[0])) ? global_work_size[0] + 1 From c3939b81e61249d17a646259a2a11cb75f4b61b7 Mon Sep 17 00:00:00 2001 From: m moadeli Date: Thu, 8 Jun 2023 14:43:32 +0100 Subject: [PATCH 04/11] Revert change to .gitignore. --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index c6a76c6512dac..14482c643868c 100644 --- a/.gitignore +++ b/.gitignore @@ -28,7 +28,6 @@ # Nested build directory /build* -/sycl/test-e2e/build* !buildbot #==============================================================================# From 3b3f7f8231f895cfb3e7bbabeb3ea55fea71156b Mon Sep 17 00:00:00 2001 From: m moadeli Date: Thu, 8 Jun 2023 14:54:13 +0100 Subject: [PATCH 05/11] Reverts removed comment. 
--- sycl/plugins/cuda/pi_cuda.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index b5f086d6c0896..6f9ba9cb1f485 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -353,6 +353,8 @@ void guessLocalWorkSize(_pi_device *device, size_t *threadsPerBlock, // Find a local work group size that is a divisor of the global // work group size to produce uniform work groups. + // Additionally, for best compute utilisation, the local size has + // to be a power of two. while (0u != (x_global_work_size % threadsPerBlock[0]) || !isPowerOf2(threadsPerBlock[0]) { --threadsPerBlock[0]; From 073fd4a41f3273d7aa7ce9cc036d2697a8337552 Mon Sep 17 00:00:00 2001 From: m moadeli Date: Thu, 8 Jun 2023 16:05:14 +0100 Subject: [PATCH 06/11] Fix code style --- sycl/plugins/cuda/pi_cuda.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 6f9ba9cb1f485..98b7a4cadad08 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -652,8 +652,8 @@ _pi_event::_pi_event(pi_context context, CUevent eventNative) : commandType_{PI_COMMAND_TYPE_USER}, refCount_{1}, has_ownership_{false}, hasBeenWaitedOn_{false}, isRecorded_{false}, isStarted_{false}, streamToken_{std::numeric_limits::max()}, evEnd_{eventNative}, - evStart_{nullptr}, evQueued_{nullptr}, queue_{nullptr}, context_{ - context} { + evStart_{nullptr}, evQueued_{nullptr}, queue_{nullptr}, + context_{context} { cuda_piContextRetain(context_); } From e3f83960beec5c57f41d1c19d97f78f17754e99f Mon Sep 17 00:00:00 2001 From: m moadeli Date: Thu, 8 Jun 2023 21:03:17 +0100 Subject: [PATCH 07/11] Add missing parentheses --- sycl/plugins/cuda/pi_cuda.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 98b7a4cadad08..d79259c862489 100644 --- 
a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -355,8 +355,8 @@ void guessLocalWorkSize(_pi_device *device, size_t *threadsPerBlock, // work group size to produce uniform work groups. // Additionally, for best compute utilisation, the local size has // to be a power of two. - while (0u != (x_global_work_size % threadsPerBlock[0]) || - !isPowerOf2(threadsPerBlock[0]) { + while (0u != (x_global_work_size % threadsPerBlock[0]) || + !isPowerOf2(threadsPerBlock[0])) { --threadsPerBlock[0]; } } From e8787445c67bededc06599271ebc9020d54e6fc3 Mon Sep 17 00:00:00 2001 From: m moadeli Date: Mon, 12 Jun 2023 10:38:22 +0100 Subject: [PATCH 08/11] Define isPrime as a lambda. --- sycl/plugins/cuda/pi_cuda.cpp | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index d79259c862489..e50dbe0f8e8b6 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -295,20 +295,6 @@ int getAttribute(pi_device device, CUdevice_attribute attribute) { } /// \endcond -bool isPrime(size_t number) { - if (number < 2) - return false; - if (number == 2) - return true; - if (number % 2 == 0) - return false; - for (int i = 3; (i * i) <= number; i += 2) { - if (number % i == 0) - return false; - } - return true; -} - // Determine local work sizes that result in uniform work groups. // The default threadsPerBlock only require handling the first work_dim // dimension. 
@@ -321,6 +307,20 @@ void guessLocalWorkSize(_pi_device *device, size_t *threadsPerBlock, assert(kernel != nullptr); int minGrid, maxBlockSize, maxBlockDim[3]; + static auto isPrime = [](size_t number) -> bool { + if (number < 2) + return false; + if (number == 2) + return true; + if (number % 2 == 0) + return false; + for (int i = 3; (i * i) <= number; i += 2) { + if (number % i == 0) + return false; + } + return true; + }; + cuDeviceGetAttribute(&maxBlockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, device->get()); cuDeviceGetAttribute(&maxBlockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, From 2bf6505f6e447163466205b125be11c4e595526f Mon Sep 17 00:00:00 2001 From: m moadeli Date: Tue, 13 Jun 2023 09:02:12 +0100 Subject: [PATCH 09/11] Update a variable name to a more descriptive one. --- sycl/plugins/cuda/pi_cuda.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index e50dbe0f8e8b6..495bc47d14341 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -342,10 +342,10 @@ void guessLocalWorkSize(_pi_device *device, size_t *threadsPerBlock, // When global_work_size[0] is prime threadPerBlock[0] will later computed as // 1, which is not efficient configuration. In such case we use // global_work_size[0] + 1 to compute threadPerBlock[0]. - int x_global_work_size = (isPrime(global_work_size[0]) && - (threadsPerBlock[0] != global_work_size[0])) - ? global_work_size[0] + 1 - : global_work_size[0]; + int global_work_size_0_dim = (isPrime(global_work_size[0]) && + (threadsPerBlock[0] != global_work_size[0])) + ? global_work_size[0] + 1 + : global_work_size[0]; static auto isPowerOf2 = [](size_t value) -> bool { return value && !(value & (value - 1)); @@ -355,7 +355,7 @@ void guessLocalWorkSize(_pi_device *device, size_t *threadsPerBlock, // work group size to produce uniform work groups. 
// Additionally, for best compute utilisation, the local size has // to be a power of two. - while (0u != (x_global_work_size % threadsPerBlock[0]) || + while (0u != (global_work_size_0_dim % threadsPerBlock[0]) || !isPowerOf2(threadsPerBlock[0])) { --threadsPerBlock[0]; } From 2b485cb36a49e0c2319c0cd6935f0c4b88f97d4f Mon Sep 17 00:00:00 2001 From: m moadeli Date: Tue, 13 Jun 2023 09:11:57 +0100 Subject: [PATCH 10/11] Update a variable name to a more descriptive one. --- sycl/plugins/cuda/pi_cuda.cpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 495bc47d14341..5df792833e698 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -335,17 +335,18 @@ void guessLocalWorkSize(_pi_device *device, size_t *threadsPerBlock, std::min(global_work_size[1], std::min(maxBlockSize / threadsPerBlock[2], size_t(maxBlockDim[1]))); maxBlockDim[0] = maxBlockSize / (threadsPerBlock[1] * threadsPerBlock[2]); - threadsPerBlock[0] = std::min( - maxThreadsPerBlock[0], - std::min(global_work_size[0], static_cast(maxBlockDim[0]))); + threadsPerBlock[0] = + std::min(maxThreadsPerBlock[0], + std::min(global_work_size[0], size_t(maxBlockDim[0]))); // When global_work_size[0] is prime threadPerBlock[0] will later computed as // 1, which is not efficient configuration. In such case we use // global_work_size[0] + 1 to compute threadPerBlock[0]. - int global_work_size_0_dim = (isPrime(global_work_size[0]) && - (threadsPerBlock[0] != global_work_size[0])) - ? global_work_size[0] + 1 - : global_work_size[0]; + int adjusted_0_dim_global_work_size = + (isPrime(global_work_size[0]) && + (threadsPerBlock[0] != global_work_size[0])) + ? 
global_work_size[0] + 1 + : global_work_size[0]; static auto isPowerOf2 = [](size_t value) -> bool { return value && !(value & (value - 1)); @@ -355,7 +356,7 @@ void guessLocalWorkSize(_pi_device *device, size_t *threadsPerBlock, // work group size to produce uniform work groups. // Additionally, for best compute utilisation, the local size has // to be a power of two. - while (0u != (global_work_size_0_dim % threadsPerBlock[0]) || + while (0u != (adjusted_0_dim_global_work_size % threadsPerBlock[0]) || !isPowerOf2(threadsPerBlock[0])) { --threadsPerBlock[0]; } From e39beeeefda287fdcc59f406c55c60fa8ca1b9f8 Mon Sep 17 00:00:00 2001 From: m moadeli Date: Tue, 13 Jun 2023 09:27:46 +0100 Subject: [PATCH 11/11] Remove the need for computing square in for loop check in isPrime. --- sycl/plugins/cuda/pi_cuda.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 5df792833e698..dd68c196e94c1 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -308,13 +309,14 @@ void guessLocalWorkSize(_pi_device *device, size_t *threadsPerBlock, int minGrid, maxBlockSize, maxBlockDim[3]; static auto isPrime = [](size_t number) -> bool { + auto lastNumToCheck = ceil(sqrt(number)); if (number < 2) return false; if (number == 2) return true; if (number % 2 == 0) return false; - for (int i = 3; (i * i) <= number; i += 2) { + for (int i = 3; i <= lastNumToCheck; i += 2) { if (number % i == 0) return false; }