From 7944d20051e8c99bcd64ec69c798c61f1a7f4ccf Mon Sep 17 00:00:00 2001 From: m moadeli Date: Thu, 8 Jun 2023 14:07:19 +0100 Subject: [PATCH 01/11] Improve guessLocalWorkSize to avoid poor performance when having prime number for range dimensions. --- sycl/plugins/cuda/pi_cuda.cpp | 46 ++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index e7957a139a6ea..43459ef8349f1 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -295,6 +295,20 @@ int getAttribute(pi_device device, CUdevice_attribute attribute) { } /// \endcond +bool isPrime(size_t number) { + if (number < 2) + return false; + if (number == 2) + return true; + if (number % 2 == 0) + return false; + for (int i = 3; (i * i) <= number; i += 2) { + if (number % i == 0) + return false; + } + return true; +} + // Determine local work sizes that result in uniform work groups. // The default threadsPerBlock only require handling the first work_dim // dimension. 
@@ -305,29 +319,37 @@ void guessLocalWorkSize(_pi_device *device, size_t *threadsPerBlock, assert(threadsPerBlock != nullptr); assert(global_work_size != nullptr); assert(kernel != nullptr); - int minGrid, maxBlockSize, gridDim[3]; + int minGrid, maxBlockSize, maxBlockDim[3]; - cuDeviceGetAttribute(&gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, + cuDeviceGetAttribute(&maxBlockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, device->get()); - cuDeviceGetAttribute(&gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, + cuDeviceGetAttribute(&maxBlockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, device->get()); - threadsPerBlock[1] = ((global_work_size[1] - 1) / gridDim[1]) + 1; - threadsPerBlock[2] = ((global_work_size[2] - 1) / gridDim[2]) + 1; - PI_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize( &minGrid, &maxBlockSize, kernel->get(), NULL, local_size, maxThreadsPerBlock[0])); - gridDim[0] = maxBlockSize / (threadsPerBlock[1] * threadsPerBlock[2]); - - threadsPerBlock[0] = - std::min(maxThreadsPerBlock[0], - std::min(global_work_size[0], static_cast(gridDim[0]))); + threadsPerBlock[2] = std::min(global_work_size[2], size_t(maxBlockDim[2])); + threadsPerBlock[1] = + std::min(global_work_size[1], std::min(maxBlockSize / threadsPerBlock[2], + size_t(maxBlockDim[1]))); + maxBlockDim[0] = maxBlockSize / (threadsPerBlock[1] * threadsPerBlock[2]); + threadsPerBlock[0] = std::min( + maxThreadsPerBlock[0], + std::min(global_work_size[0], static_cast(maxBlockDim[0]))); + + // When global_work_size[0] is prime threadPerBlock[0] will later computed as + // 1, which is not efficient configuration. In such case we use + // global_work_size[0] to compute threadPerBlock[0]. + int x_global_work_size = (isPrime(global_work_size[0]) && + (threadsPerBlock[0] != global_work_size[0])) + ? global_work_size[0] + 1 + : global_work_size[0]; // Find a local work group size that is a divisor of the global // work group size to produce uniform work groups. 
- while (0u != (global_work_size[0] % threadsPerBlock[0])) { + while (0u != (x_global_work_size % threadsPerBlock[0])) { --threadsPerBlock[0]; } } From 2962217c3816689a044c086121b8f32ac7a5e93b Mon Sep 17 00:00:00 2001 From: m moadeli Date: Thu, 8 Jun 2023 14:37:42 +0100 Subject: [PATCH 02/11] Address merge conflicts --- sycl/plugins/cuda/pi_cuda.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 4dc8357e8c9c1..9a5dd5710f55a 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -353,7 +353,8 @@ void guessLocalWorkSize(_pi_device *device, size_t *threadsPerBlock, // Find a local work group size that is a divisor of the global // work group size to produce uniform work groups. - while (0u != (x_global_work_size % threadsPerBlock[0]) || !isPowerOf2(threadsPerBlock[0]) { + while (0u != (x_global_work_size % threadsPerBlock[0]) || + !isPowerOf2(threadsPerBlock[0]) { --threadsPerBlock[0]; } } @@ -649,8 +650,8 @@ _pi_event::_pi_event(pi_context context, CUevent eventNative) : commandType_{PI_COMMAND_TYPE_USER}, refCount_{1}, has_ownership_{false}, hasBeenWaitedOn_{false}, isRecorded_{false}, isStarted_{false}, streamToken_{std::numeric_limits::max()}, evEnd_{eventNative}, - evStart_{nullptr}, evQueued_{nullptr}, queue_{nullptr}, - context_{context} { + evStart_{nullptr}, evQueued_{nullptr}, queue_{nullptr}, context_{ + context} { cuda_piContextRetain(context_); } @@ -2180,7 +2181,8 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name, cuDeviceGetPCIBusId(AddressBuffer, AddressBufferSize, device->get()) == CUDA_SUCCESS); // CUDA API (8.x - 12.1) guarantees 12 bytes + \0 are written - sycl::detail::pi::assertion(strnlen(AddressBuffer, AddressBufferSize) == 12); + sycl::detail::pi::assertion(strnlen(AddressBuffer, AddressBufferSize) == + 12); return getInfoArray(strnlen(AddressBuffer, AddressBufferSize - 1) + 1, 
param_value_size, param_value, param_value_size_ret, AddressBuffer); From d7f1e4c21ddb06f35b4ac2d0dd21e873af92e2b4 Mon Sep 17 00:00:00 2001 From: m moadeli Date: Thu, 8 Jun 2023 14:42:02 +0100 Subject: [PATCH 03/11] Fix a mistake in comment. --- .gitignore | 1 + sycl/plugins/cuda/pi_cuda.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 14482c643868c..c6a76c6512dac 100644 --- a/.gitignore +++ b/.gitignore @@ -28,6 +28,7 @@ # Nested build directory /build* +/sycl/test-e2e/build* !buildbot #==============================================================================# diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 9a5dd5710f55a..b5f086d6c0896 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -341,7 +341,7 @@ void guessLocalWorkSize(_pi_device *device, size_t *threadsPerBlock, // When global_work_size[0] is prime threadPerBlock[0] will later computed as // 1, which is not efficient configuration. In such case we use - // global_work_size[0] to compute threadPerBlock[0]. + // global_work_size[0] + 1 to compute threadPerBlock[0]. int x_global_work_size = (isPrime(global_work_size[0]) && (threadsPerBlock[0] != global_work_size[0])) ? global_work_size[0] + 1 From c3939b81e61249d17a646259a2a11cb75f4b61b7 Mon Sep 17 00:00:00 2001 From: m moadeli Date: Thu, 8 Jun 2023 14:43:32 +0100 Subject: [PATCH 04/11] Revert change to .gitignore. --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index c6a76c6512dac..14482c643868c 100644 --- a/.gitignore +++ b/.gitignore @@ -28,7 +28,6 @@ # Nested build directory /build* -/sycl/test-e2e/build* !buildbot #==============================================================================# From 3b3f7f8231f895cfb3e7bbabeb3ea55fea71156b Mon Sep 17 00:00:00 2001 From: m moadeli Date: Thu, 8 Jun 2023 14:54:13 +0100 Subject: [PATCH 05/11] Reverts removed comment. 
--- sycl/plugins/cuda/pi_cuda.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index b5f086d6c0896..6f9ba9cb1f485 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -353,6 +353,8 @@ void guessLocalWorkSize(_pi_device *device, size_t *threadsPerBlock, // Find a local work group size that is a divisor of the global // work group size to produce uniform work groups. + // Additionally, for best compute utilisation, the local size has + // to be a power of two. while (0u != (x_global_work_size % threadsPerBlock[0]) || !isPowerOf2(threadsPerBlock[0]) { --threadsPerBlock[0]; From 073fd4a41f3273d7aa7ce9cc036d2697a8337552 Mon Sep 17 00:00:00 2001 From: m moadeli Date: Thu, 8 Jun 2023 16:05:14 +0100 Subject: [PATCH 06/11] Fix code style --- sycl/plugins/cuda/pi_cuda.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 6f9ba9cb1f485..98b7a4cadad08 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -652,8 +652,8 @@ _pi_event::_pi_event(pi_context context, CUevent eventNative) : commandType_{PI_COMMAND_TYPE_USER}, refCount_{1}, has_ownership_{false}, hasBeenWaitedOn_{false}, isRecorded_{false}, isStarted_{false}, streamToken_{std::numeric_limits::max()}, evEnd_{eventNative}, - evStart_{nullptr}, evQueued_{nullptr}, queue_{nullptr}, context_{ - context} { + evStart_{nullptr}, evQueued_{nullptr}, queue_{nullptr}, + context_{context} { cuda_piContextRetain(context_); } From e3f83960beec5c57f41d1c19d97f78f17754e99f Mon Sep 17 00:00:00 2001 From: m moadeli Date: Thu, 8 Jun 2023 21:03:17 +0100 Subject: [PATCH 07/11] Add missing parentheses --- sycl/plugins/cuda/pi_cuda.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 98b7a4cadad08..d79259c862489 100644 --- 
a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -355,8 +355,8 @@ void guessLocalWorkSize(_pi_device *device, size_t *threadsPerBlock, // work group size to produce uniform work groups. // Additionally, for best compute utilisation, the local size has // to be a power of two. - while (0u != (x_global_work_size % threadsPerBlock[0]) || - !isPowerOf2(threadsPerBlock[0]) { + while (0u != (x_global_work_size % threadsPerBlock[0]) || + !isPowerOf2(threadsPerBlock[0])) { --threadsPerBlock[0]; } } From e8787445c67bededc06599271ebc9020d54e6fc3 Mon Sep 17 00:00:00 2001 From: m moadeli Date: Mon, 12 Jun 2023 10:38:22 +0100 Subject: [PATCH 08/11] Define isPrime as a lambda. --- sycl/plugins/cuda/pi_cuda.cpp | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index d79259c862489..e50dbe0f8e8b6 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -295,20 +295,6 @@ int getAttribute(pi_device device, CUdevice_attribute attribute) { } /// \endcond -bool isPrime(size_t number) { - if (number < 2) - return false; - if (number == 2) - return true; - if (number % 2 == 0) - return false; - for (int i = 3; (i * i) <= number; i += 2) { - if (number % i == 0) - return false; - } - return true; -} - // Determine local work sizes that result in uniform work groups. // The default threadsPerBlock only require handling the first work_dim // dimension. 
@@ -321,6 +307,20 @@ void guessLocalWorkSize(_pi_device *device, size_t *threadsPerBlock, assert(kernel != nullptr); int minGrid, maxBlockSize, maxBlockDim[3]; + static auto isPrime = [](size_t number) -> bool { + if (number < 2) + return false; + if (number == 2) + return true; + if (number % 2 == 0) + return false; + for (int i = 3; (i * i) <= number; i += 2) { + if (number % i == 0) + return false; + } + return true; + }; + cuDeviceGetAttribute(&maxBlockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, device->get()); cuDeviceGetAttribute(&maxBlockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, From 2bf6505f6e447163466205b125be11c4e595526f Mon Sep 17 00:00:00 2001 From: m moadeli Date: Tue, 13 Jun 2023 09:02:12 +0100 Subject: [PATCH 09/11] Update a variable name to a more descriptive one. --- sycl/plugins/cuda/pi_cuda.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index e50dbe0f8e8b6..495bc47d14341 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -342,10 +342,10 @@ void guessLocalWorkSize(_pi_device *device, size_t *threadsPerBlock, // When global_work_size[0] is prime threadPerBlock[0] will later computed as // 1, which is not efficient configuration. In such case we use // global_work_size[0] + 1 to compute threadPerBlock[0]. - int x_global_work_size = (isPrime(global_work_size[0]) && - (threadsPerBlock[0] != global_work_size[0])) - ? global_work_size[0] + 1 - : global_work_size[0]; + int global_work_size_0_dim = (isPrime(global_work_size[0]) && + (threadsPerBlock[0] != global_work_size[0])) + ? global_work_size[0] + 1 + : global_work_size[0]; static auto isPowerOf2 = [](size_t value) -> bool { return value && !(value & (value - 1)); @@ -355,7 +355,7 @@ void guessLocalWorkSize(_pi_device *device, size_t *threadsPerBlock, // work group size to produce uniform work groups. 
// Additionally, for best compute utilisation, the local size has // to be a power of two. - while (0u != (x_global_work_size % threadsPerBlock[0]) || + while (0u != (global_work_size_0_dim % threadsPerBlock[0]) || !isPowerOf2(threadsPerBlock[0])) { --threadsPerBlock[0]; } From 2b485cb36a49e0c2319c0cd6935f0c4b88f97d4f Mon Sep 17 00:00:00 2001 From: m moadeli Date: Tue, 13 Jun 2023 09:11:57 +0100 Subject: [PATCH 10/11] Update a variable name to a more descriptive one. --- sycl/plugins/cuda/pi_cuda.cpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 495bc47d14341..5df792833e698 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -335,17 +335,18 @@ void guessLocalWorkSize(_pi_device *device, size_t *threadsPerBlock, std::min(global_work_size[1], std::min(maxBlockSize / threadsPerBlock[2], size_t(maxBlockDim[1]))); maxBlockDim[0] = maxBlockSize / (threadsPerBlock[1] * threadsPerBlock[2]); - threadsPerBlock[0] = std::min( - maxThreadsPerBlock[0], - std::min(global_work_size[0], static_cast(maxBlockDim[0]))); + threadsPerBlock[0] = + std::min(maxThreadsPerBlock[0], + std::min(global_work_size[0], size_t(maxBlockDim[0]))); // When global_work_size[0] is prime threadPerBlock[0] will later computed as // 1, which is not efficient configuration. In such case we use // global_work_size[0] + 1 to compute threadPerBlock[0]. - int global_work_size_0_dim = (isPrime(global_work_size[0]) && - (threadsPerBlock[0] != global_work_size[0])) - ? global_work_size[0] + 1 - : global_work_size[0]; + int adjusted_0_dim_global_work_size = + (isPrime(global_work_size[0]) && + (threadsPerBlock[0] != global_work_size[0])) + ? 
global_work_size[0] + 1 + : global_work_size[0]; static auto isPowerOf2 = [](size_t value) -> bool { return value && !(value & (value - 1)); @@ -355,7 +356,7 @@ void guessLocalWorkSize(_pi_device *device, size_t *threadsPerBlock, // work group size to produce uniform work groups. // Additionally, for best compute utilisation, the local size has // to be a power of two. - while (0u != (global_work_size_0_dim % threadsPerBlock[0]) || + while (0u != (adjusted_0_dim_global_work_size % threadsPerBlock[0]) || !isPowerOf2(threadsPerBlock[0])) { --threadsPerBlock[0]; } From e39beeeefda287fdcc59f406c55c60fa8ca1b9f8 Mon Sep 17 00:00:00 2001 From: m moadeli Date: Tue, 13 Jun 2023 09:27:46 +0100 Subject: [PATCH 11/11] Remove the need for computing square in for loop check in isPrime. --- sycl/plugins/cuda/pi_cuda.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 5df792833e698..dd68c196e94c1 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -308,13 +309,14 @@ void guessLocalWorkSize(_pi_device *device, size_t *threadsPerBlock, int minGrid, maxBlockSize, maxBlockDim[3]; static auto isPrime = [](size_t number) -> bool { + auto lastNumToCheck = ceil(sqrt(number)); if (number < 2) return false; if (number == 2) return true; if (number % 2 == 0) return false; - for (int i = 3; (i * i) <= number; i += 2) { + for (int i = 3; i <= lastNumToCheck; i += 2) { if (number % i == 0) return false; }