From 09d5cab96ebc8a812bd385c5d2a18a64aab0ed2e Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross@codeplay.com>
Date: Thu, 5 Jun 2025 12:21:52 +0100
Subject: [PATCH 1/2] [Offload] Add olGetKernelMaxGroupSize

This is equivalent to `cuOccupancyMaxPotentialBlockSize`. It is currently
only implemented on CUDA; AMDGPU and Host return the legal-but-suboptimal
value of `1`.

Co-Authored-By: Callum Fare <callum@codeplay.com>
---
 offload/liboffload/API/Kernel.td              | 13 +++++++
 offload/liboffload/src/OffloadImpl.cpp        | 20 +++++++++-
 offload/plugins-nextgen/amdgpu/src/rtl.cpp    |  8 ++++
 .../common/include/PluginInterface.h          |  3 ++
 .../cuda/dynamic_cuda/cuda.cpp                |  1 +
 .../plugins-nextgen/cuda/dynamic_cuda/cuda.h  |  3 ++
 offload/plugins-nextgen/cuda/src/rtl.cpp      | 14 +++++++
 offload/plugins-nextgen/host/src/rtl.cpp      |  7 ++++
 offload/unittests/OffloadAPI/CMakeLists.txt   |  1 +
 .../kernel/olGetKernelMaxGroupSize.cpp        | 37 +++++++++++++++++++
 10 files changed, 106 insertions(+), 1 deletion(-)
 create mode 100644 offload/unittests/OffloadAPI/kernel/olGetKernelMaxGroupSize.cpp
diff --git a/offload/liboffload/API/Kernel.td b/offload/liboffload/API/Kernel.td
index 0913a036fa04f..549ef79fbea42 100644
--- a/offload/liboffload/API/Kernel.td
+++ b/offload/liboffload/API/Kernel.td
@@ -24,6 +24,19 @@ def : Function {
     let returns = [];
 }
 
+def : Function {
+    let name = "olGetKernelMaxGroupSize";
+    let desc = "Get the maximum group size needed to achieve maximum occupancy.";
+    let details = ["Backends without an occupancy query may return 1, which is a legal but suboptimal group size."];
+    let params = [
+        Param<"ol_kernel_handle_t", "Kernel", "handle of the kernel", PARAM_IN>,
+        Param<"ol_device_handle_t", "Device", "device intended to run the kernel", PARAM_IN>,
+        Param<"size_t", "SharedMemory", "dynamic shared memory required", PARAM_IN>,
+        Param<"size_t*", "GroupSize", "maximum group size", PARAM_OUT>
+    ];
+    let returns = [];
+}
+
 def : Struct {
     let name = "ol_kernel_launch_size_args_t";
     let desc = "Size-related arguments for a kernel launch.";
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index e7da4eddce54f..329a8878ff6ec 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -557,6 +557,10 @@ Error olDestroyProgram_impl(ol_program_handle_t Program) {
   return olDestroy(Program);
 }
 
+inline GenericKernelTy *getPluginKernel(ol_kernel_handle_t OlKernel) {
+  return reinterpret_cast<GenericKernelTy *>(OlKernel);
+}
+
 Error olGetKernel_impl(ol_program_handle_t Program, const char *KernelName,
                        ol_kernel_handle_t *Kernel) {
 
@@ -573,6 +577,20 @@ Error olGetKernel_impl(ol_program_handle_t Program, const char *KernelName,
   return Error::success();
 }
 
+Error olGetKernelMaxGroupSize_impl(ol_kernel_handle_t Kernel,
+                                   ol_device_handle_t Device,
+                                   size_t DynamicMemSize, size_t *GroupSize) {
+  // Forward the query to the plugin kernel object behind the opaque handle.
+  auto MaxSizeOrErr =
+      getPluginKernel(Kernel)->maxGroupSize(*Device->Device, DynamicMemSize);
+  // On plugin failure, propagate the error and leave *GroupSize untouched.
+  if (!MaxSizeOrErr)
+    return MaxSizeOrErr.takeError();
+
+  *GroupSize = *MaxSizeOrErr;
+  return Error::success();
+}
+
 Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
                           ol_kernel_handle_t Kernel, const void *ArgumentsData,
                           size_t ArgumentsSize,
@@ -603,7 +621,7 @@ Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
   // Don't do anything with pointer indirection; use arg data as-is
   LaunchArgs.Flags.IsCUDA = true;
 
-  auto *KernelImpl = reinterpret_cast<GenericKernelTy *>(Kernel);
+  auto *KernelImpl = getPluginKernel(Kernel);
   auto Err = KernelImpl->launch(*DeviceImpl, LaunchArgs.ArgPtrs, nullptr,
                                 LaunchArgs, AsyncInfoWrapper);
 
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 35cb297f65188..4369ccb6208ed 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -570,6 +570,14 @@ struct AMDGPUKernelTy : public GenericKernelTy {
                    KernelLaunchParamsTy LaunchParams,
                    AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
 
+  /// Return maximum block size for maximum occupancy
+  ///
+  /// TODO: This needs to be implemented for amdgpu
+  Expected<size_t> maxGroupSize(GenericDeviceTy &GenericDevice,
+                                size_t DynamicMemSize) const override {
+    return 1;
+  }
+
   /// Print more elaborate kernel launch info for AMDGPU
   Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
                                KernelArgsTy &KernelArgs, uint32_t NumThreads[3],
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index fbc798faec24b..753b193de65b3 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -316,6 +316,9 @@ struct GenericKernelTy {
                            KernelLaunchParamsTy LaunchParams,
                            AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
 
+  virtual Expected<size_t> maxGroupSize(GenericDeviceTy &GenericDevice,
+                                        size_t DynamicMemSize) const = 0;
+
   /// Get the kernel name.
   const char *getName() const { return Name; }
 
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
index e5332686fcffb..e6699ee78596d 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
@@ -71,6 +71,7 @@ DLWRAP(cuDevicePrimaryCtxGetState, 3)
 DLWRAP(cuDevicePrimaryCtxSetFlags, 2)
 DLWRAP(cuDevicePrimaryCtxRetain, 2)
 DLWRAP(cuModuleLoadDataEx, 5)
+DLWRAP(cuOccupancyMaxPotentialBlockSize, 6)
 
 DLWRAP(cuDeviceCanAccessPeer, 3)
 DLWRAP(cuCtxEnablePeerAccess, 2)
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index 1c5b421768894..2c856c68a9368 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -289,6 +289,7 @@ static inline void *CU_LAUNCH_PARAM_BUFFER_POINTER = (void *)0x01;
 static inline void *CU_LAUNCH_PARAM_BUFFER_SIZE = (void *)0x02;
 
 typedef void (*CUstreamCallback)(CUstream, CUresult, void *);
+typedef size_t (*CUoccupancyB2DSize)(int);
 
 CUresult cuCtxGetDevice(CUdevice *);
 CUresult cuDeviceGet(CUdevice *, int);
@@ -370,5 +371,7 @@ CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size,
 CUresult cuMemGetAllocationGranularity(size_t *granularity,
                                        const CUmemAllocationProp *prop,
                                        CUmemAllocationGranularity_flags option);
+CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
+                                          CUoccupancyB2DSize, size_t, int);
 
 #endif
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index f1164074f9ea9..f154718e91440 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -157,6 +157,20 @@ struct CUDAKernelTy : public GenericKernelTy {
                    KernelLaunchParamsTy LaunchParams,
                    AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
 
+  /// Return maximum block size for maximum occupancy
+  Expected<size_t> maxGroupSize(GenericDeviceTy &,
+                                size_t DynamicMemSize) const override {
+    int minGridSize;
+    int maxBlockSize;
+    auto Res = cuOccupancyMaxPotentialBlockSize(
+        &minGridSize, &maxBlockSize, Func, nullptr, DynamicMemSize, INT_MAX);
+    if (auto Err = Plugin::check(
+            Res, "error in cuOccupancyMaxPotentialBlockSize: %s")) {
+      return Err;
+    }
+    return maxBlockSize;
+  }
+
 private:
   /// The CUDA kernel function to execute.
   CUfunction Func;
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index a35910aece986..9d5fa153643ef 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -114,6 +114,13 @@ struct GenELF64KernelTy : public GenericKernelTy {
     return Plugin::success();
   }
 
+  /// Return maximum block size for maximum occupancy
+  Expected<size_t> maxGroupSize(GenericDeviceTy &Device,
+                                size_t DynamicMemSize) const override {
+    // TODO
+    return 1;
+  }
+
 private:
   /// The kernel function to execute.
   void (*Func)(void);
diff --git a/offload/unittests/OffloadAPI/CMakeLists.txt b/offload/unittests/OffloadAPI/CMakeLists.txt
index 05e862865ed33..a10822372a1c6 100644
--- a/offload/unittests/OffloadAPI/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/CMakeLists.txt
@@ -18,6 +18,7 @@ target_compile_definitions("init.unittests" PRIVATE DISABLE_WRAPPER)
 
 add_offload_unittest("kernel"
     kernel/olGetKernel.cpp
+    kernel/olGetKernelMaxGroupSize.cpp
     kernel/olLaunchKernel.cpp)
 
 add_offload_unittest("memory"
diff --git a/offload/unittests/OffloadAPI/kernel/olGetKernelMaxGroupSize.cpp b/offload/unittests/OffloadAPI/kernel/olGetKernelMaxGroupSize.cpp
new file mode 100644
index 0000000000000..f81986f535c0f
--- /dev/null
+++ b/offload/unittests/OffloadAPI/kernel/olGetKernelMaxGroupSize.cpp
@@ -0,0 +1,37 @@
+//===------- Offload API tests - olGetKernelMaxGroupSize ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include <OffloadAPI.h>
+#include <gtest/gtest.h>
+
+using olGetKernelMaxGroupSizeTest = OffloadKernelTest;
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olGetKernelMaxGroupSizeTest);
+
+TEST_P(olGetKernelMaxGroupSizeTest, Success) {
+  size_t Size{0};
+  ASSERT_SUCCESS(olGetKernelMaxGroupSize(Kernel, Device, 0, &Size));
+  ASSERT_GT(Size, 0u);
+}
+
+TEST_P(olGetKernelMaxGroupSizeTest, NullKernel) {
+  size_t Size{0};
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
+               olGetKernelMaxGroupSize(nullptr, Device, 0, &Size));
+}
+
+TEST_P(olGetKernelMaxGroupSizeTest, NullDevice) {
+  size_t Size{0};
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
+               olGetKernelMaxGroupSize(Kernel, nullptr, 0, &Size));
+}
+
+TEST_P(olGetKernelMaxGroupSizeTest, NullOutput) {
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER,
+               olGetKernelMaxGroupSize(Kernel, Device, 0, nullptr));
+}

From 1b0f3d630a0d1129c836ce1e0434ebcb24a6065f Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross@codeplay.com>
Date: Mon, 30 Jun 2025 16:22:33 +0100
Subject: [PATCH 2/2] Use uint64_t rather than size_t

---
 offload/plugins-nextgen/amdgpu/src/rtl.cpp               | 4 ++--
 offload/plugins-nextgen/common/include/PluginInterface.h | 4 ++--
 offload/plugins-nextgen/cuda/src/rtl.cpp                 | 4 ++--
 offload/plugins-nextgen/host/src/rtl.cpp                 | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 4369ccb6208ed..19d1c96cd1066 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -573,8 +573,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
   /// Return maximum block size for maximum occupancy
   ///
   /// TODO: This needs to be implemented for amdgpu
-  Expected<size_t> maxGroupSize(GenericDeviceTy &GenericDevice,
-                                size_t DynamicMemSize) const override {
+  Expected<uint64_t> maxGroupSize(GenericDeviceTy &GenericDevice,
+                                  uint64_t DynamicMemSize) const override {
     return 1;
   }
 
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 753b193de65b3..ae68f05c9ece0 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -316,8 +316,8 @@ struct GenericKernelTy {
                            KernelLaunchParamsTy LaunchParams,
                            AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
 
-  virtual Expected<size_t> maxGroupSize(GenericDeviceTy &GenericDevice,
-                                        size_t DynamicMemSize) const = 0;
+  virtual Expected<uint64_t> maxGroupSize(GenericDeviceTy &GenericDevice,
+                                          uint64_t DynamicMemSize) const = 0;
 
   /// Get the kernel name.
   const char *getName() const { return Name; }
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index f154718e91440..05b165e38fefd 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -158,8 +158,8 @@ struct CUDAKernelTy : public GenericKernelTy {
                    AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
 
   /// Return maximum block size for maximum occupancy
-  Expected<size_t> maxGroupSize(GenericDeviceTy &,
-                                size_t DynamicMemSize) const override {
+  Expected<uint64_t> maxGroupSize(GenericDeviceTy &,
+                                  uint64_t DynamicMemSize) const override {
     int minGridSize;
     int maxBlockSize;
     auto Res = cuOccupancyMaxPotentialBlockSize(
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index 9d5fa153643ef..eb1f8782abc40 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -115,8 +115,8 @@ struct GenELF64KernelTy : public GenericKernelTy {
   }
 
   /// Return maximum block size for maximum occupancy
-  Expected<size_t> maxGroupSize(GenericDeviceTy &Device,
-                                size_t DynamicMemSize) const override {
+  Expected<uint64_t> maxGroupSize(GenericDeviceTy &Device,
+                                  uint64_t DynamicMemSize) const override {
     // TODO
     return 1;
   }