From 09d5cab96ebc8a812bd385c5d2a18a64aab0ed2e Mon Sep 17 00:00:00 2001 From: Ross Brunton Date: Thu, 5 Jun 2025 12:21:52 +0100 Subject: [PATCH 1/2] [Offload] Add olGetKernelMaxGroupSize This is equivalent to `cuOccupancyMaxPotentialBlockSize`. It is currently only implemented on CUDA; AMDGPU and Host return the legal-but-suboptimal value of `1`. Co-Authored-By: Callum Fare --- offload/liboffload/API/Kernel.td | 13 +++++++ offload/liboffload/src/OffloadImpl.cpp | 20 +++++++++- offload/plugins-nextgen/amdgpu/src/rtl.cpp | 8 ++++ .../common/include/PluginInterface.h | 3 ++ .../cuda/dynamic_cuda/cuda.cpp | 1 + .../plugins-nextgen/cuda/dynamic_cuda/cuda.h | 3 ++ offload/plugins-nextgen/cuda/src/rtl.cpp | 14 +++++++ offload/plugins-nextgen/host/src/rtl.cpp | 7 ++++ offload/unittests/OffloadAPI/CMakeLists.txt | 1 + .../kernel/olGetKernelMaxGroupSize.cpp | 37 +++++++++++++++++++ 10 files changed, 106 insertions(+), 1 deletion(-) create mode 100644 offload/unittests/OffloadAPI/kernel/olGetKernelMaxGroupSize.cpp diff --git a/offload/liboffload/API/Kernel.td b/offload/liboffload/API/Kernel.td index 0913a036fa04f..549ef79fbea42 100644 --- a/offload/liboffload/API/Kernel.td +++ b/offload/liboffload/API/Kernel.td @@ -24,6 +24,19 @@ def : Function { let returns = []; } +def : Function { + let name = "olGetKernelMaxGroupSize"; + let desc = "Get the maximum block size needed to achieve maximum occupancy."; + let details = []; + let params = [ + Param<"ol_kernel_handle_t", "Kernel", "handle of the kernel", PARAM_IN>, + Param<"ol_device_handle_t", "Device", "device intended to run the kernel", PARAM_IN>, + Param<"size_t", "SharedMemory", "dynamic shared memory required", PARAM_IN>, + Param<"size_t*", "GroupSize", "maximum block size", PARAM_OUT> + ]; + let returns = []; +} + def : Struct { let name = "ol_kernel_launch_size_args_t"; let desc = "Size-related arguments for a kernel launch."; diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp index 
e7da4eddce54f..329a8878ff6ec 100644 --- a/offload/liboffload/src/OffloadImpl.cpp +++ b/offload/liboffload/src/OffloadImpl.cpp @@ -557,6 +557,10 @@ Error olDestroyProgram_impl(ol_program_handle_t Program) { return olDestroy(Program); } +inline GenericKernelTy *getPluginKernel(ol_kernel_handle_t OlKernel) { + return reinterpret_cast(OlKernel); +} + Error olGetKernel_impl(ol_program_handle_t Program, const char *KernelName, ol_kernel_handle_t *Kernel) { @@ -573,6 +577,20 @@ Error olGetKernel_impl(ol_program_handle_t Program, const char *KernelName, return Error::success(); } +Error olGetKernelMaxGroupSize_impl(ol_kernel_handle_t Kernel, + ol_device_handle_t Device, + size_t DynamicMemSize, size_t *GroupSize) { + auto *KernelImpl = getPluginKernel(Kernel); + + auto Res = KernelImpl->maxGroupSize(*Device->Device, DynamicMemSize); + if (auto Err = Res.takeError()) + return Err; + + *GroupSize = *Res; + + return Error::success(); +} + Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device, ol_kernel_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize, @@ -603,7 +621,7 @@ Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device, // Don't do anything with pointer indirection; use arg data as-is LaunchArgs.Flags.IsCUDA = true; - auto *KernelImpl = reinterpret_cast(Kernel); + auto *KernelImpl = getPluginKernel(Kernel); auto Err = KernelImpl->launch(*DeviceImpl, LaunchArgs.ArgPtrs, nullptr, LaunchArgs, AsyncInfoWrapper); diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 35cb297f65188..4369ccb6208ed 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -570,6 +570,14 @@ struct AMDGPUKernelTy : public GenericKernelTy { KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const override; + /// Return maximum block size for maximum occupancy + /// + /// TODO: This needs to be implemented for amdgpu 
+ Expected maxGroupSize(GenericDeviceTy &GenericDevice, + size_t DynamicMemSize) const override { + return 1; + } + /// Print more elaborate kernel launch info for AMDGPU Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice, KernelArgsTy &KernelArgs, uint32_t NumThreads[3], diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index fbc798faec24b..753b193de65b3 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -316,6 +316,9 @@ struct GenericKernelTy { KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0; + virtual Expected maxGroupSize(GenericDeviceTy &GenericDevice, + size_t DynamicMemSize) const = 0; + /// Get the kernel name. const char *getName() const { return Name; } diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp index e5332686fcffb..e6699ee78596d 100644 --- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp +++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp @@ -71,6 +71,7 @@ DLWRAP(cuDevicePrimaryCtxGetState, 3) DLWRAP(cuDevicePrimaryCtxSetFlags, 2) DLWRAP(cuDevicePrimaryCtxRetain, 2) DLWRAP(cuModuleLoadDataEx, 5) +DLWRAP(cuOccupancyMaxPotentialBlockSize, 6) DLWRAP(cuDeviceCanAccessPeer, 3) DLWRAP(cuCtxEnablePeerAccess, 2) diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h index 1c5b421768894..2c856c68a9368 100644 --- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h +++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h @@ -289,6 +289,7 @@ static inline void *CU_LAUNCH_PARAM_BUFFER_POINTER = (void *)0x01; static inline void *CU_LAUNCH_PARAM_BUFFER_SIZE = (void *)0x02; typedef void (*CUstreamCallback)(CUstream, CUresult, void *); +typedef size_t (*CUoccupancyB2DSize)(int); CUresult cuCtxGetDevice(CUdevice *); CUresult 
cuDeviceGet(CUdevice *, int); @@ -370,5 +371,7 @@ CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size, CUresult cuMemGetAllocationGranularity(size_t *granularity, const CUmemAllocationProp *prop, CUmemAllocationGranularity_flags option); +CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction, + CUoccupancyB2DSize, size_t, int); #endif diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index f1164074f9ea9..f154718e91440 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -157,6 +157,20 @@ struct CUDAKernelTy : public GenericKernelTy { KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const override; + /// Return maximum block size for maximum occupancy + Expected maxGroupSize(GenericDeviceTy &, + size_t DynamicMemSize) const override { + int minGridSize; + int maxBlockSize; + auto Res = cuOccupancyMaxPotentialBlockSize( + &minGridSize, &maxBlockSize, Func, NULL, DynamicMemSize, INT_MAX); + if (auto Err = Plugin::check( + Res, "error in cuOccupancyMaxPotentialBlockSize: %s")) { + return Err; + } + return maxBlockSize; + } + private: /// The CUDA kernel function to execute. CUfunction Func; diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp index a35910aece986..9d5fa153643ef 100644 --- a/offload/plugins-nextgen/host/src/rtl.cpp +++ b/offload/plugins-nextgen/host/src/rtl.cpp @@ -114,6 +114,13 @@ struct GenELF64KernelTy : public GenericKernelTy { return Plugin::success(); } + /// Return maximum block size for maximum occupancy + Expected maxGroupSize(GenericDeviceTy &Device, + size_t DynamicMemSize) const override { + // TODO + return 1; + } + private: /// The kernel function to execute. 
void (*Func)(void); diff --git a/offload/unittests/OffloadAPI/CMakeLists.txt b/offload/unittests/OffloadAPI/CMakeLists.txt index 05e862865ed33..a10822372a1c6 100644 --- a/offload/unittests/OffloadAPI/CMakeLists.txt +++ b/offload/unittests/OffloadAPI/CMakeLists.txt @@ -18,6 +18,7 @@ target_compile_definitions("init.unittests" PRIVATE DISABLE_WRAPPER) add_offload_unittest("kernel" kernel/olGetKernel.cpp + kernel/olGetKernelMaxGroupSize.cpp kernel/olLaunchKernel.cpp) add_offload_unittest("memory" diff --git a/offload/unittests/OffloadAPI/kernel/olGetKernelMaxGroupSize.cpp b/offload/unittests/OffloadAPI/kernel/olGetKernelMaxGroupSize.cpp new file mode 100644 index 0000000000000..f81986f535c0f --- /dev/null +++ b/offload/unittests/OffloadAPI/kernel/olGetKernelMaxGroupSize.cpp @@ -0,0 +1,37 @@ +//===------- Offload API tests - olGetKernelMaxGroupSize ------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "../common/Fixtures.hpp" +#include +#include + +using olKernelGetMaxGroupSizeTest = OffloadKernelTest; +OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olKernelGetMaxGroupSizeTest); + +TEST_P(olKernelGetMaxGroupSizeTest, Success) { + size_t Size{0}; + ASSERT_SUCCESS(olGetKernelMaxGroupSize(Kernel, Device, 0, &Size)); + ASSERT_GT(Size, 0u); +} + +TEST_P(olKernelGetMaxGroupSizeTest, NullKernel) { + size_t Size; + ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE, + olGetKernelMaxGroupSize(nullptr, Device, 0, &Size)); +} + +TEST_P(olKernelGetMaxGroupSizeTest, NullDevice) { + size_t Size; + ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE, + olGetKernelMaxGroupSize(Kernel, nullptr, 0, &Size)); +} + +TEST_P(olKernelGetMaxGroupSizeTest, NullOutput) { + ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER, + olGetKernelMaxGroupSize(Kernel, Device, 0, nullptr)); +} From 1b0f3d630a0d1129c836ce1e0434ebcb24a6065f Mon Sep 17 00:00:00 2001 From: Ross Brunton Date: Mon, 30 Jun 2025 16:22:33 +0100 Subject: [PATCH 2/2] Use uint64_t rather than size_t --- offload/plugins-nextgen/amdgpu/src/rtl.cpp | 4 ++-- offload/plugins-nextgen/common/include/PluginInterface.h | 4 ++-- offload/plugins-nextgen/cuda/src/rtl.cpp | 4 ++-- offload/plugins-nextgen/host/src/rtl.cpp | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 4369ccb6208ed..19d1c96cd1066 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -573,8 +573,8 @@ struct AMDGPUKernelTy : public GenericKernelTy { /// Return maximum block size for maximum occupancy /// /// TODO: This needs to be implemented for amdgpu - Expected maxGroupSize(GenericDeviceTy &GenericDevice, - size_t DynamicMemSize) const override { + Expected maxGroupSize(GenericDeviceTy 
&GenericDevice, + uint64_t DynamicMemSize) const override { return 1; } diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index 753b193de65b3..ae68f05c9ece0 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -316,8 +316,8 @@ struct GenericKernelTy { KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0; - virtual Expected maxGroupSize(GenericDeviceTy &GenericDevice, - size_t DynamicMemSize) const = 0; + virtual Expected maxGroupSize(GenericDeviceTy &GenericDevice, + uint64_t DynamicMemSize) const = 0; /// Get the kernel name. const char *getName() const { return Name; } diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index f154718e91440..05b165e38fefd 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -158,8 +158,8 @@ struct CUDAKernelTy : public GenericKernelTy { AsyncInfoWrapperTy &AsyncInfoWrapper) const override; /// Return maximum block size for maximum occupancy - Expected maxGroupSize(GenericDeviceTy &, - size_t DynamicMemSize) const override { + Expected maxGroupSize(GenericDeviceTy &, + uint64_t DynamicMemSize) const override { int minGridSize; int maxBlockSize; auto Res = cuOccupancyMaxPotentialBlockSize( diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp index 9d5fa153643ef..eb1f8782abc40 100644 --- a/offload/plugins-nextgen/host/src/rtl.cpp +++ b/offload/plugins-nextgen/host/src/rtl.cpp @@ -115,8 +115,8 @@ struct GenELF64KernelTy : public GenericKernelTy { } /// Return maximum block size for maximum occupancy - Expected maxGroupSize(GenericDeviceTy &Device, - size_t DynamicMemSize) const override { + Expected maxGroupSize(GenericDeviceTy &Device, + uint64_t DynamicMemSize) const override { // TODO return 1; }