Skip to content

Commit 31a5796

Browse files
author
Jaime Arteaga
committed
[UR][L0] Unify use of large allocation in L0 adapter
Intel(R) GPUs have two modes of operation in terms of allocations: stateful and stateless. Stateful mode optimizes memory accesses through pointer arithmetic. This can be done as long as the allocations used by the kernel are smaller than 4GB. Stateless mode disables this pointer-arithmetic optimization to allow the kernel to use allocations larger than 4GB. Currently, the L0 adapter dynamically and automatically requests large allocations from the L0 driver if it detects that an allocation size is larger than 4GB. This creates a problem if a kernel has been previously compiled for stateful access. Ultimately, it means the adapter mixes stateful and stateless behavior, which is not a user-friendly experience. This patch aims at correcting this by defining a default behavior. On Intel(R) GPUs previous to Intel(R) Data Center GPU Max, the default behavior is now stateful, meaning only small allocations are allowed and any allocation larger than 4GB fails. Users can opt in to stateless mode by setting a new environment variable, UR_L0_ALLOW_LARGE_ALLOCATIONS. Intel(R) Data Center GPU Max uses stateless mode by default. Addresses: https://stackoverflow.com/questions/75621264/sycl-dot-product-code-gives-wrong-results Signed-off-by: Jaime Arteaga <[email protected]>
1 parent 71957e8 commit 31a5796

File tree

4 files changed

+78
-20
lines changed

4 files changed

+78
-20
lines changed

source/adapters/level_zero/device.cpp

Lines changed: 37 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,24 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGet(
8888
return UR_RESULT_SUCCESS;
8989
}
9090

91+
// Returns the device's global memory size, summed over all of its memory
// properties entries.
//
// The physicalSize values from the extended memory properties are preferred.
// Support for reading physicalSize depends on the kernel, so when the
// physical sizes add up to zero the totalSize values from the base memory
// properties are summed instead.
inline uint64_t getGlobalMemSize(ur_device_handle_t Device) {
  uint64_t PhysicalTotal = 0;
  for (const auto &ExtProps : Device->ZeDeviceMemoryProperties->second)
    PhysicalTotal += ExtProps.physicalSize;

  // A non-zero sum means physicalSize was available; use it as-is.
  if (PhysicalTotal != 0)
    return PhysicalTotal;

  // Fallback: physicalSize was not reported, so rely on totalSize.
  uint64_t ReportedTotal = 0;
  for (const auto &BaseProps : Device->ZeDeviceMemoryProperties->first)
    ReportedTotal += BaseProps.totalSize;

  return ReportedTotal;
}
108+
91109
UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
92110
ur_device_handle_t Device, ///< [in] handle of the device instance
93111
ur_device_info_t ParamName, ///< [in] type of the info to retrieve
@@ -249,23 +267,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
249267
return ReturnValue(uint32_t{64});
250268
}
251269
case UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE:
252-
return ReturnValue(uint64_t{Device->ZeDeviceProperties->maxMemAllocSize});
253-
case UR_DEVICE_INFO_GLOBAL_MEM_SIZE: {
254-
uint64_t GlobalMemSize = 0;
255-
// Support to read physicalSize depends on kernel,
256-
// so fallback into reading totalSize if physicalSize
257-
// is not available.
258-
for (const auto &ZeDeviceMemoryExtProperty :
259-
Device->ZeDeviceMemoryProperties->second) {
260-
GlobalMemSize += ZeDeviceMemoryExtProperty.physicalSize;
261-
}
262-
if (GlobalMemSize == 0) {
263-
for (const auto &ZeDeviceMemoryProperty :
264-
Device->ZeDeviceMemoryProperties->first) {
265-
GlobalMemSize += ZeDeviceMemoryProperty.totalSize;
266-
}
270+
// if using large allocations, then return total size in the device.
271+
// if not, then return L0's maxMemAllocSize.
272+
if (Device->useLargeAllocations()) {
273+
return ReturnValue(uint64_t{getGlobalMemSize(Device)});
274+
} else {
275+
return ReturnValue(uint64_t{Device->ZeDeviceProperties->maxMemAllocSize});
267276
}
268-
return ReturnValue(uint64_t{GlobalMemSize});
277+
case UR_DEVICE_INFO_GLOBAL_MEM_SIZE: {
278+
return ReturnValue(uint64_t{getGlobalMemSize(Device)});
269279
}
270280
case UR_DEVICE_INFO_LOCAL_MEM_SIZE:
271281
return ReturnValue(
@@ -900,6 +910,17 @@ ur_device_handle_t_::useImmediateCommandLists() {
900910
}
901911
}
902912

913+
bool ur_device_handle_t_::useLargeAllocations() {
914+
static const bool UseLargeAllocations = [this] {
915+
const char *UrRet = std::getenv("UR_L0_ALLOW_LARGE_ALLOCATIONS");
916+
if (!UrRet)
917+
return (this->isPVC() ? true : false);
918+
return std::atoi(UrRet) != 0;
919+
}();
920+
921+
return UseLargeAllocations;
922+
}
923+
903924
ur_result_t ur_device_handle_t_::initialize(int SubSubDeviceOrdinal,
904925
int SubSubDeviceIndex) {
905926
// Maintain various device properties cache.

source/adapters/level_zero/device.hpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,17 @@ struct ur_device_handle_t_ : _ur_object {
141141
// Returns whether immediate command lists are used on this device.
142142
ImmCmdlistMode ImmCommandListUsed{};
143143

144+
// Returns whether large allocations are being used or not.
145+
// On some Intel GPUs, this influences how kernels are compiled.
146+
// If large allocations (>4GB) are requested, then kernels are
147+
// compiled with stateless access.
148+
// If small allocations (<4GB) are requested, then kernels are
149+
// compiled with stateful access, with potential performance
150+
// improvements.
151+
// Some GPUs support only one mode, such as Intel(R) Data Center GPU Max,
152+
// which supports only stateless.
153+
bool useLargeAllocations();
154+
144155
bool isSubDevice() { return RootDevice != nullptr; }
145156

146157
// Is this a Data Center GPU Max series (aka PVC)?

source/adapters/level_zero/program.cpp

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -148,9 +148,24 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuildExp(
148148
ZeModuleDesc.format = (hProgram->State == ur_program_handle_t_::IL)
149149
? ZE_MODULE_FORMAT_IL_SPIRV
150150
: ZE_MODULE_FORMAT_NATIVE;
151+
151152
ZeModuleDesc.inputSize = hProgram->CodeLength;
152153
ZeModuleDesc.pInputModule = hProgram->Code.get();
153-
ZeModuleDesc.pBuildFlags = pOptions;
154+
155+
// if large allocations are selected, then pass
156+
// ze-opt-greater-than-4GB-buffer-required to disable
157+
// stateful optimizations and be able to use larger than
158+
// 4GB allocations on these kernels.
159+
std::string ZeBuildOptions{};
160+
if (pOptions) {
161+
ZeBuildOptions += pOptions;
162+
}
163+
164+
if (phDevices[0]->useLargeAllocations()) {
165+
ZeBuildOptions += " -ze-opt-greater-than-4GB-buffer-required";
166+
}
167+
168+
ZeModuleDesc.pBuildFlags = ZeBuildOptions.c_str();
154169
ZeModuleDesc.pConstants = Shim.ze();
155170

156171
ze_device_handle_t ZeDevice = phDevices[0]->ZeDevice;
@@ -234,8 +249,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCompile(
234249
// This produces better code because the driver can do cross-module
235250
// optimizations. Therefore, we just remember the compilation flags, so we
236251
// can use them later.
237-
if (Options)
252+
if (Options) {
238253
Program->BuildFlags = Options;
254+
255+
// if large allocations are selected, then pass
256+
// ze-opt-greater-than-4GB-buffer-required to disable
257+
// stateful optimizations and be able to use larger than
258+
// 4GB allocations on these kernels.
259+
if (Context->Devices[0]->useLargeAllocations()) {
260+
Program->BuildFlags += " -ze-opt-greater-than-4GB-buffer-required";
261+
}
262+
}
239263
Program->State = ur_program_handle_t_::Object;
240264

241265
return UR_RESULT_SUCCESS;

source/adapters/level_zero/usm.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -179,8 +179,10 @@ static ur_result_t USMDeviceAllocImpl(void **ResultPtr,
179179
ZeDesc.ordinal = 0;
180180

181181
ZeStruct<ze_relaxed_allocation_limits_exp_desc_t> RelaxedDesc;
182-
if (Size > Device->ZeDeviceProperties->maxMemAllocSize) {
183-
// Tell Level-Zero to accept Size > maxMemAllocSize
182+
if (Device->useLargeAllocations() &&
183+
(Size > Device->ZeDeviceProperties->maxMemAllocSize)) {
184+
// Tell Level-Zero to accept Size > maxMemAllocSize if
185+
// large allocations are used.
184186
RelaxedDesc.flags = ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE;
185187
ZeDesc.pNext = &RelaxedDesc;
186188
}

0 commit comments

Comments
 (0)