Skip to content

Add new launch property to support work_group_scratch_memory #1968

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit on
Nov 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 11 additions & 4 deletions include/ur_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -9537,6 +9537,7 @@ typedef enum ur_exp_launch_property_id_t {
UR_EXP_LAUNCH_PROPERTY_ID_IGNORE = 0, ///< The property has no effect
UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE = 1, ///< Whether to launch a cooperative kernel
UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION = 2, ///< work-group cluster dimensions
UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY = 3, ///< Implicit work group memory allocation
/// @cond
UR_EXP_LAUNCH_PROPERTY_ID_FORCE_UINT32 = 0x7fffffff
/// @endcond
Expand All @@ -9550,10 +9551,12 @@ typedef enum ur_exp_launch_property_id_t {
/// _Analogues_
/// - **CUlaunchAttributeValue**
typedef union ur_exp_launch_property_value_t {
uint32_t clusterDim[3]; ///< [in] dimensions of the cluster (units of work-group) (x, y, z). Each
///< value must be a divisor of the corresponding global work-size
///< dimension (in units of work-group).
int cooperative; ///< [in] non-zero value indicates a cooperative kernel
uint32_t clusterDim[3]; ///< [in] dimensions of the cluster (units of work-group) (x, y, z). Each
///< value must be a divisor of the corresponding global work-size
///< dimension (in units of work-group).
int cooperative; ///< [in] non-zero value indicates a cooperative kernel
size_t workgroup_mem_size; ///< [in] non-zero value indicates the amount of work group memory to
///< allocate in bytes

} ur_exp_launch_property_value_t;

Expand Down Expand Up @@ -9594,6 +9597,7 @@ typedef struct ur_exp_launch_property_t {
/// + NULL == hQueue
/// + NULL == hKernel
/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
/// + `NULL == pGlobalWorkOffset`
/// + `NULL == pGlobalWorkSize`
/// + `NULL == launchPropList`
/// + NULL == pGlobalWorkSize
Expand Down Expand Up @@ -9622,6 +9626,8 @@ urEnqueueKernelLaunchCustomExp(
ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object
uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
///< work-group work-items
const size_t *pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the
///< offset used to calculate the global ID of a work-item
const size_t *pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
///< number of global work-items in workDim that will execute the kernel
///< function
Expand Down Expand Up @@ -11531,6 +11537,7 @@ typedef struct ur_enqueue_kernel_launch_custom_exp_params_t {
ur_queue_handle_t *phQueue;
ur_kernel_handle_t *phKernel;
uint32_t *pworkDim;
const size_t **ppGlobalWorkOffset;
const size_t **ppGlobalWorkSize;
const size_t **ppLocalWorkSize;
uint32_t *pnumPropsInLaunchPropList;
Expand Down
1 change: 1 addition & 0 deletions include/ur_ddi.h
Original file line number Diff line number Diff line change
Expand Up @@ -1467,6 +1467,7 @@ typedef ur_result_t(UR_APICALL *ur_pfnEnqueueKernelLaunchCustomExp_t)(
uint32_t,
const size_t *,
const size_t *,
const size_t *,
uint32_t,
const ur_exp_launch_property_t *,
uint32_t,
Expand Down
16 changes: 16 additions & 0 deletions include/ur_print.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10319,6 +10319,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_exp_launch_property_id
case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION:
os << "UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION";
break;
case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY:
os << "UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY";
break;
default:
os << "unknown enumerator";
break;
Expand Down Expand Up @@ -10355,6 +10358,13 @@ inline ur_result_t printUnion(

os << (params.cooperative);

break;
case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY:

os << ".workgroup_mem_size = ";

os << (params.workgroup_mem_size);

break;
default:
os << "<unknown>";
Expand Down Expand Up @@ -15022,6 +15032,12 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct

os << *(params->pworkDim);

os << ", ";
os << ".pGlobalWorkOffset = ";

ur::details::printPtr(os,
*(params->ppGlobalWorkOffset));

os << ", ";
os << ".pGlobalWorkSize = ";

Expand Down
13 changes: 11 additions & 2 deletions scripts/core/exp-launch-properties.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ etors:
desc: "Whether to launch a cooperative kernel"
- name: CLUSTER_DIMENSION
desc: "work-group cluster dimensions"
- name: WORK_GROUP_MEMORY
desc: "Implicit work group memory allocation"
--- #--------------------------------------------------------------------------
type: union
desc: "Specifies a launch property value"
Expand All @@ -45,6 +47,10 @@ members:
name: cooperative
desc: "[in] non-zero value indicates a cooperative kernel"
tag: $X_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE
- type: size_t
name: workgroup_mem_size
desc: "[in] non-zero value indicates the amount of work group memory to allocate in bytes"
tag: $X_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY
--- #--------------------------------------------------------------------------
type: struct
desc: "Kernel launch property"
Expand Down Expand Up @@ -82,6 +88,9 @@ params:
- type: uint32_t
name: workDim
desc: "[in] number of dimensions, from 1 to 3, to specify the global and work-group work-items"
- type: "const size_t*"
name: pGlobalWorkOffset
desc: "[in] pointer to an array of workDim unsigned values that specify the offset used to calculate the global ID of a work-item"
- type: const size_t*
name: pGlobalWorkSize
desc: "[in] pointer to an array of workDim unsigned values that specify the number of global work-items in workDim that will execute the kernel function"
Expand All @@ -97,10 +106,10 @@ params:
- type: uint32_t
name: numEventsInWaitList
desc: "[in] size of the event wait list"
- type: const ur_event_handle_t*
- type: const $x_event_handle_t*
name: phEventWaitList
desc: "[in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the kernel execution. If nullptr, numEventsInWaitList must be 0, indicating that no wait events are required. "
- type: ur_event_handle_t*
- type: $x_event_handle_t*
name: phEvent
desc: "[out][optional] return an event object that identifies this particular kernel execution instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array."
returns:
Expand Down
111 changes: 70 additions & 41 deletions source/adapters/cuda/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -422,11 +422,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait(
phEventWaitList, phEvent);
}

UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
static ur_result_t
enqueueKernelLaunch(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel,
uint32_t workDim, const size_t *pGlobalWorkOffset,
const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
uint32_t numEventsInWaitList,
const ur_event_handle_t *phEventWaitList,
ur_event_handle_t *phEvent, size_t WorkGroupMemory) {
// Preconditions
UR_ASSERT(hQueue->getDevice() == hKernel->getProgram()->getDevice(),
UR_RESULT_ERROR_INVALID_KERNEL);
Expand All @@ -444,6 +446,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
size_t BlocksPerGrid[3] = {1u, 1u, 1u};

// Set work group memory so we can compute the whole memory requirement
if (WorkGroupMemory)
hKernel->setWorkGroupMemory(WorkGroupMemory);
uint32_t LocalSize = hKernel->getLocalSize();
CUfunction CuFunc = hKernel->get();

Expand Down Expand Up @@ -506,6 +511,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
return UR_RESULT_SUCCESS;
}

UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
return enqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
pGlobalWorkSize, pLocalWorkSize,
numEventsInWaitList, phEventWaitList, phEvent,
/*WorkGroupMemory=*/0);
}

UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
Expand All @@ -516,8 +532,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
coop_prop.id = UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE;
coop_prop.value.cooperative = 1;
return urEnqueueKernelLaunchCustomExp(
hQueue, hKernel, workDim, pGlobalWorkSize, pLocalWorkSize, 1,
&coop_prop, numEventsInWaitList, phEventWaitList, phEvent);
hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
pLocalWorkSize, 1, &coop_prop, numEventsInWaitList, phEventWaitList,
phEvent);
}
return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
pGlobalWorkSize, pLocalWorkSize,
Expand All @@ -526,16 +543,29 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(

UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
uint32_t numPropsInLaunchPropList,
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList,
const ur_exp_launch_property_t *launchPropList,
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
ur_event_handle_t *phEvent) {

if (numPropsInLaunchPropList == 0) {
urEnqueueKernelLaunch(hQueue, hKernel, workDim, nullptr, pGlobalWorkSize,
pLocalWorkSize, numEventsInWaitList, phEventWaitList,
phEvent);
size_t WorkGroupMemory = [&]() -> size_t {
const ur_exp_launch_property_t *WorkGroupMemoryProp = std::find_if(
launchPropList, launchPropList + numPropsInLaunchPropList,
[](const ur_exp_launch_property_t &Prop) {
return Prop.id == UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY;
});
if (WorkGroupMemoryProp != launchPropList + numPropsInLaunchPropList)
return WorkGroupMemoryProp->value.workgroup_mem_size;
return 0;
}();

if (numPropsInLaunchPropList == 0 ||
(WorkGroupMemory && numPropsInLaunchPropList == 1)) {
return enqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
pGlobalWorkSize, pLocalWorkSize,
numEventsInWaitList, phEventWaitList, phEvent,
WorkGroupMemory);
}
#if CUDA_VERSION >= 11080
// Preconditions
Expand All @@ -548,7 +578,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
return UR_RESULT_ERROR_INVALID_NULL_POINTER;
}

std::vector<CUlaunchAttribute> launch_attribute(numPropsInLaunchPropList);
std::vector<CUlaunchAttribute> launch_attribute;
launch_attribute.reserve(numPropsInLaunchPropList);

// Early exit for zero size kernel
if (*pGlobalWorkSize == 0) {
Expand All @@ -561,40 +592,35 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
size_t BlocksPerGrid[3] = {1u, 1u, 1u};

// Set work group memory so we can compute the whole memory requirement
if (WorkGroupMemory)
hKernel->setWorkGroupMemory(WorkGroupMemory);
uint32_t LocalSize = hKernel->getLocalSize();
CUfunction CuFunc = hKernel->get();

for (uint32_t i = 0; i < numPropsInLaunchPropList; i++) {
switch (launchPropList[i].id) {
case UR_EXP_LAUNCH_PROPERTY_ID_IGNORE: {
launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_IGNORE;
auto &attr = launch_attribute.emplace_back();
attr.id = CU_LAUNCH_ATTRIBUTE_IGNORE;
break;
}
case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: {

launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
auto &attr = launch_attribute.emplace_back();
attr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
// Note that cuda orders from right to left wrt SYCL dimensional order.
if (workDim == 3) {
launch_attribute[i].value.clusterDim.x =
launchPropList[i].value.clusterDim[2];
launch_attribute[i].value.clusterDim.y =
launchPropList[i].value.clusterDim[1];
launch_attribute[i].value.clusterDim.z =
launchPropList[i].value.clusterDim[0];
attr.value.clusterDim.x = launchPropList[i].value.clusterDim[2];
attr.value.clusterDim.y = launchPropList[i].value.clusterDim[1];
attr.value.clusterDim.z = launchPropList[i].value.clusterDim[0];
} else if (workDim == 2) {
launch_attribute[i].value.clusterDim.x =
launchPropList[i].value.clusterDim[1];
launch_attribute[i].value.clusterDim.y =
launchPropList[i].value.clusterDim[0];
launch_attribute[i].value.clusterDim.z =
launchPropList[i].value.clusterDim[2];
attr.value.clusterDim.x = launchPropList[i].value.clusterDim[1];
attr.value.clusterDim.y = launchPropList[i].value.clusterDim[0];
attr.value.clusterDim.z = launchPropList[i].value.clusterDim[2];
} else {
launch_attribute[i].value.clusterDim.x =
launchPropList[i].value.clusterDim[0];
launch_attribute[i].value.clusterDim.y =
launchPropList[i].value.clusterDim[1];
launch_attribute[i].value.clusterDim.z =
launchPropList[i].value.clusterDim[2];
attr.value.clusterDim.x = launchPropList[i].value.clusterDim[0];
attr.value.clusterDim.y = launchPropList[i].value.clusterDim[1];
attr.value.clusterDim.z = launchPropList[i].value.clusterDim[2];
}

UR_CHECK_ERROR(cuFuncSetAttribute(
Expand All @@ -603,9 +629,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
break;
}
case UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE: {
launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE;
launch_attribute[i].value.cooperative =
launchPropList[i].value.cooperative;
auto &attr = launch_attribute.emplace_back();
attr.id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE;
attr.value.cooperative = launchPropList[i].value.cooperative;
break;
}
case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY: {
break;
}
default: {
Expand All @@ -618,8 +647,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
// using the standard UR_CHECK_ERROR
if (ur_result_t Ret =
setKernelParams(hQueue->getContext(), hQueue->Device, workDim,
nullptr, pGlobalWorkSize, pLocalWorkSize, hKernel,
CuFunc, ThreadsPerBlock, BlocksPerGrid);
pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid);
Ret != UR_RESULT_SUCCESS)
return Ret;

Expand Down Expand Up @@ -667,7 +696,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
launch_config.sharedMemBytes = LocalSize;
launch_config.hStream = CuStream;
launch_config.attrs = &launch_attribute[0];
launch_config.numAttrs = numPropsInLaunchPropList;
launch_config.numAttrs = launch_attribute.size();

UR_CHECK_ERROR(cuLaunchKernelEx(&launch_config, CuFunc,
const_cast<void **>(ArgIndices.data()),
Expand Down
Loading
Loading