diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp
index feb6c221d1d17..59a13381e2eb9 100644
--- a/sycl/plugins/cuda/pi_cuda.cpp
+++ b/sycl/plugins/cuda/pi_cuda.cpp
@@ -2940,6 +2940,43 @@ pi_result cuda_piKernelGetGroupInfo(pi_kernel kernel, pi_device device,
   if (kernel != nullptr) {
 
     switch (param_name) {
+    case PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: {
+      size_t global_work_size[3] = {0, 0, 0};
+
+      int max_block_dimX{0}, max_block_dimY{0}, max_block_dimZ{0};
+      sycl::detail::pi::assertion(
+          cuDeviceGetAttribute(&max_block_dimX,
+                               CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
+                               device->get()) == CUDA_SUCCESS);
+      sycl::detail::pi::assertion(
+          cuDeviceGetAttribute(&max_block_dimY,
+                               CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y,
+                               device->get()) == CUDA_SUCCESS);
+      sycl::detail::pi::assertion(
+          cuDeviceGetAttribute(&max_block_dimZ,
+                               CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z,
+                               device->get()) == CUDA_SUCCESS);
+
+      int max_grid_dimX{0}, max_grid_dimY{0}, max_grid_dimZ{0};
+      sycl::detail::pi::assertion(
+          cuDeviceGetAttribute(&max_grid_dimX,
+                               CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
+                               device->get()) == CUDA_SUCCESS);
+      sycl::detail::pi::assertion(
+          cuDeviceGetAttribute(&max_grid_dimY,
+                               CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y,
+                               device->get()) == CUDA_SUCCESS);
+      sycl::detail::pi::assertion(
+          cuDeviceGetAttribute(&max_grid_dimZ,
+                               CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z,
+                               device->get()) == CUDA_SUCCESS);
+
+      global_work_size[0] = max_block_dimX * max_grid_dimX;
+      global_work_size[1] = max_block_dimY * max_grid_dimY;
+      global_work_size[2] = max_block_dimZ * max_grid_dimZ;
+      return getInfoArray(3, param_value_size, param_value,
+                          param_value_size_ret, global_work_size);
+    }
     case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: {
       int max_threads = 0;
       sycl::detail::pi::assertion(
@@ -5612,7 +5649,7 @@ pi_result cuda_piextEnqueueWriteHostPipe(
 // Windows: dynamically loaded plugins might have been unloaded already
 // when this is called. Sycl RT holds onto the PI plugin so it can be
 // called safely. But this is not transitive. If the PI plugin in turn
-// dynamically loaded a different DLL, that may have been unloaded.
+// dynamically loaded a different DLL, that may have been unloaded.
 // TODO: add a global variable lifetime management code here (see
 // pi_level_zero.cpp for reference) Currently this is just a NOOP.
 pi_result cuda_piTearDown(void *) {
diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp
index f216bc4565edf..acac19eef627a 100644
--- a/sycl/plugins/hip/pi_hip.cpp
+++ b/sycl/plugins/hip/pi_hip.cpp
@@ -3586,6 +3586,37 @@ pi_result hip_piKernelGetGroupInfo(pi_kernel kernel, pi_device device,
   if (kernel != nullptr) {
 
     switch (param_name) {
+    case PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: {
+      size_t global_work_size[3] = {0, 0, 0};
+
+      int max_block_dimX{0}, max_block_dimY{0}, max_block_dimZ{0};
+      sycl::detail::pi::assertion(
+          hipDeviceGetAttribute(&max_block_dimX, hipDeviceAttributeMaxBlockDimX,
+                                device->get()) == hipSuccess);
+      sycl::detail::pi::assertion(
+          hipDeviceGetAttribute(&max_block_dimY, hipDeviceAttributeMaxBlockDimY,
+                                device->get()) == hipSuccess);
+      sycl::detail::pi::assertion(
+          hipDeviceGetAttribute(&max_block_dimZ, hipDeviceAttributeMaxBlockDimZ,
+                                device->get()) == hipSuccess);
+
+      int max_grid_dimX{0}, max_grid_dimY{0}, max_grid_dimZ{0};
+      sycl::detail::pi::assertion(
+          hipDeviceGetAttribute(&max_grid_dimX, hipDeviceAttributeMaxGridDimX,
+                                device->get()) == hipSuccess);
+      sycl::detail::pi::assertion(
+          hipDeviceGetAttribute(&max_grid_dimY, hipDeviceAttributeMaxGridDimY,
+                                device->get()) == hipSuccess);
+      sycl::detail::pi::assertion(
+          hipDeviceGetAttribute(&max_grid_dimZ, hipDeviceAttributeMaxGridDimZ,
+                                device->get()) == hipSuccess);
+
+      global_work_size[0] = max_block_dimX * max_grid_dimX;
+      global_work_size[1] = max_block_dimY * max_grid_dimY;
+      global_work_size[2] = max_block_dimZ * max_grid_dimZ;
+      return getInfoArray(3, param_value_size, param_value,
+                          param_value_size_ret, global_work_size);
+    }
     case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: {
       int max_threads = 0;
       sycl::detail::pi::assertion(
@@ -5431,7 +5462,7 @@ pi_result hip_piextEnqueueWriteHostPipe(
 // Windows: dynamically loaded plugins might have been unloaded already
 // when this is called. Sycl RT holds onto the PI plugin so it can be
 // called safely. But this is not transitive. If the PI plugin in turn
-// dynamically loaded a different DLL, that may have been unloaded.
+// dynamically loaded a different DLL, that may have been unloaded.
 // TODO: add a global variable lifetime management code here (see
 // pi_level_zero.cpp for reference) Currently this is just a NOOP.
 pi_result hip_piTearDown(void *PluginParameter) {
diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp
index c25c8624ea00d..1a1c3dea04a1c 100644
--- a/sycl/plugins/level_zero/pi_level_zero.cpp
+++ b/sycl/plugins/level_zero/pi_level_zero.cpp
@@ -4234,13 +4234,15 @@ pi_result piKernelGetGroupInfo(pi_kernel Kernel, pi_device Device,
   std::shared_lock Guard(Kernel->Mutex);
   switch (ParamName) {
   case PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: {
-    // TODO: To revisit after level_zero/issues/262 is resolved
     struct {
       size_t Arr[3];
-    } WorkSize = {{Device->ZeDeviceComputeProperties->maxGroupSizeX,
-                   Device->ZeDeviceComputeProperties->maxGroupSizeY,
-                   Device->ZeDeviceComputeProperties->maxGroupSizeZ}};
-    return ReturnValue(WorkSize);
+    } GlobalWorkSize = {{(Device->ZeDeviceComputeProperties->maxGroupSizeX *
+                          Device->ZeDeviceComputeProperties->maxGroupCountX),
+                         (Device->ZeDeviceComputeProperties->maxGroupSizeY *
+                          Device->ZeDeviceComputeProperties->maxGroupCountY),
+                         (Device->ZeDeviceComputeProperties->maxGroupSizeZ *
+                          Device->ZeDeviceComputeProperties->maxGroupCountZ)}};
+    return ReturnValue(GlobalWorkSize);
   }
   case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: {
     // As of right now, L0 is missing API to query kernel and device specific
diff --git a/sycl/test-e2e/Basic/kernel_info.cpp b/sycl/test-e2e/Basic/kernel_info.cpp
index 0703ec2d97213..e359ba1eb17ab 100644
--- a/sycl/test-e2e/Basic/kernel_info.cpp
+++ b/sycl/test-e2e/Basic/kernel_info.cpp
@@ -6,8 +6,6 @@
 // Fail is flaky for level_zero, enable when fixed.
 // UNSUPPORTED: level_zero
 //
-// CUDA and HIP do not currently implement global_work_size
-// UNSUPPORTED: cuda, hip
 //==--- kernel_info.cpp - SYCL kernel info test ----------------------------==//
 //
@@ -56,9 +54,14 @@ int main() {
   assert(prefWGSizeMult > 0);
 
   try {
+    // To check (a) first if the kernel is device built-in, (b) then check if
+    // the device type is custom
+    if (!sycl::is_compatible({KernelID}, q.get_device())) {
+      assert(dev.get_info<sycl::info::device::device_type>() ==
+             sycl::info::device_type::custom);
+    }
+
     krn.get_info<sycl::info::kernel_device_specific::global_work_size>(dev);
-    assert(dev.get_info<sycl::info::device::device_type>() ==
-           sycl::info::device_type::custom);
   } catch (sycl::exception &e) {
     assert(e.code() == sycl::errc::invalid);
   }
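
For reference, below is a minimal sketch (not part of the patch) of how the descriptor this patch implements is queried from application code. It assumes a default-constructed queue and whatever kernels happen to be present in the application's executable kernel bundle; the variable names (gws, krn, bundle) are illustrative. Per SYCL 2020, info::kernel_device_specific::global_work_size is only required to succeed for device built-in kernels or devices of type custom, so the query is guarded the same way the updated kernel_info.cpp test guards it.

// Usage sketch only; not part of this patch.
#include <sycl/sycl.hpp>

#include <cassert>
#include <iostream>

int main() {
  sycl::queue q;
  sycl::device dev = q.get_device();

  auto bundle =
      sycl::get_kernel_bundle<sycl::bundle_state::executable>(q.get_context());
  for (const sycl::kernel_id &id : bundle.get_kernel_ids()) {
    sycl::kernel krn = bundle.get_kernel(id);
    try {
      // With this patch, CUDA/HIP report max block dims * max grid dims and
      // Level Zero reports maxGroupSize{X,Y,Z} * maxGroupCount{X,Y,Z}.
      sycl::range<3> gws =
          krn.get_info<sycl::info::kernel_device_specific::global_work_size>(
              dev);
      std::cout << gws[0] << " x " << gws[1] << " x " << gws[2] << "\n";
    } catch (const sycl::exception &e) {
      // Expected for non-built-in kernels on non-custom devices.
      assert(e.code() == sycl::errc::invalid);
    }
  }
  return 0;
}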