diff --git a/sycl/plugins/level_zero/pi_level0.cpp b/sycl/plugins/level_zero/pi_level0.cpp index 9f98f84377dac..87479d8b86cab 100644 --- a/sycl/plugins/level_zero/pi_level0.cpp +++ b/sycl/plugins/level_zero/pi_level0.cpp @@ -261,6 +261,8 @@ static pi_result mapError(ze_result_t ZeResult) { {ZE_RESULT_ERROR_INVALID_KERNEL_NAME, PI_INVALID_KERNEL_NAME}, {ZE_RESULT_ERROR_INVALID_FUNCTION_NAME, PI_BUILD_PROGRAM_FAILURE}, {ZE_RESULT_ERROR_OVERLAPPING_REGIONS, PI_INVALID_OPERATION}, + {ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION, + PI_INVALID_WORK_GROUP_SIZE}, {ZE_RESULT_ERROR_MODULE_BUILD_FAILURE, PI_BUILD_PROGRAM_FAILURE}}; auto It = ErrorMapping.find(ZeResult); if (It == ErrorMapping.end()) { diff --git a/sycl/source/detail/error_handling/enqueue_kernel.cpp b/sycl/source/detail/error_handling/enqueue_kernel.cpp index 1438e66a1e80b..b97f9738e81b1 100644 --- a/sycl/source/detail/error_handling/enqueue_kernel.cpp +++ b/sycl/source/detail/error_handling/enqueue_kernel.cpp @@ -12,6 +12,7 @@ #include "error_handling.hpp" +#include #include #include @@ -21,18 +22,40 @@ namespace detail { namespace enqueue_kernel_launch { -bool oclHandleInvalidWorkGroupSize(const device_impl &DeviceImpl, - pi_kernel Kernel, const NDRDescT &NDRDesc) { +bool handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel, + const NDRDescT &NDRDesc) { const bool HasLocalSize = (NDRDesc.LocalSize[0] != 0); const plugin &Plugin = DeviceImpl.getPlugin(); RT::PiDevice Device = DeviceImpl.getHandleRef(); + cl::sycl::platform Platform = DeviceImpl.get_platform(); + + if (HasLocalSize) { + size_t MaxThreadsPerBlock[3] = {}; + Plugin.call( + Device, PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES, sizeof(MaxThreadsPerBlock), + MaxThreadsPerBlock, nullptr); + + for (size_t I = 0; I < 3; ++I) { + if (MaxThreadsPerBlock[I] < NDRDesc.LocalSize[I]) { + throw sycl::nd_range_error( + "The number of work-items in each dimension of a work-group cannot " + "exceed {" + + std::to_string(MaxThreadsPerBlock[0]) + ", " + + std::to_string(MaxThreadsPerBlock[1]) + ", " + + std::to_string(MaxThreadsPerBlock[2]) + "} for this device", + PI_INVALID_WORK_GROUP_SIZE); + } + } + } size_t VerSize = 0; Plugin.call(Device, PI_DEVICE_INFO_VERSION, 0, nullptr, &VerSize); - assert(VerSize >= 10 && - "Unexpected device version string"); // strlen("OpenCL X.Y") + if (Platform.get_backend() == cl::sycl::backend::opencl) { + assert(VerSize >= 10 && + "Unexpected device version string"); // strlen("OpenCL X.Y") + } string_class VerStr(VerSize, '\0'); Plugin.call(Device, PI_DEVICE_INFO_VERSION, VerSize, &VerStr.front(), nullptr); @@ -48,12 +71,13 @@ bool oclHandleInvalidWorkGroupSize(const device_impl &DeviceImpl, // PI_INVALID_WORK_GROUP_SIZE if local_work_size is NULL and the // reqd_work_group_size attribute is used to declare the work-group size // for kernel in the program source. - if (!HasLocalSize && (Ver[0] == '1' || (Ver[0] == '2' && Ver[2] == '0'))) - throw sycl::nd_range_error( - "OpenCL 1.x and 2.0 requires to pass local size argument even if " - "required work-group size was specified in the program source", - PI_INVALID_WORK_GROUP_SIZE); - + if (Platform.get_backend() == cl::sycl::backend::opencl) { + if (!HasLocalSize && (Ver[0] == '1' || (Ver[0] == '2' && Ver[2] == '0'))) + throw sycl::nd_range_error( + "OpenCL 1.x and 2.0 requires to pass local size argument even if " + "required work-group size was specified in the program source", + PI_INVALID_WORK_GROUP_SIZE); + } // Any OpenCL version: // PI_INVALID_WORK_GROUP_SIZE if local_work_size is specified and does not // match the required work-group size for kernel in the program source. @@ -65,45 +89,44 @@ bool oclHandleInvalidWorkGroupSize(const device_impl &DeviceImpl, "specified in the program source", PI_INVALID_WORK_GROUP_SIZE); } - - if (Ver[0] == '1') { - // OpenCL 1.x: - // PI_INVALID_WORK_GROUP_SIZE if local_work_size is specified and the - // total number of work-items in the work-group computed as - // local_work_size[0] * ... * local_work_size[work_dim – 1] is greater - // than the value specified by PI_DEVICE_MAX_WORK_GROUP_SIZE in - // table 4.3 - size_t MaxWGSize = 0; - Plugin.call( - Device, PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE, sizeof(size_t), &MaxWGSize, - nullptr); - const size_t TotalNumberOfWIs = - NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2]; - if (TotalNumberOfWIs > MaxWGSize) - throw sycl::nd_range_error( - "Total number of work-items in a work-group cannot exceed " - "info::device::max_work_group_size which is equal to " + - std::to_string(MaxWGSize), - PI_INVALID_WORK_GROUP_SIZE); - } else { - // OpenCL 2.x: - // PI_INVALID_WORK_GROUP_SIZE if local_work_size is specified and the - // total number of work-items in the work-group computed as - // local_work_size[0] * ... * local_work_size[work_dim – 1] is greater - // than the value specified by PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE in - // table 5.21. - size_t KernelWGSize = 0; - Plugin.call( - Kernel, Device, PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE, sizeof(size_t), - &KernelWGSize, nullptr); - const size_t TotalNumberOfWIs = - NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2]; - if (TotalNumberOfWIs > KernelWGSize) - throw sycl::nd_range_error( - "Total number of work-items in a work-group cannot exceed " - "info::kernel_work_group::work_group_size which is equal to " + - std::to_string(KernelWGSize) + " for this kernel", - PI_INVALID_WORK_GROUP_SIZE); + if (Platform.get_backend() == cl::sycl::backend::opencl) { + if (Ver[0] == '1') { + // OpenCL 1.x: + // PI_INVALID_WORK_GROUP_SIZE if local_work_size is specified and the + // total number of work-items in the work-group computed as + // local_work_size[0] * ... * local_work_size[work_dim – 1] is greater + // than the value specified by PI_DEVICE_MAX_WORK_GROUP_SIZE in + // table 4.3 + size_t MaxWGSize = 0; + Plugin.call( + Device, PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE, sizeof(size_t), + &MaxWGSize, nullptr); + const size_t TotalNumberOfWIs = + NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2]; + if (TotalNumberOfWIs > MaxWGSize) + throw sycl::nd_range_error( + "Total number of work-items in a work-group cannot exceed " + + std::to_string(MaxWGSize), + PI_INVALID_WORK_GROUP_SIZE); + } else { + // RELEVENT // OpenCL 2.x: + // PI_INVALID_WORK_GROUP_SIZE if local_work_size is specified and the + // total number of work-items in the work-group computed as + // local_work_size[0] * ... * local_work_size[work_dim – 1] is greater + // than the value specified by PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE in + // table 5.21. + size_t KernelWGSize = 0; + Plugin.call( + Kernel, Device, PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE, sizeof(size_t), + &KernelWGSize, nullptr); + const size_t TotalNumberOfWIs = + NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2]; + if (TotalNumberOfWIs > KernelWGSize) + throw sycl::nd_range_error( + "Total number of work-items in a work-group cannot exceed " + + std::to_string(KernelWGSize) + " for this kernel", + PI_INVALID_WORK_GROUP_SIZE); + } } if (HasLocalSize) { @@ -116,120 +139,89 @@ bool oclHandleInvalidWorkGroupSize(const device_impl &DeviceImpl, (NDRDesc.LocalSize[2] != 0 && NDRDesc.GlobalSize[2] % NDRDesc.LocalSize[2] != 0); // Is the local size of the workgroup greater than the global range size in - // any dimension? This is a sub-case of NonUniformWGs. - const bool LocalExceedsGlobal = - NonUniformWGs && (NDRDesc.LocalSize[0] > NDRDesc.GlobalSize[0] || - NDRDesc.LocalSize[1] > NDRDesc.GlobalSize[1] || - NDRDesc.LocalSize[2] > NDRDesc.GlobalSize[2]); - - if (NonUniformWGs) { - if (Ver[0] == '1') { - // OpenCL 1.x: - // PI_INVALID_WORK_GROUP_SIZE if local_work_size is specified and - // number of workitems specified by global_work_size is not evenly - // divisible by size of work-group given by local_work_size - if (LocalExceedsGlobal) - throw sycl::nd_range_error("Local workgroup size cannot be greater " - "than global range in any dimension", - PI_INVALID_WORK_GROUP_SIZE); - else - throw sycl::nd_range_error( - "Global_work_size must be evenly divisible by local_work_size. " - "Non-uniform work-groups are not supported by the target device", - PI_INVALID_WORK_GROUP_SIZE); - } else { - // OpenCL 2.x: - // PI_INVALID_WORK_GROUP_SIZE if the program was compiled with - // –cl-uniform-work-group-size and the number of work-items specified - // by global_work_size is not evenly divisible by size of work-group - // given by local_work_size - - pi_program Program = nullptr; - Plugin.call(Kernel, PI_KERNEL_INFO_PROGRAM, - sizeof(pi_program), &Program, - nullptr); - size_t OptsSize = 0; - Plugin.call( - Program, Device, PI_PROGRAM_BUILD_INFO_OPTIONS, 0, nullptr, - &OptsSize); - string_class Opts(OptsSize, '\0'); - Plugin.call( - Program, Device, PI_PROGRAM_BUILD_INFO_OPTIONS, OptsSize, - &Opts.front(), nullptr); - const bool HasStd20 = Opts.find("-cl-std=CL2.0") != string_class::npos; - const bool RequiresUniformWGSize = - Opts.find("-cl-uniform-work-group-size") != string_class::npos; - std::string message = - LocalExceedsGlobal - ? "Local workgroup size greater than global range size. " - : "Global_work_size not evenly divisible by local_work_size. "; - if (!HasStd20) - throw sycl::nd_range_error( - message.append("Non-uniform work-groups are not allowed by " - "default. Underlying " - "OpenCL 2.x implementation supports this feature " - "and to enable " - "it, build device program with -cl-std=CL2.0"), - PI_INVALID_WORK_GROUP_SIZE); - else if (RequiresUniformWGSize) - throw sycl::nd_range_error( - message.append( - "Non-uniform work-groups are not allowed by when " - "-cl-uniform-work-group-size flag is used. Underlying " - "OpenCL 2.x implementation supports this feature, but it is " - "being " - "disabled by -cl-uniform-work-group-size build flag"), - PI_INVALID_WORK_GROUP_SIZE); - // else unknown. fallback (below) + // any dimension? + if (Platform.get_backend() == cl::sycl::backend::opencl) { + const bool LocalExceedsGlobal = + NonUniformWGs && (NDRDesc.LocalSize[0] > NDRDesc.GlobalSize[0] || + NDRDesc.LocalSize[1] > NDRDesc.GlobalSize[1] || + NDRDesc.LocalSize[2] > NDRDesc.GlobalSize[2]); + + if (NonUniformWGs) { + if (Ver[0] == '1') { + // OpenCL 1.x: + // PI_INVALID_WORK_GROUP_SIZE if local_work_size is specified and + // number of workitems specified by global_work_size is not evenly + // divisible by size of work-group given by local_work_size + if (LocalExceedsGlobal) + throw sycl::nd_range_error("Local workgroup size cannot be greater " + "than global range in any dimension", + PI_INVALID_WORK_GROUP_SIZE); + else + throw sycl::nd_range_error( + "Global_work_size must be evenly divisible by local_work_size. " + "Non-uniform work-groups are not supported by the target " + "device", + PI_INVALID_WORK_GROUP_SIZE); + } else { + // OpenCL 2.x: + // PI_INVALID_WORK_GROUP_SIZE if the program was compiled with + // –cl-uniform-work-group-size and the number of work-items specified + // by global_work_size is not evenly divisible by size of work-group + // given by local_work_size + + pi_program Program = nullptr; + Plugin.call( + Kernel, PI_KERNEL_INFO_PROGRAM, sizeof(pi_program), &Program, + nullptr); + size_t OptsSize = 0; + Plugin.call( + Program, Device, PI_PROGRAM_BUILD_INFO_OPTIONS, 0, nullptr, + &OptsSize); + string_class Opts(OptsSize, '\0'); + Plugin.call( + Program, Device, PI_PROGRAM_BUILD_INFO_OPTIONS, OptsSize, + &Opts.front(), nullptr); + const bool HasStd20 = + Opts.find("-cl-std=CL2.0") != string_class::npos; + const bool RequiresUniformWGSize = + Opts.find("-cl-uniform-work-group-size") != string_class::npos; + std::string message = + LocalExceedsGlobal + ? "Local workgroup size greater than global range size. " + : "Global_work_size not evenly divisible by " + "local_work_size. "; + if (!HasStd20) + throw sycl::nd_range_error( + message.append( + "Non-uniform work-groups are not allowed by " + "default. Underlying " + "OpenCL 2.x implementation supports this feature " + "and to enable " + "it, build device program with -cl-std=CL2.0"), + PI_INVALID_WORK_GROUP_SIZE); + else if (RequiresUniformWGSize) + throw sycl::nd_range_error( + message.append( + "Non-uniform work-groups are not allowed by when " + "-cl-uniform-work-group-size flag is used. Underlying " + "OpenCL 2.x implementation supports this feature, but it " + "is " + "being " + "disabled by -cl-uniform-work-group-size build flag"), + PI_INVALID_WORK_GROUP_SIZE); + // else unknown. fallback (below) + } } } + throw sycl::nd_range_error( + "Non-uniform work-groups are not supported by the target device", + PI_INVALID_WORK_GROUP_SIZE); } - // TODO: required number of sub-groups, OpenCL 2.1: // PI_INVALID_WORK_GROUP_SIZE if local_work_size is specified and is not // consistent with the required number of sub-groups for kernel in the // program source. - // Fallback - constexpr pi_result Error = PI_INVALID_WORK_GROUP_SIZE; - throw runtime_error( - "OpenCL API failed. OpenCL API returns: " + codeToString(Error), Error); -} - -bool handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel, - const NDRDescT &NDRDesc) { - const bool HasLocalSize = (NDRDesc.LocalSize[0] != 0); - - const plugin &Plugin = DeviceImpl.getPlugin(); - RT::PiDevice Device = DeviceImpl.getHandleRef(); - - if (HasLocalSize) { - size_t MaxThreadsPerBlock[3] = {}; - Plugin.call( - Device, PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES, sizeof(MaxThreadsPerBlock), - MaxThreadsPerBlock, nullptr); - - for (size_t I = 0; I < 3; ++I) { - if (MaxThreadsPerBlock[I] < NDRDesc.LocalSize[I]) { - throw sycl::nd_range_error( - "The number of work-items in each dimension of a work-group cannot " - "exceed info::device::max_work_item_sizes which is {" + - std::to_string(MaxThreadsPerBlock[0]) + ", " + - std::to_string(MaxThreadsPerBlock[1]) + ", " + - std::to_string(MaxThreadsPerBlock[2]) + "} for this device", - PI_INVALID_WORK_GROUP_SIZE); - } - } - } - - // Backend specific invalid work group size handing - // TODO: Find a better way to determine the backend - std::string PlatformName = - DeviceImpl.get_platform().get_info(); - if (PlatformName.find("OpenCL") != std::string::npos) { - return oclHandleInvalidWorkGroupSize(DeviceImpl, Kernel, NDRDesc); - } - // Fallback constexpr pi_result Error = PI_INVALID_WORK_GROUP_SIZE; throw runtime_error( diff --git a/sycl/test/basic_tests/parallel_for_range.cpp b/sycl/test/basic_tests/parallel_for_range.cpp index d5ed05868d1c3..3031d3d30f388 100644 --- a/sycl/test/basic_tests/parallel_for_range.cpp +++ b/sycl/test/basic_tests/parallel_for_range.cpp @@ -300,8 +300,7 @@ int main() { return 1; // We shouldn't be here, exception is expected } catch (nd_range_error &E) { if (string_class(E.what()).find( - "Total number of work-items in a work-group cannot exceed " - "info::device::max_work_group_size which is equal to " + + "Total number of work-items in a work-group cannot exceed " + std::to_string(MaxDeviceWGSize)) == string_class::npos) { std::cerr << "Test case OpenCL1XNegativeC failed: unexpected exception: " @@ -349,8 +348,7 @@ int main() { return 1; // We shouldn't be here, exception is expected } catch (nd_range_error &E) { if (string_class(E.what()).find( - "Total number of work-items in a work-group cannot exceed " - "info::kernel_work_group::work_group_size which is equal to " + + "Total number of work-items in a work-group cannot exceed " + std::to_string(MaxKernelWGSize) + " for this kernel") == string_class::npos) { std::cerr diff --git a/sycl/test/basic_tests/reqd_work_group_size.cpp b/sycl/test/basic_tests/reqd_work_group_size.cpp new file mode 100644 index 0000000000000..0f6ba6383d8ea --- /dev/null +++ b/sycl/test/basic_tests/reqd_work_group_size.cpp @@ -0,0 +1,109 @@ +// XFAIL: cuda || opencl +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include + +#include + +using namespace cl::sycl; + +int main() { + auto AsyncHandler = [](exception_list ES) { + for (auto &E : ES) { + std::rethrow_exception(E); + } + }; + + queue Q(AsyncHandler); + device D(Q.get_device()); + + string_class DeviceVendorName = D.get_info(); + auto DeviceType = D.get_info(); + + // parallel_for, (16, 16, 16) global, (8, 8, 8) local, reqd_wg_size(4, 4, 4) + // -> fail + try { + Q.submit([&](handler &CGH) { + CGH.parallel_for( + nd_range<3>(range<3>(16, 16, 16), range<3>(8, 8, 8)), + [=](nd_item<3>) [[intel::reqd_work_group_size(4, 4, 4)]]{ + + }); + }); + Q.wait_and_throw(); + std::cerr << "Test case ReqdWGSizeNegativeA failed: no exception has been " + "thrown\n"; + return 1; // We shouldn't be here, exception is expected + } catch (nd_range_error &E) { + if (string_class(E.what()).find( + "Specified local size doesn't match the required work-group size " + "specified in the program source") == string_class::npos) { + std::cerr + << "Test case ReqdWGSizeNegativeA failed 1: unexpected exception: " + << E.what() << std::endl; + return 1; + } + } catch (runtime_error &E) { + std::cerr << "Test case ReqdWGSizeNegativeA failed 2: unexpected exception: " + << E.what() << std::endl; + return 1; + } catch (...) { + std::cerr << "Test case ReqdWGSizeNegativeA failed: something unexpected " + "has been caught" + << std::endl; + return 1; + } + + // Positive test-cases that should pass on any underlying OpenCL runtime + // parallel_for, (8, 8, 8) global, (4, 4, 4) local, reqd_wg_size(4, 4, 4) -> + // pass + try { + Q.submit([&](handler &CGH) { + CGH.parallel_for( + nd_range<3>(range<3>(8, 8, 8), range<3>(4, 4, 4)), + [=](nd_item<3>) [[intel::reqd_work_group_size(4, 4, 4)]]{}); + }); + Q.wait_and_throw(); + } catch (nd_range_error &E) { + std::cerr << "Test case ReqdWGSizePositiveA failed: unexpected exception: " + << E.what() << std::endl; + return 1; + } catch (runtime_error &E) { + std::cerr << "Test case ReqdWGSizePositiveA failed: unexpected exception: " + << E.what() << std::endl; + return 1; + } catch (...) { + std::cerr << "Test case ReqdWGSizePositiveA failed: something unexpected " + "has been caught" + << std::endl; + return 1; + } + + try { + Q.submit([&](handler &CGH) { + CGH.parallel_for( + range<3>(16, 16, 16), [=](item<3>) [[intel::reqd_work_group_size(4, 4, 4)]]{}); + }); + Q.wait_and_throw(); + + } catch (nd_range_error &E) { + std::cerr << "Test case ReqdWGSizePositiveB failed 1: unexpected exception: " + << E.what() << std::endl; + return 1; + } catch (runtime_error &E) { + std::cerr + << "Test case ReqdWGSizePositiveB failed 2: unexpected exception: " + << E.what() << std::endl; + return 1; + } catch (...) { + std::cerr << "Test case ReqdWGSizePositiveB failed: something unexpected " + "has been caught" + << std::endl; + return 1; + } + + return 0; +}