diff --git a/llvm/lib/SYCLLowerIR/CompileTimePropertiesPass.cpp b/llvm/lib/SYCLLowerIR/CompileTimePropertiesPass.cpp index 66dc061d6c866..40af89ddcf26f 100644 --- a/llvm/lib/SYCLLowerIR/CompileTimePropertiesPass.cpp +++ b/llvm/lib/SYCLLowerIR/CompileTimePropertiesPass.cpp @@ -362,19 +362,24 @@ attributeToExecModeMetadata(const Attribute &Attr, Function &F) { AddFPControlMetadataForWidth(SPIRV_DENORM_PRESERVE, 64); } - if (AttrKindStr == "sycl-work-group-size" || - AttrKindStr == "sycl-work-group-size-hint") { + static constexpr std::tuple SimpleWGAttrs[] = { + {"sycl-work-group-size", "reqd_work_group_size"}, + {"sycl-work-group-size-hint", "work_group_size_hint"}, + {"sycl-max-work-group-size", "max_work_group_size"}, + }; + + for (auto &[AttrKind, MDStr] : SimpleWGAttrs) { + if (AttrKindStr != AttrKind) + continue; // Split values in the comma-separated list integers. - SmallVector ValStrs; - Attr.getValueAsString().split(ValStrs, ','); + SmallVector AttrValStrs; + Attr.getValueAsString().split(AttrValStrs, ','); - size_t NumDims = ValStrs.size(); - assert(NumDims <= 3 && - "sycl-work-group-size and sycl-work-group-size-hint currently only " - "support up to three values"); + size_t NumDims = AttrValStrs.size(); + assert(NumDims <= 3 && "Incorrect number of values for kernel property"); // SYCL work-group sizes must be reversed for SPIR-V. - std::reverse(ValStrs.begin(), ValStrs.end()); + std::reverse(AttrValStrs.begin(), AttrValStrs.end()); // Use integer pointer size as closest analogue to size_t. IntegerType *IntPtrTy = DLayout.getIntPtrType(Ctx); @@ -383,7 +388,7 @@ attributeToExecModeMetadata(const Attribute &Attr, Function &F) { // Get the integers from the strings. SmallVector MDVals; - for (StringRef ValStr : ValStrs) + for (StringRef ValStr : AttrValStrs) MDVals.push_back(ConstantAsMetadata::get( Constant::getIntegerValue(SizeTTy, APInt(SizeTBitSize, ValStr, 10)))); while (MDVals.size() < 3) @@ -397,10 +402,7 @@ attributeToExecModeMetadata(const Attribute &Attr, Function &F) { Type::getInt32Ty(Ctx), NumDims)))); } - const char *MDName = (AttrKindStr == "sycl-work-group-size") - ? "reqd_work_group_size" - : "work_group_size_hint"; - return std::pair(MDName, MDNode::get(Ctx, MDVals)); + return std::pair(MDStr, MDNode::get(Ctx, MDVals)); } if (AttrKindStr == "sycl-sub-group-size") { @@ -413,6 +415,21 @@ attributeToExecModeMetadata(const Attribute &Attr, Function &F) { MDNode::get(Ctx, MD)); } + if (AttrKindStr == "sycl-max-linear-work-group-size") { + auto MaxLinearSize = getAttributeAsInteger(Attr); + // Use integer pointer size as closest analogue to size_t. + IntegerType *IntPtrTy = DLayout.getIntPtrType(Ctx); + IntegerType *SizeTTy = Type::getIntNTy(Ctx, IntPtrTy->getBitWidth()); + unsigned SizeTBitSize = SizeTTy->getBitWidth(); + + // Get the integers from the strings. + Metadata *MD = ConstantAsMetadata::get(Constant::getIntegerValue( + SizeTTy, APInt(SizeTBitSize, MaxLinearSize, 10))); + + return std::pair("max_linear_work_group_size", + MDNode::get(Ctx, MD)); + } + // The sycl-single-task attribute currently only has an effect when targeting // SPIR FPGAs, in which case it will generate a "max_global_work_dim" MD node // with a 0 value, similar to applying [[intel::max_global_work_dim(0)]] to diff --git a/llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp b/llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp index 5f7678313087d..e7d6877b89a9a 100644 --- a/llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp +++ b/llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp @@ -112,28 +112,35 @@ std::vector getKernelNamesUsingAssert(const Module &M) { return SPIRKernelNames; } -// Gets reqd_work_group_size information for function Func. -std::vector getKernelReqdWorkGroupSizeMetadata(const Function &Func) { - MDNode *ReqdWorkGroupSizeMD = Func.getMetadata("reqd_work_group_size"); - if (!ReqdWorkGroupSizeMD) +// Gets 1- to 3-dimension work-group related information for function Func. +// Returns an empty vector if not present. +template +std::vector getKernelWorkGroupMetadata(const Function &Func, + const char *MDName) { + MDNode *WorkGroupMD = Func.getMetadata(MDName); + if (!WorkGroupMD) return {}; - size_t NumOperands = ReqdWorkGroupSizeMD->getNumOperands(); + size_t NumOperands = WorkGroupMD->getNumOperands(); assert(NumOperands >= 1 && NumOperands <= 3 && - "reqd_work_group_size does not have between 1 and 3 operands."); - std::vector OutVals; + "work-group metadata does not have between 1 and 3 operands."); + std::vector OutVals; OutVals.reserve(NumOperands); - for (const MDOperand &MDOp : ReqdWorkGroupSizeMD->operands()) + for (const MDOperand &MDOp : WorkGroupMD->operands()) OutVals.push_back(mdconst::extract(MDOp)->getZExtValue()); return OutVals; } -// Gets work_group_num_dim information for function Func, conviniently 0 if -// metadata is not present. -uint32_t getKernelWorkGroupNumDim(const Function &Func) { - MDNode *MaxDimMD = Func.getMetadata("work_group_num_dim"); - if (!MaxDimMD) - return 0; - assert(MaxDimMD->getNumOperands() == 1 && "Malformed node."); - return mdconst::extract(MaxDimMD->getOperand(0))->getZExtValue(); + +// Gets a single-dimensional piece of information for function Func. +// Returns std::nullopt if metadata is not present. +template +std::optional getKernelSingleEltMetadata(const Function &Func, + const char *MDName) { + if (MDNode *MaxDimMD = Func.getMetadata(MDName)) { + assert(MaxDimMD->getNumOperands() == 1 && "Malformed node."); + return mdconst::extract(MaxDimMD->getOperand(0)) + ->getZExtValue(); + } + return std::nullopt; } PropSetRegTy computeModuleProperties(const Module &M, @@ -249,22 +256,40 @@ PropSetRegTy computeModuleProperties(const Module &M, SmallVector MetadataNames; if (GlobProps.EmitProgramMetadata) { - // Add reqd_work_group_size and work_group_num_dim information to - // program metadata. + // Add various pieces of function metadata to program metadata. for (const Function &Func : M.functions()) { - std::vector KernelReqdWorkGroupSize = - getKernelReqdWorkGroupSizeMetadata(Func); - if (!KernelReqdWorkGroupSize.empty()) { + // Note - we're implicitly truncating 64-bit work-group data to 32 bits in + // all work-group related metadata. All current consumers of this program + // metadata format only support SYCL ID queries that fit within MAX_INT. + if (auto KernelReqdWorkGroupSize = getKernelWorkGroupMetadata( + Func, "reqd_work_group_size"); + !KernelReqdWorkGroupSize.empty()) { MetadataNames.push_back(Func.getName().str() + "@reqd_work_group_size"); PropSet.add(PropSetRegTy::SYCL_PROGRAM_METADATA, MetadataNames.back(), KernelReqdWorkGroupSize); } - uint32_t WorkGroupNumDim = getKernelWorkGroupNumDim(Func); - if (WorkGroupNumDim) { + if (auto WorkGroupNumDim = getKernelSingleEltMetadata( + Func, "work_group_num_dim")) { MetadataNames.push_back(Func.getName().str() + "@work_group_num_dim"); PropSet.add(PropSetRegTy::SYCL_PROGRAM_METADATA, MetadataNames.back(), - WorkGroupNumDim); + *WorkGroupNumDim); + } + + if (auto KernelMaxWorkGroupSize = + getKernelWorkGroupMetadata(Func, "max_work_group_size"); + !KernelMaxWorkGroupSize.empty()) { + MetadataNames.push_back(Func.getName().str() + "@max_work_group_size"); + PropSet.add(PropSetRegTy::SYCL_PROGRAM_METADATA, MetadataNames.back(), + KernelMaxWorkGroupSize); + } + + if (auto MaxLinearWGSize = getKernelSingleEltMetadata( + Func, "max_linear_work_group_size")) { + MetadataNames.push_back(Func.getName().str() + + "@max_linear_work_group_size"); + PropSet.add(PropSetRegTy::SYCL_PROGRAM_METADATA, MetadataNames.back(), + *MaxLinearWGSize); } } diff --git a/llvm/lib/SYCLLowerIR/SYCLCreateNVVMAnnotations.cpp b/llvm/lib/SYCLLowerIR/SYCLCreateNVVMAnnotations.cpp index 6de662d36041a..1d8546fd7ed2e 100644 --- a/llvm/lib/SYCLLowerIR/SYCLCreateNVVMAnnotations.cpp +++ b/llvm/lib/SYCLLowerIR/SYCLCreateNVVMAnnotations.cpp @@ -97,7 +97,8 @@ SYCLCreateNVVMAnnotationsPass::run(Module &M, ModuleAnalysisManager &MAM) { constexpr static std::pair SingleValAnnotations[] = {{"min_work_groups_per_cu", "minctasm"}, - {"max_work_groups_per_mp", "maxclusterrank"}}; + {"max_work_groups_per_mp", "maxclusterrank"}, + {"max_linear_work_group_size", "maxntidx"}}; for (auto &[MDName, AnnotationName] : SingleValAnnotations) { if (MDNode *Node = F.getMetadata(MDName)) { diff --git a/sycl/cmake/modules/FetchUnifiedRuntime.cmake b/sycl/cmake/modules/FetchUnifiedRuntime.cmake index 03cf0cd831e69..2fec2b61007be 100644 --- a/sycl/cmake/modules/FetchUnifiedRuntime.cmake +++ b/sycl/cmake/modules/FetchUnifiedRuntime.cmake @@ -117,13 +117,13 @@ if(SYCL_UR_USE_FETCH_CONTENT) endfunction() set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") - # commit 45ad7c52a75e6d3e52f658e38e796563744914c7 - # Merge: 7a2caca5 3bf2becb + # commit 7ecf64d60c31cd72bd88588498536d067bad59d6 + # Merge: 17aa04d3 6eb5208b # Author: aarongreig - # Date: Tue Sep 24 08:04:54 2024 -0700 - # Merge pull request #2116 from RossBrunton/ross/morewarn - # More warning squishing - set(UNIFIED_RUNTIME_TAG 45ad7c52a75e6d3e52f658e38e796563744914c7) + # Date: Wed Sep 25 11:14:47 2024 +0100 + # Merge pull request #1996 from frasercrmck/ur-max-wg-size-props + # Add two new properties to ur_kernel_group_info_t + set(UNIFIED_RUNTIME_TAG 7ecf64d60c31cd72bd88588498536d067bad59d6) set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES") # Due to the use of dependentloadflag and no installer for UMF and hwloc we need diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_kernel_properties.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_kernel_properties.asciidoc index 878f0862ac990..d0ae2a0727046 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_kernel_properties.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_kernel_properties.asciidoc @@ -58,7 +58,8 @@ Jessica Davies, Intel + Joe Garvey, Intel + Greg Lueck, Intel + John Pennycook, Intel + -Roland Schulz, Intel +Roland Schulz, Intel + +Fraser Cormack, Codeplay == Overview @@ -232,6 +233,68 @@ SYCL implementations may introduce additional kernel properties. If any combinations of kernel attributes are invalid, this must be clearly documented as part of the new kernel property definition. +=== Kernel Properties for the CUDA backend + +The kernel properties specified in this section may only be used to decorate +kernels that are submitted to the CUDA backend. Attempting to submit a kernel +with these properties to another backend has undefined behavior. + +```c++ +namespace sycl { +namespace ext { +namespace oneapi { +namespace experimental { + +struct max_work_group_size_key { + template + using value_t = property_value...>; +}; // max_work_group_size_key + +struct max_linear_work_group_size_key { + template + using value_t = property_value>; +}; // max_linear_work_group_size_key + +template +inline constexpr max_work_group_size_key::value_t max_work_group_size; + +template +inline constexpr max_linear_work_group_size_key::value_t max_linear_work_group_size; + +template <> struct is_property_key : std::true_type {}; +template <> struct is_property_key : std::true_type {}; + +} // namespace experimental +} // namespace oneapi +} // namespace ext +} // namespace sycl +``` + +|=== +|Property|Description + +|`max_work_group_size` +|The `max_work_group_size` property provides a promise to the compiler +that the kernel will never be launched with a larger work-group than the +specified size. The number of template arguments in the `Dims` parameter pack +must match the dimensionality of the work-group used to invoke the kernel. The +order of the template arguments matches the constructor of the `range` class. + +If the kernel is submitted with an `nd_range` that exceeds the size specified +by the property, the implementation must throw a synchronous exception with the +`errc::nd_range` error code. + +|`max_linear_work_group_size` +|The `max_linear_work_group_size` property provides a promise to the compiler +that the kernel will never be launched with a work-group for which the return +value of `group::get_local_linear_range()` exceeds the specified amount. + +If the kernel is submitted with an `nd_range` that exceeds the size specified +by the property, the implementation must throw a synchronous exception with the +`errc::nd_range` error code. + +|=== + === Adding a Property List to a Kernel Launch To enable properties to be associated with kernels, this extension adds diff --git a/sycl/include/sycl/ext/oneapi/kernel_properties/properties.hpp b/sycl/include/sycl/ext/oneapi/kernel_properties/properties.hpp index cd805dcd8e082..2c9a31cf05ed2 100644 --- a/sycl/include/sycl/ext/oneapi/kernel_properties/properties.hpp +++ b/sycl/include/sycl/ext/oneapi/kernel_properties/properties.hpp @@ -71,6 +71,21 @@ struct single_task_kernel_key { using value_t = property_value; }; +struct max_work_group_size_key + : detail::compile_time_property_key { + template + using value_t = property_value...>; +}; + +struct max_linear_work_group_size_key + : detail::compile_time_property_key< + detail::PropKind::MaxLinearWorkGroupSize> { + template + using value_t = property_value>; +}; + template struct property_value, std::integral_constant...> { @@ -138,6 +153,28 @@ template <> struct property_value { using key_t = single_task_kernel_key; }; +template +struct property_value, + std::integral_constant...> { + static_assert(sizeof...(Dims) + 1 <= 3, + "max_work_group_size property currently " + "only supports up to three values."); + static_assert( + detail::AllNonZero::value, + "max_work_group_size property must only contain non-zero values."); + + using key_t = max_work_group_size_key; + + constexpr size_t operator[](int Dim) const { + return std::array{Dim0, Dims...}[Dim]; + } +}; + +template <> struct property_value { + using key_t = max_linear_work_group_size_key; +}; + template inline constexpr work_group_size_key::value_t work_group_size; @@ -156,6 +193,14 @@ inline constexpr nd_range_kernel_key::value_t nd_range_kernel; inline constexpr single_task_kernel_key::value_t single_task_kernel; +template +inline constexpr max_work_group_size_key::value_t + max_work_group_size; + +template +inline constexpr max_linear_work_group_size_key::value_t + max_linear_work_group_size; + struct work_group_progress_key : detail::compile_time_property_key { template struct PropertyMetaInfo { static constexpr const char *name = "sycl-single-task-kernel"; static constexpr int value = 0; }; +template +struct PropertyMetaInfo> { + static constexpr const char *name = "sycl-max-work-group-size"; + static constexpr const char *value = SizeListToStr::value; +}; +template +struct PropertyMetaInfo> { + static constexpr const char *name = "sycl-max-linear-work-group-size"; + static constexpr size_t value = Size; +}; template struct HasKernelPropertiesGetMethod : std::false_type {}; diff --git a/sycl/include/sycl/ext/oneapi/properties/property.hpp b/sycl/include/sycl/ext/oneapi/properties/property.hpp index 3f057c9aba3ca..33228130ec36d 100644 --- a/sycl/include/sycl/ext/oneapi/properties/property.hpp +++ b/sycl/include/sycl/ext/oneapi/properties/property.hpp @@ -212,8 +212,10 @@ enum PropKind : uint32_t { Balanced = 71, InvocationCapacity = 72, ResponseCapacity = 73, + MaxWorkGroupSize = 74, + MaxLinearWorkGroupSize = 75, // PropKindSize must always be the last value. - PropKindSize = 74, + PropKindSize = 76, }; struct property_key_base_tag {}; diff --git a/sycl/source/detail/error_handling/error_handling.cpp b/sycl/source/detail/error_handling/error_handling.cpp index 46b291e4c2147..daae5563776f1 100644 --- a/sycl/source/detail/error_handling/error_handling.cpp +++ b/sycl/source/detail/error_handling/error_handling.cpp @@ -104,6 +104,22 @@ void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, Kernel, Device, UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, sizeof(size_t) * 3, CompileWGSize, nullptr); + size_t CompileMaxWGSize[3] = {0}; + ur_result_t URRes = Adapter->call_nocheck( + Kernel, Device, UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE, + sizeof(size_t) * 3, CompileMaxWGSize, nullptr); + if (URRes != UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION) { + Adapter->checkUrResult(URRes); + } + + size_t CompileMaxLinearWGSize = 0; + URRes = Adapter->call_nocheck( + Kernel, Device, UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE, + sizeof(size_t), &CompileMaxLinearWGSize, nullptr); + if (URRes != UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION) { + Adapter->checkUrResult(URRes); + } + size_t MaxWGSize = 0; Adapter->call( Device, UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE, sizeof(size_t), &MaxWGSize, @@ -147,7 +163,28 @@ void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, std::to_string(CompileWGSize[0]) + "}"); } + const size_t TotalNumberOfWIs = + NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2]; + if (HasLocalSize) { + if (CompileMaxWGSize[0] != 0) { + if (NDRDesc.LocalSize[0] > CompileMaxWGSize[0] || + NDRDesc.LocalSize[1] > CompileMaxWGSize[1] || + NDRDesc.LocalSize[2] > CompileMaxWGSize[2]) { + throw sycl::exception( + make_error_code(errc::nd_range), + "The specified local size {" + + std::to_string(NDRDesc.LocalSize[2]) + ", " + + std::to_string(NDRDesc.LocalSize[1]) + ", " + + std::to_string(NDRDesc.LocalSize[0]) + + "} exceeds the maximum work-group size specified " + "in the program source {" + + std::to_string(CompileMaxWGSize[2]) + ", " + + std::to_string(CompileMaxWGSize[1]) + ", " + + std::to_string(CompileMaxWGSize[0]) + "}"); + } + } + size_t MaxThreadsPerBlock[3] = {}; Adapter->call( Device, UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES, sizeof(MaxThreadsPerBlock), @@ -164,6 +201,15 @@ void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, "} for this device"); } } + + if (CompileMaxLinearWGSize && TotalNumberOfWIs > CompileMaxLinearWGSize) { + throw sycl::exception( + make_error_code(errc::nd_range), + "The total number of work-items in the work-group (" + + std::to_string(TotalNumberOfWIs) + + ") exceeds the maximum specified in the program source (" + + std::to_string(CompileMaxLinearWGSize) + ")"); + } } if (IsOpenCLV1x) { @@ -173,8 +219,6 @@ void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, // local_work_size[0] * ... * local_work_size[work_dim - 1] is greater // than the value specified by UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE in // table 4.3 - const size_t TotalNumberOfWIs = - NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2]; if (TotalNumberOfWIs > MaxWGSize) throw sycl::exception( make_error_code(errc::nd_range), @@ -191,8 +235,6 @@ void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, Adapter->call( Kernel, Device, UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE, sizeof(size_t), &KernelWGSize, nullptr); - const size_t TotalNumberOfWIs = - NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2]; if (TotalNumberOfWIs > KernelWGSize) throw sycl::exception( make_error_code(errc::nd_range), diff --git a/sycl/test-e2e/Basic/max_linear_work_group_size_props.cpp b/sycl/test-e2e/Basic/max_linear_work_group_size_props.cpp new file mode 100644 index 0000000000000..7009ca367d8e9 --- /dev/null +++ b/sycl/test-e2e/Basic/max_linear_work_group_size_props.cpp @@ -0,0 +1,200 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// This property is not yet supported by all UR adapters +// XFAIL: level_zero, opencl, hip + +#include + +#include + +using namespace sycl; + +enum class Variant { Function, Functor, FunctorAndProperty }; + +template +class MaxLinearWGSizePositive; +template +class MaxLinearWGSizeNoLocalPositive; +template +class MaxLinearWGSizeNegative; + +template range repeatRange(size_t Val); +template <> range<1> repeatRange<1>(size_t Val) { return range<1>{Val}; } +template <> range<2> repeatRange<2>(size_t Val) { return range<2>{Val, Val}; } +template <> range<3> repeatRange<3>(size_t Val) { + return range<3>{Val, Val, Val}; +} + +std::string rangeToString(range<1> Range) { + return "{1, 1, " + std::to_string(Range[0]) + "}"; +} +std::string rangeToString(range<2> Range) { + return "{1, " + std::to_string(Range[0]) + ", " + std::to_string(Range[1]) + + "}"; +} +std::string rangeToString(range<3> Range) { + return "{" + std::to_string(Range[0]) + ", " + std::to_string(Range[1]) + + ", " + std::to_string(Range[2]) + "}"; +} +std::string linearRangeToString(range<1> Range) { + return std::to_string(Range[0]); +} +std::string linearRangeToString(range<2> Range) { + return std::to_string(Range[0] * Range[1]); +} +std::string linearRangeToString(range<3> Range) { + return std::to_string(Range[0] * Range[1] * Range[2]); +} + +template struct KernelFunctorWithMaxWGSizeProp { + void operator()(nd_item<1>) const {} + void operator()(item<1>) const {} + + auto get(sycl::ext::oneapi::experimental::properties_tag) { + return sycl::ext::oneapi::experimental::properties{ + sycl::ext::oneapi::experimental::max_linear_work_group_size}; + } +}; + +template +int test(queue &Q, PropertiesT Props, KernelType KernelFunc) { + constexpr size_t Dims = 1; + + // Positive test case: Specify local size that matches required size. + try { + Q.submit([&](handler &CGH) { + CGH.parallel_for>( + nd_range(repeatRange(8), range(I)), Props, + KernelFunc); + }); + Q.wait_and_throw(); + } catch (exception &E) { + std::cerr + << "Test case MaxLinearWGSizePositive failed: unexpected exception: " + << E.what() << std::endl; + return 1; + } + + // Same as above but using the queue shortcuts. + try { + Q.parallel_for>( + nd_range(repeatRange(8), range(I)), Props, + KernelFunc); + Q.wait_and_throw(); + } catch (exception &E) { + std::cerr + << "Test case MaxLinearWGSizePositive shortcut failed: unexpected " + "exception: " + << E.what() << std::endl; + return 1; + } + + // Kernel that has a required WG size, but no local size is specified. + // + try { + Q.submit([&](handler &CGH) { + CGH.parallel_for>( + repeatRange(16), Props, KernelFunc); + }); + Q.wait_and_throw(); + } catch (exception &E) { + std::cerr << "Test case MaxLinearWGSizeNoLocalPositive failed: unexpected " + "exception: " + << E.what() << std::endl; + return 1; + } + + try { + Q.parallel_for>( + repeatRange(16), Props, KernelFunc); + Q.wait_and_throw(); + } catch (exception &E) { + std::cerr << "Test case MaxLinearWGSizeNoLocalPositive shortcut failed: " + "unexpected exception: " + << E.what() << std::endl; + return 1; + } + + // Negative test case: Specify local size that does not match required size. + try { + Q.submit([&](handler &CGH) { + CGH.parallel_for>( + nd_range(repeatRange(16), repeatRange(8)), Props, + KernelFunc); + }); + Q.wait_and_throw(); + std::cerr + << "Test case MaxLinearWGSizeNegative failed: no exception has been " + "thrown\n"; + return 1; // We shouldn't be here, exception is expected + } catch (exception &E) { + if (E.code() != errc::nd_range || + std::string(E.what()).find( + "The total number of work-items in the work-group (" + + linearRangeToString(repeatRange(8)) + + ") exceeds the maximum specified in the program source (" + + linearRangeToString(range(I)) + ")") == std::string::npos) { + std::cerr + << "Test case MaxLinearWGSizeNegative failed: unexpected exception: " + << E.what() << std::endl; + return 1; + } + } + + // Same as above but using the queue shortcuts. + try { + Q.parallel_for>( + nd_range(repeatRange(16), repeatRange(8)), Props, + KernelFunc); + Q.wait_and_throw(); + std::cerr + << "Test case MaxLinearWGSizeNegative shortcut failed: no exception " + "has been " + "thrown\n"; + return 1; // We shouldn't be here, exception is expected + } catch (exception &E) { + if (E.code() != errc::nd_range || + std::string(E.what()).find( + "The total number of work-items in the work-group (" + + linearRangeToString(repeatRange(8)) + + ") exceeds the maximum specified in the program source (" + + linearRangeToString(range(I)) + ")") == std::string::npos) { + std::cerr + << "Test case MaxLinearWGSizeNegative shortcut failed: unexpected " + "exception: " + << E.what() << std::endl; + return 1; + } + } + + return 0; +} + +template int test_max(queue &Q) { + auto Props = ext::oneapi::experimental::properties{ + ext::oneapi::experimental::max_linear_work_group_size}; + auto KernelFunction = [](auto) {}; + + auto EmptyProps = ext::oneapi::experimental::properties{}; + KernelFunctorWithMaxWGSizeProp KernelFunctor; + + int Res = 0; + Res += test(Q, Props, KernelFunction); + Res += test(Q, EmptyProps, KernelFunctor); + Res += test(Q, Props, KernelFunctor); + return Res; +} + +int main() { + auto AsyncHandler = [](exception_list ES) { + for (auto &E : ES) { + std::rethrow_exception(E); + } + }; + + queue Q(AsyncHandler); + + return test_max<4>(Q); +} diff --git a/sycl/test-e2e/Basic/max_work_group_size_props.cpp b/sycl/test-e2e/Basic/max_work_group_size_props.cpp new file mode 100644 index 0000000000000..96439971d904a --- /dev/null +++ b/sycl/test-e2e/Basic/max_work_group_size_props.cpp @@ -0,0 +1,194 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// This property is not yet supported by all UR adapters +// XFAIL: level_zero, opencl, hip + +#include + +#include + +using namespace sycl; + +enum class Variant { Function, Functor, FunctorAndProperty }; + +template +class MaxWGSizePositive; +template +class MaxWGSizeNoLocalPositive; +template +class MaxWGSizeNegative; + +template range repeatRange(size_t Val); +template <> range<1> repeatRange<1>(size_t Val) { return range<1>{Val}; } +template <> range<2> repeatRange<2>(size_t Val) { return range<2>{Val, Val}; } +template <> range<3> repeatRange<3>(size_t Val) { + return range<3>{Val, Val, Val}; +} + +std::string rangeToString(range<1> Range) { + return "{1, 1, " + std::to_string(Range[0]) + "}"; +} +std::string rangeToString(range<2> Range) { + return "{1, " + std::to_string(Range[0]) + ", " + std::to_string(Range[1]) + + "}"; +} +std::string rangeToString(range<3> Range) { + return "{" + std::to_string(Range[0]) + ", " + std::to_string(Range[1]) + + ", " + std::to_string(Range[2]) + "}"; +} + +template struct KernelFunctorWithMaxWGSizeProp { + void operator()(nd_item) const {} + void operator()(item) const {} + + auto get(sycl::ext::oneapi::experimental::properties_tag) { + return sycl::ext::oneapi::experimental::properties{ + sycl::ext::oneapi::experimental::max_work_group_size}; + } +}; + +template +int test(queue &Q, PropertiesT Props, KernelType KernelFunc) { + constexpr size_t Dims = sizeof...(Is); + + // Positive test case: Specify local size that matches required size. + try { + Q.submit([&](handler &CGH) { + CGH.parallel_for>( + nd_range(repeatRange(8), range(Is...)), Props, + KernelFunc); + }); + Q.wait_and_throw(); + } catch (exception &E) { + std::cerr << "Test case MaxWGSizePositive failed: unexpected exception: " + << E.what() << std::endl; + return 1; + } + + // Same as above but using the queue shortcuts. + try { + Q.parallel_for>( + nd_range(repeatRange(8), range(Is...)), Props, + KernelFunc); + Q.wait_and_throw(); + } catch (exception &E) { + std::cerr << "Test case MaxWGSizePositive shortcut failed: unexpected " + "exception: " + << E.what() << std::endl; + return 1; + } + + // Kernel that has a required WG size, but no local size is specified. + // + try { + Q.submit([&](handler &CGH) { + CGH.parallel_for>( + repeatRange(16), Props, KernelFunc); + }); + Q.wait_and_throw(); + } catch (exception &E) { + std::cerr << "Test case MaxWGSizeNoLocalPositive failed: unexpected " + "exception: " + << E.what() << std::endl; + return 1; + } + + try { + Q.parallel_for>( + repeatRange(16), Props, KernelFunc); + Q.wait_and_throw(); + } catch (exception &E) { + std::cerr << "Test case MaxWGSizeNoLocalPositive shortcut failed: " + "unexpected exception: " + << E.what() << std::endl; + return 1; + } + + // Negative test case: Specify local size that does not match required size. + try { + Q.submit([&](handler &CGH) { + CGH.parallel_for>( + nd_range(repeatRange(16), repeatRange(8)), Props, + KernelFunc); + }); + Q.wait_and_throw(); + std::cerr << "Test case MaxWGSizeNegative failed: no exception has been " + "thrown\n"; + return 1; // We shouldn't be here, exception is expected + } catch (exception &E) { + if (E.code() != errc::nd_range || + std::string(E.what()).find( + "The specified local size " + rangeToString(repeatRange(8)) + + " exceeds the maximum work-group size specified in the program " + "source " + + rangeToString(range(Is...))) == std::string::npos) { + std::cerr << "Test case MaxWGSizeNegative failed: unexpected exception: " + << E.what() << std::endl; + return 1; + } + } + + // Same as above but using the queue shortcuts. + try { + Q.parallel_for>( + nd_range(repeatRange(16), repeatRange(8)), Props, + KernelFunc); + Q.wait_and_throw(); + std::cerr << "Test case MaxWGSizeNegative shortcut failed: no exception " + "has been " + "thrown\n"; + return 1; // We shouldn't be here, exception is expected + } catch (exception &E) { + if (E.code() != errc::nd_range || + std::string(E.what()).find( + "The specified local size " + rangeToString(repeatRange(8)) + + " exceeds the maximum work-group size specified in the program " + "source " + + rangeToString(range(Is...))) == std::string::npos) { + std::cerr << "Test case MaxWGSizeNegative shortcut failed: unexpected " + "exception: " + << E.what() << std::endl; + return 1; + } + } + + return 0; +} + +template int test_max(queue &Q) { + auto Props = ext::oneapi::experimental::properties{ + ext::oneapi::experimental::max_work_group_size}; + auto KernelFunction = [](auto) {}; + + auto EmptyProps = ext::oneapi::experimental::properties{}; + KernelFunctorWithMaxWGSizeProp KernelFunctor; + + int Res = 0; + Res += test(Q, Props, KernelFunction); + Res += test(Q, EmptyProps, KernelFunctor); + Res += test(Q, Props, KernelFunctor); + return Res; +} + +int main() { + auto AsyncHandler = [](exception_list ES) { + for (auto &E : ES) { + std::rethrow_exception(E); + } + }; + + queue Q(AsyncHandler); + + int Res = 0; + Res += test_max<4>(Q); + Res += test_max<4, 4>(Q); + Res += test_max<8, 4>(Q); + Res += test_max<4, 8>(Q); + Res += test_max<4, 4, 4>(Q); + Res += test_max<4, 4, 8>(Q); + Res += test_max<8, 4, 4>(Q); + Res += test_max<4, 8, 4>(Q); + return Res; +} diff --git a/sycl/test/check_device_code/extensions/properties/properties_kernel_launch_bounds.cpp b/sycl/test/check_device_code/extensions/properties/properties_kernel_launch_bounds.cpp new file mode 100644 index 0000000000000..96ac3da42a504 --- /dev/null +++ b/sycl/test/check_device_code/extensions/properties/properties_kernel_launch_bounds.cpp @@ -0,0 +1,20 @@ +// RUN: %clangxx -fsycl-device-only -S -Xclang -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK-IR +// RUN: %clangxx -fsycl -fsyntax-only -Xclang -verify %s +// expected-no-diagnostics + +#include + +int main() { + sycl::queue Q; + + constexpr auto Props = sycl::ext::oneapi::experimental::properties{ + sycl::ext::oneapi::experimental::max_linear_work_group_size<4>, + }; + // CHECK-IR: spir_kernel void @{{.*}}LaunchBoundsKernel(){{.*}} #[[LaunchBoundsAttrs:[0-9]+]] + Q.single_task(Props, []() {}); + + return 0; +} + +// CHECK-IR: attributes #[[LaunchBoundsAttrs]] = { +// CHECK-IR-SAME: "sycl-max-linear-work-group-size"="4" diff --git a/sycl/test/check_device_code/extensions/properties/properties_kernel_launch_bounds_nvptx.cpp b/sycl/test/check_device_code/extensions/properties/properties_kernel_launch_bounds_nvptx.cpp new file mode 100644 index 0000000000000..2040c5418f7c6 --- /dev/null +++ b/sycl/test/check_device_code/extensions/properties/properties_kernel_launch_bounds_nvptx.cpp @@ -0,0 +1,24 @@ +// REQUIRES: cuda + +// RUN: %clangxx -fsycl-device-only -fsycl-targets=nvptx64-nvidia-cuda -S -Xclang -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK-IR + +#include + +int main() { + sycl::queue Q; + + constexpr auto Props = sycl::ext::oneapi::experimental::properties{ + sycl::ext::oneapi::experimental::max_linear_work_group_size<4>, + }; + + // CHECK-IR: define{{.*}}void @[[LaunchBoundsKernelFn:.*LaunchBoundsKernel0]](){{.*}} #[[LaunchBoundsAttrs:[0-9]+]] + Q.single_task(Props, []() {}); + + return 0; +} + +// CHECK-IR: attributes #[[LaunchBoundsAttrs]] = { +// CHECK-IR-SAME: "sycl-max-linear-work-group-size"="4" + +// CHECK-IR-DAG: !{ptr @[[LaunchBoundsKernelFn]], !"kernel", i32 1} +// CHECK-IR-DAG: !{ptr @[[LaunchBoundsKernelFn]], !"maxntidx", i32 4} diff --git a/sycl/test/check_device_code/extensions/properties/properties_kernel_max_work_group_size.cpp b/sycl/test/check_device_code/extensions/properties/properties_kernel_max_work_group_size.cpp new file mode 100644 index 0000000000000..924270bb6cafe --- /dev/null +++ b/sycl/test/check_device_code/extensions/properties/properties_kernel_max_work_group_size.cpp @@ -0,0 +1,37 @@ +// RUN: %clangxx -fsycl-device-only -S -Xclang -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK-IR +// RUN: %clangxx -fsycl -fsyntax-only -Xclang -verify %s +// expected-no-diagnostics + +#include + +int main() { + sycl::queue Q; + sycl::event Ev; + + constexpr auto Props1 = sycl::ext::oneapi::experimental::properties{ + sycl::ext::oneapi::experimental::max_work_group_size<8>}; + constexpr auto Props2 = sycl::ext::oneapi::experimental::properties{ + sycl::ext::oneapi::experimental::max_work_group_size<8, 4>}; + constexpr auto Props3 = sycl::ext::oneapi::experimental::properties{ + sycl::ext::oneapi::experimental::max_work_group_size<8, 4, 2>}; + + // CHECK-IR: spir_kernel void @{{.*}}MaxWGSizeKernel0(){{.*}} #[[MaxWGSizeAttr0:[0-9]+]] + // CHECK-IR-SAME: !max_work_group_size ![[MaxWGSizeMD0:[0-9]+]] + Q.single_task(Props1, []() {}); + // CHECK-IR: spir_kernel void @{{.*}}MaxWGSizeKernel1(){{.*}} #[[MaxWGSizeAttr1:[0-9]+]] + // CHECK-IR-SAME: !max_work_group_size ![[MaxWGSizeMD1:[0-9]+]] + Q.single_task(Ev, Props2, []() {}); + // CHECK-IR: spir_kernel void @{{.*}}MaxWGSizeKernel2(){{.*}} #[[MaxWGSizeAttr2:[0-9]+]] + // CHECK-IR-SAME: !max_work_group_size ![[MaxWGSizeMD2:[0-9]+]] + Q.single_task({Ev}, Props3, []() {}); + + return 0; +} + +// CHECK-IR: attributes #[[MaxWGSizeAttr0]] = { {{.*}}"sycl-max-work-group-size"="8" +// CHECK-IR: attributes #[[MaxWGSizeAttr1]] = { {{.*}}"sycl-max-work-group-size"="8,4" +// CHECK-IR: attributes #[[MaxWGSizeAttr2]] = { {{.*}}"sycl-max-work-group-size"="8,4,2" + +// CHECK-IR: ![[MaxWGSizeMD0]] = !{i64 8, i64 1, i64 1} +// CHECK-IR: ![[MaxWGSizeMD1]] = !{i64 4, i64 8, i64 1} +// CHECK-IR: ![[MaxWGSizeMD2]] = !{i64 2, i64 4, i64 8} diff --git a/sycl/test/check_device_code/extensions/properties/properties_kernel_max_work_group_size_nvptx.cpp b/sycl/test/check_device_code/extensions/properties/properties_kernel_max_work_group_size_nvptx.cpp new file mode 100644 index 0000000000000..af20bb82650fa --- /dev/null +++ b/sycl/test/check_device_code/extensions/properties/properties_kernel_max_work_group_size_nvptx.cpp @@ -0,0 +1,52 @@ +// REQUIRES: cuda + +// RUN: %clangxx -fsycl-device-only -fsycl-targets=nvptx64-nvidia-cuda -S -Xclang -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK-IR + +#include + +int main() { + sycl::queue Q; + + constexpr auto Props1 = sycl::ext::oneapi::experimental::properties{ + sycl::ext::oneapi::experimental::max_work_group_size<8>}; + constexpr auto Props2 = sycl::ext::oneapi::experimental::properties{ + sycl::ext::oneapi::experimental::max_work_group_size<8, 4>}; + constexpr auto Props3 = sycl::ext::oneapi::experimental::properties{ + sycl::ext::oneapi::experimental::max_work_group_size<8, 4, 2>}; + + // CHECK-IR: define{{.*}}void @[[MaxWGSizeKernelFn0:.*MaxWGSizeKernel0]](){{.*}} #[[MaxWGSizeAttr0:[0-9]+]] + Q.single_task(Props1, []() {}); + + // CHECK-IR: define{{.*}}void @[[MaxWGSizeKernelFn1:.*MaxWGSizeKernel1]](){{.*}} #[[MaxWGSizeAttr1:[0-9]+]] + Q.single_task(Props2, []() {}); + + // CHECK-IR: define{{.*}}void @[[MaxWGSizeKernelFn2:.*MaxWGSizeKernel2]](){{.*}} #[[MaxWGSizeAttr2:[0-9]+]] + Q.single_task(Props3, []() {}); + + return 0; +} + +// CHECK-IR: attributes #[[MaxWGSizeAttr0]] = { +// CHECK-IR-SAME: "sycl-max-work-group-size"="8" + +// CHECK-IR: attributes #[[MaxWGSizeAttr1]] = { +// CHECK-IR-SAME: "sycl-max-work-group-size"="8,4" + +// CHECK-IR: attributes #[[MaxWGSizeAttr2]] = { +// CHECK-IR-SAME: "sycl-max-work-group-size"="8,4,2" + +// CHECK-IR: !{ptr @[[MaxWGSizeKernelFn0]], !"kernel", i32 1} +// CHECK-IR: !{ptr @[[MaxWGSizeKernelFn1]], !"kernel", i32 1} +// CHECK-IR: !{ptr @[[MaxWGSizeKernelFn2]], !"kernel", i32 1} + +// CHECK-IR: !{ptr @[[MaxWGSizeKernelFn0]], !"maxntidx", i32 8} +// CHECK-IR-NOT: !{ptr @[[MaxWGSizeKernelFn0]], !"maxntidy", +// CHECK-IR-NOT: !{ptr @[[MaxWGSizeKernelFn0]], !"maxntidz", + +// CHECK-IR: !{ptr @[[MaxWGSizeKernelFn1]], !"maxntidx", i32 4} +// CHECK-IR: !{ptr @[[MaxWGSizeKernelFn1]], !"maxntidy", i32 8} +// CHECK-IR-NOT: !{ptr @[[MaxWGSizeKernelFn1]], !"maxntidz", + +// CHECK-IR: !{ptr @[[MaxWGSizeKernelFn2]], !"maxntidx", i32 2} +// CHECK-IR: !{ptr @[[MaxWGSizeKernelFn2]], !"maxntidy", i32 4} +// CHECK-IR: !{ptr @[[MaxWGSizeKernelFn2]], !"maxntidz", i32 8} diff --git a/sycl/test/extensions/properties/properties_kernel.cpp b/sycl/test/extensions/properties/properties_kernel.cpp index dfad5f9c638e9..0024cd4cb221c 100644 --- a/sycl/test/extensions/properties/properties_kernel.cpp +++ b/sycl/test/extensions/properties/properties_kernel.cpp @@ -59,6 +59,12 @@ int main() { static_assert( is_property_value)>::value); static_assert(is_property_value)>::value); + static_assert(is_property_value)>::value); + static_assert(is_property_value)>::value); + static_assert( + is_property_value)>::value); + static_assert( + is_property_value)>::value); static_assert( std::is_same_v)::key_t>); @@ -75,6 +81,16 @@ int main() { decltype(work_group_size_hint<13, 13, 13>)::key_t>); static_assert( std::is_same_v)::key_t>); + static_assert(std::is_same_v)::key_t>); + static_assert(std::is_same_v)::key_t>); + static_assert( + std::is_same_v)::key_t>); + static_assert( + std::is_same_v)::key_t>); static_assert(work_group_size<15>[0] == 15); static_assert(work_group_size<16, 17>[0] == 16); @@ -89,6 +105,13 @@ int main() { static_assert(work_group_size_hint<24, 25, 26>[1] == 25); static_assert(work_group_size_hint<24, 25, 26>[2] == 26); static_assert(sub_group_size<27>.value == 27); + static_assert(max_work_group_size<28>[0] == 28); + static_assert(max_work_group_size<28, 29>[0] == 28); + static_assert(max_work_group_size<28, 29>[1] == 29); + static_assert(max_work_group_size<28, 29, 30>[0] == 28); + static_assert(max_work_group_size<28, 29, 30>[1] == 29); + static_assert(max_work_group_size<28, 29, 30>[2] == 30); + static_assert(max_linear_work_group_size<28>.value == 28); static_assert(std::is_same_v)::value_t, std::integral_constant>);