diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 1a9aee6a3ebfc..4af37e4e9b4d8 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -1583,6 +1583,22 @@ def SYCLIntelMaxWorkGroupSize : InheritableAttr { let SupportsNonconformingLambdaSyntax = 1; } +def SYCLIntelMinWorkGroupsPerComputeUnit : InheritableAttr { + let Spellings = [CXX11<"intel", "min_work_groups_per_cu">]; + let Args = [ExprArgument<"Value">]; + let LangOpts = [SYCLIsDevice, SilentlyIgnoreSYCLIsHost]; + let Subjects = SubjectList<[Function], ErrorDiag>; + let Documentation = [SYCLIntelMinWorkGroupsPerComputeUnitAttrDocs]; +} + +def SYCLIntelMaxWorkGroupsPerMultiprocessor : InheritableAttr { + let Spellings = [CXX11<"intel", "max_work_groups_per_mp">]; + let Args = [ExprArgument<"Value">]; + let LangOpts = [SYCLIsDevice, SilentlyIgnoreSYCLIsHost]; + let Subjects = SubjectList<[Function], ErrorDiag>; + let Documentation = [SYCLIntelMaxWorkGroupsPerMultiprocessorDocs]; +} + def SYCLIntelMaxGlobalWorkDim : InheritableAttr { let Spellings = [CXX11<"intel", "max_global_work_dim">]; let Args = [ExprArgument<"Value">]; diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 0957da6019298..3be0445701ec4 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -3017,6 +3017,65 @@ In SYCL 2020 mode, the attribute is not propagated to the kernel. }]; } +def SYCLIntelMinWorkGroupsPerComputeUnitAttrDocs: Documentation { + let Category = DocCatFunction; + let Heading = "intel::min_work_groups_per_cu"; + let Content = [{ +Applies to a device function/lambda function. Indicates the desired minimum +number of resident work_groups per multiprocessor. It complies to the +.minnctapersm PTX directive. + +.. 
code-block:: c++ + + [[intel::min_work_groups_per_cu(2)]] void foo() {} + + class Foo { + public: + [[intel::min_work_groups_per_cu(2)]] void operator()() const {} + }; + + template + class Functor { + public: + [[intel::min_work_groups_per_cu(N)]] void operator()() const {} + }; + + template + [[intel::min_work_groups_per_cu(N)]] void func() {} + + }]; +} + +def SYCLIntelMaxWorkGroupsPerMultiprocessorDocs: Documentation { + let Category = DocCatFunction; + let Heading = "intel::max_work_groups_per_mp"; + let Content = [{ +Applies to a device function/lambda function. Indicates the desired maximum +number work_groups per cluster with which the application will ever launch. It +complies to the .maxclusterrank PTX directive. Note, that the feature requires +SM_90 or higher. + +.. code-block:: c++ + + [[intel::max_work_groups_per_mp(2)]] void foo() {} + + class Foo { + public: + [[intel::max_work_groups_per_mp(2)]] void operator()() const {} + }; + + template + class Functor { + public: + [[intel::max_work_groups_per_mp(N)]] void operator()() const {} + }; + + template + [[intel::max_work_groups_per_mp(N)]] void func() {} + + }]; +} + def SYCLIntelMaxGlobalWorkDimAttrDocs : Documentation { let Category = DocCatFunction; let Heading = "intel::max_global_work_dim"; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index a4328d78c0ccf..6484483134686 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -12002,9 +12002,12 @@ def warn_sycl_kernel_return_type : Warning< def err_sycl_special_type_num_init_method : Error< "types with 'sycl_special_class' attribute must have one and only one '__init' " "method defined">; +def warn_launch_bounds_is_cuda_specific : Warning< + "%0 attribute ignored, only applicable when targeting Nvidia devices">, + InGroup; def warn_cuda_maxclusterrank_sm_90 : Warning< - "maxclusterrank requires sm_90 or higher, CUDA 
arch provided: %0, ignoring " + "'maxclusterrank' requires sm_90 or higher, CUDA arch provided: %0, ignoring " "%1 attribute">, InGroup; def err_bit_int_bad_size : Error<"%select{signed|unsigned}0 _BitInt must " diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 5c749d2e0c165..07929a8006f39 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -11432,6 +11432,16 @@ class Sema final { SYCLIntelMaxGlobalWorkDimAttr * MergeSYCLIntelMaxGlobalWorkDimAttr(Decl *D, const SYCLIntelMaxGlobalWorkDimAttr &A); + void AddSYCLIntelMinWorkGroupsPerComputeUnitAttr( + Decl *D, const AttributeCommonInfo &CI, Expr *E); + SYCLIntelMinWorkGroupsPerComputeUnitAttr * + MergeSYCLIntelMinWorkGroupsPerComputeUnitAttr( + Decl *D, const SYCLIntelMinWorkGroupsPerComputeUnitAttr &A); + void AddSYCLIntelMaxWorkGroupsPerMultiprocessorAttr( + Decl *D, const AttributeCommonInfo &CI, Expr *E); + SYCLIntelMaxWorkGroupsPerMultiprocessorAttr * + MergeSYCLIntelMaxWorkGroupsPerMultiprocessorAttr( + Decl *D, const SYCLIntelMaxWorkGroupsPerMultiprocessorAttr &A); void AddSYCLIntelBankWidthAttr(Decl *D, const AttributeCommonInfo &CI, Expr *E); SYCLIntelBankWidthAttr * diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index 3fe3a66ae03d7..647ae6206d8e3 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -758,6 +758,24 @@ void CodeGenFunction::EmitKernelMetadata(const FunctionDecl *FD, llvm::MDNode::get(Context, AttrMDArgs)); } + auto attrAsMDArg = [&](Expr *E) { + const auto *CE = cast(E); + std::optional ArgVal = CE->getResultAsAPSInt(); + return llvm::ConstantAsMetadata::get( + Builder.getInt32(ArgVal->getSExtValue())); + }; + + if (const auto *A = FD->getAttr()) { + Fn->setMetadata("min_work_groups_per_cu", + llvm::MDNode::get(Context, {attrAsMDArg(A->getValue())})); + } + + if (const auto *A = + FD->getAttr()) { + 
Fn->setMetadata("max_work_groups_per_mp", + llvm::MDNode::get(Context, {attrAsMDArg(A->getValue())})); + } + if (const SYCLIntelMaxWorkGroupSizeAttr *A = FD->getAttr()) { diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp index 820c99a7b3410..bbebb560a67b1 100644 --- a/clang/lib/CodeGen/Targets/NVPTX.cpp +++ b/clang/lib/CodeGen/Targets/NVPTX.cpp @@ -245,6 +245,31 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes( // And kernel functions are not subject to inlining F->addFnAttr(llvm::Attribute::NoInline); } + if (const auto *MWGS = FD->getAttr()) { + auto MaxThreads = (*MWGS->getZDimVal()).getExtValue() * + (*MWGS->getYDimVal()).getExtValue() * + (*MWGS->getXDimVal()).getExtValue(); + if (MaxThreads > 0) + addNVVMMetadata(F, "maxntidx", MaxThreads); + + auto attrValue = [&](Expr *E) { + const auto *CE = cast(E); + std::optional Val = CE->getResultAsAPSInt(); + return Val->getZExtValue(); + }; + + if (const auto *MWGPCU = + FD->getAttr()) { + // The value is guaranteed to be > 0, pass it to the metadata. + addNVVMMetadata(F, "minnctapersm", attrValue(MWGPCU->getValue())); + + if (const auto *MWGPMP = + FD->getAttr()) { + // The value is guaranteed to be > 0, pass it to the metadata. + addNVVMMetadata(F, "maxclusterrank", attrValue(MWGPMP->getValue())); + } + } + } } // Perform special handling in CUDA mode. 
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 47632f544a47d..c74abb5174e46 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -2999,6 +2999,12 @@ static bool mergeDeclAttribute(Sema &S, NamedDecl *D, NewAttr = S.MergeSYCLIntelInitiationIntervalAttr(D, *A); else if (const auto *A = dyn_cast(Attr)) NewAttr = S.MergeSYCLWorkGroupSizeHintAttr(D, *A); + else if (const auto *A = + dyn_cast(Attr)) + NewAttr = S.MergeSYCLIntelMinWorkGroupsPerComputeUnitAttr(D, *A); + else if (const auto *A = + dyn_cast(Attr)) + NewAttr = S.MergeSYCLIntelMaxWorkGroupsPerMultiprocessorAttr(D, *A); else if (const auto *A = dyn_cast(Attr)) NewAttr = S.MergeSYCLIntelMaxGlobalWorkDimAttr(D, *A); else if (const auto *BTFA = dyn_cast(Attr)) diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 22fc155343b1d..0b123a3ca878d 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -3744,6 +3744,19 @@ static void handleSYCLIntelMaxWorkGroupSize(Sema &S, Decl *D, AL.getArgAsExpr(1), AL.getArgAsExpr(2)); } +// Handles min_work_groups_per_cu attribute. +static void handleSYCLIntelMinWorkGroupsPerComputeUnit(Sema &S, Decl *D, + const ParsedAttr &AL) { + S.AddSYCLIntelMinWorkGroupsPerComputeUnitAttr(D, AL, AL.getArgAsExpr(0)); +} + +// Handles max_work_groups_per_mp attribute. +static void +handleSYCLIntelMaxWorkGroupsPerMultiprocessor(Sema &S, Decl *D, + const ParsedAttr &AL) { + S.AddSYCLIntelMaxWorkGroupsPerMultiprocessorAttr(D, AL, AL.getArgAsExpr(0)); +} + // Handles reqd_work_group_size. // If the 'reqd_work_group_size' attribute is specified on a declaration along // with 'num_simd_work_items' attribute, the required work group size specified @@ -4434,6 +4447,127 @@ void Sema::AddSYCLIntelMaxGlobalWorkDimAttr(Decl *D, D->addAttr(::new (Context) SYCLIntelMaxGlobalWorkDimAttr(Context, CI, E)); } +// Check that the value is a non-negative integer constant that can fit in +// 32-bits. 
Issue correct error message and return false on failure. +bool static check32BitInt(const Expr *E, Sema &S, llvm::APSInt &I, + const AttributeCommonInfo &CI) { + if (!I.isIntN(32)) { + S.Diag(E->getExprLoc(), diag::err_ice_too_large) + << llvm::toString(I, 10, false) << 32 << /* Unsigned */ 1; + return false; + } + + if (I.isSigned() && I.isNegative()) { + S.Diag(E->getExprLoc(), diag::err_attribute_requires_positive_integer) + << CI << /* Non-negative */ 1; + return false; + } + + return true; +} + +void Sema::AddSYCLIntelMinWorkGroupsPerComputeUnitAttr( + Decl *D, const AttributeCommonInfo &CI, Expr *E) { + if (Context.getLangOpts().SYCLIsDevice && + !Context.getTargetInfo().getTriple().isNVPTX()) { + Diag(E->getBeginLoc(), diag::warn_launch_bounds_is_cuda_specific) + << CI << E->getSourceRange(); + return; + } + if (!E->isValueDependent()) { + // Validate that we have an integer constant expression and then store the + // converted constant expression into the semantic attribute so that we + // don't have to evaluate it again later. + llvm::APSInt ArgVal; + ExprResult Res = VerifyIntegerConstantExpression(E, &ArgVal); + if (Res.isInvalid()) + return; + if (!check32BitInt(E, *this, ArgVal, CI)) + return; + E = Res.get(); + + // Check to see if there's a duplicate attribute with different values + // already applied to the declaration. + if (const auto *DeclAttr = + D->getAttr()) { + // If the other attribute argument is instantiation dependent, we won't + // have converted it to a constant expression yet and thus we test + // whether this is a null pointer. + if (const auto *DeclExpr = dyn_cast(DeclAttr->getValue())) { + if (ArgVal != DeclExpr->getResultAsAPSInt()) { + Diag(CI.getLoc(), diag::warn_duplicate_attribute) << CI; + Diag(DeclAttr->getLoc(), diag::note_previous_attribute); + } + // Drop the duplicate attribute. 
+ return; + } + } + } + + D->addAttr(::new (Context) + SYCLIntelMinWorkGroupsPerComputeUnitAttr(Context, CI, E)); +} + +// Helper to get CudaArch. +static CudaArch getCudaArch(const TargetInfo &TI) { + if (!TI.getTriple().isNVPTX()) + llvm_unreachable("getCudaArch is only valid for NVPTX triple"); + auto &TO = TI.getTargetOpts(); + return StringToCudaArch(TO.CPU); +} + +void Sema::AddSYCLIntelMaxWorkGroupsPerMultiprocessorAttr( + Decl *D, const AttributeCommonInfo &CI, Expr *E) { + auto &TI = Context.getTargetInfo(); + if (Context.getLangOpts().SYCLIsDevice) { + if (!TI.getTriple().isNVPTX()) { + Diag(E->getBeginLoc(), diag::warn_launch_bounds_is_cuda_specific) + << CI << E->getSourceRange(); + return; + } + + // Feature '.maxclusterrank' requires .target sm_90 or higher. + auto SM = getCudaArch(TI); + if (SM == CudaArch::UNKNOWN || SM < CudaArch::SM_90) { + Diag(E->getBeginLoc(), diag::warn_cuda_maxclusterrank_sm_90) + << CudaArchToString(SM) << CI << E->getSourceRange(); + return; + } + } + if (!E->isValueDependent()) { + // Validate that we have an integer constant expression and then store the + // converted constant expression into the semantic attribute so that we + // don't have to evaluate it again later. + llvm::APSInt ArgVal; + ExprResult Res = VerifyIntegerConstantExpression(E, &ArgVal); + if (Res.isInvalid()) + return; + if (!check32BitInt(E, *this, ArgVal, CI)) + return; + E = Res.get(); + + // Check to see if there's a duplicate attribute with different values + // already applied to the declaration. + if (const auto *DeclAttr = + D->getAttr()) { + // If the other attribute argument is instantiation dependent, we won't + // have converted it to a constant expression yet and thus we test + // whether this is a null pointer. 
+ if (const auto *DeclExpr = dyn_cast(DeclAttr->getValue())) { + if (ArgVal != DeclExpr->getResultAsAPSInt()) { + Diag(CI.getLoc(), diag::warn_duplicate_attribute) << CI; + Diag(DeclAttr->getLoc(), diag::note_previous_attribute); + } + // Drop the duplicate attribute. + return; + } + } + } + + D->addAttr(::new (Context) + SYCLIntelMaxWorkGroupsPerMultiprocessorAttr(Context, CI, E)); +} + SYCLIntelMaxGlobalWorkDimAttr *Sema::MergeSYCLIntelMaxGlobalWorkDimAttr( Decl *D, const SYCLIntelMaxGlobalWorkDimAttr &A) { // Check to see if there's a duplicate attribute with different values @@ -4473,6 +4607,52 @@ static void handleSYCLIntelMaxGlobalWorkDimAttr(Sema &S, Decl *D, S.AddSYCLIntelMaxGlobalWorkDimAttr(D, AL, E); } +SYCLIntelMinWorkGroupsPerComputeUnitAttr * +Sema::MergeSYCLIntelMinWorkGroupsPerComputeUnitAttr( + Decl *D, const SYCLIntelMinWorkGroupsPerComputeUnitAttr &A) { + // Check to see if there's a duplicate attribute with different values + // already applied to the declaration. + if (const auto *DeclAttr = + D->getAttr()) { + if (const auto *DeclExpr = dyn_cast(DeclAttr->getValue())) { + if (const auto *MergeExpr = dyn_cast(A.getValue())) { + if (DeclExpr->getResultAsAPSInt() != MergeExpr->getResultAsAPSInt()) { + Diag(DeclAttr->getLoc(), diag::warn_duplicate_attribute) << &A; + Diag(A.getLoc(), diag::note_previous_attribute); + } + // Do not add a duplicate attribute. + return nullptr; + } + } + } + + return ::new (Context) + SYCLIntelMinWorkGroupsPerComputeUnitAttr(Context, A, A.getValue()); +} + +SYCLIntelMaxWorkGroupsPerMultiprocessorAttr * +Sema::MergeSYCLIntelMaxWorkGroupsPerMultiprocessorAttr( + Decl *D, const SYCLIntelMaxWorkGroupsPerMultiprocessorAttr &A) { + // Check to see if there's a duplicate attribute with different values + // already applied to the declaration. 
+ if (const auto *DeclAttr = + D->getAttr()) { + if (const auto *DeclExpr = dyn_cast(DeclAttr->getValue())) { + if (const auto *MergeExpr = dyn_cast(A.getValue())) { + if (DeclExpr->getResultAsAPSInt() != MergeExpr->getResultAsAPSInt()) { + Diag(DeclAttr->getLoc(), diag::warn_duplicate_attribute) << &A; + Diag(A.getLoc(), diag::note_previous_attribute); + } + // Do not add a duplicate attribute. + return nullptr; + } + } + } + + return ::new (Context) + SYCLIntelMaxWorkGroupsPerMultiprocessorAttr(Context, A, A.getValue()); +} + // Handles [[intel::loop_fuse]] and [[intel::loop_fuse_independent]]. void Sema::AddSYCLIntelLoopFuseAttr(Decl *D, const AttributeCommonInfo &CI, Expr *E) { @@ -7045,14 +7225,6 @@ bool Sema::CheckRegparmAttr(const ParsedAttr &AL, unsigned &numParams) { return false; } -// Helper to get CudaArch. -static CudaArch getCudaArch(const TargetInfo &TI) { - if (!TI.getTriple().isNVPTX()) - llvm_unreachable("getCudaArch is only valid for NVPTX triple"); - auto &TO = TI.getTargetOpts(); - return StringToCudaArch(TO.CPU); -} - // Checks whether an argument of launch_bounds attribute is // acceptable, performs implicit conversion to Rvalue, and returns // non-nullptr Expr result on success. 
Otherwise, it returns nullptr @@ -12010,6 +12182,12 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL, case ParsedAttr::AT_SYCLIntelMaxWorkGroupSize: handleSYCLIntelMaxWorkGroupSize(S, D, AL); break; + case ParsedAttr::AT_SYCLIntelMinWorkGroupsPerComputeUnit: + handleSYCLIntelMinWorkGroupsPerComputeUnit(S, D, AL); + break; + case ParsedAttr::AT_SYCLIntelMaxWorkGroupsPerMultiprocessor: + handleSYCLIntelMaxWorkGroupsPerMultiprocessor(S, D, AL); + break; case ParsedAttr::AT_IntelReqdSubGroupSize: handleIntelReqdSubGroupSize(S, D, AL); break; @@ -12537,6 +12715,14 @@ void Sema::ProcessDeclAttributeList( } else if (const auto *A = D->getAttr()) { Diag(D->getLocation(), diag::err_opencl_kernel_attr) << A; D->setInvalidDecl(); + } else if (const auto *A = + D->getAttr()) { + Diag(D->getLocation(), diag::err_opencl_kernel_attr) << A; + D->setInvalidDecl(); + } else if (const auto *A = + D->getAttr()) { + Diag(D->getLocation(), diag::err_opencl_kernel_attr) << A; + D->setInvalidDecl(); } else if (const auto *A = D->getAttr()) { Diag(D->getLocation(), diag::err_opencl_kernel_attr) << A; D->setInvalidDecl(); diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp index 15182d369143f..e2a8d10ded1ab 100644 --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -538,6 +538,8 @@ static void collectSYCLAttributes(Sema &S, FunctionDecl *FD, SYCLIntelKernelArgsRestrictAttr, SYCLIntelNumSimdWorkItemsAttr, SYCLIntelSchedulerTargetFmaxMhzAttr, SYCLIntelMaxWorkGroupSizeAttr, SYCLIntelMaxGlobalWorkDimAttr, + SYCLIntelMinWorkGroupsPerComputeUnitAttr, + SYCLIntelMaxWorkGroupsPerMultiprocessorAttr, SYCLIntelNoGlobalWorkOffsetAttr, SYCLSimdAttr>(A); }); } @@ -4489,6 +4491,8 @@ static void PropagateAndDiagnoseDeviceAttr( case attr::Kind::SYCLIntelNumSimdWorkItems: case attr::Kind::SYCLIntelSchedulerTargetFmaxMhz: case attr::Kind::SYCLIntelMaxGlobalWorkDim: + case attr::Kind::SYCLIntelMinWorkGroupsPerComputeUnit: + case 
attr::Kind::SYCLIntelMaxWorkGroupsPerMultiprocessor: case attr::Kind::SYCLIntelNoGlobalWorkOffset: case attr::Kind::SYCLIntelLoopFuse: case attr::Kind::SYCLIntelMaxConcurrency: diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index 66e3dc0612525..7d44576a6b7de 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -754,6 +754,28 @@ static void instantiateSYCLIntelMaxGlobalWorkDimAttr( S.AddSYCLIntelMaxGlobalWorkDimAttr(New, *A, Result.getAs()); } +static void instantiateSYCLIntelMinWorkGroupsPerComputeUnitAttr( + Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs, + const SYCLIntelMinWorkGroupsPerComputeUnitAttr *A, Decl *New) { + EnterExpressionEvaluationContext Unevaluated( + S, Sema::ExpressionEvaluationContext::ConstantEvaluated); + ExprResult Result = S.SubstExpr(A->getValue(), TemplateArgs); + if (!Result.isInvalid()) + S.AddSYCLIntelMinWorkGroupsPerComputeUnitAttr(New, *A, + Result.getAs()); +} + +static void instantiateSYCLIntelMaxWorkGroupsPerMultiprocessorAttr( + Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs, + const SYCLIntelMaxWorkGroupsPerMultiprocessorAttr *A, Decl *New) { + EnterExpressionEvaluationContext Unevaluated( + S, Sema::ExpressionEvaluationContext::ConstantEvaluated); + ExprResult Result = S.SubstExpr(A->getValue(), TemplateArgs); + if (!Result.isInvalid()) + S.AddSYCLIntelMaxWorkGroupsPerMultiprocessorAttr(New, *A, + Result.getAs()); +} + static void instantiateSYCLIntelMaxConcurrencyAttr( Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs, const SYCLIntelMaxConcurrencyAttr *A, Decl *New) { @@ -1145,6 +1167,18 @@ void Sema::InstantiateAttrs(const MultiLevelTemplateArgumentList &TemplateArgs, SYCLIntelMaxGlobalWorkDim, New); continue; } + if (const auto *SYCLIntelMinWorkGroupsPerComputeUnit = + dyn_cast(TmplAttr)) { + instantiateSYCLIntelMinWorkGroupsPerComputeUnitAttr( + *this, 
TemplateArgs, SYCLIntelMinWorkGroupsPerComputeUnit, New); + continue; + } + if (const auto *SYCLIntelMaxWorkGroupsPerMultiprocessor = + dyn_cast(TmplAttr)) { + instantiateSYCLIntelMaxWorkGroupsPerMultiprocessorAttr( + *this, TemplateArgs, SYCLIntelMaxWorkGroupsPerMultiprocessor, New); + continue; + } if (const auto *SYCLIntelLoopFuse = dyn_cast(TmplAttr)) { instantiateSYCLIntelLoopFuseAttr(*this, TemplateArgs, SYCLIntelLoopFuse, diff --git a/clang/test/CodeGenSYCL/launch_bounds_nvptx.cpp b/clang/test/CodeGenSYCL/launch_bounds_nvptx.cpp new file mode 100644 index 0000000000000..13c6d1ce4e5f0 --- /dev/null +++ b/clang/test/CodeGenSYCL/launch_bounds_nvptx.cpp @@ -0,0 +1,105 @@ +// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -sycl-std=2017 -triple nvptx-unknown-unknown -target-cpu sm_90 -disable-llvm-passes -S -emit-llvm -o - %s | FileCheck %s + +// Test correct handling of maximum work group size, minimum work groups per +// compute unit and maximum work groups per multi-processor attributes, that +// correspond to CUDA's launch bounds. Expect max_work_group_size, +// min_work_groups_per_cu and max_work_groups_per_mp that are mapped to +// maxntidx, minnctapersm, maxclusterrank PTX directives respectively. + +#include "sycl.hpp" + +using namespace sycl; +queue q; + +class Foo { +public: + [[intel::max_work_group_size(8, 8, 8), intel::min_work_groups_per_cu(2), + intel::max_work_groups_per_mp(4)]] void + operator()() const {} +}; + +template class Functor { +public: + [[intel::max_work_group_size(N, 8, 8), intel::min_work_groups_per_cu(N), + intel::max_work_groups_per_mp(N)]] void + operator()() const {} +}; + +template +[[intel::max_work_group_size(N, 8, 8), intel::min_work_groups_per_cu(N), + intel::max_work_groups_per_mp(N)]] void +zoo() {} + +[[intel::max_work_group_size(8, 8, 8), intel::min_work_groups_per_cu(2), + intel::max_work_groups_per_mp(4)]] void +bar() {} + +int main() { + q.submit([&](handler &h) { + // Test attribute argument size. 
+ Foo boo; + h.single_task(boo); + + // Test attribute is applied on lambda. + h.single_task( + [] [[intel::max_work_group_size(8, 8, 8), + intel::min_work_groups_per_cu(2), + intel::max_work_groups_per_mp(4)]] () {}); + + // Test class template argument. + Functor<6> f; + h.single_task(f); + + // Test attribute is propagated. + h.single_task([]() { bar(); }); + + // Test function template argument. + h.single_task([]() { zoo<16>(); }); + }); + return 0; +} + +// CHECK: define dso_local void @{{.*}}kernel_name1() #0 {{.*}} !min_work_groups_per_cu ![[MWGPC:[0-9]+]] !max_work_groups_per_mp ![[MWGPM:[0-9]+]] !max_work_group_size ![[MWGS:[0-9]+]] +// CHECK: define dso_local void @{{.*}}kernel_name2() #0 {{.*}} !min_work_groups_per_cu ![[MWGPC:[0-9]+]] !max_work_groups_per_mp ![[MWGPM:[0-9]+]] !max_work_group_size ![[MWGS:[0-9]+]] +// CHECK: define dso_local void @{{.*}}kernel_name3() #0 {{.*}} !min_work_groups_per_cu ![[MWGPC_MWGPM:[0-9]+]] !max_work_groups_per_mp ![[MWGPC_MWGPM]] !max_work_group_size ![[MWGS_2:[0-9]+]] +// CHECK: define dso_local void @{{.*}}kernel_name4() #0 {{.*}} !min_work_groups_per_cu ![[MWGPC:[0-9]+]] !max_work_groups_per_mp ![[MWGPM:[0-9]+]] !max_work_group_size ![[MWGS:[0-9]+]] +// CHECK: define dso_local void @{{.*}}kernel_name5() #0 {{.*}} !min_work_groups_per_cu ![[MWGPC_MWGPM_2:[0-9]+]] !max_work_groups_per_mp ![[MWGPC_MWGPM_2]] !max_work_group_size ![[MWGS_3:[0-9]+]] + +// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxntidx", i32 512} +// CHECK: {{.*}}@{{.*}}kernel_name1, !"minnctapersm", i32 2} +// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxclusterrank", i32 4} +// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxntidx", i32 512} +// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"minnctapersm", i32 2} +// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxclusterrank", i32 4} +// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxntidx", i32 512} +// CHECK: {{.*}}@{{.*}}kernel_name2, !"minnctapersm", i32 2} +// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxclusterrank", i32 4} +// CHECK: 
{{.*}}@{{.*}}main{{.*}}, !"maxntidx", i32 512} +// CHECK: {{.*}}@{{.*}}main{{.*}}, !"minnctapersm", i32 2} +// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxclusterrank", i32 4} +// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxntidx", i32 384} +// CHECK: {{.*}}@{{.*}}kernel_name3, !"minnctapersm", i32 6} +// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxclusterrank", i32 6} +// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxntidx", i32 384} +// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"minnctapersm", i32 6} +// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxclusterrank", i32 6} +// CHECK: {{.*}}@{{.*}}kernel_name4, !"maxntidx", i32 512} +// CHECK: {{.*}}@{{.*}}kernel_name4, !"minnctapersm", i32 2} +// CHECK: {{.*}}@{{.*}}kernel_name4, !"maxclusterrank", i32 4} +// CHECK: {{.*}}@{{.*}}bar{{.*}}, !"maxntidx", i32 512} +// CHECK: {{.*}}@{{.*}}bar{{.*}}, !"minnctapersm", i32 2} +// CHECK: {{.*}}@{{.*}}bar{{.*}}, !"maxclusterrank", i32 4} +// CHECK: {{.*}}@{{.*}}kernel_name5, !"maxntidx", i32 1024} +// CHECK: {{.*}}@{{.*}}kernel_name5, !"minnctapersm", i32 16} +// CHECK: {{.*}}@{{.*}}kernel_name5, !"maxclusterrank", i32 16} +// CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"maxntidx", i32 1024} +// CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"minnctapersm", i32 16} +// CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"maxclusterrank", i32 16} + +// CHECK: ![[MWGPC]] = !{i32 2} +// CHECK: ![[MWGPM]] = !{i32 4} +// CHECK: ![[MWGS]] = !{i32 8, i32 8, i32 8} +// CHECK: ![[MWGPC_MWGPM]] = !{i32 6} +// CHECK: ![[MWGS_2]] = !{i32 8, i32 8, i32 6} +// CHECK: ![[MWGPC_MWGPM_2]] = !{i32 16} +// CHECK: ![[MWGS_3]] = !{i32 8, i32 8, i32 16} diff --git a/clang/test/CodeGenSYCL/lb_sm_90.cpp b/clang/test/CodeGenSYCL/lb_sm_90.cpp new file mode 100644 index 0000000000000..5909edde1479f --- /dev/null +++ b/clang/test/CodeGenSYCL/lb_sm_90.cpp @@ -0,0 +1,46 @@ +// RUN: %clang_cc1 -internal-isystem %S/Inputs -triple nvptx-unknown-unknown -target-cpu sm_90 -fsycl-is-device -Wno-c++23-extensions -S -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK-IR + +// Maximum work 
groups per multi-processor, mapped to maxclusterrank PTX +// directive, is an SM_90 feature, make sure that correct metadata is generated. + +#include "sycl.hpp" + +template class Functor { +public: + [[intel::max_work_group_size(1, 1, N1), intel::min_work_groups_per_cu(N2), + intel::max_work_groups_per_mp(N3)]] void + operator()() const {} +}; + +int main() { + sycl::queue Q{}; + + sycl::range<1> Gws(32); + + Q.submit([&](sycl::handler &cgh) { + cgh.parallel_for(Gws, [=] [[intel::max_work_group_size(1, 1, 256), + intel::min_work_groups_per_cu(2), + intel::max_work_groups_per_mp(4)]] ( + sycl::id<1>) { volatile int A = 42; }); + }); + // CHECK-IR: !min_work_groups_per_cu [[MWGPCU:![0-9]+]] + // CHECK-IR: !max_work_groups_per_mp [[MWGPMP:![0-9]+]] + // CHECK-IR: !max_work_group_size [[MWGS:![0-9]+]] + + Q.submit([&](sycl::handler &cgh) { + cgh.single_task(Functor<512, 8, 16>{}); + }); + // CHECK-IR: !min_work_groups_per_cu [[MWGPCU_F:![0-9]+]] + // CHECK-IR: !max_work_groups_per_mp [[MWGPMP_F:![0-9]+]] + // CHECK-IR: !max_work_group_size [[MWGS_F:![0-9]+]] + + // CHECK-IR: [[MWGPCU]] = !{i32 2} + // CHECK-IR: [[MWGPMP]] = !{i32 4} + // CHECK-IR: [[MWGS]] = !{i32 256, i32 1, i32 1} + + // CHECK-IR: [[MWGPCU_F]] = !{i32 8} + // CHECK-IR: [[MWGPMP_F]] = !{i32 16} + // CHECK-IR: [[MWGS_F]] = !{i32 512, i32 1, i32 1} + + return 0; +} diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test index a79c6157e7ffa..b160de34da345 100644 --- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test +++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test @@ -181,6 +181,8 @@ // CHECK-NEXT: SYCLIntelMaxConcurrency (SubjectMatchRule_function) // CHECK-NEXT: SYCLIntelMaxGlobalWorkDim (SubjectMatchRule_function) // CHECK-NEXT: SYCLIntelMaxWorkGroupSize (SubjectMatchRule_function) +// CHECK-NEXT: SYCLIntelMaxWorkGroupsPerMultiprocessor (SubjectMatchRule_function) +// CHECK-NEXT: 
SYCLIntelMinWorkGroupsPerComputeUnit (SubjectMatchRule_function) // CHECK-NEXT: SYCLIntelNoGlobalWorkOffset (SubjectMatchRule_function) // CHECK-NEXT: SYCLIntelNumSimdWorkItems (SubjectMatchRule_function) // CHECK-NEXT: SYCLIntelPipeIO (SubjectMatchRule_variable) diff --git a/clang/test/SemaCUDA/launch_bounds.cu b/clang/test/SemaCUDA/launch_bounds.cu index 045f475692959..aa83c3560e71d 100644 --- a/clang/test/SemaCUDA/launch_bounds.cu +++ b/clang/test/SemaCUDA/launch_bounds.cu @@ -11,7 +11,7 @@ __launch_bounds__(0x10000000000000000) void TestWayTooBigArg(void); // expected- __launch_bounds__(-128, 7) void TestNegArg1(void); // expected-warning {{'launch_bounds' attribute parameter 0 is negative and will be ignored}} __launch_bounds__(128, -7) void TestNegArg2(void); // expected-warning {{'launch_bounds' attribute parameter 1 is negative and will be ignored}} -__launch_bounds__(128, 2, -8) void TestNegArg2(void); // expected-warning {{maxclusterrank requires sm_90 or higher, CUDA arch provided: sm_75, ignoring 'launch_bounds' attribute}} +__launch_bounds__(128, 2, -8) void TestNegArg2(void); // expected-warning {{'maxclusterrank' requires sm_90 or higher, CUDA arch provided: sm_75, ignoring 'launch_bounds' attribute}} __launch_bounds__(1, 2, 3, 4) void Test4Args(void); // expected-error {{'launch_bounds' attribute takes no more than 3 arguments}} __launch_bounds__() void TestNoArgs(void); // expected-error {{'launch_bounds' attribute takes at least 1 argument}} @@ -49,4 +49,4 @@ __launch_bounds__(Args) void TestTemplateVariadicArgs(void) {} // expected-error template __launch_bounds__(1, Args) void TestTemplateVariadicArgs2(void) {} // expected-error {{expression contains unexpanded parameter pack 'Args'}} -__launch_bounds__(1, 2, 3) void Test3Args(void); // expected-warning {{maxclusterrank requires sm_90 or higher, CUDA arch provided: sm_75, ignoring 'launch_bounds' attribute}} +__launch_bounds__(1, 2, 3) void Test3Args(void); // expected-warning {{'maxclusterrank' 
requires sm_90 or higher, CUDA arch provided: sm_75, ignoring 'launch_bounds' attribute}} diff --git a/clang/test/SemaSYCL/lb_sm_70.cpp b/clang/test/SemaSYCL/lb_sm_70.cpp new file mode 100644 index 0000000000000..02a7ceee0b221 --- /dev/null +++ b/clang/test/SemaSYCL/lb_sm_70.cpp @@ -0,0 +1,61 @@ +// RUN: %clang_cc1 -internal-isystem %S/Inputs -triple nvptx-unknown-unknown -target-cpu sm_70 -fsycl-is-device -Wno-c++23-extensions %s -o -fsyntax-only -verify %s + +// Maximum work groups per multi-processor, mapped to maxclusterrank PTX +// directive, is an SM_90 feature, make sure that correct warning is issued on +// architectures lower than that. Furthermore, warn/error incorrect values +// specified for max_work_groups_per_mp and min_work_groups_per_cu. + +#include "sycl.hpp" + +template class Functor { +public: + // expected-warning@+2 {{'maxclusterrank' requires sm_90 or higher, CUDA arch provided: sm_70, ignoring 'max_work_groups_per_mp' attribute}} + [[intel::max_work_group_size(1, 1, N1), intel::min_work_groups_per_cu(N2), + intel::max_work_groups_per_mp(N3)]] void + operator()() const {} +}; + +int main() { + sycl::queue Q{}; + + Q.submit([&](sycl::handler &cgh) { + // expected-warning@+4 {{'maxclusterrank' requires sm_90 or higher, CUDA arch provided: sm_70, ignoring 'max_work_groups_per_mp' attribute}} + cgh.single_task( + [=] [[intel::max_work_group_size(1, 1, 256), + intel::min_work_groups_per_cu(2), + intel::max_work_groups_per_mp(4)]] () { volatile int A = 42; }); + + constexpr float A = 2.0; + // expected-warning@+5{{'maxclusterrank' requires sm_90 or higher, CUDA arch provided: sm_70, ignoring 'max_work_groups_per_mp' attribute}} + // expected-error@+3 {{integral constant expression must have integral or unscoped enumeration type, not 'float'}} + cgh.single_task( + [=] [[intel::max_work_group_size(1, 1, 256), + intel::min_work_groups_per_cu(A), + intel::max_work_groups_per_mp(4)]] () { volatile int A = 42; }); + + // expected-error@+4 {{expression is 
not an integral constant expression}} + // expected-note@+3 {{value 2147483648 is outside the range of representable values of type 'int'}} + cgh.single_task( + [=] [[intel::max_work_group_size(1, 1, 256), + intel::min_work_groups_per_cu(2147483647 + 1)]] () { + volatile int A = 42; + }); + + // expected-warning@+5 {{attribute 'min_work_groups_per_cu' is already applied with different arguments}} + // expected-note@+3 {{previous attribute is here}} + cgh.single_task( + [=] [[intel::max_work_group_size(1, 1, 256), + intel::min_work_groups_per_cu(4), + intel::min_work_groups_per_cu(8)]] () { volatile int A = 42; }); + + // expected-error@+2 {{'min_work_groups_per_cu' attribute requires a non-negative integral compile time constant expression}} + cgh.single_task( + [=] [[intel::min_work_groups_per_cu(-8)]] () { volatile int A = 42; }); + }); + + Q.submit([&](sycl::handler &cgh) { + cgh.single_task(Functor<512, 8, 16>{}); + }); + + return 0; +} diff --git a/clang/test/SemaSYCL/lb_sm_90.cpp b/clang/test/SemaSYCL/lb_sm_90.cpp new file mode 100644 index 0000000000000..9d2f61571565d --- /dev/null +++ b/clang/test/SemaSYCL/lb_sm_90.cpp @@ -0,0 +1,31 @@ +// RUN: %clang_cc1 -internal-isystem %S/Inputs %s -triple nvptx64-nvidia-cuda -target-cpu sm_90 -fsycl-is-device -fsyntax-only -Wno-c++23-extensions -verify +// expected-no-diagnostics + +// Maximum work groups per multi-processor, mapped to maxclusterrank PTX +// directive, is an SM_90 feature, make sure that no warnings/errors are issued. 
+ +#include "sycl.hpp" + +template <int N1, int N2, int N3> class Functor { +public: + [[intel::max_work_group_size(1, 1, N1), intel::min_work_groups_per_cu(N2), + intel::max_work_groups_per_mp(N3)]] void + operator()() const {} +}; + +int main() { + sycl::queue Q{}; + + Q.submit([&](sycl::handler &cgh) { + cgh.single_task( [=] [[intel::max_work_group_size(1, 1, 256), + intel::min_work_groups_per_cu(2), + intel::max_work_groups_per_mp(4)]] ( +) { volatile int A = 42; }); + }); + + Q.submit([&](sycl::handler &cgh) { + cgh.single_task(Functor<512, 8, 16>{}); + }); + + return 0; +} diff --git a/clang/test/SemaSYCL/lb_sm_90_ast.cpp b/clang/test/SemaSYCL/lb_sm_90_ast.cpp new file mode 100644 index 0000000000000..2493445d6b985 --- /dev/null +++ b/clang/test/SemaSYCL/lb_sm_90_ast.cpp @@ -0,0 +1,176 @@ +// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -sycl-std=2017 -ast-dump -triple nvptx-unknown-unknown -target-cpu sm_90 -Wno-c++23-extensions %s | FileCheck %s + +// Tests for AST of Intel max_work_group_size, min_work_groups_per_cu and +// max_work_groups_per_mp attribute.
+ +#include "sycl.hpp" + +sycl::queue deviceQueue; + +// CHECK: FunctionDecl {{.*}} func1 'void ()' +// CHECK-NEXT: CompoundStmt +// CHECK-NEXT: SYCLIntelMaxWorkGroupSizeAttr {{.*}} +// CHECK-NEXT: ConstantExpr{{.*}}'int' +// CHECK-NEXT: value: Int 8 +// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 +// CHECK-NEXT: ConstantExpr{{.*}}'int' +// CHECK-NEXT: value: Int 8 +// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 +// CHECK-NEXT: ConstantExpr{{.*}}'int' +// CHECK-NEXT: value: Int 8 +// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 +// CHECK-NEXT: SYCLIntelMinWorkGroupsPerComputeUnitAttr {{.*}} +// CHECK-NEXT: ConstantExpr{{.*}}'int' +// CHECK-NEXT: value: Int 4 +// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 4 +// CHECK-NEXT: SYCLIntelMaxWorkGroupsPerMultiprocessorAttr {{.*}} +// CHECK-NEXT: ConstantExpr{{.*}}'int' +// CHECK-NEXT: value: Int 2 +// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 2 +[[intel::max_work_group_size(8, 8, 8), intel::min_work_groups_per_cu(4), + intel::max_work_groups_per_mp(2)]] void +func1() {} + +// Test that checks template parameter support on function. 
+// CHECK: FunctionTemplateDecl {{.*}} func2 +// CHECK: FunctionDecl {{.*}} func2 'void ()' +// CHECK-NEXT: CompoundStmt +// CHECK-NEXT: SYCLIntelMaxWorkGroupSizeAttr {{.*}} +// CHECK-NEXT: DeclRefExpr {{.*}} 'int' NonTypeTemplateParm {{.*}} 'N' 'int' +// CHECK-NEXT: ConstantExpr{{.*}}'int' +// CHECK-NEXT: value: Int 8 +// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 +// CHECK-NEXT: ConstantExpr{{.*}}'int' +// CHECK-NEXT: value: Int 8 +// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 +// CHECK-NEXT: SYCLIntelMinWorkGroupsPerComputeUnitAttr {{.*}} +// CHECK-NEXT: DeclRefExpr {{.*}} 'int' NonTypeTemplateParm {{.*}} 'N' 'int' +// CHECK-NEXT: SYCLIntelMaxWorkGroupsPerMultiprocessorAttr {{.*}} +// CHECK-NEXT: DeclRefExpr {{.*}} 'int' NonTypeTemplateParm {{.*}} 'N' 'int' + +// CHECK: FunctionDecl {{.*}} func2 'void ()' +// CHECK-NEXT: TemplateArgument integral 6 +// CHECK-NEXT: CompoundStmt +// CHECK-NEXT: SYCLIntelMaxWorkGroupSizeAttr {{.*}} +// CHECK-NEXT: ConstantExpr{{.*}}'int' +// CHECK-NEXT: value: Int 6 +// CHECK-NEXT: SubstNonTypeTemplateParmExpr +// CHECK-NEXT: NonTypeTemplateParmDecl +// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 6 +// CHECK-NEXT: ConstantExpr{{.*}}'int' +// CHECK-NEXT: value: Int 8 +// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 +// CHECK-NEXT: ConstantExpr{{.*}}'int' +// CHECK-NEXT: value: Int 8 +// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 +// CHECK-NEXT: SYCLIntelMinWorkGroupsPerComputeUnitAttr {{.*}} +// CHECK-NEXT: ConstantExpr{{.*}}'int' +// CHECK-NEXT: value: Int 6 +// CHECK-NEXT: SubstNonTypeTemplateParmExpr +// CHECK-NEXT: NonTypeTemplateParmDecl +// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 6 +// CHECK-NEXT: SYCLIntelMaxWorkGroupsPerMultiprocessorAttr {{.*}} +// CHECK-NEXT: ConstantExpr{{.*}}'int' +// CHECK-NEXT: value: Int 6 +// CHECK-NEXT: SubstNonTypeTemplateParmExpr +// CHECK-NEXT: NonTypeTemplateParmDecl +// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 6 +template <int N> +[[intel::max_work_group_size(N, 8, 8), intel::min_work_groups_per_cu(N), + 
intel::max_work_groups_per_mp(N)]] void +func2() {} + +class KernelFunctor { +public: + void operator()() const { func1(); } +}; + +// Test that checks template parameter support on class member function. +template <int N> class KernelFunctor2 { +public: + [[intel::max_work_group_size(N, 8, 8), intel::min_work_groups_per_cu(N), + intel::max_work_groups_per_mp(N)]] void + operator()() const {} +}; + +int main() { + deviceQueue.submit([&](sycl::handler &h) { + // CHECK-LABEL: FunctionDecl {{.*}}kernel_name_1 + // CHECK: SYCLIntelMaxWorkGroupSizeAttr + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 8 + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 8 + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 8 + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 + // CHECK: SYCLIntelMinWorkGroupsPerComputeUnitAttr + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 4 + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 4 + // CHECK: SYCLIntelMaxWorkGroupsPerMultiprocessorAttr + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 2 + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 2 + KernelFunctor f1; + h.single_task<class kernel_name_1>(f1); + + // CHECK-LABEL: FunctionDecl {{.*}}kernel_name_2 + // CHECK: SYCLIntelMaxWorkGroupSizeAttr + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 3 + // CHECK-NEXT: SubstNonTypeTemplateParmExpr + // CHECK-NEXT: NonTypeTemplateParmDecl + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 3 + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 8 + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 8 + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 + // CHECK: SYCLIntelMinWorkGroupsPerComputeUnitAttr + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 3 + // CHECK-NEXT: SubstNonTypeTemplateParmExpr + // 
CHECK-NEXT: NonTypeTemplateParmDecl + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 3 + // CHECK: SYCLIntelMaxWorkGroupsPerMultiprocessorAttr + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 3 + // CHECK-NEXT: SubstNonTypeTemplateParmExpr + // CHECK-NEXT: NonTypeTemplateParmDecl + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 3 + KernelFunctor2<3> f2; + h.single_task<class kernel_name_2>(f2); + + // CHECK-LABEL: FunctionDecl {{.*}}kernel_name_3 + // CHECK: SYCLIntelMaxWorkGroupSizeAttr {{.*}} + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 8 + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 8 + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 8 + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 + // CHECK-NEXT: SYCLIntelMinWorkGroupsPerComputeUnitAttr {{.*}} + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 4 + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 4 + // CHECK-NEXT: SYCLIntelMaxWorkGroupsPerMultiprocessorAttr {{.*}} + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 6 + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 6 + h.single_task<class kernel_name_3>( + [] [[intel::max_work_group_size(8, 8, 8), + intel::min_work_groups_per_cu(4), + intel::max_work_groups_per_mp(6)]] () {}); + }); + + func2<6>(); + + return 0; +}