From bbe994d9fc4e1576023e053f4e0c0c93bcdffee3 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Fri, 15 Sep 2023 13:21:45 +0100 Subject: [PATCH 1/9] [SYCL] Introduce min_work_groups_per_cu and max_work_groups_per_mp The attributes match to CUDA's launch bounds minBlocksPerMultiprocessor and maxBlocksPerCluster respectively. --- clang/include/clang/Basic/Attr.td | 18 ++ clang/include/clang/Basic/AttrDocs.td | 59 +++++ .../clang/Basic/DiagnosticSemaKinds.td | 3 + clang/include/clang/Sema/Sema.h | 10 + clang/lib/CodeGen/CodeGenFunction.cpp | 19 ++ clang/lib/CodeGen/Targets/NVPTX.cpp | 25 ++ clang/lib/Sema/SemaDecl.cpp | 6 + clang/lib/Sema/SemaDeclAttr.cpp | 217 +++++++++++++++++- clang/lib/Sema/SemaSYCL.cpp | 4 + .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 34 +++ ...a-attribute-supported-attributes-list.test | 2 + sycl/test/launch_bounds/lb_sm_70.cpp | 67 ++++++ sycl/test/launch_bounds/lb_sm_90.cpp | 48 ++++ 13 files changed, 504 insertions(+), 8 deletions(-) create mode 100644 sycl/test/launch_bounds/lb_sm_70.cpp create mode 100644 sycl/test/launch_bounds/lb_sm_90.cpp diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 132410c110df6..377b7e6625bfb 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -1583,6 +1583,24 @@ def SYCLIntelMaxWorkGroupSize : InheritableAttr { let SupportsNonconformingLambdaSyntax = 1; } +def SYCLIntelMinWorkGroupsPerComputeUnit : InheritableAttr { + let Spellings = [CXX11<"intel", "min_work_groups_per_cu">]; + let Args = [ExprArgument<"Value">]; + let LangOpts = [SYCLIsDevice, SilentlyIgnoreSYCLIsHost]; + let Subjects = SubjectList<[Function], ErrorDiag>; + let Documentation = [SYCLIntelMinWorkGroupsPerComputeUnitAttrDocs]; + let SupportsNonconformingLambdaSyntax = 1; +} + +def SYCLIntelMaxWorkGroupsPerMultiprocessor : InheritableAttr { + let Spellings = [CXX11<"intel", "max_work_groups_per_mp">]; + let Args = [ExprArgument<"Value">]; + let LangOpts = [SYCLIsDevice, SilentlyIgnoreSYCLIsHost]; + let Subjects = SubjectList<[Function], ErrorDiag>; + let Documentation = [SYCLIntelMaxWorkGroupsPerMultiprocessorDocs]; + let SupportsNonconformingLambdaSyntax = 1; +} + def SYCLIntelMaxGlobalWorkDim : InheritableAttr { let Spellings = [CXX11<"intel", "max_global_work_dim">]; let Args = [ExprArgument<"Value">]; diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index b04c562374963..158e4cf149155 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -3017,6 +3017,65 @@ In SYCL 2020 mode, the attribute is not propagated to the kernel. }]; } +def SYCLIntelMinWorkGroupsPerComputeUnitAttrDocs: Documentation { + let Category = DocCatFunction; + let Heading = "intel::min_work_groups_per_cu"; + let Content = [{ +Applies to a device function/lambda function. Indicates the desired minimum +number of resident work_groups per multiprocessor. It complies to the +.minnctapersm PTX directive. + +.. code-block:: c++ + + [[intel::min_work_groups_per_cu(2)]] void foo() {} + + class Foo { + public: + [[intel::min_work_groups_per_cu(2)]] void operator()() const {} + }; + + template + class Functor { + public: + [[intel::min_work_groups_per_cu(N)]] void operator()() const {} + }; + + template + [[intel::min_work_groups_per_cu(N)]] void func() {} + + }]; +} + +def SYCLIntelMaxWorkGroupsPerMultiprocessorDocs: Documentation { + let Category = DocCatFunction; + let Heading = "intel::max_work_groups_per_mp"; + let Content = [{ +Applies to a device function/lambda function. Indicates the desired maximum +number work_groups per cluster with which the application will ever launch. It +complies to the .maxclusterrank PTX directive. Note, that the feature requires +SM_90 or higher. + +.. code-block:: c++ + + [[intel::max_work_groups_per_mp(2)]] void foo() {} + + class Foo { + public: + [[intel::max_work_groups_per_mp(2)]] void operator()() const {} + }; + + template + class Functor { + public: + [[intel::max_work_groups_per_mp(N)]] void operator()() const {} + }; + + template + [[intel::max_work_groups_per_mp(N)]] void func() {} + + }]; +} + def SYCLIntelMaxGlobalWorkDimAttrDocs : Documentation { let Category = DocCatFunction; let Heading = "intel::max_global_work_dim"; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 8fb37f3c57290..6e309c62d3a0f 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -11986,6 +11986,9 @@ def warn_sycl_kernel_return_type : Warning< def err_sycl_special_type_num_init_method : Error< "types with 'sycl_special_class' attribute must have one and only one '__init' " "method defined">; +def warn_launch_bounds_is_cuda_specific : Warning< + "%0 attribute ignored, only applicable when targetting Nvidia devices">, + InGroup; def warn_cuda_maxclusterrank_sm_90 : Warning< "maxclusterrank requires sm_90 or higher, CUDA arch provided: %0, ignoring " diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 7f3e0047f8888..1009f2bb5a0a5 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -11392,6 +11392,16 @@ class Sema final { SYCLIntelMaxGlobalWorkDimAttr * MergeSYCLIntelMaxGlobalWorkDimAttr(Decl *D, const SYCLIntelMaxGlobalWorkDimAttr &A); + void AddSYCLIntelMinWorkGroupsPerComputeUnitAttr( + Decl *D, const AttributeCommonInfo &CI, Expr *E); + SYCLIntelMinWorkGroupsPerComputeUnitAttr * + MergeSYCLIntelMinWorkGroupsPerComputeUnitAttr( + Decl *D, const SYCLIntelMinWorkGroupsPerComputeUnitAttr &A); + void AddSYCLIntelMaxWorkGroupsPerMultiprocessorAttr( + Decl *D, const AttributeCommonInfo &CI, Expr *E); + SYCLIntelMaxWorkGroupsPerMultiprocessorAttr * + MergeSYCLIntelMaxWorkGroupsPerMultiprocessorAttr( + Decl *D, const SYCLIntelMaxWorkGroupsPerMultiprocessorAttr &A); void AddSYCLIntelBankWidthAttr(Decl *D, const AttributeCommonInfo &CI, Expr *E); SYCLIntelBankWidthAttr * diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index f53a3033d8e9e..ae00ca1329ce6 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -758,6 +758,25 @@ void CodeGenFunction::EmitKernelMetadata(const FunctionDecl *FD, llvm::MDNode::get(Context, AttrMDArgs)); } + if (const auto *A = FD->getAttr()) { + const auto *CE = cast(A->getValue()); + std::optional ArgVal = CE->getResultAsAPSInt(); + llvm::Metadata *AttrMDArgs[] = {llvm::ConstantAsMetadata::get( + Builder.getInt32(ArgVal->getSExtValue()))}; + Fn->setMetadata("min_work_groups_per_cu", + llvm::MDNode::get(Context, AttrMDArgs)); + } + + if (const auto *A = + FD->getAttr()) { + const auto *CE = cast(A->getValue()); + std::optional ArgVal = CE->getResultAsAPSInt(); + llvm::Metadata *AttrMDArgs[] = {llvm::ConstantAsMetadata::get( + Builder.getInt32(ArgVal->getSExtValue()))}; + Fn->setMetadata("max_work_groups_per_mp", + llvm::MDNode::get(Context, AttrMDArgs)); + } + if (const SYCLIntelMaxWorkGroupSizeAttr *A = FD->getAttr()) { diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp index 820c99a7b3410..f13bf26d2c36c 100644 --- a/clang/lib/CodeGen/Targets/NVPTX.cpp +++ b/clang/lib/CodeGen/Targets/NVPTX.cpp @@ -245,6 +245,31 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes( // And kernel functions are not subject to inlining F->addFnAttr(llvm::Attribute::NoInline); } + if (const auto *MWGS = FD->getAttr()) { + auto MaxThreads = (*MWGS->getZDimVal()).getExtValue() * + (*MWGS->getYDimVal()).getExtValue() * + (*MWGS->getXDimVal()).getExtValue(); + if (MaxThreads > 0) + addNVVMMetadata(F, "maxntidx", MaxThreads); + } + if (const auto *MWGPCU = + FD->getAttr()) { + auto *MinWorkGroups = MWGPCU->getValue(); + if (const auto *CE = dyn_cast(MinWorkGroups)) { + auto MinVal = CE->getResultAsAPSInt(); + // The value is guaranteed to be > 0, pass it to the metadata. + addNVVMMetadata(F, "minnctapersm", MinVal.getExtValue()); + } + } + if (const auto *MWGPMP = + FD->getAttr()) { + auto *MaxWorkGroups = MWGPMP->getValue(); + if (const auto *CE = dyn_cast(MaxWorkGroups)) { + auto MaxVal = CE->getResultAsAPSInt(); + // The value is guaranteed to be > 0, pass it to the metadata. + addNVVMMetadata(F, "maxclusterrank", MaxVal.getExtValue()); + } + } } // Perform special handling in CUDA mode. diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 46d63a4a34dc6..fa7667635b0c8 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -2999,6 +2999,12 @@ static bool mergeDeclAttribute(Sema &S, NamedDecl *D, NewAttr = S.MergeSYCLIntelInitiationIntervalAttr(D, *A); else if (const auto *A = dyn_cast(Attr)) NewAttr = S.MergeSYCLWorkGroupSizeHintAttr(D, *A); + else if (const auto *A = + dyn_cast(Attr)) + NewAttr = S.MergeSYCLIntelMinWorkGroupsPerComputeUnitAttr(D, *A); + else if (const auto *A = + dyn_cast(Attr)) + NewAttr = S.MergeSYCLIntelMaxWorkGroupsPerMultiprocessorAttr(D, *A); else if (const auto *A = dyn_cast(Attr)) NewAttr = S.MergeSYCLIntelMaxGlobalWorkDimAttr(D, *A); else if (const auto *BTFA = dyn_cast(Attr)) diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 88a4626886af6..aae3c9005c60d 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -3744,6 +3744,19 @@ static void handleSYCLIntelMaxWorkGroupSize(Sema &S, Decl *D, AL.getArgAsExpr(1), AL.getArgAsExpr(2)); } +// Handles min_work_groups_per_cu attribute. +static void handleSYCLIntelMinWorkGroupsPerComputeUnit(Sema &S, Decl *D, + const ParsedAttr &AL) { + S.AddSYCLIntelMinWorkGroupsPerComputeUnitAttr(D, AL, AL.getArgAsExpr(0)); +} + +// Handles max_work_groups_per_mp attribute. +static void +handleSYCLIntelMaxWorkGroupsPerMultiprocessor(Sema &S, Decl *D, + const ParsedAttr &AL) { + S.AddSYCLIntelMaxWorkGroupsPerMultiprocessorAttr(D, AL, AL.getArgAsExpr(0)); +} + // Handles reqd_work_group_size. // If the 'reqd_work_group_size' attribute is specified on a declaration along // with 'num_simd_work_items' attribute, the required work group size specified @@ -4434,6 +4447,142 @@ void Sema::AddSYCLIntelMaxGlobalWorkDimAttr(Decl *D, D->addAttr(::new (Context) SYCLIntelMaxGlobalWorkDimAttr(Context, CI, E)); } +// Check that the attribute is an integer constant that can fit in 32-bits. +// Issue correct error message and return false on failure. +bool static check32BitInt(const Expr *E, const AttributeCommonInfo &CI, + Sema &S) { + std::optional I = llvm::APSInt(64); + if (!(I = E->getIntegerConstantExpr(S.Context))) { + S.Diag(E->getExprLoc(), diag::err_attribute_argument_n_type) + << CI << 0 << AANT_ArgumentIntegerConstant << E->getSourceRange(); + return false; + } + // Make sure we can fit it in 32 bits. + if (!I->isIntN(32)) { + S.Diag(E->getExprLoc(), diag::err_ice_too_large) + << toString(*I, 10, false) << 32 << /* Unsigned */ 1; + return false; + } + + return true; +} + +void Sema::AddSYCLIntelMinWorkGroupsPerComputeUnitAttr( + Decl *D, const AttributeCommonInfo &CI, Expr *E) { + if (Context.getLangOpts().SYCLIsDevice && + !Context.getTargetInfo().getTriple().isNVPTX()) { + Diag(E->getBeginLoc(), diag::warn_launch_bounds_is_cuda_specific) + << CI << E->getSourceRange(); + return; + } + if (!E->isValueDependent()) { + if (!check32BitInt(E, CI, *this)) + return; + // Validate that we have an integer constant expression and then store the + // converted constant expression into the semantic attribute so that we + // don't have to evaluate it again later. + llvm::APSInt ArgVal; + ExprResult Res = VerifyIntegerConstantExpression(E, &ArgVal); + if (Res.isInvalid()) + return; + E = Res.get(); + + // This attribute must be greater than 0. + if (ArgVal <= 0) { + Diag(E->getBeginLoc(), diag::err_attribute_argument_is_zero) + << CI << E->getSourceRange(); + return; + } + + // Check to see if there's a duplicate attribute with different values + // already applied to the declaration. + if (const auto *DeclAttr = + D->getAttr()) { + // If the other attribute argument is instantiation dependent, we won't + // have converted it to a constant expression yet and thus we test + // whether this is a null pointer. + if (const auto *DeclExpr = dyn_cast(DeclAttr->getValue())) { + if (ArgVal != DeclExpr->getResultAsAPSInt()) { + Diag(CI.getLoc(), diag::warn_duplicate_attribute) << CI; + Diag(DeclAttr->getLoc(), diag::note_previous_attribute); + } + // Drop the duplicate attribute. + return; + } + } + } + + D->addAttr(::new (Context) + SYCLIntelMinWorkGroupsPerComputeUnitAttr(Context, CI, E)); +} + +// Helper to get CudaArch. +static CudaArch getCudaArch(const TargetInfo &TI) { + if (!TI.getTriple().isNVPTX()) + llvm_unreachable("getCudaArch is only valid for NVPTX triple"); + auto &TO = TI.getTargetOpts(); + return StringToCudaArch(TO.CPU); +} + +void Sema::AddSYCLIntelMaxWorkGroupsPerMultiprocessorAttr( + Decl *D, const AttributeCommonInfo &CI, Expr *E) { + auto &TI = Context.getTargetInfo(); + if (Context.getLangOpts().SYCLIsDevice) { + if (!TI.getTriple().isNVPTX()) { + Diag(E->getBeginLoc(), diag::warn_launch_bounds_is_cuda_specific) + << CI << E->getSourceRange(); + return; + } + + // Feature '.maxclusterrank' requires .target sm_90 or higher. + auto SM = getCudaArch(TI); + if (SM == CudaArch::UNKNOWN || SM < CudaArch::SM_90) { + Diag(E->getBeginLoc(), diag::warn_cuda_maxclusterrank_sm_90) + << CudaArchToString(SM) << CI << E->getSourceRange(); + return; + } + } + if (!E->isValueDependent()) { + if (!check32BitInt(E, CI, *this)) + return; + // Validate that we have an integer constant expression and then store the + // converted constant expression into the semantic attribute so that we + // don't have to evaluate it again later. + llvm::APSInt ArgVal; + ExprResult Res = VerifyIntegerConstantExpression(E, &ArgVal); + if (Res.isInvalid()) + return; + E = Res.get(); + + // This attribute must be greater than 0. + if (ArgVal <= 0) { + Diag(E->getBeginLoc(), diag::err_attribute_argument_is_zero) + << CI << E->getSourceRange(); + return; + } + + // Check to see if there's a duplicate attribute with different values + // already applied to the declaration. + if (const auto *DeclAttr = + D->getAttr()) { + // If the other attribute argument is instantiation dependent, we won't + // have converted it to a constant expression yet and thus we test + // whether this is a null pointer. + if (const auto *DeclExpr = dyn_cast(DeclAttr->getValue())) { + if (ArgVal != DeclExpr->getResultAsAPSInt()) { + Diag(CI.getLoc(), diag::warn_duplicate_attribute) << CI; + Diag(DeclAttr->getLoc(), diag::note_previous_attribute); + } + // Drop the duplicate attribute. + return; + } + } + } + + D->addAttr(::new (Context) + SYCLIntelMaxWorkGroupsPerMultiprocessorAttr(Context, CI, E)); +} + SYCLIntelMaxGlobalWorkDimAttr *Sema::MergeSYCLIntelMaxGlobalWorkDimAttr( Decl *D, const SYCLIntelMaxGlobalWorkDimAttr &A) { // Check to see if there's a duplicate attribute with different values @@ -4473,6 +4622,52 @@ static void handleSYCLIntelMaxGlobalWorkDimAttr(Sema &S, Decl *D, S.AddSYCLIntelMaxGlobalWorkDimAttr(D, AL, E); } +SYCLIntelMinWorkGroupsPerComputeUnitAttr * +Sema::MergeSYCLIntelMinWorkGroupsPerComputeUnitAttr( + Decl *D, const SYCLIntelMinWorkGroupsPerComputeUnitAttr &A) { + // Check to see if there's a duplicate attribute with different values + // already applied to the declaration. + if (const auto *DeclAttr = + D->getAttr()) { + if (const auto *DeclExpr = dyn_cast(DeclAttr->getValue())) { + if (const auto *MergeExpr = dyn_cast(A.getValue())) { + if (DeclExpr->getResultAsAPSInt() != MergeExpr->getResultAsAPSInt()) { + Diag(DeclAttr->getLoc(), diag::warn_duplicate_attribute) << &A; + Diag(A.getLoc(), diag::note_previous_attribute); + } + // Do not add a duplicate attribute. + return nullptr; + } + } + } + + return ::new (Context) + SYCLIntelMinWorkGroupsPerComputeUnitAttr(Context, A, A.getValue()); +} + +SYCLIntelMaxWorkGroupsPerMultiprocessorAttr * +Sema::MergeSYCLIntelMaxWorkGroupsPerMultiprocessorAttr( + Decl *D, const SYCLIntelMaxWorkGroupsPerMultiprocessorAttr &A) { + // Check to see if there's a duplicate attribute with different values + // already applied to the declaration. + if (const auto *DeclAttr = + D->getAttr()) { + if (const auto *DeclExpr = dyn_cast(DeclAttr->getValue())) { + if (const auto *MergeExpr = dyn_cast(A.getValue())) { + if (DeclExpr->getResultAsAPSInt() != MergeExpr->getResultAsAPSInt()) { + Diag(DeclAttr->getLoc(), diag::warn_duplicate_attribute) << &A; + Diag(A.getLoc(), diag::note_previous_attribute); + } + // Do not add a duplicate attribute. + return nullptr; + } + } + } + + return ::new (Context) + SYCLIntelMaxWorkGroupsPerMultiprocessorAttr(Context, A, A.getValue()); +} + // Handles [[intel::loop_fuse]] and [[intel::loop_fuse_independent]]. void Sema::AddSYCLIntelLoopFuseAttr(Decl *D, const AttributeCommonInfo &CI, Expr *E) { @@ -7039,14 +7234,6 @@ bool Sema::CheckRegparmAttr(const ParsedAttr &AL, unsigned &numParams) { return false; } -// Helper to get CudaArch. -static CudaArch getCudaArch(const TargetInfo &TI) { - if (!TI.getTriple().isNVPTX()) - llvm_unreachable("getCudaArch is only valid for NVPTX triple"); - auto &TO = TI.getTargetOpts(); - return StringToCudaArch(TO.CPU); -} - // Checks whether an argument of launch_bounds attribute is // acceptable, performs implicit conversion to Rvalue, and returns // non-nullptr Expr result on success. Otherwise, it returns nullptr @@ -12008,6 +12195,12 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL, case ParsedAttr::AT_SYCLIntelMaxWorkGroupSize: handleSYCLIntelMaxWorkGroupSize(S, D, AL); break; + case ParsedAttr::AT_SYCLIntelMinWorkGroupsPerComputeUnit: + handleSYCLIntelMinWorkGroupsPerComputeUnit(S, D, AL); + break; + case ParsedAttr::AT_SYCLIntelMaxWorkGroupsPerMultiprocessor: + handleSYCLIntelMaxWorkGroupsPerMultiprocessor(S, D, AL); + break; case ParsedAttr::AT_IntelReqdSubGroupSize: handleIntelReqdSubGroupSize(S, D, AL); break; @@ -12534,6 +12727,14 @@ void Sema::ProcessDeclAttributeList( } else if (const auto *A = D->getAttr()) { Diag(D->getLocation(), diag::err_opencl_kernel_attr) << A; D->setInvalidDecl(); + } else if (const auto *A = + D->getAttr()) { + Diag(D->getLocation(), diag::err_opencl_kernel_attr) << A; + D->setInvalidDecl(); + } else if (const auto *A = + D->getAttr()) { + Diag(D->getLocation(), diag::err_opencl_kernel_attr) << A; + D->setInvalidDecl(); } else if (const auto *A = D->getAttr()) { Diag(D->getLocation(), diag::err_opencl_kernel_attr) << A; D->setInvalidDecl(); diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp index 15182d369143f..e2a8d10ded1ab 100644 --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -538,6 +538,8 @@ static void collectSYCLAttributes(Sema &S, FunctionDecl *FD, SYCLIntelKernelArgsRestrictAttr, SYCLIntelNumSimdWorkItemsAttr, SYCLIntelSchedulerTargetFmaxMhzAttr, SYCLIntelMaxWorkGroupSizeAttr, SYCLIntelMaxGlobalWorkDimAttr, + SYCLIntelMinWorkGroupsPerComputeUnitAttr, + SYCLIntelMaxWorkGroupsPerMultiprocessorAttr, SYCLIntelNoGlobalWorkOffsetAttr, SYCLSimdAttr>(A); }); } @@ -4489,6 +4491,8 @@ static void PropagateAndDiagnoseDeviceAttr( case attr::Kind::SYCLIntelNumSimdWorkItems: case attr::Kind::SYCLIntelSchedulerTargetFmaxMhz: case attr::Kind::SYCLIntelMaxGlobalWorkDim: + case attr::Kind::SYCLIntelMinWorkGroupsPerComputeUnit: + case attr::Kind::SYCLIntelMaxWorkGroupsPerMultiprocessor: case attr::Kind::SYCLIntelNoGlobalWorkOffset: case attr::Kind::SYCLIntelLoopFuse: case attr::Kind::SYCLIntelMaxConcurrency: diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index 164b7a5a541ce..06d3cbb438a17 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -754,6 +754,28 @@ static void instantiateSYCLIntelMaxGlobalWorkDimAttr( S.AddSYCLIntelMaxGlobalWorkDimAttr(New, *A, Result.getAs()); } +static void instantiateSYCLIntelMinWorkGroupsPerComputeUnitAttr( + Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs, + const SYCLIntelMinWorkGroupsPerComputeUnitAttr *A, Decl *New) { + EnterExpressionEvaluationContext Unevaluated( + S, Sema::ExpressionEvaluationContext::ConstantEvaluated); + ExprResult Result = S.SubstExpr(A->getValue(), TemplateArgs); + if (!Result.isInvalid()) + S.AddSYCLIntelMinWorkGroupsPerComputeUnitAttr(New, *A, + Result.getAs()); +} + +static void instantiateSYCLIntelMaxWorkGroupsPerMultiprocessorAttr( + Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs, + const SYCLIntelMaxWorkGroupsPerMultiprocessorAttr *A, Decl *New) { + EnterExpressionEvaluationContext Unevaluated( + S, Sema::ExpressionEvaluationContext::ConstantEvaluated); + ExprResult Result = S.SubstExpr(A->getValue(), TemplateArgs); + if (!Result.isInvalid()) + S.AddSYCLIntelMaxWorkGroupsPerMultiprocessorAttr(New, *A, + Result.getAs()); +} + static void instantiateSYCLIntelMaxConcurrencyAttr( Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs, const SYCLIntelMaxConcurrencyAttr *A, Decl *New) { @@ -1145,6 +1167,18 @@ void Sema::InstantiateAttrs(const MultiLevelTemplateArgumentList &TemplateArgs, SYCLIntelMaxGlobalWorkDim, New); continue; } + if (const auto *SYCLIntelMinWorkGroupsPerComputeUnit = + dyn_cast(TmplAttr)) { + instantiateSYCLIntelMinWorkGroupsPerComputeUnitAttr( + *this, TemplateArgs, SYCLIntelMinWorkGroupsPerComputeUnit, New); + continue; + } + if (const auto *SYCLIntelMaxWorkGroupsPerMultiprocessor = + dyn_cast(TmplAttr)) { + instantiateSYCLIntelMaxWorkGroupsPerMultiprocessorAttr( + *this, TemplateArgs, SYCLIntelMaxWorkGroupsPerMultiprocessor, New); + continue; + } if (const auto *SYCLIntelLoopFuse = dyn_cast(TmplAttr)) { instantiateSYCLIntelLoopFuseAttr(*this, TemplateArgs, SYCLIntelLoopFuse, diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test index a79c6157e7ffa..b160de34da345 100644 --- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test +++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test @@ -181,6 +181,8 @@ // CHECK-NEXT: SYCLIntelMaxConcurrency (SubjectMatchRule_function) // CHECK-NEXT: SYCLIntelMaxGlobalWorkDim (SubjectMatchRule_function) // CHECK-NEXT: SYCLIntelMaxWorkGroupSize (SubjectMatchRule_function) +// CHECK-NEXT: SYCLIntelMaxWorkGroupsPerMultiprocessor (SubjectMatchRule_function) +// CHECK-NEXT: SYCLIntelMinWorkGroupsPerComputeUnit (SubjectMatchRule_function) // CHECK-NEXT: SYCLIntelNoGlobalWorkOffset (SubjectMatchRule_function) // CHECK-NEXT: SYCLIntelNumSimdWorkItems (SubjectMatchRule_function) // CHECK-NEXT: SYCLIntelPipeIO (SubjectMatchRule_variable) diff --git a/sycl/test/launch_bounds/lb_sm_70.cpp b/sycl/test/launch_bounds/lb_sm_70.cpp new file mode 100644 index 0000000000000..5398bfd508116 --- /dev/null +++ b/sycl/test/launch_bounds/lb_sm_70.cpp @@ -0,0 +1,67 @@ +// REQUIRES: cuda + +// RUN: not %clangxx -ferror-limit=100 -fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_70 -fsyntax-only %s -o - 2>&1 | FileCheck %s + +// NOTE: we can not use the `-verify` run alongside +// `expected-error`/`expected-warnings` as the diagnostics come from the device +// compilation, which happen in temporary files, while `expected-...` are +// placed in the main file, causing clang to complain at the file mismatch + +#include + +template class Functor { +public: + [[intel::max_work_group_size(1, 1, N1), intel::min_work_groups_per_cu(N2), + intel::max_work_groups_per_mp(N3)]] void + // CHECK: maxclusterrank requires sm_90 or higher, CUDA arch provided: sm_70, ignoring 'max_work_groups_per_mp' attribute + operator()() const {} +}; + +int main() { + sycl::queue Q{}; + + sycl::range<1> Gws(32); + sycl::range<1> Lws(32); + + Q.submit([&](sycl::handler &cgh) { + cgh.single_task( + sycl::nd_range<1>(Gws, Lws), + [=]() + [[intel::max_work_group_size(1, 1, 256), + intel::min_work_groups_per_cu(2), + // CHECK: maxclusterrank requires sm_90 or higher, CUDA arch provided: sm_70, ignoring 'max_work_groups_per_mp' attribute + intel::max_work_groups_per_mp(4)]] { volatile int A = 42; }); + + constexpr float A = 2.0; + cgh.single_task( + [=]() + [[intel::max_work_group_size(1, 1, 256), + intel::min_work_groups_per_cu(A), + // CHECK: 'min_work_groups_per_cu' attribute requires parameter 0 to be an integer constant + intel::max_work_groups_per_mp(4)]] { volatile int A = 42; }); + // CHECK: maxclusterrank requires sm_90 or higher, CUDA arch provided: sm_70, ignoring 'max_work_groups_per_mp' attribute + + cgh.single_task( + [=]() [[intel::max_work_group_size(1, 1, 256), + intel::min_work_groups_per_cu(2147483647 + 1)]] + // CHECK: 'min_work_groups_per_cu' attribute requires parameter 0 to be an integer constant + { volatile int A = 42; }); + + cgh.single_task([=]() [[intel::max_work_group_size(1, 1, 256), + intel::min_work_groups_per_cu(4), + intel::min_work_groups_per_cu(8)]] { + // CHECK: attribute 'min_work_groups_per_cu' is already applied with different arguments + // CHECK: note: previous attribute is here + volatile int A = 42; + }); + + cgh.single_task([=]() [[intel::min_work_groups_per_cu(-8)]] { + // CHECK: 'min_work_groups_per_cu' attribute must be greater than 0 + volatile int A = 42; + }); + }).wait_and_throw(); + + Q.single_task(Functor<512, 8, 16>{}).wait(); + + return 0; +} diff --git a/sycl/test/launch_bounds/lb_sm_90.cpp b/sycl/test/launch_bounds/lb_sm_90.cpp new file mode 100644 index 0000000000000..daa63c4fd875c --- /dev/null +++ b/sycl/test/launch_bounds/lb_sm_90.cpp @@ -0,0 +1,48 @@ +// REQUIRES: cuda + +// RUN: %clangxx -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_90 -fsycl-device-only -S -Xclang -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK-IR +// RUN: %clangxx -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_90 -fsycl -fsyntax-only -Xclang -verify %s +// expected-no-diagnostics + +#include + +template class Functor { +public: + [[intel::max_work_group_size(1, 1, N1), intel::min_work_groups_per_cu(N2), + intel::max_work_groups_per_mp(N3)]] void + operator()() const {} +}; + +int main() { + sycl::queue Q{}; + + sycl::range<1> Gws(32); + sycl::range<1> Lws(32); + + Q.submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl::nd_range<1>(Gws, Lws), + [=](sycl::id<1>) [[intel::max_work_group_size(1, 1, 256), + intel::min_work_groups_per_cu(2), + intel::max_work_groups_per_mp(4)]] { + volatile int A = 42; + }); + }).wait_and_throw(); + // CHECK-IR: !min_work_groups_per_cu [[MWGPCU:![0-9]+]] + // CHECK-IR: !max_work_groups_per_mp [[MWGPMP:![0-9]+]] + // CHECK-IR: !max_work_group_size [[MWGS:![0-9]+]] + + Q.single_task(Functor<512, 8, 16>{}).wait(); + // CHECK-IR: !min_work_groups_per_cu [[MWGPCU_F:![0-9]+]] + // CHECK-IR: !max_work_groups_per_mp [[MWGPMP_F:![0-9]+]] + // CHECK-IR: !max_work_group_size [[MWGS_F:![0-9]+]] + + // CHECK-IR: [[MWGPCU]] = !{i32 2} + // CHECK-IR: [[MWGPMP]] = !{i32 4} + // CHECK-IR: [[MWGS]] = !{i32 256, i32 1, i32 1} + + // CHECK-IR: [[MWGPCU_F]] = !{i32 8} + // CHECK-IR: [[MWGPMP_F]] = !{i32 16} + // CHECK-IR: [[MWGS_F]] = !{i32 512, i32 1, i32 1} + + return 0; +} From a97bc95db26ce5b8526e67287f12690800a0d0b8 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Wed, 11 Oct 2023 13:33:04 +0100 Subject: [PATCH 2/9] Move test to clang/test/SemaSYCL --- .../clang/Basic/DiagnosticSemaKinds.td | 2 +- clang/test/SemaCUDA/launch_bounds.cu | 4 +-- .../test/SemaSYCL}/lb_sm_70.cpp | 29 +++++++------------ .../test/SemaSYCL}/lb_sm_90.cpp | 0 4 files changed, 13 insertions(+), 22 deletions(-) rename {sycl/test/launch_bounds => clang/test/SemaSYCL}/lb_sm_70.cpp (51%) rename {sycl/test/launch_bounds => clang/test/SemaSYCL}/lb_sm_90.cpp (100%) diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 6e309c62d3a0f..a19eab34cd01d 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -11991,7 +11991,7 @@ def warn_launch_bounds_is_cuda_specific : Warning< InGroup; def warn_cuda_maxclusterrank_sm_90 : Warning< - "maxclusterrank requires sm_90 or higher, CUDA arch provided: %0, ignoring " + "'maxclusterrank' requires sm_90 or higher, CUDA arch provided: %0, ignoring " "%1 attribute">, InGroup; def err_bit_int_bad_size : Error<"%select{signed|unsigned}0 _BitInt must " diff --git a/clang/test/SemaCUDA/launch_bounds.cu b/clang/test/SemaCUDA/launch_bounds.cu index 045f475692959..aa83c3560e71d 100644 --- a/clang/test/SemaCUDA/launch_bounds.cu +++ b/clang/test/SemaCUDA/launch_bounds.cu @@ -11,7 +11,7 @@ __launch_bounds__(0x10000000000000000) void TestWayTooBigArg(void); // expected- __launch_bounds__(-128, 7) void TestNegArg1(void); // expected-warning {{'launch_bounds' attribute parameter 0 is negative and will be ignored}} __launch_bounds__(128, -7) void TestNegArg2(void); // expected-warning {{'launch_bounds' attribute parameter 1 is negative and will be ignored}} -__launch_bounds__(128, 2, -8) void TestNegArg2(void); // expected-warning {{maxclusterrank requires sm_90 or higher, CUDA arch provided: sm_75, ignoring 'launch_bounds' attribute}} +__launch_bounds__(128, 2, -8) void TestNegArg2(void); // expected-warning {{'maxclusterrank' requires sm_90 or higher, CUDA arch provided: sm_75, ignoring 'launch_bounds' attribute}} __launch_bounds__(1, 2, 3, 4) void Test4Args(void); // expected-error {{'launch_bounds' attribute takes no more than 3 arguments}} __launch_bounds__() void TestNoArgs(void); // expected-error {{'launch_bounds' attribute takes at least 1 argument}} @@ -49,4 +49,4 @@ __launch_bounds__(Args) void TestTemplateVariadicArgs(void) {} // expected-error template __launch_bounds__(1, Args) void TestTemplateVariadicArgs2(void) {} // expected-error {{expression contains unexpanded parameter pack 'Args'}} -__launch_bounds__(1, 2, 3) void Test3Args(void); // expected-warning {{maxclusterrank requires sm_90 or higher, CUDA arch provided: sm_75, ignoring 'launch_bounds' attribute}} +__launch_bounds__(1, 2, 3) void Test3Args(void); // expected-warning {{'maxclusterrank' requires sm_90 or higher, CUDA arch provided: sm_75, ignoring 'launch_bounds' attribute}} diff --git a/sycl/test/launch_bounds/lb_sm_70.cpp b/clang/test/SemaSYCL/lb_sm_70.cpp similarity index 51% rename from sycl/test/launch_bounds/lb_sm_70.cpp rename to clang/test/SemaSYCL/lb_sm_70.cpp index 5398bfd508116..e23ba113ca120 100644 --- a/sycl/test/launch_bounds/lb_sm_70.cpp +++ b/clang/test/SemaSYCL/lb_sm_70.cpp @@ -1,62 +1,53 @@ -// REQUIRES: cuda +// REQUIRES: cuda -// RUN: not %clangxx -ferror-limit=100 -fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_70 -fsyntax-only %s -o - 2>&1 | FileCheck %s - -// NOTE: we can not use the `-verify` run alongside -// `expected-error`/`expected-warnings` as the diagnostics come from the device -// compilation, which happen in temporary files, while `expected-...` are -// placed in the main file, causing clang to complain at the file mismatch +// RUN: %clangxx -ferror-limit=100 -fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_70 -fsycl-device-only -fsyntax-only -Xclang -verify %s #include template class Functor { public: + // expected-warning@+2 {{'maxclusterrank' requires sm_90 or higher, CUDA arch provided: sm_70, ignoring 'max_work_groups_per_mp' attribute}} [[intel::max_work_group_size(1, 1, N1), intel::min_work_groups_per_cu(N2), intel::max_work_groups_per_mp(N3)]] void - // CHECK: maxclusterrank requires sm_90 or higher, CUDA arch provided: sm_70, ignoring 'max_work_groups_per_mp' attribute operator()() const {} }; int main() { sycl::queue Q{}; - sycl::range<1> Gws(32); - sycl::range<1> Lws(32); - Q.submit([&](sycl::handler &cgh) { + // expected-warning@+5 {{'maxclusterrank' requires sm_90 or higher, CUDA arch provided: sm_70, ignoring 'max_work_groups_per_mp' attribute}} cgh.single_task( - sycl::nd_range<1>(Gws, Lws), [=]() [[intel::max_work_group_size(1, 1, 256), intel::min_work_groups_per_cu(2), - // CHECK: maxclusterrank requires sm_90 or higher, CUDA arch provided: sm_70, ignoring 'max_work_groups_per_mp' attribute intel::max_work_groups_per_mp(4)]] { volatile int A = 42; }); constexpr float A = 2.0; + // expected-error@+5 {{'min_work_groups_per_cu' attribute requires parameter 0 to be an integer constant}} + // expected-warning@+5 {{'maxclusterrank' requires sm_90 or higher, CUDA arch provided: sm_70, ignoring 'max_work_groups_per_mp' attribute}} cgh.single_task( [=]() [[intel::max_work_group_size(1, 1, 256), intel::min_work_groups_per_cu(A), - // CHECK: 'min_work_groups_per_cu' attribute requires parameter 0 to be an integer constant intel::max_work_groups_per_mp(4)]] { volatile int A = 42; }); - // CHECK: maxclusterrank requires sm_90 or higher, CUDA arch provided: sm_70, ignoring 'max_work_groups_per_mp' attribute + // expected-error@+3 {{'min_work_groups_per_cu' attribute requires parameter 0 to be an integer constant}} cgh.single_task( [=]() [[intel::max_work_group_size(1, 1, 256), intel::min_work_groups_per_cu(2147483647 + 1)]] - // CHECK: 'min_work_groups_per_cu' attribute requires parameter 0 to be an integer constant { volatile int A = 42; }); + // expected-warning@+4 {{attribute 'min_work_groups_per_cu' is already applied with different arguments}} + // expected-note@+2 {{previous attribute is here}} cgh.single_task([=]() [[intel::max_work_group_size(1, 1, 256), intel::min_work_groups_per_cu(4), intel::min_work_groups_per_cu(8)]] { - // CHECK: attribute 'min_work_groups_per_cu' is already applied with different arguments - // CHECK: note: previous attribute is here volatile int A = 42; }); + // expected-error@+1 {{'min_work_groups_per_cu' attribute must be greater than 0}} cgh.single_task([=]() [[intel::min_work_groups_per_cu(-8)]] { - // CHECK: 'min_work_groups_per_cu' attribute must be greater than 0 volatile int A = 42; }); }).wait_and_throw(); diff --git a/sycl/test/launch_bounds/lb_sm_90.cpp b/clang/test/SemaSYCL/lb_sm_90.cpp similarity index 100% rename from sycl/test/launch_bounds/lb_sm_90.cpp rename to clang/test/SemaSYCL/lb_sm_90.cpp From 27d973e4013d112b4605dba7c5b41452531fb635 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Fri, 13 Oct 2023 11:39:09 +0100 Subject: [PATCH 3/9] ast and md tests --- .../test/CodeGenSYCL/launch_bounds_nvptx.cpp | 101 ++++++++++ clang/test/SemaSYCL/lb_sm_90_ast.cpp | 179 ++++++++++++++++++ 2 files changed, 280 insertions(+) create mode 100644 clang/test/CodeGenSYCL/launch_bounds_nvptx.cpp create mode 100644 clang/test/SemaSYCL/lb_sm_90_ast.cpp diff --git a/clang/test/CodeGenSYCL/launch_bounds_nvptx.cpp b/clang/test/CodeGenSYCL/launch_bounds_nvptx.cpp new file mode 100644 index 0000000000000..9669b43a02b96 --- /dev/null +++ b/clang/test/CodeGenSYCL/launch_bounds_nvptx.cpp @@ -0,0 +1,101 @@ +// REQUIRES: cuda + +// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -sycl-std=2017 -triple nvptx-unknown-unknown -target-cpu sm_90 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s + +#include "sycl.hpp" + +using namespace sycl; +queue q; + +class Foo { +public: + [[intel::max_work_group_size(8, 8, 8), intel::min_work_groups_per_cu(2), + intel::max_work_groups_per_mp(4)]] void + operator()() const {} +}; + +template class Functor { +public: + [[intel::max_work_group_size(N, 8, 8), intel::min_work_groups_per_cu(N), + intel::max_work_groups_per_mp(N)]] void + operator()() const {} +}; + +template +[[intel::max_work_group_size(N, 8, 8), intel::min_work_groups_per_cu(N), + intel::max_work_groups_per_mp(N)]] void +zoo() {} + +[[intel::max_work_group_size(8, 8, 8), intel::min_work_groups_per_cu(2), + intel::max_work_groups_per_mp(4)]] void +bar() {} + +int main() { + q.submit([&](handler &h) { + // Test attribute argument size. + Foo boo; + h.single_task(boo); + + // Test attribute is applied on lambda. + h.single_task( + []() [[intel::max_work_group_size(8, 8, 8), + intel::min_work_groups_per_cu(2), + intel::max_work_groups_per_mp(4)]] {}); + + // Test class template argument. + Functor<6> f; + h.single_task(f); + + // Test attribute is propagated. + h.single_task([]() { bar(); }); + + // Test function template argument. + h.single_task([]() { zoo<16>(); }); + }); + return 0; +} + +// CHECK: define dso_local void @{{.*}}kernel_name1() #0 {{.*}} !min_work_groups_per_cu ![[MWGPC:[0-9]+]] !max_work_groups_per_mp ![[MWGPM:[0-9]+]] !max_work_group_size ![[MWGS:[0-9]+]] +// CHECK: define dso_local void @{{.*}}kernel_name2() #0 {{.*}} !min_work_groups_per_cu ![[MWGPC:[0-9]+]] !max_work_groups_per_mp ![[MWGPM:[0-9]+]] !max_work_group_size ![[MWGS:[0-9]+]] +// CHECK: define dso_local void @{{.*}}kernel_name3() #0 {{.*}} !min_work_groups_per_cu ![[MWGPC_MWGPM:[0-9]+]] !max_work_groups_per_mp ![[MWGPC_MWGPM]] !max_work_group_size ![[MWGS_2:[0-9]+]] +// CHECK: define dso_local void @{{.*}}kernel_name4() #0 {{.*}} !min_work_groups_per_cu ![[MWGPC:[0-9]+]] !max_work_groups_per_mp ![[MWGPM:[0-9]+]] !max_work_group_size ![[MWGS:[0-9]+]] +// CHECK: define dso_local void @{{.*}}kernel_name5() #0 {{.*}} !min_work_groups_per_cu ![[MWGPC_MWGPM_2:[0-9]+]] !max_work_groups_per_mp ![[MWGPC_MWGPM_2]] !max_work_group_size ![[MWGS_3:[0-9]+]] + +// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxntidx", i32 512} +// CHECK: {{.*}}@{{.*}}kernel_name1, !"minnctapersm", i32 2} +// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxclusterrank", i32 4} +// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxntidx", i32 512} +// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"minnctapersm", i32 2} +// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxclusterrank", i32 4} +// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxntidx", i32 512} +// CHECK: {{.*}}@{{.*}}kernel_name2, !"minnctapersm", i32 2} +// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxclusterrank", i32 4} +// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxntidx", i32 512} +// CHECK: {{.*}}@{{.*}}main{{.*}}, !"minnctapersm", i32 2} +// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxclusterrank", i32 4} +// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxntidx", i32 384} +// CHECK: {{.*}}@{{.*}}kernel_name3, !"minnctapersm", i32 6} +// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxclusterrank", i32 6} +// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxntidx", i32 384} +// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"minnctapersm", i32 6} +// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxclusterrank", i32 6} +// CHECK: {{.*}}@{{.*}}kernel_name4, !"maxntidx", i32 512} +// CHECK: {{.*}}@{{.*}}kernel_name4, !"minnctapersm", i32 2} +// CHECK: {{.*}}@{{.*}}kernel_name4, !"maxclusterrank", i32 4} +// CHECK: {{.*}}@{{.*}}bar{{.*}}, !"maxntidx", i32 512} +// CHECK: {{.*}}@{{.*}}bar{{.*}}, !"minnctapersm", i32 2} +// CHECK: {{.*}}@{{.*}}bar{{.*}}, !"maxclusterrank", i32 4} +// CHECK: {{.*}}@{{.*}}kernel_name5, !"maxntidx", i32 1024} +// CHECK: {{.*}}@{{.*}}kernel_name5, !"minnctapersm", i32 16} +// CHECK: {{.*}}@{{.*}}kernel_name5, !"maxclusterrank", i32 16} +// CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"maxntidx", i32 1024} +// CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"minnctapersm", i32 16} +// CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"maxclusterrank", i32 16} + +// CHECK: ![[MWGPC]] = !{i32 2} +// CHECK: ![[MWGPM]] = !{i32 4} +// CHECK: ![[MWGS]] = !{i32 8, i32 8, i32 8} +// CHECK: ![[MWGPC_MWGPM]] = !{i32 6} +// CHECK: ![[MWGS_2]] = !{i32 8, i32 8, i32 6} +// CHECK: ![[MWGPC_MWGPM_2]] = !{i32 16} +// CHECK: ![[MWGS_3]] = !{i32 8, i32 8, i32 16} diff --git a/clang/test/SemaSYCL/lb_sm_90_ast.cpp b/clang/test/SemaSYCL/lb_sm_90_ast.cpp new file mode 100644 index 0000000000000..f40fee0d749af --- /dev/null +++ b/clang/test/SemaSYCL/lb_sm_90_ast.cpp @@ -0,0 +1,179 @@ +// REQUIERS: cuda + +// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -sycl-std=2017 -ast-dump -triple nvptx-unknown-unknown -target-cpu sm_90 %s | FileCheck %s + +// Tests for AST of Intel max_work_group_size, min_work_groups_per_cu and +// max_work_groups_per_mp attribute. + +#include "sycl.hpp" + +sycl::queue deviceQueue; + +// CHECK: FunctionDecl {{.*}} func1 'void ()' +// CHECK-NEXT: CompoundStmt +// CHECK-NEXT: SYCLIntelMaxWorkGroupSizeAttr {{.*}} +// CHECK-NEXT: ConstantExpr{{.*}}'int' +// CHECK-NEXT: value: Int 8 +// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 +// CHECK-NEXT: ConstantExpr{{.*}}'int' +// CHECK-NEXT: value: Int 8 +// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 +// CHECK-NEXT: ConstantExpr{{.*}}'int' +// CHECK-NEXT: value: Int 8 +// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 +// CHECK-NEXT: SYCLIntelMinWorkGroupsPerComputeUnitAttr {{.*}} +// CHECK-NEXT: ConstantExpr{{.*}}'int' +// CHECK-NEXT: value: Int 4 +// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 4 +// CHECK-NEXT: SYCLIntelMaxWorkGroupsPerMultiprocessorAttr {{.*}} +// CHECK-NEXT: ConstantExpr{{.*}}'int' +// CHECK-NEXT: value: Int 2 +// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 2 +[[intel::max_work_group_size(8, 8, 8), intel::min_work_groups_per_cu(4), + intel::max_work_groups_per_mp(2)]] void +func1() {} + +// Test that checks template parameter support on function. +// CHECK: FunctionTemplateDecl {{.*}} func2 +// CHECK: FunctionDecl {{.*}} func2 'void ()' +// CHECK-NEXT: CompoundStmt +// CHECK-NEXT: SYCLIntelMaxWorkGroupSizeAttr {{.*}} +// CHECK-NEXT: DeclRefExpr {{.*}} 'int' NonTypeTemplateParm {{.*}} 'N' 'int' +// CHECK-NEXT: ConstantExpr{{.*}}'int' +// CHECK-NEXT: value: Int 8 +// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 +// CHECK-NEXT: ConstantExpr{{.*}}'int' +// CHECK-NEXT: value: Int 8 +// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 +// CHECK-NEXT: SYCLIntelMinWorkGroupsPerComputeUnitAttr {{.*}} +// CHECK-NEXT: DeclRefExpr {{.*}} 'int' NonTypeTemplateParm {{.*}} 'N' 'int' +// CHECK-NEXT: SYCLIntelMaxWorkGroupsPerMultiprocessorAttr {{.*}} +// CHECK-NEXT: DeclRefExpr {{.*}} 'int' NonTypeTemplateParm {{.*}} 'N' 'int' + +// CHECK: FunctionDecl {{.*}} func2 'void ()' +// CHECK-NEXT: TemplateArgument integral 6 +// CHECK-NEXT: CompoundStmt +// CHECK-NEXT: SYCLIntelMaxWorkGroupSizeAttr {{.*}} +// CHECK-NEXT: ConstantExpr{{.*}}'int' +// CHECK-NEXT: value: Int 6 +// CHECK-NEXT: SubstNonTypeTemplateParmExpr +// CHECK-NEXT: NonTypeTemplateParmDecl +// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 6 +// CHECK-NEXT: ConstantExpr{{.*}}'int' +// CHECK-NEXT: value: Int 8 +// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 +// CHECK-NEXT: ConstantExpr{{.*}}'int' +// CHECK-NEXT: value: Int 8 +// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 +// CHECK-NEXT: SYCLIntelMinWorkGroupsPerComputeUnitAttr {{.*}} +// CHECK-NEXT: ConstantExpr{{.*}}'int' +// CHECK-NEXT: value: Int 6 +// CHECK-NEXT: SubstNonTypeTemplateParmExpr +// CHECK-NEXT: NonTypeTemplateParmDecl +// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 6 +// CHECK-NEXT: SYCLIntelMaxWorkGroupsPerMultiprocessorAttr {{.*}} +// CHECK-NEXT: ConstantExpr{{.*}}'int' +// CHECK-NEXT: value: Int 6 +// CHECK-NEXT: SubstNonTypeTemplateParmExpr +// CHECK-NEXT: NonTypeTemplateParmDecl +// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 6 +template +[[intel::max_work_group_size(N, 8, 8), intel::min_work_groups_per_cu(N), + intel::max_work_groups_per_mp(N)]] void func2() {} + +class KernelFunctor { +public: + void operator()() const { + func1(); + } +}; + +// Test that checks template parameter support on class member function. +template +class KernelFunctor2 { +public: + [[intel::max_work_group_size(N, 8, 8), intel::min_work_groups_per_cu(N), + intel::max_work_groups_per_mp(N)]] void operator()() const { + } +}; + +int main() { + deviceQueue.submit([&](sycl::handler &h) { + // CHECK-LABEL: FunctionDecl {{.*}}kernel_name_1 + // CHECK: SYCLIntelMaxWorkGroupSizeAttr + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 8 + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 8 + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 8 + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 + // CHECK: SYCLIntelMinWorkGroupsPerComputeUnitAttr + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 4 + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 4 + // CHECK: SYCLIntelMaxWorkGroupsPerMultiprocessorAttr + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 2 + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 2 + KernelFunctor f1; + h.single_task(f1); + + // CHECK-LABEL: FunctionDecl {{.*}}kernel_name_2 + // CHECK: SYCLIntelMaxWorkGroupSizeAttr + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 3 + // CHECK-NEXT: SubstNonTypeTemplateParmExpr + // CHECK-NEXT: NonTypeTemplateParmDecl + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 3 + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 8 + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 8 + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 + // CHECK: SYCLIntelMinWorkGroupsPerComputeUnitAttr + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 3 + // CHECK-NEXT: SubstNonTypeTemplateParmExpr + // CHECK-NEXT: NonTypeTemplateParmDecl + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 3 + // CHECK: SYCLIntelMaxWorkGroupsPerMultiprocessorAttr + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 3 + // CHECK-NEXT: SubstNonTypeTemplateParmExpr + // CHECK-NEXT: NonTypeTemplateParmDecl + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 3 + KernelFunctor2<3> f2; + h.single_task(f2); + + // CHECK-LABEL: FunctionDecl {{.*}}kernel_name_3 + // CHECK: SYCLIntelMaxWorkGroupSizeAttr {{.*}} + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 8 + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 8 + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 8 + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 + // CHECK-NEXT: SYCLIntelMinWorkGroupsPerComputeUnitAttr {{.*}} + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 4 + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 4 + // CHECK-NEXT: SYCLIntelMaxWorkGroupsPerMultiprocessorAttr {{.*}} + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 6 + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 6 + h.single_task( + []() [[intel::max_work_group_size(8, 8, 8), intel::min_work_groups_per_cu(4), + intel::max_work_groups_per_mp(6)]]{}); + }); + + func2<6>(); + + return 0; +} From 9431687c6bfe57ab8f06cdabff4273a73c0abf9f Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Mon, 23 Oct 2023 01:19:25 -0700 Subject: [PATCH 4/9] PR feedback --- clang/lib/CodeGen/CodeGenFunction.cpp | 20 ++-- clang/lib/CodeGen/Targets/NVPTX.cpp | 33 ++++--- clang/lib/Sema/SemaDeclAttr.cpp | 64 +++++------- .../test/CodeGenSYCL/launch_bounds_nvptx.cpp | 8 +- .../{SemaSYCL => CodeGenSYCL}/lb_sm_90.cpp | 99 ++++++++++--------- clang/test/SemaSYCL/lb_sm_70.cpp | 21 ++-- clang/test/SemaSYCL/lb_sm_90_ast.cpp | 2 - 7 files changed, 120 insertions(+), 127 deletions(-) rename clang/test/{SemaSYCL => CodeGenSYCL}/lb_sm_90.cpp (63%) diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index ae00ca1329ce6..411a79209d14b 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -758,23 +758,23 @@ void CodeGenFunction::EmitKernelMetadata(const FunctionDecl *FD, llvm::MDNode::get(Context, AttrMDArgs)); } - if (const auto *A = FD->getAttr()) { - const auto *CE = cast(A->getValue()); + auto attrAsMDArg = [&](Expr *E) { + const auto *CE = cast(E); std::optional ArgVal = CE->getResultAsAPSInt(); - llvm::Metadata *AttrMDArgs[] = {llvm::ConstantAsMetadata::get( - Builder.getInt32(ArgVal->getSExtValue()))}; + assert(ArgVal.has_value() && "Failed to obtain attribute value."); + return llvm::ConstantAsMetadata::get( + Builder.getInt32(ArgVal->getSExtValue())); + }; + + if (const auto *A = FD->getAttr()) { Fn->setMetadata("min_work_groups_per_cu", - llvm::MDNode::get(Context, AttrMDArgs)); + llvm::MDNode::get(Context, {attrAsMDArg(A->getValue())})); } if (const auto *A = FD->getAttr()) { - const auto *CE = cast(A->getValue()); - std::optional ArgVal = CE->getResultAsAPSInt(); - llvm::Metadata *AttrMDArgs[] = {llvm::ConstantAsMetadata::get( - Builder.getInt32(ArgVal->getSExtValue()))}; Fn->setMetadata("max_work_groups_per_mp", - llvm::MDNode::get(Context, AttrMDArgs)); + llvm::MDNode::get(Context, {attrAsMDArg(A->getValue())})); } if (const SYCLIntelMaxWorkGroupSizeAttr *A = diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp index f13bf26d2c36c..3fa2d84c73b1d 100644 --- a/clang/lib/CodeGen/Targets/NVPTX.cpp +++ b/clang/lib/CodeGen/Targets/NVPTX.cpp @@ -251,23 +251,24 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes( (*MWGS->getXDimVal()).getExtValue(); if (MaxThreads > 0) addNVVMMetadata(F, "maxntidx", MaxThreads); - } - if (const auto *MWGPCU = - FD->getAttr()) { - auto *MinWorkGroups = MWGPCU->getValue(); - if (const auto *CE = dyn_cast(MinWorkGroups)) { - auto MinVal = CE->getResultAsAPSInt(); - // The value is guaranteed to be > 0, pass it to the metadata. - addNVVMMetadata(F, "minnctapersm", MinVal.getExtValue()); - } - } - if (const auto *MWGPMP = - FD->getAttr()) { - auto *MaxWorkGroups = MWGPMP->getValue(); - if (const auto *CE = dyn_cast(MaxWorkGroups)) { - auto MaxVal = CE->getResultAsAPSInt(); + + auto attrValue = [&](Expr *E) { + const auto *CE = cast(E); + std::optional Val = CE->getResultAsAPSInt(); + assert(Val.has_value() && "Failed to get attribute value."); + return Val->getZExtValue(); + }; + + if (const auto *MWGPCU = + FD->getAttr()) { // The value is guaranteed to be > 0, pass it to the metadata. - addNVVMMetadata(F, "maxclusterrank", MaxVal.getExtValue()); + addNVVMMetadata(F, "minnctapersm", attrValue(MWGPCU->getValue())); + + if (const auto *MWGPMP = + FD->getAttr()) { + // The value is guaranteed to be > 0, pass it to the metadata. + addNVVMMetadata(F, "maxclusterrank", attrValue(MWGPMP->getValue())); + } } } } diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index aae3c9005c60d..2ecf0a46b9631 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -22,6 +22,7 @@ #include "clang/AST/Mangle.h" #include "clang/AST/RecursiveASTVisitor.h" #include "clang/AST/Type.h" +#include "clang/Basic/AttributeCommonInfo.h" #include "clang/Basic/CharInfo.h" #include "clang/Basic/Cuda.h" #include "clang/Basic/DarwinSDKInfo.h" @@ -200,13 +201,18 @@ static unsigned getNumAttributeArgs(const ParsedAttr &AL) { return AL.getNumArgs() + AL.hasParsedType(); } -/// A helper function to provide Attribute Location for the Attr types -/// AND the ParsedAttr. -template -static std::enable_if_t, SourceLocation> -getAttrLoc(const AttrInfo &AL) { +/// Helper functions to provide Attribute Location for the Attr types, +/// AttributeCommonInfo AND the ParsedAttr. +template +static std::enable_if_t, SourceLocation> +getAttrLoc(const T &AL) { return AL.getLocation(); } +template , bool> = true> +static SourceLocation getAttrLoc(const T &AL) { + return AL.getScopeLoc(); +} static SourceLocation getAttrLoc(const ParsedAttr &AL) { return AL.getLoc(); } /// If Expr is a valid integer constant, get the value of the integer @@ -4447,26 +4453,6 @@ void Sema::AddSYCLIntelMaxGlobalWorkDimAttr(Decl *D, D->addAttr(::new (Context) SYCLIntelMaxGlobalWorkDimAttr(Context, CI, E)); } -// Check that the attribute is an integer constant that can fit in 32-bits. -// Issue correct error message and return false on failure. -bool static check32BitInt(const Expr *E, const AttributeCommonInfo &CI, - Sema &S) { - std::optional I = llvm::APSInt(64); - if (!(I = E->getIntegerConstantExpr(S.Context))) { - S.Diag(E->getExprLoc(), diag::err_attribute_argument_n_type) - << CI << 0 << AANT_ArgumentIntegerConstant << E->getSourceRange(); - return false; - } - // Make sure we can fit it in 32 bits. - if (!I->isIntN(32)) { - S.Diag(E->getExprLoc(), diag::err_ice_too_large) - << toString(*I, 10, false) << 32 << /* Unsigned */ 1; - return false; - } - - return true; -} - void Sema::AddSYCLIntelMinWorkGroupsPerComputeUnitAttr( Decl *D, const AttributeCommonInfo &CI, Expr *E) { if (Context.getLangOpts().SYCLIsDevice && @@ -4476,8 +4462,11 @@ void Sema::AddSYCLIntelMinWorkGroupsPerComputeUnitAttr( return; } if (!E->isValueDependent()) { - if (!check32BitInt(E, CI, *this)) + uint32_t Val; + if (!checkUInt32Argument(*this, CI, E, Val, UINT_MAX /* Idx */, + true /* StrictlyUnsigned */)) return; + // Validate that we have an integer constant expression and then store the // converted constant expression into the semantic attribute so that we // don't have to evaluate it again later. @@ -4485,15 +4474,10 @@ void Sema::AddSYCLIntelMinWorkGroupsPerComputeUnitAttr( ExprResult Res = VerifyIntegerConstantExpression(E, &ArgVal); if (Res.isInvalid()) return; + if (Val != ArgVal) + llvm_unreachable("Values must not differ."); E = Res.get(); - // This attribute must be greater than 0. - if (ArgVal <= 0) { - Diag(E->getBeginLoc(), diag::err_attribute_argument_is_zero) - << CI << E->getSourceRange(); - return; - } - // Check to see if there's a duplicate attribute with different values // already applied to the declaration. if (const auto *DeclAttr = @@ -4543,8 +4527,11 @@ void Sema::AddSYCLIntelMaxWorkGroupsPerMultiprocessorAttr( } } if (!E->isValueDependent()) { - if (!check32BitInt(E, CI, *this)) + uint32_t Val; + if (!checkUInt32Argument(*this, CI, E, Val, UINT_MAX /* Idx */, + true /* StrictlyUnsigned */)) return; + // Validate that we have an integer constant expression and then store the // converted constant expression into the semantic attribute so that we // don't have to evaluate it again later. @@ -4553,13 +4540,8 @@ void Sema::AddSYCLIntelMaxWorkGroupsPerMultiprocessorAttr( if (Res.isInvalid()) return; E = Res.get(); - - // This attribute must be greater than 0. - if (ArgVal <= 0) { - Diag(E->getBeginLoc(), diag::err_attribute_argument_is_zero) - << CI << E->getSourceRange(); - return; - } + if (Val != ArgVal) + llvm_unreachable("Values must not differ."); // Check to see if there's a duplicate attribute with different values // already applied to the declaration. diff --git a/clang/test/CodeGenSYCL/launch_bounds_nvptx.cpp b/clang/test/CodeGenSYCL/launch_bounds_nvptx.cpp index 9669b43a02b96..5e61ae690254d 100644 --- a/clang/test/CodeGenSYCL/launch_bounds_nvptx.cpp +++ b/clang/test/CodeGenSYCL/launch_bounds_nvptx.cpp @@ -1,6 +1,10 @@ -// REQUIRES: cuda +// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -sycl-std=2017 -triple nvptx-unknown-unknown -target-cpu sm_90 -disable-llvm-passes -S -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -sycl-std=2017 -triple nvptx-unknown-unknown -target-cpu sm_90 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s +// Test correct handling of maximum work group size, minimum work groups per +// compute unit and maximum work groups per multi-processor attributes, that +// correspond to CUDA's launch bounds. Expect max_work_group_size, +// min_work_groups_per_cu and max_work_groups_per_mp that are mapped to +// maxntidx, minnctapersm, maxclusterrank PTX directives respectively. #include "sycl.hpp" diff --git a/clang/test/SemaSYCL/lb_sm_90.cpp b/clang/test/CodeGenSYCL/lb_sm_90.cpp similarity index 63% rename from clang/test/SemaSYCL/lb_sm_90.cpp rename to clang/test/CodeGenSYCL/lb_sm_90.cpp index daa63c4fd875c..55dc1ef3e5640 100644 --- a/clang/test/SemaSYCL/lb_sm_90.cpp +++ b/clang/test/CodeGenSYCL/lb_sm_90.cpp @@ -1,48 +1,51 @@ -// REQUIRES: cuda - -// RUN: %clangxx -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_90 -fsycl-device-only -S -Xclang -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK-IR -// RUN: %clangxx -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_90 -fsycl -fsyntax-only -Xclang -verify %s -// expected-no-diagnostics - -#include - -template class Functor { -public: - [[intel::max_work_group_size(1, 1, N1), intel::min_work_groups_per_cu(N2), - intel::max_work_groups_per_mp(N3)]] void - operator()() const {} -}; - -int main() { - sycl::queue Q{}; - - sycl::range<1> Gws(32); - sycl::range<1> Lws(32); - - Q.submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::nd_range<1>(Gws, Lws), - [=](sycl::id<1>) [[intel::max_work_group_size(1, 1, 256), - intel::min_work_groups_per_cu(2), - intel::max_work_groups_per_mp(4)]] { - volatile int A = 42; - }); - }).wait_and_throw(); - // CHECK-IR: !min_work_groups_per_cu [[MWGPCU:![0-9]+]] - // CHECK-IR: !max_work_groups_per_mp [[MWGPMP:![0-9]+]] - // CHECK-IR: !max_work_group_size [[MWGS:![0-9]+]] - - Q.single_task(Functor<512, 8, 16>{}).wait(); - // CHECK-IR: !min_work_groups_per_cu [[MWGPCU_F:![0-9]+]] - // CHECK-IR: !max_work_groups_per_mp [[MWGPMP_F:![0-9]+]] - // CHECK-IR: !max_work_group_size [[MWGS_F:![0-9]+]] - - // CHECK-IR: [[MWGPCU]] = !{i32 2} - // CHECK-IR: [[MWGPMP]] = !{i32 4} - // CHECK-IR: [[MWGS]] = !{i32 256, i32 1, i32 1} - - // CHECK-IR: [[MWGPCU_F]] = !{i32 8} - // CHECK-IR: [[MWGPMP_F]] = !{i32 16} - // CHECK-IR: [[MWGS_F]] = !{i32 512, i32 1, i32 1} - - return 0; -} +// RUN: %clang_cc1 -internal-isystem %S/Inputs -triple nvptx-unknown-unknown -target-cpu sm_90 -fsycl-is-device -S -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK-IR +// RUN: %clang_cc1 -internal-isystem %S/Inputs %s -triple nvptx64-nvidia-cuda -target-cpu sm_90 -fsycl-is-device -fsyntax-only -verify +// expected-no-diagnostics + +// Maximum work groups per multi-processor, mapped to maxclusterrank PTX +// directive, is an SM_90 feature, make sure that correct metadata is generated +// and no warnings/errors are issued. + +#include "sycl.hpp" + +template class Functor { +public: + [[intel::max_work_group_size(1, 1, N1), intel::min_work_groups_per_cu(N2), + intel::max_work_groups_per_mp(N3)]] void + operator()() const {} +}; + +int main() { + sycl::queue Q{}; + + sycl::range<1> Gws(32); + + Q.submit([&](sycl::handler &cgh) { + cgh.parallel_for(Gws, + [=](sycl::id<1>) [[intel::max_work_group_size(1, 1, 256), + intel::min_work_groups_per_cu(2), + intel::max_work_groups_per_mp(4)]] { + volatile int A = 42; + }); + }); + // CHECK-IR: !min_work_groups_per_cu [[MWGPCU:![0-9]+]] + // CHECK-IR: !max_work_groups_per_mp [[MWGPMP:![0-9]+]] + // CHECK-IR: !max_work_group_size [[MWGS:![0-9]+]] + + Q.submit([&](sycl::handler &cgh) { + cgh.single_task(Functor<512, 8, 16>{}); + }); + // CHECK-IR: !min_work_groups_per_cu [[MWGPCU_F:![0-9]+]] + // CHECK-IR: !max_work_groups_per_mp [[MWGPMP_F:![0-9]+]] + // CHECK-IR: !max_work_group_size [[MWGS_F:![0-9]+]] + + // CHECK-IR: [[MWGPCU]] = !{i32 2} + // CHECK-IR: [[MWGPMP]] = !{i32 4} + // CHECK-IR: [[MWGS]] = !{i32 256, i32 1, i32 1} + + // CHECK-IR: [[MWGPCU_F]] = !{i32 8} + // CHECK-IR: [[MWGPMP_F]] = !{i32 16} + // CHECK-IR: [[MWGS_F]] = !{i32 512, i32 1, i32 1} + + return 0; +} diff --git a/clang/test/SemaSYCL/lb_sm_70.cpp b/clang/test/SemaSYCL/lb_sm_70.cpp index e23ba113ca120..9fb31ab429cc5 100644 --- a/clang/test/SemaSYCL/lb_sm_70.cpp +++ b/clang/test/SemaSYCL/lb_sm_70.cpp @@ -1,8 +1,11 @@ -// REQUIRES: cuda +// RUN: %clang_cc1 -internal-isystem %S/Inputs -triple nvptx-unknown-unknown -target-cpu sm_70 -fsycl-is-device -S -emit-llvm %s -o -ferror-limit=100 -fsyntax-only -verify %s -// RUN: %clangxx -ferror-limit=100 -fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_70 -fsycl-device-only -fsyntax-only -Xclang -verify %s +// Maximum work groups per multi-processor, mapped to maxclusterrank PTX +// directive, is an SM_90 feature, make sure that correct warning is issued on +// architectures lower than that. Furthermore, warn/error incorrect values +// specified for max_work_groups_per_mp and min_work_groups_per_cu. -#include +#include "sycl.hpp" template class Functor { public: @@ -24,7 +27,7 @@ int main() { intel::max_work_groups_per_mp(4)]] { volatile int A = 42; }); constexpr float A = 2.0; - // expected-error@+5 {{'min_work_groups_per_cu' attribute requires parameter 0 to be an integer constant}} + // expected-error@+5 {{'min_work_groups_per_cu' attribute requires an integer constant}} // expected-warning@+5 {{'maxclusterrank' requires sm_90 or higher, CUDA arch provided: sm_70, ignoring 'max_work_groups_per_mp' attribute}} cgh.single_task( [=]() @@ -32,7 +35,7 @@ int main() { intel::min_work_groups_per_cu(A), intel::max_work_groups_per_mp(4)]] { volatile int A = 42; }); - // expected-error@+3 {{'min_work_groups_per_cu' attribute requires parameter 0 to be an integer constant}} + // expected-error@+3 {{'min_work_groups_per_cu' attribute requires an integer constant}} cgh.single_task( [=]() [[intel::max_work_group_size(1, 1, 256), intel::min_work_groups_per_cu(2147483647 + 1)]] @@ -46,13 +49,15 @@ int main() { volatile int A = 42; }); - // expected-error@+1 {{'min_work_groups_per_cu' attribute must be greater than 0}} + // expected-error@+1 {{'min_work_groups_per_cu' attribute requires a non-negative integral compile time constant expression}} cgh.single_task([=]() [[intel::min_work_groups_per_cu(-8)]] { volatile int A = 42; }); - }).wait_and_throw(); + }); - Q.single_task(Functor<512, 8, 16>{}).wait(); + Q.submit([&](sycl::handler &cgh) { + cgh.single_task(Functor<512, 8, 16>{}); + }); return 0; } diff --git a/clang/test/SemaSYCL/lb_sm_90_ast.cpp b/clang/test/SemaSYCL/lb_sm_90_ast.cpp index f40fee0d749af..5831224387489 100644 --- a/clang/test/SemaSYCL/lb_sm_90_ast.cpp +++ b/clang/test/SemaSYCL/lb_sm_90_ast.cpp @@ -1,5 +1,3 @@ -// REQUIERS: cuda - // RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -sycl-std=2017 -ast-dump -triple nvptx-unknown-unknown -target-cpu sm_90 %s | FileCheck %s // Tests for AST of Intel max_work_group_size, min_work_groups_per_cu and From 7744f6952153d9e31a6165d65f1c8a2896443e8f Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Thu, 26 Oct 2023 23:43:50 -0700 Subject: [PATCH 5/9] Remove SupportsNonconformingLambdaSyntax and adjust tests --- clang/include/clang/Basic/Attr.td | 2 - .../test/CodeGenSYCL/launch_bounds_nvptx.cpp | 6 +- clang/test/CodeGenSYCL/lb_sm_90.cpp | 16 ++--- clang/test/SemaSYCL/lb_sm_70.cpp | 69 +++++++++---------- clang/test/SemaSYCL/lb_sm_90_ast.cpp | 21 +++--- 5 files changed, 53 insertions(+), 61 deletions(-) diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 377b7e6625bfb..9fdd64a10b091 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -1589,7 +1589,6 @@ def SYCLIntelMinWorkGroupsPerComputeUnit : InheritableAttr { let LangOpts = [SYCLIsDevice, SilentlyIgnoreSYCLIsHost]; let Subjects = SubjectList<[Function], ErrorDiag>; let Documentation = [SYCLIntelMinWorkGroupsPerComputeUnitAttrDocs]; - let SupportsNonconformingLambdaSyntax = 1; } def SYCLIntelMaxWorkGroupsPerMultiprocessor : InheritableAttr { @@ -1598,7 +1597,6 @@ def SYCLIntelMaxWorkGroupsPerMultiprocessor : InheritableAttr { let LangOpts = [SYCLIsDevice, SilentlyIgnoreSYCLIsHost]; let Subjects = SubjectList<[Function], ErrorDiag>; let Documentation = [SYCLIntelMaxWorkGroupsPerMultiprocessorDocs]; - let SupportsNonconformingLambdaSyntax = 1; } def SYCLIntelMaxGlobalWorkDim : InheritableAttr { diff --git a/clang/test/CodeGenSYCL/launch_bounds_nvptx.cpp b/clang/test/CodeGenSYCL/launch_bounds_nvptx.cpp index 5e61ae690254d..13c6d1ce4e5f0 100644 --- a/clang/test/CodeGenSYCL/launch_bounds_nvptx.cpp +++ b/clang/test/CodeGenSYCL/launch_bounds_nvptx.cpp @@ -42,9 +42,9 @@ int main() { // Test attribute is applied on lambda. h.single_task( - []() [[intel::max_work_group_size(8, 8, 8), - intel::min_work_groups_per_cu(2), - intel::max_work_groups_per_mp(4)]] {}); + [] [[intel::max_work_group_size(8, 8, 8), + intel::min_work_groups_per_cu(2), + intel::max_work_groups_per_mp(4)]] () {}); // Test class template argument. Functor<6> f; diff --git a/clang/test/CodeGenSYCL/lb_sm_90.cpp b/clang/test/CodeGenSYCL/lb_sm_90.cpp index 55dc1ef3e5640..752174b00749f 100644 --- a/clang/test/CodeGenSYCL/lb_sm_90.cpp +++ b/clang/test/CodeGenSYCL/lb_sm_90.cpp @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -internal-isystem %S/Inputs -triple nvptx-unknown-unknown -target-cpu sm_90 -fsycl-is-device -S -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK-IR -// RUN: %clang_cc1 -internal-isystem %S/Inputs %s -triple nvptx64-nvidia-cuda -target-cpu sm_90 -fsycl-is-device -fsyntax-only -verify +// RUN: %clang_cc1 -internal-isystem %S/Inputs -triple nvptx-unknown-unknown -target-cpu sm_90 -fsycl-is-device -Wno-c++23-extensions -S -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK-IR +// RUN: %clang_cc1 -internal-isystem %S/Inputs %s -triple nvptx64-nvidia-cuda -target-cpu sm_90 -fsycl-is-device -fsyntax-only -Wno-c++23-extensions -verify // expected-no-diagnostics // Maximum work groups per multi-processor, mapped to maxclusterrank PTX @@ -21,13 +21,11 @@ int main() { sycl::range<1> Gws(32); Q.submit([&](sycl::handler &cgh) { - cgh.parallel_for(Gws, - [=](sycl::id<1>) [[intel::max_work_group_size(1, 1, 256), - intel::min_work_groups_per_cu(2), - intel::max_work_groups_per_mp(4)]] { - volatile int A = 42; - }); - }); + cgh.parallel_for(Gws, [=] [[intel::max_work_group_size(1, 1, 256), + intel::min_work_groups_per_cu(2), + intel::max_work_groups_per_mp(4)]] ( + sycl::id<1>) { volatile int A = 42; }); + }); // CHECK-IR: !min_work_groups_per_cu [[MWGPCU:![0-9]+]] // CHECK-IR: !max_work_groups_per_mp [[MWGPMP:![0-9]+]] // CHECK-IR: !max_work_group_size [[MWGS:![0-9]+]] diff --git a/clang/test/SemaSYCL/lb_sm_70.cpp b/clang/test/SemaSYCL/lb_sm_70.cpp index 9fb31ab429cc5..f9078eb2588e5 100644 --- a/clang/test/SemaSYCL/lb_sm_70.cpp +++ b/clang/test/SemaSYCL/lb_sm_70.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -internal-isystem %S/Inputs -triple nvptx-unknown-unknown -target-cpu sm_70 -fsycl-is-device -S -emit-llvm %s -o -ferror-limit=100 -fsyntax-only -verify %s +// RUN: %clang_cc1 -internal-isystem %S/Inputs -triple nvptx-unknown-unknown -target-cpu sm_70 -fsycl-is-device -Wno-c++23-extensions -S -emit-llvm %s -o -ferror-limit=100 -fsyntax-only -verify %s // Maximum work groups per multi-processor, mapped to maxclusterrank PTX // directive, is an SM_90 feature, make sure that correct warning is issued on @@ -19,41 +19,38 @@ int main() { sycl::queue Q{}; Q.submit([&](sycl::handler &cgh) { - // expected-warning@+5 {{'maxclusterrank' requires sm_90 or higher, CUDA arch provided: sm_70, ignoring 'max_work_groups_per_mp' attribute}} - cgh.single_task( - [=]() - [[intel::max_work_group_size(1, 1, 256), - intel::min_work_groups_per_cu(2), - intel::max_work_groups_per_mp(4)]] { volatile int A = 42; }); - - constexpr float A = 2.0; - // expected-error@+5 {{'min_work_groups_per_cu' attribute requires an integer constant}} - // expected-warning@+5 {{'maxclusterrank' requires sm_90 or higher, CUDA arch provided: sm_70, ignoring 'max_work_groups_per_mp' attribute}} - cgh.single_task( - [=]() - [[intel::max_work_group_size(1, 1, 256), - intel::min_work_groups_per_cu(A), - intel::max_work_groups_per_mp(4)]] { volatile int A = 42; }); - - // expected-error@+3 {{'min_work_groups_per_cu' attribute requires an integer constant}} - cgh.single_task( - [=]() [[intel::max_work_group_size(1, 1, 256), - intel::min_work_groups_per_cu(2147483647 + 1)]] - { volatile int A = 42; }); - - // expected-warning@+4 {{attribute 'min_work_groups_per_cu' is already applied with different arguments}} - // expected-note@+2 {{previous attribute is here}} - cgh.single_task([=]() [[intel::max_work_group_size(1, 1, 256), - intel::min_work_groups_per_cu(4), - intel::min_work_groups_per_cu(8)]] { - volatile int A = 42; - }); - - // expected-error@+1 {{'min_work_groups_per_cu' attribute requires a non-negative integral compile time constant expression}} - cgh.single_task([=]() [[intel::min_work_groups_per_cu(-8)]] { - volatile int A = 42; - }); - }); + // expected-warning@+4 {{'maxclusterrank' requires sm_90 or higher, CUDA arch provided: sm_70, ignoring 'max_work_groups_per_mp' attribute}} + cgh.single_task( + [=] [[intel::max_work_group_size(1, 1, 256), + intel::min_work_groups_per_cu(2), + intel::max_work_groups_per_mp(4)]] () { volatile int A = 42; }); + + constexpr float A = 2.0; + // expected-error@+4 {{'min_work_groups_per_cu' attribute requires an integer constant}} + // expected-warning@+4 {{'maxclusterrank' requires sm_90 or higher, CUDA arch provided: sm_70, ignoring 'max_work_groups_per_mp' attribute}} + cgh.single_task( + [=] [[intel::max_work_group_size(1, 1, 256), + intel::min_work_groups_per_cu(A), + intel::max_work_groups_per_mp(4)]] () { volatile int A = 42; }); + + // expected-error@+3 {{'min_work_groups_per_cu' attribute requires an integer constant}} + cgh.single_task( + [=] [[intel::max_work_group_size(1, 1, 256), + intel::min_work_groups_per_cu(2147483647 + 1)]] () { + volatile int A = 42; + }); + + // expected-warning@+5 {{attribute 'min_work_groups_per_cu' is already applied with different arguments}} + // expected-note@+3 {{previous attribute is here}} + cgh.single_task( + [=] [[intel::max_work_group_size(1, 1, 256), + intel::min_work_groups_per_cu(4), + intel::min_work_groups_per_cu(8)]] () { volatile int A = 42; }); + + // expected-error@+2 {{'min_work_groups_per_cu' attribute requires a non-negative integral compile time constant expression}} + cgh.single_task( + [=] [[intel::min_work_groups_per_cu(-8)]] () { volatile int A = 42; }); + }); Q.submit([&](sycl::handler &cgh) { cgh.single_task(Functor<512, 8, 16>{}); diff --git a/clang/test/SemaSYCL/lb_sm_90_ast.cpp b/clang/test/SemaSYCL/lb_sm_90_ast.cpp index 5831224387489..2493445d6b985 100644 --- a/clang/test/SemaSYCL/lb_sm_90_ast.cpp +++ b/clang/test/SemaSYCL/lb_sm_90_ast.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -sycl-std=2017 -ast-dump -triple nvptx-unknown-unknown -target-cpu sm_90 %s | FileCheck %s +// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -sycl-std=2017 -ast-dump -triple nvptx-unknown-unknown -target-cpu sm_90 -Wno-c++23-extensions %s | FileCheck %s // Tests for AST of Intel max_work_group_size, min_work_groups_per_cu and // max_work_groups_per_mp attribute. @@ -77,22 +77,20 @@ func1() {} // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 6 template [[intel::max_work_group_size(N, 8, 8), intel::min_work_groups_per_cu(N), - intel::max_work_groups_per_mp(N)]] void func2() {} + intel::max_work_groups_per_mp(N)]] void +func2() {} class KernelFunctor { public: - void operator()() const { - func1(); - } + void operator()() const { func1(); } }; // Test that checks template parameter support on class member function. -template -class KernelFunctor2 { +template class KernelFunctor2 { public: [[intel::max_work_group_size(N, 8, 8), intel::min_work_groups_per_cu(N), - intel::max_work_groups_per_mp(N)]] void operator()() const { - } + intel::max_work_groups_per_mp(N)]] void + operator()() const {} }; int main() { @@ -167,8 +165,9 @@ int main() { // CHECK-NEXT: value: Int 6 // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 6 h.single_task( - []() [[intel::max_work_group_size(8, 8, 8), intel::min_work_groups_per_cu(4), - intel::max_work_groups_per_mp(6)]]{}); + [] [[intel::max_work_group_size(8, 8, 8), + intel::min_work_groups_per_cu(4), + intel::max_work_groups_per_mp(6)]] () {}); }); func2<6>(); From 0d6b772f88c238fd36de3b6a8a41e81185fc2d00 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Tue, 31 Oct 2023 04:50:04 -0700 Subject: [PATCH 6/9] PR feedback 2. --- .../clang/Basic/DiagnosticSemaKinds.td | 2 +- clang/lib/Sema/SemaDeclAttr.cpp | 1 - clang/test/CodeGenSYCL/lb_sm_90.cpp | 5 +-- clang/test/SemaSYCL/lb_sm_70.cpp | 2 +- clang/test/SemaSYCL/lb_sm_90.cpp | 31 +++++++++++++++++++ 5 files changed, 34 insertions(+), 7 deletions(-) create mode 100644 clang/test/SemaSYCL/lb_sm_90.cpp diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 6e1ad6c734a70..6484483134686 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -12003,7 +12003,7 @@ def err_sycl_special_type_num_init_method : Error< "types with 'sycl_special_class' attribute must have one and only one '__init' " "method defined">; def warn_launch_bounds_is_cuda_specific : Warning< - "%0 attribute ignored, only applicable when targetting Nvidia devices">, + "%0 attribute ignored, only applicable when targeting Nvidia devices">, InGroup; def warn_cuda_maxclusterrank_sm_90 : Warning< diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index a2c338b9fc38b..bffece2670d2b 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -22,7 +22,6 @@ #include "clang/AST/Mangle.h" #include "clang/AST/RecursiveASTVisitor.h" #include "clang/AST/Type.h" -#include "clang/Basic/AttributeCommonInfo.h" #include "clang/Basic/CharInfo.h" #include "clang/Basic/Cuda.h" #include "clang/Basic/DarwinSDKInfo.h" diff --git a/clang/test/CodeGenSYCL/lb_sm_90.cpp b/clang/test/CodeGenSYCL/lb_sm_90.cpp index 752174b00749f..5909edde1479f 100644 --- a/clang/test/CodeGenSYCL/lb_sm_90.cpp +++ b/clang/test/CodeGenSYCL/lb_sm_90.cpp @@ -1,10 +1,7 @@ // RUN: %clang_cc1 -internal-isystem %S/Inputs -triple nvptx-unknown-unknown -target-cpu sm_90 -fsycl-is-device -Wno-c++23-extensions -S -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK-IR -// RUN: %clang_cc1 -internal-isystem %S/Inputs %s -triple nvptx64-nvidia-cuda -target-cpu sm_90 -fsycl-is-device -fsyntax-only -Wno-c++23-extensions -verify -// expected-no-diagnostics // Maximum work groups per multi-processor, mapped to maxclusterrank PTX -// directive, is an SM_90 feature, make sure that correct metadata is generated -// and no warnings/errors are issued. +// directive, is an SM_90 feature, make sure that correct metadata is generated. #include "sycl.hpp" diff --git a/clang/test/SemaSYCL/lb_sm_70.cpp b/clang/test/SemaSYCL/lb_sm_70.cpp index f9078eb2588e5..f5bee9b0242c6 100644 --- a/clang/test/SemaSYCL/lb_sm_70.cpp +++ b/clang/test/SemaSYCL/lb_sm_70.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -internal-isystem %S/Inputs -triple nvptx-unknown-unknown -target-cpu sm_70 -fsycl-is-device -Wno-c++23-extensions -S -emit-llvm %s -o -ferror-limit=100 -fsyntax-only -verify %s +// RUN: %clang_cc1 -internal-isystem %S/Inputs -triple nvptx-unknown-unknown -target-cpu sm_70 -fsycl-is-device -Wno-c++23-extensions -S -emit-llvm %s -o -fsyntax-only -verify %s // Maximum work groups per multi-processor, mapped to maxclusterrank PTX // directive, is an SM_90 feature, make sure that correct warning is issued on diff --git a/clang/test/SemaSYCL/lb_sm_90.cpp b/clang/test/SemaSYCL/lb_sm_90.cpp new file mode 100644 index 0000000000000..9d2f61571565d --- /dev/null +++ b/clang/test/SemaSYCL/lb_sm_90.cpp @@ -0,0 +1,31 @@ +// RUN: %clang_cc1 -internal-isystem %S/Inputs %s -triple nvptx64-nvidia-cuda -target-cpu sm_90 -fsycl-is-device -fsyntax-only -Wno-c++23-extensions -verify +// expected-no-diagnostics + +// Maximum work groups per multi-processor, mapped to maxclusterrank PTX +// directive, is an SM_90 feature, make sure that no warnings/errors are issued. + +#include "sycl.hpp" + +template class Functor { +public: + [[intel::max_work_group_size(1, 1, N1), intel::min_work_groups_per_cu(N2), + intel::max_work_groups_per_mp(N3)]] void + operator()() const {} +}; + +int main() { + sycl::queue Q{}; + + Q.submit([&](sycl::handler &cgh) { + cgh.single_task( [=] [[intel::max_work_group_size(1, 1, 256), + intel::min_work_groups_per_cu(2), + intel::max_work_groups_per_mp(4)]] ( +) { volatile int A = 42; }); + }); + + Q.submit([&](sycl::handler &cgh) { + cgh.single_task(Functor<512, 8, 16>{}); + }); + + return 0; +} From acba95f8d561c66ec83fb33b56c027a2f7648365 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Fri, 3 Nov 2023 03:46:45 -0700 Subject: [PATCH 7/9] PR: attribute check --- clang/lib/Sema/SemaDeclAttr.cpp | 42 +++++++++++++++++--------------- clang/test/SemaSYCL/lb_sm_70.cpp | 7 +++--- 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index bffece2670d2b..a3fb07bf3ecd0 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -207,11 +207,6 @@ static std::enable_if_t, SourceLocation> getAttrLoc(const T &AL) { return AL.getLocation(); } -template , bool> = true> -static SourceLocation getAttrLoc(const T &AL) { - return AL.getScopeLoc(); -} static SourceLocation getAttrLoc(const ParsedAttr &AL) { return AL.getLoc(); } /// If Expr is a valid integer constant, get the value of the integer @@ -4452,6 +4447,25 @@ void Sema::AddSYCLIntelMaxGlobalWorkDimAttr(Decl *D, D->addAttr(::new (Context) SYCLIntelMaxGlobalWorkDimAttr(Context, CI, E)); } +// Check that the value is a non-negative integer constant that can fit in +// 32-bits. Issue correct error message and return false on failure. +bool static check32BitInt(const Expr *E, Sema &S, llvm::APSInt &I, + const AttributeCommonInfo &CI) { + if (!I.isIntN(32)) { + S.Diag(E->getExprLoc(), diag::err_ice_too_large) + << llvm::toString(I, 10, false) << 32 << /* Unsigned */ 1; + return false; + } + + if (I.isSigned() && I.isNegative()) { + S.Diag(E->getExprLoc(), diag::err_attribute_requires_positive_integer) + << CI << /* Non-negative */ 1; + return false; + } + + return true; +} + void Sema::AddSYCLIntelMinWorkGroupsPerComputeUnitAttr( Decl *D, const AttributeCommonInfo &CI, Expr *E) { if (Context.getLangOpts().SYCLIsDevice && @@ -4461,11 +4475,6 @@ void Sema::AddSYCLIntelMinWorkGroupsPerComputeUnitAttr( return; } if (!E->isValueDependent()) { - uint32_t Val; - if (!checkUInt32Argument(*this, CI, E, Val, UINT_MAX /* Idx */, - true /* StrictlyUnsigned */)) - return; - // Validate that we have an integer constant expression and then store the // converted constant expression into the semantic attribute so that we // don't have to evaluate it again later. @@ -4473,8 +4482,8 @@ void Sema::AddSYCLIntelMinWorkGroupsPerComputeUnitAttr( ExprResult Res = VerifyIntegerConstantExpression(E, &ArgVal); if (Res.isInvalid()) return; - if (Val != ArgVal) - llvm_unreachable("Values must not differ."); + if (!check32BitInt(E, *this, ArgVal, CI)) + return; E = Res.get(); // Check to see if there's a duplicate attribute with different values @@ -4526,11 +4535,6 @@ void Sema::AddSYCLIntelMaxWorkGroupsPerMultiprocessorAttr( } } if (!E->isValueDependent()) { - uint32_t Val; - if (!checkUInt32Argument(*this, CI, E, Val, UINT_MAX /* Idx */, - true /* StrictlyUnsigned */)) - return; - // Validate that we have an integer constant expression and then store the // converted constant expression into the semantic attribute so that we // don't have to evaluate it again later. @@ -4538,9 +4542,9 @@ void Sema::AddSYCLIntelMaxWorkGroupsPerMultiprocessorAttr( ExprResult Res = VerifyIntegerConstantExpression(E, &ArgVal); if (Res.isInvalid()) return; + if (!check32BitInt(E, *this, ArgVal, CI)) + return; E = Res.get(); - if (Val != ArgVal) - llvm_unreachable("Values must not differ."); // Check to see if there's a duplicate attribute with different values // already applied to the declaration. diff --git a/clang/test/SemaSYCL/lb_sm_70.cpp b/clang/test/SemaSYCL/lb_sm_70.cpp index f5bee9b0242c6..071482d10ba49 100644 --- a/clang/test/SemaSYCL/lb_sm_70.cpp +++ b/clang/test/SemaSYCL/lb_sm_70.cpp @@ -26,14 +26,15 @@ int main() { intel::max_work_groups_per_mp(4)]] () { volatile int A = 42; }); constexpr float A = 2.0; - // expected-error@+4 {{'min_work_groups_per_cu' attribute requires an integer constant}} - // expected-warning@+4 {{'maxclusterrank' requires sm_90 or higher, CUDA arch provided: sm_70, ignoring 'max_work_groups_per_mp' attribute}} + // expected-warning@+5{{'maxclusterrank' requires sm_90 or higher, CUDA arch provided: sm_70, ignoring 'max_work_groups_per_mp' attribute}} + // expected-error@+3 {{integral constant expression must have integral or unscoped enumeration type, not 'float'}} cgh.single_task( [=] [[intel::max_work_group_size(1, 1, 256), intel::min_work_groups_per_cu(A), intel::max_work_groups_per_mp(4)]] () { volatile int A = 42; }); - // expected-error@+3 {{'min_work_groups_per_cu' attribute requires an integer constant}} + // expected-error@+4 {{expression is not an integral constant expression}} + // expected-note@+3 {{value 2147483648 is outside the range of representable values of type 'int'}} cgh.single_task( [=] [[intel::max_work_group_size(1, 1, 256), intel::min_work_groups_per_cu(2147483647 + 1)]] () { From 7691bbae43f4b8a24a56e8aa5c02e24ce5ebe3f5 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Fri, 3 Nov 2023 04:06:43 -0700 Subject: [PATCH 8/9] PR feedback 3 --- clang/lib/CodeGen/CodeGenFunction.cpp | 1 - clang/lib/CodeGen/Targets/NVPTX.cpp | 1 - clang/test/SemaSYCL/lb_sm_70.cpp | 2 +- 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index 8a67d251c2e78..647ae6206d8e3 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -761,7 +761,6 @@ void CodeGenFunction::EmitKernelMetadata(const FunctionDecl *FD, auto attrAsMDArg = [&](Expr *E) { const auto *CE = cast(E); std::optional ArgVal = CE->getResultAsAPSInt(); - assert(ArgVal.has_value() && "Failed to obtain attribute value."); return llvm::ConstantAsMetadata::get( Builder.getInt32(ArgVal->getSExtValue())); }; diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp index 3fa2d84c73b1d..bbebb560a67b1 100644 --- a/clang/lib/CodeGen/Targets/NVPTX.cpp +++ b/clang/lib/CodeGen/Targets/NVPTX.cpp @@ -255,7 +255,6 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes( auto attrValue = [&](Expr *E) { const auto *CE = cast(E); std::optional Val = CE->getResultAsAPSInt(); - assert(Val.has_value() && "Failed to get attribute value."); return Val->getZExtValue(); }; diff --git a/clang/test/SemaSYCL/lb_sm_70.cpp b/clang/test/SemaSYCL/lb_sm_70.cpp index 071482d10ba49..02a7ceee0b221 100644 --- a/clang/test/SemaSYCL/lb_sm_70.cpp +++ b/clang/test/SemaSYCL/lb_sm_70.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -internal-isystem %S/Inputs -triple nvptx-unknown-unknown -target-cpu sm_70 -fsycl-is-device -Wno-c++23-extensions -S -emit-llvm %s -o -fsyntax-only -verify %s +// RUN: %clang_cc1 -internal-isystem %S/Inputs -triple nvptx-unknown-unknown -target-cpu sm_70 -fsycl-is-device -Wno-c++23-extensions %s -o -fsyntax-only -verify %s // Maximum work groups per multi-processor, mapped to maxclusterrank PTX // directive, is an SM_90 feature, make sure that correct warning is issued on From 13422248d41d46d1a2be1034c9feb5e185fc465d Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Mon, 6 Nov 2023 01:05:25 -0800 Subject: [PATCH 9/9] Remove changes to getAttrLoc --- clang/lib/Sema/SemaDeclAttr.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index a3fb07bf3ecd0..0b123a3ca878d 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -200,11 +200,11 @@ static unsigned getNumAttributeArgs(const ParsedAttr &AL) { return AL.getNumArgs() + AL.hasParsedType(); } -/// Helper functions to provide Attribute Location for the Attr types, -/// AttributeCommonInfo AND the ParsedAttr. -template -static std::enable_if_t, SourceLocation> -getAttrLoc(const T &AL) { +/// A helper function to provide Attribute Location for the Attr types +/// AND the ParsedAttr. +template +static std::enable_if_t, SourceLocation> +getAttrLoc(const AttrInfo &AL) { return AL.getLocation(); } static SourceLocation getAttrLoc(const ParsedAttr &AL) { return AL.getLoc(); }