diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 1a9aee6a3ebfc..4af37e4e9b4d8 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -1583,6 +1583,22 @@ def SYCLIntelMaxWorkGroupSize : InheritableAttr { let SupportsNonconformingLambdaSyntax = 1; } +def SYCLIntelMinWorkGroupsPerComputeUnit : InheritableAttr { + let Spellings = [CXX11<"intel", "min_work_groups_per_cu">]; + let Args = [ExprArgument<"Value">]; + let LangOpts = [SYCLIsDevice, SilentlyIgnoreSYCLIsHost]; + let Subjects = SubjectList<[Function], ErrorDiag>; + let Documentation = [SYCLIntelMinWorkGroupsPerComputeUnitAttrDocs]; +} + +def SYCLIntelMaxWorkGroupsPerMultiprocessor : InheritableAttr { + let Spellings = [CXX11<"intel", "max_work_groups_per_mp">]; + let Args = [ExprArgument<"Value">]; + let LangOpts = [SYCLIsDevice, SilentlyIgnoreSYCLIsHost]; + let Subjects = SubjectList<[Function], ErrorDiag>; + let Documentation = [SYCLIntelMaxWorkGroupsPerMultiprocessorDocs]; +} + def SYCLIntelMaxGlobalWorkDim : InheritableAttr { let Spellings = [CXX11<"intel", "max_global_work_dim">]; let Args = [ExprArgument<"Value">]; diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 0957da6019298..3be0445701ec4 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -3017,6 +3017,65 @@ In SYCL 2020 mode, the attribute is not propagated to the kernel. }]; } +def SYCLIntelMinWorkGroupsPerComputeUnitAttrDocs: Documentation { + let Category = DocCatFunction; + let Heading = "intel::min_work_groups_per_cu"; + let Content = [{ +Applies to a device function/lambda function. Indicates the desired minimum +number of resident work_groups per multiprocessor. It complies to the +.minnctapersm PTX directive. + +.. 
code-block:: c++ + + [[intel::min_work_groups_per_cu(2)]] void foo() {} + + class Foo { + public: + [[intel::min_work_groups_per_cu(2)]] void operator()() const {} + }; + + template + class Functor { + public: + [[intel::min_work_groups_per_cu(N)]] void operator()() const {} + }; + + template + [[intel::min_work_groups_per_cu(N)]] void func() {} + + }]; +} + +def SYCLIntelMaxWorkGroupsPerMultiprocessorDocs: Documentation { + let Category = DocCatFunction; + let Heading = "intel::max_work_groups_per_mp"; + let Content = [{ +Applies to a device function/lambda function. Indicates the desired maximum +number work_groups per cluster with which the application will ever launch. It +complies to the .maxclusterrank PTX directive. Note, that the feature requires +SM_90 or higher. + +.. code-block:: c++ + + [[intel::max_work_groups_per_mp(2)]] void foo() {} + + class Foo { + public: + [[intel::max_work_groups_per_mp(2)]] void operator()() const {} + }; + + template + class Functor { + public: + [[intel::max_work_groups_per_mp(N)]] void operator()() const {} + }; + + template + [[intel::max_work_groups_per_mp(N)]] void func() {} + + }]; +} + def SYCLIntelMaxGlobalWorkDimAttrDocs : Documentation { let Category = DocCatFunction; let Heading = "intel::max_global_work_dim"; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index a4328d78c0ccf..6484483134686 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -12002,9 +12002,12 @@ def warn_sycl_kernel_return_type : Warning< def err_sycl_special_type_num_init_method : Error< "types with 'sycl_special_class' attribute must have one and only one '__init' " "method defined">; +def warn_launch_bounds_is_cuda_specific : Warning< + "%0 attribute ignored, only applicable when targeting Nvidia devices">, + InGroup; def warn_cuda_maxclusterrank_sm_90 : Warning< - "maxclusterrank requires sm_90 or higher, CUDA 
arch provided: %0, ignoring " + "'maxclusterrank' requires sm_90 or higher, CUDA arch provided: %0, ignoring " "%1 attribute">, InGroup; def err_bit_int_bad_size : Error<"%select{signed|unsigned}0 _BitInt must " diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 5c749d2e0c165..07929a8006f39 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -11432,6 +11432,16 @@ class Sema final { SYCLIntelMaxGlobalWorkDimAttr * MergeSYCLIntelMaxGlobalWorkDimAttr(Decl *D, const SYCLIntelMaxGlobalWorkDimAttr &A); + void AddSYCLIntelMinWorkGroupsPerComputeUnitAttr( + Decl *D, const AttributeCommonInfo &CI, Expr *E); + SYCLIntelMinWorkGroupsPerComputeUnitAttr * + MergeSYCLIntelMinWorkGroupsPerComputeUnitAttr( + Decl *D, const SYCLIntelMinWorkGroupsPerComputeUnitAttr &A); + void AddSYCLIntelMaxWorkGroupsPerMultiprocessorAttr( + Decl *D, const AttributeCommonInfo &CI, Expr *E); + SYCLIntelMaxWorkGroupsPerMultiprocessorAttr * + MergeSYCLIntelMaxWorkGroupsPerMultiprocessorAttr( + Decl *D, const SYCLIntelMaxWorkGroupsPerMultiprocessorAttr &A); void AddSYCLIntelBankWidthAttr(Decl *D, const AttributeCommonInfo &CI, Expr *E); SYCLIntelBankWidthAttr * diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index 3fe3a66ae03d7..647ae6206d8e3 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -758,6 +758,24 @@ void CodeGenFunction::EmitKernelMetadata(const FunctionDecl *FD, llvm::MDNode::get(Context, AttrMDArgs)); } + auto attrAsMDArg = [&](Expr *E) { + const auto *CE = cast(E); + std::optional ArgVal = CE->getResultAsAPSInt(); + return llvm::ConstantAsMetadata::get( + Builder.getInt32(ArgVal->getSExtValue())); + }; + + if (const auto *A = FD->getAttr()) { + Fn->setMetadata("min_work_groups_per_cu", + llvm::MDNode::get(Context, {attrAsMDArg(A->getValue())})); + } + + if (const auto *A = + FD->getAttr()) { + 
Fn->setMetadata("max_work_groups_per_mp", + llvm::MDNode::get(Context, {attrAsMDArg(A->getValue())})); + } + if (const SYCLIntelMaxWorkGroupSizeAttr *A = FD->getAttr()) { diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp index 820c99a7b3410..bbebb560a67b1 100644 --- a/clang/lib/CodeGen/Targets/NVPTX.cpp +++ b/clang/lib/CodeGen/Targets/NVPTX.cpp @@ -245,6 +245,31 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes( // And kernel functions are not subject to inlining F->addFnAttr(llvm::Attribute::NoInline); } + if (const auto *MWGS = FD->getAttr()) { + auto MaxThreads = (*MWGS->getZDimVal()).getExtValue() * + (*MWGS->getYDimVal()).getExtValue() * + (*MWGS->getXDimVal()).getExtValue(); + if (MaxThreads > 0) + addNVVMMetadata(F, "maxntidx", MaxThreads); + + auto attrValue = [&](Expr *E) { + const auto *CE = cast(E); + std::optional Val = CE->getResultAsAPSInt(); + return Val->getZExtValue(); + }; + + if (const auto *MWGPCU = + FD->getAttr()) { + // The value is guaranteed to be > 0, pass it to the metadata. + addNVVMMetadata(F, "minnctapersm", attrValue(MWGPCU->getValue())); + + if (const auto *MWGPMP = + FD->getAttr()) { + // The value is guaranteed to be > 0, pass it to the metadata. + addNVVMMetadata(F, "maxclusterrank", attrValue(MWGPMP->getValue())); + } + } + } } // Perform special handling in CUDA mode. 
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 47632f544a47d..c74abb5174e46 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -2999,6 +2999,12 @@ static bool mergeDeclAttribute(Sema &S, NamedDecl *D, NewAttr = S.MergeSYCLIntelInitiationIntervalAttr(D, *A); else if (const auto *A = dyn_cast(Attr)) NewAttr = S.MergeSYCLWorkGroupSizeHintAttr(D, *A); + else if (const auto *A = + dyn_cast(Attr)) + NewAttr = S.MergeSYCLIntelMinWorkGroupsPerComputeUnitAttr(D, *A); + else if (const auto *A = + dyn_cast(Attr)) + NewAttr = S.MergeSYCLIntelMaxWorkGroupsPerMultiprocessorAttr(D, *A); else if (const auto *A = dyn_cast(Attr)) NewAttr = S.MergeSYCLIntelMaxGlobalWorkDimAttr(D, *A); else if (const auto *BTFA = dyn_cast(Attr)) diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 22fc155343b1d..0b123a3ca878d 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -3744,6 +3744,19 @@ static void handleSYCLIntelMaxWorkGroupSize(Sema &S, Decl *D, AL.getArgAsExpr(1), AL.getArgAsExpr(2)); } +// Handles min_work_groups_per_cu attribute. +static void handleSYCLIntelMinWorkGroupsPerComputeUnit(Sema &S, Decl *D, + const ParsedAttr &AL) { + S.AddSYCLIntelMinWorkGroupsPerComputeUnitAttr(D, AL, AL.getArgAsExpr(0)); +} + +// Handles max_work_groups_per_mp attribute. +static void +handleSYCLIntelMaxWorkGroupsPerMultiprocessor(Sema &S, Decl *D, + const ParsedAttr &AL) { + S.AddSYCLIntelMaxWorkGroupsPerMultiprocessorAttr(D, AL, AL.getArgAsExpr(0)); +} + // Handles reqd_work_group_size. // If the 'reqd_work_group_size' attribute is specified on a declaration along // with 'num_simd_work_items' attribute, the required work group size specified @@ -4434,6 +4447,127 @@ void Sema::AddSYCLIntelMaxGlobalWorkDimAttr(Decl *D, D->addAttr(::new (Context) SYCLIntelMaxGlobalWorkDimAttr(Context, CI, E)); } +// Check that the value is a non-negative integer constant that can fit in +// 32-bits. 
Issue correct error message and return false on failure. +bool static check32BitInt(const Expr *E, Sema &S, llvm::APSInt &I, + const AttributeCommonInfo &CI) { + if (!I.isIntN(32)) { + S.Diag(E->getExprLoc(), diag::err_ice_too_large) + << llvm::toString(I, 10, false) << 32 << /* Unsigned */ 1; + return false; + } + + if (I.isSigned() && I.isNegative()) { + S.Diag(E->getExprLoc(), diag::err_attribute_requires_positive_integer) + << CI << /* Non-negative */ 1; + return false; + } + + return true; +} + +void Sema::AddSYCLIntelMinWorkGroupsPerComputeUnitAttr( + Decl *D, const AttributeCommonInfo &CI, Expr *E) { + if (Context.getLangOpts().SYCLIsDevice && + !Context.getTargetInfo().getTriple().isNVPTX()) { + Diag(E->getBeginLoc(), diag::warn_launch_bounds_is_cuda_specific) + << CI << E->getSourceRange(); + return; + } + if (!E->isValueDependent()) { + // Validate that we have an integer constant expression and then store the + // converted constant expression into the semantic attribute so that we + // don't have to evaluate it again later. + llvm::APSInt ArgVal; + ExprResult Res = VerifyIntegerConstantExpression(E, &ArgVal); + if (Res.isInvalid()) + return; + if (!check32BitInt(E, *this, ArgVal, CI)) + return; + E = Res.get(); + + // Check to see if there's a duplicate attribute with different values + // already applied to the declaration. + if (const auto *DeclAttr = + D->getAttr()) { + // If the other attribute argument is instantiation dependent, we won't + // have converted it to a constant expression yet and thus we test + // whether this is a null pointer. + if (const auto *DeclExpr = dyn_cast(DeclAttr->getValue())) { + if (ArgVal != DeclExpr->getResultAsAPSInt()) { + Diag(CI.getLoc(), diag::warn_duplicate_attribute) << CI; + Diag(DeclAttr->getLoc(), diag::note_previous_attribute); + } + // Drop the duplicate attribute. 
+ return; + } + } + } + + D->addAttr(::new (Context) + SYCLIntelMinWorkGroupsPerComputeUnitAttr(Context, CI, E)); +} + +// Helper to get CudaArch. +static CudaArch getCudaArch(const TargetInfo &TI) { + if (!TI.getTriple().isNVPTX()) + llvm_unreachable("getCudaArch is only valid for NVPTX triple"); + auto &TO = TI.getTargetOpts(); + return StringToCudaArch(TO.CPU); +} + +void Sema::AddSYCLIntelMaxWorkGroupsPerMultiprocessorAttr( + Decl *D, const AttributeCommonInfo &CI, Expr *E) { + auto &TI = Context.getTargetInfo(); + if (Context.getLangOpts().SYCLIsDevice) { + if (!TI.getTriple().isNVPTX()) { + Diag(E->getBeginLoc(), diag::warn_launch_bounds_is_cuda_specific) + << CI << E->getSourceRange(); + return; + } + + // Feature '.maxclusterrank' requires .target sm_90 or higher. + auto SM = getCudaArch(TI); + if (SM == CudaArch::UNKNOWN || SM < CudaArch::SM_90) { + Diag(E->getBeginLoc(), diag::warn_cuda_maxclusterrank_sm_90) + << CudaArchToString(SM) << CI << E->getSourceRange(); + return; + } + } + if (!E->isValueDependent()) { + // Validate that we have an integer constant expression and then store the + // converted constant expression into the semantic attribute so that we + // don't have to evaluate it again later. + llvm::APSInt ArgVal; + ExprResult Res = VerifyIntegerConstantExpression(E, &ArgVal); + if (Res.isInvalid()) + return; + if (!check32BitInt(E, *this, ArgVal, CI)) + return; + E = Res.get(); + + // Check to see if there's a duplicate attribute with different values + // already applied to the declaration. + if (const auto *DeclAttr = + D->getAttr()) { + // If the other attribute argument is instantiation dependent, we won't + // have converted it to a constant expression yet and thus we test + // whether this is a null pointer. 
+ if (const auto *DeclExpr = dyn_cast(DeclAttr->getValue())) { + if (ArgVal != DeclExpr->getResultAsAPSInt()) { + Diag(CI.getLoc(), diag::warn_duplicate_attribute) << CI; + Diag(DeclAttr->getLoc(), diag::note_previous_attribute); + } + // Drop the duplicate attribute. + return; + } + } + } + + D->addAttr(::new (Context) + SYCLIntelMaxWorkGroupsPerMultiprocessorAttr(Context, CI, E)); +} + SYCLIntelMaxGlobalWorkDimAttr *Sema::MergeSYCLIntelMaxGlobalWorkDimAttr( Decl *D, const SYCLIntelMaxGlobalWorkDimAttr &A) { // Check to see if there's a duplicate attribute with different values @@ -4473,6 +4607,52 @@ static void handleSYCLIntelMaxGlobalWorkDimAttr(Sema &S, Decl *D, S.AddSYCLIntelMaxGlobalWorkDimAttr(D, AL, E); } +SYCLIntelMinWorkGroupsPerComputeUnitAttr * +Sema::MergeSYCLIntelMinWorkGroupsPerComputeUnitAttr( + Decl *D, const SYCLIntelMinWorkGroupsPerComputeUnitAttr &A) { + // Check to see if there's a duplicate attribute with different values + // already applied to the declaration. + if (const auto *DeclAttr = + D->getAttr()) { + if (const auto *DeclExpr = dyn_cast(DeclAttr->getValue())) { + if (const auto *MergeExpr = dyn_cast(A.getValue())) { + if (DeclExpr->getResultAsAPSInt() != MergeExpr->getResultAsAPSInt()) { + Diag(DeclAttr->getLoc(), diag::warn_duplicate_attribute) << &A; + Diag(A.getLoc(), diag::note_previous_attribute); + } + // Do not add a duplicate attribute. + return nullptr; + } + } + } + + return ::new (Context) + SYCLIntelMinWorkGroupsPerComputeUnitAttr(Context, A, A.getValue()); +} + +SYCLIntelMaxWorkGroupsPerMultiprocessorAttr * +Sema::MergeSYCLIntelMaxWorkGroupsPerMultiprocessorAttr( + Decl *D, const SYCLIntelMaxWorkGroupsPerMultiprocessorAttr &A) { + // Check to see if there's a duplicate attribute with different values + // already applied to the declaration. 
+ if (const auto *DeclAttr = + D->getAttr()) { + if (const auto *DeclExpr = dyn_cast(DeclAttr->getValue())) { + if (const auto *MergeExpr = dyn_cast(A.getValue())) { + if (DeclExpr->getResultAsAPSInt() != MergeExpr->getResultAsAPSInt()) { + Diag(DeclAttr->getLoc(), diag::warn_duplicate_attribute) << &A; + Diag(A.getLoc(), diag::note_previous_attribute); + } + // Do not add a duplicate attribute. + return nullptr; + } + } + } + + return ::new (Context) + SYCLIntelMaxWorkGroupsPerMultiprocessorAttr(Context, A, A.getValue()); +} + // Handles [[intel::loop_fuse]] and [[intel::loop_fuse_independent]]. void Sema::AddSYCLIntelLoopFuseAttr(Decl *D, const AttributeCommonInfo &CI, Expr *E) { @@ -7045,14 +7225,6 @@ bool Sema::CheckRegparmAttr(const ParsedAttr &AL, unsigned &numParams) { return false; } -// Helper to get CudaArch. -static CudaArch getCudaArch(const TargetInfo &TI) { - if (!TI.getTriple().isNVPTX()) - llvm_unreachable("getCudaArch is only valid for NVPTX triple"); - auto &TO = TI.getTargetOpts(); - return StringToCudaArch(TO.CPU); -} - // Checks whether an argument of launch_bounds attribute is // acceptable, performs implicit conversion to Rvalue, and returns // non-nullptr Expr result on success. 
Otherwise, it returns nullptr @@ -12010,6 +12182,12 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL, case ParsedAttr::AT_SYCLIntelMaxWorkGroupSize: handleSYCLIntelMaxWorkGroupSize(S, D, AL); break; + case ParsedAttr::AT_SYCLIntelMinWorkGroupsPerComputeUnit: + handleSYCLIntelMinWorkGroupsPerComputeUnit(S, D, AL); + break; + case ParsedAttr::AT_SYCLIntelMaxWorkGroupsPerMultiprocessor: + handleSYCLIntelMaxWorkGroupsPerMultiprocessor(S, D, AL); + break; case ParsedAttr::AT_IntelReqdSubGroupSize: handleIntelReqdSubGroupSize(S, D, AL); break; @@ -12537,6 +12715,14 @@ void Sema::ProcessDeclAttributeList( } else if (const auto *A = D->getAttr()) { Diag(D->getLocation(), diag::err_opencl_kernel_attr) << A; D->setInvalidDecl(); + } else if (const auto *A = + D->getAttr()) { + Diag(D->getLocation(), diag::err_opencl_kernel_attr) << A; + D->setInvalidDecl(); + } else if (const auto *A = + D->getAttr()) { + Diag(D->getLocation(), diag::err_opencl_kernel_attr) << A; + D->setInvalidDecl(); } else if (const auto *A = D->getAttr()) { Diag(D->getLocation(), diag::err_opencl_kernel_attr) << A; D->setInvalidDecl(); diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp index 15182d369143f..e2a8d10ded1ab 100644 --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -538,6 +538,8 @@ static void collectSYCLAttributes(Sema &S, FunctionDecl *FD, SYCLIntelKernelArgsRestrictAttr, SYCLIntelNumSimdWorkItemsAttr, SYCLIntelSchedulerTargetFmaxMhzAttr, SYCLIntelMaxWorkGroupSizeAttr, SYCLIntelMaxGlobalWorkDimAttr, + SYCLIntelMinWorkGroupsPerComputeUnitAttr, + SYCLIntelMaxWorkGroupsPerMultiprocessorAttr, SYCLIntelNoGlobalWorkOffsetAttr, SYCLSimdAttr>(A); }); } @@ -4489,6 +4491,8 @@ static void PropagateAndDiagnoseDeviceAttr( case attr::Kind::SYCLIntelNumSimdWorkItems: case attr::Kind::SYCLIntelSchedulerTargetFmaxMhz: case attr::Kind::SYCLIntelMaxGlobalWorkDim: + case attr::Kind::SYCLIntelMinWorkGroupsPerComputeUnit: + case 
attr::Kind::SYCLIntelMaxWorkGroupsPerMultiprocessor: case attr::Kind::SYCLIntelNoGlobalWorkOffset: case attr::Kind::SYCLIntelLoopFuse: case attr::Kind::SYCLIntelMaxConcurrency: diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index 66e3dc0612525..7d44576a6b7de 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -754,6 +754,28 @@ static void instantiateSYCLIntelMaxGlobalWorkDimAttr( S.AddSYCLIntelMaxGlobalWorkDimAttr(New, *A, Result.getAs()); } +static void instantiateSYCLIntelMinWorkGroupsPerComputeUnitAttr( + Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs, + const SYCLIntelMinWorkGroupsPerComputeUnitAttr *A, Decl *New) { + EnterExpressionEvaluationContext Unevaluated( + S, Sema::ExpressionEvaluationContext::ConstantEvaluated); + ExprResult Result = S.SubstExpr(A->getValue(), TemplateArgs); + if (!Result.isInvalid()) + S.AddSYCLIntelMinWorkGroupsPerComputeUnitAttr(New, *A, + Result.getAs()); +} + +static void instantiateSYCLIntelMaxWorkGroupsPerMultiprocessorAttr( + Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs, + const SYCLIntelMaxWorkGroupsPerMultiprocessorAttr *A, Decl *New) { + EnterExpressionEvaluationContext Unevaluated( + S, Sema::ExpressionEvaluationContext::ConstantEvaluated); + ExprResult Result = S.SubstExpr(A->getValue(), TemplateArgs); + if (!Result.isInvalid()) + S.AddSYCLIntelMaxWorkGroupsPerMultiprocessorAttr(New, *A, + Result.getAs()); +} + static void instantiateSYCLIntelMaxConcurrencyAttr( Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs, const SYCLIntelMaxConcurrencyAttr *A, Decl *New) { @@ -1145,6 +1167,18 @@ void Sema::InstantiateAttrs(const MultiLevelTemplateArgumentList &TemplateArgs, SYCLIntelMaxGlobalWorkDim, New); continue; } + if (const auto *SYCLIntelMinWorkGroupsPerComputeUnit = + dyn_cast(TmplAttr)) { + instantiateSYCLIntelMinWorkGroupsPerComputeUnitAttr( + *this, 
TemplateArgs, SYCLIntelMinWorkGroupsPerComputeUnit, New); + continue; + } + if (const auto *SYCLIntelMaxWorkGroupsPerMultiprocessor = + dyn_cast(TmplAttr)) { + instantiateSYCLIntelMaxWorkGroupsPerMultiprocessorAttr( + *this, TemplateArgs, SYCLIntelMaxWorkGroupsPerMultiprocessor, New); + continue; + } if (const auto *SYCLIntelLoopFuse = dyn_cast(TmplAttr)) { instantiateSYCLIntelLoopFuseAttr(*this, TemplateArgs, SYCLIntelLoopFuse, diff --git a/clang/test/CodeGenSYCL/launch_bounds_nvptx.cpp b/clang/test/CodeGenSYCL/launch_bounds_nvptx.cpp new file mode 100644 index 0000000000000..13c6d1ce4e5f0 --- /dev/null +++ b/clang/test/CodeGenSYCL/launch_bounds_nvptx.cpp @@ -0,0 +1,105 @@ +// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -sycl-std=2017 -triple nvptx-unknown-unknown -target-cpu sm_90 -disable-llvm-passes -S -emit-llvm -o - %s | FileCheck %s + +// Test correct handling of maximum work group size, minimum work groups per +// compute unit and maximum work groups per multi-processor attributes, that +// correspond to CUDA's launch bounds. Expect max_work_group_size, +// min_work_groups_per_cu and max_work_groups_per_mp that are mapped to +// maxntidx, minnctapersm, maxclusterrank PTX directives respectively. + +#include "sycl.hpp" + +using namespace sycl; +queue q; + +class Foo { +public: + [[intel::max_work_group_size(8, 8, 8), intel::min_work_groups_per_cu(2), + intel::max_work_groups_per_mp(4)]] void + operator()() const {} +}; + +template class Functor { +public: + [[intel::max_work_group_size(N, 8, 8), intel::min_work_groups_per_cu(N), + intel::max_work_groups_per_mp(N)]] void + operator()() const {} +}; + +template +[[intel::max_work_group_size(N, 8, 8), intel::min_work_groups_per_cu(N), + intel::max_work_groups_per_mp(N)]] void +zoo() {} + +[[intel::max_work_group_size(8, 8, 8), intel::min_work_groups_per_cu(2), + intel::max_work_groups_per_mp(4)]] void +bar() {} + +int main() { + q.submit([&](handler &h) { + // Test attribute argument size. 
+ Foo boo; + h.single_task(boo); + + // Test attribute is applied on lambda. + h.single_task( + [] [[intel::max_work_group_size(8, 8, 8), + intel::min_work_groups_per_cu(2), + intel::max_work_groups_per_mp(4)]] () {}); + + // Test class template argument. + Functor<6> f; + h.single_task(f); + + // Test attribute is propagated. + h.single_task([]() { bar(); }); + + // Test function template argument. + h.single_task([]() { zoo<16>(); }); + }); + return 0; +} + +// CHECK: define dso_local void @{{.*}}kernel_name1() #0 {{.*}} !min_work_groups_per_cu ![[MWGPC:[0-9]+]] !max_work_groups_per_mp ![[MWGPM:[0-9]+]] !max_work_group_size ![[MWGS:[0-9]+]] +// CHECK: define dso_local void @{{.*}}kernel_name2() #0 {{.*}} !min_work_groups_per_cu ![[MWGPC:[0-9]+]] !max_work_groups_per_mp ![[MWGPM:[0-9]+]] !max_work_group_size ![[MWGS:[0-9]+]] +// CHECK: define dso_local void @{{.*}}kernel_name3() #0 {{.*}} !min_work_groups_per_cu ![[MWGPC_MWGPM:[0-9]+]] !max_work_groups_per_mp ![[MWGPC_MWGPM]] !max_work_group_size ![[MWGS_2:[0-9]+]] +// CHECK: define dso_local void @{{.*}}kernel_name4() #0 {{.*}} !min_work_groups_per_cu ![[MWGPC:[0-9]+]] !max_work_groups_per_mp ![[MWGPM:[0-9]+]] !max_work_group_size ![[MWGS:[0-9]+]] +// CHECK: define dso_local void @{{.*}}kernel_name5() #0 {{.*}} !min_work_groups_per_cu ![[MWGPC_MWGPM_2:[0-9]+]] !max_work_groups_per_mp ![[MWGPC_MWGPM_2]] !max_work_group_size ![[MWGS_3:[0-9]+]] + +// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxntidx", i32 512} +// CHECK: {{.*}}@{{.*}}kernel_name1, !"minnctapersm", i32 2} +// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxclusterrank", i32 4} +// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxntidx", i32 512} +// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"minnctapersm", i32 2} +// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxclusterrank", i32 4} +// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxntidx", i32 512} +// CHECK: {{.*}}@{{.*}}kernel_name2, !"minnctapersm", i32 2} +// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxclusterrank", i32 4} +// CHECK: 
{{.*}}@{{.*}}main{{.*}}, !"maxntidx", i32 512} +// CHECK: {{.*}}@{{.*}}main{{.*}}, !"minnctapersm", i32 2} +// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxclusterrank", i32 4} +// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxntidx", i32 384} +// CHECK: {{.*}}@{{.*}}kernel_name3, !"minnctapersm", i32 6} +// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxclusterrank", i32 6} +// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxntidx", i32 384} +// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"minnctapersm", i32 6} +// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxclusterrank", i32 6} +// CHECK: {{.*}}@{{.*}}kernel_name4, !"maxntidx", i32 512} +// CHECK: {{.*}}@{{.*}}kernel_name4, !"minnctapersm", i32 2} +// CHECK: {{.*}}@{{.*}}kernel_name4, !"maxclusterrank", i32 4} +// CHECK: {{.*}}@{{.*}}bar{{.*}}, !"maxntidx", i32 512} +// CHECK: {{.*}}@{{.*}}bar{{.*}}, !"minnctapersm", i32 2} +// CHECK: {{.*}}@{{.*}}bar{{.*}}, !"maxclusterrank", i32 4} +// CHECK: {{.*}}@{{.*}}kernel_name5, !"maxntidx", i32 1024} +// CHECK: {{.*}}@{{.*}}kernel_name5, !"minnctapersm", i32 16} +// CHECK: {{.*}}@{{.*}}kernel_name5, !"maxclusterrank", i32 16} +// CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"maxntidx", i32 1024} +// CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"minnctapersm", i32 16} +// CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"maxclusterrank", i32 16} + +// CHECK: ![[MWGPC]] = !{i32 2} +// CHECK: ![[MWGPM]] = !{i32 4} +// CHECK: ![[MWGS]] = !{i32 8, i32 8, i32 8} +// CHECK: ![[MWGPC_MWGPM]] = !{i32 6} +// CHECK: ![[MWGS_2]] = !{i32 8, i32 8, i32 6} +// CHECK: ![[MWGPC_MWGPM_2]] = !{i32 16} +// CHECK: ![[MWGS_3]] = !{i32 8, i32 8, i32 16} diff --git a/clang/test/CodeGenSYCL/lb_sm_90.cpp b/clang/test/CodeGenSYCL/lb_sm_90.cpp new file mode 100644 index 0000000000000..5909edde1479f --- /dev/null +++ b/clang/test/CodeGenSYCL/lb_sm_90.cpp @@ -0,0 +1,46 @@ +// RUN: %clang_cc1 -internal-isystem %S/Inputs -triple nvptx-unknown-unknown -target-cpu sm_90 -fsycl-is-device -Wno-c++23-extensions -S -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK-IR + +// Maximum work 
groups per multi-processor, mapped to maxclusterrank PTX +// directive, is an SM_90 feature, make sure that correct metadata is generated. + +#include "sycl.hpp" + +template class Functor { +public: + [[intel::max_work_group_size(1, 1, N1), intel::min_work_groups_per_cu(N2), + intel::max_work_groups_per_mp(N3)]] void + operator()() const {} +}; + +int main() { + sycl::queue Q{}; + + sycl::range<1> Gws(32); + + Q.submit([&](sycl::handler &cgh) { + cgh.parallel_for(Gws, [=] [[intel::max_work_group_size(1, 1, 256), + intel::min_work_groups_per_cu(2), + intel::max_work_groups_per_mp(4)]] ( + sycl::id<1>) { volatile int A = 42; }); + }); + // CHECK-IR: !min_work_groups_per_cu [[MWGPCU:![0-9]+]] + // CHECK-IR: !max_work_groups_per_mp [[MWGPMP:![0-9]+]] + // CHECK-IR: !max_work_group_size [[MWGS:![0-9]+]] + + Q.submit([&](sycl::handler &cgh) { + cgh.single_task(Functor<512, 8, 16>{}); + }); + // CHECK-IR: !min_work_groups_per_cu [[MWGPCU_F:![0-9]+]] + // CHECK-IR: !max_work_groups_per_mp [[MWGPMP_F:![0-9]+]] + // CHECK-IR: !max_work_group_size [[MWGS_F:![0-9]+]] + + // CHECK-IR: [[MWGPCU]] = !{i32 2} + // CHECK-IR: [[MWGPMP]] = !{i32 4} + // CHECK-IR: [[MWGS]] = !{i32 256, i32 1, i32 1} + + // CHECK-IR: [[MWGPCU_F]] = !{i32 8} + // CHECK-IR: [[MWGPMP_F]] = !{i32 16} + // CHECK-IR: [[MWGS_F]] = !{i32 512, i32 1, i32 1} + + return 0; +} diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test index a79c6157e7ffa..b160de34da345 100644 --- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test +++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test @@ -181,6 +181,8 @@ // CHECK-NEXT: SYCLIntelMaxConcurrency (SubjectMatchRule_function) // CHECK-NEXT: SYCLIntelMaxGlobalWorkDim (SubjectMatchRule_function) // CHECK-NEXT: SYCLIntelMaxWorkGroupSize (SubjectMatchRule_function) +// CHECK-NEXT: SYCLIntelMaxWorkGroupsPerMultiprocessor (SubjectMatchRule_function) +// CHECK-NEXT: 
SYCLIntelMinWorkGroupsPerComputeUnit (SubjectMatchRule_function) // CHECK-NEXT: SYCLIntelNoGlobalWorkOffset (SubjectMatchRule_function) // CHECK-NEXT: SYCLIntelNumSimdWorkItems (SubjectMatchRule_function) // CHECK-NEXT: SYCLIntelPipeIO (SubjectMatchRule_variable) diff --git a/clang/test/SemaCUDA/launch_bounds.cu b/clang/test/SemaCUDA/launch_bounds.cu index 045f475692959..aa83c3560e71d 100644 --- a/clang/test/SemaCUDA/launch_bounds.cu +++ b/clang/test/SemaCUDA/launch_bounds.cu @@ -11,7 +11,7 @@ __launch_bounds__(0x10000000000000000) void TestWayTooBigArg(void); // expected- __launch_bounds__(-128, 7) void TestNegArg1(void); // expected-warning {{'launch_bounds' attribute parameter 0 is negative and will be ignored}} __launch_bounds__(128, -7) void TestNegArg2(void); // expected-warning {{'launch_bounds' attribute parameter 1 is negative and will be ignored}} -__launch_bounds__(128, 2, -8) void TestNegArg2(void); // expected-warning {{maxclusterrank requires sm_90 or higher, CUDA arch provided: sm_75, ignoring 'launch_bounds' attribute}} +__launch_bounds__(128, 2, -8) void TestNegArg2(void); // expected-warning {{'maxclusterrank' requires sm_90 or higher, CUDA arch provided: sm_75, ignoring 'launch_bounds' attribute}} __launch_bounds__(1, 2, 3, 4) void Test4Args(void); // expected-error {{'launch_bounds' attribute takes no more than 3 arguments}} __launch_bounds__() void TestNoArgs(void); // expected-error {{'launch_bounds' attribute takes at least 1 argument}} @@ -49,4 +49,4 @@ __launch_bounds__(Args) void TestTemplateVariadicArgs(void) {} // expected-error template __launch_bounds__(1, Args) void TestTemplateVariadicArgs2(void) {} // expected-error {{expression contains unexpanded parameter pack 'Args'}} -__launch_bounds__(1, 2, 3) void Test3Args(void); // expected-warning {{maxclusterrank requires sm_90 or higher, CUDA arch provided: sm_75, ignoring 'launch_bounds' attribute}} +__launch_bounds__(1, 2, 3) void Test3Args(void); // expected-warning {{'maxclusterrank' 
requires sm_90 or higher, CUDA arch provided: sm_75, ignoring 'launch_bounds' attribute}} diff --git a/clang/test/SemaSYCL/lb_sm_70.cpp b/clang/test/SemaSYCL/lb_sm_70.cpp new file mode 100644 index 0000000000000..02a7ceee0b221 --- /dev/null +++ b/clang/test/SemaSYCL/lb_sm_70.cpp @@ -0,0 +1,61 @@ +// RUN: %clang_cc1 -internal-isystem %S/Inputs -triple nvptx-unknown-unknown -target-cpu sm_70 -fsycl-is-device -Wno-c++23-extensions %s -o -fsyntax-only -verify %s + +// Maximum work groups per multi-processor, mapped to maxclusterrank PTX +// directive, is an SM_90 feature, make sure that correct warning is issued on +// architectures lower than that. Furthermore, warn/error incorrect values +// specified for max_work_groups_per_mp and min_work_groups_per_cu. + +#include "sycl.hpp" + +template class Functor { +public: + // expected-warning@+2 {{'maxclusterrank' requires sm_90 or higher, CUDA arch provided: sm_70, ignoring 'max_work_groups_per_mp' attribute}} + [[intel::max_work_group_size(1, 1, N1), intel::min_work_groups_per_cu(N2), + intel::max_work_groups_per_mp(N3)]] void + operator()() const {} +}; + +int main() { + sycl::queue Q{}; + + Q.submit([&](sycl::handler &cgh) { + // expected-warning@+4 {{'maxclusterrank' requires sm_90 or higher, CUDA arch provided: sm_70, ignoring 'max_work_groups_per_mp' attribute}} + cgh.single_task( + [=] [[intel::max_work_group_size(1, 1, 256), + intel::min_work_groups_per_cu(2), + intel::max_work_groups_per_mp(4)]] () { volatile int A = 42; }); + + constexpr float A = 2.0; + // expected-warning@+5{{'maxclusterrank' requires sm_90 or higher, CUDA arch provided: sm_70, ignoring 'max_work_groups_per_mp' attribute}} + // expected-error@+3 {{integral constant expression must have integral or unscoped enumeration type, not 'float'}} + cgh.single_task( + [=] [[intel::max_work_group_size(1, 1, 256), + intel::min_work_groups_per_cu(A), + intel::max_work_groups_per_mp(4)]] () { volatile int A = 42; }); + + // expected-error@+4 {{expression is 
not an integral constant expression}} + // expected-note@+3 {{value 2147483648 is outside the range of representable values of type 'int'}} + cgh.single_task( + [=] [[intel::max_work_group_size(1, 1, 256), + intel::min_work_groups_per_cu(2147483647 + 1)]] () { + volatile int A = 42; + }); + + // expected-warning@+5 {{attribute 'min_work_groups_per_cu' is already applied with different arguments}} + // expected-note@+3 {{previous attribute is here}} + cgh.single_task( + [=] [[intel::max_work_group_size(1, 1, 256), + intel::min_work_groups_per_cu(4), + intel::min_work_groups_per_cu(8)]] () { volatile int A = 42; }); + + // expected-error@+2 {{'min_work_groups_per_cu' attribute requires a non-negative integral compile time constant expression}} + cgh.single_task( + [=] [[intel::min_work_groups_per_cu(-8)]] () { volatile int A = 42; }); + }); + + Q.submit([&](sycl::handler &cgh) { + cgh.single_task(Functor<512, 8, 16>{}); + }); + + return 0; +} diff --git a/clang/test/SemaSYCL/lb_sm_90.cpp b/clang/test/SemaSYCL/lb_sm_90.cpp new file mode 100644 index 0000000000000..9d2f61571565d --- /dev/null +++ b/clang/test/SemaSYCL/lb_sm_90.cpp @@ -0,0 +1,31 @@ +// RUN: %clang_cc1 -internal-isystem %S/Inputs %s -triple nvptx64-nvidia-cuda -target-cpu sm_90 -fsycl-is-device -fsyntax-only -Wno-c++23-extensions -verify +// expected-no-diagnostics + +// Maximum work groups per multi-processor, mapped to maxclusterrank PTX +// directive, is an SM_90 feature, make sure that no warnings/errors are issued. 
+ +#include "sycl.hpp" + +template <int N1, int N2, int N3> class Functor { +public: + [[intel::max_work_group_size(1, 1, N1), intel::min_work_groups_per_cu(N2), + intel::max_work_groups_per_mp(N3)]] void + operator()() const {} +}; + +int main() { + sycl::queue Q{}; + + Q.submit([&](sycl::handler &cgh) { + cgh.single_task( [=] [[intel::max_work_group_size(1, 1, 256), + intel::min_work_groups_per_cu(2), + intel::max_work_groups_per_mp(4)]] ( +) { volatile int A = 42; }); + }); + + Q.submit([&](sycl::handler &cgh) { + cgh.single_task(Functor<512, 8, 16>{}); + }); + + return 0; +} diff --git a/clang/test/SemaSYCL/lb_sm_90_ast.cpp b/clang/test/SemaSYCL/lb_sm_90_ast.cpp new file mode 100644 index 0000000000000..2493445d6b985 --- /dev/null +++ b/clang/test/SemaSYCL/lb_sm_90_ast.cpp @@ -0,0 +1,176 @@ +// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -sycl-std=2017 -ast-dump -triple nvptx-unknown-unknown -target-cpu sm_90 -Wno-c++23-extensions %s | FileCheck %s + +// Tests for AST of Intel max_work_group_size, min_work_groups_per_cu and +// max_work_groups_per_mp attribute.
+ +#include "sycl.hpp" + +sycl::queue deviceQueue; + +// CHECK: FunctionDecl {{.*}} func1 'void ()' +// CHECK-NEXT: CompoundStmt +// CHECK-NEXT: SYCLIntelMaxWorkGroupSizeAttr {{.*}} +// CHECK-NEXT: ConstantExpr{{.*}}'int' +// CHECK-NEXT: value: Int 8 +// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 +// CHECK-NEXT: ConstantExpr{{.*}}'int' +// CHECK-NEXT: value: Int 8 +// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 +// CHECK-NEXT: ConstantExpr{{.*}}'int' +// CHECK-NEXT: value: Int 8 +// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 +// CHECK-NEXT: SYCLIntelMinWorkGroupsPerComputeUnitAttr {{.*}} +// CHECK-NEXT: ConstantExpr{{.*}}'int' +// CHECK-NEXT: value: Int 4 +// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 4 +// CHECK-NEXT: SYCLIntelMaxWorkGroupsPerMultiprocessorAttr {{.*}} +// CHECK-NEXT: ConstantExpr{{.*}}'int' +// CHECK-NEXT: value: Int 2 +// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 2 +[[intel::max_work_group_size(8, 8, 8), intel::min_work_groups_per_cu(4), + intel::max_work_groups_per_mp(2)]] void +func1() {} + +// Test that checks template parameter support on function. 
+// CHECK: FunctionTemplateDecl {{.*}} func2 +// CHECK: FunctionDecl {{.*}} func2 'void ()' +// CHECK-NEXT: CompoundStmt +// CHECK-NEXT: SYCLIntelMaxWorkGroupSizeAttr {{.*}} +// CHECK-NEXT: DeclRefExpr {{.*}} 'int' NonTypeTemplateParm {{.*}} 'N' 'int' +// CHECK-NEXT: ConstantExpr{{.*}}'int' +// CHECK-NEXT: value: Int 8 +// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 +// CHECK-NEXT: ConstantExpr{{.*}}'int' +// CHECK-NEXT: value: Int 8 +// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 +// CHECK-NEXT: SYCLIntelMinWorkGroupsPerComputeUnitAttr {{.*}} +// CHECK-NEXT: DeclRefExpr {{.*}} 'int' NonTypeTemplateParm {{.*}} 'N' 'int' +// CHECK-NEXT: SYCLIntelMaxWorkGroupsPerMultiprocessorAttr {{.*}} +// CHECK-NEXT: DeclRefExpr {{.*}} 'int' NonTypeTemplateParm {{.*}} 'N' 'int' + +// CHECK: FunctionDecl {{.*}} func2 'void ()' +// CHECK-NEXT: TemplateArgument integral 6 +// CHECK-NEXT: CompoundStmt +// CHECK-NEXT: SYCLIntelMaxWorkGroupSizeAttr {{.*}} +// CHECK-NEXT: ConstantExpr{{.*}}'int' +// CHECK-NEXT: value: Int 6 +// CHECK-NEXT: SubstNonTypeTemplateParmExpr +// CHECK-NEXT: NonTypeTemplateParmDecl +// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 6 +// CHECK-NEXT: ConstantExpr{{.*}}'int' +// CHECK-NEXT: value: Int 8 +// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 +// CHECK-NEXT: ConstantExpr{{.*}}'int' +// CHECK-NEXT: value: Int 8 +// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 +// CHECK-NEXT: SYCLIntelMinWorkGroupsPerComputeUnitAttr {{.*}} +// CHECK-NEXT: ConstantExpr{{.*}}'int' +// CHECK-NEXT: value: Int 6 +// CHECK-NEXT: SubstNonTypeTemplateParmExpr +// CHECK-NEXT: NonTypeTemplateParmDecl +// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 6 +// CHECK-NEXT: SYCLIntelMaxWorkGroupsPerMultiprocessorAttr {{.*}} +// CHECK-NEXT: ConstantExpr{{.*}}'int' +// CHECK-NEXT: value: Int 6 +// CHECK-NEXT: SubstNonTypeTemplateParmExpr +// CHECK-NEXT: NonTypeTemplateParmDecl +// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 6 +template <int N> +[[intel::max_work_group_size(N, 8, 8), intel::min_work_groups_per_cu(N), + 
intel::max_work_groups_per_mp(N)]] void +func2() {} + +class KernelFunctor { +public: + void operator()() const { func1(); } +}; + +// Test that checks template parameter support on class member function. +template <int N> class KernelFunctor2 { +public: + [[intel::max_work_group_size(N, 8, 8), intel::min_work_groups_per_cu(N), + intel::max_work_groups_per_mp(N)]] void + operator()() const {} +}; + +int main() { + deviceQueue.submit([&](sycl::handler &h) { + // CHECK-LABEL: FunctionDecl {{.*}}kernel_name_1 + // CHECK: SYCLIntelMaxWorkGroupSizeAttr + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 8 + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 8 + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 8 + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 + // CHECK: SYCLIntelMinWorkGroupsPerComputeUnitAttr + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 4 + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 4 + // CHECK: SYCLIntelMaxWorkGroupsPerMultiprocessorAttr + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 2 + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 2 + KernelFunctor f1; + h.single_task<class kernel_name_1>(f1); + + // CHECK-LABEL: FunctionDecl {{.*}}kernel_name_2 + // CHECK: SYCLIntelMaxWorkGroupSizeAttr + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 3 + // CHECK-NEXT: SubstNonTypeTemplateParmExpr + // CHECK-NEXT: NonTypeTemplateParmDecl + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 3 + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 8 + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 8 + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 + // CHECK: SYCLIntelMinWorkGroupsPerComputeUnitAttr + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 3 + // CHECK-NEXT: SubstNonTypeTemplateParmExpr + // 
CHECK-NEXT: NonTypeTemplateParmDecl + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 3 + // CHECK: SYCLIntelMaxWorkGroupsPerMultiprocessorAttr + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 3 + // CHECK-NEXT: SubstNonTypeTemplateParmExpr + // CHECK-NEXT: NonTypeTemplateParmDecl + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 3 + KernelFunctor2<3> f2; + h.single_task<class kernel_name_2>(f2); + + // CHECK-LABEL: FunctionDecl {{.*}}kernel_name_3 + // CHECK: SYCLIntelMaxWorkGroupSizeAttr {{.*}} + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 8 + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 8 + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 8 + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8 + // CHECK-NEXT: SYCLIntelMinWorkGroupsPerComputeUnitAttr {{.*}} + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 4 + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 4 + // CHECK-NEXT: SYCLIntelMaxWorkGroupsPerMultiprocessorAttr {{.*}} + // CHECK-NEXT: ConstantExpr{{.*}}'int' + // CHECK-NEXT: value: Int 6 + // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 6 + h.single_task<class kernel_name_3>( + [] [[intel::max_work_group_size(8, 8, 8), + intel::min_work_groups_per_cu(4), + intel::max_work_groups_per_mp(6)]] () {}); + }); + + func2<6>(); + + return 0; +}