From ffbfd68d91740acc0171564f2c6fa3a0cfe10328 Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean@nvidia.com>
Date: Tue, 28 Jan 2025 00:30:16 +0000
Subject: [PATCH 1/2] auto-update cc ptx_kernel

---
 llvm/include/llvm/IR/AutoUpgrade.h            |  4 ++
 llvm/lib/AsmParser/LLParser.cpp               |  1 +
 llvm/lib/Bitcode/Reader/BitcodeReader.cpp     |  2 +
 llvm/lib/IR/AutoUpgrade.cpp                   | 67 +++++++++++++++++++
 llvm/lib/Linker/IRMover.cpp                   |  1 +
 llvm/lib/Target/NVPTX/NVPTXUtilities.cpp      | 27 +++-----
 llvm/lib/Target/NVPTX/NVPTXUtilities.h        |  7 +-
 .../CodeGen/NVPTX/upgrade-nvvm-annotations.ll | 28 ++++++++
 8 files changed, 118 insertions(+), 19 deletions(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/upgrade-nvvm-annotations.ll
diff --git a/llvm/include/llvm/IR/AutoUpgrade.h b/llvm/include/llvm/IR/AutoUpgrade.h
index 97c3e4d7589d7..8c093568a1e03 100644
--- a/llvm/include/llvm/IR/AutoUpgrade.h
+++ b/llvm/include/llvm/IR/AutoUpgrade.h
@@ -61,6 +61,10 @@ namespace llvm {
   /// module is modified.
   bool UpgradeModuleFlags(Module &M);
 
+  /// Convert legacy nvvm.annotations metadata to appropriate function
+  /// attributes.
+  void UpgradeNVVMAnnotations(Module &M);
+
   /// Convert calls to ARC runtime functions to intrinsic calls and upgrade the
   /// old retain release marker to new module flag format.
   void UpgradeARCRuntime(Module &M);
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index e842a8b2e3797..a1f79926fcc99 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -448,6 +448,7 @@ bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) {
     llvm::UpgradeDebugInfo(*M);
 
   UpgradeModuleFlags(*M);
+  UpgradeNVVMAnnotations(*M);
   UpgradeSectionAttributes(*M);
 
   if (PreserveInputDbgFormat != cl::boolOrDefault::BOU_TRUE)
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index e16e8a0f4703f..1a09e80c4fbb2 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -7157,6 +7157,8 @@ Error BitcodeReader::materializeModule() {
 
   UpgradeModuleFlags(*TheModule);
 
+  UpgradeNVVMAnnotations(*TheModule);
+
   UpgradeARCRuntime(*TheModule);
 
   return Error::success();
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 3725f412b8930..e886a6012b219 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -17,6 +17,7 @@
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/IR/AttributeMask.h"
+#include "llvm/IR/CallingConv.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/DebugInfoMetadata.h"
@@ -5019,6 +5020,72 @@ bool llvm::UpgradeDebugInfo(Module &M) {
   return Modified;
 }
 
+/// Apply one legacy nvvm.annotations key/value pair to \p GV. Returns true if
+/// the pair was consumed and should be dropped from the metadata node.
+static bool upgradeSingleNVVMAnnotation(GlobalValue *GV, StringRef K,
+                                        const Metadata *V) {
+  if (K == "kernel") {
+    if (!mdconst::extract<ConstantInt>(V)->isZero())
+      cast<Function>(GV)->setCallingConv(CallingConv::PTX_Kernel);
+    return true;
+  }
+  if (K == "align") {
+    // V packs the alignment in its low 16 bits and the index in the high bits;
+    // index 0 is the return value and parameter N uses index N + 1.
+    const uint64_t Packed = mdconst::extract<ConstantInt>(V)->getZExtValue();
+    const unsigned Idx = Packed >> 16;
+    // TODO: Skip adding the stackalign attribute for returns, for now.
+    if (!Idx)
+      return false;
+    // Decoded only when used: Align asserts on zero/non-power-of-two values.
+    const Align StackAlign = Align(Packed & 0xFFFF);
+    cast<Function>(GV)->addAttributeAtIndex(
+        Idx, Attribute::getWithStackAlignment(GV->getContext(), StackAlign));
+    return true;
+  }
+
+  return false;
+}
+
+/// Rebuild !nvvm.annotations in \p M, dropping pairs consumed by
+/// upgradeSingleNVVMAnnotation and nodes left with no global or no pairs.
+void llvm::UpgradeNVVMAnnotations(Module &M) {
+  NamedMDNode *NamedMD = M.getNamedMetadata("nvvm.annotations");
+  if (!NamedMD)
+    return;
+
+  SmallVector<MDNode *, 8> NewNodes;
+  SmallSet<const MDNode *, 8> SeenNodes;
+  for (MDNode *MD : NamedMD->operands()) {
+    // Skip repeated nodes, and empty ones before operand 0 is read below.
+    if (!SeenNodes.insert(MD).second || MD->getNumOperands() == 0)
+      continue;
+
+    auto *GV = mdconst::dyn_extract_or_null<GlobalValue>(MD->getOperand(0));
+    if (!GV)
+      continue;
+
+    assert((MD->getNumOperands() % 2) == 1 && "Invalid number of operands");
+    // Each entry is !{ptr @gv, !"key1", value1, !"key2", value2, ...}:
+    // start at 1 to skip the global and step by 2 over each key/value pair.
+    SmallVector<Metadata *, 8> NewOperands{MD->getOperand(0)};
+    for (unsigned j = 1, je = MD->getNumOperands(); j < je; j += 2) {
+      MDString *K = cast<MDString>(MD->getOperand(j));
+      const MDOperand &V = MD->getOperand(j + 1);
+      bool Upgraded = upgradeSingleNVVMAnnotation(GV, K->getString(), V);
+      if (!Upgraded)
+        NewOperands.append({K, V});
+    }
+
+    if (NewOperands.size() > 1)
+      NewNodes.push_back(MDNode::get(M.getContext(), NewOperands));
+  }
+
+  NamedMD->clearOperands();
+  for (MDNode *N : NewNodes)
+    NamedMD->addOperand(N);
+}
+
 /// This checks for objc retain release marker which should be upgraded. It
 /// returns true if module is modified.
 static bool upgradeRetainReleaseMarker(Module &M) {
diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp
index 43fcfe75ba46b..62e2af4da57bb 100644
--- a/llvm/lib/Linker/IRMover.cpp
+++ b/llvm/lib/Linker/IRMover.cpp
@@ -1244,6 +1244,7 @@ Error IRLinker::linkModuleFlagsMetadata() {
 
   // Check for module flag for updates before do anything.
   UpgradeModuleFlags(*SrcM);
+  UpgradeNVVMAnnotations(*SrcM);
 
   // If the destination module doesn't have module flags yet, then just copy
   // over the source module's flags.
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
index 0f2bec711b249..a41943880807c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -310,30 +310,21 @@ std::optional<unsigned> getMaxNReg(const Function &F) {
   return findOneNVVMAnnotation(&F, "maxnreg");
 }
 
-bool isKernelFunction(const Function &F) {
-  if (F.getCallingConv() == CallingConv::PTX_Kernel)
-    return true;
-
-  if (const auto X = findOneNVVMAnnotation(&F, "kernel"))
-    return (*X == 1);
-
-  return false;
-}
-
 MaybeAlign getAlign(const Function &F, unsigned Index) {
   // First check the alignstack metadata
   if (MaybeAlign StackAlign =
           F.getAttributes().getAttributes(Index).getStackAlignment())
     return StackAlign;
 
-  // If that is missing, check the legacy nvvm metadata
-  std::vector<unsigned> Vs;
-  bool retval = findAllNVVMAnnotation(&F, "align", Vs);
-  if (!retval)
-    return std::nullopt;
-  for (unsigned V : Vs)
-    if ((V >> 16) == Index)
-      return Align(V & 0xFFFF);
+  // Fall back to the legacy nvvm.annotations "align" entry only for the return
+  // value (index 0), since LLVM has no stackalign attribute for return values.
+  if (Index == 0) {
+    std::vector<unsigned> Vs;
+    if (findAllNVVMAnnotation(&F, "align", Vs))
+      for (unsigned V : Vs)
+        if ((V >> 16) == Index)
+          return Align(V & 0xFFFF);
+  }
 
   return std::nullopt;
 }
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
index 7ce00b9b5688d..cf35eaf4cbae5 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
@@ -16,6 +16,7 @@
 #include "NVPTX.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/CallingConv.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -63,7 +64,11 @@ std::optional<unsigned> getClusterDimz(const Function &);
 std::optional<unsigned> getMaxClusterRank(const Function &);
 std::optional<unsigned> getMinCTASm(const Function &);
 std::optional<unsigned> getMaxNReg(const Function &);
-bool isKernelFunction(const Function &);
+
+inline bool isKernelFunction(const Function &F) {
+  return F.getCallingConv() == CallingConv::PTX_Kernel;
+}
+
 bool isParamGridConstant(const Value &);
 
 MaybeAlign getAlign(const Function &, unsigned);
diff --git a/llvm/test/CodeGen/NVPTX/upgrade-nvvm-annotations.ll b/llvm/test/CodeGen/NVPTX/upgrade-nvvm-annotations.ll
new file mode 100644
index 0000000000000..a9f370a12a945
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/upgrade-nvvm-annotations.ll
@@ -0,0 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 5
+; RUN: opt < %s -mtriple=nvptx64-unknown-unknown -O0 -S | FileCheck %s
+
+define i32 @foo(i32 %a, i32 %b) {
+; CHECK-LABEL: define i32 @foo(
+; CHECK-SAME: i32 alignstack(8) [[A:%.*]], i32 alignstack(16) [[B:%.*]]) {
+; CHECK-NEXT:    ret i32 0
+;
+  ret i32 0
+}
+
+define i32 @bar(i32 %a, i32 %b) {
+; CHECK-LABEL: define ptx_kernel i32 @bar(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; CHECK-NEXT:    ret i32 0
+;
+  ret i32 0
+}
+
+!nvvm.annotations = !{!0, !1, !2}
+
+!0 = !{ptr @foo, !"align", i32 u0x00000008, !"align", i32 u0x00010008, !"align", i32 u0x00020010}
+!1 = !{null, !"align", i32 u0x00000008, !"align", i32 u0x00010008, !"align", i32 u0x00020008}
+!2 = !{ptr @bar, !"kernel", i32 1}
+
+;.
+; CHECK: [[META0:![0-9]+]] = !{ptr @foo, !"align", i32 8}
+;.

From 7f126e1f3678c03994e3d2b2fd9901802299e570 Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean@nvidia.com>
Date: Tue, 28 Jan 2025 00:52:28 +0000
Subject: [PATCH 2/2] remove dead omp code

---
 llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 28 ++++-----------------------
 1 file changed, 4 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index 682227916e712..70ae9327d75dd 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -5906,39 +5906,19 @@ bool llvm::omp::isOpenMPKernel(Function &Fn) {
 }
 
 KernelSet llvm::omp::getDeviceKernels(Module &M) {
-  // TODO: Create a more cross-platform way of determining device kernels.
   KernelSet Kernels;
 
-  DenseSet<const Function *> SeenKernels;
-  auto ProcessKernel = [&](Function &KF) {
-    if (SeenKernels.insert(&KF).second) {
+  for (Function &F : M)
+    if (F.hasKernelCallingConv()) {
       // We are only interested in OpenMP target regions. Others, such as
       // kernels generated by CUDA but linked together, are not interesting to
       // this pass.
-      if (isOpenMPKernel(KF)) {
+      if (isOpenMPKernel(F)) {
         ++NumOpenMPTargetRegionKernels;
-        Kernels.insert(&KF);
+        Kernels.insert(&F);
       } else
         ++NumNonOpenMPTargetRegionKernels;
     }
-  };
-
-  if (NamedMDNode *MD = M.getNamedMetadata("nvvm.annotations"))
-    for (auto *Op : MD->operands()) {
-      if (Op->getNumOperands() < 2)
-        continue;
-      MDString *KindID = dyn_cast<MDString>(Op->getOperand(1));
-      if (!KindID || KindID->getString() != "kernel")
-        continue;
-
-      if (auto *KernelFn =
-              mdconst::dyn_extract_or_null<Function>(Op->getOperand(0)))
-        ProcessKernel(*KernelFn);
-    }
-
-  for (Function &F : M)
-    if (F.hasKernelCallingConv())
-      ProcessKernel(F);
 
   return Kernels;
 }